shithub: openh264

ref: 57f6bcc4b0da529101c25fd97349e9e55a6a5cee
parent: faaf62afadeedc01a89a482ab56ec23027b6c3ba
author: Martin Storsjö <[email protected]>
date: Sat May 31 10:13:34 EDT 2014

Convert all tabs to spaces in assembly sources, unify indentation

Previously the assembly sources had mixed indentation consisting
of both spaces and tabs, making it quite hard to read unless
the right tab size was used in the editor.

Tabs have been interpreted as 4 spaces in most cases, matching
the surrounding code.
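
For reference, the whitespace normalization described above (tabs expanded to 4-space tab stops across the assembly sources) can be reproduced with a short script along these lines. This is an illustrative sketch only; the commit may have been produced with a different tool, and the "codec" path and *.S glob are assumptions based on the files touched by this patch.

    #!/usr/bin/env python3
    # Sketch: expand tabs to 4-space tab stops in ARM assembly sources,
    # matching the "tabs interpreted as 4 spaces" rule from the commit message.
    import pathlib

    TAB_WIDTH = 4

    def expand_tabs_in_file(path: pathlib.Path) -> None:
        text = path.read_text()
        # str.expandtabs() honours column positions, so a tab in the middle of
        # a line expands to however many spaces reach the next tab stop.
        fixed = "\n".join(line.expandtabs(TAB_WIDTH) for line in text.splitlines())
        path.write_text(fixed + "\n")

    if __name__ == "__main__":
        for asm in pathlib.Path("codec").rglob("*.S"):
            expand_tabs_in_file(asm)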

--- a/codec/common/arm/copy_mb_neon.S
+++ b/codec/common/arm/copy_mb_neon.S
@@ -36,75 +36,75 @@
 
 #ifdef __APPLE__
 .macro LOAD_ALIGNED_DATA_WITH_STRIDE
-//	{	//	input: $0~$3, src*, src_stride
-    vld1.64	{$0}, [$4,:128], $5
-    vld1.64	{$1}, [$4,:128], $5
-    vld1.64	{$2}, [$4,:128], $5
-    vld1.64	{$3}, [$4,:128], $5
-//	}
+//  {   //  input: $0~$3, src*, src_stride
+    vld1.64 {$0}, [$4,:128], $5
+    vld1.64 {$1}, [$4,:128], $5
+    vld1.64 {$2}, [$4,:128], $5
+    vld1.64 {$3}, [$4,:128], $5
+//  }
 .endm
 
 .macro STORE_ALIGNED_DATA_WITH_STRIDE
-//	{	//	input: $0~$3, dst*, dst_stride
-    vst1.64	{$0}, [$4,:128], $5
-    vst1.64	{$1}, [$4,:128], $5
-    vst1.64	{$2}, [$4,:128], $5
-    vst1.64	{$3}, [$4,:128], $5
-//	}
+//  {   //  input: $0~$3, dst*, dst_stride
+    vst1.64 {$0}, [$4,:128], $5
+    vst1.64 {$1}, [$4,:128], $5
+    vst1.64 {$2}, [$4,:128], $5
+    vst1.64 {$3}, [$4,:128], $5
+//  }
 .endm
 
 .macro LOAD_UNALIGNED_DATA_WITH_STRIDE
-//	{	//	input: $0~$3, src*, src_stride
-    vld1.64	{$0}, [$4], $5
-    vld1.64	{$1}, [$4], $5
-    vld1.64	{$2}, [$4], $5
-    vld1.64	{$3}, [$4], $5
-//	}
+//  {   //  input: $0~$3, src*, src_stride
+    vld1.64 {$0}, [$4], $5
+    vld1.64 {$1}, [$4], $5
+    vld1.64 {$2}, [$4], $5
+    vld1.64 {$3}, [$4], $5
+//  }
 .endm
 
 .macro STORE_UNALIGNED_DATA_WITH_STRIDE
-//	{	//	input: $0~$3, dst*, dst_stride
-    vst1.64	{$0}, [$4], $5
-    vst1.64	{$1}, [$4], $5
-    vst1.64	{$2}, [$4], $5
-    vst1.64	{$3}, [$4], $5
-//	}
+//  {   //  input: $0~$3, dst*, dst_stride
+    vst1.64 {$0}, [$4], $5
+    vst1.64 {$1}, [$4], $5
+    vst1.64 {$2}, [$4], $5
+    vst1.64 {$3}, [$4], $5
+//  }
 .endm
 #else
 .macro LOAD_ALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
-//	{	//	input: \arg0~\arg3, src*, src_stride
-    vld1.64	{\arg0}, [\arg4,:128], \arg5
-    vld1.64	{\arg1}, [\arg4,:128], \arg5
-    vld1.64	{\arg2}, [\arg4,:128], \arg5
-    vld1.64	{\arg3}, [\arg4,:128], \arg5
-//	}
+//  {   //  input: \arg0~\arg3, src*, src_stride
+    vld1.64 {\arg0}, [\arg4,:128], \arg5
+    vld1.64 {\arg1}, [\arg4,:128], \arg5
+    vld1.64 {\arg2}, [\arg4,:128], \arg5
+    vld1.64 {\arg3}, [\arg4,:128], \arg5
+//  }
 .endm
 
 .macro STORE_ALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
-//	{	//	input: \arg0~\arg3, dst*, dst_stride
-    vst1.64	{\arg0}, [\arg4,:128], \arg5
-    vst1.64	{\arg1}, [\arg4,:128], \arg5
-    vst1.64	{\arg2}, [\arg4,:128], \arg5
-    vst1.64	{\arg3}, [\arg4,:128], \arg5
-//	}
+//  {   //  input: \arg0~\arg3, dst*, dst_stride
+    vst1.64 {\arg0}, [\arg4,:128], \arg5
+    vst1.64 {\arg1}, [\arg4,:128], \arg5
+    vst1.64 {\arg2}, [\arg4,:128], \arg5
+    vst1.64 {\arg3}, [\arg4,:128], \arg5
+//  }
 .endm
 
 .macro LOAD_UNALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
-//	{	//	input: \arg0~\arg3, src*, src_stride
-    vld1.64	{\arg0}, [\arg4], \arg5
-    vld1.64	{\arg1}, [\arg4], \arg5
-    vld1.64	{\arg2}, [\arg4], \arg5
-    vld1.64	{\arg3}, [\arg4], \arg5
-//	}
+//  {   //  input: \arg0~\arg3, src*, src_stride
+    vld1.64 {\arg0}, [\arg4], \arg5
+    vld1.64 {\arg1}, [\arg4], \arg5
+    vld1.64 {\arg2}, [\arg4], \arg5
+    vld1.64 {\arg3}, [\arg4], \arg5
+//  }
 .endm
 
 .macro STORE_UNALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
-//	{	//	input: \arg0~\arg3, dst*, dst_stride
-    vst1.64	{\arg0}, [\arg4], \arg5
-    vst1.64	{\arg1}, [\arg4], \arg5
-    vst1.64	{\arg2}, [\arg4], \arg5
-    vst1.64	{\arg3}, [\arg4], \arg5
-//	}
+//  {   //  input: \arg0~\arg3, dst*, dst_stride
+    vst1.64 {\arg0}, [\arg4], \arg5
+    vst1.64 {\arg1}, [\arg4], \arg5
+    vst1.64 {\arg2}, [\arg4], \arg5
+    vst1.64 {\arg3}, [\arg4], \arg5
+//  }
 .endm
 
 #endif
@@ -112,13 +112,13 @@
 
 WELS_ASM_FUNC_BEGIN WelsCopy8x8_neon
 
-	LOAD_UNALIGNED_DATA_WITH_STRIDE	d0, d1, d2, d3, r2, r3
+    LOAD_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r2, r3
 
-	STORE_UNALIGNED_DATA_WITH_STRIDE	d0, d1, d2, d3, r0, r1
+    STORE_UNALIGNED_DATA_WITH_STRIDE    d0, d1, d2, d3, r0, r1
 
-	LOAD_UNALIGNED_DATA_WITH_STRIDE	d4, d5, d6, d7, r2, r3
+    LOAD_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r2, r3
 
-	STORE_UNALIGNED_DATA_WITH_STRIDE	d4, d5, d6, d7, r0, r1
+    STORE_UNALIGNED_DATA_WITH_STRIDE    d4, d5, d6, d7, r0, r1
 
 WELS_ASM_FUNC_END
 
@@ -125,21 +125,21 @@
 
 WELS_ASM_FUNC_BEGIN WelsCopy16x16_neon
 
-	LOAD_ALIGNED_DATA_WITH_STRIDE	q0, q1, q2, q3, r2, r3
+    LOAD_ALIGNED_DATA_WITH_STRIDE   q0, q1, q2, q3, r2, r3
 
-	STORE_ALIGNED_DATA_WITH_STRIDE	q0, q1, q2, q3, r0, r1
+    STORE_ALIGNED_DATA_WITH_STRIDE  q0, q1, q2, q3, r0, r1
 
-	LOAD_ALIGNED_DATA_WITH_STRIDE	q8, q9, q10, q11, r2, r3
+    LOAD_ALIGNED_DATA_WITH_STRIDE   q8, q9, q10, q11, r2, r3
 
-	STORE_ALIGNED_DATA_WITH_STRIDE	q8, q9, q10, q11, r0, r1
+    STORE_ALIGNED_DATA_WITH_STRIDE  q8, q9, q10, q11, r0, r1
 
-	LOAD_ALIGNED_DATA_WITH_STRIDE	q0, q1, q2, q3, r2, r3
+    LOAD_ALIGNED_DATA_WITH_STRIDE   q0, q1, q2, q3, r2, r3
 
-	STORE_ALIGNED_DATA_WITH_STRIDE	q0, q1, q2, q3, r0, r1
+    STORE_ALIGNED_DATA_WITH_STRIDE  q0, q1, q2, q3, r0, r1
 
-	LOAD_ALIGNED_DATA_WITH_STRIDE	q8, q9, q10, q11, r2, r3
+    LOAD_ALIGNED_DATA_WITH_STRIDE   q8, q9, q10, q11, r2, r3
 
-	STORE_ALIGNED_DATA_WITH_STRIDE	q8, q9, q10, q11, r0, r1
+    STORE_ALIGNED_DATA_WITH_STRIDE  q8, q9, q10, q11, r0, r1
 
 WELS_ASM_FUNC_END
 
@@ -146,21 +146,21 @@
 
 WELS_ASM_FUNC_BEGIN WelsCopy16x16NotAligned_neon
 
-	LOAD_UNALIGNED_DATA_WITH_STRIDE	q0, q1, q2, q3, r2, r3
+    LOAD_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3
 
-	STORE_UNALIGNED_DATA_WITH_STRIDE	q0, q1, q2, q3, r0, r1
+    STORE_UNALIGNED_DATA_WITH_STRIDE    q0, q1, q2, q3, r0, r1
 
-	LOAD_UNALIGNED_DATA_WITH_STRIDE	q8, q9, q10, q11, r2, r3
+    LOAD_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r2, r3
 
-	STORE_UNALIGNED_DATA_WITH_STRIDE	q8, q9, q10, q11, r0, r1
+    STORE_UNALIGNED_DATA_WITH_STRIDE    q8, q9, q10, q11, r0, r1
 
-	LOAD_UNALIGNED_DATA_WITH_STRIDE	q0, q1, q2, q3, r2, r3
+    LOAD_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3
 
-	STORE_UNALIGNED_DATA_WITH_STRIDE	q0, q1, q2, q3, r0, r1
+    STORE_UNALIGNED_DATA_WITH_STRIDE    q0, q1, q2, q3, r0, r1
 
-	LOAD_UNALIGNED_DATA_WITH_STRIDE	q8, q9, q10, q11, r2, r3
+    LOAD_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r2, r3
 
-	STORE_UNALIGNED_DATA_WITH_STRIDE	q8, q9, q10, q11, r0, r1
+    STORE_UNALIGNED_DATA_WITH_STRIDE    q8, q9, q10, q11, r0, r1
 
 WELS_ASM_FUNC_END
 
@@ -167,13 +167,13 @@
 
 WELS_ASM_FUNC_BEGIN WelsCopy16x8NotAligned_neon
 
-	LOAD_UNALIGNED_DATA_WITH_STRIDE	q0, q1, q2, q3, r2, r3
+    LOAD_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3
 
-	STORE_UNALIGNED_DATA_WITH_STRIDE	q0, q1, q2, q3, r0, r1
+    STORE_UNALIGNED_DATA_WITH_STRIDE    q0, q1, q2, q3, r0, r1
 
-	LOAD_UNALIGNED_DATA_WITH_STRIDE	q8, q9, q10, q11, r2, r3
+    LOAD_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r2, r3
 
-	STORE_UNALIGNED_DATA_WITH_STRIDE	q8, q9, q10, q11, r0, r1
+    STORE_UNALIGNED_DATA_WITH_STRIDE    q8, q9, q10, q11, r0, r1
 
 WELS_ASM_FUNC_END
 
@@ -180,21 +180,21 @@
 
 WELS_ASM_FUNC_BEGIN WelsCopy8x16_neon
 
-	LOAD_UNALIGNED_DATA_WITH_STRIDE	d0, d1, d2, d3, r2, r3
+    LOAD_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r2, r3
 
-	STORE_UNALIGNED_DATA_WITH_STRIDE	d0, d1, d2, d3, r0, r1
+    STORE_UNALIGNED_DATA_WITH_STRIDE    d0, d1, d2, d3, r0, r1
 
-	LOAD_UNALIGNED_DATA_WITH_STRIDE	d4, d5, d6, d7, r2, r3
+    LOAD_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r2, r3
 
-	STORE_UNALIGNED_DATA_WITH_STRIDE	d4, d5, d6, d7, r0, r1
+    STORE_UNALIGNED_DATA_WITH_STRIDE    d4, d5, d6, d7, r0, r1
 
-	LOAD_UNALIGNED_DATA_WITH_STRIDE	d0, d1, d2, d3, r2, r3
+    LOAD_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r2, r3
 
-	STORE_UNALIGNED_DATA_WITH_STRIDE	d0, d1, d2, d3, r0, r1
+    STORE_UNALIGNED_DATA_WITH_STRIDE    d0, d1, d2, d3, r0, r1
 
-	LOAD_UNALIGNED_DATA_WITH_STRIDE	d4, d5, d6, d7, r2, r3
+    LOAD_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r2, r3
 
-	STORE_UNALIGNED_DATA_WITH_STRIDE	d4, d5, d6, d7, r0, r1
+    STORE_UNALIGNED_DATA_WITH_STRIDE    d4, d5, d6, d7, r0, r1
 
 WELS_ASM_FUNC_END
 
--- a/codec/common/arm/deblocking_neon.S
+++ b/codec/common/arm/deblocking_neon.S
@@ -37,814 +37,814 @@
 
 #ifdef __APPLE__
 .macro JMP_IF_128BITS_IS_ZERO
-    vorr.s16	$2, $0, $1
-    vmov		r3, r2, $2
-    orr			r3, r3, r2
-    cmp			r3, #0
+    vorr.s16    $2, $0, $1
+    vmov        r3, r2, $2
+    orr         r3, r3, r2
+    cmp         r3, #0
 .endm
 
 .macro MASK_MATRIX
-    vabd.u8	$6, $1, $2
-    vcgt.u8	$6, $4, $6
+    vabd.u8 $6, $1, $2
+    vcgt.u8 $6, $4, $6
 
-    vabd.u8	$4, $0, $1
-    vclt.u8	$4, $4, $5
-    vand.u8	$6, $6, $4
+    vabd.u8 $4, $0, $1
+    vclt.u8 $4, $4, $5
+    vand.u8 $6, $6, $4
 
-    vabd.u8	$4, $3, $2
-    vclt.u8	$4, $4, $5
-    vand.u8	$6, $6, $4
+    vabd.u8 $4, $3, $2
+    vclt.u8 $4, $4, $5
+    vand.u8 $6, $6, $4
 .endm
 
 
 .macro DIFF_LUMA_LT4_P1_Q1
     vmov.i8 $9, #128
-    vrhadd.u8	$8, $2, $3
-    vhadd.u8	$8, $0, $8
-    vsub.s8	$8, $8, $9
-    vsub.s8	$9, $1, $9
-    vqsub.s8	$8, $8, $9
-    vmax.s8	$8, $8, $5
-    vmin.s8	$8, $8, $6
-    vabd.u8	$9, $0, $2
-    vclt.u8	$9, $9, $4
-    vand.s8	$8, $8, $9
-    vand.s8	$8, $8, $7
-    vadd.u8	$8, $1, $8
-    vabs.s8	$9, $9
+    vrhadd.u8   $8, $2, $3
+    vhadd.u8    $8, $0, $8
+    vsub.s8 $8, $8, $9
+    vsub.s8 $9, $1, $9
+    vqsub.s8    $8, $8, $9
+    vmax.s8 $8, $8, $5
+    vmin.s8 $8, $8, $6
+    vabd.u8 $9, $0, $2
+    vclt.u8 $9, $9, $4
+    vand.s8 $8, $8, $9
+    vand.s8 $8, $8, $7
+    vadd.u8 $8, $1, $8
+    vabs.s8 $9, $9
 .endm
 
 .macro DIFF_LUMA_LT4_P0_Q0
-    vsubl.u8	$5, $0, $3
-    vsubl.u8	$6, $2, $1
-    vshl.s16	$6, $6, #2
-    vadd.s16	$5, $5, $6
-    vqrshrn.s16		$4, $5, #3
+    vsubl.u8    $5, $0, $3
+    vsubl.u8    $6, $2, $1
+    vshl.s16    $6, $6, #2
+    vadd.s16    $5, $5, $6
+    vqrshrn.s16     $4, $5, #3
 .endm
 
 .macro DIFF_LUMA_EQ4_P2P1P0
-    vaddl.u8	q4, $1, $2
-    vaddl.u8	q5, $3, $4
-    vadd.u16	q5, q4, q5
+    vaddl.u8    q4, $1, $2
+    vaddl.u8    q5, $3, $4
+    vadd.u16    q5, q4, q5
 
-    vaddl.u8	q4, $0, $1
-    vshl.u16	q4, q4, #1
-    vadd.u16	q4, q5, q4
+    vaddl.u8    q4, $0, $1
+    vshl.u16    q4, q4, #1
+    vadd.u16    q4, q5, q4
 
-    vrshrn.u16		$0, q5, #2
-    vrshrn.u16		$7, q4, #3
+    vrshrn.u16      $0, q5, #2
+    vrshrn.u16      $7, q4, #3
 
-    vshl.u16	q5, q5, #1
-    vsubl.u8	q4, $5, $1
-    vadd.u16	q5, q4,q5
+    vshl.u16    q5, q5, #1
+    vsubl.u8    q4, $5, $1
+    vadd.u16    q5, q4,q5
 
-    vaddl.u8	q4, $2, $5
-    vaddw.u8	q4, q4, $2
-    vaddw.u8	q4, q4, $3
+    vaddl.u8    q4, $2, $5
+    vaddw.u8    q4, q4, $2
+    vaddw.u8    q4, q4, $3
 
-    vrshrn.u16		d10,q5, #3
-    vrshrn.u16		d8, q4, #2
-    vbsl.u8		$6, d10, d8
+    vrshrn.u16      d10,q5, #3
+    vrshrn.u16      d8, q4, #2
+    vbsl.u8     $6, d10, d8
 .endm
 
 .macro DIFF_LUMA_EQ4_MASK
-    vmov	$3, $2
-    vbsl.u8	$3, $0, $1
+    vmov    $3, $2
+    vbsl.u8 $3, $0, $1
 .endm
 
 .macro DIFF_CHROMA_EQ4_P0Q0
-    vaddl.u8	$4, $0, $3
-    vaddw.u8	$5, $4, $1
-    vaddw.u8	$6, $4, $2
-    vaddw.u8	$5, $5, $0
+    vaddl.u8    $4, $0, $3
+    vaddw.u8    $5, $4, $1
+    vaddw.u8    $6, $4, $2
+    vaddw.u8    $5, $5, $0
 
-    vaddw.u8	$6, $6, $3
-    vrshrn.u16		$7, $5, #2
-    vrshrn.u16		$8, $6, #2
+    vaddw.u8    $6, $6, $3
+    vrshrn.u16      $7, $5, #2
+    vrshrn.u16      $8, $6, #2
 .endm
 
 .macro LOAD_CHROMA_DATA_4
-    vld4.u8	{$0[$8],$1[$8],$2[$8],$3[$8]}, [r0], r2
-    vld4.u8	{$4[$8],$5[$8],$6[$8],$7[$8]}, [r1], r2
+    vld4.u8 {$0[$8],$1[$8],$2[$8],$3[$8]}, [r0], r2
+    vld4.u8 {$4[$8],$5[$8],$6[$8],$7[$8]}, [r1], r2
 .endm
 
 .macro STORE_CHROMA_DATA_4
-    vst4.u8	{$0[$8],$1[$8],$2[$8],$3[$8]}, [r0], r2
-    vst4.u8	{$4[$8],$5[$8],$6[$8],$7[$8]}, [r1], r2
+    vst4.u8 {$0[$8],$1[$8],$2[$8],$3[$8]}, [r0], r2
+    vst4.u8 {$4[$8],$5[$8],$6[$8],$7[$8]}, [r1], r2
 .endm
 
 .macro LOAD_LUMA_DATA_3
-    vld3.u8	{$0[$6],$1[$6],$2[$6]}, [r2], r1
-    vld3.u8	{$3[$6],$4[$6],$5[$6]}, [r0], r1
+    vld3.u8 {$0[$6],$1[$6],$2[$6]}, [r2], r1
+    vld3.u8 {$3[$6],$4[$6],$5[$6]}, [r0], r1
 .endm
 
 .macro STORE_LUMA_DATA_4
-    vst4.u8	{$0[$4],$1[$4],$2[$4],$3[$4]}, [r0], r1
-    vst4.u8	{$0[$5],$1[$5],$2[$5],$3[$5]}, [r2], r1
+    vst4.u8 {$0[$4],$1[$4],$2[$4],$3[$4]}, [r0], r1
+    vst4.u8 {$0[$5],$1[$5],$2[$5],$3[$5]}, [r2], r1
 .endm
 
 .macro STORE_LUMA_DATA_3
-    vst3.u8	{$0[$6],$1[$6],$2[$6]}, [r3], r1
-    vst3.u8	{$3[$6],$4[$6],$5[$6]}, [r0], r1
+    vst3.u8 {$0[$6],$1[$6],$2[$6]}, [r3], r1
+    vst3.u8 {$3[$6],$4[$6],$5[$6]}, [r0], r1
 .endm
 
 .macro EXTRACT_DELTA_INTO_TWO_PART
-    vcge.s8	$1, $0, #0
-    vand	$1, $0, $1
-    vsub.s8	$0, $1, $0
+    vcge.s8 $1, $0, #0
+    vand    $1, $0, $1
+    vsub.s8 $0, $1, $0
 .endm
 #else
 .macro JMP_IF_128BITS_IS_ZERO arg0, arg1, arg2
-    vorr.s16	\arg2, \arg0, \arg1
-    vmov		r3, r2, \arg2
-    orr			r3, r3, r2
-    cmp			r3, #0
+    vorr.s16    \arg2, \arg0, \arg1
+    vmov        r3, r2, \arg2
+    orr         r3, r3, r2
+    cmp         r3, #0
 .endm
 
 .macro MASK_MATRIX arg0, arg1, arg2, arg3, arg4, arg5, arg6
-    vabd.u8	\arg6, \arg1, \arg2
-    vcgt.u8	\arg6, \arg4, \arg6
+    vabd.u8 \arg6, \arg1, \arg2
+    vcgt.u8 \arg6, \arg4, \arg6
 
-    vabd.u8	\arg4, \arg0, \arg1
-    vclt.u8	\arg4, \arg4, \arg5
-    vand.u8	\arg6, \arg6, \arg4
+    vabd.u8 \arg4, \arg0, \arg1
+    vclt.u8 \arg4, \arg4, \arg5
+    vand.u8 \arg6, \arg6, \arg4
 
-    vabd.u8	\arg4, \arg3, \arg2
-    vclt.u8	\arg4, \arg4, \arg5
-    vand.u8	\arg6, \arg6, \arg4
+    vabd.u8 \arg4, \arg3, \arg2
+    vclt.u8 \arg4, \arg4, \arg5
+    vand.u8 \arg6, \arg6, \arg4
 .endm
 
 .macro DIFF_LUMA_LT4_P1_Q1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
     vmov.i8 \arg9, #128
-    vrhadd.u8	\arg8, \arg2, \arg3
-    vhadd.u8	\arg8, \arg0, \arg8
-    vsub.s8	\arg8, \arg8, \arg9
-    vsub.s8	\arg9, \arg1, \arg9
+    vrhadd.u8   \arg8, \arg2, \arg3
+    vhadd.u8    \arg8, \arg0, \arg8
+    vsub.s8 \arg8, \arg8, \arg9
+    vsub.s8 \arg9, \arg1, \arg9
     vqsub.s8    \arg8, \arg8, \arg9
-    vmax.s8	\arg8, \arg8, \arg5
-    vmin.s8	\arg8, \arg8, \arg6
-    vabd.u8	\arg9, \arg0, \arg2
-    vclt.u8	\arg9, \arg9, \arg4
-    vand.s8	\arg8, \arg8, \arg9
-    vand.s8	\arg8, \arg8, \arg7
-    vadd.u8	\arg8, \arg1, \arg8
-    vabs.s8	\arg9, \arg9
+    vmax.s8 \arg8, \arg8, \arg5
+    vmin.s8 \arg8, \arg8, \arg6
+    vabd.u8 \arg9, \arg0, \arg2
+    vclt.u8 \arg9, \arg9, \arg4
+    vand.s8 \arg8, \arg8, \arg9
+    vand.s8 \arg8, \arg8, \arg7
+    vadd.u8 \arg8, \arg1, \arg8
+    vabs.s8 \arg9, \arg9
 .endm
 
 .macro DIFF_LUMA_LT4_P0_Q0 arg0, arg1, arg2, arg3, arg4, arg5, arg6
-    vsubl.u8	\arg5, \arg0, \arg3
-    vsubl.u8	\arg6, \arg2, \arg1
-    vshl.s16	\arg6, \arg6, #2
-    vadd.s16	\arg5, \arg5, \arg6
-    vqrshrn.s16		\arg4, \arg5, #3
+    vsubl.u8    \arg5, \arg0, \arg3
+    vsubl.u8    \arg6, \arg2, \arg1
+    vshl.s16    \arg6, \arg6, #2
+    vadd.s16    \arg5, \arg5, \arg6
+    vqrshrn.s16     \arg4, \arg5, #3
 .endm
 
 
 .macro DIFF_LUMA_EQ4_P2P1P0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
-    vaddl.u8	q4, \arg1, \arg2
-    vaddl.u8	q5, \arg3, \arg4
-    vadd.u16	q5, q4, q5
+    vaddl.u8    q4, \arg1, \arg2
+    vaddl.u8    q5, \arg3, \arg4
+    vadd.u16    q5, q4, q5
 
-    vaddl.u8	q4, \arg0, \arg1
-    vshl.u16	q4, q4, #1
-    vadd.u16	q4, q5, q4
+    vaddl.u8    q4, \arg0, \arg1
+    vshl.u16    q4, q4, #1
+    vadd.u16    q4, q5, q4
 
-    vrshrn.u16		\arg0, q5, #2
-    vrshrn.u16		\arg7, q4, #3
+    vrshrn.u16      \arg0, q5, #2
+    vrshrn.u16      \arg7, q4, #3
 
-    vshl.u16	q5, q5, #1
-    vsubl.u8	q4, \arg5, \arg1
-    vadd.u16	q5, q4,q5
+    vshl.u16    q5, q5, #1
+    vsubl.u8    q4, \arg5, \arg1
+    vadd.u16    q5, q4,q5
 
-    vaddl.u8	q4, \arg2, \arg5
-    vaddw.u8	q4, q4, \arg2
-    vaddw.u8	q4, q4, \arg3
+    vaddl.u8    q4, \arg2, \arg5
+    vaddw.u8    q4, q4, \arg2
+    vaddw.u8    q4, q4, \arg3
 
-    vrshrn.u16		d10,q5, #3
-    vrshrn.u16		d8, q4, #2
-    vbsl.u8		\arg6, d10, d8
+    vrshrn.u16      d10,q5, #3
+    vrshrn.u16      d8, q4, #2
+    vbsl.u8     \arg6, d10, d8
 .endm
 
 .macro DIFF_LUMA_EQ4_MASK arg0, arg1, arg2, arg3
-    vmov	\arg3, \arg2
-    vbsl.u8	\arg3, \arg0, \arg1
+    vmov    \arg3, \arg2
+    vbsl.u8 \arg3, \arg0, \arg1
 .endm
 
 .macro DIFF_CHROMA_EQ4_P0Q0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-    vaddl.u8	\arg4, \arg0, \arg3
-    vaddw.u8	\arg5, \arg4, \arg1
-    vaddw.u8	\arg6, \arg4, \arg2
-    vaddw.u8	\arg5, \arg5, \arg0
-    vaddw.u8	\arg6, \arg6, \arg3
-    vrshrn.u16		\arg7, \arg5, #2
-    vrshrn.u16		\arg8, \arg6, #2
+    vaddl.u8    \arg4, \arg0, \arg3
+    vaddw.u8    \arg5, \arg4, \arg1
+    vaddw.u8    \arg6, \arg4, \arg2
+    vaddw.u8    \arg5, \arg5, \arg0
+    vaddw.u8    \arg6, \arg6, \arg3
+    vrshrn.u16      \arg7, \arg5, #2
+    vrshrn.u16      \arg8, \arg6, #2
 .endm
 
 .macro LOAD_CHROMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-    vld4.u8	{\arg0[\arg8],\arg1[\arg8],\arg2[\arg8],\arg3[\arg8]}, [r0], r2
-    vld4.u8	{\arg4[\arg8],\arg5[\arg8],\arg6[\arg8],\arg7[\arg8]}, [r1], r2
+    vld4.u8 {\arg0[\arg8],\arg1[\arg8],\arg2[\arg8],\arg3[\arg8]}, [r0], r2
+    vld4.u8 {\arg4[\arg8],\arg5[\arg8],\arg6[\arg8],\arg7[\arg8]}, [r1], r2
 .endm
 
 .macro STORE_CHROMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-    vst4.u8	{\arg0[\arg8],\arg1[\arg8],\arg2[\arg8],\arg3[\arg8]}, [r0], r2
-    vst4.u8	{\arg4[\arg8],\arg5[\arg8],\arg6[\arg8],\arg7[\arg8]}, [r1], r2
+    vst4.u8 {\arg0[\arg8],\arg1[\arg8],\arg2[\arg8],\arg3[\arg8]}, [r0], r2
+    vst4.u8 {\arg4[\arg8],\arg5[\arg8],\arg6[\arg8],\arg7[\arg8]}, [r1], r2
 .endm
 
 .macro LOAD_LUMA_DATA_3 arg0, arg1, arg2, arg3, arg4, arg5, arg6
-    vld3.u8	{\arg0[\arg6],\arg1[\arg6],\arg2[\arg6]}, [r2], r1
-    vld3.u8	{\arg3[\arg6],\arg4[\arg6],\arg5[\arg6]}, [r0], r1
+    vld3.u8 {\arg0[\arg6],\arg1[\arg6],\arg2[\arg6]}, [r2], r1
+    vld3.u8 {\arg3[\arg6],\arg4[\arg6],\arg5[\arg6]}, [r0], r1
 .endm
 
 .macro STORE_LUMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5
-    vst4.u8	{\arg0[\arg4],\arg1[\arg4],\arg2[\arg4],\arg3[\arg4]}, [r0], r1
-    vst4.u8	{\arg0[\arg5],\arg1[\arg5],\arg2[\arg5],\arg3[\arg5]}, [r2], r1
+    vst4.u8 {\arg0[\arg4],\arg1[\arg4],\arg2[\arg4],\arg3[\arg4]}, [r0], r1
+    vst4.u8 {\arg0[\arg5],\arg1[\arg5],\arg2[\arg5],\arg3[\arg5]}, [r2], r1
 .endm
 
 .macro STORE_LUMA_DATA_3 arg0, arg1, arg2, arg3, arg4, arg5, arg6
-    vst3.u8	{\arg0[\arg6],\arg1[\arg6],\arg2[\arg6]}, [r3], r1
-    vst3.u8	{\arg3[\arg6],\arg4[\arg6],\arg5[\arg6]}, [r0], r1
+    vst3.u8 {\arg0[\arg6],\arg1[\arg6],\arg2[\arg6]}, [r3], r1
+    vst3.u8 {\arg3[\arg6],\arg4[\arg6],\arg5[\arg6]}, [r0], r1
 .endm
 
 .macro EXTRACT_DELTA_INTO_TWO_PART arg0, arg1
-    vcge.s8	\arg1, \arg0, #0
-    vand	\arg1, \arg0, \arg1
-    vsub.s8	\arg0, \arg1, \arg0
+    vcge.s8 \arg1, \arg0, #0
+    vand    \arg1, \arg0, \arg1
+    vsub.s8 \arg0, \arg1, \arg0
 .endm
 #endif
 
 WELS_ASM_FUNC_BEGIN DeblockLumaLt4V_neon
-    vpush	{q4-q7}
-    vdup.u8	q11, r2
-    vdup.u8	q9, r3
+    vpush   {q4-q7}
+    vdup.u8 q11, r2
+    vdup.u8 q9, r3
 
-    add			r2, r1, r1, lsl #1
-    sub			r2, r0, r2
-    vld1.u8	{q0}, [r2], r1
-    vld1.u8	{q3}, [r0], r1
-    vld1.u8	{q1}, [r2], r1
-    vld1.u8	{q4}, [r0], r1
-    vld1.u8	{q2}, [r2]
-    vld1.u8	{q5}, [r0]
-    sub			r2, r2, r1
+    add         r2, r1, r1, lsl #1
+    sub         r2, r0, r2
+    vld1.u8 {q0}, [r2], r1
+    vld1.u8 {q3}, [r0], r1
+    vld1.u8 {q1}, [r2], r1
+    vld1.u8 {q4}, [r0], r1
+    vld1.u8 {q2}, [r2]
+    vld1.u8 {q5}, [r0]
+    sub         r2, r2, r1
 
-    ldr			r3, [sp, #64]
-    vld1.s8	{d31}, [r3]
-    vdup.s8	d28, d31[0]
-    vdup.s8	d30, d31[1]
-    vdup.s8	d29, d31[2]
-    vdup.s8	d31, d31[3]
-    vtrn.32	d28, d30
-    vtrn.32	d29, d31
-    vcge.s8	q10, q14, #0
+    ldr         r3, [sp, #64]
+    vld1.s8 {d31}, [r3]
+    vdup.s8 d28, d31[0]
+    vdup.s8 d30, d31[1]
+    vdup.s8 d29, d31[2]
+    vdup.s8 d31, d31[3]
+    vtrn.32 d28, d30
+    vtrn.32 d29, d31
+    vcge.s8 q10, q14, #0
 
-    MASK_MATRIX	q1, q2, q3, q4, q11, q9, q15
-    vand.u8	q10, q10, q15
+    MASK_MATRIX q1, q2, q3, q4, q11, q9, q15
+    vand.u8 q10, q10, q15
 
-    veor		q15, q15
-    vsub.i8	q15,q15,q14
+    veor        q15, q15
+    vsub.i8 q15,q15,q14
 
-    DIFF_LUMA_LT4_P1_Q1	q0, q1, q2, q3, q9, q15, q14, q10, q6, q12
-    vst1.u8	{q6}, [r2], r1
+    DIFF_LUMA_LT4_P1_Q1 q0, q1, q2, q3, q9, q15, q14, q10, q6, q12
+    vst1.u8 {q6}, [r2], r1
 
-    DIFF_LUMA_LT4_P1_Q1	q5, q4, q3, q2, q9, q15, q14, q10, q7, q13
+    DIFF_LUMA_LT4_P1_Q1 q5, q4, q3, q2, q9, q15, q14, q10, q7, q13
 
-    vabs.s8	q12, q12
-    vabs.s8	q13, q13
-    vadd.u8	q14,q14,q12
-    vadd.u8	q14,q14,q13
-    veor		q15, q15
-    vsub.i8	q15,q15,q14
+    vabs.s8 q12, q12
+    vabs.s8 q13, q13
+    vadd.u8 q14,q14,q12
+    vadd.u8 q14,q14,q13
+    veor        q15, q15
+    vsub.i8 q15,q15,q14
 
-    DIFF_LUMA_LT4_P0_Q0	d2, d4, d6, d8, d16, q12, q13
-    DIFF_LUMA_LT4_P0_Q0	d3, d5, d7, d9, d17, q12, q13
-    vmax.s8	q8, q8, q15
-    vmin.s8	q8, q8, q14
-    vand.s8	q8, q8, q10
-    EXTRACT_DELTA_INTO_TWO_PART	q8, q9
-    vqadd.u8	q2, q2, q9
-    vqsub.u8	q2, q2, q8
-    vst1.u8	{q2}, [r2], r1
-    vqsub.u8	q3, q3, q9
-    vqadd.u8	q3, q3, q8
-    vst1.u8	{q3}, [r2]	, r1
-    vst1.u8	{q7}, [r2]
+    DIFF_LUMA_LT4_P0_Q0 d2, d4, d6, d8, d16, q12, q13
+    DIFF_LUMA_LT4_P0_Q0 d3, d5, d7, d9, d17, q12, q13
+    vmax.s8 q8, q8, q15
+    vmin.s8 q8, q8, q14
+    vand.s8 q8, q8, q10
+    EXTRACT_DELTA_INTO_TWO_PART q8, q9
+    vqadd.u8    q2, q2, q9
+    vqsub.u8    q2, q2, q8
+    vst1.u8 {q2}, [r2], r1
+    vqsub.u8    q3, q3, q9
+    vqadd.u8    q3, q3, q8
+    vst1.u8 {q3}, [r2]  , r1
+    vst1.u8 {q7}, [r2]
 
-    vpop	{q4-q7}
+    vpop    {q4-q7}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN DeblockLumaEq4V_neon
-    vpush	{q4-q7}
+    vpush   {q4-q7}
 
-    vdup.u8	q5, r2
-    vdup.u8	q4, r3
+    vdup.u8 q5, r2
+    vdup.u8 q4, r3
 
-    sub			r3, r0, r1, lsl #2
-    vld1.u8	{q8},  [r3], r1
-    vld1.u8	{q12}, [r0], r1
-    vld1.u8	{q9},  [r3], r1
-    vld1.u8	{q13}, [r0], r1
-    vld1.u8	{q10}, [r3], r1
-    vld1.u8	{q14}, [r0], r1
-    vld1.u8	{q11}, [r3]
-    vld1.u8	{q15}, [r0]
-    sub			r3, r3, r1	, lsl #1
+    sub         r3, r0, r1, lsl #2
+    vld1.u8 {q8},  [r3], r1
+    vld1.u8 {q12}, [r0], r1
+    vld1.u8 {q9},  [r3], r1
+    vld1.u8 {q13}, [r0], r1
+    vld1.u8 {q10}, [r3], r1
+    vld1.u8 {q14}, [r0], r1
+    vld1.u8 {q11}, [r3]
+    vld1.u8 {q15}, [r0]
+    sub         r3, r3, r1  , lsl #1
 
-    MASK_MATRIX	q10, q11, q12, q13, q5, q4, q6
+    MASK_MATRIX q10, q11, q12, q13, q5, q4, q6
 
-    mov			r2, r2, lsr #2
-    add			r2, r2, #2
-    vdup.u8	q5, r2
-    vabd.u8	q0, q11, q12
-    vclt.u8	q7, q0, q5
+    mov         r2, r2, lsr #2
+    add         r2, r2, #2
+    vdup.u8 q5, r2
+    vabd.u8 q0, q11, q12
+    vclt.u8 q7, q0, q5
 
-    vabd.u8	q1, q9, q11
-    vclt.u8	q1, q1, q4
-    vand.s8	q1, q1, q7
+    vabd.u8 q1, q9, q11
+    vclt.u8 q1, q1, q4
+    vand.s8 q1, q1, q7
 
-    vabd.u8	q2, q14,q12
-    vclt.u8	q2, q2, q4
-    vand.s8	q2, q2, q7
-    vand.u8	q7, q7, q6
+    vabd.u8 q2, q14,q12
+    vclt.u8 q2, q2, q4
+    vand.s8 q2, q2, q7
+    vand.u8 q7, q7, q6
 
-    vmov		q3, q1
+    vmov        q3, q1
 
-    DIFF_LUMA_EQ4_P2P1P0		d16, d18, d20, d22, d24, d26, d2, d0
-    DIFF_LUMA_EQ4_P2P1P0		d17, d19, d21, d23, d25, d27, d3, d1
+    DIFF_LUMA_EQ4_P2P1P0        d16, d18, d20, d22, d24, d26, d2, d0
+    DIFF_LUMA_EQ4_P2P1P0        d17, d19, d21, d23, d25, d27, d3, d1
 
-    vand.u8	q3, q7, q3
-    DIFF_LUMA_EQ4_MASK	q0, q9, q3, q4
-    vst1.u8	{q4}, [r3], r1
-    DIFF_LUMA_EQ4_MASK	q8,q10, q3, q4
-    vst1.u8	{q4}, [r3], r1
-    DIFF_LUMA_EQ4_MASK	q1,q11, q6, q4
-    vst1.u8	{q4}, [r3], r1
+    vand.u8 q3, q7, q3
+    DIFF_LUMA_EQ4_MASK  q0, q9, q3, q4
+    vst1.u8 {q4}, [r3], r1
+    DIFF_LUMA_EQ4_MASK  q8,q10, q3, q4
+    vst1.u8 {q4}, [r3], r1
+    DIFF_LUMA_EQ4_MASK  q1,q11, q6, q4
+    vst1.u8 {q4}, [r3], r1
 
-    vmov		q0, q2
-    DIFF_LUMA_EQ4_P2P1P0		d30, d28, d26, d24, d22, d20, d4, d6
-    DIFF_LUMA_EQ4_P2P1P0		d31, d29, d27, d25, d23, d21, d5, d7
+    vmov        q0, q2
+    DIFF_LUMA_EQ4_P2P1P0        d30, d28, d26, d24, d22, d20, d4, d6
+    DIFF_LUMA_EQ4_P2P1P0        d31, d29, d27, d25, d23, d21, d5, d7
 
-    vand.u8	q0, q7, q0
-    DIFF_LUMA_EQ4_MASK	q2,  q12, q6, q4
-    vst1.u8	{q4}, [r3], r1
-    DIFF_LUMA_EQ4_MASK	q15, q13, q0, q4
-    vst1.u8	{q4}, [r3], r1
-    DIFF_LUMA_EQ4_MASK	q3,  q14, q0, q4
-    vst1.u8	{q4}, [r3], r1
+    vand.u8 q0, q7, q0
+    DIFF_LUMA_EQ4_MASK  q2,  q12, q6, q4
+    vst1.u8 {q4}, [r3], r1
+    DIFF_LUMA_EQ4_MASK  q15, q13, q0, q4
+    vst1.u8 {q4}, [r3], r1
+    DIFF_LUMA_EQ4_MASK  q3,  q14, q0, q4
+    vst1.u8 {q4}, [r3], r1
 
-    vpop	{q4-q7}
+    vpop    {q4-q7}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN DeblockLumaLt4H_neon
-    vpush	{q4-q7}
+    vpush   {q4-q7}
 
-    vdup.u8	q11, r2
-    vdup.u8	q9, r3
+    vdup.u8 q11, r2
+    vdup.u8 q9, r3
 
-    sub			r2, r0, #3
-    LOAD_LUMA_DATA_3		d0, d1, d2, d6, d7, d8, 0
-    LOAD_LUMA_DATA_3		d0, d1, d2, d6, d7, d8, 1
-    LOAD_LUMA_DATA_3		d0, d1, d2, d6, d7, d8, 2
-    LOAD_LUMA_DATA_3		d0, d1, d2, d6, d7, d8, 3
-    LOAD_LUMA_DATA_3		d0, d1, d2, d6, d7, d8, 4
-    LOAD_LUMA_DATA_3		d0, d1, d2, d6, d7, d8, 5
-    LOAD_LUMA_DATA_3		d0, d1, d2, d6, d7, d8, 6
-    LOAD_LUMA_DATA_3		d0, d1, d2, d6, d7, d8, 7
+    sub         r2, r0, #3
+    LOAD_LUMA_DATA_3        d0, d1, d2, d6, d7, d8, 0
+    LOAD_LUMA_DATA_3        d0, d1, d2, d6, d7, d8, 1
+    LOAD_LUMA_DATA_3        d0, d1, d2, d6, d7, d8, 2
+    LOAD_LUMA_DATA_3        d0, d1, d2, d6, d7, d8, 3
+    LOAD_LUMA_DATA_3        d0, d1, d2, d6, d7, d8, 4
+    LOAD_LUMA_DATA_3        d0, d1, d2, d6, d7, d8, 5
+    LOAD_LUMA_DATA_3        d0, d1, d2, d6, d7, d8, 6
+    LOAD_LUMA_DATA_3        d0, d1, d2, d6, d7, d8, 7
 
-    LOAD_LUMA_DATA_3		d3, d4, d5, d9, d10, d11, 0
-    LOAD_LUMA_DATA_3		d3, d4, d5, d9, d10, d11, 1
-    LOAD_LUMA_DATA_3		d3, d4, d5, d9, d10, d11, 2
-    LOAD_LUMA_DATA_3		d3, d4, d5, d9, d10, d11, 3
-    LOAD_LUMA_DATA_3		d3, d4, d5, d9, d10, d11, 4
-    LOAD_LUMA_DATA_3		d3, d4, d5, d9, d10, d11, 5
-    LOAD_LUMA_DATA_3		d3, d4, d5, d9, d10, d11, 6
-    LOAD_LUMA_DATA_3		d3, d4, d5, d9, d10, d11, 7
+    LOAD_LUMA_DATA_3        d3, d4, d5, d9, d10, d11, 0
+    LOAD_LUMA_DATA_3        d3, d4, d5, d9, d10, d11, 1
+    LOAD_LUMA_DATA_3        d3, d4, d5, d9, d10, d11, 2
+    LOAD_LUMA_DATA_3        d3, d4, d5, d9, d10, d11, 3
+    LOAD_LUMA_DATA_3        d3, d4, d5, d9, d10, d11, 4
+    LOAD_LUMA_DATA_3        d3, d4, d5, d9, d10, d11, 5
+    LOAD_LUMA_DATA_3        d3, d4, d5, d9, d10, d11, 6
+    LOAD_LUMA_DATA_3        d3, d4, d5, d9, d10, d11, 7
 
-    vswp		d1, d2
-    vswp		d3, d4
-    vswp		d1, d4
-    vswp		d7, d8
-    vswp		d9, d10
-    vswp		d7, d10
+    vswp        d1, d2
+    vswp        d3, d4
+    vswp        d1, d4
+    vswp        d7, d8
+    vswp        d9, d10
+    vswp        d7, d10
 
-    sub			r0, r0, r1, lsl #4
+    sub         r0, r0, r1, lsl #4
 
-    ldr			r3, [sp, #64]
-    vld1.s8	{d31}, [r3]
-    vdup.s8	d28, d31[0]
-    vdup.s8	d30, d31[1]
-    vdup.s8	d29, d31[2]
-    vdup.s8	d31, d31[3]
-    vtrn.32	d28, d30
-    vtrn.32	d29, d31
-    vcge.s8	q10, q14, #0
+    ldr         r3, [sp, #64]
+    vld1.s8 {d31}, [r3]
+    vdup.s8 d28, d31[0]
+    vdup.s8 d30, d31[1]
+    vdup.s8 d29, d31[2]
+    vdup.s8 d31, d31[3]
+    vtrn.32 d28, d30
+    vtrn.32 d29, d31
+    vcge.s8 q10, q14, #0
 
-    MASK_MATRIX	q1, q2, q3, q4, q11, q9, q15
-    vand.u8	q10, q10, q15
+    MASK_MATRIX q1, q2, q3, q4, q11, q9, q15
+    vand.u8 q10, q10, q15
 
-    veor		q15, q15
-    vsub.i8	q15,q15,q14
+    veor        q15, q15
+    vsub.i8 q15,q15,q14
 
-    DIFF_LUMA_LT4_P1_Q1	q0, q1, q2, q3, q9, q15, q14, q10, q6, q12
-    DIFF_LUMA_LT4_P1_Q1	q5, q4, q3, q2, q9, q15, q14, q10, q7, q13
+    DIFF_LUMA_LT4_P1_Q1 q0, q1, q2, q3, q9, q15, q14, q10, q6, q12
+    DIFF_LUMA_LT4_P1_Q1 q5, q4, q3, q2, q9, q15, q14, q10, q7, q13
 
-    vabs.s8	q12, q12
-    vabs.s8	q13, q13
-    vadd.u8	q14,q14,q12
-    vadd.u8	q14,q14,q13
-    veor		q15, q15
-    vsub.i8	q15,q15,q14
+    vabs.s8 q12, q12
+    vabs.s8 q13, q13
+    vadd.u8 q14,q14,q12
+    vadd.u8 q14,q14,q13
+    veor        q15, q15
+    vsub.i8 q15,q15,q14
 
-    DIFF_LUMA_LT4_P0_Q0	d2, d4, d6, d8, d16, q12, q13
-    DIFF_LUMA_LT4_P0_Q0	d3, d5, d7, d9, d17, q12, q13
-    vmax.s8	q8, q8, q15
-    vmin.s8	q8, q8, q14
-    vand.s8	q8, q8, q10
-    EXTRACT_DELTA_INTO_TWO_PART	q8, q9
-    vqadd.u8	q2, q2, q9
-    vqsub.u8	q2, q2, q8
+    DIFF_LUMA_LT4_P0_Q0 d2, d4, d6, d8, d16, q12, q13
+    DIFF_LUMA_LT4_P0_Q0 d3, d5, d7, d9, d17, q12, q13
+    vmax.s8 q8, q8, q15
+    vmin.s8 q8, q8, q14
+    vand.s8 q8, q8, q10
+    EXTRACT_DELTA_INTO_TWO_PART q8, q9
+    vqadd.u8    q2, q2, q9
+    vqsub.u8    q2, q2, q8
 
-    vqsub.u8	q3, q3, q9
-    vqadd.u8	q3, q3, q8
+    vqsub.u8    q3, q3, q9
+    vqadd.u8    q3, q3, q8
 
-    sub		r0, #2
-    add		r2, r0, r1
-    lsl		r1, #1
+    sub     r0, #2
+    add     r2, r0, r1
+    lsl     r1, #1
 
-    vmov		q1, q6
-    vmov		q4, q7
+    vmov        q1, q6
+    vmov        q4, q7
 
-    vswp		q2, q3
-    vswp		d3, d6
-    vswp		d5, d8
+    vswp        q2, q3
+    vswp        d3, d6
+    vswp        d5, d8
 
-    STORE_LUMA_DATA_4		d2, d3, d4, d5, 0, 1
-    STORE_LUMA_DATA_4		d2, d3, d4, d5, 2, 3
-    STORE_LUMA_DATA_4		d2, d3, d4, d5, 4, 5
-    STORE_LUMA_DATA_4		d2, d3, d4, d5, 6, 7
+    STORE_LUMA_DATA_4       d2, d3, d4, d5, 0, 1
+    STORE_LUMA_DATA_4       d2, d3, d4, d5, 2, 3
+    STORE_LUMA_DATA_4       d2, d3, d4, d5, 4, 5
+    STORE_LUMA_DATA_4       d2, d3, d4, d5, 6, 7
 
-    STORE_LUMA_DATA_4		d6, d7, d8, d9, 0, 1
-    STORE_LUMA_DATA_4		d6, d7, d8, d9, 2, 3
-    STORE_LUMA_DATA_4		d6, d7, d8, d9, 4, 5
-    STORE_LUMA_DATA_4		d6, d7, d8, d9, 6, 7
+    STORE_LUMA_DATA_4       d6, d7, d8, d9, 0, 1
+    STORE_LUMA_DATA_4       d6, d7, d8, d9, 2, 3
+    STORE_LUMA_DATA_4       d6, d7, d8, d9, 4, 5
+    STORE_LUMA_DATA_4       d6, d7, d8, d9, 6, 7
 
-    vpop	{q4-q7}
+    vpop    {q4-q7}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN DeblockLumaEq4H_neon
-    vpush	{q4-q7}
-    vdup.u8	q5, r2
-    vdup.u8	q4, r3
+    vpush   {q4-q7}
+    vdup.u8 q5, r2
+    vdup.u8 q4, r3
 
-    sub			r3, r0, #4				//	pix -= 4
+    sub         r3, r0, #4              //  pix -= 4
 
-    vld1.u8	{d16}, [r3], r1
-    vld1.u8	{d17}, [r3], r1
-    vld1.u8	{d18}, [r3], r1
-    vld1.u8	{d19}, [r3], r1
-    vld1.u8	{d20}, [r3], r1
-    vld1.u8	{d21}, [r3], r1
-    vld1.u8	{d22}, [r3], r1
-    vld1.u8	{d23}, [r3], r1
-    vld1.u8	{d24}, [r3], r1
-    vld1.u8	{d25}, [r3], r1
-    vld1.u8	{d26}, [r3], r1
-    vld1.u8	{d27}, [r3], r1
-    vld1.u8	{d28}, [r3], r1
-    vld1.u8	{d29}, [r3], r1
-    vld1.u8	{d30}, [r3], r1
-    vld1.u8	{d31}, [r3], r1
+    vld1.u8 {d16}, [r3], r1
+    vld1.u8 {d17}, [r3], r1
+    vld1.u8 {d18}, [r3], r1
+    vld1.u8 {d19}, [r3], r1
+    vld1.u8 {d20}, [r3], r1
+    vld1.u8 {d21}, [r3], r1
+    vld1.u8 {d22}, [r3], r1
+    vld1.u8 {d23}, [r3], r1
+    vld1.u8 {d24}, [r3], r1
+    vld1.u8 {d25}, [r3], r1
+    vld1.u8 {d26}, [r3], r1
+    vld1.u8 {d27}, [r3], r1
+    vld1.u8 {d28}, [r3], r1
+    vld1.u8 {d29}, [r3], r1
+    vld1.u8 {d30}, [r3], r1
+    vld1.u8 {d31}, [r3], r1
 
-    vtrn.u32	d16, d20
-    vtrn.u32	d17, d21
-    vtrn.u32	d18, d22
-    vtrn.u32	d19, d23
-    vtrn.u32	d24, d28
-    vtrn.u32	d25, d29
-    vtrn.u32	d26, d30
-    vtrn.u32	d27, d31
+    vtrn.u32    d16, d20
+    vtrn.u32    d17, d21
+    vtrn.u32    d18, d22
+    vtrn.u32    d19, d23
+    vtrn.u32    d24, d28
+    vtrn.u32    d25, d29
+    vtrn.u32    d26, d30
+    vtrn.u32    d27, d31
 
-    vtrn.u16	d16, d18
-    vtrn.u16	d17, d19
-    vtrn.u16	d20, d22
-    vtrn.u16	d21, d23
-    vtrn.u16	d24, d26
-    vtrn.u16	d25, d27
-    vtrn.u16	d28, d30
-    vtrn.u16	d29, d31
+    vtrn.u16    d16, d18
+    vtrn.u16    d17, d19
+    vtrn.u16    d20, d22
+    vtrn.u16    d21, d23
+    vtrn.u16    d24, d26
+    vtrn.u16    d25, d27
+    vtrn.u16    d28, d30
+    vtrn.u16    d29, d31
 
-    vtrn.u8	d16, d17
-    vtrn.u8	d18, d19
-    vtrn.u8	d20, d21
-    vtrn.u8	d22, d23
-    vtrn.u8	d24, d25
-    vtrn.u8	d26, d27
-    vtrn.u8	d28, d29
-    vtrn.u8	d30, d31
+    vtrn.u8 d16, d17
+    vtrn.u8 d18, d19
+    vtrn.u8 d20, d21
+    vtrn.u8 d22, d23
+    vtrn.u8 d24, d25
+    vtrn.u8 d26, d27
+    vtrn.u8 d28, d29
+    vtrn.u8 d30, d31
 
-    vswp	d17, d24
-    vswp	d19, d26
-    vswp	d21, d28
-    vswp	d23, d30
+    vswp    d17, d24
+    vswp    d19, d26
+    vswp    d21, d28
+    vswp    d23, d30
 
-    vswp	q12, q9
-    vswp	q14, q11
+    vswp    q12, q9
+    vswp    q14, q11
 
-    vswp	q12, q10
-    vswp	q13, q11
+    vswp    q12, q10
+    vswp    q13, q11
 
-    MASK_MATRIX	q10, q11, q12, q13, q5, q4, q6
+    MASK_MATRIX q10, q11, q12, q13, q5, q4, q6
 
-    mov			r2, r2, lsr #2
-    add			r2, r2, #2
-    vdup.u8	q5, r2
-    vabd.u8	q0, q11, q12
-    vclt.u8	q7, q0, q5
+    mov         r2, r2, lsr #2
+    add         r2, r2, #2
+    vdup.u8 q5, r2
+    vabd.u8 q0, q11, q12
+    vclt.u8 q7, q0, q5
 
-    vabd.u8	q1, q9, q11
-    vclt.u8	q1, q1, q4
-    vand.s8	q1, q1, q7
+    vabd.u8 q1, q9, q11
+    vclt.u8 q1, q1, q4
+    vand.s8 q1, q1, q7
 
-    vabd.u8	q2, q14,q12
-    vclt.u8	q2, q2, q4
-    vand.s8	q2, q2, q7
-    vand.u8	q7, q7, q6
+    vabd.u8 q2, q14,q12
+    vclt.u8 q2, q2, q4
+    vand.s8 q2, q2, q7
+    vand.u8 q7, q7, q6
 
-    vmov		q3, q1
+    vmov        q3, q1
 
-    DIFF_LUMA_EQ4_P2P1P0		d16, d18, d20, d22, d24, d26, d2, d0
-    DIFF_LUMA_EQ4_P2P1P0		d17, d19, d21, d23, d25, d27, d3, d1
+    DIFF_LUMA_EQ4_P2P1P0        d16, d18, d20, d22, d24, d26, d2, d0
+    DIFF_LUMA_EQ4_P2P1P0        d17, d19, d21, d23, d25, d27, d3, d1
 
-    vand.u8	q3, q7, q3
-    DIFF_LUMA_EQ4_MASK	q0, q9, q3, q4
-    vmov		q9, q4
-    vbsl.u8	q3, q8, q10
-    DIFF_LUMA_EQ4_MASK	q1,q11, q6, q8
+    vand.u8 q3, q7, q3
+    DIFF_LUMA_EQ4_MASK  q0, q9, q3, q4
+    vmov        q9, q4
+    vbsl.u8 q3, q8, q10
+    DIFF_LUMA_EQ4_MASK  q1,q11, q6, q8
 
-    vand.u8	q7, q7, q2
+    vand.u8 q7, q7, q2
 
-    DIFF_LUMA_EQ4_P2P1P0		d30, d28, d26, d24, d22, d20, d4, d0
-    DIFF_LUMA_EQ4_P2P1P0		d31, d29, d27, d25, d23, d21, d5, d1
+    DIFF_LUMA_EQ4_P2P1P0        d30, d28, d26, d24, d22, d20, d4, d0
+    DIFF_LUMA_EQ4_P2P1P0        d31, d29, d27, d25, d23, d21, d5, d1
 
-    vbsl.u8	q6, q2, q12
-    DIFF_LUMA_EQ4_MASK	q15, q13, q7, q4
+    vbsl.u8 q6, q2, q12
+    DIFF_LUMA_EQ4_MASK  q15, q13, q7, q4
 
-    vbsl.u8	q7, q0, q14
+    vbsl.u8 q7, q0, q14
 
-    vmov		q5, q6
-    vmov		q2, q9
-    vmov		q6, q4
-    vmov		q4, q8
+    vmov        q5, q6
+    vmov        q2, q9
+    vmov        q6, q4
+    vmov        q4, q8
 
-    vswp	d8, d6
-    vswp	d5, d7
-    vswp	d5, d8
-    vswp	d14, d12
-    vswp	d11, d13
-    vswp	d11, d14
+    vswp    d8, d6
+    vswp    d5, d7
+    vswp    d5, d8
+    vswp    d14, d12
+    vswp    d11, d13
+    vswp    d11, d14
 
-    sub		r3, r0, #3
-    STORE_LUMA_DATA_3		d4,d5,d6,d10,d11,d12,0
-    STORE_LUMA_DATA_3		d4,d5,d6,d10,d11,d12,1
-    STORE_LUMA_DATA_3		d4,d5,d6,d10,d11,d12,2
-    STORE_LUMA_DATA_3		d4,d5,d6,d10,d11,d12,3
-    STORE_LUMA_DATA_3		d4,d5,d6,d10,d11,d12,4
-    STORE_LUMA_DATA_3		d4,d5,d6,d10,d11,d12,5
-    STORE_LUMA_DATA_3		d4,d5,d6,d10,d11,d12,6
-    STORE_LUMA_DATA_3		d4,d5,d6,d10,d11,d12,7
+    sub     r3, r0, #3
+    STORE_LUMA_DATA_3       d4,d5,d6,d10,d11,d12,0
+    STORE_LUMA_DATA_3       d4,d5,d6,d10,d11,d12,1
+    STORE_LUMA_DATA_3       d4,d5,d6,d10,d11,d12,2
+    STORE_LUMA_DATA_3       d4,d5,d6,d10,d11,d12,3
+    STORE_LUMA_DATA_3       d4,d5,d6,d10,d11,d12,4
+    STORE_LUMA_DATA_3       d4,d5,d6,d10,d11,d12,5
+    STORE_LUMA_DATA_3       d4,d5,d6,d10,d11,d12,6
+    STORE_LUMA_DATA_3       d4,d5,d6,d10,d11,d12,7
 
-    STORE_LUMA_DATA_3		d7,d8,d9,d13,d14,d15,0
-    STORE_LUMA_DATA_3		d7,d8,d9,d13,d14,d15,1
-    STORE_LUMA_DATA_3		d7,d8,d9,d13,d14,d15,2
-    STORE_LUMA_DATA_3		d7,d8,d9,d13,d14,d15,3
-    STORE_LUMA_DATA_3		d7,d8,d9,d13,d14,d15,4
-    STORE_LUMA_DATA_3		d7,d8,d9,d13,d14,d15,5
-    STORE_LUMA_DATA_3		d7,d8,d9,d13,d14,d15,6
-    STORE_LUMA_DATA_3		d7,d8,d9,d13,d14,d15,7
+    STORE_LUMA_DATA_3       d7,d8,d9,d13,d14,d15,0
+    STORE_LUMA_DATA_3       d7,d8,d9,d13,d14,d15,1
+    STORE_LUMA_DATA_3       d7,d8,d9,d13,d14,d15,2
+    STORE_LUMA_DATA_3       d7,d8,d9,d13,d14,d15,3
+    STORE_LUMA_DATA_3       d7,d8,d9,d13,d14,d15,4
+    STORE_LUMA_DATA_3       d7,d8,d9,d13,d14,d15,5
+    STORE_LUMA_DATA_3       d7,d8,d9,d13,d14,d15,6
+    STORE_LUMA_DATA_3       d7,d8,d9,d13,d14,d15,7
 
-    vpop	{q4-q7}
+    vpop    {q4-q7}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN DeblockChromaLt4V_neon
-    vdup.u8	q11, r3
-    ldr			r3, [sp, #0]
+    vdup.u8 q11, r3
+    ldr         r3, [sp, #0]
 
-    sub			r0, r0, r2	, lsl #1
-    sub			r1, r1, r2, lsl #1
-    vdup.u8	    q9, r3
-    ldr			r3, [sp, #4]
+    sub         r0, r0, r2  , lsl #1
+    sub         r1, r1, r2, lsl #1
+    vdup.u8     q9, r3
+    ldr         r3, [sp, #4]
 
-    vld1.u8	{d0}, [r0], r2
-    vld1.u8	{d1}, [r1], r2
-    vld1.u8	{d2}, [r0], r2
-    vld1.u8	{d3}, [r1], r2
-    vld1.u8	{d4}, [r0], r2
-    vld1.u8	{d5}, [r1], r2
-    vld1.u8	{d6}, [r0]
-    vld1.u8	{d7}, [r1]
+    vld1.u8 {d0}, [r0], r2
+    vld1.u8 {d1}, [r1], r2
+    vld1.u8 {d2}, [r0], r2
+    vld1.u8 {d3}, [r1], r2
+    vld1.u8 {d4}, [r0], r2
+    vld1.u8 {d5}, [r1], r2
+    vld1.u8 {d6}, [r0]
+    vld1.u8 {d7}, [r1]
 
-    sub			r0, r0, r2, lsl #1
-    sub			r1, r1, r2, lsl #1
+    sub         r0, r0, r2, lsl #1
+    sub         r1, r1, r2, lsl #1
 
-    vld1.s8	{d31}, [r3]
-    vmovl.u8	q14,d31
-    vshl.u64	d29,d28,#8
-    vorr		d28,d29
-    vmov		d29, d28
-    veor		q15, q15
-    vsub.i8	q15,q15,q14
+    vld1.s8 {d31}, [r3]
+    vmovl.u8    q14,d31
+    vshl.u64    d29,d28,#8
+    vorr        d28,d29
+    vmov        d29, d28
+    veor        q15, q15
+    vsub.i8 q15,q15,q14
 
-    MASK_MATRIX	q0, q1, q2, q3, q11, q9, q10
+    MASK_MATRIX q0, q1, q2, q3, q11, q9, q10
 
-    DIFF_LUMA_LT4_P0_Q0	d0, d2, d4, d6, d16, q12, q13
-    DIFF_LUMA_LT4_P0_Q0	d1, d3, d5, d7, d17, q12, q13
-    vmax.s8	q8, q8, q15
-    vmin.s8	q8, q8, q14
+    DIFF_LUMA_LT4_P0_Q0 d0, d2, d4, d6, d16, q12, q13
+    DIFF_LUMA_LT4_P0_Q0 d1, d3, d5, d7, d17, q12, q13
+    vmax.s8 q8, q8, q15
+    vmin.s8 q8, q8, q14
 
-    vand.s8	q8, q8, q10
-    vcge.s8	q14, q14, #0
-    vand.s8	q8, q8, q14
-    EXTRACT_DELTA_INTO_TWO_PART	q8, q10
-    vqadd.u8	q1, q1, q10
-    vqsub.u8	q1, q1, q8
-    vst1.u8	{d2}, [r0], r2
-    vst1.u8	{d3}, [r1], r2
-    vqsub.u8	q2, q2, q10
-    vqadd.u8	q2, q2, q8
-    vst1.u8	{d4}, [r0]
-    vst1.u8	{d5}, [r1]
+    vand.s8 q8, q8, q10
+    vcge.s8 q14, q14, #0
+    vand.s8 q8, q8, q14
+    EXTRACT_DELTA_INTO_TWO_PART q8, q10
+    vqadd.u8    q1, q1, q10
+    vqsub.u8    q1, q1, q8
+    vst1.u8 {d2}, [r0], r2
+    vst1.u8 {d3}, [r1], r2
+    vqsub.u8    q2, q2, q10
+    vqadd.u8    q2, q2, q8
+    vst1.u8 {d4}, [r0]
+    vst1.u8 {d5}, [r1]
 
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN DeblockChromaEq4V_neon
-    vpush	{q4-q5}
+    vpush   {q4-q5}
 
-    vdup.u8	q11, r3
-    ldr			r3, [sp, #32]
+    vdup.u8 q11, r3
+    ldr         r3, [sp, #32]
 
-    sub			r0, r0, r2	, lsl #1
-    sub			r1, r1, r2, lsl #1
-    vdup.u8	q9, r3
-    vld1.u8	{d0}, [r0], r2		//	q0::p1
-    vld1.u8	{d1}, [r1], r2
-    vld1.u8	{d2}, [r0], r2		//	q1::p0
-    vld1.u8	{d3}, [r1], r2
-    vld1.u8	{d4}, [r0], r2		//	q2::q0
-    vld1.u8	{d5}, [r1], r2
-    vld1.u8	{d6}, [r0]				//	q3::q1
-    vld1.u8	{d7}, [r1]
+    sub         r0, r0, r2  , lsl #1
+    sub         r1, r1, r2, lsl #1
+    vdup.u8 q9, r3
+    vld1.u8 {d0}, [r0], r2      //  q0::p1
+    vld1.u8 {d1}, [r1], r2
+    vld1.u8 {d2}, [r0], r2      //  q1::p0
+    vld1.u8 {d3}, [r1], r2
+    vld1.u8 {d4}, [r0], r2      //  q2::q0
+    vld1.u8 {d5}, [r1], r2
+    vld1.u8 {d6}, [r0]              //  q3::q1
+    vld1.u8 {d7}, [r1]
 
-    sub			r0, r0, r2, lsl #1	//	pix = [-1*src_stride]
-    sub			r1, r1, r2, lsl #1
+    sub         r0, r0, r2, lsl #1  //  pix = [-1*src_stride]
+    sub         r1, r1, r2, lsl #1
 
-    MASK_MATRIX	q0, q1, q2, q3, q11, q9, q10
+    MASK_MATRIX q0, q1, q2, q3, q11, q9, q10
 
-    vmov			q11, q10
+    vmov            q11, q10
 
-    DIFF_CHROMA_EQ4_P0Q0		d0, d2, d4, d6, q4, q5, q8, d30, d0		// Cb::p0' q0'
-    DIFF_CHROMA_EQ4_P0Q0		d1, d3, d5, d7, q12, q13, q14, d31, d1	// Cr::p0' q0'
+    DIFF_CHROMA_EQ4_P0Q0        d0, d2, d4, d6, q4, q5, q8, d30, d0     // Cb::p0' q0'
+    DIFF_CHROMA_EQ4_P0Q0        d1, d3, d5, d7, q12, q13, q14, d31, d1  // Cr::p0' q0'
 
-    vbsl.u8	q10, q15, q1
-    vst1.u8	{d20}, [r0], r2
-    vst1.u8	{d21}, [r1], r2
+    vbsl.u8 q10, q15, q1
+    vst1.u8 {d20}, [r0], r2
+    vst1.u8 {d21}, [r1], r2
 
-    vbsl.u8	q11, q0, q2
-    vst1.u8	{d22}, [r0]
-    vst1.u8	{d23}, [r1]
+    vbsl.u8 q11, q0, q2
+    vst1.u8 {d22}, [r0]
+    vst1.u8 {d23}, [r1]
 
-    vpop	{q4-q5}
+    vpop    {q4-q5}
 WELS_ASM_FUNC_END
 
 WELS_ASM_FUNC_BEGIN DeblockChromaLt4H_neon
 
-    vdup.u8	q11, r3
-    ldr			r3, [sp, #0]
+    vdup.u8 q11, r3
+    ldr         r3, [sp, #0]
 
-    sub			r0, r0, #2
-    vdup.u8	q9, r3
-    ldr			r3, [sp, #4]
-    sub			r1, r1, #2
-    vld1.s8	{d31}, [r3]
+    sub         r0, r0, #2
+    vdup.u8 q9, r3
+    ldr         r3, [sp, #4]
+    sub         r1, r1, #2
+    vld1.s8 {d31}, [r3]
 
-    LOAD_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 0
-    LOAD_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 1
-    LOAD_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 2
-    LOAD_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 3
-    LOAD_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 4
-    LOAD_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 5
-    LOAD_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 6
-    LOAD_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 7
-    vswp		q1, q2
-    vswp		d1, d2
-    vswp		d6, d5
+    LOAD_CHROMA_DATA_4  d0, d1, d2, d3, d4, d5, d6, d7, 0
+    LOAD_CHROMA_DATA_4  d0, d1, d2, d3, d4, d5, d6, d7, 1
+    LOAD_CHROMA_DATA_4  d0, d1, d2, d3, d4, d5, d6, d7, 2
+    LOAD_CHROMA_DATA_4  d0, d1, d2, d3, d4, d5, d6, d7, 3
+    LOAD_CHROMA_DATA_4  d0, d1, d2, d3, d4, d5, d6, d7, 4
+    LOAD_CHROMA_DATA_4  d0, d1, d2, d3, d4, d5, d6, d7, 5
+    LOAD_CHROMA_DATA_4  d0, d1, d2, d3, d4, d5, d6, d7, 6
+    LOAD_CHROMA_DATA_4  d0, d1, d2, d3, d4, d5, d6, d7, 7
+    vswp        q1, q2
+    vswp        d1, d2
+    vswp        d6, d5
 
-    vmovl.u8	q14, d31
-    vshl.u64	d29,d28,#8
-    vorr		d28,d29
-    vmov		d29, d28
-    veor		q15, q15
-    vsub.i8	q15,q15,q14
+    vmovl.u8    q14, d31
+    vshl.u64    d29,d28,#8
+    vorr        d28,d29
+    vmov        d29, d28
+    veor        q15, q15
+    vsub.i8 q15,q15,q14
 
-    MASK_MATRIX	q0, q1, q2, q3, q11, q9, q10
+    MASK_MATRIX q0, q1, q2, q3, q11, q9, q10
 
-    DIFF_LUMA_LT4_P0_Q0	d0, d2, d4, d6, d16, q12, q13
-    DIFF_LUMA_LT4_P0_Q0	d1, d3, d5, d7, d17, q12, q13
-    vmax.s8	q8, q8, q15
-    vmin.s8	q8, q8, q14
+    DIFF_LUMA_LT4_P0_Q0 d0, d2, d4, d6, d16, q12, q13
+    DIFF_LUMA_LT4_P0_Q0 d1, d3, d5, d7, d17, q12, q13
+    vmax.s8 q8, q8, q15
+    vmin.s8 q8, q8, q14
 
-    vand.s8	q8, q8, q10
-    vcge.s8	q14, q14, #0
-    vand.s8	q8, q8, q14
-    EXTRACT_DELTA_INTO_TWO_PART	q8, q10
-    vqadd.u8	q1, q1, q10
-    vqsub.u8	q1, q1, q8
-    vqsub.u8	q2, q2, q10
-    vqadd.u8	q2, q2, q8
+    vand.s8 q8, q8, q10
+    vcge.s8 q14, q14, #0
+    vand.s8 q8, q8, q14
+    EXTRACT_DELTA_INTO_TWO_PART q8, q10
+    vqadd.u8    q1, q1, q10
+    vqsub.u8    q1, q1, q8
+    vqsub.u8    q2, q2, q10
+    vqadd.u8    q2, q2, q8
 
-    sub			r0, r0, r2, lsl #3
-    sub			r1, r1, r2, lsl #3
-    vswp		d1, d2
-    vswp		d6, d5
-    vswp		q1, q2
+    sub         r0, r0, r2, lsl #3
+    sub         r1, r1, r2, lsl #3
+    vswp        d1, d2
+    vswp        d6, d5
+    vswp        q1, q2
 
-    STORE_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 0
-    STORE_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 1
-    STORE_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 2
-    STORE_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 3
-    STORE_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 4
-    STORE_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 5
-    STORE_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 6
-    STORE_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 7
+    STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 0
+    STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 1
+    STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 2
+    STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 3
+    STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 4
+    STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 5
+    STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 6
+    STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 7
 
 WELS_ASM_FUNC_END
 
 WELS_ASM_FUNC_BEGIN DeblockChromaEq4H_neon
-    vpush	{q4-q5}
-    vdup.u8	q11, r3
-    ldr			r3, [sp, #32]
+    vpush   {q4-q5}
+    vdup.u8 q11, r3
+    ldr         r3, [sp, #32]
 
-    sub			r0, r0, #2
-    sub			r1, r1, #2
+    sub         r0, r0, #2
+    sub         r1, r1, #2
 
-    LOAD_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 0
-    LOAD_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 1
-    LOAD_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 2
-    LOAD_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 3
-    LOAD_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 4
-    LOAD_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 5
-    LOAD_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 6
-    LOAD_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 7
-    vswp		q1, q2
-    vswp		d1, d2
-    vswp		d6, d5
+    LOAD_CHROMA_DATA_4  d0, d1, d2, d3, d4, d5, d6, d7, 0
+    LOAD_CHROMA_DATA_4  d0, d1, d2, d3, d4, d5, d6, d7, 1
+    LOAD_CHROMA_DATA_4  d0, d1, d2, d3, d4, d5, d6, d7, 2
+    LOAD_CHROMA_DATA_4  d0, d1, d2, d3, d4, d5, d6, d7, 3
+    LOAD_CHROMA_DATA_4  d0, d1, d2, d3, d4, d5, d6, d7, 4
+    LOAD_CHROMA_DATA_4  d0, d1, d2, d3, d4, d5, d6, d7, 5
+    LOAD_CHROMA_DATA_4  d0, d1, d2, d3, d4, d5, d6, d7, 6
+    LOAD_CHROMA_DATA_4  d0, d1, d2, d3, d4, d5, d6, d7, 7
+    vswp        q1, q2
+    vswp        d1, d2
+    vswp        d6, d5
 
-    vdup.u8	q9, r3
-    MASK_MATRIX	q0, q1, q2, q3, q11, q9, q10
-    vmov			q11, q10
+    vdup.u8 q9, r3
+    MASK_MATRIX q0, q1, q2, q3, q11, q9, q10
+    vmov            q11, q10
 
-    DIFF_CHROMA_EQ4_P0Q0		d0, d2, d4, d6, q8, q9, q12, d8, d10
-    DIFF_CHROMA_EQ4_P0Q0		d1, d3, d5, d7, q13, q14, q15, d9, d11
+    DIFF_CHROMA_EQ4_P0Q0        d0, d2, d4, d6, q8, q9, q12, d8, d10
+    DIFF_CHROMA_EQ4_P0Q0        d1, d3, d5, d7, q13, q14, q15, d9, d11
 
-    vbsl.u8	q10, q4, q1
-    vbsl.u8	q11, q5, q2
-    sub			r0, r0, r2, lsl #3	//	pix: 0th row	[-2]
-    sub			r1, r1, r2, lsl #3
+    vbsl.u8 q10, q4, q1
+    vbsl.u8 q11, q5, q2
+    sub         r0, r0, r2, lsl #3  //  pix: 0th row    [-2]
+    sub         r1, r1, r2, lsl #3
 
-    vmov		q1, q10
-    vmov		q2, q11
-    vswp		d1, d2
-    vswp		d6, d5
-    vswp		q1, q2
-    //	Cb:d0d1d2d3, Cr:d4d5d6d7
-    STORE_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 0
-    STORE_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 1
-    STORE_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 2
-    STORE_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 3
-    STORE_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 4
-    STORE_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 5
-    STORE_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 6
-    STORE_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 7
+    vmov        q1, q10
+    vmov        q2, q11
+    vswp        d1, d2
+    vswp        d6, d5
+    vswp        q1, q2
+    //  Cb:d0d1d2d3, Cr:d4d5d6d7
+    STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 0
+    STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 1
+    STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 2
+    STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 3
+    STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 4
+    STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 5
+    STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 6
+    STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 7
 
-    vpop	{q4-q5}
+    vpop    {q4-q5}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN WelsNonZeroCount_neon
 
-    vld1.64	{d0-d2}, [r0]
+    vld1.64 {d0-d2}, [r0]
 
-    vceq.s8	q0, q0, #0
-    vceq.s8	d2, d2, #0
-    vmvn	q0, q0
-    vmvn	d2, d2
-    vabs.s8	q0, q0
-    vabs.s8	d2, d2
+    vceq.s8 q0, q0, #0
+    vceq.s8 d2, d2, #0
+    vmvn    q0, q0
+    vmvn    d2, d2
+    vabs.s8 q0, q0
+    vabs.s8 d2, d2
 
-    vst1.64	{d0-d2}, [r0]
+    vst1.64 {d0-d2}, [r0]
 WELS_ASM_FUNC_END
 
 #ifdef __APPLE__
@@ -851,37 +851,37 @@
 .macro BS_NZC_CHECK
     vld1.8   {d0,d1}, [$0]
     /* Arrenge the input data --- TOP */
-	ands     r6, $1, #2
-	beq      bs_nzc_check_jump0
+    ands     r6, $1, #2
+    beq      bs_nzc_check_jump0
 
     sub      r6, $0, $2, lsl #4
-	sub      r6, $2, lsl #3
+    sub      r6, $2, lsl #3
     add      r6, #12
     vld1.32  d3[1], [r6]
 
 bs_nzc_check_jump0:
     vext.8   q1, q1, q0, #12
-	vadd.u8  $3, q0, q1
+    vadd.u8  $3, q0, q1
 
 
     /* Arrenge the input data --- LEFT */
-	ands     r6, $1, #1
-	beq      bs_nzc_check_jump1
+    ands     r6, $1, #1
+    beq      bs_nzc_check_jump1
 
     sub      r6, $0, #21
-	add      r7, r6, #4
+    add      r7, r6, #4
     vld1.8   d3[4], [r6]
-	add      r6, r7, #4
+    add      r6, r7, #4
     vld1.8   d3[5], [r7]
-	add      r7, r6, #4
+    add      r7, r6, #4
     vld1.8   d3[6], [r6]
     vld1.8   d3[7], [r7]
 
 bs_nzc_check_jump1:
-	vzip.8   d0, d1
-	vzip.8   d0, d1
+    vzip.8   d0, d1
+    vzip.8   d0, d1
     vext.8   q1, q1, q0, #12
-	vadd.u8  $4, q0, q1
+    vadd.u8  $4, q0, q1
 .endm
 
 .macro BS_COMPARE_MV //in: $0,$1(const),$2(const),$3(const),$4(const); out:$5, $6
@@ -888,7 +888,7 @@
     mov       r6, #4
     vabd.s16  q8, $0, $1
     vabd.s16  q9, $1, $2
-	vdup.s16  $0, r6
+    vdup.s16  $0, r6
     vabd.s16  q10, $2, $3
     vabd.s16  q11, $3, $4
 
@@ -897,7 +897,7 @@
     vcge.s16  q10, $0
     vcge.s16  q11, $0
 
-	vpadd.i16 d16, d16, d17
+    vpadd.i16 d16, d16, d17
     vpadd.i16 d17, d18, d19
     vpadd.i16 d18, d20, d21
     vpadd.i16 d19, d22, d23
@@ -910,8 +910,8 @@
     vldm   $0, {q0,q1,q2,q3}
 
     /* Arrenge the input data --- TOP */
-	ands     r6, $1, #2
-	beq      bs_mv_check_jump0
+    ands     r6, $1, #2
+    beq      bs_mv_check_jump0
 
     sub      r6, $0, $2, lsl #6
     add      r6, #48
@@ -921,22 +921,22 @@
     BS_COMPARE_MV  q4, q0, q1, q2, q3, $3, $4
 
     /* Arrenge the input data --- LEFT */
-	ands     r6, $1, #1
-	beq      bs_mv_check_jump1
+    ands     r6, $1, #1
+    beq      bs_mv_check_jump1
 
     sub      r6, $0, #52
     add      r7, r6, #16
-	vld1.32   d8[0], [r6]
-	add      r6, r7, #16
+    vld1.32   d8[0], [r6]
+    add      r6, r7, #16
     vld1.32   d8[1], [r7]
-	add      r7, r6, #16
+    add      r7, r6, #16
     vld1.32   d9[0], [r6]
     vld1.32   d9[1], [r7]
 
 bs_mv_check_jump1:
-	vzip.32   q0, q2
-	vzip.32   q1, q3
-	vzip.32   q0, q1
+    vzip.32   q0, q2
+    vzip.32   q1, q3
+    vzip.32   q0, q1
     vzip.32   q2, q3
     BS_COMPARE_MV  q4, q0, q1, q2, q3, $5, $6
 .endm
@@ -1038,41 +1038,41 @@
 
 WELS_ASM_FUNC_BEGIN DeblockingBSCalcEnc_neon
 
-	stmdb sp!, {r5-r7}
-	vpush {q4}
+    stmdb sp!, {r5-r7}
+    vpush {q4}
 
-	ldr  r5, [sp, #28]	//Save BS to r5
+    ldr  r5, [sp, #28]  //Save BS to r5
 
-	/* Checking the nzc status */
-	BS_NZC_CHECK r0, r2, r3, q14, q15 //q14,q15 save the nzc status
+    /* Checking the nzc status */
+    BS_NZC_CHECK r0, r2, r3, q14, q15 //q14,q15 save the nzc status
 
-	/* For checking bS[I] = 2 */
-	mov      r6, #2
-	vcgt.s8  q14, q14, #0
-	vdup.u8  q0, r6
-	vcgt.s8  q15, q15, #0
+    /* For checking bS[I] = 2 */
+    mov      r6, #2
+    vcgt.s8  q14, q14, #0
+    vdup.u8  q0, r6
+    vcgt.s8  q15, q15, #0
 
-	vand.u8  q14, q14, q0 //q14 save the nzc check result all the time --- for dir is top
-	vand.u8  q15, q15, q0 //q15 save the nzc check result all the time --- for dir is left
+    vand.u8  q14, q14, q0 //q14 save the nzc check result all the time --- for dir is top
+    vand.u8  q15, q15, q0 //q15 save the nzc check result all the time --- for dir is left
 
-	/* Checking the mv status*/
-	BS_MV_CHECK r1, r2, r3, d24, d25, d26, d27//q12, q13 save the mv status
+    /* Checking the mv status*/
+    BS_MV_CHECK r1, r2, r3, d24, d25, d26, d27//q12, q13 save the mv status
 
-	/* For checking bS[I] = 1 */
+    /* For checking bS[I] = 1 */
     mov      r6, #1
-	vdup.u8  q0, r6
+    vdup.u8  q0, r6
 
-	vand.u8  q12, q12, q0 //q12 save the nzc check result all the time --- for dir is top
-	vand.u8  q13, q13, q0 //q13 save the nzc check result all the time --- for dir is left
+    vand.u8  q12, q12, q0 //q12 save the nzc check result all the time --- for dir is top
+    vand.u8  q13, q13, q0 //q13 save the nzc check result all the time --- for dir is left
 
 
-	/* Check bS[I] is '1' or '2' */
-	vmax.u8 q1, q12, q14
-	vmax.u8 q0, q13, q15
+    /* Check bS[I] is '1' or '2' */
+    vmax.u8 q1, q12, q14
+    vmax.u8 q0, q13, q15
 
-	//vstm r5, {q0, q1}
+    //vstm r5, {q0, q1}
     vst1.32 {q0, q1}, [r5]
-	vpop {q4}
-	ldmia sp!, {r5-r7}
+    vpop {q4}
+    ldmia sp!, {r5-r7}
 WELS_ASM_FUNC_END
 #endif
--- a/codec/common/arm/expand_picture_neon.S
+++ b/codec/common/arm/expand_picture_neon.S
@@ -37,119 +37,119 @@
 
 WELS_ASM_FUNC_BEGIN ExpandPictureLuma_neon
     stmdb sp!, {r4-r8}
-	//Save the dst
-	mov r7, r0
-	mov r8, r3
+    //Save the dst
+    mov r7, r0
+    mov r8, r3
 
-	add r4, r7, r2
-	sub r4, #1
+    add r4, r7, r2
+    sub r4, #1
     //For the left and right expand
 _expand_picture_luma_loop2:
-	sub r5, r7, #32
-	add r6, r4, #1
+    sub r5, r7, #32
+    add r6, r4, #1
 
-	vld1.8 {d0[], d1[]}, [r7], r1
-	vld1.8 {d2[], d3[]}, [r4], r1
+    vld1.8 {d0[], d1[]}, [r7], r1
+    vld1.8 {d2[], d3[]}, [r4], r1
 
-	vst1.8 {q0}, [r5]!
-	vst1.8 {q0}, [r5]
-	vst1.8 {q1}, [r6]!
-	vst1.8 {q1}, [r6]
-	subs r8, #1
-	bne	_expand_picture_luma_loop2
+    vst1.8 {q0}, [r5]!
+    vst1.8 {q0}, [r5]
+    vst1.8 {q1}, [r6]!
+    vst1.8 {q1}, [r6]
+    subs r8, #1
+    bne _expand_picture_luma_loop2
 
-	//for the top and bottom expand
-	add r2, #64
-	sub r0, #32
-	mla r4, r1, r3, r0
-	sub r4, r1
+    //for the top and bottom expand
+    add r2, #64
+    sub r0, #32
+    mla r4, r1, r3, r0
+    sub r4, r1
 _expand_picture_luma_loop0:
-	mov r5, #32
+    mov r5, #32
     mls r5, r5, r1, r0
-	add r6, r4, r1
-	vld1.8 {q0}, [r0]!
-	vld1.8 {q1}, [r4]!
+    add r6, r4, r1
+    vld1.8 {q0}, [r0]!
+    vld1.8 {q1}, [r4]!
 
-	mov r8, #32
+    mov r8, #32
 _expand_picture_luma_loop1:
-	vst1.8 {q0}, [r5], r1
-	vst1.8 {q1}, [r6], r1
-	subs r8, #1
+    vst1.8 {q0}, [r5], r1
+    vst1.8 {q1}, [r6], r1
+    subs r8, #1
     bne _expand_picture_luma_loop1
 
-	subs r2, #16
-	bne	_expand_picture_luma_loop0
+    subs r2, #16
+    bne _expand_picture_luma_loop0
 
     //vldreq.32 d0, [r0]
 
-	ldmia sp!, {r4-r8}
+    ldmia sp!, {r4-r8}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN ExpandPictureChroma_neon
     stmdb sp!, {r4-r9}
-	//Save the dst
-	mov r7, r0
-	mov r8, r3
+    //Save the dst
+    mov r7, r0
+    mov r8, r3
 
-	add r4, r7, r2
-	sub r4, #1
+    add r4, r7, r2
+    sub r4, #1
     //For the left and right expand
 _expand_picture_chroma_loop2:
-	sub r5, r7, #16
-	add r6, r4, #1
+    sub r5, r7, #16
+    add r6, r4, #1
 
-	vld1.8 {d0[], d1[]}, [r7], r1
-	vld1.8 {d2[], d3[]}, [r4], r1
+    vld1.8 {d0[], d1[]}, [r7], r1
+    vld1.8 {d2[], d3[]}, [r4], r1
 
-	vst1.8 {q0}, [r5]
-	vst1.8 {q1}, [r6]
-	subs r8, #1
-	bne	_expand_picture_chroma_loop2
+    vst1.8 {q0}, [r5]
+    vst1.8 {q1}, [r6]
+    subs r8, #1
+    bne _expand_picture_chroma_loop2
 
-	//for the top and bottom expand
-	add r2, #32
-        mov r9, r2
-        bic r2, #15
-	sub r0, #16
-	mla r4, r1, r3, r0
-	sub r4, r1
+    //for the top and bottom expand
+    add r2, #32
+    mov r9, r2
+    bic r2, #15
+    sub r0, #16
+    mla r4, r1, r3, r0
+    sub r4, r1
 _expand_picture_chroma_loop0:
-	mov r5, #16
-        mls r5, r5, r1, r0
-	add r6, r4, r1
-	vld1.8 {q0}, [r0]!
-	vld1.8 {q1}, [r4]!
+    mov r5, #16
+    mls r5, r5, r1, r0
+    add r6, r4, r1
+    vld1.8 {q0}, [r0]!
+    vld1.8 {q1}, [r4]!
 
-	mov r8, #16
+    mov r8, #16
 _expand_picture_chroma_loop1:
-	vst1.8 {q0}, [r5], r1
-	vst1.8 {q1}, [r6], r1
-	subs r8, #1
-        bne _expand_picture_chroma_loop1
+    vst1.8 {q0}, [r5], r1
+    vst1.8 {q1}, [r6], r1
+    subs r8, #1
+    bne _expand_picture_chroma_loop1
 
-	subs r2, #16
-	bne	_expand_picture_chroma_loop0
+    subs r2, #16
+    bne _expand_picture_chroma_loop0
 
     //vldreq.32 d0, [r0]
 
-        and r9, #15
-        cmp r9, #8
-        bne _expand_picture_chroma_end
-	mov r5, #16
-        mls r5, r5, r1, r0
-	add r6, r4, r1
-	vld1.8 {d0}, [r0]!
-	vld1.8 {d2}, [r4]!
-	mov r8, #16
+    and r9, #15
+    cmp r9, #8
+    bne _expand_picture_chroma_end
+    mov r5, #16
+    mls r5, r5, r1, r0
+    add r6, r4, r1
+    vld1.8 {d0}, [r0]!
+    vld1.8 {d2}, [r4]!
+    mov r8, #16
 _expand_picture_chroma_loop3:
-	vst1.8 {d0}, [r5], r1
-	vst1.8 {d2}, [r6], r1
-	subs r8, #1
-        bne _expand_picture_chroma_loop3
+    vst1.8 {d0}, [r5], r1
+    vst1.8 {d2}, [r6], r1
+    subs r8, #1
+    bne _expand_picture_chroma_loop3
 _expand_picture_chroma_end:
 
-	ldmia sp!, {r4-r9}
+    ldmia sp!, {r4-r9}
 WELS_ASM_FUNC_END
 
 #endif
--- a/codec/common/arm/mc_neon.S
+++ b/codec/common/arm/mc_neon.S
@@ -36,2175 +36,2175 @@
 
 #ifdef __APPLE__
 .macro AVERAGE_TWO_8BITS
-//	{	// input:dst_d, src_d A and B; working: q13
-    vaddl.u8	q13, $2, $1
-    vrshrn.u16		$0, q13, #1
-//	}
+//  {   // input:dst_d, src_d A and B; working: q13
+    vaddl.u8    q13, $2, $1
+    vrshrn.u16      $0, q13, #1
+//  }
 .endm
 
 .macro FILTER_6TAG_8BITS
-//	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
-    vaddl.u8	q12, $0, $5	//q12=src[-2]+src[3]
-    vaddl.u8	q13, $2, $3	//src[0]+src[1]
-    vmla.u16	q12, q13, $7	//q12 += 20*(src[0]+src[1]), 2 cycles
-    vaddl.u8	q13, $1, $4	//src[-1]+src[2]
-    vmls.s16	q12, q13, $8	//q12 -= 5*(src[-1]+src[2]), 2 cycles
-    vqrshrun.s16		$6, q12, #5
-//	}
+//  {   // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
+    vaddl.u8    q12, $0, $5 //q12=src[-2]+src[3]
+    vaddl.u8    q13, $2, $3 //src[0]+src[1]
+    vmla.u16    q12, q13, $7    //q12 += 20*(src[0]+src[1]), 2 cycles
+    vaddl.u8    q13, $1, $4 //src[-1]+src[2]
+    vmls.s16    q12, q13, $8    //q12 -= 5*(src[-1]+src[2]), 2 cycles
+    vqrshrun.s16        $6, q12, #5
+//  }
 .endm
 
-.macro FILTER_SINGLE_TAG_8BITS		// when width=17/9, used
-//	{	// input: src_d{Y[0][1][2][3][4][5]X, the even of working_q2},
-    vrev64.8	$2, $0				// X[5][4][3][2][1][0]O
-    vaddl.u8	$3, $0, $2			// each 16bits, *[50][41][32][23][14][05]*
-    vmul.s16	$0, $2, $1			// 0+1*[50]-5*[41]+20[32]
-    vpadd.s16	$0, $0, $0
-    vpadd.s16	$0, $0, $0
-    vqrshrun.s16	$0, $4, #5
-//	}
+.macro FILTER_SINGLE_TAG_8BITS      // when width=17/9, used
+//  {   // input: src_d{Y[0][1][2][3][4][5]X, the even of working_q2},
+    vrev64.8    $2, $0              // X[5][4][3][2][1][0]O
+    vaddl.u8    $3, $0, $2          // each 16bits, *[50][41][32][23][14][05]*
+    vmul.s16    $0, $2, $1          // 0+1*[50]-5*[41]+20[32]
+    vpadd.s16   $0, $0, $0
+    vpadd.s16   $0, $0, $0
+    vqrshrun.s16    $0, $4, #5
+//  }
 .endm
 
 .macro FILTER_6TAG_8BITS_AVERAGE_WITH_0
-//	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
-    vaddl.u8	q12, $0, $5	//q12=src[-2]+src[3]
-    vaddl.u8	q13, $2, $3	//src[0]+src[1]
-    vmla.u16	q12, q13, $7	//q12 += 20*(src[0]+src[1]), 2 cycles
-    vaddl.u8	q13, $1, $4	//src[-1]+src[2]
-    vmls.s16	q12, q13, $8	//q12 -= 5*(src[-1]+src[2]), 2 cycles
-    vqrshrun.s16		$6, q12, #5
-    vaddl.u8	q13, $2, $6
-    vrshrn.u16		$6, q13, #1
-//	}
+//  {   // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
+    vaddl.u8    q12, $0, $5 //q12=src[-2]+src[3]
+    vaddl.u8    q13, $2, $3 //src[0]+src[1]
+    vmla.u16    q12, q13, $7    //q12 += 20*(src[0]+src[1]), 2 cycles
+    vaddl.u8    q13, $1, $4 //src[-1]+src[2]
+    vmls.s16    q12, q13, $8    //q12 -= 5*(src[-1]+src[2]), 2 cycles
+    vqrshrun.s16        $6, q12, #5
+    vaddl.u8    q13, $2, $6
+    vrshrn.u16      $6, q13, #1
+//  }
 .endm
 
 .macro FILTER_6TAG_8BITS_AVERAGE_WITH_1
-//	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
-    vaddl.u8	q12, $0, $5	//q12=src[-2]+src[3]
-    vaddl.u8	q13, $2, $3	//src[0]+src[1]
-    vmla.u16	q12, q13, $7	//q12 += 20*(src[0]+src[1]), 2 cycles
-    vaddl.u8	q13, $1, $4	//src[-1]+src[2]
-    vmls.s16	q12, q13, $8	//q12 -= 5*(src[-1]+src[2]), 2 cycles
-    vqrshrun.s16		$6, q12, #5
-    vaddl.u8	q13, $3, $6
-    vrshrn.u16		$6, q13, #1
-//	}
+//  {   // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
+    vaddl.u8    q12, $0, $5 //q12=src[-2]+src[3]
+    vaddl.u8    q13, $2, $3 //src[0]+src[1]
+    vmla.u16    q12, q13, $7    //q12 += 20*(src[0]+src[1]), 2 cycles
+    vaddl.u8    q13, $1, $4 //src[-1]+src[2]
+    vmls.s16    q12, q13, $8    //q12 -= 5*(src[-1]+src[2]), 2 cycles
+    vqrshrun.s16        $6, q12, #5
+    vaddl.u8    q13, $3, $6
+    vrshrn.u16      $6, q13, #1
+//  }
 .endm
 
 .macro FILTER_6TAG_8BITS_TO_16BITS
-//	{	// input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:q13
-    vaddl.u8	$6, $0, $5		//dst_q=src[-2]+src[3]
-    vaddl.u8	q13, $2, $3	//src[0]+src[1]
-    vmla.u16	$6, q13, $7	//dst_q += 20*(src[0]+src[1]), 2 cycles
-    vaddl.u8	q13, $1, $4	//src[-1]+src[2]
-    vmls.s16	$6, q13, $8	//dst_q -= 5*(src[-1]+src[2]), 2 cycles
-//	}
+//  {   // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:q13
+    vaddl.u8    $6, $0, $5      //dst_q=src[-2]+src[3]
+    vaddl.u8    q13, $2, $3 //src[0]+src[1]
+    vmla.u16    $6, q13, $7 //dst_q += 20*(src[0]+src[1]), 2 cycles
+    vaddl.u8    q13, $1, $4 //src[-1]+src[2]
+    vmls.s16    $6, q13, $8 //dst_q -= 5*(src[-1]+src[2]), 2 cycles
+//  }
 .endm
 
 .macro FILTER_3_IN_16BITS_TO_8BITS
-//	{	// input:a, b, c, dst_d;
-    vsub.s16	$0, $0, $1			//a-b
-    vshr.s16	$0, $0, #2			//(a-b)/4
-    vsub.s16	$0, $0, $1			//(a-b)/4-b
-    vadd.s16	$0, $0, $2			//(a-b)/4-b+c
-    vshr.s16	$0, $0, #2			//((a-b)/4-b+c)/4
-    vadd.s16	$0, $0, $2			//((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
-    vqrshrun.s16	$3, $0, #6		//(+32)>>6
-//	}
+//  {   // input:a, b, c, dst_d;
+    vsub.s16    $0, $0, $1          //a-b
+    vshr.s16    $0, $0, #2          //(a-b)/4
+    vsub.s16    $0, $0, $1          //(a-b)/4-b
+    vadd.s16    $0, $0, $2          //(a-b)/4-b+c
+    vshr.s16    $0, $0, #2          //((a-b)/4-b+c)/4
+    vadd.s16    $0, $0, $2          //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+    vqrshrun.s16    $3, $0, #6      //(+32)>>6
+//  }
 .endm
 
 .macro UNPACK_2_16BITS_TO_ABC
-//	{	// input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;
-    vext.16	$4, $0, $1, #2		//src[0]
-    vext.16	$3, $0, $1, #3		//src[1]
-    vadd.s16	$4, $3					//c=src[0]+src[1]
+//  {   // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;
+    vext.16 $4, $0, $1, #2      //src[0]
+    vext.16 $3, $0, $1, #3      //src[1]
+    vadd.s16    $4, $3                  //c=src[0]+src[1]
 
-    vext.16	$3, $0, $1, #1		//src[-1]
-    vext.16	$2, $0, $1, #4		//src[2]
-    vadd.s16	$3, $2					//b=src[-1]+src[2]
+    vext.16 $3, $0, $1, #1      //src[-1]
+    vext.16 $2, $0, $1, #4      //src[2]
+    vadd.s16    $3, $2                  //b=src[-1]+src[2]
 
-    vext.16	$2, $0, $1, #5		//src[3]
-    vadd.s16	$2, $0					//a=src[-2]+src[3]
-//	}
+    vext.16 $2, $0, $1, #5      //src[3]
+    vadd.s16    $2, $0                  //a=src[-2]+src[3]
+//  }
 .endm
 
 .macro UNPACK_1_IN_8x16BITS_TO_8BITS
-//	{	// each 16bits; input: d_dst, d_src[0:3] (even), d_src[4:5]+%% (odd)
-    vext.16	$3, $3, $3, #7	// 0x????, [0][1][2][3][4][5],
-    vrev64.16	$1, $1
-    vadd.u16	$2, $1				// C[2+3],B[1+4],A[0+5],
-    vshr.s64	$1, $2, #16
-    vshr.s64	$0, $2, #32		// Output: C $2, B $1, A $0
+//  {   // each 16bits; input: d_dst, d_src[0:3] (even), d_src[4:5]+%% (odd)
+    vext.16 $3, $3, $3, #7  // 0x????, [0][1][2][3][4][5],
+    vrev64.16   $1, $1
+    vadd.u16    $2, $1              // C[2+3],B[1+4],A[0+5],
+    vshr.s64    $1, $2, #16
+    vshr.s64    $0, $2, #32     // Output: C $2, B $1, A $0
 
-    vsub.s16	$0, $0, $1			//a-b
-    vshr.s16	$0, $0, #2			//(a-b)/4
-    vsub.s16	$0, $0, $1			//(a-b)/4-b
-    vadd.s16	$0, $0, $2			//(a-b)/4-b+c
-    vshr.s16	$0, $0, #2			//((a-b)/4-b+c)/4
-    vadd.s16	$1, $0, $2			//((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
-    vqrshrun.s16	$0, $3, #6		//(+32)>>6
-//	}
+    vsub.s16    $0, $0, $1          //a-b
+    vshr.s16    $0, $0, #2          //(a-b)/4
+    vsub.s16    $0, $0, $1          //(a-b)/4-b
+    vadd.s16    $0, $0, $2          //(a-b)/4-b+c
+    vshr.s16    $0, $0, #2          //((a-b)/4-b+c)/4
+    vadd.s16    $1, $0, $2          //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+    vqrshrun.s16    $0, $3, #6      //(+32)>>6
+//  }
 .endm
 #else
 .macro AVERAGE_TWO_8BITS arg0, arg1, arg2
-//	{	// input:dst_d, src_d A and B; working: q13
-    vaddl.u8	q13, \arg2, \arg1
-    vrshrn.u16		\arg0, q13, #1
-//	}
+//  {   // input:dst_d, src_d A and B; working: q13
+    vaddl.u8    q13, \arg2, \arg1
+    vrshrn.u16      \arg0, q13, #1
+//  }
 .endm
 
 .macro FILTER_6TAG_8BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-//	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
-    vaddl.u8	q12, \arg0, \arg5	//q12=src[-2]+src[3]
-    vaddl.u8	q13, \arg2, \arg3	//src[0]+src[1]
-    vmla.u16	q12, q13, \arg7	//q12 += 20*(src[0]+src[1]), 2 cycles
-    vaddl.u8	q13, \arg1, \arg4	//src[-1]+src[2]
-    vmls.s16	q12, q13, \arg8	//q12 -= 5*(src[-1]+src[2]), 2 cycles
-    vqrshrun.s16		\arg6, q12, #5
-//	}
+//  {   // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
+    vaddl.u8    q12, \arg0, \arg5   //q12=src[-2]+src[3]
+    vaddl.u8    q13, \arg2, \arg3   //src[0]+src[1]
+    vmla.u16    q12, q13, \arg7 //q12 += 20*(src[0]+src[1]), 2 cycles
+    vaddl.u8    q13, \arg1, \arg4   //src[-1]+src[2]
+    vmls.s16    q12, q13, \arg8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
+    vqrshrun.s16        \arg6, q12, #5
+//  }
 .endm
 
-.macro FILTER_SINGLE_TAG_8BITS arg0, arg1,arg2, arg3, arg4,arg5		// when width=17/9, used
-//	{	// input: src_d{Y[0][1][2][3][4][5]X, the even of working_q2}
-    vrev64.8	\arg2, \arg0				// X[5][4][3][2][1][0]O
-    vaddl.u8	\arg3, \arg0, \arg2			// each 16bits, *[50][41][32][23][14][05]*
-    vmul.s16	\arg0, \arg2, \arg1			// 0+1*[50]-5*[41]+20[32]
-    vpadd.s16	\arg0, \arg0, \arg0
-    vpadd.s16	\arg0, \arg0, \arg0
-    vqrshrun.s16	\arg0, \arg4, #5
-//	}
+.macro FILTER_SINGLE_TAG_8BITS arg0, arg1,arg2, arg3, arg4,arg5     // when width=17/9, used
+//  {   // input: src_d{Y[0][1][2][3][4][5]X, the even of working_q2}
+    vrev64.8    \arg2, \arg0                // X[5][4][3][2][1][0]O
+    vaddl.u8    \arg3, \arg0, \arg2         // each 16bits, *[50][41][32][23][14][05]*
+    vmul.s16    \arg0, \arg2, \arg1         // 0+1*[50]-5*[41]+20[32]
+    vpadd.s16   \arg0, \arg0, \arg0
+    vpadd.s16   \arg0, \arg0, \arg0
+    vqrshrun.s16    \arg0, \arg4, #5
+//  }
 .endm
 
 .macro FILTER_6TAG_8BITS_AVERAGE_WITH_0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-//	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
-    vaddl.u8	q12, \arg0, \arg5	//q12=src[-2]+src[3]
-    vaddl.u8	q13, \arg2, \arg3	//src[0]+src[1]
-    vmla.u16	q12, q13, \arg7	//q12 += 20*(src[0]+src[1]), 2 cycles
-    vaddl.u8	q13, \arg1, \arg4	//src[-1]+src[2]
-    vmls.s16	q12, q13, \arg8	//q12 -= 5*(src[-1]+src[2]), 2 cycles
-    vqrshrun.s16		\arg6, q12, #5
-    vaddl.u8	q13, \arg2, \arg6
-    vrshrn.u16		\arg6, q13, #1
-//	}
+//  {   // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
+    vaddl.u8    q12, \arg0, \arg5   //q12=src[-2]+src[3]
+    vaddl.u8    q13, \arg2, \arg3   //src[0]+src[1]
+    vmla.u16    q12, q13, \arg7 //q12 += 20*(src[0]+src[1]), 2 cycles
+    vaddl.u8    q13, \arg1, \arg4   //src[-1]+src[2]
+    vmls.s16    q12, q13, \arg8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
+    vqrshrun.s16        \arg6, q12, #5
+    vaddl.u8    q13, \arg2, \arg6
+    vrshrn.u16      \arg6, q13, #1
+//  }
 .endm
 
 .macro FILTER_6TAG_8BITS_AVERAGE_WITH_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-//	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
-    vaddl.u8	q12, \arg0, \arg5	//q12=src[-2]+src[3]
-    vaddl.u8	q13, \arg2, \arg3	//src[0]+src[1]
-    vmla.u16	q12, q13, \arg7	//q12 += 20*(src[0]+src[1]), 2 cycles
-    vaddl.u8	q13, \arg1, \arg4	//src[-1]+src[2]
-    vmls.s16	q12, q13, \arg8	//q12 -= 5*(src[-1]+src[2]), 2 cycles
-    vqrshrun.s16		\arg6, q12, #5
-    vaddl.u8	q13, \arg3, \arg6
-    vrshrn.u16		\arg6, q13, #1
-//	}
+//  {   // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
+    vaddl.u8    q12, \arg0, \arg5   //q12=src[-2]+src[3]
+    vaddl.u8    q13, \arg2, \arg3   //src[0]+src[1]
+    vmla.u16    q12, q13, \arg7 //q12 += 20*(src[0]+src[1]), 2 cycles
+    vaddl.u8    q13, \arg1, \arg4   //src[-1]+src[2]
+    vmls.s16    q12, q13, \arg8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
+    vqrshrun.s16        \arg6, q12, #5
+    vaddl.u8    q13, \arg3, \arg6
+    vrshrn.u16      \arg6, q13, #1
+//  }
 .endm
 
 .macro FILTER_6TAG_8BITS_TO_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-//	{	// input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:q13
-    vaddl.u8	\arg6, \arg0, \arg5		//dst_q=src[-2]+src[3]
-    vaddl.u8	q13, \arg2, \arg3	//src[0]+src[1]
-    vmla.u16	\arg6, q13, \arg7	//dst_q += 20*(src[0]+src[1]), 2 cycles
-    vaddl.u8	q13, \arg1, \arg4	//src[-1]+src[2]
-    vmls.s16	\arg6, q13, \arg8	//dst_q -= 5*(src[-1]+src[2]), 2 cycles
-//	}
+//  {   // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:q13
+    vaddl.u8    \arg6, \arg0, \arg5     //dst_q=src[-2]+src[3]
+    vaddl.u8    q13, \arg2, \arg3   //src[0]+src[1]
+    vmla.u16    \arg6, q13, \arg7   //dst_q += 20*(src[0]+src[1]), 2 cycles
+    vaddl.u8    q13, \arg1, \arg4   //src[-1]+src[2]
+    vmls.s16    \arg6, q13, \arg8   //dst_q -= 5*(src[-1]+src[2]), 2 cycles
+//  }
 .endm
 
 .macro FILTER_3_IN_16BITS_TO_8BITS arg0, arg1, arg2, arg3
-//	{	// input:a, b, c, dst_d;
-    vsub.s16	\arg0, \arg0, \arg1			//a-b
-    vshr.s16	\arg0, \arg0, #2			//(a-b)/4
-    vsub.s16	\arg0, \arg0, \arg1			//(a-b)/4-b
-    vadd.s16	\arg0, \arg0, \arg2			//(a-b)/4-b+c
-    vshr.s16	\arg0, \arg0, #2			//((a-b)/4-b+c)/4
-    vadd.s16	\arg0, \arg0, \arg2			//((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
-    vqrshrun.s16	\arg3, \arg0, #6		//(+32)>>6
-//	}
+//  {   // input:a, b, c, dst_d;
+    vsub.s16    \arg0, \arg0, \arg1         //a-b
+    vshr.s16    \arg0, \arg0, #2            //(a-b)/4
+    vsub.s16    \arg0, \arg0, \arg1         //(a-b)/4-b
+    vadd.s16    \arg0, \arg0, \arg2         //(a-b)/4-b+c
+    vshr.s16    \arg0, \arg0, #2            //((a-b)/4-b+c)/4
+    vadd.s16    \arg0, \arg0, \arg2         //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+    vqrshrun.s16    \arg3, \arg0, #6        //(+32)>>6
+//  }
 .endm
 
 .macro UNPACK_2_16BITS_TO_ABC arg0, arg1, arg2, arg3, arg4
-//	{	// input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;
-    vext.16	\arg4, \arg0, \arg1, #2		//src[0]
-    vext.16	\arg3, \arg0, \arg1, #3		//src[1]
-    vadd.s16	\arg4, \arg3					//c=src[0]+src[1]
+//  {   // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;
+    vext.16 \arg4, \arg0, \arg1, #2     //src[0]
+    vext.16 \arg3, \arg0, \arg1, #3     //src[1]
+    vadd.s16    \arg4, \arg3                    //c=src[0]+src[1]
 
-    vext.16	\arg3, \arg0, \arg1, #1		//src[-1]
-    vext.16	\arg2, \arg0, \arg1, #4		//src[2]
-    vadd.s16	\arg3,\arg2					//b=src[-1]+src[2]
+    vext.16 \arg3, \arg0, \arg1, #1     //src[-1]
+    vext.16 \arg2, \arg0, \arg1, #4     //src[2]
+    vadd.s16    \arg3,\arg2                 //b=src[-1]+src[2]
 
-    vext.16	\arg2, \arg0, \arg1, #5		//src[3]
-    vadd.s16	\arg2, \arg0					//a=src[-2]+src[3]
-//	}
+    vext.16 \arg2, \arg0, \arg1, #5     //src[3]
+    vadd.s16    \arg2, \arg0                    //a=src[-2]+src[3]
+//  }
 .endm
 
 .macro UNPACK_1_IN_8x16BITS_TO_8BITS arg0, arg1,arg2, arg3
-//	{	// each 16bits; input: d_dst, d_src[0:3] (even), d_src[4:5]+%% (odd)
-    vext.16	\arg3, \arg3, \arg3, #7	// 0x????, [0][1][2][3][4][5]
-    vrev64.16	\arg1, \arg1
-    vadd.u16	\arg2, \arg1				// C[2+3],B[1+4],A[0+5]
-    vshr.s64	\arg1, \arg2, #16
-    vshr.s64	\arg0, \arg2, #32		// Output: C \arg2, B \arg1, A \arg0
+//  {   // each 16bits; input: d_dst, d_src[0:3] (even), d_src[4:5]+%% (odd)
+    vext.16 \arg3, \arg3, \arg3, #7 // 0x????, [0][1][2][3][4][5]
+    vrev64.16   \arg1, \arg1
+    vadd.u16    \arg2, \arg1                // C[2+3],B[1+4],A[0+5]
+    vshr.s64    \arg1, \arg2, #16
+    vshr.s64    \arg0, \arg2, #32       // Output: C \arg2, B \arg1, A \arg0
 
-    vsub.s16	\arg0, \arg0, \arg1			//a-b
-    vshr.s16	\arg0, \arg0, #2			//(a-b)/4
-    vsub.s16	\arg0, \arg0, \arg1			//(a-b)/4-b
-    vadd.s16	\arg0, \arg0, \arg2			//(a-b)/4-b+c
-    vshr.s16	\arg0, \arg0, #2			//((a-b)/4-b+c)/4
-    vadd.s16	\arg1, \arg0, \arg2			//((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
-    vqrshrun.s16	\arg0, \arg3, #6		//(+32)>>6
-//	}
+    vsub.s16    \arg0, \arg0, \arg1         //a-b
+    vshr.s16    \arg0, \arg0, #2            //(a-b)/4
+    vsub.s16    \arg0, \arg0, \arg1         //(a-b)/4-b
+    vadd.s16    \arg0, \arg0, \arg2         //(a-b)/4-b+c
+    vshr.s16    \arg0, \arg0, #2            //((a-b)/4-b+c)/4
+    vadd.s16    \arg1, \arg0, \arg2         //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+    vqrshrun.s16    \arg0, \arg3, #6        //(+32)>>6
+//  }
 .endm
 #endif
 
 WELS_ASM_FUNC_BEGIN McHorVer20WidthEq16_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
+    push        {r4}
+    ldr         r4, [sp, #4]
 
-	sub			r0, #2
-	vmov.u16	q14, #0x0014				// 20
-	vshr.u16	q15, q14, #2				// 5
+    sub         r0, #2
+    vmov.u16    q14, #0x0014                // 20
+    vshr.u16    q15, q14, #2                // 5
 
 w16_h_mc_luma_loop:
-	vld1.u8	{d0,d1,d2}, [r0], r1	//only use 21(16+5); q0=src[-2]
-	pld			[r0]
-	pld			[r0, #16]
+    vld1.u8 {d0,d1,d2}, [r0], r1    //only use 21(16+5); q0=src[-2]
+    pld         [r0]
+    pld         [r0, #16]
 
-	vext.8		q2, q0, q1, #1		//q2=src[-1]
-	vext.8		q3, q0, q1, #2		//q3=src[0]
-	vext.8		q8, q0, q1, #3		//q8=src[1]
-	vext.8		q9, q0, q1, #4		//q9=src[2]
-	vext.8		q10, q0, q1, #5		//q10=src[3]
+    vext.8      q2, q0, q1, #1      //q2=src[-1]
+    vext.8      q3, q0, q1, #2      //q3=src[0]
+    vext.8      q8, q0, q1, #3      //q8=src[1]
+    vext.8      q9, q0, q1, #4      //q9=src[2]
+    vext.8      q10, q0, q1, #5     //q10=src[3]
 
-	FILTER_6TAG_8BITS 	d0, d4, d6, d16, d18, d20, d2, q14, q15
+    FILTER_6TAG_8BITS   d0, d4, d6, d16, d18, d20, d2, q14, q15
 
-	FILTER_6TAG_8BITS 	d1, d5, d7, d17, d19, d21, d3, q14, q15
+    FILTER_6TAG_8BITS   d1, d5, d7, d17, d19, d21, d3, q14, q15
 
-	sub		r4, #1
-	vst1.u8	{d2, d3}, [r2], r3		//write 16Byte
+    sub     r4, #1
+    vst1.u8 {d2, d3}, [r2], r3      //write 16Byte
 
-	cmp		r4, #0
-	bne		w16_h_mc_luma_loop
-	pop		{r4}
+    cmp     r4, #0
+    bne     w16_h_mc_luma_loop
+    pop     {r4}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN McHorVer20WidthEq8_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
+    push        {r4}
+    ldr         r4, [sp, #4]
 
-	sub			r0, #2
-	vmov.u16	q14, #0x0014				// 20
-	vshr.u16	q15, q14, #2				// 5
+    sub         r0, #2
+    vmov.u16    q14, #0x0014                // 20
+    vshr.u16    q15, q14, #2                // 5
 
 w8_h_mc_luma_loop:
-	vld1.u8	{d0,d1}, [r0], r1	//only use 13(8+5); q0=src[-2]
-	pld			[r0]
+    vld1.u8 {d0,d1}, [r0], r1   //only use 13(8+5); q0=src[-2]
+    pld         [r0]
 
-	vext.8		d2, d0, d1, #1		//d2=src[-1]
-	vext.8		d3, d0, d1, #2		//d3=src[0]
-	vext.8		d4, d0, d1, #3		//d4=src[1]
-	vext.8		d5, d0, d1, #4		//d5=src[2]
-	vext.8		d6, d0, d1, #5		//d6=src[3]
+    vext.8      d2, d0, d1, #1      //d2=src[-1]
+    vext.8      d3, d0, d1, #2      //d3=src[0]
+    vext.8      d4, d0, d1, #3      //d4=src[1]
+    vext.8      d5, d0, d1, #4      //d5=src[2]
+    vext.8      d6, d0, d1, #5      //d6=src[3]
 
-	FILTER_6TAG_8BITS 	d0, d2, d3, d4, d5, d6, d1, q14, q15
+    FILTER_6TAG_8BITS   d0, d2, d3, d4, d5, d6, d1, q14, q15
 
-	sub		r4, #1
-	vst1.u8	{d1}, [r2], r3
+    sub     r4, #1
+    vst1.u8 {d1}, [r2], r3
 
-	cmp		r4, #0
-	bne		w8_h_mc_luma_loop
-	pop		{r4}
+    cmp     r4, #0
+    bne     w8_h_mc_luma_loop
+    pop     {r4}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN McHorVer20WidthEq4_neon
-	push		{r4, r5, r6}
-	ldr			r6, [sp, #12]
+    push        {r4, r5, r6}
+    ldr         r6, [sp, #12]
 
-	sub			r0, #2
-	vmov.u16	q14, #0x0014				// 20
-	vshr.u16	q15, q14, #2				// 5
+    sub         r0, #2
+    vmov.u16    q14, #0x0014                // 20
+    vshr.u16    q15, q14, #2                // 5
 
 w4_h_mc_luma_loop:
-	vld1.u8	{d0, d1}, [r0], r1	//only use 9(4+5);d0: 1st row src[-2:5]
-	pld			[r0]
-	vld1.u8	{d2, d3}, [r0], r1	//d2: 2nd row src[-2:5]
-	pld			[r0]
+    vld1.u8 {d0, d1}, [r0], r1  //only use 9(4+5);d0: 1st row src[-2:5]
+    pld         [r0]
+    vld1.u8 {d2, d3}, [r0], r1  //d2: 2nd row src[-2:5]
+    pld         [r0]
 
-	vext.8		d4, d0, d1, #1		//d4: 1st row src[-1:6]
-	vext.8		d5, d2, d3, #1		//d5: 2nd row src[-1:6]
-	vext.8		q3, q2, q2, #1		//src[0:6 *]
-	vext.8		q8, q2, q2, #2		//src[1:6 * *]
+    vext.8      d4, d0, d1, #1      //d4: 1st row src[-1:6]
+    vext.8      d5, d2, d3, #1      //d5: 2nd row src[-1:6]
+    vext.8      q3, q2, q2, #1      //src[0:6 *]
+    vext.8      q8, q2, q2, #2      //src[1:6 * *]
 
-	vtrn.32	q3, q8					//q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4]
-	vtrn.32	d6, d7					//d6:[0:3]; d7[1:4]
-	vtrn.32		d0, d2				//d0:[-2:1]; d2[2:5]
-	vtrn.32		d4, d5				//d4:[-1:2]; d5[3:6]
+    vtrn.32 q3, q8                  //q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4]
+    vtrn.32 d6, d7                  //d6:[0:3]; d7[1:4]
+    vtrn.32     d0, d2              //d0:[-2:1]; d2[2:5]
+    vtrn.32     d4, d5              //d4:[-1:2]; d5[3:6]
 
-	FILTER_6TAG_8BITS 	d0, d4, d6, d7, d2, d5, d1, q14, q15
+    FILTER_6TAG_8BITS   d0, d4, d6, d7, d2, d5, d1, q14, q15
 
-	vmov		r4, r5, d1
-	str	r4, [r2], r3
-	str	r5, [r2], r3
+    vmov        r4, r5, d1
+    str r4, [r2], r3
+    str r5, [r2], r3
 
-	sub		r6, #2
-	cmp		r6, #0
-	bne		w4_h_mc_luma_loop
+    sub     r6, #2
+    cmp     r6, #0
+    bne     w4_h_mc_luma_loop
 
-	pop		{r4, r5, r6}
+    pop     {r4, r5, r6}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN McHorVer10WidthEq16_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
+    push        {r4}
+    ldr         r4, [sp, #4]
 
-	sub			r0, #2
-	vmov.u16	q14, #0x0014				// 20
-	vshr.u16	q15, q14, #2				// 5
+    sub         r0, #2
+    vmov.u16    q14, #0x0014                // 20
+    vshr.u16    q15, q14, #2                // 5
 
 w16_xy_10_mc_luma_loop:
-	vld1.u8	{d0,d1,d2}, [r0], r1	//only use 21(16+5); q0=src[-2]
-	pld			[r0]
-	pld			[r0, #16]
+    vld1.u8 {d0,d1,d2}, [r0], r1    //only use 21(16+5); q0=src[-2]
+    pld         [r0]
+    pld         [r0, #16]
 
-	vext.8		q2, q0, q1, #1		//q2=src[-1]
-	vext.8		q3, q0, q1, #2		//q3=src[0]
-	vext.8		q8, q0, q1, #3		//q8=src[1]
-	vext.8		q9, q0, q1, #4		//q9=src[2]
-	vext.8		q10, q0, q1, #5		//q10=src[3]
+    vext.8      q2, q0, q1, #1      //q2=src[-1]
+    vext.8      q3, q0, q1, #2      //q3=src[0]
+    vext.8      q8, q0, q1, #3      //q8=src[1]
+    vext.8      q9, q0, q1, #4      //q9=src[2]
+    vext.8      q10, q0, q1, #5     //q10=src[3]
 
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d0, d4, d6, d16, d18, d20, d2, q14, q15
+    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d0, d4, d6, d16, d18, d20, d2, q14, q15
 
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d1, d5, d7, d17, d19, d21, d3, q14, q15
+    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d1, d5, d7, d17, d19, d21, d3, q14, q15
 
-	sub		r4, #1
-	vst1.u8	{d2, d3}, [r2], r3		//write 16Byte
+    sub     r4, #1
+    vst1.u8 {d2, d3}, [r2], r3      //write 16Byte
 
-	cmp		r4, #0
-	bne		w16_xy_10_mc_luma_loop
-	pop		{r4}
+    cmp     r4, #0
+    bne     w16_xy_10_mc_luma_loop
+    pop     {r4}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN McHorVer10WidthEq8_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
+    push        {r4}
+    ldr         r4, [sp, #4]
 
-	sub			r0, #2
-	vmov.u16	q14, #0x0014				// 20
-	vshr.u16	q15, q14, #2				// 5
+    sub         r0, #2
+    vmov.u16    q14, #0x0014                // 20
+    vshr.u16    q15, q14, #2                // 5
 
 w8_xy_10_mc_luma_loop:
-	vld1.u8	{d0,d1}, [r0], r1	//only use 13(8+5); q0=src[-2]
-	pld			[r0]
+    vld1.u8 {d0,d1}, [r0], r1   //only use 13(8+5); q0=src[-2]
+    pld         [r0]
 
-	vext.8		d2, d0, d1, #1		//d2=src[-1]
-	vext.8		d3, d0, d1, #2		//d3=src[0]
-	vext.8		d4, d0, d1, #3		//d4=src[1]
-	vext.8		d5, d0, d1, #4		//d5=src[2]
-	vext.8		d6, d0, d1, #5		//d6=src[3]
+    vext.8      d2, d0, d1, #1      //d2=src[-1]
+    vext.8      d3, d0, d1, #2      //d3=src[0]
+    vext.8      d4, d0, d1, #3      //d4=src[1]
+    vext.8      d5, d0, d1, #4      //d5=src[2]
+    vext.8      d6, d0, d1, #5      //d6=src[3]
 
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d0, d2, d3, d4, d5, d6, d1, q14, q15
+    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d0, d2, d3, d4, d5, d6, d1, q14, q15
 
-	sub		r4, #1
-	vst1.u8	{d1}, [r2], r3
+    sub     r4, #1
+    vst1.u8 {d1}, [r2], r3
 
-	cmp		r4, #0
-	bne		w8_xy_10_mc_luma_loop
-	pop		{r4}
+    cmp     r4, #0
+    bne     w8_xy_10_mc_luma_loop
+    pop     {r4}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN McHorVer10WidthEq4_neon
-	push		{r4, r5, r6}
-	ldr			r6, [sp, #12]
+    push        {r4, r5, r6}
+    ldr         r6, [sp, #12]
 
-	sub			r0, #2
-	vmov.u16	q14, #0x0014				// 20
-	vshr.u16	q15, q14, #2				// 5
+    sub         r0, #2
+    vmov.u16    q14, #0x0014                // 20
+    vshr.u16    q15, q14, #2                // 5
 
 w4_xy_10_mc_luma_loop:
-	vld1.u8	{d0, d1}, [r0], r1	//only use 9(4+5);d0: 1st row src[-2:5]
-	pld			[r0]
-	vld1.u8	{d2, d3}, [r0], r1	//d2: 2nd row src[-2:5]
-	pld			[r0]
+    vld1.u8 {d0, d1}, [r0], r1  //only use 9(4+5);d0: 1st row src[-2:5]
+    pld         [r0]
+    vld1.u8 {d2, d3}, [r0], r1  //d2: 2nd row src[-2:5]
+    pld         [r0]
 
-	vext.8		d4, d0, d1, #1		//d4: 1st row src[-1:6]
-	vext.8		d5, d2, d3, #1		//d5: 2nd row src[-1:6]
-	vext.8		q3, q2, q2, #1		//src[0:6 *]
-	vext.8		q8, q2, q2, #2		//src[1:6 * *]
+    vext.8      d4, d0, d1, #1      //d4: 1st row src[-1:6]
+    vext.8      d5, d2, d3, #1      //d5: 2nd row src[-1:6]
+    vext.8      q3, q2, q2, #1      //src[0:6 *]
+    vext.8      q8, q2, q2, #2      //src[1:6 * *]
 
-	vtrn.32	q3, q8					//q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4]
-	vtrn.32	d6, d7					//d6:[0:3]; d7[1:4]
-	vtrn.32		d0, d2				//d0:[-2:1]; d2[2:5]
-	vtrn.32		d4, d5				//d4:[-1:2]; d5[3:6]
+    vtrn.32 q3, q8                  //q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4]
+    vtrn.32 d6, d7                  //d6:[0:3]; d7[1:4]
+    vtrn.32     d0, d2              //d0:[-2:1]; d2[2:5]
+    vtrn.32     d4, d5              //d4:[-1:2]; d5[3:6]
 
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d0, d4, d6, d7, d2, d5, d1, q14, q15
+    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d0, d4, d6, d7, d2, d5, d1, q14, q15
 
-	vmov		r4, r5, d1
-	str	r4, [r2], r3
-	str	r5, [r2], r3
+    vmov        r4, r5, d1
+    str r4, [r2], r3
+    str r5, [r2], r3
 
-	sub		r6, #2
-	cmp		r6, #0
-	bne		w4_xy_10_mc_luma_loop
+    sub     r6, #2
+    cmp     r6, #0
+    bne     w4_xy_10_mc_luma_loop
 
-	pop		{r4, r5, r6}
+    pop     {r4, r5, r6}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN McHorVer30WidthEq16_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
+    push        {r4}
+    ldr         r4, [sp, #4]
 
-	sub			r0, #2
-	vmov.u16	q14, #0x0014				// 20
-	vshr.u16	q15, q14, #2				// 5
+    sub         r0, #2
+    vmov.u16    q14, #0x0014                // 20
+    vshr.u16    q15, q14, #2                // 5
 
 w16_xy_30_mc_luma_loop:
-	vld1.u8	{d0,d1,d2}, [r0], r1	//only use 21(16+5); q0=src[-2]
-	pld			[r0]
-	pld			[r0, #16]
+    vld1.u8 {d0,d1,d2}, [r0], r1    //only use 21(16+5); q0=src[-2]
+    pld         [r0]
+    pld         [r0, #16]
 
-	vext.8		q2, q0, q1, #1		//q2=src[-1]
-	vext.8		q3, q0, q1, #2		//q3=src[0]
-	vext.8		q8, q0, q1, #3		//q8=src[1]
-	vext.8		q9, q0, q1, #4		//q9=src[2]
-	vext.8		q10, q0, q1, #5		//q10=src[3]
+    vext.8      q2, q0, q1, #1      //q2=src[-1]
+    vext.8      q3, q0, q1, #2      //q3=src[0]
+    vext.8      q8, q0, q1, #3      //q8=src[1]
+    vext.8      q9, q0, q1, #4      //q9=src[2]
+    vext.8      q10, q0, q1, #5     //q10=src[3]
 
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d0, d4, d6, d16, d18, d20, d2, q14, q15
+    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d0, d4, d6, d16, d18, d20, d2, q14, q15
 
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d1, d5, d7, d17, d19, d21, d3, q14, q15
+    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d1, d5, d7, d17, d19, d21, d3, q14, q15
 
-	sub		r4, #1
-	vst1.u8	{d2, d3}, [r2], r3		//write 16Byte
+    sub     r4, #1
+    vst1.u8 {d2, d3}, [r2], r3      //write 16Byte
 
-	cmp		r4, #0
-	bne		w16_xy_30_mc_luma_loop
-	pop		{r4}
+    cmp     r4, #0
+    bne     w16_xy_30_mc_luma_loop
+    pop     {r4}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN McHorVer30WidthEq8_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
+    push        {r4}
+    ldr         r4, [sp, #4]
 
-	sub			r0, #2
-	vmov.u16	q14, #0x0014				// 20
-	vshr.u16	q15, q14, #2				// 5
+    sub         r0, #2
+    vmov.u16    q14, #0x0014                // 20
+    vshr.u16    q15, q14, #2                // 5
 
 w8_xy_30_mc_luma_loop:
-	vld1.u8	{d0,d1}, [r0], r1	//only use 13(8+5); q0=src[-2]
-	pld			[r0]
+    vld1.u8 {d0,d1}, [r0], r1   //only use 13(8+5); q0=src[-2]
+    pld         [r0]
 
-	vext.8		d2, d0, d1, #1		//d2=src[-1]
-	vext.8		d3, d0, d1, #2		//d3=src[0]
-	vext.8		d4, d0, d1, #3		//d4=src[1]
-	vext.8		d5, d0, d1, #4		//d5=src[2]
-	vext.8		d6, d0, d1, #5		//d6=src[3]
+    vext.8      d2, d0, d1, #1      //d2=src[-1]
+    vext.8      d3, d0, d1, #2      //d3=src[0]
+    vext.8      d4, d0, d1, #3      //d4=src[1]
+    vext.8      d5, d0, d1, #4      //d5=src[2]
+    vext.8      d6, d0, d1, #5      //d6=src[3]
 
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d0, d2, d3, d4, d5, d6, d1, q14, q15
+    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d0, d2, d3, d4, d5, d6, d1, q14, q15
 
-	sub		r4, #1
-	vst1.u8	{d1}, [r2], r3
+    sub     r4, #1
+    vst1.u8 {d1}, [r2], r3
 
-	cmp		r4, #0
-	bne		w8_xy_30_mc_luma_loop
-	pop		{r4}
+    cmp     r4, #0
+    bne     w8_xy_30_mc_luma_loop
+    pop     {r4}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN McHorVer30WidthEq4_neon
-	push		{r4, r5, r6}
-	ldr			r6, [sp, #12]
+    push        {r4, r5, r6}
+    ldr         r6, [sp, #12]
 
-	sub			r0, #2
-	vmov.u16	q14, #0x0014				// 20
-	vshr.u16	q15, q14, #2				// 5
+    sub         r0, #2
+    vmov.u16    q14, #0x0014                // 20
+    vshr.u16    q15, q14, #2                // 5
 
 w4_xy_30_mc_luma_loop:
-	vld1.u8	{d0, d1}, [r0], r1	//only use 9(4+5);d0: 1st row src[-2:5]
-	pld			[r0]
-	vld1.u8	{d2, d3}, [r0], r1	//d2: 2nd row src[-2:5]
-	pld			[r0]
+    vld1.u8 {d0, d1}, [r0], r1  //only use 9(4+5);d0: 1st row src[-2:5]
+    pld         [r0]
+    vld1.u8 {d2, d3}, [r0], r1  //d2: 2nd row src[-2:5]
+    pld         [r0]
 
-	vext.8		d4, d0, d1, #1		//d4: 1st row src[-1:6]
-	vext.8		d5, d2, d3, #1		//d5: 2nd row src[-1:6]
-	vext.8		q3, q2, q2, #1		//src[0:6 *]
-	vext.8		q8, q2, q2, #2		//src[1:6 * *]
+    vext.8      d4, d0, d1, #1      //d4: 1st row src[-1:6]
+    vext.8      d5, d2, d3, #1      //d5: 2nd row src[-1:6]
+    vext.8      q3, q2, q2, #1      //src[0:6 *]
+    vext.8      q8, q2, q2, #2      //src[1:6 * *]
 
-	vtrn.32	q3, q8					//q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4]
-	vtrn.32	d6, d7					//d6:[0:3]; d7[1:4]
-	vtrn.32		d0, d2				//d0:[-2:1]; d2[2:5]
-	vtrn.32		d4, d5				//d4:[-1:2]; d5[3:6]
+    vtrn.32 q3, q8                  //q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4]
+    vtrn.32 d6, d7                  //d6:[0:3]; d7[1:4]
+    vtrn.32     d0, d2              //d0:[-2:1]; d2[2:5]
+    vtrn.32     d4, d5              //d4:[-1:2]; d5[3:6]
 
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d0, d4, d6, d7, d2, d5, d1, q14, q15
+    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d0, d4, d6, d7, d2, d5, d1, q14, q15
 
-	vmov		r4, r5, d1
-	str	r4, [r2], r3
-	str	r5, [r2], r3
+    vmov        r4, r5, d1
+    str r4, [r2], r3
+    str r5, [r2], r3
 
-	sub		r6, #2
-	cmp		r6, #0
-	bne		w4_xy_30_mc_luma_loop
+    sub     r6, #2
+    cmp     r6, #0
+    bne     w4_xy_30_mc_luma_loop
 
-	pop		{r4, r5, r6}
+    pop     {r4, r5, r6}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN McHorVer01WidthEq16_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
+    push        {r4}
+    ldr         r4, [sp, #4]
 
-	sub			r0, r0, r1, lsl #1		//src[-2*src_stride]
-	pld			[r0]
-	pld			[r0, r1]
-	vmov.u16	q14, #0x0014			// 20
-	vld1.u8	{q0}, [r0], r1		//q0=src[-2]
-	vld1.u8	{q1}, [r0], r1		//q1=src[-1]
+    sub         r0, r0, r1, lsl #1      //src[-2*src_stride]
+    pld         [r0]
+    pld         [r0, r1]
+    vmov.u16    q14, #0x0014            // 20
+    vld1.u8 {q0}, [r0], r1      //q0=src[-2]
+    vld1.u8 {q1}, [r0], r1      //q1=src[-1]
 
-	pld			[r0]
-	pld			[r0, r1]
-	vshr.u16	q15, q14, #2			// 5
-	vld1.u8	{q2}, [r0], r1		//q2=src[0]
-	vld1.u8	{q3}, [r0], r1		//q3=src[1]
-	vld1.u8	{q8}, [r0], r1		//q8=src[2]
+    pld         [r0]
+    pld         [r0, r1]
+    vshr.u16    q15, q14, #2            // 5
+    vld1.u8 {q2}, [r0], r1      //q2=src[0]
+    vld1.u8 {q3}, [r0], r1      //q3=src[1]
+    vld1.u8 {q8}, [r0], r1      //q8=src[2]
 
 w16_xy_01_luma_loop:
 
-	vld1.u8	{q9}, [r0], r1		//q9=src[3]
+    vld1.u8 {q9}, [r0], r1      //q9=src[3]
 
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d0, d2, d4, d6, d16, d18, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d1, d3, d5, d7, d17, d19, d21, q14, q15
-	vld1.u8	{q0}, [r0], r1		//read 2nd row
-	vst1.u8	{q10}, [r2], r3			//write 1st 16Byte
+    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d0, d2, d4, d6, d16, d18, d20, q14, q15
+    pld         [r0]
+    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d1, d3, d5, d7, d17, d19, d21, q14, q15
+    vld1.u8 {q0}, [r0], r1      //read 2nd row
+    vst1.u8 {q10}, [r2], r3         //write 1st 16Byte
 
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d2, d4, d6, d16, d18, d0, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d3, d5, d7, d17, d19, d1, d21, q14, q15
-	vld1.u8	{q1}, [r0], r1		//read 3rd row
-	vst1.u8	{q10}, [r2], r3			//write 2nd 16Byte
+    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d2, d4, d6, d16, d18, d0, d20, q14, q15
+    pld         [r0]
+    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d3, d5, d7, d17, d19, d1, d21, q14, q15
+    vld1.u8 {q1}, [r0], r1      //read 3rd row
+    vst1.u8 {q10}, [r2], r3         //write 2nd 16Byte
 
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d4, d6, d16, d18, d0, d2, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d5, d7, d17, d19, d1, d3, d21, q14, q15
-	vld1.u8	{q2}, [r0], r1		//read 4th row
-	vst1.u8	{q10}, [r2], r3			//write 3rd 16Byte
+    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d4, d6, d16, d18, d0, d2, d20, q14, q15
+    pld         [r0]
+    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d5, d7, d17, d19, d1, d3, d21, q14, q15
+    vld1.u8 {q2}, [r0], r1      //read 4th row
+    vst1.u8 {q10}, [r2], r3         //write 3rd 16Byte
 
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d6, d16, d18, d0, d2, d4, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d7, d17, d19, d1, d3, d5, d21, q14, q15
-	vld1.u8	{q3}, [r0], r1		//read 5th row
-	vst1.u8	{q10}, [r2], r3			//write 4th 16Byte
+    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d6, d16, d18, d0, d2, d4, d20, q14, q15
+    pld         [r0]
+    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d7, d17, d19, d1, d3, d5, d21, q14, q15
+    vld1.u8 {q3}, [r0], r1      //read 5th row
+    vst1.u8 {q10}, [r2], r3         //write 4th 16Byte
 
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d16, d18, d0, d2, d4, d6, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d17, d19, d1, d3, d5, d7, d21, q14, q15
-	vld1.u8	{q8}, [r0], r1		//read 6th row
-	vst1.u8	{q10}, [r2], r3			//write 5th 16Byte
+    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d16, d18, d0, d2, d4, d6, d20, q14, q15
+    pld         [r0]
+    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d17, d19, d1, d3, d5, d7, d21, q14, q15
+    vld1.u8 {q8}, [r0], r1      //read 6th row
+    vst1.u8 {q10}, [r2], r3         //write 5th 16Byte
 
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d18, d0, d2, d4, d6, d16, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d19, d1, d3, d5, d7, d17, d21, q14, q15
-	vld1.u8	{q9}, [r0], r1		//read 7th row
-	vst1.u8	{q10}, [r2], r3			//write 6th 16Byte
+    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d18, d0, d2, d4, d6, d16, d20, q14, q15
+    pld         [r0]
+    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d19, d1, d3, d5, d7, d17, d21, q14, q15
+    vld1.u8 {q9}, [r0], r1      //read 7th row
+    vst1.u8 {q10}, [r2], r3         //write 6th 16Byte
 
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d0, d2, d4, d6, d16, d18, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d1, d3, d5, d7, d17, d19, d21, q14, q15
-	vld1.u8	{q0}, [r0], r1		//read 8th row
-	vst1.u8	{q10}, [r2], r3			//write 7th 16Byte
+    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d0, d2, d4, d6, d16, d18, d20, q14, q15
+    pld         [r0]
+    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d1, d3, d5, d7, d17, d19, d21, q14, q15
+    vld1.u8 {q0}, [r0], r1      //read 8th row
+    vst1.u8 {q10}, [r2], r3         //write 7th 16Byte
 
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d2, d4, d6, d16, d18, d0, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d3, d5, d7, d17, d19, d1, d21, q14, q15
-	vst1.u8	{q10}, [r2], r3			//write 8th 16Byte
+    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d2, d4, d6, d16, d18, d0, d20, q14, q15
+    pld         [r0]
+    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d3, d5, d7, d17, d19, d1, d21, q14, q15
+    vst1.u8 {q10}, [r2], r3         //write 8th 16Byte
 
-	//q2, q3, q4, q5, q0 --> q0~q4
-	vswp	q0, q8
-	vswp	q0, q2
-	vmov	q1, q3
-	vmov	q3, q9						//q0~q4
+    //q2, q3, q4, q5, q0 --> q0~q4
+    vswp    q0, q8
+    vswp    q0, q2
+    vmov    q1, q3
+    vmov    q3, q9                      //q0~q4
 
-	sub		r4, #8
-	cmp		r4, #0
-	bne		w16_xy_01_luma_loop
-	pop		{r4}
+    sub     r4, #8
+    cmp     r4, #0
+    bne     w16_xy_01_luma_loop
+    pop     {r4}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN McHorVer01WidthEq8_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
+    push        {r4}
+    ldr         r4, [sp, #4]
 
-	sub			r0, r0, r1, lsl #1		//src[-2*src_stride]
-	pld			[r0]
-	pld			[r0, r1]
-	vmov.u16	q14, #0x0014			// 20
-	vld1.u8	{d0}, [r0], r1		//d0=src[-2]
-	vld1.u8	{d1}, [r0], r1		//d1=src[-1]
+    sub         r0, r0, r1, lsl #1      //src[-2*src_stride]
+    pld         [r0]
+    pld         [r0, r1]
+    vmov.u16    q14, #0x0014            // 20
+    vld1.u8 {d0}, [r0], r1      //d0=src[-2]
+    vld1.u8 {d1}, [r0], r1      //d1=src[-1]
 
-	pld			[r0]
-	pld			[r0, r1]
-	vshr.u16	q15, q14, #2			// 5
-	vld1.u8	{d2}, [r0], r1		//d2=src[0]
-	vld1.u8	{d3}, [r0], r1		//d3=src[1]
+    pld         [r0]
+    pld         [r0, r1]
+    vshr.u16    q15, q14, #2            // 5
+    vld1.u8 {d2}, [r0], r1      //d2=src[0]
+    vld1.u8 {d3}, [r0], r1      //d3=src[1]
 
-	vld1.u8	{d4}, [r0], r1		//d4=src[2]
-	vld1.u8	{d5}, [r0], r1		//d5=src[3]
+    vld1.u8 {d4}, [r0], r1      //d4=src[2]
+    vld1.u8 {d5}, [r0], r1      //d5=src[3]
 
 w8_xy_01_mc_luma_loop:
 
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d0, d1, d2, d3, d4, d5, d16, q14, q15
-	vld1.u8	{d0}, [r0], r1		//read 2nd row
-	vst1.u8	{d16}, [r2], r3		//write 1st 8Byte
+    pld         [r0]
+    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d0, d1, d2, d3, d4, d5, d16, q14, q15
+    vld1.u8 {d0}, [r0], r1      //read 2nd row
+    vst1.u8 {d16}, [r2], r3     //write 1st 8Byte
 
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d1, d2, d3, d4, d5, d0, d16, q14, q15
-	vld1.u8	{d1}, [r0], r1		//read 3rd row
-	vst1.u8	{d16}, [r2], r3		//write 2nd 8Byte
+    pld         [r0]
+    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d1, d2, d3, d4, d5, d0, d16, q14, q15
+    vld1.u8 {d1}, [r0], r1      //read 3rd row
+    vst1.u8 {d16}, [r2], r3     //write 2nd 8Byte
 
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d2, d3, d4, d5, d0, d1, d16, q14, q15
-	vld1.u8	{d2}, [r0], r1		//read 4th row
-	vst1.u8	{d16}, [r2], r3		//write 3rd 8Byte
+    pld         [r0]
+    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d2, d3, d4, d5, d0, d1, d16, q14, q15
+    vld1.u8 {d2}, [r0], r1      //read 4th row
+    vst1.u8 {d16}, [r2], r3     //write 3rd 8Byte
 
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d3, d4, d5, d0, d1, d2, d16, q14, q15
-	vld1.u8	{d3}, [r0], r1		//read 5th row
-	vst1.u8	{d16}, [r2], r3		//write 4th 8Byte
+    pld         [r0]
+    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d3, d4, d5, d0, d1, d2, d16, q14, q15
+    vld1.u8 {d3}, [r0], r1      //read 5th row
+    vst1.u8 {d16}, [r2], r3     //write 4th 8Byte
 
-	//d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
-	vswp	q0, q2
-	vswp	q1, q2
+    //d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
+    vswp    q0, q2
+    vswp    q1, q2
 
-	sub		r4, #4
-	cmp		r4, #0
-	bne		w8_xy_01_mc_luma_loop
+    sub     r4, #4
+    cmp     r4, #0
+    bne     w8_xy_01_mc_luma_loop
 
-	pop		{r4}
+    pop     {r4}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN McHorVer01WidthEq4_neon
-	push		{r4, r5, r6, r7}
-	sub			r0, r0, r1, lsl #1		//src[-2*src_stride]
-	pld			[r0]
-	pld			[r0, r1]
-	vmov.u16	q14, #0x0014			// 20
-	ldr		r4, [r0], r1		//r4=src[-2]
-	ldr		r5, [r0], r1		//r5=src[-1]
+    push        {r4, r5, r6, r7}
+    sub         r0, r0, r1, lsl #1      //src[-2*src_stride]
+    pld         [r0]
+    pld         [r0, r1]
+    vmov.u16    q14, #0x0014            // 20
+    ldr     r4, [r0], r1        //r4=src[-2]
+    ldr     r5, [r0], r1        //r5=src[-1]
 
-	pld			[r0]
-	pld			[r0, r1]
-	vshr.u16	q15, q14, #2			// 5
-	ldr		r6, [r0], r1		//r6=src[0]
-	ldr		r7, [r0], r1		//r7=src[1]
+    pld         [r0]
+    pld         [r0, r1]
+    vshr.u16    q15, q14, #2            // 5
+    ldr     r6, [r0], r1        //r6=src[0]
+    ldr     r7, [r0], r1        //r7=src[1]
 
-	vmov		d0, r4, r5
-	vmov		d1, r5, r6
-	vmov		d2, r6, r7
+    vmov        d0, r4, r5
+    vmov        d1, r5, r6
+    vmov        d2, r6, r7
 
-	ldr		r4, [r0], r1		//r4=src[2]
-	vmov		d3, r7, r4
-	ldr			r7, [sp, #16]
+    ldr     r4, [r0], r1        //r4=src[2]
+    vmov        d3, r7, r4
+    ldr         r7, [sp, #16]
 
 w4_xy_01_mc_luma_loop:
 
-//	pld			[r0]
-	//using reserving r4
-	ldr		r5, [r0], r1		//r5=src[3]
-	ldr		r6, [r0], r1		//r6=src[0]
-	vmov		d4, r4, r5
-	vmov		d5, r5, r6			//reserved r6
+//  pld         [r0]
+    //using reserving r4
+    ldr     r5, [r0], r1        //r5=src[3]
+    ldr     r6, [r0], r1        //r6=src[0]
+    vmov        d4, r4, r5
+    vmov        d5, r5, r6          //reserved r6
 
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d0, d1, d2, d3, d4, d5, d16, q14, q15
-	vmov		r4, r5, d16
-	str	r4, [r2], r3			//write 1st 4Byte
-	str	r5, [r2], r3			//write 2nd 4Byte
+    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d0, d1, d2, d3, d4, d5, d16, q14, q15
+    vmov        r4, r5, d16
+    str r4, [r2], r3            //write 1st 4Byte
+    str r5, [r2], r3            //write 2nd 4Byte
 
-	ldr		r5, [r0], r1		//r5=src[1]
-	ldr		r4, [r0], r1		//r4=src[2]
-	vmov		d0, r6, r5
-	vmov		d1, r5, r4			//reserved r4
+    ldr     r5, [r0], r1        //r5=src[1]
+    ldr     r4, [r0], r1        //r4=src[2]
+    vmov        d0, r6, r5
+    vmov        d1, r5, r4          //reserved r4
 
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d2, d3, d4, d5, d0, d1, d16, q14, q15
-	vmov		r5, r6, d16
-	str	r5, [r2], r3			//write 3rd 4Byte
-	str	r6, [r2], r3			//write 4th 4Byte
+    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d2, d3, d4, d5, d0, d1, d16, q14, q15
+    vmov        r5, r6, d16
+    str r5, [r2], r3            //write 3rd 4Byte
+    str r6, [r2], r3            //write 4th 4Byte
 
-	//d4, d5, d0, d1 --> d0, d1, d2, d3
-	vmov	q1, q0
-	vmov	q0, q2
+    //d4, d5, d0, d1 --> d0, d1, d2, d3
+    vmov    q1, q0
+    vmov    q0, q2
 
-	sub		r7, #4
-	cmp		r7, #0
-	bne		w4_xy_01_mc_luma_loop
+    sub     r7, #4
+    cmp     r7, #0
+    bne     w4_xy_01_mc_luma_loop
 
-	pop		{r4, r5, r6, r7}
+    pop     {r4, r5, r6, r7}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN McHorVer03WidthEq16_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
+    push        {r4}
+    ldr         r4, [sp, #4]
 
-	sub			r0, r0, r1, lsl #1		//src[-2*src_stride]
-	pld			[r0]
-	pld			[r0, r1]
-	vmov.u16	q14, #0x0014			// 20
-	vld1.u8	{q0}, [r0], r1		//q0=src[-2]
-	vld1.u8	{q1}, [r0], r1		//q1=src[-1]
+    sub         r0, r0, r1, lsl #1      //src[-2*src_stride]
+    pld         [r0]
+    pld         [r0, r1]
+    vmov.u16    q14, #0x0014            // 20
+    vld1.u8 {q0}, [r0], r1      //q0=src[-2]
+    vld1.u8 {q1}, [r0], r1      //q1=src[-1]
 
-	pld			[r0]
-	pld			[r0, r1]
-	vshr.u16	q15, q14, #2			// 5
-	vld1.u8	{q2}, [r0], r1		//q2=src[0]
-	vld1.u8	{q3}, [r0], r1		//q3=src[1]
-	vld1.u8	{q8}, [r0], r1		//q8=src[2]
+    pld         [r0]
+    pld         [r0, r1]
+    vshr.u16    q15, q14, #2            // 5
+    vld1.u8 {q2}, [r0], r1      //q2=src[0]
+    vld1.u8 {q3}, [r0], r1      //q3=src[1]
+    vld1.u8 {q8}, [r0], r1      //q8=src[2]
 
 w16_xy_03_luma_loop:
 
-	vld1.u8	{q9}, [r0], r1		//q9=src[3]
+    vld1.u8 {q9}, [r0], r1      //q9=src[3]
 
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d0, d2, d4, d6, d16, d18, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d1, d3, d5, d7, d17, d19, d21, q14, q15
-	vld1.u8	{q0}, [r0], r1		//read 2nd row
-	vst1.u8	{q10}, [r2], r3			//write 1st 16Byte
+    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d0, d2, d4, d6, d16, d18, d20, q14, q15
+    pld         [r0]
+    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d1, d3, d5, d7, d17, d19, d21, q14, q15
+    vld1.u8 {q0}, [r0], r1      //read 2nd row
+    vst1.u8 {q10}, [r2], r3         //write 1st 16Byte
 
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d2, d4, d6, d16, d18, d0, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d3, d5, d7, d17, d19, d1, d21, q14, q15
-	vld1.u8	{q1}, [r0], r1		//read 3rd row
-	vst1.u8	{q10}, [r2], r3			//write 2nd 16Byte
+    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d2, d4, d6, d16, d18, d0, d20, q14, q15
+    pld         [r0]
+    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d3, d5, d7, d17, d19, d1, d21, q14, q15
+    vld1.u8 {q1}, [r0], r1      //read 3rd row
+    vst1.u8 {q10}, [r2], r3         //write 2nd 16Byte
 
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d4, d6, d16, d18, d0, d2, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d5, d7, d17, d19, d1, d3, d21, q14, q15
-	vld1.u8	{q2}, [r0], r1		//read 4th row
-	vst1.u8	{q10}, [r2], r3			//write 3rd 16Byte
+    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d4, d6, d16, d18, d0, d2, d20, q14, q15
+    pld         [r0]
+    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d5, d7, d17, d19, d1, d3, d21, q14, q15
+    vld1.u8 {q2}, [r0], r1      //read 4th row
+    vst1.u8 {q10}, [r2], r3         //write 3rd 16Byte
 
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d6, d16, d18, d0, d2, d4, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d7, d17, d19, d1, d3, d5, d21, q14, q15
-	vld1.u8	{q3}, [r0], r1		//read 5th row
-	vst1.u8	{q10}, [r2], r3			//write 4th 16Byte
+    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d6, d16, d18, d0, d2, d4, d20, q14, q15
+    pld         [r0]
+    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d7, d17, d19, d1, d3, d5, d21, q14, q15
+    vld1.u8 {q3}, [r0], r1      //read 5th row
+    vst1.u8 {q10}, [r2], r3         //write 4th 16Byte
 
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d16, d18, d0, d2, d4, d6, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d17, d19, d1, d3, d5, d7, d21, q14, q15
-	vld1.u8	{q8}, [r0], r1		//read 6th row
-	vst1.u8	{q10}, [r2], r3			//write 5th 16Byte
+    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d16, d18, d0, d2, d4, d6, d20, q14, q15
+    pld         [r0]
+    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d17, d19, d1, d3, d5, d7, d21, q14, q15
+    vld1.u8 {q8}, [r0], r1      //read 6th row
+    vst1.u8 {q10}, [r2], r3         //write 5th 16Byte
 
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d18, d0, d2, d4, d6, d16, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d19, d1, d3, d5, d7, d17, d21, q14, q15
-	vld1.u8	{q9}, [r0], r1		//read 7th row
-	vst1.u8	{q10}, [r2], r3			//write 6th 16Byte
+    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d18, d0, d2, d4, d6, d16, d20, q14, q15
+    pld         [r0]
+    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d19, d1, d3, d5, d7, d17, d21, q14, q15
+    vld1.u8 {q9}, [r0], r1      //read 7th row
+    vst1.u8 {q10}, [r2], r3         //write 6th 16Byte
 
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d0, d2, d4, d6, d16, d18, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d1, d3, d5, d7, d17, d19, d21, q14, q15
-	vld1.u8	{q0}, [r0], r1		//read 8th row
-	vst1.u8	{q10}, [r2], r3			//write 7th 16Byte
+    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d0, d2, d4, d6, d16, d18, d20, q14, q15
+    pld         [r0]
+    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d1, d3, d5, d7, d17, d19, d21, q14, q15
+    vld1.u8 {q0}, [r0], r1      //read 8th row
+    vst1.u8 {q10}, [r2], r3         //write 7th 16Byte
 
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d2, d4, d6, d16, d18, d0, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d3, d5, d7, d17, d19, d1, d21, q14, q15
-	vst1.u8	{q10}, [r2], r3			//write 8th 16Byte
+    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d2, d4, d6, d16, d18, d0, d20, q14, q15
+    pld         [r0]
+    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d3, d5, d7, d17, d19, d1, d21, q14, q15
+    vst1.u8 {q10}, [r2], r3         //write 8th 16Byte
 
-	//q2, q3, q8, q9, q0 --> q0~q8
-	vswp	q0, q8
-	vswp	q0, q2
-	vmov	q1, q3
-	vmov	q3, q9						//q0~q8
+    //q2, q3, q8, q9, q0 --> q0~q8
+    vswp    q0, q8
+    vswp    q0, q2
+    vmov    q1, q3
+    vmov    q3, q9                      //q0~q8
 
-	sub		r4, #8
-	cmp		r4, #0
-	bne		w16_xy_03_luma_loop
-	pop		{r4}
+    sub     r4, #8
+    cmp     r4, #0
+    bne     w16_xy_03_luma_loop
+    pop     {r4}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN McHorVer03WidthEq8_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
+    push        {r4}
+    ldr         r4, [sp, #4]
 
-	sub			r0, r0, r1, lsl #1		//src[-2*src_stride]
-	pld			[r0]
-	pld			[r0, r1]
-	vmov.u16	q14, #0x0014			// 20
-	vld1.u8	{d0}, [r0], r1		//d0=src[-2]
-	vld1.u8	{d1}, [r0], r1		//d1=src[-1]
+    sub         r0, r0, r1, lsl #1      //src[-2*src_stride]
+    pld         [r0]
+    pld         [r0, r1]
+    vmov.u16    q14, #0x0014            // 20
+    vld1.u8 {d0}, [r0], r1      //d0=src[-2]
+    vld1.u8 {d1}, [r0], r1      //d1=src[-1]
 
-	pld			[r0]
-	pld			[r0, r1]
-	vshr.u16	q15, q14, #2			// 5
-	vld1.u8	{d2}, [r0], r1		//d2=src[0]
-	vld1.u8	{d3}, [r0], r1		//d3=src[1]
+    pld         [r0]
+    pld         [r0, r1]
+    vshr.u16    q15, q14, #2            // 5
+    vld1.u8 {d2}, [r0], r1      //d2=src[0]
+    vld1.u8 {d3}, [r0], r1      //d3=src[1]
 
-	vld1.u8	{d4}, [r0], r1		//d4=src[2]
-	vld1.u8	{d5}, [r0], r1		//d5=src[3]
+    vld1.u8 {d4}, [r0], r1      //d4=src[2]
+    vld1.u8 {d5}, [r0], r1      //d5=src[3]
 
 w8_xy_03_mc_luma_loop:
 
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d0, d1, d2, d3, d4, d5, d16, q14, q15
-	vld1.u8	{d0}, [r0], r1		//read 2nd row
-	vst1.u8	{d16}, [r2], r3		//write 1st 8Byte
+    pld         [r0]
+    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d0, d1, d2, d3, d4, d5, d16, q14, q15
+    vld1.u8 {d0}, [r0], r1      //read 2nd row
+    vst1.u8 {d16}, [r2], r3     //write 1st 8Byte
 
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d1, d2, d3, d4, d5, d0, d16, q14, q15
-	vld1.u8	{d1}, [r0], r1		//read 3rd row
-	vst1.u8	{d16}, [r2], r3		//write 2nd 8Byte
+    pld         [r0]
+    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d1, d2, d3, d4, d5, d0, d16, q14, q15
+    vld1.u8 {d1}, [r0], r1      //read 3rd row
+    vst1.u8 {d16}, [r2], r3     //write 2nd 8Byte
 
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d2, d3, d4, d5, d0, d1, d16, q14, q15
-	vld1.u8	{d2}, [r0], r1		//read 4th row
-	vst1.u8	{d16}, [r2], r3		//write 3rd 8Byte
+    pld         [r0]
+    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d2, d3, d4, d5, d0, d1, d16, q14, q15
+    vld1.u8 {d2}, [r0], r1      //read 4th row
+    vst1.u8 {d16}, [r2], r3     //write 3rd 8Byte
 
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d3, d4, d5, d0, d1, d2, d16, q14, q15
-	vld1.u8	{d3}, [r0], r1		//read 5th row
-	vst1.u8	{d16}, [r2], r3		//write 4th 8Byte
+    pld         [r0]
+    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d3, d4, d5, d0, d1, d2, d16, q14, q15
+    vld1.u8 {d3}, [r0], r1      //read 5th row
+    vst1.u8 {d16}, [r2], r3     //write 4th 8Byte
 
-	//d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
-	vswp	q0, q2
-	vswp	q1, q2
+    //d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
+    vswp    q0, q2
+    vswp    q1, q2
 
-	sub		r4, #4
-	cmp		r4, #0
-	bne		w8_xy_03_mc_luma_loop
+    sub     r4, #4
+    cmp     r4, #0
+    bne     w8_xy_03_mc_luma_loop
 
-	pop		{r4}
+    pop     {r4}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN McHorVer03WidthEq4_neon
-	push		{r4, r5, r6, r7}
-	sub			r0, r0, r1, lsl #1		//src[-2*src_stride]
-	pld			[r0]
-	pld			[r0, r1]
-	vmov.u16	q14, #0x0014			// 20
-	ldr		r4, [r0], r1		//r4=src[-2]
-	ldr		r5, [r0], r1		//r5=src[-1]
+    push        {r4, r5, r6, r7}
+    sub         r0, r0, r1, lsl #1      //src[-2*src_stride]
+    pld         [r0]
+    pld         [r0, r1]
+    vmov.u16    q14, #0x0014            // 20
+    ldr     r4, [r0], r1        //r4=src[-2]
+    ldr     r5, [r0], r1        //r5=src[-1]
 
-	pld			[r0]
-	pld			[r0, r1]
-	vshr.u16	q15, q14, #2			// 5
-	ldr		r6, [r0], r1		//r6=src[0]
-	ldr		r7, [r0], r1		//r7=src[1]
+    pld         [r0]
+    pld         [r0, r1]
+    vshr.u16    q15, q14, #2            // 5
+    ldr     r6, [r0], r1        //r6=src[0]
+    ldr     r7, [r0], r1        //r7=src[1]
 
-	vmov		d0, r4, r5
-	vmov		d1, r5, r6
-	vmov		d2, r6, r7
+    vmov        d0, r4, r5
+    vmov        d1, r5, r6
+    vmov        d2, r6, r7
 
-	ldr		r4, [r0], r1		//r4=src[2]
-	vmov		d3, r7, r4
-	ldr			r7, [sp, #16]
+    ldr     r4, [r0], r1        //r4=src[2]
+    vmov        d3, r7, r4
+    ldr         r7, [sp, #16]
 
 w4_xy_03_mc_luma_loop:
 
-//	pld			[r0]
-	//using reserving r4
-	ldr		r5, [r0], r1		//r5=src[3]
-	ldr		r6, [r0], r1		//r6=src[0]
-	vmov		d4, r4, r5
-	vmov		d5, r5, r6			//reserved r6
+//  pld         [r0]
+    //using reserving r4
+    ldr     r5, [r0], r1        //r5=src[3]
+    ldr     r6, [r0], r1        //r6=src[0]
+    vmov        d4, r4, r5
+    vmov        d5, r5, r6          //reserved r6
 
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d0, d1, d2, d3, d4, d5, d16, q14, q15
-	vmov		r4, r5, d16
-	str	r4, [r2], r3			//write 1st 4Byte
-	str	r5, [r2], r3			//write 2nd 4Byte
+    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d0, d1, d2, d3, d4, d5, d16, q14, q15
+    vmov        r4, r5, d16
+    str r4, [r2], r3            //write 1st 4Byte
+    str r5, [r2], r3            //write 2nd 4Byte
 
-	ldr		r5, [r0], r1		//r5=src[1]
-	ldr		r4, [r0], r1		//r4=src[2]
-	vmov		d0, r6, r5
-	vmov		d1, r5, r4			//reserved r4
+    ldr     r5, [r0], r1        //r5=src[1]
+    ldr     r4, [r0], r1        //r4=src[2]
+    vmov        d0, r6, r5
+    vmov        d1, r5, r4          //reserved r4
 
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d2, d3, d4, d5, d0, d1, d16, q14, q15
-	vmov		r5, r6, d16
-	str	r5, [r2], r3			//write 3rd 4Byte
-	str	r6, [r2], r3			//write 4th 4Byte
+    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d2, d3, d4, d5, d0, d1, d16, q14, q15
+    vmov        r5, r6, d16
+    str r5, [r2], r3            //write 3rd 4Byte
+    str r6, [r2], r3            //write 4th 4Byte
 
-	//d4, d5, d0, d1 --> d0, d1, d2, d3
-	vmov	q1, q0
-	vmov	q0, q2
+    //d4, d5, d0, d1 --> d0, d1, d2, d3
+    vmov    q1, q0
+    vmov    q0, q2
 
-	sub		r7, #4
-	cmp		r7, #0
-	bne		w4_xy_03_mc_luma_loop
+    sub     r7, #4
+    cmp     r7, #0
+    bne     w4_xy_03_mc_luma_loop
 
-	pop		{r4, r5, r6, r7}
+    pop     {r4, r5, r6, r7}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN McHorVer02WidthEq16_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
+    push        {r4}
+    ldr         r4, [sp, #4]
 
-	sub			r0, r0, r1, lsl #1		//src[-2*src_stride]
-	pld			[r0]
-	pld			[r0, r1]
-	vmov.u16	q14, #0x0014			// 20
-	vld1.u8	{q0}, [r0], r1		//q0=src[-2]
-	vld1.u8	{q1}, [r0], r1		//q1=src[-1]
+    sub         r0, r0, r1, lsl #1      //src[-2*src_stride]
+    pld         [r0]
+    pld         [r0, r1]
+    vmov.u16    q14, #0x0014            // 20
+    vld1.u8 {q0}, [r0], r1      //q0=src[-2]
+    vld1.u8 {q1}, [r0], r1      //q1=src[-1]
 
-	pld			[r0]
-	pld			[r0, r1]
-	vshr.u16	q15, q14, #2			// 5
-	vld1.u8	{q2}, [r0], r1		//q2=src[0]
-	vld1.u8	{q3}, [r0], r1		//q3=src[1]
-	vld1.u8	{q8}, [r0], r1		//q8=src[2]
+    pld         [r0]
+    pld         [r0, r1]
+    vshr.u16    q15, q14, #2            // 5
+    vld1.u8 {q2}, [r0], r1      //q2=src[0]
+    vld1.u8 {q3}, [r0], r1      //q3=src[1]
+    vld1.u8 {q8}, [r0], r1      //q8=src[2]
 
 w16_v_mc_luma_loop:
 
-	vld1.u8	{q9}, [r0], r1		//q9=src[3]
+    vld1.u8 {q9}, [r0], r1      //q9=src[3]
 
-	FILTER_6TAG_8BITS 	d0, d2, d4, d6, d16, d18, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d1, d3, d5, d7, d17, d19, d21, q14, q15
-	vld1.u8	{q0}, [r0], r1		//read 2nd row
-	vst1.u8	{q10}, [r2], r3			//write 1st 16Byte
+    FILTER_6TAG_8BITS   d0, d2, d4, d6, d16, d18, d20, q14, q15
+    pld         [r0]
+    FILTER_6TAG_8BITS   d1, d3, d5, d7, d17, d19, d21, q14, q15
+    vld1.u8 {q0}, [r0], r1      //read 2nd row
+    vst1.u8 {q10}, [r2], r3         //write 1st 16Byte
 
-	FILTER_6TAG_8BITS 	d2, d4, d6, d16, d18, d0, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d3, d5, d7, d17, d19, d1, d21, q14, q15
-	vld1.u8	{q1}, [r0], r1		//read 3rd row
-	vst1.u8	{q10}, [r2], r3			//write 2nd 16Byte
+    FILTER_6TAG_8BITS   d2, d4, d6, d16, d18, d0, d20, q14, q15
+    pld         [r0]
+    FILTER_6TAG_8BITS   d3, d5, d7, d17, d19, d1, d21, q14, q15
+    vld1.u8 {q1}, [r0], r1      //read 3rd row
+    vst1.u8 {q10}, [r2], r3         //write 2nd 16Byte
 
-	FILTER_6TAG_8BITS 	d4, d6, d16, d18, d0, d2, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d5, d7, d17, d19, d1, d3, d21, q14, q15
-	vld1.u8	{q2}, [r0], r1		//read 4th row
-	vst1.u8	{q10}, [r2], r3			//write 3rd 16Byte
+    FILTER_6TAG_8BITS   d4, d6, d16, d18, d0, d2, d20, q14, q15
+    pld         [r0]
+    FILTER_6TAG_8BITS   d5, d7, d17, d19, d1, d3, d21, q14, q15
+    vld1.u8 {q2}, [r0], r1      //read 4th row
+    vst1.u8 {q10}, [r2], r3         //write 3rd 16Byte
 
-	FILTER_6TAG_8BITS 	d6, d16, d18, d0, d2, d4, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d7, d17, d19, d1, d3, d5, d21, q14, q15
-	vld1.u8	{q3}, [r0], r1		//read 5th row
-	vst1.u8	{q10}, [r2], r3			//write 4th 16Byte
+    FILTER_6TAG_8BITS   d6, d16, d18, d0, d2, d4, d20, q14, q15
+    pld         [r0]
+    FILTER_6TAG_8BITS   d7, d17, d19, d1, d3, d5, d21, q14, q15
+    vld1.u8 {q3}, [r0], r1      //read 5th row
+    vst1.u8 {q10}, [r2], r3         //write 4th 16Byte
 
-	FILTER_6TAG_8BITS 	d16, d18, d0, d2, d4, d6, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d17, d19, d1, d3, d5, d7, d21, q14, q15
-	vld1.u8	{q8}, [r0], r1		//read 6th row
-	vst1.u8	{q10}, [r2], r3			//write 5th 16Byte
+    FILTER_6TAG_8BITS   d16, d18, d0, d2, d4, d6, d20, q14, q15
+    pld         [r0]
+    FILTER_6TAG_8BITS   d17, d19, d1, d3, d5, d7, d21, q14, q15
+    vld1.u8 {q8}, [r0], r1      //read 6th row
+    vst1.u8 {q10}, [r2], r3         //write 5th 16Byte
 
-	FILTER_6TAG_8BITS 	d18, d0, d2, d4, d6, d16, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d19, d1, d3, d5, d7, d17, d21, q14, q15
-	vld1.u8	{q9}, [r0], r1		//read 7th row
-	vst1.u8	{q10}, [r2], r3			//write 6th 16Byte
+    FILTER_6TAG_8BITS   d18, d0, d2, d4, d6, d16, d20, q14, q15
+    pld         [r0]
+    FILTER_6TAG_8BITS   d19, d1, d3, d5, d7, d17, d21, q14, q15
+    vld1.u8 {q9}, [r0], r1      //read 7th row
+    vst1.u8 {q10}, [r2], r3         //write 6th 16Byte
 
-	FILTER_6TAG_8BITS 	d0, d2, d4, d6, d16, d18, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d1, d3, d5, d7, d17, d19, d21, q14, q15
-	vld1.u8	{q0}, [r0], r1		//read 8th row
-	vst1.u8	{q10}, [r2], r3			//write 7th 16Byte
+    FILTER_6TAG_8BITS   d0, d2, d4, d6, d16, d18, d20, q14, q15
+    pld         [r0]
+    FILTER_6TAG_8BITS   d1, d3, d5, d7, d17, d19, d21, q14, q15
+    vld1.u8 {q0}, [r0], r1      //read 8th row
+    vst1.u8 {q10}, [r2], r3         //write 7th 16Byte
 
-	FILTER_6TAG_8BITS 	d2, d4, d6, d16, d18, d0, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d3, d5, d7, d17, d19, d1, d21, q14, q15
-	vst1.u8	{q10}, [r2], r3			//write 8th 16Byte
+    FILTER_6TAG_8BITS   d2, d4, d6, d16, d18, d0, d20, q14, q15
+    pld         [r0]
+    FILTER_6TAG_8BITS   d3, d5, d7, d17, d19, d1, d21, q14, q15
+    vst1.u8 {q10}, [r2], r3         //write 8th 16Byte
 
-	//q2, q3, q8, q9, q0 --> q0~q8
-	vswp	q0, q8
-	vswp	q0, q2
-	vmov	q1, q3
-	vmov	q3, q9						//q0~q8
+    //q2, q3, q8, q9, q0 --> q0~q8
+    vswp    q0, q8
+    vswp    q0, q2
+    vmov    q1, q3
+    vmov    q3, q9                      //q0~q8
 
-	sub		r4, #8
-	cmp		r4, #0
-	bne		w16_v_mc_luma_loop
-	pop		{r4}
+    sub     r4, #8
+    cmp     r4, #0
+    bne     w16_v_mc_luma_loop
+    pop     {r4}
 WELS_ASM_FUNC_END
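
McHorVer02WidthEq16_neon is the plain vertical half-pel case: q14/q15 hold the middle taps 20 and 5, each FILTER_6TAG_8BITS turns a six-row window into one output row, and the vswp/vmov block at the end of the loop slides that window past the eight rows just written. A scalar sketch of the same sliding-window idea (helper names ours; filter semantics assumed from the 20/5 constants and the standard half-sample filter):

    #include <stdint.h>

    static uint8_t Clip255(int v) { return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v); }

    /* One output row from a six-row window, as FILTER_6TAG_8BITS is assumed to do. */
    static void FilterRow6Tap(const uint8_t* w[6], uint8_t* dst, int width) {
        for (int x = 0; x < width; x++) {
            int s = w[0][x] - 5 * w[1][x] + 20 * w[2][x]
                  + 20 * w[3][x] - 5 * w[4][x] + w[5][x];
            dst[x] = Clip255((s + 16) >> 5);
        }
    }

    /* Vertical half-pel MC: keep six row pointers, emit one row, slide the
     * window down by one.  The NEON loop does the same with registers and
     * only rotates the window once per eight output rows. */
    void McVer02_ref(const uint8_t* src, int src_stride,
                     uint8_t* dst, int dst_stride, int width, int height) {
        const uint8_t* w[6];
        for (int i = 0; i < 6; i++) w[i] = src + (i - 2) * src_stride;
        for (int y = 0; y < height; y++) {
            FilterRow6Tap(w, dst + y * dst_stride, width);
            for (int i = 0; i < 5; i++) w[i] = w[i + 1];
            w[5] += src_stride;
        }
    }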
 
 
 WELS_ASM_FUNC_BEGIN McHorVer02WidthEq8_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
+    push        {r4}
+    ldr         r4, [sp, #4]
 
-	sub			r0, r0, r1, lsl #1		//src[-2*src_stride]
-	pld			[r0]
-	pld			[r0, r1]
-	vmov.u16	q14, #0x0014			// 20
-	vld1.u8	{d0}, [r0], r1		//d0=src[-2]
-	vld1.u8	{d1}, [r0], r1		//d1=src[-1]
+    sub         r0, r0, r1, lsl #1      //src[-2*src_stride]
+    pld         [r0]
+    pld         [r0, r1]
+    vmov.u16    q14, #0x0014            // 20
+    vld1.u8 {d0}, [r0], r1      //d0=src[-2]
+    vld1.u8 {d1}, [r0], r1      //d1=src[-1]
 
-	pld			[r0]
-	pld			[r0, r1]
-	vshr.u16	q15, q14, #2			// 5
-	vld1.u8	{d2}, [r0], r1		//d2=src[0]
-	vld1.u8	{d3}, [r0], r1		//d3=src[1]
+    pld         [r0]
+    pld         [r0, r1]
+    vshr.u16    q15, q14, #2            // 5
+    vld1.u8 {d2}, [r0], r1      //d2=src[0]
+    vld1.u8 {d3}, [r0], r1      //d3=src[1]
 
-	vld1.u8	{d4}, [r0], r1		//d4=src[2]
-	vld1.u8	{d5}, [r0], r1		//d5=src[3]
+    vld1.u8 {d4}, [r0], r1      //d4=src[2]
+    vld1.u8 {d5}, [r0], r1      //d5=src[3]
 
 w8_v_mc_luma_loop:
 
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d0, d1, d2, d3, d4, d5, d16, q14, q15
-	vld1.u8	{d0}, [r0], r1		//read 2nd row
-	vst1.u8	{d16}, [r2], r3		//write 1st 8Byte
+    pld         [r0]
+    FILTER_6TAG_8BITS   d0, d1, d2, d3, d4, d5, d16, q14, q15
+    vld1.u8 {d0}, [r0], r1      //read 2nd row
+    vst1.u8 {d16}, [r2], r3     //write 1st 8Byte
 
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d1, d2, d3, d4, d5, d0, d16, q14, q15
-	vld1.u8	{d1}, [r0], r1		//read 3rd row
-	vst1.u8	{d16}, [r2], r3		//write 2nd 8Byte
+    pld         [r0]
+    FILTER_6TAG_8BITS   d1, d2, d3, d4, d5, d0, d16, q14, q15
+    vld1.u8 {d1}, [r0], r1      //read 3rd row
+    vst1.u8 {d16}, [r2], r3     //write 2nd 8Byte
 
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d2, d3, d4, d5, d0, d1, d16, q14, q15
-	vld1.u8	{d2}, [r0], r1		//read 4th row
-	vst1.u8	{d16}, [r2], r3		//write 3rd 8Byte
+    pld         [r0]
+    FILTER_6TAG_8BITS   d2, d3, d4, d5, d0, d1, d16, q14, q15
+    vld1.u8 {d2}, [r0], r1      //read 4th row
+    vst1.u8 {d16}, [r2], r3     //write 3rd 8Byte
 
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d3, d4, d5, d0, d1, d2, d16, q14, q15
-	vld1.u8	{d3}, [r0], r1		//read 5th row
-	vst1.u8	{d16}, [r2], r3		//write 4th 8Byte
+    pld         [r0]
+    FILTER_6TAG_8BITS   d3, d4, d5, d0, d1, d2, d16, q14, q15
+    vld1.u8 {d3}, [r0], r1      //read 5th row
+    vst1.u8 {d16}, [r2], r3     //write 4th 8Byte
 
-	//d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
-	vswp	q0, q2
-	vswp	q1, q2
+    //d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
+    vswp    q0, q2
+    vswp    q1, q2
 
-	sub		r4, #4
-	cmp		r4, #0
-	bne		w8_v_mc_luma_loop
+    sub     r4, #4
+    cmp     r4, #0
+    bne     w8_v_mc_luma_loop
 
-	pop		{r4}
+    pop     {r4}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN McHorVer02WidthEq4_neon
-	push		{r4, r5, r6, r7}
-	sub			r0, r0, r1, lsl #1		//src[-2*src_stride]
-	pld			[r0]
-	pld			[r0, r1]
-	vmov.u16	q14, #0x0014			// 20
-	ldr		r4, [r0], r1		//r4=src[-2]
-	ldr		r5, [r0], r1		//r5=src[-1]
+    push        {r4, r5, r6, r7}
+    sub         r0, r0, r1, lsl #1      //src[-2*src_stride]
+    pld         [r0]
+    pld         [r0, r1]
+    vmov.u16    q14, #0x0014            // 20
+    ldr     r4, [r0], r1        //r4=src[-2]
+    ldr     r5, [r0], r1        //r5=src[-1]
 
-	pld			[r0]
-	pld			[r0, r1]
-	vshr.u16	q15, q14, #2			// 5
-	ldr		r6, [r0], r1		//r6=src[0]
-	ldr		r7, [r0], r1		//r7=src[1]
+    pld         [r0]
+    pld         [r0, r1]
+    vshr.u16    q15, q14, #2            // 5
+    ldr     r6, [r0], r1        //r6=src[0]
+    ldr     r7, [r0], r1        //r7=src[1]
 
-	vmov		d0, r4, r5
-	vmov		d1, r5, r6
-	vmov		d2, r6, r7
+    vmov        d0, r4, r5
+    vmov        d1, r5, r6
+    vmov        d2, r6, r7
 
-	ldr		r4, [r0], r1		//r4=src[2]
-	vmov		d3, r7, r4
-	ldr			r7, [sp, #16]
+    ldr     r4, [r0], r1        //r4=src[2]
+    vmov        d3, r7, r4
+    ldr         r7, [sp, #16]
 
 w4_v_mc_luma_loop:
 
-//	pld			[r0]
-	//using reserving r4
-	ldr		r5, [r0], r1		//r5=src[3]
-	ldr		r6, [r0], r1		//r6=src[0]
-	vmov		d4, r4, r5
-	vmov		d5, r5, r6			//reserved r6
+//  pld         [r0]
+    //using reserving r4
+    ldr     r5, [r0], r1        //r5=src[3]
+    ldr     r6, [r0], r1        //r6=src[0]
+    vmov        d4, r4, r5
+    vmov        d5, r5, r6          //reserved r6
 
-	FILTER_6TAG_8BITS 	d0, d1, d2, d3, d4, d5, d16, q14, q15
-	vmov		r4, r5, d16
-	str	r4, [r2], r3			//write 1st 4Byte
-	str	r5, [r2], r3			//write 2nd 4Byte
+    FILTER_6TAG_8BITS   d0, d1, d2, d3, d4, d5, d16, q14, q15
+    vmov        r4, r5, d16
+    str r4, [r2], r3            //write 1st 4Byte
+    str r5, [r2], r3            //write 2nd 4Byte
 
-	ldr		r5, [r0], r1		//r5=src[1]
-	ldr		r4, [r0], r1		//r4=src[2]
-	vmov		d0, r6, r5
-	vmov		d1, r5, r4			//reserved r4
+    ldr     r5, [r0], r1        //r5=src[1]
+    ldr     r4, [r0], r1        //r4=src[2]
+    vmov        d0, r6, r5
+    vmov        d1, r5, r4          //reserved r4
 
-	FILTER_6TAG_8BITS 	d2, d3, d4, d5, d0, d1, d16, q14, q15
-	vmov		r5, r6, d16
-	str	r5, [r2], r3			//write 3rd 4Byte
-	str	r6, [r2], r3			//write 4th 4Byte
+    FILTER_6TAG_8BITS   d2, d3, d4, d5, d0, d1, d16, q14, q15
+    vmov        r5, r6, d16
+    str r5, [r2], r3            //write 3rd 4Byte
+    str r6, [r2], r3            //write 4th 4Byte
 
-	//d4, d5, d0, d1 --> d0, d1, d2, d3
-	vmov	q1, q0
-	vmov	q0, q2
+    //d4, d5, d0, d1 --> d0, d1, d2, d3
+    vmov    q1, q0
+    vmov    q0, q2
 
-	sub		r7, #4
-	cmp		r7, #0
-	bne		w4_v_mc_luma_loop
+    sub     r7, #4
+    cmp     r7, #0
+    bne     w4_v_mc_luma_loop
 
-	pop		{r4, r5, r6, r7}
+    pop     {r4, r5, r6, r7}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN McHorVer22WidthEq16_neon
-	push		{r4}
-	vpush		{q4-q7}
-	ldr			r4, [sp, #68]
+    push        {r4}
+    vpush       {q4-q7}
+    ldr         r4, [sp, #68]
 
-	sub			r0, #2					//src[-2]
-	sub			r0, r0, r1, lsl #1		//src[-2*src_stride-2]
-	pld			[r0]
-	pld			[r0, r1]
+    sub         r0, #2                  //src[-2]
+    sub         r0, r0, r1, lsl #1      //src[-2*src_stride-2]
+    pld         [r0]
+    pld         [r0, r1]
 
-	vmov.u16	q14, #0x0014			// 20
-	vld1.u8	{d0-d2}, [r0], r1		//use 21(16+5), =src[-2]
-	vld1.u8	{d3-d5}, [r0], r1		//use 21(16+5), =src[-1]
+    vmov.u16    q14, #0x0014            // 20
+    vld1.u8 {d0-d2}, [r0], r1       //use 21(16+5), =src[-2]
+    vld1.u8 {d3-d5}, [r0], r1       //use 21(16+5), =src[-1]
 
-	pld			[r0]
-	pld			[r0, r1]
-	vshr.u16	q15, q14, #2			// 5
+    pld         [r0]
+    pld         [r0, r1]
+    vshr.u16    q15, q14, #2            // 5
 
-	vld1.u8	{d6-d8}, [r0], r1		//use 21(16+5), =src[0]
-	vld1.u8	{d9-d11}, [r0], r1	//use 21(16+5), =src[1]
-	pld			[r0]
-	pld			[r0, r1]
-	vld1.u8	{d12-d14}, [r0], r1	//use 21(16+5), =src[2]
+    vld1.u8 {d6-d8}, [r0], r1       //use 21(16+5), =src[0]
+    vld1.u8 {d9-d11}, [r0], r1  //use 21(16+5), =src[1]
+    pld         [r0]
+    pld         [r0, r1]
+    vld1.u8 {d12-d14}, [r0], r1 //use 21(16+5), =src[2]
 
 w16_hv_mc_luma_loop:
 
-	vld1.u8	{d15-d17}, [r0], r1	//use 21(16+5), =src[3]
-	//the 1st row
-	pld			[r0]
-	// vertical filtered into q9/q10
-	FILTER_6TAG_8BITS_TO_16BITS 	d0, d3, d6, d9, d12, d15, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d1, d4, d7,d10, d13, d16,q10, q14, q15	// 8 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0	//output to q0[0]
+    vld1.u8 {d15-d17}, [r0], r1 //use 21(16+5), =src[3]
+    //the 1st row
+    pld         [r0]
+    // vertical filtered into q9/q10
+    FILTER_6TAG_8BITS_TO_16BITS     d0, d3, d6, d9, d12, d15, q9, q14, q15  // 8 avail
+    FILTER_6TAG_8BITS_TO_16BITS     d1, d4, d7,d10, d13, d16,q10, q14, q15  // 8 avail
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC  q9, q10, q11, q12, q13
+    FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0   //output to q0[0]
 
-	// vertical filtered into q10/q11
-	FILTER_6TAG_8BITS_TO_16BITS 	d2, d5, d8,d11, d14, d17,q11, q14, q15	// only 5 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q10, q11, q9, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1	//output to q0[1]
-	vst1.u8	{q0}, [r2], r3		//write 16Byte
+    // vertical filtered into q10/q11
+    FILTER_6TAG_8BITS_TO_16BITS     d2, d5, d8,d11, d14, d17,q11, q14, q15  // only 5 avail
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC  q10, q11, q9, q12, q13
+    FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1    //output to q0[1]
+    vst1.u8 {q0}, [r2], r3      //write 16Byte
 
 
-	vld1.u8	{d0-d2}, [r0], r1		//read 2nd row
-	//the 2nd row
-	pld			[r0]
-	// vertical filtered into q9/q10
-	FILTER_6TAG_8BITS_TO_16BITS 	d3, d6, d9, d12, d15, d0, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d4, d7,d10, d13, d16, d1,q10, q14, q15	// 8 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d3	//output to d3
+    vld1.u8 {d0-d2}, [r0], r1       //read 2nd row
+    //the 2nd row
+    pld         [r0]
+    // vertical filtered into q9/q10
+    FILTER_6TAG_8BITS_TO_16BITS     d3, d6, d9, d12, d15, d0, q9, q14, q15  // 8 avail
+    FILTER_6TAG_8BITS_TO_16BITS     d4, d7,d10, d13, d16, d1,q10, q14, q15  // 8 avail
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC  q9, q10, q11, q12, q13
+    FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d3   //output to d3
 
-	// vertical filtered into q10/q11
-	FILTER_6TAG_8BITS_TO_16BITS 	d5, d8,d11, d14, d17, d2,q11, q14, q15	// only 5 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q10, q11, q9, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d4	//output to d4
+    // vertical filtered into q10/q11
+    FILTER_6TAG_8BITS_TO_16BITS     d5, d8,d11, d14, d17, d2,q11, q14, q15  // only 5 avail
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC  q10, q11, q9, q12, q13
+    FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d4    //output to d4
 
-	vst1.u8	{d3, d4}, [r2], r3		//write 16Byte
+    vst1.u8 {d3, d4}, [r2], r3      //write 16Byte
 
-	vld1.u8	{d3-d5}, [r0], r1		//read 3rd row
-	//the 3rd row
-	pld			[r0]
-	// vertical filtered into q9/q10
-	FILTER_6TAG_8BITS_TO_16BITS 	d6, d9, d12, d15, d0, d3, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d7,d10, d13, d16, d1, d4,q10, q14, q15	// 8 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d6	//output to d6
+    vld1.u8 {d3-d5}, [r0], r1       //read 3rd row
+    //the 3rd row
+    pld         [r0]
+    // vertical filtered into q9/q10
+    FILTER_6TAG_8BITS_TO_16BITS     d6, d9, d12, d15, d0, d3, q9, q14, q15  // 8 avail
+    FILTER_6TAG_8BITS_TO_16BITS     d7,d10, d13, d16, d1, d4,q10, q14, q15  // 8 avail
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC  q9, q10, q11, q12, q13
+    FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d6   //output to d6
 
-	// vertical filtered into q10/q11
-	FILTER_6TAG_8BITS_TO_16BITS 	d8,d11, d14, d17, d2, d5,q11, q14, q15	// only 5 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q10, q11, q9, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d7	//output to d7
-	vst1.u8	{d6, d7}, [r2], r3		//write 16Byte
+    // vertical filtered into q10/q11
+    FILTER_6TAG_8BITS_TO_16BITS     d8,d11, d14, d17, d2, d5,q11, q14, q15  // only 5 avail
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC  q10, q11, q9, q12, q13
+    FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d7    //output to d7
+    vst1.u8 {d6, d7}, [r2], r3      //write 16Byte
 
-	vld1.u8	{d6-d8}, [r0], r1		//read 4th row
-	//the 4th row
-	pld			[r0]
-	// vertical filtered into q9/q10
-	FILTER_6TAG_8BITS_TO_16BITS 	 d9, d12, d15, d0, d3, d6, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS		d10, d13, d16, d1, d4, d7,q10, q14, q15	// 8 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d9	//output to d9
-	// vertical filtered into q10/q11
-	FILTER_6TAG_8BITS_TO_16BITS 	d11, d14, d17, d2, d5, d8,q11, q14, q15	// only 5 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q10, q11, q9, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d10	//output to d10
-	vst1.u8	{d9, d10}, [r2], r3		//write 16Byte
+    vld1.u8 {d6-d8}, [r0], r1       //read 4th row
+    //the 4th row
+    pld         [r0]
+    // vertical filtered into q9/q10
+    FILTER_6TAG_8BITS_TO_16BITS      d9, d12, d15, d0, d3, d6, q9, q14, q15 // 8 avail
+    FILTER_6TAG_8BITS_TO_16BITS     d10, d13, d16, d1, d4, d7,q10, q14, q15 // 8 avail
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC  q9, q10, q11, q12, q13
+    FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d9   //output to d9
+    // vertical filtered into q10/q11
+    FILTER_6TAG_8BITS_TO_16BITS     d11, d14, d17, d2, d5, d8,q11, q14, q15 // only 5 avail
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC  q10, q11, q9, q12, q13
+    FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d10   //output to d10
+    vst1.u8 {d9, d10}, [r2], r3     //write 16Byte
 
-	//d12~d17(q6~q8), d0~d8(q0~q3+d8), --> d0~d14
-	vswp	q0, q6
-	vswp	q6, q3
-	vmov	q5, q2
-	vmov	q2, q8
+    //d12~d17(q6~q8), d0~d8(q0~q3+d8), --> d0~d14
+    vswp    q0, q6
+    vswp    q6, q3
+    vmov    q5, q2
+    vmov    q2, q8
 
-	vmov	d20,d8
-	vmov	q4, q1
-	vmov	q1, q7
-	vmov	d14,d20
+    vmov    d20,d8
+    vmov    q4, q1
+    vmov    q1, q7
+    vmov    d14,d20
 
-	sub		r4, #4
-	cmp		r4, #0
-	bne		w16_hv_mc_luma_loop
-	vpop		{q4-q7}
-	pop		{r4}
+    sub     r4, #4
+    cmp     r4, #0
+    bne     w16_hv_mc_luma_loop
+    vpop        {q4-q7}
+    pop     {r4}
 WELS_ASM_FUNC_END
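
The McHorVer22 routines compute the centre (2,2) half-pel position in two stages: FILTER_6TAG_8BITS_TO_16BITS keeps the vertical 6-tap sums at 16-bit precision, and the UNPACK_2_16BITS_TO_ABC / FILTER_3_IN_16BITS_TO_8BITS pair then runs the horizontal 6-tap across those intermediates and applies the final rounding. A scalar sketch of the arithmetic this is assumed to implement, using the standard (sum + 512) >> 10 rounding for this position; it recomputes the vertical intermediates per pixel for clarity, which the SIMD code of course does not:

    #include <stdint.h>

    static uint8_t Clip255(int v) { return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v); }

    /* Centre (2,2) half-pel MC: vertical 6-tap kept unrounded, then a
     * horizontal 6-tap over those intermediates, rounded once at the end. */
    void McHorVer22_ref(const uint8_t* src, int src_stride,
                        uint8_t* dst, int dst_stride, int width, int height) {
        for (int y = 0; y < height; y++) {
            for (int x = 0; x < width; x++) {
                int v[6];   /* vertical sums for columns x-2 .. x+3 */
                for (int k = 0; k < 6; k++) {
                    const uint8_t* p = src + y * src_stride + x + (k - 2);
                    v[k] = p[-2 * src_stride] - 5 * p[-src_stride] + 20 * p[0]
                         + 20 * p[src_stride] - 5 * p[2 * src_stride] + p[3 * src_stride];
                }
                int s = v[0] - 5 * v[1] + 20 * v[2] + 20 * v[3] - 5 * v[4] + v[5];
                dst[y * dst_stride + x] = Clip255((s + 512) >> 10);
            }
        }
    }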
 
 
 WELS_ASM_FUNC_BEGIN McHorVer22WidthEq8_neon
-	push		{r4}
-	vpush		{q4}
-	ldr			r4, [sp, #20]
+    push        {r4}
+    vpush       {q4}
+    ldr         r4, [sp, #20]
 
-	sub			r0, #2				//src[-2]
-	sub			r0, r0, r1, lsl #1	//src[-2*src_stride-2]
-	pld			[r0]
-	pld			[r0, r1]
+    sub         r0, #2              //src[-2]
+    sub         r0, r0, r1, lsl #1  //src[-2*src_stride-2]
+    pld         [r0]
+    pld         [r0, r1]
 
-	vmov.u16	q14, #0x0014		// 20
-	vld1.u8	{q0}, [r0], r1	//use 13(8+5), =src[-2]
-	vld1.u8	{q1}, [r0], r1	//use 13(8+5), =src[-1]
+    vmov.u16    q14, #0x0014        // 20
+    vld1.u8 {q0}, [r0], r1  //use 13(8+5), =src[-2]
+    vld1.u8 {q1}, [r0], r1  //use 13(8+5), =src[-1]
 
-	pld			[r0]
-	pld			[r0, r1]
-	vshr.u16	q15, q14, #2		// 5
+    pld         [r0]
+    pld         [r0, r1]
+    vshr.u16    q15, q14, #2        // 5
 
-	vld1.u8	{q2}, [r0], r1	//use 13(8+5), =src[0]
-	vld1.u8	{q3}, [r0], r1	//use 13(8+5), =src[1]
-	pld			[r0]
-	pld			[r0, r1]
-	vld1.u8	{q4}, [r0], r1	//use 13(8+5), =src[2]
+    vld1.u8 {q2}, [r0], r1  //use 13(8+5), =src[0]
+    vld1.u8 {q3}, [r0], r1  //use 13(8+5), =src[1]
+    pld         [r0]
+    pld         [r0, r1]
+    vld1.u8 {q4}, [r0], r1  //use 13(8+5), =src[2]
 
 w8_hv_mc_luma_loop:
 
-	vld1.u8	{q8}, [r0], r1	//use 13(8+5), =src[3]
-	//the 1st row
-	pld			[r0]
-	// vertical filtered into q9/q10
-	FILTER_6TAG_8BITS_TO_16BITS 	d0, d2, d4, d6, d8, d16, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d1, d3, d5, d7, d9, d17, q10, q14, q15	// 5 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18	//output to q9[0]
-	vst1.u8	d18, [r2], r3			//write 8Byte
+    vld1.u8 {q8}, [r0], r1  //use 13(8+5), =src[3]
+    //the 1st row
+    pld         [r0]
+    // vertical filtered into q9/q10
+    FILTER_6TAG_8BITS_TO_16BITS     d0, d2, d4, d6, d8, d16, q9, q14, q15   // 8 avail
+    FILTER_6TAG_8BITS_TO_16BITS     d1, d3, d5, d7, d9, d17, q10, q14, q15  // 5 avail
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC  q9, q10, q11, q12, q13
+    FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18  //output to q9[0]
+    vst1.u8 d18, [r2], r3           //write 8Byte
 
-	vld1.u8	{q0}, [r0], r1		//read 2nd row
-	//the 2nd row
-	pld			[r0]
-	// vertical filtered into q9/q10
-	FILTER_6TAG_8BITS_TO_16BITS 	d2, d4, d6, d8, d16, d0, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d3, d5, d7, d9, d17, d1, q10, q14, q15	// 5 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18	//output to q9[0]
-	vst1.u8	d18, [r2], r3		//write 8Byte
+    vld1.u8 {q0}, [r0], r1      //read 2nd row
+    //the 2nd row
+    pld         [r0]
+    // vertical filtered into q9/q10
+    FILTER_6TAG_8BITS_TO_16BITS     d2, d4, d6, d8, d16, d0, q9, q14, q15   // 8 avail
+    FILTER_6TAG_8BITS_TO_16BITS     d3, d5, d7, d9, d17, d1, q10, q14, q15  // 5 avail
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC  q9, q10, q11, q12, q13
+    FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18  //output to q9[0]
+    vst1.u8 d18, [r2], r3       //write 8Byte
 
-	vld1.u8	{q1}, [r0], r1		//read 3rd row
-	//the 3rd row
-	pld			[r0]
-	// vertical filtered into q9/q10
-	FILTER_6TAG_8BITS_TO_16BITS 	d4, d6, d8, d16, d0, d2, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d5, d7, d9, d17, d1, d3, q10, q14, q15	// 5 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18	//output to q9[0]
-	vst1.u8	d18, [r2], r3			//write 8Byte
+    vld1.u8 {q1}, [r0], r1      //read 3rd row
+    //the 3rd row
+    pld         [r0]
+    // vertical filtered into q9/q10
+    FILTER_6TAG_8BITS_TO_16BITS     d4, d6, d8, d16, d0, d2, q9, q14, q15   // 8 avail
+    FILTER_6TAG_8BITS_TO_16BITS     d5, d7, d9, d17, d1, d3, q10, q14, q15  // 5 avail
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC  q9, q10, q11, q12, q13
+    FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18  //output to q9[0]
+    vst1.u8 d18, [r2], r3           //write 8Byte
 
-	vld1.u8	{q2}, [r0], r1		//read 4th row
-	//the 4th row
-	pld			[r0]
-	// vertical filtered into q9/q10
-	FILTER_6TAG_8BITS_TO_16BITS 	d6, d8, d16, d0, d2, d4, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d7, d9, d17, d1, d3, d5, q10, q14, q15	// 5 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18	//output to q9[0]
-	vst1.u8	d18, [r2], r3			//write 8Byte
+    vld1.u8 {q2}, [r0], r1      //read 4th row
+    //the 4th row
+    pld         [r0]
+    // vertical filtered into q9/q10
+    FILTER_6TAG_8BITS_TO_16BITS     d6, d8, d16, d0, d2, d4, q9, q14, q15   // 8 avail
+    FILTER_6TAG_8BITS_TO_16BITS     d7, d9, d17, d1, d3, d5, q10, q14, q15  // 5 avail
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC  q9, q10, q11, q12, q13
+    FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18  //output to q9[0]
+    vst1.u8 d18, [r2], r3           //write 8Byte
 
-	//q4~q5, q0~q2, --> q0~q4
-	vswp	q0, q4
-	vswp	q2, q4
-	vmov	q3, q1
-	vmov	q1, q8
+    //q4~q5, q0~q2, --> q0~q4
+    vswp    q0, q4
+    vswp    q2, q4
+    vmov    q3, q1
+    vmov    q1, q8
 
-	sub		r4, #4
-	cmp		r4, #0
-	bne		w8_hv_mc_luma_loop
-	vpop		{q4}
-	pop		{r4}
+    sub     r4, #4
+    cmp     r4, #0
+    bne     w8_hv_mc_luma_loop
+    vpop        {q4}
+    pop     {r4}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN McHorVer22WidthEq4_neon
-	push		{r4 ,r5, r6}
-	vpush		{q4-q7}
-	ldr			r6, [sp, #76]
+    push        {r4 ,r5, r6}
+    vpush       {q4-q7}
+    ldr         r6, [sp, #76]
 
-	sub			r0, #2				//src[-2]
-	sub			r0, r0, r1, lsl #1	//src[-2*src_stride-2]
-	pld			[r0]
-	pld			[r0, r1]
+    sub         r0, #2              //src[-2]
+    sub         r0, r0, r1, lsl #1  //src[-2*src_stride-2]
+    pld         [r0]
+    pld         [r0, r1]
 
-	vmov.u16	q14, #0x0014		// 20
-	vld1.u8	{q0}, [r0], r1	//use 9(4+5), =src[-2]
-	vld1.u8	{q1}, [r0], r1	//use 9(4+5), =src[-1]
+    vmov.u16    q14, #0x0014        // 20
+    vld1.u8 {q0}, [r0], r1  //use 9(4+5), =src[-2]
+    vld1.u8 {q1}, [r0], r1  //use 9(4+5), =src[-1]
 
-	pld			[r0]
-	pld			[r0, r1]
-	vshr.u16	q15, q14, #2		// 5
+    pld         [r0]
+    pld         [r0, r1]
+    vshr.u16    q15, q14, #2        // 5
 
-	vld1.u8	{q2}, [r0], r1	//use 9(4+5), =src[0]
-	vld1.u8	{q3}, [r0], r1	//use 9(4+5), =src[1]
-	pld			[r0]
-	pld			[r0, r1]
-	vld1.u8	{q4}, [r0], r1	//use 9(4+5), =src[2]
+    vld1.u8 {q2}, [r0], r1  //use 9(4+5), =src[0]
+    vld1.u8 {q3}, [r0], r1  //use 9(4+5), =src[1]
+    pld         [r0]
+    pld         [r0, r1]
+    vld1.u8 {q4}, [r0], r1  //use 9(4+5), =src[2]
 
 w4_hv_mc_luma_loop:
 
-	vld1.u8	{q5}, [r0], r1	//use 9(4+5), =src[3]
-	vld1.u8	{q6}, [r0], r1	//use 9(4+5), =src[4]
+    vld1.u8 {q5}, [r0], r1  //use 9(4+5), =src[3]
+    vld1.u8 {q6}, [r0], r1  //use 9(4+5), =src[4]
 
-	//the 1st&2nd row
-	pld			[r0]
-	pld			[r0, r1]
-	// vertical filtered
-	FILTER_6TAG_8BITS_TO_16BITS 	d0, d2, d4, d6, d8, d10, q7, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d1, d3, d5, d7, d9, d11, q8, q14, q15	// 1 avail
+    //the 1st&2nd row
+    pld         [r0]
+    pld         [r0, r1]
+    // vertical filtered
+    FILTER_6TAG_8BITS_TO_16BITS     d0, d2, d4, d6, d8, d10, q7, q14, q15   // 8 avail
+    FILTER_6TAG_8BITS_TO_16BITS     d1, d3, d5, d7, d9, d11, q8, q14, q15   // 1 avail
 
-	FILTER_6TAG_8BITS_TO_16BITS 	d2, d4, d6, d8,d10, d12, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d3, d5, d7, d9,d11, d13,q10, q14, q15	// 1 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q7, q8, q11, q12, q13	//4 avail
-	UNPACK_2_16BITS_TO_ABC	q9,q10, q0, q7, q8		//4 avail
+    FILTER_6TAG_8BITS_TO_16BITS     d2, d4, d6, d8,d10, d12, q9, q14, q15   // 8 avail
+    FILTER_6TAG_8BITS_TO_16BITS     d3, d5, d7, d9,d11, d13,q10, q14, q15   // 1 avail
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC  q7, q8, q11, q12, q13   //4 avail
+    UNPACK_2_16BITS_TO_ABC  q9,q10, q0, q7, q8      //4 avail
 
-	vmov	d23, d0
-	vmov	d25, d14
-	vmov	d27, d16
+    vmov    d23, d0
+    vmov    d25, d14
+    vmov    d27, d16
 
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d22	//output to q11[0]
-	vmov		r4 ,r5, d22
-	str		r4, [r2], r3				//write 4Byte
-	str		r5, [r2], r3				//write 4Byte
+    FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d22  //output to q11[0]
+    vmov        r4 ,r5, d22
+    str     r4, [r2], r3                //write 4Byte
+    str     r5, [r2], r3                //write 4Byte
 
-	//the 3rd&4th row
-	vld1.u8	{q0}, [r0], r1	//use 9(4+5), =src[3]
-	vld1.u8	{q1}, [r0], r1	//use 9(4+5), =src[4]
-	pld			[r0]
-	pld			[r0, r1]
-	// vertical filtered
-	FILTER_6TAG_8BITS_TO_16BITS 	d4, d6, d8, d10, d12, d0, q7, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d5, d7, d9, d11, d13, d1, q8, q14, q15	// 1 avail
+    //the 3rd&4th row
+    vld1.u8 {q0}, [r0], r1  //use 9(4+5), =src[3]
+    vld1.u8 {q1}, [r0], r1  //use 9(4+5), =src[4]
+    pld         [r0]
+    pld         [r0, r1]
+    // vertical filtered
+    FILTER_6TAG_8BITS_TO_16BITS     d4, d6, d8, d10, d12, d0, q7, q14, q15  // 8 avail
+    FILTER_6TAG_8BITS_TO_16BITS     d5, d7, d9, d11, d13, d1, q8, q14, q15  // 1 avail
 
-	FILTER_6TAG_8BITS_TO_16BITS 	d6, d8,d10, d12, d0, d2, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d7, d9,d11, d13, d1, d3,q10, q14, q15	// 1 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q7, q8, q11, q12, q13	//4 avail
-	UNPACK_2_16BITS_TO_ABC	q9,q10, q2, q7, q8		//4 avail
+    FILTER_6TAG_8BITS_TO_16BITS     d6, d8,d10, d12, d0, d2, q9, q14, q15   // 8 avail
+    FILTER_6TAG_8BITS_TO_16BITS     d7, d9,d11, d13, d1, d3,q10, q14, q15   // 1 avail
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC  q7, q8, q11, q12, q13   //4 avail
+    UNPACK_2_16BITS_TO_ABC  q9,q10, q2, q7, q8      //4 avail
 
-	vmov	d23, d4
-	vmov	d25, d14
-	vmov	d27, d16
+    vmov    d23, d4
+    vmov    d25, d14
+    vmov    d27, d16
 
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d22	//output to q11[0]
-	vmov		r4 ,r5, d22
-	str		r4, [r2], r3				//write 4Byte
-	str		r5, [r2], r3				//write 4Byte
+    FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d22  //output to q11[0]
+    vmov        r4 ,r5, d22
+    str     r4, [r2], r3                //write 4Byte
+    str     r5, [r2], r3                //write 4Byte
 
-	//q4~q6, q0~q1, --> q0~q4
-	vswp	q4, q0
-	vmov	q3, q4
-	vmov	q4, q1
-	vmov	q1, q5
-	vmov	q2, q6
+    //q4~q6, q0~q1, --> q0~q4
+    vswp    q4, q0
+    vmov    q3, q4
+    vmov    q4, q1
+    vmov    q1, q5
+    vmov    q2, q6
 
-	sub		r6, #4
-	cmp		r6, #0
-	bne		w4_hv_mc_luma_loop
+    sub     r6, #4
+    cmp     r6, #0
+    bne     w4_hv_mc_luma_loop
 
-	vpop		{q4-q7}
-	pop		{r4, r5, r6}
+    vpop        {q4-q7}
+    pop     {r4, r5, r6}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN McCopyWidthEq16_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
+    push        {r4}
+    ldr         r4, [sp, #4]
 w16_copy_loop:
-	vld1.u8		{q0}, [r0], r1
-	sub			r4, #2
-	vld1.u8		{q1}, [r0], r1
-	vst1.u8		{q0}, [r2], r3
-	cmp			r4, #0
-	vst1.u8		{q1}, [r2], r3
-	bne			w16_copy_loop
+    vld1.u8     {q0}, [r0], r1
+    sub         r4, #2
+    vld1.u8     {q1}, [r0], r1
+    vst1.u8     {q0}, [r2], r3
+    cmp         r4, #0
+    vst1.u8     {q1}, [r2], r3
+    bne         w16_copy_loop
 
-	pop		{r4}
+    pop     {r4}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN McCopyWidthEq8_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
+    push        {r4}
+    ldr         r4, [sp, #4]
 w8_copy_loop:
-	vld1.u8		{d0}, [r0], r1
-	vld1.u8		{d1}, [r0], r1
-	vst1.u8		{d0}, [r2], r3
-	vst1.u8		{d1}, [r2], r3
-	sub			r4, #2
-	cmp			r4, #0
-	bne			w8_copy_loop
+    vld1.u8     {d0}, [r0], r1
+    vld1.u8     {d1}, [r0], r1
+    vst1.u8     {d0}, [r2], r3
+    vst1.u8     {d1}, [r2], r3
+    sub         r4, #2
+    cmp         r4, #0
+    bne         w8_copy_loop
 
-	pop		{r4}
+    pop     {r4}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN McCopyWidthEq4_neon
-	push		{r4, r5, r6}
-	ldr			r4, [sp, #12]
+    push        {r4, r5, r6}
+    ldr         r4, [sp, #12]
 w4_copy_loop:
-	ldr		r5, [r0], r1
-	ldr		r6, [r0], r1
-	str		r5, [r2], r3
-	str		r6, [r2], r3
+    ldr     r5, [r0], r1
+    ldr     r6, [r0], r1
+    str     r5, [r2], r3
+    str     r6, [r2], r3
 
-	sub			r4, #2
-	cmp			r4, #0
-	bne			w4_copy_loop
+    sub         r4, #2
+    cmp         r4, #0
+    bne         w4_copy_loop
 
-	pop		{r4, r5, r6}
+    pop     {r4, r5, r6}
 WELS_ASM_FUNC_END
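
The three McCopyWidthEqN routines above are straight strided block copies: 16 and 8 bytes per row through NEON registers, 4 bytes per row through plain ldr/str, two rows per loop iteration in each case. The scalar equivalent is nothing more than a per-row memcpy:

    #include <stdint.h>
    #include <string.h>

    /* Strided block copy; width would be 16, 8 or 4 for the routines above. */
    void McCopy_ref(const uint8_t* src, int src_stride,
                    uint8_t* dst, int dst_stride, int width, int height) {
        for (int y = 0; y < height; y++) {
            memcpy(dst, src, (size_t)width);
            src += src_stride;
            dst += dst_stride;
        }
    }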
 
 
 WELS_ASM_FUNC_BEGIN PixelAvgWidthEq16_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
+    push        {r4}
+    ldr         r4, [sp, #4]
 w16_pix_avg_loop:
-	vld1.u8		{q0}, [r2]!
-	vld1.u8		{q1}, [r3]!
-	vld1.u8		{q2}, [r2]!
-	vld1.u8		{q3}, [r3]!
+    vld1.u8     {q0}, [r2]!
+    vld1.u8     {q1}, [r3]!
+    vld1.u8     {q2}, [r2]!
+    vld1.u8     {q3}, [r3]!
 
-	vld1.u8		{q8}, [r2]!
-	vld1.u8		{q9}, [r3]!
-	vld1.u8		{q10}, [r2]!
-	vld1.u8		{q11}, [r3]!
+    vld1.u8     {q8}, [r2]!
+    vld1.u8     {q9}, [r3]!
+    vld1.u8     {q10}, [r2]!
+    vld1.u8     {q11}, [r3]!
 
-	AVERAGE_TWO_8BITS		d0, d0, d2
-	AVERAGE_TWO_8BITS		d1, d1, d3
-	vst1.u8		{q0}, [r0], r1
+    AVERAGE_TWO_8BITS       d0, d0, d2
+    AVERAGE_TWO_8BITS       d1, d1, d3
+    vst1.u8     {q0}, [r0], r1
 
-	AVERAGE_TWO_8BITS		d4, d4, d6
-	AVERAGE_TWO_8BITS		d5, d5, d7
-	vst1.u8		{q2}, [r0], r1
+    AVERAGE_TWO_8BITS       d4, d4, d6
+    AVERAGE_TWO_8BITS       d5, d5, d7
+    vst1.u8     {q2}, [r0], r1
 
-	AVERAGE_TWO_8BITS		d16, d16, d18
-	AVERAGE_TWO_8BITS		d17, d17, d19
-	vst1.u8		{q8}, [r0], r1
+    AVERAGE_TWO_8BITS       d16, d16, d18
+    AVERAGE_TWO_8BITS       d17, d17, d19
+    vst1.u8     {q8}, [r0], r1
 
-	AVERAGE_TWO_8BITS		d20, d20, d22
-	AVERAGE_TWO_8BITS		d21, d21, d23
-	vst1.u8		{q10}, [r0], r1
+    AVERAGE_TWO_8BITS       d20, d20, d22
+    AVERAGE_TWO_8BITS       d21, d21, d23
+    vst1.u8     {q10}, [r0], r1
 
-	sub			r4, #4
-	cmp			r4, #0
-	bne			w16_pix_avg_loop
+    sub         r4, #4
+    cmp         r4, #0
+    bne         w16_pix_avg_loop
 
-	pop		{r4}
+    pop     {r4}
 WELS_ASM_FUNC_END
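
PixelAvgWidthEq16_neon blends two prediction blocks with AVERAGE_TWO_8BITS, taken here to be a rounding halving add, (a + b + 1) >> 1, as H.264 quarter-pel averaging requires. Both sources are consumed with a fixed 16-byte post-increment per row while the destination advances by the r1 stride; a scalar sketch under those assumptions:

    #include <stdint.h>

    /* Rounding average of two 16-wide blocks stored with a 16-byte row pitch,
     * written out with an arbitrary destination stride. */
    void PixelAvgW16_ref(uint8_t* dst, int dst_stride,
                         const uint8_t* src_a, const uint8_t* src_b, int height) {
        for (int y = 0; y < height; y++) {
            for (int x = 0; x < 16; x++)
                dst[x] = (uint8_t)((src_a[x] + src_b[x] + 1) >> 1);
            dst += dst_stride;
            src_a += 16;
            src_b += 16;
        }
    }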
 
 
 WELS_ASM_FUNC_BEGIN PixelAvgWidthEq8_neon
-	push		{r4, r5}
-	ldr			r4, [sp, #8]
-	mov			r5, #16
+    push        {r4, r5}
+    ldr         r4, [sp, #8]
+    mov         r5, #16
 w8_pix_avg_loop:
 
-	vld1.u8		{d0}, [r2], r5
-	vld1.u8		{d2}, [r3], r5
-	vld1.u8		{d1}, [r2], r5
-	vld1.u8		{d3}, [r3], r5
+    vld1.u8     {d0}, [r2], r5
+    vld1.u8     {d2}, [r3], r5
+    vld1.u8     {d1}, [r2], r5
+    vld1.u8     {d3}, [r3], r5
 
-	AVERAGE_TWO_8BITS		d0, d0, d2
-	AVERAGE_TWO_8BITS		d1, d1, d3
-	vst1.u8		{d0}, [r0], r1
-	vst1.u8		{d1}, [r0], r1
+    AVERAGE_TWO_8BITS       d0, d0, d2
+    AVERAGE_TWO_8BITS       d1, d1, d3
+    vst1.u8     {d0}, [r0], r1
+    vst1.u8     {d1}, [r0], r1
 
-	vld1.u8		{d4}, [r2], r5
-	vld1.u8		{d6}, [r3], r5
-	vld1.u8		{d5}, [r2], r5
-	vld1.u8		{d7}, [r3], r5
+    vld1.u8     {d4}, [r2], r5
+    vld1.u8     {d6}, [r3], r5
+    vld1.u8     {d5}, [r2], r5
+    vld1.u8     {d7}, [r3], r5
 
-	AVERAGE_TWO_8BITS		d4, d4, d6
-	AVERAGE_TWO_8BITS		d5, d5, d7
-	vst1.u8		{d4}, [r0], r1
-	vst1.u8		{d5}, [r0], r1
+    AVERAGE_TWO_8BITS       d4, d4, d6
+    AVERAGE_TWO_8BITS       d5, d5, d7
+    vst1.u8     {d4}, [r0], r1
+    vst1.u8     {d5}, [r0], r1
 
-	sub			r4, #4
-	cmp			r4, #0
-	bne			w8_pix_avg_loop
+    sub         r4, #4
+    cmp         r4, #0
+    bne         w8_pix_avg_loop
 
-	pop		{r4, r5}
+    pop     {r4, r5}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN PixelAvgWidthEq4_neon
-	push		{r4-r8}
-	ldr			r4, [sp, #20]
+    push        {r4-r8}
+    ldr         r4, [sp, #20]
 w4_pix_avg_loop:
 
-	ldr		r5, [r2]
-	ldr		r6, [r2, #16]
-	ldr		r7, [r3]
-	ldr		r8, [r3, #16]
-	add		r2, #32
-	add		r3, #32
+    ldr     r5, [r2]
+    ldr     r6, [r2, #16]
+    ldr     r7, [r3]
+    ldr     r8, [r3, #16]
+    add     r2, #32
+    add     r3, #32
 
-	vmov		d0, r5, r6
-	vmov		d1, r7, r8
-	AVERAGE_TWO_8BITS		d0, d0, d1
-	vmov		r5, r6, d0
+    vmov        d0, r5, r6
+    vmov        d1, r7, r8
+    AVERAGE_TWO_8BITS       d0, d0, d1
+    vmov        r5, r6, d0
 
-	str		r5, [r0], r1
-	str		r6, [r0], r1
+    str     r5, [r0], r1
+    str     r6, [r0], r1
 
-	sub			r4, #2
-	cmp			r4, #0
-	bne			w4_pix_avg_loop
+    sub         r4, #2
+    cmp         r4, #0
+    bne         w4_pix_avg_loop
 
-	pop		{r4-r8}
+    pop     {r4-r8}
 WELS_ASM_FUNC_END
 
 WELS_ASM_FUNC_BEGIN McChromaWidthEq8_neon
-	push		{r4, r5}
-	ldr			r4, [sp, #8]
-	ldr			r5, [sp, #12]
-//	normal case: {cA*src[x]  + cB*src[x+1]} + {cC*src[x+stride] + cD*srcp[x+stride+1]}
-//	we can opti it by adding vert only/ hori only cases, to be continue
-	vld1.u8	{d31}, [r4]		//load A/B/C/D
-	vld1.u8		{q0}, [r0], r1	//src[x]
+    push        {r4, r5}
+    ldr         r4, [sp, #8]
+    ldr         r5, [sp, #12]
+//  normal case: {cA*src[x]  + cB*src[x+1]} + {cC*src[x+stride] + cD*srcp[x+stride+1]}
+//  we can opti it by adding vert only/ hori only cases, to be continue
+    vld1.u8 {d31}, [r4]     //load A/B/C/D
+    vld1.u8     {q0}, [r0], r1  //src[x]
 
-	vdup.u8	d28, d31[0]			//A
-	vdup.u8	d29, d31[1]			//B
-	vdup.u8	d30, d31[2]			//C
-	vdup.u8	d31, d31[3]			//D
+    vdup.u8 d28, d31[0]         //A
+    vdup.u8 d29, d31[1]         //B
+    vdup.u8 d30, d31[2]         //C
+    vdup.u8 d31, d31[3]         //D
 
-	vext.u8		d1, d0, d1, #1		//src[x+1]
+    vext.u8     d1, d0, d1, #1      //src[x+1]
 
-w8_mc_chroma_loop:	// each two pxl row
-	vld1.u8		{q1}, [r0], r1	//src[x+stride]
-	vld1.u8		{q2}, [r0], r1	//src[x+2*stride]
-	vext.u8		d3, d2, d3, #1		//src[x+stride+1]
-	vext.u8		d5, d4, d5, #1		//src[x+2*stride+1]
+w8_mc_chroma_loop:  // each two pxl row
+    vld1.u8     {q1}, [r0], r1  //src[x+stride]
+    vld1.u8     {q2}, [r0], r1  //src[x+2*stride]
+    vext.u8     d3, d2, d3, #1      //src[x+stride+1]
+    vext.u8     d5, d4, d5, #1      //src[x+2*stride+1]
 
-	vmull.u8		q3, d0, d28			//(src[x] * A)
-	vmlal.u8		q3, d1, d29			//+=(src[x+1] * B)
-	vmlal.u8		q3, d2, d30			//+=(src[x+stride] * C)
-	vmlal.u8		q3, d3, d31			//+=(src[x+stride+1] * D)
-	vrshrn.u16		d6, q3, #6
-	vst1.u8	d6, [r2], r3
+    vmull.u8        q3, d0, d28         //(src[x] * A)
+    vmlal.u8        q3, d1, d29         //+=(src[x+1] * B)
+    vmlal.u8        q3, d2, d30         //+=(src[x+stride] * C)
+    vmlal.u8        q3, d3, d31         //+=(src[x+stride+1] * D)
+    vrshrn.u16      d6, q3, #6
+    vst1.u8 d6, [r2], r3
 
-	vmull.u8		q3, d2, d28			//(src[x] * A)
-	vmlal.u8		q3, d3, d29			//+=(src[x+1] * B)
-	vmlal.u8		q3, d4, d30			//+=(src[x+stride] * C)
-	vmlal.u8		q3, d5, d31			//+=(src[x+stride+1] * D)
-	vrshrn.u16		d6, q3, #6
-	vst1.u8	d6, [r2], r3
+    vmull.u8        q3, d2, d28         //(src[x] * A)
+    vmlal.u8        q3, d3, d29         //+=(src[x+1] * B)
+    vmlal.u8        q3, d4, d30         //+=(src[x+stride] * C)
+    vmlal.u8        q3, d5, d31         //+=(src[x+stride+1] * D)
+    vrshrn.u16      d6, q3, #6
+    vst1.u8 d6, [r2], r3
 
-	vmov		q0, q2
-	sub			r5, #2
-	cmp			r5, #0
-	bne			w8_mc_chroma_loop
+    vmov        q0, q2
+    sub         r5, #2
+    cmp         r5, #0
+    bne         w8_mc_chroma_loop
 
-	pop		{r4, r5}
+    pop     {r4, r5}
 WELS_ASM_FUNC_END
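
McChromaWidthEq8_neon follows the formula in its own comment, out = cA*src[x] + cB*src[x+1] + cC*src[x+stride] + cD*src[x+stride+1], with vrshrn.u16 #6 supplying the (+32) >> 6 rounding. A scalar reference with the four weights passed directly (the assembly reads them as a 4-byte table through r4; in H.264 they would be (8-dx)(8-dy), dx(8-dy), (8-dx)dy and dx*dy for fractional offsets dx, dy -- an assumption about the caller, not something shown in this file):

    #include <stdint.h>

    /* Bilinear chroma MC for an 8-wide block; A+B+C+D is expected to be 64,
     * so the rounded sum always fits back into a byte. */
    void McChromaW8_ref(const uint8_t* src, int src_stride,
                        uint8_t* dst, int dst_stride,
                        int A, int B, int C, int D, int height) {
        for (int y = 0; y < height; y++) {
            for (int x = 0; x < 8; x++) {
                int s = A * src[x] + B * src[x + 1]
                      + C * src[x + src_stride] + D * src[x + src_stride + 1];
                dst[x] = (uint8_t)((s + 32) >> 6);
            }
            src += src_stride;
            dst += dst_stride;
        }
    }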
 
 
 WELS_ASM_FUNC_BEGIN McChromaWidthEq4_neon
 
-	push		{r4, r5, r6}
-	ldr			r4, [sp, #12]
-	ldr			r6, [sp, #16]
-//	normal case: {cA*src[x]  + cB*src[x+1]} + {cC*src[x+stride] + cD*srcp[x+stride+1]}
-//	we can opti it by adding vert only/ hori only cases, to be continue
-	vld1.u8	{d31}, [r4]		//load A/B/C/D
+    push        {r4, r5, r6}
+    ldr         r4, [sp, #12]
+    ldr         r6, [sp, #16]
+//  normal case: {cA*src[x]  + cB*src[x+1]} + {cC*src[x+stride] + cD*srcp[x+stride+1]}
+//  we can opti it by adding vert only/ hori only cases, to be continue
+    vld1.u8 {d31}, [r4]     //load A/B/C/D
 
-	vdup.u8	d28, d31[0]			//A
-	vdup.u8	d29, d31[1]			//B
-	vdup.u8	d30, d31[2]			//C
-	vdup.u8	d31, d31[3]			//D
+    vdup.u8 d28, d31[0]         //A
+    vdup.u8 d29, d31[1]         //B
+    vdup.u8 d30, d31[2]         //C
+    vdup.u8 d31, d31[3]         //D
 
-w4_mc_chroma_loop:	// each two pxl row
-	vld1.u8		{d0}, [r0], r1	//a::src[x]
-	vld1.u8		{d2}, [r0], r1	//b::src[x+stride]
-	vld1.u8		{d4}, [r0]			//c::src[x+2*stride]
+w4_mc_chroma_loop:  // each two pxl row
+    vld1.u8     {d0}, [r0], r1  //a::src[x]
+    vld1.u8     {d2}, [r0], r1  //b::src[x+stride]
+    vld1.u8     {d4}, [r0]          //c::src[x+2*stride]
 
-	vshr.u64		d1, d0, #8
-	vshr.u64		d3, d2, #8
-	vshr.u64		d5, d4, #8
+    vshr.u64        d1, d0, #8
+    vshr.u64        d3, d2, #8
+    vshr.u64        d5, d4, #8
 
-	vmov			q3, q1				//b::[0:7]+b::[1~8]
-	vtrn.32		q0, q1				//d0{a::[0:3]+b::[0:3]}; d1{a::[1:4]+b::[1:4]}
-	vtrn.32		q3, q2				//d6{b::[0:3]+c::[0:3]}; d7{b::[1:4]+c::[1:4]}
+    vmov            q3, q1              //b::[0:7]+b::[1~8]
+    vtrn.32     q0, q1              //d0{a::[0:3]+b::[0:3]}; d1{a::[1:4]+b::[1:4]}
+    vtrn.32     q3, q2              //d6{b::[0:3]+c::[0:3]}; d7{b::[1:4]+c::[1:4]}
 
-	vmull.u8		q1, d0, d28			//(src[x] * A)
-	vmlal.u8		q1, d1, d29			//+=(src[x+1] * B)
-	vmlal.u8		q1, d6, d30			//+=(src[x+stride] * C)
-	vmlal.u8		q1, d7, d31			//+=(src[x+stride+1] * D)
+    vmull.u8        q1, d0, d28         //(src[x] * A)
+    vmlal.u8        q1, d1, d29         //+=(src[x+1] * B)
+    vmlal.u8        q1, d6, d30         //+=(src[x+stride] * C)
+    vmlal.u8        q1, d7, d31         //+=(src[x+stride+1] * D)
 
-	vrshrn.u16		d2, q1, #6
-	vmov		r4, r5, d2
-	str	r4, [r2], r3
-	str	r5, [r2], r3
+    vrshrn.u16      d2, q1, #6
+    vmov        r4, r5, d2
+    str r4, [r2], r3
+    str r5, [r2], r3
 
-	sub			r6, #2
-	cmp			r6, #0
-	bne			w4_mc_chroma_loop
+    sub         r6, #2
+    cmp         r6, #0
+    bne         w4_mc_chroma_loop
 
-	pop		{r4, r5, r6}
+    pop     {r4, r5, r6}
 WELS_ASM_FUNC_END
 
 WELS_ASM_FUNC_BEGIN McHorVer20Width17_neon
-	push		{r4-r5}
-	mov			r4, #20
-	mov			r5, #1
-	sub			r4, r4, r4, lsl #(16-2)
-	lsl			r5, #16
-	ror			r4, #16
-	vmov		d3, r5, r4					// 0x0014FFFB00010000
+    push        {r4-r5}
+    mov         r4, #20
+    mov         r5, #1
+    sub         r4, r4, r4, lsl #(16-2)
+    lsl         r5, #16
+    ror         r4, #16
+    vmov        d3, r5, r4                  // 0x0014FFFB00010000
 
-	sub			r3, #16
-	ldr			r4, [sp, #8]
+    sub         r3, #16
+    ldr         r4, [sp, #8]
 
-	sub			r0, #2
-	vmov.u16	q14, #0x0014				// 20
-	vshr.u16	q15, q14, #2				// 5
+    sub         r0, #2
+    vmov.u16    q14, #0x0014                // 20
+    vshr.u16    q15, q14, #2                // 5
 
 w17_h_mc_luma_loop:
-	vld1.u8	{d0,d1,d2}, [r0], r1	//only use 22(17+5); q0=src[-2]
+    vld1.u8 {d0,d1,d2}, [r0], r1    //only use 22(17+5); q0=src[-2]
 
-	vext.8		q2, q0, q1, #1		//q2=src[-1]
-	vext.8		q3, q0, q1, #2		//q3=src[0]
-	vext.8		q8, q0, q1, #3		//q8=src[1]
-	vext.8		q9, q0, q1, #4		//q9=src[2]
-	vext.8		q10, q0, q1, #5		//q10=src[3]
+    vext.8      q2, q0, q1, #1      //q2=src[-1]
+    vext.8      q3, q0, q1, #2      //q3=src[0]
+    vext.8      q8, q0, q1, #3      //q8=src[1]
+    vext.8      q9, q0, q1, #4      //q9=src[2]
+    vext.8      q10, q0, q1, #5     //q10=src[3]
 
-	FILTER_6TAG_8BITS 	d0, d4, d6, d16, d18, d20, d22, q14, q15
+    FILTER_6TAG_8BITS   d0, d4, d6, d16, d18, d20, d22, q14, q15
 
-	FILTER_6TAG_8BITS 	d1, d5, d7, d17, d19, d21, d23, q14, q15
+    FILTER_6TAG_8BITS   d1, d5, d7, d17, d19, d21, d23, q14, q15
 
-	vst1.u8	{d22, d23}, [r2]!		//write [0:15] Byte
+    vst1.u8 {d22, d23}, [r2]!       //write [0:15] Byte
 
-	vsli.64	d2, d2, #8				// [0][1][2][3][4][5]XO-->O[0][1][2][3][4][5]X
-	FILTER_SINGLE_TAG_8BITS	d2, d3, d22, q11, q1
+    vsli.64 d2, d2, #8              // [0][1][2][3][4][5]XO-->O[0][1][2][3][4][5]X
+    FILTER_SINGLE_TAG_8BITS d2, d3, d22, q11, q1
 
-	vst1.u8	{d2[0]}, [r2], r3		//write 16th Byte
+    vst1.u8 {d2[0]}, [r2], r3       //write 16th Byte
 
-	sub		r4, #1
-	cmp		r4, #0
-	bne		w17_h_mc_luma_loop
-	pop		{r4-r5}
+    sub     r4, #1
+    cmp     r4, #0
+    bne     w17_h_mc_luma_loop
+    pop     {r4-r5}
 WELS_ASM_FUNC_END
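
McHorVer20Width17_neon filters a 17-column block horizontally: columns 0..15 go through the usual vext/FILTER_6TAG_8BITS path, and the 17th column is finished by FILTER_SINGLE_TAG_8BITS with the packed constant 0x0014FFFB00010000 (16-bit lanes 0, 1, -5, 20, read straight off the immediate). Exactly how that macro combines the lanes is not visible here, so the sketch below simply runs the scalar 6-tap over every column; names and rounding are assumptions:

    #include <stdint.h>

    static uint8_t Clip255(int v) { return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v); }

    /* Horizontal half-pel filter over a 17-wide block (a 16-wide block plus
     * one extra column, presumably for later quarter-pel averaging). */
    void McHor20W17_ref(const uint8_t* src, int src_stride,
                        uint8_t* dst, int dst_stride, int height) {
        for (int y = 0; y < height; y++) {
            for (int x = 0; x < 17; x++) {
                int s = src[x - 2] - 5 * src[x - 1] + 20 * src[x]
                      + 20 * src[x + 1] - 5 * src[x + 2] + src[x + 3];
                dst[x] = Clip255((s + 16) >> 5);
            }
            src += src_stride;
            dst += dst_stride;
        }
    }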
 
 
 WELS_ASM_FUNC_BEGIN McHorVer20Width9_neon
-	push		{r4-r5}
-	mov			r4, #20
-	mov			r5, #1
-	sub			r4, r4, r4, lsl #(16-2)
-	lsl			r5, #16
-	ror			r4, #16
-	vmov		d7, r5, r4					// 0x0014FFFB00010000
+    push        {r4-r5}
+    mov         r4, #20
+    mov         r5, #1
+    sub         r4, r4, r4, lsl #(16-2)
+    lsl         r5, #16
+    ror         r4, #16
+    vmov        d7, r5, r4                  // 0x0014FFFB00010000
 
-	sub			r3, #8
-	ldr			r4, [sp, #8]
+    sub         r3, #8
+    ldr         r4, [sp, #8]
 
-	sub			r0, #2
-	vmov.u16	q14, #0x0014				// 20
-	vshr.u16	q15, q14, #2				// 5
+    sub         r0, #2
+    vmov.u16    q14, #0x0014                // 20
+    vshr.u16    q15, q14, #2                // 5
 
 w9_h_mc_luma_loop:
-	vld1.u8	{d0,d1}, [r0], r1	//only use 14(9+5); q0=src[-2]
-	pld			[r0]
+    vld1.u8 {d0,d1}, [r0], r1   //only use 14(9+5); q0=src[-2]
+    pld         [r0]
 
-	vext.8		d2, d0, d1, #1		//d2=src[-1]
-	vext.8		d3, d0, d1, #2		//d3=src[0]
-	vext.8		d4, d0, d1, #3		//d4=src[1]
-	vext.8		d5, d0, d1, #4		//d5=src[2]
-	vext.8		d6, d0, d1, #5		//d6=src[3]
+    vext.8      d2, d0, d1, #1      //d2=src[-1]
+    vext.8      d3, d0, d1, #2      //d3=src[0]
+    vext.8      d4, d0, d1, #3      //d4=src[1]
+    vext.8      d5, d0, d1, #4      //d5=src[2]
+    vext.8      d6, d0, d1, #5      //d6=src[3]
 
-	FILTER_6TAG_8BITS 	d0, d2, d3, d4, d5, d6, d16, q14, q15
+    FILTER_6TAG_8BITS   d0, d2, d3, d4, d5, d6, d16, q14, q15
 
-	sub		r4, #1
-	vst1.u8	{d16}, [r2]!		//write [0:7] Byte
+    sub     r4, #1
+    vst1.u8 {d16}, [r2]!        //write [0:7] Byte
 
-	vsli.64	d2, d1, #8				// [0][1][2][3][4][5]XO-->O[0][1][2][3][4][5]X
-	FILTER_SINGLE_TAG_8BITS	d2, d7, d18, q9, q1
-	vst1.u8	{d2[0]}, [r2], r3		//write 8th Byte
+    vsli.64 d2, d1, #8              // [0][1][2][3][4][5]XO-->O[0][1][2][3][4][5]X
+    FILTER_SINGLE_TAG_8BITS d2, d7, d18, q9, q1
+    vst1.u8 {d2[0]}, [r2], r3       //write 8th Byte
 
-	cmp		r4, #0
-	bne		w9_h_mc_luma_loop
-	pop		{r4-r5}
+    cmp     r4, #0
+    bne     w9_h_mc_luma_loop
+    pop     {r4-r5}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN McHorVer02Height17_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
+    push        {r4}
+    ldr         r4, [sp, #4]
 
-	sub			r0, r0, r1, lsl #1		//src[-2*src_stride]
-	pld			[r0]
-	pld			[r0, r1]
-	vmov.u16	q14, #0x0014			// 20
-	vld1.u8	{q0}, [r0], r1		//q0=src[-2]
-	vld1.u8	{q1}, [r0], r1		//q1=src[-1]
+    sub         r0, r0, r1, lsl #1      //src[-2*src_stride]
+    pld         [r0]
+    pld         [r0, r1]
+    vmov.u16    q14, #0x0014            // 20
+    vld1.u8 {q0}, [r0], r1      //q0=src[-2]
+    vld1.u8 {q1}, [r0], r1      //q1=src[-1]
 
-	pld			[r0]
-	pld			[r0, r1]
-	vshr.u16	q15, q14, #2			// 5
-	vld1.u8	{q2}, [r0], r1		//q2=src[0]
-	vld1.u8	{q3}, [r0], r1		//q3=src[1]
-	vld1.u8	{q8}, [r0], r1		//q8=src[2]
+    pld         [r0]
+    pld         [r0, r1]
+    vshr.u16    q15, q14, #2            // 5
+    vld1.u8 {q2}, [r0], r1      //q2=src[0]
+    vld1.u8 {q3}, [r0], r1      //q3=src[1]
+    vld1.u8 {q8}, [r0], r1      //q8=src[2]
 
 w17_v_mc_luma_loop:
 
-	vld1.u8	{q9}, [r0], r1		//q9=src[3]
+    vld1.u8 {q9}, [r0], r1      //q9=src[3]
 
-	FILTER_6TAG_8BITS 	d0, d2, d4, d6, d16, d18, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d1, d3, d5, d7, d17, d19, d21, q14, q15
-	vld1.u8	{q0}, [r0], r1		//read 2nd row
-	vst1.u8	{q10}, [r2], r3			//write 1st 16Byte
+    FILTER_6TAG_8BITS   d0, d2, d4, d6, d16, d18, d20, q14, q15
+    pld         [r0]
+    FILTER_6TAG_8BITS   d1, d3, d5, d7, d17, d19, d21, q14, q15
+    vld1.u8 {q0}, [r0], r1      //read 2nd row
+    vst1.u8 {q10}, [r2], r3         //write 1st 16Byte
 
-	FILTER_6TAG_8BITS 	d2, d4, d6, d16, d18, d0, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d3, d5, d7, d17, d19, d1, d21, q14, q15
-	vld1.u8	{q1}, [r0], r1		//read 3rd row
-	vst1.u8	{q10}, [r2], r3			//write 2nd 16Byte
+    FILTER_6TAG_8BITS   d2, d4, d6, d16, d18, d0, d20, q14, q15
+    pld         [r0]
+    FILTER_6TAG_8BITS   d3, d5, d7, d17, d19, d1, d21, q14, q15
+    vld1.u8 {q1}, [r0], r1      //read 3rd row
+    vst1.u8 {q10}, [r2], r3         //write 2nd 16Byte
 
-	FILTER_6TAG_8BITS 	d4, d6, d16, d18, d0, d2, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d5, d7, d17, d19, d1, d3, d21, q14, q15
-	vld1.u8	{q2}, [r0], r1		//read 4th row
-	vst1.u8	{q10}, [r2], r3			//write 3rd 16Byte
+    FILTER_6TAG_8BITS   d4, d6, d16, d18, d0, d2, d20, q14, q15
+    pld         [r0]
+    FILTER_6TAG_8BITS   d5, d7, d17, d19, d1, d3, d21, q14, q15
+    vld1.u8 {q2}, [r0], r1      //read 4th row
+    vst1.u8 {q10}, [r2], r3         //write 3rd 16Byte
 
-	FILTER_6TAG_8BITS 	d6, d16, d18, d0, d2, d4, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d7, d17, d19, d1, d3, d5, d21, q14, q15
-	vld1.u8	{q3}, [r0], r1		//read 5th row
-	vst1.u8	{q10}, [r2], r3			//write 4th 16Byte
+    FILTER_6TAG_8BITS   d6, d16, d18, d0, d2, d4, d20, q14, q15
+    pld         [r0]
+    FILTER_6TAG_8BITS   d7, d17, d19, d1, d3, d5, d21, q14, q15
+    vld1.u8 {q3}, [r0], r1      //read 5th row
+    vst1.u8 {q10}, [r2], r3         //write 4th 16Byte
 
-	FILTER_6TAG_8BITS 	d16, d18, d0, d2, d4, d6, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d17, d19, d1, d3, d5, d7, d21, q14, q15
-	vld1.u8	{q8}, [r0], r1		//read 6th row
-	vst1.u8	{q10}, [r2], r3			//write 5th 16Byte
+    FILTER_6TAG_8BITS   d16, d18, d0, d2, d4, d6, d20, q14, q15
+    pld         [r0]
+    FILTER_6TAG_8BITS   d17, d19, d1, d3, d5, d7, d21, q14, q15
+    vld1.u8 {q8}, [r0], r1      //read 6th row
+    vst1.u8 {q10}, [r2], r3         //write 5th 16Byte
 
-	FILTER_6TAG_8BITS 	d18, d0, d2, d4, d6, d16, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d19, d1, d3, d5, d7, d17, d21, q14, q15
-	vld1.u8	{q9}, [r0], r1		//read 7th row
-	vst1.u8	{q10}, [r2], r3			//write 6th 16Byte
+    FILTER_6TAG_8BITS   d18, d0, d2, d4, d6, d16, d20, q14, q15
+    pld         [r0]
+    FILTER_6TAG_8BITS   d19, d1, d3, d5, d7, d17, d21, q14, q15
+    vld1.u8 {q9}, [r0], r1      //read 7th row
+    vst1.u8 {q10}, [r2], r3         //write 6th 16Byte
 
-	FILTER_6TAG_8BITS 	d0, d2, d4, d6, d16, d18, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d1, d3, d5, d7, d17, d19, d21, q14, q15
-	vld1.u8	{q0}, [r0], r1		//read 8th row
-	vst1.u8	{q10}, [r2], r3			//write 7th 16Byte
+    FILTER_6TAG_8BITS   d0, d2, d4, d6, d16, d18, d20, q14, q15
+    pld         [r0]
+    FILTER_6TAG_8BITS   d1, d3, d5, d7, d17, d19, d21, q14, q15
+    vld1.u8 {q0}, [r0], r1      //read 8th row
+    vst1.u8 {q10}, [r2], r3         //write 7th 16Byte
 
-	FILTER_6TAG_8BITS 	d2, d4, d6, d16, d18, d0, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d3, d5, d7, d17, d19, d1, d21, q14, q15
-	vst1.u8	{q10}, [r2], r3			//write 8th 16Byte
+    FILTER_6TAG_8BITS   d2, d4, d6, d16, d18, d0, d20, q14, q15
+    pld         [r0]
+    FILTER_6TAG_8BITS   d3, d5, d7, d17, d19, d1, d21, q14, q15
+    vst1.u8 {q10}, [r2], r3         //write 8th 16Byte
 
-	//q2, q3, q8, q9, q0 --> q0~q8
-	vswp	q0, q8
-	vswp	q0, q2
-	vmov	q1, q3
-	vmov	q3, q9						//q0~q8
+    //q2, q3, q8, q9, q0 --> q0~q8
+    vswp    q0, q8
+    vswp    q0, q2
+    vmov    q1, q3
+    vmov    q3, q9                      //q0~q8
 
-	sub		r4, #8
-	cmp		r4, #1
-	bne		w17_v_mc_luma_loop
-	// the last 16Bytes
-	vld1.u8	{q9}, [r0], r1		//q9=src[3]
-	FILTER_6TAG_8BITS 	d0, d2, d4, d6, d16, d18, d20, q14, q15
-	FILTER_6TAG_8BITS 	d1, d3, d5, d7, d17, d19, d21, q14, q15
-	vst1.u8	{q10}, [r2], r3			//write 1st 16Byte
+    sub     r4, #8
+    cmp     r4, #1
+    bne     w17_v_mc_luma_loop
+    // the last 16Bytes
+    vld1.u8 {q9}, [r0], r1      //q9=src[3]
+    FILTER_6TAG_8BITS   d0, d2, d4, d6, d16, d18, d20, q14, q15
+    FILTER_6TAG_8BITS   d1, d3, d5, d7, d17, d19, d21, q14, q15
+    vst1.u8 {q10}, [r2], r3         //write 1st 16Byte
 
-	pop		{r4}
+    pop     {r4}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN McHorVer02Height9_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
+    push        {r4}
+    ldr         r4, [sp, #4]
 
-	sub			r0, r0, r1, lsl #1		//src[-2*src_stride]
-	pld			[r0]
-	pld			[r0, r1]
-	vmov.u16	q14, #0x0014			// 20
-	vld1.u8	{d0}, [r0], r1		//d0=src[-2]
-	vld1.u8	{d1}, [r0], r1		//d1=src[-1]
+    sub         r0, r0, r1, lsl #1      //src[-2*src_stride]
+    pld         [r0]
+    pld         [r0, r1]
+    vmov.u16    q14, #0x0014            // 20
+    vld1.u8 {d0}, [r0], r1      //d0=src[-2]
+    vld1.u8 {d1}, [r0], r1      //d1=src[-1]
 
-	pld			[r0]
-	pld			[r0, r1]
-	vshr.u16	q15, q14, #2			// 5
-	vld1.u8	{d2}, [r0], r1		//d2=src[0]
-	vld1.u8	{d3}, [r0], r1		//d3=src[1]
+    pld         [r0]
+    pld         [r0, r1]
+    vshr.u16    q15, q14, #2            // 5
+    vld1.u8 {d2}, [r0], r1      //d2=src[0]
+    vld1.u8 {d3}, [r0], r1      //d3=src[1]
 
-	vld1.u8	{d4}, [r0], r1		//d4=src[2]
-	vld1.u8	{d5}, [r0], r1		//d5=src[3]
+    vld1.u8 {d4}, [r0], r1      //d4=src[2]
+    vld1.u8 {d5}, [r0], r1      //d5=src[3]
 
 w9_v_mc_luma_loop:
 
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d0, d1, d2, d3, d4, d5, d16, q14, q15
-	vld1.u8	{d0}, [r0], r1		//read 2nd row
-	vst1.u8	{d16}, [r2], r3		//write 1st 8Byte
+    pld         [r0]
+    FILTER_6TAG_8BITS   d0, d1, d2, d3, d4, d5, d16, q14, q15
+    vld1.u8 {d0}, [r0], r1      //read 2nd row
+    vst1.u8 {d16}, [r2], r3     //write 1st 8Byte
 
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d1, d2, d3, d4, d5, d0, d16, q14, q15
-	vld1.u8	{d1}, [r0], r1		//read 3rd row
-	vst1.u8	{d16}, [r2], r3		//write 2nd 8Byte
+    pld         [r0]
+    FILTER_6TAG_8BITS   d1, d2, d3, d4, d5, d0, d16, q14, q15
+    vld1.u8 {d1}, [r0], r1      //read 3rd row
+    vst1.u8 {d16}, [r2], r3     //write 2nd 8Byte
 
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d2, d3, d4, d5, d0, d1, d16, q14, q15
-	vld1.u8	{d2}, [r0], r1		//read 4th row
-	vst1.u8	{d16}, [r2], r3		//write 3rd 8Byte
+    pld         [r0]
+    FILTER_6TAG_8BITS   d2, d3, d4, d5, d0, d1, d16, q14, q15
+    vld1.u8 {d2}, [r0], r1      //read 4th row
+    vst1.u8 {d16}, [r2], r3     //write 3rd 8Byte
 
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d3, d4, d5, d0, d1, d2, d16, q14, q15
-	vld1.u8	{d3}, [r0], r1		//read 5th row
-	vst1.u8	{d16}, [r2], r3		//write 4th 8Byte
+    pld         [r0]
+    FILTER_6TAG_8BITS   d3, d4, d5, d0, d1, d2, d16, q14, q15
+    vld1.u8 {d3}, [r0], r1      //read 5th row
+    vst1.u8 {d16}, [r2], r3     //write 4th 8Byte
 
-	//d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
-	vswp	q0, q2
-	vswp	q1, q2
+    //d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
+    vswp    q0, q2
+    vswp    q1, q2
 
-	sub		r4, #4
-	cmp		r4, #1
-	bne		w9_v_mc_luma_loop
+    sub     r4, #4
+    cmp     r4, #1
+    bne     w9_v_mc_luma_loop
 
-	FILTER_6TAG_8BITS 	d0, d1, d2, d3, d4, d5, d16, q14, q15
-	vst1.u8	{d16}, [r2], r3		//write last 8Byte
+    FILTER_6TAG_8BITS   d0, d1, d2, d3, d4, d5, d16, q14, q15
+    vst1.u8 {d16}, [r2], r3     //write last 8Byte
 
-	pop		{r4}
+    pop     {r4}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN McHorVer22Width17_neon
-	push		{r4}
-	vpush		{q4-q7}
-	ldr			r4, [sp, #68]
+    push        {r4}
+    vpush       {q4-q7}
+    ldr         r4, [sp, #68]
 
-	sub			r0, #2					//src[-2]
-	sub			r0, r0, r1, lsl #1		//src[-2*src_stride-2]
-	pld			[r0]
-	pld			[r0, r1]
+    sub         r0, #2                  //src[-2]
+    sub         r0, r0, r1, lsl #1      //src[-2*src_stride-2]
+    pld         [r0]
+    pld         [r0, r1]
 
-	vmov.u16	q14, #0x0014			// 20
-	vld1.u8	{d0-d2}, [r0], r1		//use 21(17+5), =src[-2]
-	vld1.u8	{d3-d5}, [r0], r1		//use 21(17+5), =src[-1]
+    vmov.u16    q14, #0x0014            // 20
+    vld1.u8 {d0-d2}, [r0], r1       //use 21(17+5), =src[-2]
+    vld1.u8 {d3-d5}, [r0], r1       //use 21(17+5), =src[-1]
 
-	pld			[r0]
-	pld			[r0, r1]
-	vshr.u16	q15, q14, #2			// 5
+    pld         [r0]
+    pld         [r0, r1]
+    vshr.u16    q15, q14, #2            // 5
 
-	vld1.u8	{d6-d8}, [r0], r1		//use 21(17+5), =src[0]
-	vld1.u8	{d9-d11}, [r0], r1	//use 21(17+5), =src[1]
-	pld			[r0]
-	pld			[r0, r1]
-	vld1.u8	{d12-d14}, [r0], r1	//use 21(17+5), =src[2]
-	sub			r3, #16
+    vld1.u8 {d6-d8}, [r0], r1       //use 21(17+5), =src[0]
+    vld1.u8 {d9-d11}, [r0], r1  //use 21(17+5), =src[1]
+    pld         [r0]
+    pld         [r0, r1]
+    vld1.u8 {d12-d14}, [r0], r1 //use 21(17+5), =src[2]
+    sub         r3, #16
 
 w17_hv_mc_luma_loop:
 
-	vld1.u8	{d15-d17}, [r0], r1	//use 21(17+5), =src[3]
-	//the 1st row
-	pld			[r0]
-	// vertical filtered into q9/q10
-	FILTER_6TAG_8BITS_TO_16BITS 	d0, d3, d6, d9, d12, d15, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d1, d4, d7,d10, d13, d16,q10, q14, q15	// 8 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0	//output to q0[0]
-	// vertical filtered into q10/q11
-	FILTER_6TAG_8BITS_TO_16BITS 	d2, d5, d8,d11, d14, d17,q11, q14, q15	// only 6 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q10, q11, q9, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1	//output to q0[1]
-	vst1.u8	{d0, d1}, [r2]!			//write 16Byte
-	UNPACK_1_IN_8x16BITS_TO_8BITS	d2, d22, d23, q11 //output to d2[0]
-	vst1.u8	{d2[0]}, [r2], r3		//write 16th Byte
+    vld1.u8 {d15-d17}, [r0], r1 //use 21(17+5), =src[3]
+    //the 1st row
+    pld         [r0]
+    // vertical filtered into q9/q10
+    FILTER_6TAG_8BITS_TO_16BITS     d0, d3, d6, d9, d12, d15, q9, q14, q15  // 8 avail
+    FILTER_6TAG_8BITS_TO_16BITS     d1, d4, d7,d10, d13, d16,q10, q14, q15  // 8 avail
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC  q9, q10, q11, q12, q13
+    FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0   //output to q0[0]
+    // vertical filtered into q10/q11
+    FILTER_6TAG_8BITS_TO_16BITS     d2, d5, d8,d11, d14, d17,q11, q14, q15  // only 6 avail
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC  q10, q11, q9, q12, q13
+    FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1    //output to q0[1]
+    vst1.u8 {d0, d1}, [r2]!         //write 16Byte
+    UNPACK_1_IN_8x16BITS_TO_8BITS   d2, d22, d23, q11 //output to d2[0]
+    vst1.u8 {d2[0]}, [r2], r3       //write 16th Byte
 
-	vld1.u8	{d0-d2}, [r0], r1		//read 2nd row
-	//the 2nd row
-	pld			[r0]
-	// vertical filtered into q9/q10
-	FILTER_6TAG_8BITS_TO_16BITS 	d3, d6, d9, d12, d15, d0, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d4, d7,d10, d13, d16, d1,q10, q14, q15	// 8 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d3	//output to d3
-	// vertical filtered into q10/q11
-	FILTER_6TAG_8BITS_TO_16BITS 	d5, d8,d11, d14, d17, d2,q11, q14, q15	// only 6 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q10, q11, q9, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d4	//output to d4
-	vst1.u8	{d3, d4}, [r2]!		//write 16Byte
-	UNPACK_1_IN_8x16BITS_TO_8BITS	d5, d22, d23, q11 //output to d5[0]
-	vst1.u8	{d5[0]}, [r2], r3		//write 16th Byte
+    vld1.u8 {d0-d2}, [r0], r1       //read 2nd row
+    //the 2nd row
+    pld         [r0]
+    // vertical filtered into q9/q10
+    FILTER_6TAG_8BITS_TO_16BITS     d3, d6, d9, d12, d15, d0, q9, q14, q15  // 8 avail
+    FILTER_6TAG_8BITS_TO_16BITS     d4, d7,d10, d13, d16, d1,q10, q14, q15  // 8 avail
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC  q9, q10, q11, q12, q13
+    FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d3   //output to d3
+    // vertical filtered into q10/q11
+    FILTER_6TAG_8BITS_TO_16BITS     d5, d8,d11, d14, d17, d2,q11, q14, q15  // only 6 avail
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC  q10, q11, q9, q12, q13
+    FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d4    //output to d4
+    vst1.u8 {d3, d4}, [r2]!     //write 16Byte
+    UNPACK_1_IN_8x16BITS_TO_8BITS   d5, d22, d23, q11 //output to d5[0]
+    vst1.u8 {d5[0]}, [r2], r3       //write 16th Byte
 
-	vld1.u8	{d3-d5}, [r0], r1		//read 3rd row
-	//the 3rd row
-	pld			[r0]
-	// vertical filtered into q9/q10
-	FILTER_6TAG_8BITS_TO_16BITS 	d6, d9, d12, d15, d0, d3, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d7,d10, d13, d16, d1, d4,q10, q14, q15	// 8 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d6	//output to d6
-	// vertical filtered into q10/q11
-	FILTER_6TAG_8BITS_TO_16BITS 	d8,d11, d14, d17, d2, d5,q11, q14, q15	// only 6 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q10, q11, q9, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d7	//output to d7
-	vst1.u8	{d6, d7}, [r2]!		//write 16Byte
-	UNPACK_1_IN_8x16BITS_TO_8BITS	d8, d22, d23, q11 //output to d8[0]
-	vst1.u8	{d8[0]}, [r2], r3		//write 16th Byte
+    vld1.u8 {d3-d5}, [r0], r1       //read 3rd row
+    //the 3rd row
+    pld         [r0]
+    // vertical filtered into q9/q10
+    FILTER_6TAG_8BITS_TO_16BITS     d6, d9, d12, d15, d0, d3, q9, q14, q15  // 8 avail
+    FILTER_6TAG_8BITS_TO_16BITS     d7,d10, d13, d16, d1, d4,q10, q14, q15  // 8 avail
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC  q9, q10, q11, q12, q13
+    FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d6   //output to d6
+    // vertical filtered into q10/q11
+    FILTER_6TAG_8BITS_TO_16BITS     d8,d11, d14, d17, d2, d5,q11, q14, q15  // only 6 avail
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC  q10, q11, q9, q12, q13
+    FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d7    //output to d7
+    vst1.u8 {d6, d7}, [r2]!     //write 16Byte
+    UNPACK_1_IN_8x16BITS_TO_8BITS   d8, d22, d23, q11 //output to d8[0]
+    vst1.u8 {d8[0]}, [r2], r3       //write 16th Byte
 
-	vld1.u8	{d6-d8}, [r0], r1		//read 4th row
-	//the 4th row
-	pld			[r0]
-	// vertical filtered into q9/q10
-	FILTER_6TAG_8BITS_TO_16BITS 	 d9, d12, d15, d0, d3, d6, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS		d10, d13, d16, d1, d4, d7,q10, q14, q15	// 8 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d9	//output to d9
-	// vertical filtered into q10/q11
-	FILTER_6TAG_8BITS_TO_16BITS 	d11, d14, d17, d2, d5, d8,q11, q14, q15	// only 6 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q10, q11, q9, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d10	//output to d10
-	vst1.u8	{d9, d10}, [r2]!		//write 16Byte
-	UNPACK_1_IN_8x16BITS_TO_8BITS	d11, d22, d23, q11 //output to d11[0]
-	vst1.u8	{d11[0]}, [r2], r3		//write 16th Byte
+    vld1.u8 {d6-d8}, [r0], r1       //read 4th row
+    //the 4th row
+    pld         [r0]
+    // vertical filtered into q9/q10
+    FILTER_6TAG_8BITS_TO_16BITS      d9, d12, d15, d0, d3, d6, q9, q14, q15 // 8 avail
+    FILTER_6TAG_8BITS_TO_16BITS     d10, d13, d16, d1, d4, d7,q10, q14, q15 // 8 avail
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC  q9, q10, q11, q12, q13
+    FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d9   //output to d9
+    // vertical filtered into q10/q11
+    FILTER_6TAG_8BITS_TO_16BITS     d11, d14, d17, d2, d5, d8,q11, q14, q15 // only 6 avail
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC  q10, q11, q9, q12, q13
+    FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d10   //output to d10
+    vst1.u8 {d9, d10}, [r2]!        //write 16Byte
+    UNPACK_1_IN_8x16BITS_TO_8BITS   d11, d22, d23, q11 //output to d11[0]
+    vst1.u8 {d11[0]}, [r2], r3      //write 16th Byte
 
-	//d12~d17(q6~q8), d0~d8(q0~q3+d8), --> d0~d14
-	vswp	q0, q6
-	vswp	q6, q3
-	vmov	q5, q2
-	vmov	q2, q8
+    //d12~d17(q6~q8), d0~d8(q0~q3+d8), --> d0~d14
+    vswp    q0, q6
+    vswp    q6, q3
+    vmov    q5, q2
+    vmov    q2, q8
 
-	vmov	d20,d8
-	vmov	q4, q1
-	vmov	q1, q7
-	vmov	d14,d20
+    vmov    d20,d8
+    vmov    q4, q1
+    vmov    q1, q7
+    vmov    d14,d20
 
-	sub		r4, #4
-	cmp		r4, #1
-	bne		w17_hv_mc_luma_loop
-	//the last row
-	vld1.u8	{d15-d17}, [r0], r1	//use 21(17+5), =src[3]
-	// vertical filtered into q9/q10
-	FILTER_6TAG_8BITS_TO_16BITS 	d0, d3, d6, d9, d12, d15, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d1, d4, d7,d10, d13, d16,q10, q14, q15	// 8 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0	//output to q0[0]
-	// vertical filtered into q10/q11
-	FILTER_6TAG_8BITS_TO_16BITS 	d2, d5, d8,d11, d14, d17,q11, q14, q15	// only 6 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q10, q11, q9, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1	//output to q0[1]
-	vst1.u8	{q0}, [r2]!			//write 16Byte
-	UNPACK_1_IN_8x16BITS_TO_8BITS	d2, d22, d23, q11 //output to d2[0]
-	vst1.u8	{d2[0]}, [r2], r3		//write 16th Byte
+    sub     r4, #4
+    cmp     r4, #1
+    bne     w17_hv_mc_luma_loop
+    //the last row
+    vld1.u8 {d15-d17}, [r0], r1 //use 21(17+5), =src[3]
+    // vertical filtered into q9/q10
+    FILTER_6TAG_8BITS_TO_16BITS     d0, d3, d6, d9, d12, d15, q9, q14, q15  // 8 avail
+    FILTER_6TAG_8BITS_TO_16BITS     d1, d4, d7,d10, d13, d16,q10, q14, q15  // 8 avail
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC  q9, q10, q11, q12, q13
+    FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0   //output to q0[0]
+    // vertical filtered into q10/q11
+    FILTER_6TAG_8BITS_TO_16BITS     d2, d5, d8,d11, d14, d17,q11, q14, q15  // only 6 avail
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC  q10, q11, q9, q12, q13
+    FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1    //output to q0[1]
+    vst1.u8 {q0}, [r2]!         //write 16Byte
+    UNPACK_1_IN_8x16BITS_TO_8BITS   d2, d22, d23, q11 //output to d2[0]
+    vst1.u8 {d2[0]}, [r2], r3       //write 16th Byte
 
-	vpop		{q4-q7}
-	pop		{r4}
+    vpop        {q4-q7}
+    pop     {r4}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN McHorVer22Width9_neon
-	push		{r4}
-	vpush		{q4}
-	ldr			r4, [sp, #20]
+    push        {r4}
+    vpush       {q4}
+    ldr         r4, [sp, #20]
 
-	sub			r0, #2				//src[-2]
-	sub			r0, r0, r1, lsl #1	//src[-2*src_stride-2]
-	pld			[r0]
-	pld			[r0, r1]
+    sub         r0, #2              //src[-2]
+    sub         r0, r0, r1, lsl #1  //src[-2*src_stride-2]
+    pld         [r0]
+    pld         [r0, r1]
 
-	vmov.u16	q14, #0x0014		// 20
-	vld1.u8	{q0}, [r0], r1	//use 14(9+5), =src[-2]
-	vld1.u8	{q1}, [r0], r1	//use 14(9+5), =src[-1]
+    vmov.u16    q14, #0x0014        // 20
+    vld1.u8 {q0}, [r0], r1  //use 14(9+5), =src[-2]
+    vld1.u8 {q1}, [r0], r1  //use 14(9+5), =src[-1]
 
-	pld			[r0]
-	pld			[r0, r1]
-	vshr.u16	q15, q14, #2		// 5
+    pld         [r0]
+    pld         [r0, r1]
+    vshr.u16    q15, q14, #2        // 5
 
-	vld1.u8	{q2}, [r0], r1	//use 14(9+5), =src[0]
-	vld1.u8	{q3}, [r0], r1	//use 14(9+5), =src[1]
-	pld			[r0]
-	pld			[r0, r1]
-	vld1.u8	{q4}, [r0], r1	//use 14(9+5), =src[2]
-	sub			r3, #8
+    vld1.u8 {q2}, [r0], r1  //use 14(9+5), =src[0]
+    vld1.u8 {q3}, [r0], r1  //use 14(9+5), =src[1]
+    pld         [r0]
+    pld         [r0, r1]
+    vld1.u8 {q4}, [r0], r1  //use 14(9+5), =src[2]
+    sub         r3, #8
 
 w9_hv_mc_luma_loop:
 
-	vld1.u8	{q8}, [r0], r1	//use 14(9+5), =src[3]
-	//the 1st row
-	pld			[r0]
-	// vertical filtered into q9/q10
-	FILTER_6TAG_8BITS_TO_16BITS 	d0, d2, d4, d6, d8, d16, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d1, d3, d5, d7, d9, d17, q10, q14, q15	// 6 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18	//output to q9[0]
-	vst1.u8	d18, [r2]!				//write 8Byte
-	UNPACK_1_IN_8x16BITS_TO_8BITS	d19, d20, d21, q10 //output to d19[0]
-	vst1.u8	{d19[0]}, [r2], r3	//write 8th Byte
+    vld1.u8 {q8}, [r0], r1  //use 14(9+5), =src[3]
+    //the 1st row
+    pld         [r0]
+    // vertical filtered into q9/q10
+    FILTER_6TAG_8BITS_TO_16BITS     d0, d2, d4, d6, d8, d16, q9, q14, q15   // 8 avail
+    FILTER_6TAG_8BITS_TO_16BITS     d1, d3, d5, d7, d9, d17, q10, q14, q15  // 6 avail
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC  q9, q10, q11, q12, q13
+    FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18  //output to q9[0]
+    vst1.u8 d18, [r2]!              //write 8Byte
+    UNPACK_1_IN_8x16BITS_TO_8BITS   d19, d20, d21, q10 //output to d19[0]
+    vst1.u8 {d19[0]}, [r2], r3  //write 8th Byte
 
-	vld1.u8	{q0}, [r0], r1		//read 2nd row
-	//the 2nd row
-	pld			[r0]
-	// vertical filtered into q9/q10
-	FILTER_6TAG_8BITS_TO_16BITS 	d2, d4, d6, d8, d16, d0, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d3, d5, d7, d9, d17, d1, q10, q14, q15	// 6 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18	//output to q9[0]
-	vst1.u8	d18, [r2]!				//write 8Byte
-	UNPACK_1_IN_8x16BITS_TO_8BITS	d19, d20, d21, q10 //output to d19[0]
-	vst1.u8	{d19[0]}, [r2], r3	//write 8th Byte
+    vld1.u8 {q0}, [r0], r1      //read 2nd row
+    //the 2nd row
+    pld         [r0]
+    // vertical filtered into q9/q10
+    FILTER_6TAG_8BITS_TO_16BITS     d2, d4, d6, d8, d16, d0, q9, q14, q15   // 8 avail
+    FILTER_6TAG_8BITS_TO_16BITS     d3, d5, d7, d9, d17, d1, q10, q14, q15  // 6 avail
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC  q9, q10, q11, q12, q13
+    FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18  //output to q9[0]
+    vst1.u8 d18, [r2]!              //write 8Byte
+    UNPACK_1_IN_8x16BITS_TO_8BITS   d19, d20, d21, q10 //output to d19[0]
+    vst1.u8 {d19[0]}, [r2], r3  //write 8th Byte
 
-	vld1.u8	{q1}, [r0], r1		//read 3rd row
-	//the 3rd row
-	pld			[r0]
-	// vertical filtered into q9/q10
-	FILTER_6TAG_8BITS_TO_16BITS 	d4, d6, d8, d16, d0, d2, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d5, d7, d9, d17, d1, d3, q10, q14, q15	// 6 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18	//output to q9[0]
-	vst1.u8	d18, [r2]!				//write 8Byte
-	UNPACK_1_IN_8x16BITS_TO_8BITS	d19, d20, d21, q10 //output to d19[0]
-	vst1.u8	{d19[0]}, [r2], r3	//write 8th Byte
+    vld1.u8 {q1}, [r0], r1      //read 3rd row
+    //the 3rd row
+    pld         [r0]
+    // vertical filtered into q9/q10
+    FILTER_6TAG_8BITS_TO_16BITS     d4, d6, d8, d16, d0, d2, q9, q14, q15   // 8 avail
+    FILTER_6TAG_8BITS_TO_16BITS     d5, d7, d9, d17, d1, d3, q10, q14, q15  // 6 avail
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC  q9, q10, q11, q12, q13
+    FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18  //output to q9[0]
+    vst1.u8 d18, [r2]!              //write 8Byte
+    UNPACK_1_IN_8x16BITS_TO_8BITS   d19, d20, d21, q10 //output to d19[0]
+    vst1.u8 {d19[0]}, [r2], r3  //write 8th Byte
 
-	vld1.u8	{q2}, [r0], r1		//read 4th row
-	//the 4th row
-	pld			[r0]
-	// vertical filtered into q9/q10
-	FILTER_6TAG_8BITS_TO_16BITS 	d6, d8, d16, d0, d2, d4, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d7, d9, d17, d1, d3, d5, q10, q14, q15	// 6 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18	//output to q9[0]
-	vst1.u8	d18, [r2]!			//write 8Byte
-	UNPACK_1_IN_8x16BITS_TO_8BITS	d19, d20, d21, q10 //output to d19[0]
-	vst1.u8	{d19[0]}, [r2], r3	//write 8th Byte
+    vld1.u8 {q2}, [r0], r1      //read 4th row
+    //the 4th row
+    pld         [r0]
+    // vertical filtered into q9/q10
+    FILTER_6TAG_8BITS_TO_16BITS     d6, d8, d16, d0, d2, d4, q9, q14, q15   // 8 avail
+    FILTER_6TAG_8BITS_TO_16BITS     d7, d9, d17, d1, d3, d5, q10, q14, q15  // 6 avail
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC  q9, q10, q11, q12, q13
+    FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18  //output to q9[0]
+    vst1.u8 d18, [r2]!          //write 8Byte
+    UNPACK_1_IN_8x16BITS_TO_8BITS   d19, d20, d21, q10 //output to d19[0]
+    vst1.u8 {d19[0]}, [r2], r3  //write 8th Byte
 
-	//q4~q8, q0~q2, --> q0~q4
-	vswp	q0, q4
-	vswp	q2, q4
-	vmov	q3, q1
-	vmov	q1, q8
+    //q4~q8, q0~q2, --> q0~q4
+    vswp    q0, q4
+    vswp    q2, q4
+    vmov    q3, q1
+    vmov    q1, q8
 
-	sub		r4, #4
-	cmp		r4, #1
-	bne		w9_hv_mc_luma_loop
-	//the last row
-	vld1.u8	{q8}, [r0], r1	//use 14(9+5), =src[3]
-	// vertical filtered into q9/q10
-	FILTER_6TAG_8BITS_TO_16BITS 	d0, d2, d4, d6, d8, d16, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d1, d3, d5, d7, d9, d17, q10, q14, q15	// 6 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18	//output to q9[0]
-	vst1.u8	d18, [r2]!				//write 8Byte
-	UNPACK_1_IN_8x16BITS_TO_8BITS	d19, d20, d21, q10 //output to d19[0]
-	vst1.u8	{d19[0]}, [r2], r3	//write 8th Byte
-	vpop		{q4}
-	pop		{r4}
+    sub     r4, #4
+    cmp     r4, #1
+    bne     w9_hv_mc_luma_loop
+    //the last row
+    vld1.u8 {q8}, [r0], r1  //use 14(9+5), =src[3]
+    // vertical filtered into q9/q10
+    FILTER_6TAG_8BITS_TO_16BITS     d0, d2, d4, d6, d8, d16, q9, q14, q15   // 8 avail
+    FILTER_6TAG_8BITS_TO_16BITS     d1, d3, d5, d7, d9, d17, q10, q14, q15  // 6 avail
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC  q9, q10, q11, q12, q13
+    FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18  //output to q9[0]
+    vst1.u8 d18, [r2]!              //write 8Byte
+    UNPACK_1_IN_8x16BITS_TO_8BITS   d19, d20, d21, q10 //output to d19[0]
+    vst1.u8 {d19[0]}, [r2], r3  //write 8th Byte
+    vpop        {q4}
+    pop     {r4}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN PixStrideAvgWidthEq16_neon
-	push		{r4, r5, r6}
-	ldr			r4, [sp, #12]
-	ldr			r5, [sp, #16]
-	ldr			r6, [sp, #20]
+    push        {r4, r5, r6}
+    ldr         r4, [sp, #12]
+    ldr         r5, [sp, #16]
+    ldr         r6, [sp, #20]
 
 enc_w16_pix_avg_loop:
-	vld1.u8		{q0}, [r2], r3
-	vld1.u8		{q1}, [r4], r5
-	vld1.u8		{q2}, [r2], r3
-	vld1.u8		{q3}, [r4], r5
+    vld1.u8     {q0}, [r2], r3
+    vld1.u8     {q1}, [r4], r5
+    vld1.u8     {q2}, [r2], r3
+    vld1.u8     {q3}, [r4], r5
 
-	vld1.u8		{q8}, [r2], r3
-	vld1.u8		{q9}, [r4], r5
-	vld1.u8		{q10}, [r2], r3
-	vld1.u8		{q11}, [r4], r5
+    vld1.u8     {q8}, [r2], r3
+    vld1.u8     {q9}, [r4], r5
+    vld1.u8     {q10}, [r2], r3
+    vld1.u8     {q11}, [r4], r5
 
-	AVERAGE_TWO_8BITS		d0, d0, d2
-	AVERAGE_TWO_8BITS		d1, d1, d3
-	vst1.u8		{q0}, [r0], r1
+    AVERAGE_TWO_8BITS       d0, d0, d2
+    AVERAGE_TWO_8BITS       d1, d1, d3
+    vst1.u8     {q0}, [r0], r1
 
-	AVERAGE_TWO_8BITS		d4, d4, d6
-	AVERAGE_TWO_8BITS		d5, d5, d7
-	vst1.u8		{q2}, [r0], r1
+    AVERAGE_TWO_8BITS       d4, d4, d6
+    AVERAGE_TWO_8BITS       d5, d5, d7
+    vst1.u8     {q2}, [r0], r1
 
-	AVERAGE_TWO_8BITS		d16, d16, d18
-	AVERAGE_TWO_8BITS		d17, d17, d19
-	vst1.u8		{q8}, [r0], r1
+    AVERAGE_TWO_8BITS       d16, d16, d18
+    AVERAGE_TWO_8BITS       d17, d17, d19
+    vst1.u8     {q8}, [r0], r1
 
-	AVERAGE_TWO_8BITS		d20, d20, d22
-	AVERAGE_TWO_8BITS		d21, d21, d23
-	vst1.u8		{q10}, [r0], r1
+    AVERAGE_TWO_8BITS       d20, d20, d22
+    AVERAGE_TWO_8BITS       d21, d21, d23
+    vst1.u8     {q10}, [r0], r1
 
-	sub			r6, #4
-	cmp			r6, #0
-	bne			enc_w16_pix_avg_loop
+    sub         r6, #4
+    cmp         r6, #0
+    bne         enc_w16_pix_avg_loop
 
-	pop		{r4, r5, r6}
+    pop     {r4, r5, r6}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN PixStrideAvgWidthEq8_neon
-	push		{r4, r5, r6}
-	ldr			r4, [sp, #12]
-	ldr			r5, [sp, #16]
-	ldr			r6, [sp, #20]
+    push        {r4, r5, r6}
+    ldr         r4, [sp, #12]
+    ldr         r5, [sp, #16]
+    ldr         r6, [sp, #20]
 enc_w8_pix_avg_loop:
 
-	vld1.u8		{d0}, [r2], r3
-	vld1.u8		{d2}, [r4], r5
-	vld1.u8		{d1}, [r2], r3
-	vld1.u8		{d3}, [r4], r5
+    vld1.u8     {d0}, [r2], r3
+    vld1.u8     {d2}, [r4], r5
+    vld1.u8     {d1}, [r2], r3
+    vld1.u8     {d3}, [r4], r5
 
-	AVERAGE_TWO_8BITS		d0, d0, d2
-	AVERAGE_TWO_8BITS		d1, d1, d3
-	vst1.u8		{d0}, [r0], r1
-	vst1.u8		{d1}, [r0], r1
+    AVERAGE_TWO_8BITS       d0, d0, d2
+    AVERAGE_TWO_8BITS       d1, d1, d3
+    vst1.u8     {d0}, [r0], r1
+    vst1.u8     {d1}, [r0], r1
 
-	vld1.u8		{d4}, [r2], r3
-	vld1.u8		{d6}, [r4], r5
-	vld1.u8		{d5}, [r2], r3
-	vld1.u8		{d7}, [r4], r5
+    vld1.u8     {d4}, [r2], r3
+    vld1.u8     {d6}, [r4], r5
+    vld1.u8     {d5}, [r2], r3
+    vld1.u8     {d7}, [r4], r5
 
-	AVERAGE_TWO_8BITS		d4, d4, d6
-	AVERAGE_TWO_8BITS		d5, d5, d7
-	vst1.u8		{d4}, [r0], r1
-	vst1.u8		{d5}, [r0], r1
+    AVERAGE_TWO_8BITS       d4, d4, d6
+    AVERAGE_TWO_8BITS       d5, d5, d7
+    vst1.u8     {d4}, [r0], r1
+    vst1.u8     {d5}, [r0], r1
 
-	sub			r6, #4
-	cmp			r6, #0
-	bne			enc_w8_pix_avg_loop
+    sub         r6, #4
+    cmp         r6, #0
+    bne         enc_w8_pix_avg_loop
 
-	pop		{r4, r5, r6}
+    pop     {r4, r5, r6}
 WELS_ASM_FUNC_END
 
 #endif
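
For reference, the McHorVer22 routines above implement the 2-D half-pel case in two stages: a vertical 6-tap pass kept at 16-bit precision (FILTER_6TAG_8BITS_TO_16BITS), then a horizontal 6-tap pass over those intermediates (UNPACK_2_16BITS_TO_ABC plus FILTER_3_IN_16BITS_TO_8BITS), which evaluates (a-5*b+20*c)/16 as ((a-b)/4-b+c)/4+c so every intermediate fits in a 16-bit lane, as its comments note. A rough scalar sketch of the same arithmetic, with function names of our own choosing:

    /* Illustrative scalar sketch (not part of this patch) of the 2-D half-pel
     * interpolation the McHorVer22* routines vectorize; tap set 1,-5,20,20,-5,1. */
    #include <stdint.h>

    static uint8_t clip255(int v) {
        return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
    }

    /* Vertical 6-tap pass, result kept at 16-bit precision. */
    static int16_t filter6_vert(const uint8_t* s, int stride) {
        return (int16_t)((s[-2 * stride] + s[3 * stride])
                       - 5 * (s[-1 * stride] + s[2 * stride])
                      + 20 * (s[0] + s[stride]));
    }

    /* Horizontal 6-tap pass over the 16-bit intermediates, then round and clip. */
    static uint8_t half_pel_hv(const uint8_t* src, int stride) {
        int16_t t[6];
        for (int i = 0; i < 6; i++)          /* columns src[-2..3] */
            t[i] = filter6_vert(src + i - 2, stride);
        int a = t[0] + t[5];                 /* "a" = src[-2] + src[3] */
        int b = t[1] + t[4];                 /* "b" = src[-1] + src[2] */
        int c = t[2] + t[3];                 /* "c" = src[0]  + src[1] */
        return clip255((a - 5 * b + 20 * c + 512) >> 10);
    }
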
--- a/codec/common/arm64/expand_picture_aarch64_neon.S
+++ b/codec/common/arm64/expand_picture_aarch64_neon.S
@@ -53,88 +53,88 @@
     sub x8, x8, #1
     cbnz x8, _expand_picture_luma_loop2
     //for the top and bottom expand
-	add x2, x2, #64
-	sub x0, x0, #32
+    add x2, x2, #64
+    sub x0, x0, #32
     madd x4, x1, x3, x0
     sub x4, x4, x1
 _expand_picture_luma_loop0:
-	mov x5, #32
+    mov x5, #32
     msub x5, x5, x1, x0
-	add x6, x4, x1
+    add x6, x4, x1
     ld1 {v0.16b}, [x0], x10
     ld1 {v1.16b}, [x4], x10
-	mov x8, #32
+    mov x8, #32
 _expand_picture_luma_loop1:
-	st1 {v0.16b}, [x5], x1
-	st1 {v1.16b}, [x6], x1
-	sub x8, x8, #1
+    st1 {v0.16b}, [x5], x1
+    st1 {v1.16b}, [x6], x1
+    sub x8, x8, #1
     cbnz x8, _expand_picture_luma_loop1
 
-	sub x2, x2, #16
-	cbnz x2, _expand_picture_luma_loop0
+    sub x2, x2, #16
+    cbnz x2, _expand_picture_luma_loop0
 WELS_ASM_ARCH64_FUNC_END
 
 WELS_ASM_ARCH64_FUNC_BEGIN ExpandPictureChroma_AArch64_neon
-	//Save the dst
-	mov x7, x0
-	mov x8, x3
+    //Save the dst
+    mov x7, x0
+    mov x8, x3
     mov x10, #16
-	add x4, x7, x2
-	sub x4, x4, #1
+    add x4, x7, x2
+    sub x4, x4, #1
     //For the left and right expand
 _expand_picture_chroma_loop2:
-	sub x5, x7, #16
-	add x6, x4, #1
+    sub x5, x7, #16
+    add x6, x4, #1
 
-	ld1r {v0.16b}, [x7], x1
-	ld1r {v1.16b}, [x4], x1
+    ld1r {v0.16b}, [x7], x1
+    ld1r {v1.16b}, [x4], x1
 
-	st1 {v0.16b}, [x5]
-	st1 {v1.16b}, [x6]
-	sub x8, x8, #1
-	cbnz x8, _expand_picture_chroma_loop2
+    st1 {v0.16b}, [x5]
+    st1 {v1.16b}, [x6]
+    sub x8, x8, #1
+    cbnz x8, _expand_picture_chroma_loop2
 
-	//for the top and bottom expand
-	add x2, x2, #32
+    //for the top and bottom expand
+    add x2, x2, #32
     //
     mov x9, x2
     mov x11, #15
     bic x2, x2, x11
     //
-	sub x0, x0, #16
-	madd x4, x1, x3, x0
-	sub x4, x4, x1
+    sub x0, x0, #16
+    madd x4, x1, x3, x0
+    sub x4, x4, x1
 _expand_picture_chroma_loop0:
-	mov x5, #16
+    mov x5, #16
     msub x5, x5, x1, x0
-	add x6, x4, x1
-	ld1 {v0.16b}, [x0], x10
-	ld1 {v1.16b}, [x4], x10
+    add x6, x4, x1
+    ld1 {v0.16b}, [x0], x10
+    ld1 {v1.16b}, [x4], x10
 
-	mov x8, #16
+    mov x8, #16
 _expand_picture_chroma_loop1:
-	st1 {v0.16b}, [x5], x1
-	st1 {v1.16b}, [x6], x1
-	sub x8, x8, #1
+    st1 {v0.16b}, [x5], x1
+    st1 {v1.16b}, [x6], x1
+    sub x8, x8, #1
     cbnz x8, _expand_picture_chroma_loop1
 
-	sub x2, x2, #16
-	cbnz x2, _expand_picture_chroma_loop0
+    sub x2, x2, #16
+    cbnz x2, _expand_picture_chroma_loop0
 
     and x9, x9, #15
     sub x9, x9, #8
     cbnz x9, _expand_picture_chroma_end
-	mov x5, #16
+    mov x5, #16
     msub x5, x5, x1, x0
-	add x6, x4, x1
-	ld1 {v0.8b}, [x0]
-	ld1 {v1.8b}, [x4]
+    add x6, x4, x1
+    ld1 {v0.8b}, [x0]
+    ld1 {v1.8b}, [x4]
 
-	mov x8, #16
+    mov x8, #16
 _expand_picture_chroma_loop3:
-	st1 {v0.8b}, [x5], x1
-	st1 {v1.8b}, [x6], x1
-	sub x8, x8, #1
+    st1 {v0.8b}, [x5], x1
+    st1 {v1.8b}, [x6], x1
+    sub x8, x8, #1
     cbnz x8, _expand_picture_chroma_loop3
 _expand_picture_chroma_end:
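
ExpandPictureChroma_AArch64_neon pads the chroma plane outward: ld1r broadcasts the left/right edge pixel of each row into a full 16-byte store, the widened top and bottom rows are then copied upward/downward (16 rows for chroma, 32 for luma in the routine above it), and the final 8-byte pass covers widths that are not a multiple of 16. A rough scalar sketch of this kind of edge replication, with names and the pad parameter being our own:

    /* Illustrative sketch (not part of this patch) of picture-border padding
     * by edge replication; pad would be 16 for chroma and 32 for luma here. */
    #include <stdint.h>
    #include <string.h>

    static void expand_border(uint8_t* pic, int stride, int width, int height, int pad) {
        /* left/right: replicate the first and last pixel of every row */
        for (int y = 0; y < height; y++) {
            uint8_t* row = pic + y * stride;
            memset(row - pad, row[0], pad);
            memset(row + width, row[width - 1], pad);
        }
        /* top/bottom: copy the already-widened first and last rows outward */
        uint8_t* top = pic - pad;
        uint8_t* bot = pic + (height - 1) * stride - pad;
        for (int y = 1; y <= pad; y++) {
            memcpy(top - y * stride, top, (size_t)(width + 2 * pad));
            memcpy(bot + y * stride, bot, (size_t)(width + 2 * pad));
        }
    }
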
 
--- a/codec/common/arm64/mc_aarch64_neon.S
+++ b/codec/common/arm64/mc_aarch64_neon.S
@@ -39,31 +39,31 @@
 #ifdef __APPLE__
 
 .macro FILTER_6TAG_8BITS1
-//	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
+//  {   // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
     uaddl v18.8h, $0.8b, $5.8b //v18=src[-2]+src[3]
-    uaddl v19.8h, $2.8b, $3.8b	//src[0]+src[1]
+    uaddl v19.8h, $2.8b, $3.8b  //src[0]+src[1]
     mla v18.8h, v19.8h, $7.8h  //v18 += 20*(src[0]+src[1]), 2 cycles
     uaddl v19.8h, $1.8b, $4.8b  //src[-1]+src[2]
     mls v18.8h, v19.8h, $8.8h  //v18 -= 5*(src[-1]+src[2]), 2 cycles
     sqrshrun $6.8b, v18.8h, #5
-//	}
+//  }
 .endm
 
 .macro FILTER_6TAG_8BITS2
-//	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
+//  {   // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
     uaddl2 v18.8h, $0.16b, $5.16b //v18=src[-2]+src[3]
-    uaddl2 v19.8h, $2.16b, $3.16b	//src[0]+src[1]
+    uaddl2 v19.8h, $2.16b, $3.16b   //src[0]+src[1]
     mla v18.8h, v19.8h, $7.8h  //v18 += 20*(src[0]+src[1]), 2 cycles
     uaddl2 v19.8h, $1.16b, $4.16b  //src[-1]+src[2]
     mls v18.8h, v19.8h, $8.8h  //v18 -= 5*(src[-1]+src[2]), 2 cycles
     sqrshrun2 $6.16b, v18.8h, #5
-//	}
+//  }
 .endm
 
 .macro FILTER_6TAG_8BITS1_AVERAGE_WITH_0
-//	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
+//  {   // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
     uaddl v18.8h, $0.8b, $5.8b //v18=src[-2]+src[3]
-    uaddl v19.8h, $2.8b, $3.8b	//src[0]+src[1]
+    uaddl v19.8h, $2.8b, $3.8b  //src[0]+src[1]
     mla v18.8h, v19.8h, $7.8h  //v18 += 20*(src[0]+src[1]), 2 cycles
     uaddl v19.8h, $1.8b, $4.8b  //src[-1]+src[2]
     mls v18.8h, v19.8h, $8.8h  //v18 -= 5*(src[-1]+src[2]), 2 cycles
@@ -70,13 +70,13 @@
     sqrshrun $6.8b, v18.8h, #5
     uaddl  v19.8h, $2.8b, $6.8b
     rshrn $6.8b, v19.8h, #1
-//	}
+//  }
 .endm
 
 .macro FILTER_6TAG_8BITS2_AVERAGE_WITH_0
-//	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
+//  {   // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
     uaddl2 v18.8h, $0.16b, $5.16b //v18=src[-2]+src[3]
-    uaddl2 v19.8h, $2.16b, $3.16b	//src[0]+src[1]
+    uaddl2 v19.8h, $2.16b, $3.16b   //src[0]+src[1]
     mla v18.8h, v19.8h, $7.8h  //v18 += 20*(src[0]+src[1]), 2 cycles
     uaddl2 v19.8h, $1.16b, $4.16b  //src[-1]+src[2]
     mls v18.8h, v19.8h, $8.8h  //v18 -= 5*(src[-1]+src[2]), 2 cycles
@@ -83,13 +83,13 @@
     sqrshrun2 $6.16b, v18.8h, #5
     uaddl2  v19.8h, $2.16b, $6.16b
     rshrn2 $6.16b, v19.8h, #1
-//	}
+//  }
 .endm
 
 .macro FILTER_6TAG_8BITS1_AVERAGE_WITH_1
-//	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
+//  {   // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
     uaddl v18.8h, $0.8b, $5.8b //v18=src[-2]+src[3]
-    uaddl v19.8h, $2.8b, $3.8b	//src[0]+src[1]
+    uaddl v19.8h, $2.8b, $3.8b  //src[0]+src[1]
     mla v18.8h, v19.8h, $7.8h  //v18 += 20*(src[0]+src[1]), 2 cycles
     uaddl v19.8h, $1.8b, $4.8b  //src[-1]+src[2]
     mls v18.8h, v19.8h, $8.8h  //v18 -= 5*(src[-1]+src[2]), 2 cycles
@@ -96,13 +96,13 @@
     sqrshrun $6.8b, v18.8h, #5
     uaddl  v19.8h, $3.8b, $6.8b
     rshrn $6.8b, v19.8h, #1
-//	}
+//  }
 .endm
 
 .macro FILTER_6TAG_8BITS2_AVERAGE_WITH_1
-//	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
+//  {   // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
     uaddl2 v18.8h, $0.16b, $5.16b //v18=src[-2]+src[3]
-    uaddl2 v19.8h, $2.16b, $3.16b	//src[0]+src[1]
+    uaddl2 v19.8h, $2.16b, $3.16b   //src[0]+src[1]
     mla v18.8h, v19.8h, $7.8h  //v18 += 20*(src[0]+src[1]), 2 cycles
     uaddl2 v19.8h, $1.16b, $4.16b  //src[-1]+src[2]
     mls v18.8h, v19.8h, $8.8h  //v18 -= 5*(src[-1]+src[2]), 2 cycles
@@ -109,134 +109,134 @@
     sqrshrun2 $6.16b, v18.8h, #5
     uaddl2  v19.8h, $3.16b, $6.16b
     rshrn2 $6.16b, v19.8h, #1
-//	}
+//  }
 .endm
 
 .macro FILTER_6TAG_8BITS_TO_16BITS1
-//	{	// input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31
-    uaddl	$6.8h, $0.8b, $5.8b		//dst_q=src[-2]+src[3]
-    uaddl	v31.8h, $2.8b, $3.8b	//src[0]+src[1]
-    mla	$6.8h, v31.8h, $7.8h	//dst_q += 20*(src[0]+src[1]), 2 cycles
-    uaddl	v31.8h, $1.8b, $4.8b	//src[-1]+src[2]
-    mls	$6.8h, v31.8h, $8.8h	//dst_q -= 5*(src[-1]+src[2]), 2 cycles
-//	}
+//  {   // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31
+    uaddl   $6.8h, $0.8b, $5.8b     //dst_q=src[-2]+src[3]
+    uaddl   v31.8h, $2.8b, $3.8b    //src[0]+src[1]
+    mla $6.8h, v31.8h, $7.8h    //dst_q += 20*(src[0]+src[1]), 2 cycles
+    uaddl   v31.8h, $1.8b, $4.8b    //src[-1]+src[2]
+    mls $6.8h, v31.8h, $8.8h    //dst_q -= 5*(src[-1]+src[2]), 2 cycles
+//  }
 .endm
 
 .macro FILTER_6TAG_8BITS_TO_16BITS2
-//	{	// input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31
-    uaddl2	$6.8h, $0.16b, $5.16b		//dst_q=src[-2]+src[3]
-    uaddl2	v31.8h, $2.16b, $3.16b	//src[0]+src[1]
-    mla	$6.8h, v31.8h, $7.8h	//dst_q += 20*(src[0]+src[1]), 2 cycles
-    uaddl2	v31.8h, $1.16b, $4.16b	//src[-1]+src[2]
-    mls	$6.8h, v31.8h, $8.8h	//dst_q -= 5*(src[-1]+src[2]), 2 cycles
-//	}
+//  {   // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31
+    uaddl2  $6.8h, $0.16b, $5.16b       //dst_q=src[-2]+src[3]
+    uaddl2  v31.8h, $2.16b, $3.16b  //src[0]+src[1]
+    mla $6.8h, v31.8h, $7.8h    //dst_q += 20*(src[0]+src[1]), 2 cycles
+    uaddl2  v31.8h, $1.16b, $4.16b  //src[-1]+src[2]
+    mls $6.8h, v31.8h, $8.8h    //dst_q -= 5*(src[-1]+src[2]), 2 cycles
+//  }
 .endm
 
 .macro FILTER_3_IN_16BITS_TO_8BITS1
-//	{	// input:a, b, c, dst_d;
-    sub	$0.8h, $0.8h, $1.8h			//a-b
-    sshr	$0.8h, $0.8h, #2			//(a-b)/4
-    sub	$0.8h, $0.8h, $1.8h			//(a-b)/4-b
-    add	$0.8h, $0.8h, $2.8h			//(a-b)/4-b+c
-    sshr	$0.8h, $0.8h, #2			//((a-b)/4-b+c)/4
-    add	$0.8h, $0.8h, $2.8h			//((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
-    sqrshrun	$3.8b, $0.8h, #6		//(+32)>>6
-//	}
+//  {   // input:a, b, c, dst_d;
+    sub $0.8h, $0.8h, $1.8h         //a-b
+    sshr    $0.8h, $0.8h, #2            //(a-b)/4
+    sub $0.8h, $0.8h, $1.8h         //(a-b)/4-b
+    add $0.8h, $0.8h, $2.8h         //(a-b)/4-b+c
+    sshr    $0.8h, $0.8h, #2            //((a-b)/4-b+c)/4
+    add $0.8h, $0.8h, $2.8h         //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+    sqrshrun    $3.8b, $0.8h, #6        //(+32)>>6
+//  }
 .endm
 
 .macro FILTER_3_IN_16BITS_TO_8BITS2
-//	{	// input:a, b, c, dst_d;
-    sub	$0.8h, $0.8h, $1.8h			//a-b
-    sshr	$0.8h, $0.8h, #2			//(a-b)/4
-    sub	$0.8h, $0.8h, $1.8h			//(a-b)/4-b
-    add	$0.8h, $0.8h, $2.8h			//(a-b)/4-b+c
-    sshr	$0.8h, $0.8h, #2			//((a-b)/4-b+c)/4
-    add	$0.8h, $0.8h, $2.8h			//((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
-    sqrshrun2	$3.16b, $0.8h, #6		//(+32)>>6
-//	}
+//  {   // input:a, b, c, dst_d;
+    sub $0.8h, $0.8h, $1.8h         //a-b
+    sshr    $0.8h, $0.8h, #2            //(a-b)/4
+    sub $0.8h, $0.8h, $1.8h         //(a-b)/4-b
+    add $0.8h, $0.8h, $2.8h         //(a-b)/4-b+c
+    sshr    $0.8h, $0.8h, #2            //((a-b)/4-b+c)/4
+    add $0.8h, $0.8h, $2.8h         //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+    sqrshrun2   $3.16b, $0.8h, #6       //(+32)>>6
+//  }
 .endm
 
 .macro UNPACK_2_16BITS_TO_ABC
-//	{	// input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;
-    ext	$4.16b, $0.16b, $1.16b, #4		//src[0]
-    ext	$3.16b, $0.16b, $1.16b, #6		//src[1]
-    add	$4.8h, $4.8h, $3.8h					//c=src[0]+src[1]
+//  {   // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;
+    ext $4.16b, $0.16b, $1.16b, #4      //src[0]
+    ext $3.16b, $0.16b, $1.16b, #6      //src[1]
+    add $4.8h, $4.8h, $3.8h                 //c=src[0]+src[1]
 
-    ext	$3.16b, $0.16b, $1.16b, #2		//src[-1]
-    ext	$2.16b, $0.16b, $1.16b, #8		//src[2]
-    add	$3.8h, $3.8h, $2.8h					//b=src[-1]+src[2]
+    ext $3.16b, $0.16b, $1.16b, #2      //src[-1]
+    ext $2.16b, $0.16b, $1.16b, #8      //src[2]
+    add $3.8h, $3.8h, $2.8h                 //b=src[-1]+src[2]
 
-    ext	$2.16b, $0.16b, $1.16b, #10		//src[3]
-    add	$2.8h, $2.8h, $0.8h					//a=src[-2]+src[3]
-//	}
+    ext $2.16b, $0.16b, $1.16b, #10     //src[3]
+    add $2.8h, $2.8h, $0.8h                 //a=src[-2]+src[3]
+//  }
 .endm
 
 .macro AVERAGE_TWO_8BITS1
-//	{	// input:dst_d, src_d A and B; working: v5
-    uaddl	v30.8h, $2.8b, $1.8b
-    rshrn	$0.8b, v30.8h, #1
-//	}
+//  {   // input:dst_d, src_d A and B; working: v5
+    uaddl   v30.8h, $2.8b, $1.8b
+    rshrn   $0.8b, v30.8h, #1
+//  }
 .endm
 
 .macro AVERAGE_TWO_8BITS2
-//	{	// input:dst_d, src_d A and B; working: v5
-    uaddl2	v30.8h, $2.16b, $1.16b
-    rshrn2	$0.16b, v30.8h, #1
-//	}
+//  {   // input:dst_d, src_d A and B; working: v5
+    uaddl2  v30.8h, $2.16b, $1.16b
+    rshrn2  $0.16b, v30.8h, #1
+//  }
 .endm
 
-.macro FILTER_SINGLE_TAG_8BITS		// when width=17/9, used
-//	{	// input: src_d{Y[0][1][2][3][4][5]X},
-    rev64	$2.8b, $0.8b				// X[5][4][3][2][1][0]O
-    uaddl	$2.8h, $0.8b, $2.8b			// each 16bits, *[50][41][32][23][14][05]*
-    mul	$2.4h, $2.4h, $1.4h			// 0+1*[50]-5*[41]+20[32]
+.macro FILTER_SINGLE_TAG_8BITS      // when width=17/9, used
+//  {   // input: src_d{Y[0][1][2][3][4][5]X},
+    rev64   $2.8b, $0.8b                // X[5][4][3][2][1][0]O
+    uaddl   $2.8h, $0.8b, $2.8b         // each 16bits, *[50][41][32][23][14][05]*
+    mul $2.4h, $2.4h, $1.4h         // 0+1*[50]-5*[41]+20[32]
     addv $3, $2.4h
     sqrshrun $0.8b, $0.8h, #5
-//	}
+//  }
 .endm
 
 .macro UNPACK_FILTER_SINGLE_TAG_16BITS // v0, v1, v22, v23
-//	{	// each 16bits; input: d_dst, d_src[0:5], para, working, working, d(low part of d_dst)
+//  {   // each 16bits; input: d_dst, d_src[0:5], para, working, working, d(low part of d_dst)
     ext.16b $3, $1, $1, #14       // X[0][1][2][3][4][5]O
     ext.16b $4, $3, $3, #8      // [3][4][5]OX[0][1][2]
-    rev64  $4.8h, $4.8h			// X[5][4][3][2][1][0]O
+    rev64  $4.8h, $4.8h         // X[5][4][3][2][1][0]O
     add   $3.8h, $3.8h, $4.8h    // each 16bits, *[50][41][32][23][14][05]*
-    smull $3.4s, $3.4h, $2.4h			// 0+1*[50]-5*[41]+20[32]
+    smull $3.4s, $3.4h, $2.4h           // 0+1*[50]-5*[41]+20[32]
     saddlv $5, $3.4s
     //sshr $0.2d, $0.2d, #4
     sqrshrun $0.2s, $0.2d, #10
     uqxtn $0.4h, $0.4s
     uqxtn $0.8b, $0.8h
-   //	}
+   //   }
 .endm
 
 #else
 .macro FILTER_6TAG_8BITS1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-//	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
+//  {   // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
     uaddl v18.8h, \arg0\().8b, \arg5\().8b //v18=src[-2]+src[3]
-    uaddl v19.8h, \arg2\().8b, \arg3\().8b	//src[0]+src[1]
+    uaddl v19.8h, \arg2\().8b, \arg3\().8b  //src[0]+src[1]
     mla v18.8h, v19.8h, \arg7\().8h  //v18 += 20*(src[0]+src[1]), 2 cycles
     uaddl v19.8h, \arg1\().8b, \arg4\().8b  //src[-1]+src[2]
     mls v18.8h, v19.8h, \arg8\().8h  //v18 -= 5*(src[-1]+src[2]), 2 cycles
     sqrshrun \arg6\().8b, v18.8h, #5
-//	}
+//  }
 .endm
 
 .macro FILTER_6TAG_8BITS2 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-//	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
+//  {   // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
     uaddl2 v18.8h, \arg0\().16b, \arg5\().16b //v18=src[-2]+src[3]
-    uaddl2 v19.8h, \arg2\().16b, \arg3\().16b	//src[0]+src[1]
+    uaddl2 v19.8h, \arg2\().16b, \arg3\().16b   //src[0]+src[1]
     mla v18.8h, v19.8h, \arg7\().8h  //v18 += 20*(src[0]+src[1]), 2 cycles
     uaddl2 v19.8h, \arg1\().16b, \arg4\().16b  //src[-1]+src[2]
     mls v18.8h, v19.8h, \arg8\().8h  //v18 -= 5*(src[-1]+src[2]), 2 cycles
     sqrshrun2 \arg6\().16b, v18.8h, #5
-//	}
+//  }
 .endm
 
 .macro FILTER_6TAG_8BITS1_AVERAGE_WITH_0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-//	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
+//  {   // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
     uaddl v18.8h, \arg0\().8b, \arg5\().8b //v18=src[-2]+src[3]
-    uaddl v19.8h, \arg2\().8b, \arg3\().8b	//src[0]+src[1]
+    uaddl v19.8h, \arg2\().8b, \arg3\().8b  //src[0]+src[1]
     mla v18.8h, v19.8h, \arg7\().8h  //v18 += 20*(src[0]+src[1]), 2 cycles
     uaddl v19.8h, \arg1\().8b, \arg4\().8b  //src[-1]+src[2]
     mls v18.8h, v19.8h, \arg8\().8h  //v18 -= 5*(src[-1]+src[2]), 2 cycles
@@ -243,13 +243,13 @@
     sqrshrun \arg6\().8b, v18.8h, #5
     uaddl  v19.8h, \arg2\().8b, \arg6\().8b
     rshrn \arg6\().8b, v19.8h, #1
-//	}
+//  }
 .endm
 
 .macro FILTER_6TAG_8BITS2_AVERAGE_WITH_0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-//	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
+//  {   // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
     uaddl2 v18.8h, \arg0\().16b, \arg5\().16b //v18=src[-2]+src[3]
-    uaddl2 v19.8h, \arg2\().16b, \arg3\().16b	//src[0]+src[1]
+    uaddl2 v19.8h, \arg2\().16b, \arg3\().16b   //src[0]+src[1]
     mla v18.8h, v19.8h, \arg7\().8h  //v18 += 20*(src[0]+src[1]), 2 cycles
     uaddl2 v19.8h, \arg1\().16b, \arg4\().16b  //src[-1]+src[2]
     mls v18.8h, v19.8h, \arg8\().8h  //v18 -= 5*(src[-1]+src[2]), 2 cycles
@@ -256,13 +256,13 @@
     sqrshrun2 \arg6\().16b, v18.8h, #5
     uaddl2  v19.8h, \arg2\().16b, \arg6\().16b
     rshrn2 \arg6\().16b, v19.8h, #1
-//	}
+//  }
 .endm
 
 .macro FILTER_6TAG_8BITS1_AVERAGE_WITH_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-//	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
+//  {   // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
     uaddl v18.8h, \arg0\().8b, \arg5\().8b //v18=src[-2]+src[3]
-    uaddl v19.8h, \arg2\().8b, \arg3\().8b	//src[0]+src[1]
+    uaddl v19.8h, \arg2\().8b, \arg3\().8b  //src[0]+src[1]
     mla v18.8h, v19.8h, \arg7\().8h  //v18 += 20*(src[0]+src[1]), 2 cycles
     uaddl v19.8h, \arg1\().8b, \arg4\().8b  //src[-1]+src[2]
     mls v18.8h, v19.8h, \arg8\().8h  //v18 -= 5*(src[-1]+src[2]), 2 cycles
@@ -269,13 +269,13 @@
     sqrshrun \arg6\().8b, v18.8h, #5
     uaddl  v19.8h, \arg3\().8b, \arg6\().8b
     rshrn \arg6\().8b, v19.8h, #1
-//	}
+//  }
 .endm
 
 .macro FILTER_6TAG_8BITS2_AVERAGE_WITH_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-//	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
+//  {   // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
     uaddl2 v18.8h, \arg0\().16b, \arg5\().16b //v18=src[-2]+src[3]
-    uaddl2 v19.8h, \arg2\().16b, \arg3\().16b	//src[0]+src[1]
+    uaddl2 v19.8h, \arg2\().16b, \arg3\().16b   //src[0]+src[1]
     mla v18.8h, v19.8h, \arg7\().8h  //v18 += 20*(src[0]+src[1]), 2 cycles
     uaddl2 v19.8h, \arg1\().16b, \arg4\().16b  //src[-1]+src[2]
     mls v18.8h, v19.8h, \arg8\().8h  //v18 -= 5*(src[-1]+src[2]), 2 cycles
@@ -282,106 +282,106 @@
     sqrshrun2 \arg6\().16b, v18.8h, #5
     uaddl2  v19.8h, \arg3\().16b, \arg6\().16b
     rshrn2 \arg6\().16b, v19.8h, #1
-//	}
+//  }
 .endm
 
 .macro FILTER_6TAG_8BITS_TO_16BITS1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-//	{	// input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31
-    uaddl	\arg6\().8h, \arg0\().8b, \arg5\().8b		//dst_q=src[-2]+src[3]
-    uaddl	v31.8h, \arg2\().8b, \arg3\().8b	//src[0]+src[1]
-    mla	\arg6\().8h, v31.8h, \arg7\().8h	//dst_q += 20*(src[0]+src[1]), 2 cycles
-    uaddl	v31.8h, \arg1\().8b, \arg4\().8b	//src[-1]+src[2]
-    mls	\arg6\().8h, v31.8h, \arg8\().8h	//dst_q -= 5*(src[-1]+src[2]), 2 cycles
-//	}
+//  {   // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31
+    uaddl   \arg6\().8h, \arg0\().8b, \arg5\().8b       //dst_q=src[-2]+src[3]
+    uaddl   v31.8h, \arg2\().8b, \arg3\().8b    //src[0]+src[1]
+    mla \arg6\().8h, v31.8h, \arg7\().8h    //dst_q += 20*(src[0]+src[1]), 2 cycles
+    uaddl   v31.8h, \arg1\().8b, \arg4\().8b    //src[-1]+src[2]
+    mls \arg6\().8h, v31.8h, \arg8\().8h    //dst_q -= 5*(src[-1]+src[2]), 2 cycles
+//  }
 .endm
 
 .macro FILTER_6TAG_8BITS_TO_16BITS2 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-//	{	// input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31
-    uaddl2	\arg6\().8h, \arg0\().16b, \arg5\().16b		//dst_q=src[-2]+src[3]
-    uaddl2	v31.8h, \arg2\().16b, \arg3\().16b	//src[0]+src[1]
-    mla	\arg6\().8h, v31.8h, \arg7\().8h	//dst_q += 20*(src[0]+src[1]), 2 cycles
-    uaddl2	v31.8h, \arg1\().16b, \arg4\().16b	//src[-1]+src[2]
-    mls	\arg6\().8h, v31.8h, \arg8\().8h	//dst_q -= 5*(src[-1]+src[2]), 2 cycles
-//	}
+//  {   // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31
+    uaddl2  \arg6\().8h, \arg0\().16b, \arg5\().16b     //dst_q=src[-2]+src[3]
+    uaddl2  v31.8h, \arg2\().16b, \arg3\().16b  //src[0]+src[1]
+    mla \arg6\().8h, v31.8h, \arg7\().8h    //dst_q += 20*(src[0]+src[1]), 2 cycles
+    uaddl2  v31.8h, \arg1\().16b, \arg4\().16b  //src[-1]+src[2]
+    mls \arg6\().8h, v31.8h, \arg8\().8h    //dst_q -= 5*(src[-1]+src[2]), 2 cycles
+//  }
 .endm
 
 .macro FILTER_3_IN_16BITS_TO_8BITS1 arg0, arg1, arg2, arg3
-//	{	// input:a, b, c, dst_d;
-    sub	\arg0\().8h, \arg0\().8h, \arg1\().8h			//a-b
-    sshr	\arg0\().8h, \arg0\().8h, #2			//(a-b)/4
-    sub	\arg0\().8h, \arg0\().8h, \arg1\().8h			//(a-b)/4-b
-    add	\arg0\().8h, \arg0\().8h, \arg2\().8h			//(a-b)/4-b+c
-    sshr	\arg0\().8h, \arg0\().8h, #2			//((a-b)/4-b+c)/4
-    add	\arg0\().8h, \arg0\().8h, \arg2\().8h			//((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
-    sqrshrun	\arg3\().8b, \arg0\().8h, #6		//(+32)>>6
-//	}
+//  {   // input:a, b, c, dst_d;
+    sub \arg0\().8h, \arg0\().8h, \arg1\().8h           //a-b
+    sshr    \arg0\().8h, \arg0\().8h, #2            //(a-b)/4
+    sub \arg0\().8h, \arg0\().8h, \arg1\().8h           //(a-b)/4-b
+    add \arg0\().8h, \arg0\().8h, \arg2\().8h           //(a-b)/4-b+c
+    sshr    \arg0\().8h, \arg0\().8h, #2            //((a-b)/4-b+c)/4
+    add \arg0\().8h, \arg0\().8h, \arg2\().8h           //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+    sqrshrun    \arg3\().8b, \arg0\().8h, #6        //(+32)>>6
+//  }
 .endm
 
 .macro FILTER_3_IN_16BITS_TO_8BITS2 arg0, arg1, arg2, arg3
-//	{	// input:a, b, c, dst_d;
-    sub	\arg0\().8h, \arg0\().8h, \arg1\().8h			//a-b
-    sshr	\arg0\().8h, \arg0\().8h, #2			//(a-b)/4
-    sub	\arg0\().8h, \arg0\().8h, \arg1\().8h			//(a-b)/4-b
-    add	\arg0\().8h, \arg0\().8h, \arg2\().8h			//(a-b)/4-b+c
-    sshr	\arg0\().8h, \arg0\().8h, #2			//((a-b)/4-b+c)/4
-    add	\arg0\().8h, \arg0\().8h, \arg2\().8h			//((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
-    sqrshrun2	\arg3\().16b, \arg0\().8h, #6		//(+32)>>6
-//	}
+//  {   // input:a, b, c, dst_d;
+    sub \arg0\().8h, \arg0\().8h, \arg1\().8h           //a-b
+    sshr    \arg0\().8h, \arg0\().8h, #2            //(a-b)/4
+    sub \arg0\().8h, \arg0\().8h, \arg1\().8h           //(a-b)/4-b
+    add \arg0\().8h, \arg0\().8h, \arg2\().8h           //(a-b)/4-b+c
+    sshr    \arg0\().8h, \arg0\().8h, #2            //((a-b)/4-b+c)/4
+    add \arg0\().8h, \arg0\().8h, \arg2\().8h           //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+    sqrshrun2   \arg3\().16b, \arg0\().8h, #6       //(+32)>>6
+//  }
 .endm
 
 .macro UNPACK_2_16BITS_TO_ABC arg0, arg1, arg2, arg3, arg4
-//	{	// input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;
-    ext	\arg4\().16b, \arg0\().16b, \arg1\().16b, #4		//src[0]
-    ext	\arg3\().16b, \arg0\().16b, \arg1\().16b, #6		//src[1]
-    add	\arg4\().8h, \arg4\().8h, \arg3\().8h					//c=src[0]+src[1]
+//  {   // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;
+    ext \arg4\().16b, \arg0\().16b, \arg1\().16b, #4        //src[0]
+    ext \arg3\().16b, \arg0\().16b, \arg1\().16b, #6        //src[1]
+    add \arg4\().8h, \arg4\().8h, \arg3\().8h                   //c=src[0]+src[1]
 
-    ext	\arg3\().16b, \arg0\().16b, \arg1\().16b, #2		//src[-1]
-    ext	\arg2\().16b, \arg0\().16b, \arg1\().16b, #8		//src[2]
-    add	\arg3\().8h, \arg3\().8h, \arg2\().8h					//b=src[-1]+src[2]
+    ext \arg3\().16b, \arg0\().16b, \arg1\().16b, #2        //src[-1]
+    ext \arg2\().16b, \arg0\().16b, \arg1\().16b, #8        //src[2]
+    add \arg3\().8h, \arg3\().8h, \arg2\().8h                   //b=src[-1]+src[2]
 
-    ext	\arg2\().16b, \arg0\().16b, \arg1\().16b, #10		//src[3]
-    add	\arg2\().8h, \arg2\().8h, \arg0\().8h					//a=src[-2]+src[3]
-//	}
+    ext \arg2\().16b, \arg0\().16b, \arg1\().16b, #10       //src[3]
+    add \arg2\().8h, \arg2\().8h, \arg0\().8h                   //a=src[-2]+src[3]
+//  }
 .endm
 
 .macro AVERAGE_TWO_8BITS1 arg0, arg1, arg2
-//	{	// input:dst_d, src_d A and B; working: v5
-    uaddl	v30.8h, \arg2\().8b, \arg1\().8b
-    rshrn	\arg0\().8b, v30.8h, #1
-//	}
+//  {   // input:dst_d, src_d A and B; working: v5
+    uaddl   v30.8h, \arg2\().8b, \arg1\().8b
+    rshrn   \arg0\().8b, v30.8h, #1
+//  }
 .endm
 
 .macro AVERAGE_TWO_8BITS2 arg0, arg1, arg2
-//	{	// input:dst_d, src_d A and B; working: v5
-    uaddl2	v30.8h, \arg2\().16b, \arg1\().16b
-    rshrn2	\arg0\().16b, v30.8h, #1
-//	}
+//  {   // input:dst_d, src_d A and B; working: v5
+    uaddl2  v30.8h, \arg2\().16b, \arg1\().16b
+    rshrn2  \arg0\().16b, v30.8h, #1
+//  }
 .endm
 
 .macro FILTER_SINGLE_TAG_8BITS arg0, arg1, arg2, arg3
 // when width=17/9, used
-//	{	// input: src_d{Y[0][1][2][3][4][5]X},
-    rev64	\arg2\().8b, \arg0\().8b				// X[5][4][3][2][1][0]O
-    uaddl	\arg2\().8h, \arg0\().8b, \arg2\().8b			// each 16bits, *[50][41][32][23][14][05]*
-    mul	\arg2\().4h, \arg2\().4h, \arg1\().4h			// 0+1*[50]-5*[41]+20[32]
+//  {   // input: src_d{Y[0][1][2][3][4][5]X},
+    rev64   \arg2\().8b, \arg0\().8b                // X[5][4][3][2][1][0]O
+    uaddl   \arg2\().8h, \arg0\().8b, \arg2\().8b           // each 16bits, *[50][41][32][23][14][05]*
+    mul \arg2\().4h, \arg2\().4h, \arg1\().4h           // 0+1*[50]-5*[41]+20[32]
     addv \arg3, \arg2\().4h
     sqrshrun \arg0\().8b, \arg0\().8h, #5
-//	}
+//  }
 .endm
 
 .macro UNPACK_FILTER_SINGLE_TAG_16BITS arg0, arg1, arg2, arg3, arg4, arg5
-//	{	// each 16bits; input: d_dst, d_src[0:5], para, working, working, d(low part of d_dst)
+//  {   // each 16bits; input: d_dst, d_src[0:5], para, working, working, d(low part of d_dst)
     ext \arg3\().16b, \arg1\().16b, \arg1\().16b, #14       // X[0][1][2][3][4][5]O
     ext \arg4\().16b, \arg3\().16b, \arg3\().16b, #8      // [3][4][5]OX[0][1][2]
-    rev64  \arg4\().8h, \arg4\().8h			// X[5][4][3][2][1][0]O
+    rev64  \arg4\().8h, \arg4\().8h         // X[5][4][3][2][1][0]O
     add   \arg3\().8h, \arg3\().8h, \arg4\().8h    // each 16bits, *[50][41][32][23][14][05]*
-    smull \arg3\().4s, \arg3\().4h, \arg2\().4h			// 0+1*[50]-5*[41]+20[32]
+    smull \arg3\().4s, \arg3\().4h, \arg2\().4h         // 0+1*[50]-5*[41]+20[32]
     saddlv \arg5, \arg3\().4s
     //sshr \arg0\().2d, \arg0\().2d, #4
     sqrshrun \arg0\().2s, \arg0\().2d, #10
     uqxtn \arg0\().4h, \arg0\().4s
     uqxtn \arg0\().8b, \arg0\().8h
-   //	}
+   //   }
 .endm
 #endif
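
The *_AVERAGE_WITH_0 and *_AVERAGE_WITH_1 macro variants above average the 6-tap result with src[0] or src[1] respectively, and AVERAGE_TWO_8BITS1/2 do the same for two predictions; in each case uaddl followed by rshrn #1 is a rounded mean. A minimal scalar equivalent, with a name of our own choosing:

    /* Illustrative sketch (not part of this patch): rounded mean of two
     * 8-bit samples, matching the uaddl ; rshrn #1 pair used above. */
    #include <stdint.h>

    static inline uint8_t rounded_avg(uint8_t a, uint8_t b) {
        return (uint8_t)((a + b + 1) >> 1);
    }
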
 
@@ -405,7 +405,7 @@
 
     sub x4, x4, #1
     st1 {v20.16b}, [x2], x3 //write 16Byte
-	cbnz x4, w16_h_mc_luma_loop
+    cbnz x4, w16_h_mc_luma_loop
 WELS_ASM_ARCH64_FUNC_END
 
 WELS_ASM_ARCH64_FUNC_BEGIN McHorVer20WidthEq8_AArch64_neon
@@ -426,7 +426,7 @@
 
     sub x4, x4, #1
     st1 {v20.8b}, [x2], x3 //write 8Byte
-	cbnz x4, w8_h_mc_luma_loop
+    cbnz x4, w8_h_mc_luma_loop
 WELS_ASM_ARCH64_FUNC_END
 
 WELS_ASM_ARCH64_FUNC_BEGIN McHorVer20WidthEq4_AArch64_neon
@@ -461,7 +461,7 @@
     st1 {v20.s}[0], [x2], x3 //write 4Byte
     st1 {v20.s}[1], [x2], x3 //write 4Byte
     sub x4, x4, #1
-	cbnz x4, w4_h_mc_luma_loop
+    cbnz x4, w4_h_mc_luma_loop
 WELS_ASM_ARCH64_FUNC_END
 
 WELS_ASM_ARCH64_FUNC_BEGIN McHorVer10WidthEq16_AArch64_neon
@@ -483,7 +483,7 @@
 
     sub x4, x4, #1
     st1 {v20.16b}, [x2], x3 //write 16Byte
-	cbnz x4, w16_xy_10_mc_luma_loop
+    cbnz x4, w16_xy_10_mc_luma_loop
 WELS_ASM_ARCH64_FUNC_END
 
 
@@ -505,7 +505,7 @@
 
     sub x4, x4, #1
     st1 {v20.8b}, [x2], x3 //write 8Byte
-	cbnz x4, w8_xy_10_mc_luma_loop
+    cbnz x4, w8_xy_10_mc_luma_loop
 WELS_ASM_ARCH64_FUNC_END
 
 WELS_ASM_ARCH64_FUNC_BEGIN McHorVer10WidthEq4_AArch64_neon
@@ -540,7 +540,7 @@
     st1 {v20.s}[0], [x2], x3 //write 4Byte
     st1 {v20.s}[1], [x2], x3 //write 4Byte
     sub x4, x4, #1
-	cbnz x4, w4_xy_10_mc_luma_loop
+    cbnz x4, w4_xy_10_mc_luma_loop
 WELS_ASM_ARCH64_FUNC_END
 
 
@@ -563,7 +563,7 @@
 
     sub x4, x4, #1
     st1 {v20.16b}, [x2], x3 //write 16Byte
-	cbnz x4, w16_xy_30_mc_luma_loop
+    cbnz x4, w16_xy_30_mc_luma_loop
 WELS_ASM_ARCH64_FUNC_END
 
 
@@ -585,7 +585,7 @@
 
     sub x4, x4, #1
     st1 {v20.8b}, [x2], x3 //write 8Byte
-	cbnz x4, w8_xy_30_mc_luma_loop
+    cbnz x4, w8_xy_30_mc_luma_loop
 WELS_ASM_ARCH64_FUNC_END
 
 WELS_ASM_ARCH64_FUNC_BEGIN McHorVer30WidthEq4_AArch64_neon
@@ -620,7 +620,7 @@
     st1 {v20.s}[0], [x2], x3 //write 4Byte
     st1 {v20.s}[1], [x2], x3 //write 4Byte
     sub x4, x4, #1
-	cbnz x4, w4_xy_30_mc_luma_loop
+    cbnz x4, w4_xy_30_mc_luma_loop
 WELS_ASM_ARCH64_FUNC_END
 
 
@@ -703,7 +703,7 @@
     mov.16b v4, v6
     mov.16b v6, v7
     sub x4, x4, #8
-	cbnz x4, w16_xy_01_mc_luma_loop
+    cbnz x4, w16_xy_01_mc_luma_loop
 WELS_ASM_ARCH64_FUNC_END
 
 
@@ -753,7 +753,7 @@
     mov.16b v6, v4
     mov.16b v4, v7
     sub x4, x4, #4
-	cbnz x4, w8_xy_01_mc_luma_loop
+    cbnz x4, w8_xy_01_mc_luma_loop
 WELS_ASM_ARCH64_FUNC_END
 
 
@@ -809,7 +809,7 @@
     mov.8b v5, v21
 
     sub x4, x4, #4
-	cbnz x4, w4_xy_01_mc_luma_loop
+    cbnz x4, w4_xy_01_mc_luma_loop
 WELS_ASM_ARCH64_FUNC_END
 
 
@@ -892,7 +892,7 @@
     mov.16b v4, v6
     mov.16b v6, v7
     sub x4, x4, #8
-	cbnz x4, w16_xy_03_mc_luma_loop
+    cbnz x4, w16_xy_03_mc_luma_loop
 WELS_ASM_ARCH64_FUNC_END
 
 
@@ -942,7 +942,7 @@
     mov.16b v6, v4
     mov.16b v4, v7
     sub x4, x4, #4
-	cbnz x4, w8_xy_03_mc_luma_loop
+    cbnz x4, w8_xy_03_mc_luma_loop
 WELS_ASM_ARCH64_FUNC_END
 
 
@@ -998,7 +998,7 @@
     mov.8b v5, v21
 
     sub x4, x4, #4
-	cbnz x4, w4_xy_03_mc_luma_loop
+    cbnz x4, w4_xy_03_mc_luma_loop
 WELS_ASM_ARCH64_FUNC_END
 
 
@@ -1081,7 +1081,7 @@
     mov.16b v4, v6
     mov.16b v6, v7
     sub x4, x4, #8
-	cbnz x4, w16_xy_02_mc_luma_loop
+    cbnz x4, w16_xy_02_mc_luma_loop
 WELS_ASM_ARCH64_FUNC_END
 
 
@@ -1131,7 +1131,7 @@
     mov.16b v6, v4
     mov.16b v4, v7
     sub x4, x4, #4
-	cbnz x4, w8_xy_02_mc_luma_loop
+    cbnz x4, w8_xy_02_mc_luma_loop
 WELS_ASM_ARCH64_FUNC_END
 
 
@@ -1187,7 +1187,7 @@
     mov.8b v5, v21
 
     sub x4, x4, #4
-	cbnz x4, w4_xy_02_mc_luma_loop
+    cbnz x4, w4_xy_02_mc_luma_loop
 WELS_ASM_ARCH64_FUNC_END
 
 
@@ -1220,12 +1220,12 @@
     FILTER_6TAG_8BITS_TO_16BITS1 v2, v5, v8, v11, v14, v17, v20, v0, v1
     FILTER_6TAG_8BITS_TO_16BITS1 v3, v6, v9, v12, v15, v18, v21, v0, v1
     // horizon filtered
-	UNPACK_2_16BITS_TO_ABC	v20, v21, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26	//output to v26[0]
+    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
     // vertical filtered into v21/v22
     FILTER_6TAG_8BITS_TO_16BITS1 v4, v7, v10, v13, v16, v19, v22, v0, v1
-	UNPACK_2_16BITS_TO_ABC	v21, v22, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26	//output to v26[1]
+    UNPACK_2_16BITS_TO_ABC  v21, v22, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
     st1 {v26.16b}, [x2], x3 //write 16Byte : 0 line
 
     //prfm pldl1strm, [x0, x1]
@@ -1234,12 +1234,12 @@
     FILTER_6TAG_8BITS_TO_16BITS1 v5, v8, v11, v14, v17, v2, v20, v0, v1
     FILTER_6TAG_8BITS_TO_16BITS1 v6, v9, v12, v15, v18, v3, v21, v0, v1
     // horizon filtered
-	UNPACK_2_16BITS_TO_ABC	v20, v21, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26	//output to v26[0]
+    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
     // vertical filtered into v21/v22
     FILTER_6TAG_8BITS_TO_16BITS1 v7, v10, v13, v16, v19, v4, v22, v0, v1
-	UNPACK_2_16BITS_TO_ABC	v21, v22, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26	//output to v26[1]
+    UNPACK_2_16BITS_TO_ABC  v21, v22, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
     st1 {v26.16b}, [x2], x3 //write 16Byte : 1 line
 
     //prfm pldl1strm, [x0, x1]
@@ -1248,12 +1248,12 @@
     FILTER_6TAG_8BITS_TO_16BITS1 v8, v11, v14, v17, v2, v5, v20, v0, v1
     FILTER_6TAG_8BITS_TO_16BITS1 v9, v12, v15, v18, v3, v6, v21, v0, v1
     // horizon filtered
-	UNPACK_2_16BITS_TO_ABC	v20, v21, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26	//output to v26[0]
+    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
     // vertical filtered into v21/v22
     FILTER_6TAG_8BITS_TO_16BITS1 v10, v13, v16, v19, v4, v7, v22, v0, v1
-	UNPACK_2_16BITS_TO_ABC	v21, v22, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26	//output to v26[1]
+    UNPACK_2_16BITS_TO_ABC  v21, v22, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
     st1 {v26.16b}, [x2], x3 //write 16Byte : 2 line
 
     //prfm pldl1strm, [x0, x1]
@@ -1262,12 +1262,12 @@
     FILTER_6TAG_8BITS_TO_16BITS1 v11, v14, v17, v2, v5, v8, v20, v0, v1
     FILTER_6TAG_8BITS_TO_16BITS1 v12, v15, v18, v3, v6, v9, v21, v0, v1
     // horizon filtered
-	UNPACK_2_16BITS_TO_ABC	v20, v21, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26	//output to v26[0]
+    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
     // vertical filtered into v21/v22
     FILTER_6TAG_8BITS_TO_16BITS1 v13, v16, v19, v4, v7, v10, v22, v0, v1
-	UNPACK_2_16BITS_TO_ABC	v21, v22, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26	//output to v26[1]
+    UNPACK_2_16BITS_TO_ABC  v21, v22, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
     st1 {v26.16b}, [x2], x3 //write 16Byte : 3 line
 
     //prfm pldl1strm, [x0, x1]
@@ -1276,12 +1276,12 @@
     FILTER_6TAG_8BITS_TO_16BITS1 v14, v17, v2, v5, v8, v11, v20, v0, v1
     FILTER_6TAG_8BITS_TO_16BITS1 v15, v18, v3, v6, v9, v12, v21, v0, v1
     // horizon filtered
-	UNPACK_2_16BITS_TO_ABC	v20, v21, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26	//output to v26[0]
+    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
     // vertical filtered into v21/v22
     FILTER_6TAG_8BITS_TO_16BITS1 v16, v19, v4, v7, v10, v13, v22, v0, v1
-	UNPACK_2_16BITS_TO_ABC	v21, v22, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26	//output to v26[1]
+    UNPACK_2_16BITS_TO_ABC  v21, v22, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
     st1 {v26.16b}, [x2], x3 //write 16Byte : 4 line
 
     //prfm pldl1strm, [x0, x1]
@@ -1290,12 +1290,12 @@
     FILTER_6TAG_8BITS_TO_16BITS1 v17, v2, v5, v8, v11, v14, v20, v0, v1
     FILTER_6TAG_8BITS_TO_16BITS1 v18, v3, v6, v9, v12, v15, v21, v0, v1
     // horizon filtered
-	UNPACK_2_16BITS_TO_ABC	v20, v21, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26	//output to v26[0]
+    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
     // vertical filtered into v21/v22
     FILTER_6TAG_8BITS_TO_16BITS1 v19, v4, v7, v10, v13, v16, v22, v0, v1
-	UNPACK_2_16BITS_TO_ABC	v21, v22, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26	//output to v26[1]
+    UNPACK_2_16BITS_TO_ABC  v21, v22, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
     st1 {v26.16b}, [x2], x3 //write 16Byte : 5 line
 
     //prfm pldl1strm, [x0, x1]
@@ -1304,12 +1304,12 @@
     FILTER_6TAG_8BITS_TO_16BITS1 v2, v5, v8, v11, v14, v17, v20, v0, v1
     FILTER_6TAG_8BITS_TO_16BITS1 v3, v6, v9, v12, v15, v18, v21, v0, v1
     // horizon filtered
-	UNPACK_2_16BITS_TO_ABC	v20, v21, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26	//output to v26[0]
+    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
     // vertical filtered into v21/v22
     FILTER_6TAG_8BITS_TO_16BITS1 v4, v7, v10, v13, v16, v19, v22, v0, v1
-	UNPACK_2_16BITS_TO_ABC	v21, v22, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26	//output to v26[1]
+    UNPACK_2_16BITS_TO_ABC  v21, v22, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
     st1 {v26.16b}, [x2], x3 //write 16Byte : 6 line
 
     //prfm pldl1strm, [x0, x1]
@@ -1318,12 +1318,12 @@
     FILTER_6TAG_8BITS_TO_16BITS1 v5, v8, v11, v14, v17, v2, v20, v0, v1
     FILTER_6TAG_8BITS_TO_16BITS1 v6, v9, v12, v15, v18, v3, v21, v0, v1
     // horizon filtered
-	UNPACK_2_16BITS_TO_ABC	v20, v21, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26	//output to v26[0]
+    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
     // vertical filtered into v21/v22
     FILTER_6TAG_8BITS_TO_16BITS1 v7, v10, v13, v16, v19, v4, v22, v0, v1
-	UNPACK_2_16BITS_TO_ABC	v21, v22, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26	//output to v26[1]
+    UNPACK_2_16BITS_TO_ABC  v21, v22, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
     st1 {v26.16b}, [x2], x3 //write 16Byte : 7 line
 
     mov.16b v5, v11
@@ -1348,7 +1348,7 @@
     mov.16b v16, v30
 
     sub x4, x4, #8
-	cbnz x4, w16_hv_mc_luma_loop
+    cbnz x4, w16_hv_mc_luma_loop
 
     ldp d14, d15, [sp], #16
     ldp d12, d13, [sp], #16
@@ -1381,8 +1381,8 @@
     FILTER_6TAG_8BITS_TO_16BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1
     FILTER_6TAG_8BITS_TO_16BITS2 v2, v3, v4, v5, v6, v7, v21, v0, v1
     // horizon filtered
-	UNPACK_2_16BITS_TO_ABC	v20, v21, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26	//output to v26[0]
+    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
     st1 {v26.8b}, [x2], x3 //write 8Byte : 0 line
 
     //prfm pldl1strm, [x0, x1]
@@ -1391,8 +1391,8 @@
     FILTER_6TAG_8BITS_TO_16BITS1 v3, v4, v5, v6, v7, v2, v20, v0, v1
     FILTER_6TAG_8BITS_TO_16BITS2 v3, v4, v5, v6, v7, v2, v21, v0, v1
     // horizon filtered
-	UNPACK_2_16BITS_TO_ABC	v20, v21, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26	//output to v26[0]
+    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
     st1 {v26.8b}, [x2], x3 //write 8Byte : 1 line
 
     //prfm pldl1strm, [x0, x1]
@@ -1401,8 +1401,8 @@
     FILTER_6TAG_8BITS_TO_16BITS1 v4, v5, v6, v7, v2, v3, v20, v0, v1
     FILTER_6TAG_8BITS_TO_16BITS2 v4, v5, v6, v7, v2, v3, v21, v0, v1
     // horizon filtered
-	UNPACK_2_16BITS_TO_ABC	v20, v21, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26	//output to v26[0]
+    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
     st1 {v26.8b}, [x2], x3 //write 8Byte : 2 line
 
     //prfm pldl1strm, [x0, x1]
@@ -1411,8 +1411,8 @@
     FILTER_6TAG_8BITS_TO_16BITS1 v5, v6, v7, v2, v3, v4, v20, v0, v1
     FILTER_6TAG_8BITS_TO_16BITS2 v5, v6, v7, v2, v3, v4, v21, v0, v1
     // horizon filtered
-	UNPACK_2_16BITS_TO_ABC	v20, v21, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26	//output to v26[0]
+    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
     st1 {v26.8b}, [x2], x3 //write 8Byte : 3 line
 
 
@@ -1424,7 +1424,7 @@
     mov.16b v4, v30
 
     sub x4, x4, #4
-	cbnz x4, w8_hv_mc_luma_loop
+    cbnz x4, w8_hv_mc_luma_loop
 WELS_ASM_ARCH64_FUNC_END
 
 
@@ -1458,12 +1458,12 @@
     FILTER_6TAG_8BITS_TO_16BITS1 v3, v4, v5, v6, v7, v2, v22, v0, v1
     FILTER_6TAG_8BITS_TO_16BITS2 v3, v4, v5, v6, v7, v2, v23, v0, v1
     // horizon filtered
-	UNPACK_2_16BITS_TO_ABC	v20, v21, v24, v25, v26
-    UNPACK_2_16BITS_TO_ABC	v22, v23, v28, v29, v30
+    UNPACK_2_16BITS_TO_ABC  v20, v21, v24, v25, v26
+    UNPACK_2_16BITS_TO_ABC  v22, v23, v28, v29, v30
     zip1 v24.2d, v24.2d, v28.2d
     zip1 v25.2d, v25.2d, v29.2d
     zip1 v26.2d, v26.2d, v30.2d
-	FILTER_3_IN_16BITS_TO_8BITS1 v24, v25, v26, v27	//output to v27[0]
+    FILTER_3_IN_16BITS_TO_8BITS1 v24, v25, v26, v27 //output to v27[0]
     st1 {v27.s}[0], [x2], x3 //write 4Byte : 0 line
     st1 {v27.s}[1], [x2], x3 //write 4Byte : 1 line
 
@@ -1478,12 +1478,12 @@
     FILTER_6TAG_8BITS_TO_16BITS1 v5, v6, v7, v2, v3, v4, v22, v0, v1
     FILTER_6TAG_8BITS_TO_16BITS2 v5, v6, v7, v2, v3, v4, v23, v0, v1
     // horizon filtered
-	UNPACK_2_16BITS_TO_ABC	v20, v21, v24, v25, v26
-    UNPACK_2_16BITS_TO_ABC	v22, v23, v28, v29, v30
+    UNPACK_2_16BITS_TO_ABC  v20, v21, v24, v25, v26
+    UNPACK_2_16BITS_TO_ABC  v22, v23, v28, v29, v30
     zip1 v24.2d, v24.2d, v28.2d
     zip1 v25.2d, v25.2d, v29.2d
     zip1 v26.2d, v26.2d, v30.2d
-	FILTER_3_IN_16BITS_TO_8BITS1 v24, v25, v26, v27	//output to v27[0]
+    FILTER_3_IN_16BITS_TO_8BITS1 v24, v25, v26, v27 //output to v27[0]
     st1 {v27.s}[0], [x2], x3 //write 4Byte : 2 line
     st1 {v27.s}[1], [x2], x3 //write 4Byte : 3 line
 
@@ -1495,7 +1495,7 @@
     mov.16b v4, v30
 
     sub x4, x4, #4
-	cbnz x4, w4_hv_mc_luma_loop
+    cbnz x4, w4_hv_mc_luma_loop
 WELS_ASM_ARCH64_FUNC_END
 
 WELS_ASM_ARCH64_FUNC_BEGIN McCopyWidthEq16_AArch64_neon
@@ -1509,7 +1509,7 @@
     st1 {v1.16b}, [x2], x3 //write 16Byte : 1 line
 
     sub x4, x4, #2
-	cbnz x4, w16_copy_loop
+    cbnz x4, w16_copy_loop
 WELS_ASM_ARCH64_FUNC_END
 
 WELS_ASM_ARCH64_FUNC_BEGIN McCopyWidthEq8_AArch64_neon
@@ -1523,7 +1523,7 @@
     st1 {v1.8b}, [x2], x3 //write 16Byte : 1 line
 
     sub x4, x4, #2
-	cbnz x4, w8_copy_loop
+    cbnz x4, w8_copy_loop
 WELS_ASM_ARCH64_FUNC_END
 
 WELS_ASM_ARCH64_FUNC_BEGIN McCopyWidthEq4_AArch64_neon
@@ -1537,7 +1537,7 @@
     st1 {v1.s}[0], [x2], x3 //write 16Byte : 1 line
 
     sub x4, x4, #2
-	cbnz x4, w4_copy_loop
+    cbnz x4, w4_copy_loop
 WELS_ASM_ARCH64_FUNC_END
 
 WELS_ASM_ARCH64_FUNC_BEGIN PixStrideAvgWidthEq16_AArch64_neon
@@ -1570,7 +1570,7 @@
     st1 {v16.16b}, [x0], x1 //write 16Byte : 3 line
 
     sub x6, x6, #4
-	cbnz x6, enc_w16_pix_avg_loop
+    cbnz x6, enc_w16_pix_avg_loop
 WELS_ASM_ARCH64_FUNC_END
 
 WELS_ASM_ARCH64_FUNC_BEGIN PixStrideAvgWidthEq8_AArch64_neon
@@ -1607,7 +1607,7 @@
     st1 {v16.8b}, [x0], x1 //write 8Byte : 3 line
 
     sub x6, x6, #4
-	cbnz x6, enc_w8_pix_avg_loop
+    cbnz x6, enc_w8_pix_avg_loop
 WELS_ASM_ARCH64_FUNC_END
 
 WELS_ASM_ARCH64_FUNC_BEGIN PixelAvgWidthEq16_AArch64_neon
@@ -1649,7 +1649,7 @@
     st1 {v16.16b}, [x0], x1 //write 16Byte : 3 line
 
     sub x6, x6, #4
-	cbnz x6, w16_pix_avg_loop
+    cbnz x6, w16_pix_avg_loop
 WELS_ASM_ARCH64_FUNC_END
 
 WELS_ASM_ARCH64_FUNC_BEGIN PixelAvgWidthEq8_AArch64_neon
@@ -1686,7 +1686,7 @@
     st1 {v16.8b}, [x0], x1 //write 8Byte : 3 line
 
     sub x6, x6, #4
-	cbnz x6, w8_pix_avg_loop
+    cbnz x6, w8_pix_avg_loop
 WELS_ASM_ARCH64_FUNC_END
 
 
@@ -1707,7 +1707,7 @@
     st1 {v2.s}[1], [x0], x1 //write 4Byte : 1 line
 
     sub x6, x6, #2
-	cbnz x6, w4_pix_avg_loop
+    cbnz x6, w4_pix_avg_loop
 WELS_ASM_ARCH64_FUNC_END
 
 WELS_ASM_ARCH64_FUNC_BEGIN McChromaWidthEq8_AArch64_neon
@@ -1738,7 +1738,7 @@
     mov.16b v0, v18
     mov.16b v1, v19
     sub x5, x5, #2
-	cbnz x5, w8_mc_chroma_loop
+    cbnz x5, w8_mc_chroma_loop
 WELS_ASM_ARCH64_FUNC_END
 
 WELS_ASM_ARCH64_FUNC_BEGIN McChromaWidthEq4_AArch64_neon
@@ -1767,7 +1767,7 @@
     mov.8b v0, v18
     mov.8b v1, v19
     sub x5, x5, #2
-	cbnz x5, w4_mc_chroma_loop
+    cbnz x5, w4_mc_chroma_loop
 WELS_ASM_ARCH64_FUNC_END
 
 
@@ -1793,11 +1793,11 @@
     st1 {v20.16b}, [x2], x5 //write 16Byte
 
     ext.8b v21, v4, v4, #7 // [0][1][2][3][4][5]XY-->O[0][1][2][3][4][5]X
-	FILTER_SINGLE_TAG_8BITS	v21, v22, v23, h21
-	st1 {v21.b}[0], [x2], x3 //write 16th Byte
+    FILTER_SINGLE_TAG_8BITS v21, v22, v23, h21
+    st1 {v21.b}[0], [x2], x3 //write 16th Byte
 
     sub x4, x4, #1
-	cbnz x4, w17_h_mc_luma_loop
+    cbnz x4, w17_h_mc_luma_loop
 WELS_ASM_ARCH64_FUNC_END
 
 WELS_ASM_ARCH64_FUNC_BEGIN McHorVer20Width9_AArch64_neon
@@ -1821,11 +1821,11 @@
     st1 {v20.8b}, [x2], x5 //write 8Byte
 
     ext.8b v21, v3, v3, #7 // [0][1][2][3][4][5]XY-->O[0][1][2][3][4][5]X
-	FILTER_SINGLE_TAG_8BITS	v21, v22, v23, h21
-	st1 {v21.b}[0], [x2], x3 //write 9th Byte
+    FILTER_SINGLE_TAG_8BITS v21, v22, v23, h21
+    st1 {v21.b}[0], [x2], x3 //write 9th Byte
 
     sub x4, x4, #1
-	cbnz x4, w9_h_mc_luma_loop
+    cbnz x4, w9_h_mc_luma_loop
 WELS_ASM_ARCH64_FUNC_END
 
 
@@ -1863,12 +1863,12 @@
     FILTER_6TAG_8BITS_TO_16BITS1 v2, v5, v8, v11, v14, v17, v20, v0, v1
     FILTER_6TAG_8BITS_TO_16BITS1 v3, v6, v9, v12, v15, v18, v21, v0, v1
     // horizon filtered
-	UNPACK_2_16BITS_TO_ABC	v20, v21, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26	//output to v26[0]
+    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
     // vertical filtered into v21/v22
     FILTER_6TAG_8BITS_TO_16BITS1 v4, v7, v10, v13, v16, v19, v22, v0, v1
-	UNPACK_2_16BITS_TO_ABC	v21, v22, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26	//output to v26[1]
+    UNPACK_2_16BITS_TO_ABC  v21, v22, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
     st1 {v26.16b}, [x2], x5 //write 0:15 Byte : 0 line
     UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
     st1 {v26.b}[0], [x2], x3 //write 16th Byte : 0 line
@@ -1879,12 +1879,12 @@
     FILTER_6TAG_8BITS_TO_16BITS1 v5, v8, v11, v14, v17, v2, v20, v0, v1
     FILTER_6TAG_8BITS_TO_16BITS1 v6, v9, v12, v15, v18, v3, v21, v0, v1
     // horizon filtered
-	UNPACK_2_16BITS_TO_ABC	v20, v21, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26	//output to v26[0]
+    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
     // vertical filtered into v21/v22
     FILTER_6TAG_8BITS_TO_16BITS1 v7, v10, v13, v16, v19, v4, v22, v0, v1
-	UNPACK_2_16BITS_TO_ABC	v21, v22, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26	//output to v26[1]
+    UNPACK_2_16BITS_TO_ABC  v21, v22, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
     st1 {v26.16b}, [x2], x5 //write 0:15Byte : 1 line
     UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
     st1 {v26.b}[0], [x2], x3 //write 16th Byte : 1 line
@@ -1895,12 +1895,12 @@
     FILTER_6TAG_8BITS_TO_16BITS1 v8, v11, v14, v17, v2, v5, v20, v0, v1
     FILTER_6TAG_8BITS_TO_16BITS1 v9, v12, v15, v18, v3, v6, v21, v0, v1
     // horizon filtered
-	UNPACK_2_16BITS_TO_ABC	v20, v21, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26	//output to v26[0]
+    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
     // vertical filtered into v21/v22
     FILTER_6TAG_8BITS_TO_16BITS1 v10, v13, v16, v19, v4, v7, v22, v0, v1
-	UNPACK_2_16BITS_TO_ABC	v21, v22, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26	//output to v26[1]
+    UNPACK_2_16BITS_TO_ABC  v21, v22, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
     st1 {v26.16b}, [x2], x5 //write 0:15Byte : 2 line
     UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
     st1 {v26.b}[0], [x2], x3 //write 16th Byte : 2 line
@@ -1911,12 +1911,12 @@
     FILTER_6TAG_8BITS_TO_16BITS1 v11, v14, v17, v2, v5, v8, v20, v0, v1
     FILTER_6TAG_8BITS_TO_16BITS1 v12, v15, v18, v3, v6, v9, v21, v0, v1
     // horizon filtered
-	UNPACK_2_16BITS_TO_ABC	v20, v21, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26	//output to v26[0]
+    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
     // vertical filtered into v21/v22
     FILTER_6TAG_8BITS_TO_16BITS1 v13, v16, v19, v4, v7, v10, v22, v0, v1
-	UNPACK_2_16BITS_TO_ABC	v21, v22, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26	//output to v26[1]
+    UNPACK_2_16BITS_TO_ABC  v21, v22, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
     st1 {v26.16b}, [x2], x5 //write 0:15Byte : 3 line
     UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
     st1 {v26.b}[0], [x2], x3 //write 16th Byte : 3 line
@@ -1927,12 +1927,12 @@
     FILTER_6TAG_8BITS_TO_16BITS1 v14, v17, v2, v5, v8, v11, v20, v0, v1
     FILTER_6TAG_8BITS_TO_16BITS1 v15, v18, v3, v6, v9, v12, v21, v0, v1
     // horizon filtered
-	UNPACK_2_16BITS_TO_ABC	v20, v21, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26	//output to v26[0]
+    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
     // vertical filtered into v21/v22
     FILTER_6TAG_8BITS_TO_16BITS1 v16, v19, v4, v7, v10, v13, v22, v0, v1
-	UNPACK_2_16BITS_TO_ABC	v21, v22, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26	//output to v26[1]
+    UNPACK_2_16BITS_TO_ABC  v21, v22, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
     st1 {v26.16b}, [x2], x5 //write 0:15Byte : 4 line
     UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
     st1 {v26.b}[0], [x2], x3 //write 16th Byte : 4 line
@@ -1943,12 +1943,12 @@
     FILTER_6TAG_8BITS_TO_16BITS1 v17, v2, v5, v8, v11, v14, v20, v0, v1
     FILTER_6TAG_8BITS_TO_16BITS1 v18, v3, v6, v9, v12, v15, v21, v0, v1
     // horizon filtered
-	UNPACK_2_16BITS_TO_ABC	v20, v21, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26	//output to v26[0]
+    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
     // vertical filtered into v21/v22
     FILTER_6TAG_8BITS_TO_16BITS1 v19, v4, v7, v10, v13, v16, v22, v0, v1
-	UNPACK_2_16BITS_TO_ABC	v21, v22, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26	//output to v26[1]
+    UNPACK_2_16BITS_TO_ABC  v21, v22, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
     st1 {v26.16b}, [x2], x5 //write 0:15Byte : 5 line
     UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
     st1 {v26.b}[0], [x2], x3 //write 16th Byte : 5 line
@@ -1959,12 +1959,12 @@
     FILTER_6TAG_8BITS_TO_16BITS1 v2, v5, v8, v11, v14, v17, v20, v0, v1
     FILTER_6TAG_8BITS_TO_16BITS1 v3, v6, v9, v12, v15, v18, v21, v0, v1
     // horizon filtered
-	UNPACK_2_16BITS_TO_ABC	v20, v21, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26	//output to v26[0]
+    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
     // vertical filtered into v21/v22
     FILTER_6TAG_8BITS_TO_16BITS1 v4, v7, v10, v13, v16, v19, v22, v0, v1
-	UNPACK_2_16BITS_TO_ABC	v21, v22, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26	//output to v26[1]
+    UNPACK_2_16BITS_TO_ABC  v21, v22, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
     st1 {v26.16b}, [x2], x5 //write 0:15Byte : 6 line
     UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
     st1 {v26.b}[0], [x2], x3 //write 16th Byte : 6 line
@@ -1975,12 +1975,12 @@
     FILTER_6TAG_8BITS_TO_16BITS1 v5, v8, v11, v14, v17, v2, v20, v0, v1
     FILTER_6TAG_8BITS_TO_16BITS1 v6, v9, v12, v15, v18, v3, v21, v0, v1
     // horizon filtered
-	UNPACK_2_16BITS_TO_ABC	v20, v21, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26	//output to v26[0]
+    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
     // vertical filtered into v21/v22
     FILTER_6TAG_8BITS_TO_16BITS1 v7, v10, v13, v16, v19, v4, v22, v0, v1
-	UNPACK_2_16BITS_TO_ABC	v21, v22, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26	//output to v26[1]
+    UNPACK_2_16BITS_TO_ABC  v21, v22, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
     st1 {v26.16b}, [x2], x5 //write 0:15Byte : 7 line
     UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
     st1 {v26.b}[0], [x2], x3 //write 16th Byte : 7 line
@@ -2007,7 +2007,7 @@
     mov.16b v16, v30
 
     sub x4, x4, #8
-	cbnz x4, w17_hv_mc_luma_loop
+    cbnz x4, w17_hv_mc_luma_loop
 
     //prfm pldl1strm, [x0, x1]
     ld1 {v17.8b, v18.8b, v19.8b}, [x0], x1 // v17=src[3*stride]
@@ -2015,12 +2015,12 @@
     FILTER_6TAG_8BITS_TO_16BITS1 v2, v5, v8, v11, v14, v17, v20, v0, v1
     FILTER_6TAG_8BITS_TO_16BITS1 v3, v6, v9, v12, v15, v18, v21, v0, v1
     // horizon filtered
-	UNPACK_2_16BITS_TO_ABC	v20, v21, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26	//output to v26[0]
+    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
     // vertical filtered into v21/v22
     FILTER_6TAG_8BITS_TO_16BITS1 v4, v7, v10, v13, v16, v19, v22, v0, v1
-	UNPACK_2_16BITS_TO_ABC	v21, v22, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26	//output to v26[1]
+    UNPACK_2_16BITS_TO_ABC  v21, v22, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
     st1 {v26.16b}, [x2], x5 //write 0:15 Byte : 0 line
     UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
     st1 {v26.b}[0], [x2], x3 //write 16th Byte : 0 line
@@ -2061,8 +2061,8 @@
     FILTER_6TAG_8BITS_TO_16BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1
     FILTER_6TAG_8BITS_TO_16BITS2 v2, v3, v4, v5, v6, v7, v21, v0, v1
     // horizon filtered
-	UNPACK_2_16BITS_TO_ABC	v20, v21, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26	//output to v26[0]
+    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
     st1 {v26.8b}, [x2], x5 //write 0:7Byte : 0 line
     UNPACK_FILTER_SINGLE_TAG_16BITS v26, v21, v29, v27, v28, d26
     st1 {v26.b}[0], [x2], x3 //write 8th Byte : 0 line
@@ -2073,8 +2073,8 @@
     FILTER_6TAG_8BITS_TO_16BITS1 v3, v4, v5, v6, v7, v2, v20, v0, v1
     FILTER_6TAG_8BITS_TO_16BITS2 v3, v4, v5, v6, v7, v2, v21, v0, v1
     // horizon filtered
-	UNPACK_2_16BITS_TO_ABC	v20, v21, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26	//output to v26[0]
+    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
     st1 {v26.8b}, [x2], x5 //write 0:7Byte : 1 line
     UNPACK_FILTER_SINGLE_TAG_16BITS v26, v21, v29, v27, v28, d26
     st1 {v26.b}[0], [x2], x3 //write 8th Byte : 1 line
@@ -2085,8 +2085,8 @@
     FILTER_6TAG_8BITS_TO_16BITS1 v4, v5, v6, v7, v2, v3, v20, v0, v1
     FILTER_6TAG_8BITS_TO_16BITS2 v4, v5, v6, v7, v2, v3, v21, v0, v1
     // horizon filtered
-	UNPACK_2_16BITS_TO_ABC	v20, v21, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26	//output to v26[0]
+    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
     st1 {v26.8b}, [x2], x5 //write 0:7Byte : 2 line
     UNPACK_FILTER_SINGLE_TAG_16BITS v26, v21, v29, v27, v28, d26
     st1 {v26.b}[0], [x2], x3 //write 8th Byte : 2 line
@@ -2097,8 +2097,8 @@
     FILTER_6TAG_8BITS_TO_16BITS1 v5, v6, v7, v2, v3, v4, v20, v0, v1
     FILTER_6TAG_8BITS_TO_16BITS2 v5, v6, v7, v2, v3, v4, v21, v0, v1
     // horizon filtered
-	UNPACK_2_16BITS_TO_ABC	v20, v21, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26	//output to v26[0]
+    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
     st1 {v26.8b}, [x2], x5 //write 0:7Byte : 3 line
     UNPACK_FILTER_SINGLE_TAG_16BITS v26, v21, v29, v27, v28, d26
     st1 {v26.b}[0], [x2], x3 //write 8th Byte : 3 line
@@ -2112,7 +2112,7 @@
     mov.16b v4, v30
 
     sub x4, x4, #4
-	cbnz x4, w9_hv_mc_luma_loop
+    cbnz x4, w9_hv_mc_luma_loop
 
     //prfm pldl1strm, [x0, x1]
     ld1 {v7.16b}, [x0], x1 // v7=src[3*stride]
@@ -2120,8 +2120,8 @@
     FILTER_6TAG_8BITS_TO_16BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1
     FILTER_6TAG_8BITS_TO_16BITS2 v2, v3, v4, v5, v6, v7, v21, v0, v1
     // horizon filtered
-	UNPACK_2_16BITS_TO_ABC	v20, v21, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26	//output to v26[0]
+    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
     st1 {v26.8b}, [x2], x5 //write 0:7Byte : 0 line
     UNPACK_FILTER_SINGLE_TAG_16BITS v26, v21, v29, v27, v28, d26
     st1 {v26.b}[0], [x2], x3 //write 8th Byte : 0 line
@@ -2207,7 +2207,7 @@
     mov.16b v4, v6
     mov.16b v6, v7
     sub x4, x4, #8
-	cbnz x4, w17_v_mc_luma_loop
+    cbnz x4, w17_v_mc_luma_loop
 
     //prfm pldl1strm, [x0, x1]
     ld1 {v7.16b}, [x0], x1 // v7=src[3*stride]
@@ -2262,7 +2262,7 @@
     mov.16b v6, v4
     mov.16b v4, v7
     sub x4, x4, #4
-	cbnz x4, w9_v_mc_luma_loop
+    cbnz x4, w9_v_mc_luma_loop
 
     //prfm pldl1strm, [x0, x1]
     ld1 {v7.8b}, [x0], x1 // v7=src[3*stride]
--- a/codec/common/x86/asm_inc.asm
+++ b/codec/common/x86/asm_inc.asm
@@ -44,15 +44,15 @@
 ;***********************************************************************
 
 %if 1
-	%define MOVDQ movdqa
+    %define MOVDQ movdqa
 %else
-	%define MOVDQ movdqu
+    %define MOVDQ movdqu
 %endif
 
 %if 1
-	%define WELSEMMS	emms
+    %define WELSEMMS emms
 %else
-	%define WELSEMMS
+    %define WELSEMMS
 %endif
 
 
@@ -220,7 +220,7 @@
 
 %macro LOAD_1_PARA 0
     %ifdef X86_32
-	mov r0, [esp + push_num*4 + 4]
+        mov r0, [esp + push_num*4 + 4]
     %endif
 %endmacro
 
@@ -234,8 +234,8 @@
 %macro LOAD_3_PARA 0
     %ifdef X86_32
         mov r0, [esp + push_num*4 + 4]
-	mov r1, [esp + push_num*4 + 8]
-	mov r2, [esp + push_num*4 + 12]
+        mov r1, [esp + push_num*4 + 8]
+        mov r2, [esp + push_num*4 + 12]
     %endif
 %endmacro
 
@@ -267,7 +267,7 @@
 
 %macro LOAD_6_PARA 0
     %ifdef X86_32
-	push r3
+        push r3
         push r4
         push r5
         %assign  push_num push_num+3
@@ -310,7 +310,7 @@
 
 %macro LOAD_4_PARA_POP 0
     %ifdef X86_32
-	pop r3
+        pop r3
     %endif
 %endmacro
 
@@ -317,7 +317,7 @@
 %macro LOAD_5_PARA_POP 0
     %ifdef X86_32
         pop r4
-	pop r3
+        pop r3
     %endif
 %endmacro
 
@@ -324,8 +324,8 @@
 %macro LOAD_6_PARA_POP 0
     %ifdef X86_32
         pop r5
-  	pop r4
- 	pop r3
+        pop r4
+        pop r3
     %endif
 %endmacro
 
@@ -416,13 +416,13 @@
 
 %macro SIGN_EXTENSION 2
     %ifndef X86_32
-            movsxd %1, %2
+        movsxd %1, %2
     %endif
 %endmacro
 
 %macro SIGN_EXTENSIONW 2
     %ifndef X86_32
-            movsx %1, %2
+        movsx %1, %2
     %endif
 %endmacro
 
@@ -438,13 +438,13 @@
 %endmacro
 
 %macro WELS_AbsW 2
-	pxor        %2, %2
+    pxor        %2, %2
     psubw       %2, %1
     pmaxsw      %1, %2
 %endmacro
 
 %macro MMX_XSwap  4
-    movq		%4, %2
+    movq        %4, %2
     punpckh%1   %4, %3
     punpckl%1   %2, %3
 %endmacro
@@ -485,35 +485,35 @@
 ;in:  m1, m2, m3, m4, m5, m6, m7, m8
 ;pOut: m5, m3, m4, m8, m6, m2, m7, m1
 %macro SSE2_TransTwo8x8B 9
-	movdqa	%9,	%8
-	SSE2_XSawp bw,  %1, %2, %8
-	SSE2_XSawp bw,  %3, %4, %2
-	SSE2_XSawp bw,  %5, %6, %4
-	movdqa	%6, %9
-	movdqa	%9, %4
-	SSE2_XSawp bw,  %7, %6, %4
+    movdqa  %9,     %8
+    SSE2_XSawp bw,  %1, %2, %8
+    SSE2_XSawp bw,  %3, %4, %2
+    SSE2_XSawp bw,  %5, %6, %4
+    movdqa  %6, %9
+    movdqa  %9, %4
+    SSE2_XSawp bw,  %7, %6, %4
 
-	SSE2_XSawp wd,  %1, %3, %6
-	SSE2_XSawp wd,  %8, %2, %3
-	SSE2_XSawp wd,  %5, %7, %2
-	movdqa	%7, %9
-	movdqa	%9, %3
-	SSE2_XSawp wd,  %7, %4, %3
+    SSE2_XSawp wd,  %1, %3, %6
+    SSE2_XSawp wd,  %8, %2, %3
+    SSE2_XSawp wd,  %5, %7, %2
+    movdqa  %7, %9
+    movdqa  %9, %3
+    SSE2_XSawp wd,  %7, %4, %3
 
-	SSE2_XSawp dq,  %1, %5, %4
-	SSE2_XSawp dq,  %6, %2, %5
-	SSE2_XSawp dq,  %8, %7, %2
-	movdqa	%7, %9
-	movdqa	%9, %5
-	SSE2_XSawp dq,  %7, %3, %5
+    SSE2_XSawp dq,  %1, %5, %4
+    SSE2_XSawp dq,  %6, %2, %5
+    SSE2_XSawp dq,  %8, %7, %2
+    movdqa  %7, %9
+    movdqa  %9, %5
+    SSE2_XSawp dq,  %7, %3, %5
 
-	SSE2_XSawp qdq,  %1, %8, %3
-	SSE2_XSawp qdq,  %4, %2, %8
-	SSE2_XSawp qdq,  %6, %7, %2
-	movdqa	%7, %9
-	movdqa	%9, %1
-	SSE2_XSawp qdq,  %7, %5, %1
-	movdqa	%5, %9
+    SSE2_XSawp qdq,  %1, %8, %3
+    SSE2_XSawp qdq,  %4, %2, %8
+    SSE2_XSawp qdq,  %6, %7, %2
+    movdqa  %7, %9
+    movdqa  %9, %1
+    SSE2_XSawp qdq,  %7, %5, %1
+    movdqa  %5, %9
 %endmacro
 
 ;xmm0, xmm6, xmm7, [eax], [ecx]
@@ -528,32 +528,32 @@
 
 ; m2 = m1 + m2, m1 = m1 - m2
 %macro SSE2_SumSub 3
-	movdqa  %3, %2
+    movdqa  %3, %2
     paddw   %2, %1
     psubw   %1, %3
 %endmacro
 
 
-%macro butterfly_1to16_sse	3	; xmm? for dst, xmm? for tmp, one byte for pSrc [generic register name: a/b/c/d]
-	mov %3h, %3l
-	movd %1, e%3x		; i.e, 1% = eax (=b0)
-	pshuflw %2, %1, 00h	; ..., b0 b0 b0 b0 b0 b0 b0 b0
-	pshufd %1, %2, 00h	; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0
+%macro butterfly_1to16_sse      3       ; xmm? for dst, xmm? for tmp, one byte for pSrc [generic register name: a/b/c/d]
+    mov %3h, %3l
+    movd %1, e%3x           ; i.e, 1% = eax (=b0)
+    pshuflw %2, %1, 00h     ; ..., b0 b0 b0 b0 b0 b0 b0 b0
+    pshufd %1, %2, 00h      ; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0
 %endmacro
 
 ;copy a dw into a xmm for 8 times
 %macro SSE2_Copy8Times 2
-		movd	%1, %2
-		punpcklwd %1, %1
-		pshufd	%1,	%1,	0
+    movd    %1, %2
+    punpcklwd %1, %1
+    pshufd  %1,     %1,     0
 %endmacro
 
 ;copy a db into a xmm for 16 times
 %macro SSE2_Copy16Times 2
-		movd		%1, %2
-		pshuflw		%1, %1, 0
-		punpcklqdq	%1, %1
-		packuswb	%1,	%1
+    movd            %1, %2
+    pshuflw         %1, %1, 0
+    punpcklqdq      %1, %1
+    packuswb        %1,     %1
 %endmacro
 
 
@@ -564,35 +564,35 @@
 ;dw 32,32,32,32,32,32,32,32 for xmm
 ;dw 32,32,32,32 for mm
 %macro WELS_DW32 1
-	pcmpeqw %1,%1
-	psrlw %1,15
-	psllw %1,5
+    pcmpeqw %1,%1
+    psrlw %1,15
+    psllw %1,5
 %endmacro
 
 ;dw 1, 1, 1, 1, 1, 1, 1, 1 for xmm
 ;dw 1, 1, 1, 1 for mm
 %macro WELS_DW1 1
-	pcmpeqw %1,%1
-	psrlw %1,15
+    pcmpeqw %1,%1
+    psrlw %1,15
 %endmacro
 
 ;all 0 for xmm and mm
 %macro WELS_Zero 1
-	pxor %1, %1
+    pxor %1, %1
 %endmacro
 
 ;dd 1, 1, 1, 1 for xmm
 ;dd 1, 1 for mm
 %macro WELS_DD1 1
-	pcmpeqw %1,%1
-	psrld %1,31
+    pcmpeqw %1,%1
+    psrld %1,31
 %endmacro
 
 ;dB 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
 %macro WELS_DB1 1
-	pcmpeqw %1,%1
-	psrlw %1,15
-	packuswb %1,%1
+    pcmpeqw %1,%1
+    psrlw %1,15
+    packuswb %1,%1
 %endmacro
 
 
--- a/codec/common/x86/cpuid.asm
+++ b/codec/common/x86/cpuid.asm
@@ -29,13 +29,13 @@
 ;*     POSSIBILITY OF SUCH DAMAGE.
 ;*
 ;*
-;*	cpu_mmx.asm
+;*  cpu_mmx.asm
 ;*
 ;*  Abstract
-;*		verify cpuid feature support and cpuid detection
+;*      verify cpuid feature support and cpuid detection
 ;*
 ;*  History
-;*      04/29/2009	Created
+;*      04/29/2009  Created
 ;*
 ;*************************************************************************/
 
@@ -115,13 +115,13 @@
 %elifdef     X86_32
 
 WELS_EXTERN WelsCPUId
-    push	ebx
-    push	edi
+    push    ebx
+    push    edi
 
-    mov     eax, [esp+12]	; operating index
+    mov     eax, [esp+12]   ; operating index
     mov     edi, [esp+24]
     mov     ecx, [edi]
-    cpuid					; cpuid
+    cpuid                   ; cpuid
 
     ; processing various information return
     mov     edi, [esp+16]
@@ -133,7 +133,7 @@
     mov     edi, [esp+28]
     mov     [edi], edx
 
-    pop	    edi
+    pop     edi
     pop     ebx
     ret
 
@@ -145,31 +145,31 @@
 ;****************************************************************************************************
 WELS_EXTERN WelsCPUSupportAVX
 %ifdef     WIN64
-        mov   eax,    ecx
-        mov   ecx,    edx
+    mov   eax,    ecx
+    mov   ecx,    edx
 %elifdef   UNIX64
-        mov eax, edi
-        mov ecx, esi
+    mov eax, edi
+    mov ecx, esi
 %else
-        mov eax, [esp+4]
-        mov ecx, [esp+8]
+    mov eax, [esp+4]
+    mov ecx, [esp+8]
 %endif
 
-        ; refer to detection of AVX addressed in INTEL AVX manual document
-        and ecx, 018000000H
-        cmp ecx, 018000000H             ; check both OSXSAVE and AVX feature flags
-        jne avx_not_supported
-        ; processor supports AVX instructions and XGETBV is enabled by OS
-        mov ecx, 0                              ; specify 0 for XFEATURE_ENABLED_MASK register
-        XGETBV                                  ; result in EDX:EAX
-        and eax, 06H
-        cmp eax, 06H                    ; check OS has enabled both XMM and YMM state support
-        jne avx_not_supported
-        mov eax, 1
-        ret
+    ; refer to detection of AVX addressed in INTEL AVX manual document
+    and ecx, 018000000H
+    cmp ecx, 018000000H             ; check both OSXSAVE and AVX feature flags
+    jne avx_not_supported
+    ; processor supports AVX instructions and XGETBV is enabled by OS
+    mov ecx, 0                              ; specify 0 for XFEATURE_ENABLED_MASK register
+    XGETBV                                  ; result in EDX:EAX
+    and eax, 06H
+    cmp eax, 06H                    ; check OS has enabled both XMM and YMM state support
+    jne avx_not_supported
+    mov eax, 1
+    ret
 avx_not_supported:
-        mov eax, 0
-        ret
+    mov eax, 0
+    ret
 
 
 ; need call after cpuid=1 and eax, ecx flag got then
@@ -178,35 +178,35 @@
 ;****************************************************************************************************
 WELS_EXTERN  WelsCPUSupportFMA
 %ifdef     WIN64
-        mov   eax,   ecx
-        mov   ecx,   edx
+    mov   eax,   ecx
+    mov   ecx,   edx
 %elifdef   UNIX64
-        mov   eax,   edi
-        mov   ecx,   esi
+    mov   eax,   edi
+    mov   ecx,   esi
 %else
-	mov eax, [esp+4]
-	mov ecx, [esp+8]
+    mov eax, [esp+4]
+    mov ecx, [esp+8]
 %endif
-	; refer to detection of FMA addressed in INTEL AVX manual document
-	and ecx, 018001000H
-	cmp ecx, 018001000H		; check OSXSAVE, AVX, FMA feature flags
-	jne fma_not_supported
-	; processor supports AVX,FMA instructions and XGETBV is enabled by OS
-	mov ecx, 0				; specify 0 for XFEATURE_ENABLED_MASK register
-	XGETBV					; result in EDX:EAX
-	and eax, 06H
-	cmp eax, 06H			; check OS has enabled both XMM and YMM state support
-	jne fma_not_supported
-	mov eax, 1
-	ret
+    ; refer to detection of FMA addressed in INTEL AVX manual document
+    and ecx, 018001000H
+    cmp ecx, 018001000H     ; check OSXSAVE, AVX, FMA feature flags
+    jne fma_not_supported
+    ; processor supports AVX,FMA instructions and XGETBV is enabled by OS
+    mov ecx, 0              ; specify 0 for XFEATURE_ENABLED_MASK register
+    XGETBV                  ; result in EDX:EAX
+    and eax, 06H
+    cmp eax, 06H            ; check OS has enabled both XMM and YMM state support
+    jne fma_not_supported
+    mov eax, 1
+    ret
 fma_not_supported:
-	mov eax, 0
-	ret
+    mov eax, 0
+    ret
 
 ;******************************************************************************************
 ;   void WelsEmms()
 ;******************************************************************************************
 WELS_EXTERN WelsEmms
-	emms	; empty mmx technology states
-	ret
+    emms    ; empty mmx technology states
+    ret
 
--- a/codec/common/x86/deblock.asm
+++ b/codec/common/x86/deblock.asm
@@ -57,1032 +57,1032 @@
 
 
 WELS_EXTERN   DeblockLumaLt4V_ssse3
-  push        rbp
-  mov         r11,[rsp + 16 + 20h]  ; pTC
-  PUSH_XMM 16
-  sub         rsp,1B0h
-  lea         rbp,[rsp+20h]
-  movd        xmm4,r8d
-  movd        xmm2,r9d
-  mov         qword [rbp+180h],r12
-  mov         r10,rcx
-  movsxd      r12,edx
-  add         edx,edx
-  movsxd      rdx,edx
-  sub         r10,r12
-  movsx       r8d,byte [r11]
-  pxor        xmm3,xmm3
-  punpcklwd   xmm2,xmm2
-  movaps      [rbp+50h],xmm14
-  lea         rax,[r12+r12*2]
-  movdqa      xmm14,[rdx+rcx]
-  neg         rax
-  pshufd      xmm0,xmm2,0
-  movd        xmm2,r8d
-  movsx       edx,byte [r11+1]
-  movsx       r8d,byte [r11+2]
-  movsx       r11d,byte [r11+3]
-  movaps      [rbp+70h],xmm12
-  movd        xmm1,edx
-  movaps      [rbp+80h],xmm11
-  movd        xmm12,r8d
-  movd        xmm11,r11d
-  movdqa      xmm5, [rax+rcx]
-  lea         rax,[r12+r12]
-  punpcklwd   xmm12,xmm12
-  neg         rax
-  punpcklwd   xmm11,xmm11
-  movaps      [rbp],xmm8
-  movdqa      xmm8, [r10]
-  punpcklwd   xmm2,xmm2
-  punpcklwd   xmm1,xmm1
-  punpcklqdq  xmm12,xmm12
-  punpcklqdq  xmm11,xmm11
-  punpcklqdq  xmm2,xmm2
-  punpcklqdq  xmm1,xmm1
-  shufps      xmm12,xmm11,88h
-  movdqa      xmm11,xmm8
-  movaps      [rbp+30h],xmm9
-  movdqa      xmm9,[rcx]
-  shufps      xmm2,xmm1,88h
-  movdqa      xmm1,xmm5
-  punpcklbw   xmm11,xmm3
-  movaps      [rbp+20h],xmm6
-  movaps      [rbp+60h],xmm13
-  movdqa      xmm13,xmm11
-  movaps      [rbp+90h],xmm10
-  movdqa      xmm10,xmm9
-  movdqa      xmm6,[rax+rcx]
-  punpcklbw   xmm1,xmm3
-  movaps      [rbp+0A0h],xmm12
-  psubw       xmm13,xmm1
-  movaps      [rbp+40h],xmm15
-  movdqa      xmm15,xmm14
-  movaps      [rbp+10h],xmm7
-  movdqa      xmm7,xmm6
-  punpcklbw   xmm10,xmm3
-  movdqa      xmm12,[r12+rcx]
-  punpcklbw   xmm7,xmm3
-  punpcklbw   xmm12,xmm3
-  punpcklbw   xmm15,xmm3
-  pabsw       xmm3,xmm13
-  movdqa      xmm13,xmm10
-  psubw       xmm13,xmm15
-  movdqa      [rbp+0F0h],xmm15
-  pabsw       xmm15,xmm13
-  movdqa      xmm13,xmm11
-  movdqa      [rbp+0B0h],xmm1
-  movdqa      xmm1,xmm0
-  pavgw       xmm13,xmm10
-  pcmpgtw     xmm1,xmm3
-  movdqa      [rbp+120h],xmm13
-  movaps      xmm13,xmm2
-  punpcklwd   xmm4,xmm4
-  movdqa      xmm3,xmm0
-  movdqa      [rbp+100h],xmm1
-  psubw       xmm13,xmm1
-  movdqa      xmm1,xmm10
-  pcmpgtw     xmm3,xmm15
-  pshufd      xmm4,xmm4,0
-  psubw       xmm1,xmm11
-  movdqa      [rbp+0D0h],xmm10
-  psubw       xmm13,xmm3
-  movdqa      [rbp+110h],xmm3
-  pabsw       xmm15,xmm1
-  movdqa      xmm3,xmm4
-  psubw       xmm10,xmm12
-  pcmpgtw     xmm3,xmm15
-  pabsw       xmm15,xmm10
-  movdqa      xmm10,xmm0
-  psllw       xmm1,2
-  movdqa      [rbp+0C0h],xmm11
-  psubw       xmm11,xmm7
-  pcmpgtw     xmm10,xmm15
-  pabsw       xmm11,xmm11
-  movdqa      xmm15,xmm0
-  pand        xmm3,xmm10
-  pcmpgtw     xmm15,xmm11
-  movaps      xmm11,xmm2
-  pxor        xmm10,xmm10
-  pand        xmm3,xmm15
-  pcmpgtw     xmm11,xmm10
-  pcmpeqw     xmm10,xmm2
-  por         xmm11,xmm10
-  pand        xmm3,xmm11
-  movdqa      xmm11,xmm7
-  psubw       xmm11,xmm12
-  pxor        xmm15,xmm15
-  paddw       xmm11,xmm1
-  psubw       xmm15,xmm13
-  movdqa      [rbp+0E0h],xmm12
-  paddw       xmm11,[FOUR_16B_SSE2]
-  pxor        xmm12,xmm12
-  psraw       xmm11,3
-  punpckhbw   xmm8,xmm12
-  pmaxsw      xmm15,xmm11
-  punpckhbw   xmm5,xmm12
-  movdqa      xmm11,xmm8
-  pminsw      xmm13,xmm15
-  psubw       xmm11,xmm5
-  punpckhbw   xmm9,xmm12
-  pand        xmm13,xmm3
-  movdqa      [rbp+130h],xmm13
-  pabsw       xmm13,xmm11
-  punpckhbw   xmm14,xmm12
-  movdqa      xmm11,xmm9
-  psubw       xmm11,xmm14
-  movdqa      xmm15,xmm0
-  movdqa      [rbp+140h],xmm14
-  pabsw       xmm14,xmm11
-  movdqa      xmm11,xmm8
-  pcmpgtw     xmm15,xmm14
-  movdqa      xmm1,[r12+rcx]
-  pavgw       xmm11,xmm9
-  movdqa      [rbp+170h],xmm11
-  movdqa      xmm10,xmm9
-  punpckhbw   xmm6,xmm12
-  psubw       xmm10,xmm8
-  punpckhbw   xmm1,xmm12
-  movdqa      xmm12,xmm0
-  movaps      xmm11,[rbp+0A0h]
-  pcmpgtw     xmm12,xmm13
-  movaps      xmm13,xmm11
-  psubw       xmm13,xmm12
-  movdqa      [rbp+160h],xmm15
-  psubw       xmm13,xmm15
-  movdqa      xmm15,xmm9
-  psubw       xmm15,xmm1
-  movdqa      [rbp+150h],xmm12
-  pabsw       xmm12,xmm10
-  pabsw       xmm14,xmm15
-  movdqa      xmm15,xmm8
-  pcmpgtw     xmm4,xmm12
-  movdqa      xmm12,xmm0
-  psubw       xmm15,xmm6
-  pcmpgtw     xmm12,xmm14
-  pabsw       xmm14,xmm15
-  psllw       xmm10,2
-  pcmpgtw     xmm0,xmm14
-  movdqa      xmm14,xmm6
-  psubw       xmm14,xmm1
-  pand        xmm4,xmm12
-  paddw       xmm14,xmm10
-  pand        xmm4,xmm0
-  paddw       xmm14,[FOUR_16B_SSE2]
-  pxor        xmm15,xmm15
-  movaps      xmm12,xmm11
-  psubw       xmm15,xmm13
-  pxor        xmm0,xmm0
-  psraw       xmm14,3
-  pcmpgtw     xmm12,xmm0
-  pcmpeqw     xmm0,xmm11
-  pmaxsw      xmm15,xmm14
-  por         xmm12,xmm0
-  movdqa      xmm0,[rbp+120h]
-  pminsw      xmm13,xmm15
-  movdqa      xmm15,[rbp+0B0h]
-  movdqa      xmm10,xmm7
-  pand        xmm4,xmm12
-  paddw       xmm15,xmm0
-  pxor        xmm12,xmm12
-  paddw       xmm10,xmm7
-  movdqa      xmm14,xmm12
-  psubw       xmm15,xmm10
-  psubw       xmm14,xmm2
-  psraw       xmm15,1
-  pmaxsw      xmm15,xmm14
-  movdqa      xmm10,xmm6
-  pminsw      xmm15,xmm2
-  paddw       xmm10,xmm6
-  pand        xmm15,xmm3
-  psubw       xmm12,xmm11
-  pand        xmm15,[rbp+100h]
-  pand        xmm13,xmm4
-  paddw       xmm7,xmm15
-  paddw       xmm8,xmm13
-  movdqa      xmm15,[rbp+170h]
-  psubw       xmm9,xmm13
-  paddw       xmm5,xmm15
-  psubw       xmm5,xmm10
-  psraw       xmm5,1
-  pmaxsw      xmm5,xmm12
-  pminsw      xmm5,xmm11
-  pand        xmm5,xmm4
-  pand        xmm5,[rbp+150h]
-  paddw       xmm6,xmm5
-  movdqa      xmm5,[rbp+0C0h]
-  packuswb    xmm7,xmm6
-  movdqa      xmm6,[rbp+130h]
-  paddw       xmm5,xmm6
-  packuswb    xmm5,xmm8
-  movdqa      xmm8,[rbp+0D0h]
-  psubw       xmm8,xmm6
-  movdqa      xmm6,[rbp+0F0h]
-  paddw       xmm6,xmm0
-  movdqa      xmm0,[rbp+0E0h]
-  packuswb    xmm8,xmm9
-  movdqa      xmm9,xmm0
-  paddw       xmm9,xmm0
-  psubw       xmm6,xmm9
-  psraw       xmm6,1
-  pmaxsw      xmm14,xmm6
-  pminsw      xmm2,xmm14
-  pand        xmm2,xmm3
-  pand        xmm2,[rbp+110h]
-  paddw       xmm0,xmm2
-  movdqa      xmm2,[rbp+140h]
-  paddw       xmm2,xmm15
-  movdqa      xmm15,xmm1
-  paddw       xmm15,xmm1
-  psubw       xmm2,xmm15
-  psraw       xmm2,1
-  pmaxsw      xmm12,xmm2
-  pminsw      xmm11,xmm12
-  pand        xmm11,xmm4
-  pand        xmm11,[rbp+160h]
-  paddw       xmm1,xmm11
-  movdqa      [rax+rcx],xmm7
-  movdqa      [r10],xmm5
-  packuswb    xmm0,xmm1
-  movdqa      [rcx],xmm8
-  movdqa      [r12+rcx],xmm0
-  mov         r12,qword [rbp+180h]
-  lea         rsp,[rbp+190h]
-  POP_XMM
-  pop         rbp
-  ret
+    push        rbp
+    mov         r11,[rsp + 16 + 20h]  ; pTC
+    PUSH_XMM 16
+    sub         rsp,1B0h
+    lea         rbp,[rsp+20h]
+    movd        xmm4,r8d
+    movd        xmm2,r9d
+    mov         qword [rbp+180h],r12
+    mov         r10,rcx
+    movsxd      r12,edx
+    add         edx,edx
+    movsxd      rdx,edx
+    sub         r10,r12
+    movsx       r8d,byte [r11]
+    pxor        xmm3,xmm3
+    punpcklwd   xmm2,xmm2
+    movaps      [rbp+50h],xmm14
+    lea         rax,[r12+r12*2]
+    movdqa      xmm14,[rdx+rcx]
+    neg         rax
+    pshufd      xmm0,xmm2,0
+    movd        xmm2,r8d
+    movsx       edx,byte [r11+1]
+    movsx       r8d,byte [r11+2]
+    movsx       r11d,byte [r11+3]
+    movaps      [rbp+70h],xmm12
+    movd        xmm1,edx
+    movaps      [rbp+80h],xmm11
+    movd        xmm12,r8d
+    movd        xmm11,r11d
+    movdqa      xmm5, [rax+rcx]
+    lea         rax,[r12+r12]
+    punpcklwd   xmm12,xmm12
+    neg         rax
+    punpcklwd   xmm11,xmm11
+    movaps      [rbp],xmm8
+    movdqa      xmm8, [r10]
+    punpcklwd   xmm2,xmm2
+    punpcklwd   xmm1,xmm1
+    punpcklqdq  xmm12,xmm12
+    punpcklqdq  xmm11,xmm11
+    punpcklqdq  xmm2,xmm2
+    punpcklqdq  xmm1,xmm1
+    shufps      xmm12,xmm11,88h
+    movdqa      xmm11,xmm8
+    movaps      [rbp+30h],xmm9
+    movdqa      xmm9,[rcx]
+    shufps      xmm2,xmm1,88h
+    movdqa      xmm1,xmm5
+    punpcklbw   xmm11,xmm3
+    movaps      [rbp+20h],xmm6
+    movaps      [rbp+60h],xmm13
+    movdqa      xmm13,xmm11
+    movaps      [rbp+90h],xmm10
+    movdqa      xmm10,xmm9
+    movdqa      xmm6,[rax+rcx]
+    punpcklbw   xmm1,xmm3
+    movaps      [rbp+0A0h],xmm12
+    psubw       xmm13,xmm1
+    movaps      [rbp+40h],xmm15
+    movdqa      xmm15,xmm14
+    movaps      [rbp+10h],xmm7
+    movdqa      xmm7,xmm6
+    punpcklbw   xmm10,xmm3
+    movdqa      xmm12,[r12+rcx]
+    punpcklbw   xmm7,xmm3
+    punpcklbw   xmm12,xmm3
+    punpcklbw   xmm15,xmm3
+    pabsw       xmm3,xmm13
+    movdqa      xmm13,xmm10
+    psubw       xmm13,xmm15
+    movdqa      [rbp+0F0h],xmm15
+    pabsw       xmm15,xmm13
+    movdqa      xmm13,xmm11
+    movdqa      [rbp+0B0h],xmm1
+    movdqa      xmm1,xmm0
+    pavgw       xmm13,xmm10
+    pcmpgtw     xmm1,xmm3
+    movdqa      [rbp+120h],xmm13
+    movaps      xmm13,xmm2
+    punpcklwd   xmm4,xmm4
+    movdqa      xmm3,xmm0
+    movdqa      [rbp+100h],xmm1
+    psubw       xmm13,xmm1
+    movdqa      xmm1,xmm10
+    pcmpgtw     xmm3,xmm15
+    pshufd      xmm4,xmm4,0
+    psubw       xmm1,xmm11
+    movdqa      [rbp+0D0h],xmm10
+    psubw       xmm13,xmm3
+    movdqa      [rbp+110h],xmm3
+    pabsw       xmm15,xmm1
+    movdqa      xmm3,xmm4
+    psubw       xmm10,xmm12
+    pcmpgtw     xmm3,xmm15
+    pabsw       xmm15,xmm10
+    movdqa      xmm10,xmm0
+    psllw       xmm1,2
+    movdqa      [rbp+0C0h],xmm11
+    psubw       xmm11,xmm7
+    pcmpgtw     xmm10,xmm15
+    pabsw       xmm11,xmm11
+    movdqa      xmm15,xmm0
+    pand        xmm3,xmm10
+    pcmpgtw     xmm15,xmm11
+    movaps      xmm11,xmm2
+    pxor        xmm10,xmm10
+    pand        xmm3,xmm15
+    pcmpgtw     xmm11,xmm10
+    pcmpeqw     xmm10,xmm2
+    por         xmm11,xmm10
+    pand        xmm3,xmm11
+    movdqa      xmm11,xmm7
+    psubw       xmm11,xmm12
+    pxor        xmm15,xmm15
+    paddw       xmm11,xmm1
+    psubw       xmm15,xmm13
+    movdqa      [rbp+0E0h],xmm12
+    paddw       xmm11,[FOUR_16B_SSE2]
+    pxor        xmm12,xmm12
+    psraw       xmm11,3
+    punpckhbw   xmm8,xmm12
+    pmaxsw      xmm15,xmm11
+    punpckhbw   xmm5,xmm12
+    movdqa      xmm11,xmm8
+    pminsw      xmm13,xmm15
+    psubw       xmm11,xmm5
+    punpckhbw   xmm9,xmm12
+    pand        xmm13,xmm3
+    movdqa      [rbp+130h],xmm13
+    pabsw       xmm13,xmm11
+    punpckhbw   xmm14,xmm12
+    movdqa      xmm11,xmm9
+    psubw       xmm11,xmm14
+    movdqa      xmm15,xmm0
+    movdqa      [rbp+140h],xmm14
+    pabsw       xmm14,xmm11
+    movdqa      xmm11,xmm8
+    pcmpgtw     xmm15,xmm14
+    movdqa      xmm1,[r12+rcx]
+    pavgw       xmm11,xmm9
+    movdqa      [rbp+170h],xmm11
+    movdqa      xmm10,xmm9
+    punpckhbw   xmm6,xmm12
+    psubw       xmm10,xmm8
+    punpckhbw   xmm1,xmm12
+    movdqa      xmm12,xmm0
+    movaps      xmm11,[rbp+0A0h]
+    pcmpgtw     xmm12,xmm13
+    movaps      xmm13,xmm11
+    psubw       xmm13,xmm12
+    movdqa      [rbp+160h],xmm15
+    psubw       xmm13,xmm15
+    movdqa      xmm15,xmm9
+    psubw       xmm15,xmm1
+    movdqa      [rbp+150h],xmm12
+    pabsw       xmm12,xmm10
+    pabsw       xmm14,xmm15
+    movdqa      xmm15,xmm8
+    pcmpgtw     xmm4,xmm12
+    movdqa      xmm12,xmm0
+    psubw       xmm15,xmm6
+    pcmpgtw     xmm12,xmm14
+    pabsw       xmm14,xmm15
+    psllw       xmm10,2
+    pcmpgtw     xmm0,xmm14
+    movdqa      xmm14,xmm6
+    psubw       xmm14,xmm1
+    pand        xmm4,xmm12
+    paddw       xmm14,xmm10
+    pand        xmm4,xmm0
+    paddw       xmm14,[FOUR_16B_SSE2]
+    pxor        xmm15,xmm15
+    movaps      xmm12,xmm11
+    psubw       xmm15,xmm13
+    pxor        xmm0,xmm0
+    psraw       xmm14,3
+    pcmpgtw     xmm12,xmm0
+    pcmpeqw     xmm0,xmm11
+    pmaxsw      xmm15,xmm14
+    por         xmm12,xmm0
+    movdqa      xmm0,[rbp+120h]
+    pminsw      xmm13,xmm15
+    movdqa      xmm15,[rbp+0B0h]
+    movdqa      xmm10,xmm7
+    pand        xmm4,xmm12
+    paddw       xmm15,xmm0
+    pxor        xmm12,xmm12
+    paddw       xmm10,xmm7
+    movdqa      xmm14,xmm12
+    psubw       xmm15,xmm10
+    psubw       xmm14,xmm2
+    psraw       xmm15,1
+    pmaxsw      xmm15,xmm14
+    movdqa      xmm10,xmm6
+    pminsw      xmm15,xmm2
+    paddw       xmm10,xmm6
+    pand        xmm15,xmm3
+    psubw       xmm12,xmm11
+    pand        xmm15,[rbp+100h]
+    pand        xmm13,xmm4
+    paddw       xmm7,xmm15
+    paddw       xmm8,xmm13
+    movdqa      xmm15,[rbp+170h]
+    psubw       xmm9,xmm13
+    paddw       xmm5,xmm15
+    psubw       xmm5,xmm10
+    psraw       xmm5,1
+    pmaxsw      xmm5,xmm12
+    pminsw      xmm5,xmm11
+    pand        xmm5,xmm4
+    pand        xmm5,[rbp+150h]
+    paddw       xmm6,xmm5
+    movdqa      xmm5,[rbp+0C0h]
+    packuswb    xmm7,xmm6
+    movdqa      xmm6,[rbp+130h]
+    paddw       xmm5,xmm6
+    packuswb    xmm5,xmm8
+    movdqa      xmm8,[rbp+0D0h]
+    psubw       xmm8,xmm6
+    movdqa      xmm6,[rbp+0F0h]
+    paddw       xmm6,xmm0
+    movdqa      xmm0,[rbp+0E0h]
+    packuswb    xmm8,xmm9
+    movdqa      xmm9,xmm0
+    paddw       xmm9,xmm0
+    psubw       xmm6,xmm9
+    psraw       xmm6,1
+    pmaxsw      xmm14,xmm6
+    pminsw      xmm2,xmm14
+    pand        xmm2,xmm3
+    pand        xmm2,[rbp+110h]
+    paddw       xmm0,xmm2
+    movdqa      xmm2,[rbp+140h]
+    paddw       xmm2,xmm15
+    movdqa      xmm15,xmm1
+    paddw       xmm15,xmm1
+    psubw       xmm2,xmm15
+    psraw       xmm2,1
+    pmaxsw      xmm12,xmm2
+    pminsw      xmm11,xmm12
+    pand        xmm11,xmm4
+    pand        xmm11,[rbp+160h]
+    paddw       xmm1,xmm11
+    movdqa      [rax+rcx],xmm7
+    movdqa      [r10],xmm5
+    packuswb    xmm0,xmm1
+    movdqa      [rcx],xmm8
+    movdqa      [r12+rcx],xmm0
+    mov         r12,qword [rbp+180h]
+    lea         rsp,[rbp+190h]
+    POP_XMM
+    pop         rbp
+    ret
 
 
 WELS_EXTERN   DeblockLumaEq4V_ssse3
-  mov         rax,rsp
-  push        rbx
-  push        rbp
-  push        rsi
-  push        rdi
-  sub         rsp,1D8h
-  movaps      [rax-38h],xmm6
-  movaps      [rax-48h],xmm7
-  movaps      [rax-58h],xmm8
-  pxor        xmm1,xmm1
-  movsxd      r10,edx
-  mov         rbp,rcx
-  mov         r11d,r8d
-  mov         rdx,rcx
-  mov         rdi,rbp
-  mov         rbx,rbp
-  movdqa      xmm5,[rbp]
-  movaps      [rax-68h],xmm9
-  movaps      [rax-78h],xmm10
-  punpcklbw   xmm5,xmm1
-  movaps      [rax-88h],xmm11
-  movaps      [rax-98h],xmm12
-  movaps      [rax-0A8h],xmm13
-  movaps      [rax-0B8h],xmm14
-  movdqa      xmm14,[r10+rbp]
-  movaps      [rax-0C8h],xmm15
-  lea         eax,[r10*4]
-  movsxd      r8,eax
-  lea         eax,[r10+r10*2]
-  movsxd      rcx,eax
-  lea         eax,[r10+r10]
-  sub         rdx,r8
-  punpcklbw   xmm14,xmm1
-  movdqa      [rsp+90h],xmm5
-  movdqa      [rsp+30h],xmm14
-  movsxd      rsi,eax
-  movsx       eax,r11w
-  sub         rdi,rcx
-  sub         rbx,rsi
-  mov         r8,rbp
-  sub         r8,r10
-  movd        xmm0,eax
-  movsx       eax,r9w
-  movdqa      xmm12,[rdi]
-  movdqa      xmm6, [rsi+rbp]
-  movdqa      xmm13,[rbx]
-  punpcklwd   xmm0,xmm0
-  pshufd      xmm11,xmm0,0
-  punpcklbw   xmm13,xmm1
-  punpcklbw   xmm6,xmm1
-  movdqa      xmm8,[r8]
-  movd        xmm0,eax
-  movdqa      xmm10,xmm11
-  mov         eax,2
-  punpcklbw   xmm8,xmm1
-  punpcklbw   xmm12,xmm1
-  cwde
-  punpcklwd   xmm0,xmm0
-  psraw       xmm10,2
-  movdqa      xmm1,xmm8
-  movdqa      [rsp+0F0h],xmm13
-  movdqa      [rsp+0B0h],xmm8
-  pshufd      xmm7,xmm0,0
-  psubw       xmm1,xmm13
-  movdqa      xmm0,xmm5
-  movdqa      xmm4,xmm7
-  movdqa      xmm2,xmm7
-  psubw       xmm0,xmm8
-  pabsw       xmm3,xmm0
-  pabsw       xmm0,xmm1
-  movdqa      xmm1,xmm5
-  movdqa      [rsp+40h],xmm7
-  movdqa      [rsp+60h],xmm6
-  pcmpgtw     xmm4,xmm0
-  psubw       xmm1,xmm14
-  pabsw       xmm0,xmm1
-  pcmpgtw     xmm2,xmm0
-  pand        xmm4,xmm2
-  movdqa      xmm0,xmm11
-  pcmpgtw     xmm0,xmm3
-  pand        xmm4,xmm0
-  movd        xmm0,eax
-  movdqa      [rsp+20h],xmm4
-  punpcklwd   xmm0,xmm0
-  pshufd      xmm2,xmm0,0
-  paddw       xmm10,xmm2
-  movdqa      [rsp+0A0h],xmm2
-  movdqa      xmm15,xmm7
-  pxor        xmm4,xmm4
-  movdqa      xmm0,xmm8
-  psubw       xmm0,xmm12
-  mov         eax,4
-  pabsw       xmm0,xmm0
-  movdqa      xmm1,xmm10
-  cwde
-  pcmpgtw     xmm15,xmm0
-  pcmpgtw     xmm1,xmm3
-  movdqa      xmm3,xmm7
-  movdqa      xmm7,[rdx]
-  movdqa      xmm0,xmm5
-  psubw       xmm0,xmm6
-  pand        xmm15,xmm1
-  punpcklbw   xmm7,xmm4
-  movdqa      xmm9,xmm15
-  pabsw       xmm0,xmm0
-  psllw       xmm7,1
-  pandn       xmm9,xmm12
-  pcmpgtw     xmm3,xmm0
-  paddw       xmm7,xmm12
-  movd        xmm0,eax
-  pand        xmm3,xmm1
-  paddw       xmm7,xmm12
-  punpcklwd   xmm0,xmm0
-  paddw       xmm7,xmm12
-  pshufd      xmm1,xmm0,0
-  paddw       xmm7,xmm13
-  movdqa      xmm0,xmm3
-  pandn       xmm0,xmm6
-  paddw       xmm7,xmm8
-  movdqa      [rsp+70h],xmm1
-  paddw       xmm7,xmm5
-  movdqa      [rsp+120h],xmm0
-  movdqa      xmm0,[rcx+rbp]
-  punpcklbw   xmm0,xmm4
-  paddw       xmm7,xmm1
-  movdqa      xmm4,xmm15
-  psllw       xmm0,1
-  psraw       xmm7,3
-  paddw       xmm0,xmm6
-  pand        xmm7,xmm15
-  paddw       xmm0,xmm6
-  paddw       xmm0,xmm6
-  paddw       xmm0,xmm14
-  movdqa      xmm6,xmm15
-  paddw       xmm0,xmm5
-  pandn       xmm6,xmm13
-  paddw       xmm0,xmm8
-  paddw       xmm0,xmm1
-  psraw       xmm0,3
-  movdqa      xmm1,xmm12
-  paddw       xmm1,xmm13
-  pand        xmm0,xmm3
-  movdqa      [rsp+100h],xmm0
-  movdqa      xmm0,xmm8
-  paddw       xmm0,xmm5
-  paddw       xmm1,xmm0
-  movdqa      xmm0,xmm3
-  paddw       xmm1,xmm2
-  psraw       xmm1,2
-  pandn       xmm0,xmm14
-  pand        xmm4,xmm1
-  movdqa      [rsp+0E0h],xmm0
-  movdqa      xmm0,xmm5
-  paddw       xmm0,xmm8
-  movdqa      xmm1,[rsp+60h]
-  paddw       xmm1,xmm14
-  movdqa      xmm14,xmm3
-  paddw       xmm1,xmm0
-  movdqa      xmm0,xmm8
-  paddw       xmm0,[rsp+30h]
-  paddw       xmm1,xmm2
-  psraw       xmm1,2
-  pand        xmm14,xmm1
-  movdqa      xmm1,xmm13
-  paddw       xmm1,xmm13
-  paddw       xmm1,xmm0
-  paddw       xmm1,xmm2
-  psraw       xmm1,2
-  movdqa      xmm0,[rsp+30h]
-  movdqa      xmm2,xmm13
-  movdqa      xmm5,xmm15
-  paddw       xmm0,[rsp+70h]
-  pandn       xmm5,xmm1
-  paddw       xmm2,xmm8
-  movdqa      xmm8,[rsp+90h]
-  movdqa      xmm1,xmm12
-  paddw       xmm2,xmm8
-  psllw       xmm2,1
-  paddw       xmm2,xmm0
-  paddw       xmm1,xmm2
-  movdqa      xmm0,xmm8
-  movdqa      xmm8,xmm3
-  movdqa      xmm2,[rsp+30h]
-  paddw       xmm0,xmm13
-  psraw       xmm1,3
-  pand        xmm15,xmm1
-  movdqa      xmm1,xmm2
-  paddw       xmm1,xmm2
-  paddw       xmm2,[rsp+90h]
-  paddw       xmm2,[rsp+0B0h]
-  paddw       xmm1,xmm0
-  movdqa      xmm0,xmm13
-  movdqa      xmm13,[r8]
-  paddw       xmm0, [rsp+70h]
-  paddw       xmm1, [rsp+0A0h]
-  psllw       xmm2,1
-  paddw       xmm2,xmm0
-  psraw       xmm1,2
-  movdqa      xmm0, [rdi]
-  pandn       xmm8,xmm1
-  movdqa      xmm1, [rsp+60h]
-  paddw       xmm1,xmm2
-  movdqa      xmm2, [rbx]
-  psraw       xmm1,3
-  pand        xmm3,xmm1
-  movdqa      xmm1, [rbp]
-  movdqa      [rsp+0D0h],xmm3
-  pxor        xmm3,xmm3
-  punpckhbw   xmm0,xmm3
-  punpckhbw   xmm1,xmm3
-  punpckhbw   xmm13,xmm3
-  movdqa      [rsp+0C0h],xmm0
-  movdqa      xmm0,[r10+rbp]
-  movdqa      [rsp],xmm1
-  punpckhbw   xmm0,xmm3
-  punpckhbw   xmm2,xmm3
-  movdqa      [rsp+80h],xmm0
-  movdqa      xmm0,[rsi+rbp]
-  movdqa      [rsp+10h],xmm13
-  punpckhbw   xmm0,xmm3
-  movdqa      [rsp+50h],xmm0
-  movdqa      xmm0,xmm1
-  movdqa      xmm1,xmm13
-  psubw       xmm0,xmm13
-  psubw       xmm1,xmm2
-  pabsw       xmm3,xmm0
-  pabsw       xmm0,xmm1
-  movdqa      xmm1,[rsp]
-  movdqa      xmm13,[rsp+40h]
-  movdqa      [rsp+110h],xmm2
-  psubw       xmm1, [rsp+80h]
-  pcmpgtw     xmm13,xmm0
-  pcmpgtw     xmm11,xmm3
-  pabsw       xmm0,xmm1
-  pcmpgtw     xmm10,xmm3
-  movdqa      xmm1, [rsp+40h]
-  movdqa      xmm2,xmm1
-  movdqa      xmm3,xmm1
-  pcmpgtw     xmm2,xmm0
-  movdqa      xmm0, [rsp+10h]
-  pand        xmm13,xmm2
-  pand        xmm13,xmm11
-  movdqa      xmm11,[rsp+0C0h]
-  psubw       xmm0,xmm11
-  pabsw       xmm0,xmm0
-  pcmpgtw     xmm3,xmm0
-  pand        xmm3,xmm10
-  movdqa      xmm0,[rsp]
-  psubw       xmm0,[rsp+50h]
-  movdqa      xmm2,[rdx]
-  pabsw       xmm0,xmm0
-  por         xmm7,xmm9
-  movdqa      xmm9,[rsp+20h]
-  pcmpgtw     xmm1,xmm0
-  pand        xmm9,xmm7
-  movdqa      xmm7,[rsp+20h]
-  movdqa      xmm0,xmm7
-  pandn       xmm0,xmm12
-  movdqa      xmm12,[rsp+110h]
-  pand        xmm1,xmm10
-  movdqa      xmm10,[rsp+70h]
-  movdqa      [rsp+40h],xmm1
-  movdqa      xmm1,xmm13
-  por         xmm9,xmm0
-  pxor        xmm0,xmm0
-  por         xmm4,xmm6
-  movdqa      xmm6,xmm7
-  punpckhbw   xmm2,xmm0
-  por         xmm15,xmm5
-  movdqa      xmm5,[rsp+20h]
-  movdqa      xmm0,xmm3
-  psllw       xmm2,1
-  pandn       xmm0,xmm11
-  pand        xmm6,xmm4
-  movdqa      xmm4,[rsp]
-  paddw       xmm2,xmm11
-  pand        xmm5,xmm15
-  movdqa      xmm15,[rsp+20h]
-  paddw       xmm2,xmm11
-  paddw       xmm2,xmm11
-  paddw       xmm2,xmm12
-  paddw       xmm2,[rsp+10h]
-  paddw       xmm2,[rsp]
-  paddw       xmm2,xmm10
-  psraw       xmm2,3
-  pand        xmm2,xmm3
-  por         xmm2,xmm0
-  pand        xmm1,xmm2
-  movdqa      xmm0,xmm13
-  movdqa      xmm2,xmm11
-  pandn       xmm0,xmm11
-  paddw       xmm2,xmm12
-  por         xmm1,xmm0
-  packuswb    xmm9,xmm1
-  movdqa      xmm0,xmm7
-  movdqa      xmm7,[rsp+0A0h]
-  pandn       xmm0,[rsp+0F0h]
-  movdqa      xmm1,xmm3
-  por         xmm6,xmm0
-  movdqa      xmm0,[rsp+10h]
-  paddw       xmm0,xmm4
-  paddw       xmm2,xmm0
-  paddw       xmm2,xmm7
-  movdqa      xmm0,xmm3
-  pandn       xmm0,xmm12
-  psraw       xmm2,2
-  pand        xmm1,xmm2
-  por         xmm1,xmm0
-  movdqa      xmm2,xmm13
-  movdqa      xmm0,xmm13
-  pand        xmm2,xmm1
-  pandn       xmm0,xmm12
-  movdqa      xmm1,xmm12
-  paddw       xmm1,[rsp+10h]
-  por         xmm2,xmm0
-  movdqa      xmm0,xmm15
-  pandn       xmm0,[rsp+0B0h]
-  paddw       xmm1,xmm4
-  packuswb    xmm6,xmm2
-  movdqa      xmm2,xmm3
-  psllw       xmm1,1
-  por         xmm5,xmm0
-  movdqa      xmm0,[rsp+80h]
-  paddw       xmm0,xmm10
-  paddw       xmm1,xmm0
-  paddw       xmm11,xmm1
-  psraw       xmm11,3
-  movdqa      xmm1,xmm12
-  pand        xmm2,xmm11
-  paddw       xmm1,xmm12
-  movdqa      xmm11,[rsp+80h]
-  movdqa      xmm0, [rsp+10h]
-  por         xmm14,[rsp+0E0h]
-  paddw       xmm0,xmm11
-  movdqa      xmm4,xmm15
-  paddw       xmm1,xmm0
-  movdqa      xmm0,xmm13
-  paddw       xmm1,xmm7
-  psraw       xmm1,2
-  pandn       xmm3,xmm1
-  por         xmm2,xmm3
-  movdqa      xmm1,xmm13
-  movdqa      xmm3,[rsp+10h]
-  pandn       xmm0,xmm3
-  pand        xmm1,xmm2
-  movdqa      xmm2,xmm11
-  paddw       xmm2,[rsp]
-  por         xmm1,xmm0
-  movdqa      xmm0,[rsp+0D0h]
-  por         xmm0,xmm8
-  paddw       xmm2,xmm3
-  packuswb    xmm5,xmm1
-  movdqa      xmm8,[rsp+40h]
-  movdqa      xmm1,[rsp+50h]
-  movdqa      xmm3,xmm8
-  pand        xmm4,xmm0
-  psllw       xmm2,1
-  movdqa      xmm0,xmm15
-  pandn       xmm0,[rsp+90h]
-  por         xmm4,xmm0
-  movdqa      xmm0,xmm12
-  paddw       xmm0,xmm10
-  paddw       xmm2,xmm0
-  paddw       xmm1,xmm2
-  movdqa      xmm0,[rsp]
-  movdqa      xmm2,xmm11
-  paddw       xmm0,xmm12
-  movdqa      xmm12,[rsp]
-  paddw       xmm2,xmm11
-  paddw       xmm2,xmm0
-  psraw       xmm1,3
-  movdqa      xmm0,xmm8
-  pand        xmm3,xmm1
-  paddw       xmm2,xmm7
-  movdqa      xmm1,xmm13
-  psraw       xmm2,2
-  pandn       xmm0,xmm2
-  por         xmm3,xmm0
-  movdqa      xmm2,[rsp+50h]
-  movdqa      xmm0,xmm13
-  pandn       xmm0,xmm12
-  pand        xmm1,xmm3
-  paddw       xmm2,xmm11
-  movdqa      xmm3,xmm15
-  por         xmm1,xmm0
-  pand        xmm3,xmm14
-  movdqa      xmm14,[rsp+10h]
-  movdqa      xmm0,xmm15
-  pandn       xmm0,[rsp+30h]
-  packuswb    xmm4,xmm1
-  movdqa      xmm1,xmm8
-  por         xmm3,xmm0
-  movdqa      xmm0,xmm12
-  paddw       xmm0,xmm14
-  paddw       xmm2,xmm0
-  paddw       xmm2,xmm7
-  movdqa      xmm0,xmm8
-  pandn       xmm0,xmm11
-  psraw       xmm2,2
-  pand        xmm1,xmm2
-  por         xmm1,xmm0
-  movdqa      xmm2,xmm13
-  movdqa      xmm0,xmm13
-  pandn       xmm0,xmm11
-  pand        xmm2,xmm1
-  movdqa      xmm1,xmm15
-  por         xmm2,xmm0
-  packuswb    xmm3,xmm2
-  movdqa      xmm0,[rsp+100h]
-  por         xmm0,[rsp+120h]
-  pand        xmm1,xmm0
-  movdqa      xmm2,[rcx+rbp]
-  movdqa      xmm7,[rsp+50h]
-  pandn       xmm15,[rsp+60h]
-  lea         r11,[rsp+1D8h]
-  pxor        xmm0,xmm0
-  por         xmm1,xmm15
-  movaps      xmm15,[r11-0A8h]
-  movdqa      [rdi],xmm9
-  movaps      xmm9,[r11-48h]
-  punpckhbw   xmm2,xmm0
-  psllw       xmm2,1
-  paddw       xmm2,xmm7
-  paddw       xmm2,xmm7
-  movdqa      [rbx],xmm6
-  movaps      xmm6,[r11-18h]
-  paddw       xmm2,xmm7
-  paddw       xmm2,xmm11
-  movaps      xmm11,[r11-68h]
-  paddw       xmm2,xmm12
-  movaps      xmm12,[r11-78h]
-  paddw       xmm2,xmm14
-  paddw       xmm2,xmm10
-  psraw       xmm2,3
-  movaps      xmm10,[r11-58h]
-  movaps      xmm14,[r11-98h]
-  movdqa      xmm0,xmm13
-  pand        xmm2,xmm8
-  pandn       xmm8,xmm7
-  pandn       xmm13,xmm7
-  por         xmm2,xmm8
-  movaps      xmm7,[r11-28h]
-  movaps      xmm8,[r11-38h]
-  movdqa      [r8],xmm5
-  pand        xmm0,xmm2
-  por         xmm0,xmm13
-  packuswb    xmm1,xmm0
-  movaps      xmm13,[r11-88h]
-  movdqa      [rbp],xmm4
-  movdqa      [r10+rbp],xmm3
-  movdqa      [rsi+rbp],xmm1
-  mov         rsp,r11
-  pop         rdi
-  pop         rsi
-  pop         rbp
-  pop         rbx
-  ret
+    mov         rax,rsp
+    push        rbx
+    push        rbp
+    push        rsi
+    push        rdi
+    sub         rsp,1D8h
+    movaps      [rax-38h],xmm6
+    movaps      [rax-48h],xmm7
+    movaps      [rax-58h],xmm8
+    pxor        xmm1,xmm1
+    movsxd      r10,edx
+    mov         rbp,rcx
+    mov         r11d,r8d
+    mov         rdx,rcx
+    mov         rdi,rbp
+    mov         rbx,rbp
+    movdqa      xmm5,[rbp]
+    movaps      [rax-68h],xmm9
+    movaps      [rax-78h],xmm10
+    punpcklbw   xmm5,xmm1
+    movaps      [rax-88h],xmm11
+    movaps      [rax-98h],xmm12
+    movaps      [rax-0A8h],xmm13
+    movaps      [rax-0B8h],xmm14
+    movdqa      xmm14,[r10+rbp]
+    movaps      [rax-0C8h],xmm15
+    lea         eax,[r10*4]
+    movsxd      r8,eax
+    lea         eax,[r10+r10*2]
+    movsxd      rcx,eax
+    lea         eax,[r10+r10]
+    sub         rdx,r8
+    punpcklbw   xmm14,xmm1
+    movdqa      [rsp+90h],xmm5
+    movdqa      [rsp+30h],xmm14
+    movsxd      rsi,eax
+    movsx       eax,r11w
+    sub         rdi,rcx
+    sub         rbx,rsi
+    mov         r8,rbp
+    sub         r8,r10
+    movd        xmm0,eax
+    movsx       eax,r9w
+    movdqa      xmm12,[rdi]
+    movdqa      xmm6, [rsi+rbp]
+    movdqa      xmm13,[rbx]
+    punpcklwd   xmm0,xmm0
+    pshufd      xmm11,xmm0,0
+    punpcklbw   xmm13,xmm1
+    punpcklbw   xmm6,xmm1
+    movdqa      xmm8,[r8]
+    movd        xmm0,eax
+    movdqa      xmm10,xmm11
+    mov         eax,2
+    punpcklbw   xmm8,xmm1
+    punpcklbw   xmm12,xmm1
+    cwde
+    punpcklwd   xmm0,xmm0
+    psraw       xmm10,2
+    movdqa      xmm1,xmm8
+    movdqa      [rsp+0F0h],xmm13
+    movdqa      [rsp+0B0h],xmm8
+    pshufd      xmm7,xmm0,0
+    psubw       xmm1,xmm13
+    movdqa      xmm0,xmm5
+    movdqa      xmm4,xmm7
+    movdqa      xmm2,xmm7
+    psubw       xmm0,xmm8
+    pabsw       xmm3,xmm0
+    pabsw       xmm0,xmm1
+    movdqa      xmm1,xmm5
+    movdqa      [rsp+40h],xmm7
+    movdqa      [rsp+60h],xmm6
+    pcmpgtw     xmm4,xmm0
+    psubw       xmm1,xmm14
+    pabsw       xmm0,xmm1
+    pcmpgtw     xmm2,xmm0
+    pand        xmm4,xmm2
+    movdqa      xmm0,xmm11
+    pcmpgtw     xmm0,xmm3
+    pand        xmm4,xmm0
+    movd        xmm0,eax
+    movdqa      [rsp+20h],xmm4
+    punpcklwd   xmm0,xmm0
+    pshufd      xmm2,xmm0,0
+    paddw       xmm10,xmm2
+    movdqa      [rsp+0A0h],xmm2
+    movdqa      xmm15,xmm7
+    pxor        xmm4,xmm4
+    movdqa      xmm0,xmm8
+    psubw       xmm0,xmm12
+    mov         eax,4
+    pabsw       xmm0,xmm0
+    movdqa      xmm1,xmm10
+    cwde
+    pcmpgtw     xmm15,xmm0
+    pcmpgtw     xmm1,xmm3
+    movdqa      xmm3,xmm7
+    movdqa      xmm7,[rdx]
+    movdqa      xmm0,xmm5
+    psubw       xmm0,xmm6
+    pand        xmm15,xmm1
+    punpcklbw   xmm7,xmm4
+    movdqa      xmm9,xmm15
+    pabsw       xmm0,xmm0
+    psllw       xmm7,1
+    pandn       xmm9,xmm12
+    pcmpgtw     xmm3,xmm0
+    paddw       xmm7,xmm12
+    movd        xmm0,eax
+    pand        xmm3,xmm1
+    paddw       xmm7,xmm12
+    punpcklwd   xmm0,xmm0
+    paddw       xmm7,xmm12
+    pshufd      xmm1,xmm0,0
+    paddw       xmm7,xmm13
+    movdqa      xmm0,xmm3
+    pandn       xmm0,xmm6
+    paddw       xmm7,xmm8
+    movdqa      [rsp+70h],xmm1
+    paddw       xmm7,xmm5
+    movdqa      [rsp+120h],xmm0
+    movdqa      xmm0,[rcx+rbp]
+    punpcklbw   xmm0,xmm4
+    paddw       xmm7,xmm1
+    movdqa      xmm4,xmm15
+    psllw       xmm0,1
+    psraw       xmm7,3
+    paddw       xmm0,xmm6
+    pand        xmm7,xmm15
+    paddw       xmm0,xmm6
+    paddw       xmm0,xmm6
+    paddw       xmm0,xmm14
+    movdqa      xmm6,xmm15
+    paddw       xmm0,xmm5
+    pandn       xmm6,xmm13
+    paddw       xmm0,xmm8
+    paddw       xmm0,xmm1
+    psraw       xmm0,3
+    movdqa      xmm1,xmm12
+    paddw       xmm1,xmm13
+    pand        xmm0,xmm3
+    movdqa      [rsp+100h],xmm0
+    movdqa      xmm0,xmm8
+    paddw       xmm0,xmm5
+    paddw       xmm1,xmm0
+    movdqa      xmm0,xmm3
+    paddw       xmm1,xmm2
+    psraw       xmm1,2
+    pandn       xmm0,xmm14
+    pand        xmm4,xmm1
+    movdqa      [rsp+0E0h],xmm0
+    movdqa      xmm0,xmm5
+    paddw       xmm0,xmm8
+    movdqa      xmm1,[rsp+60h]
+    paddw       xmm1,xmm14
+    movdqa      xmm14,xmm3
+    paddw       xmm1,xmm0
+    movdqa      xmm0,xmm8
+    paddw       xmm0,[rsp+30h]
+    paddw       xmm1,xmm2
+    psraw       xmm1,2
+    pand        xmm14,xmm1
+    movdqa      xmm1,xmm13
+    paddw       xmm1,xmm13
+    paddw       xmm1,xmm0
+    paddw       xmm1,xmm2
+    psraw       xmm1,2
+    movdqa      xmm0,[rsp+30h]
+    movdqa      xmm2,xmm13
+    movdqa      xmm5,xmm15
+    paddw       xmm0,[rsp+70h]
+    pandn       xmm5,xmm1
+    paddw       xmm2,xmm8
+    movdqa      xmm8,[rsp+90h]
+    movdqa      xmm1,xmm12
+    paddw       xmm2,xmm8
+    psllw       xmm2,1
+    paddw       xmm2,xmm0
+    paddw       xmm1,xmm2
+    movdqa      xmm0,xmm8
+    movdqa      xmm8,xmm3
+    movdqa      xmm2,[rsp+30h]
+    paddw       xmm0,xmm13
+    psraw       xmm1,3
+    pand        xmm15,xmm1
+    movdqa      xmm1,xmm2
+    paddw       xmm1,xmm2
+    paddw       xmm2,[rsp+90h]
+    paddw       xmm2,[rsp+0B0h]
+    paddw       xmm1,xmm0
+    movdqa      xmm0,xmm13
+    movdqa      xmm13,[r8]
+    paddw       xmm0, [rsp+70h]
+    paddw       xmm1, [rsp+0A0h]
+    psllw       xmm2,1
+    paddw       xmm2,xmm0
+    psraw       xmm1,2
+    movdqa      xmm0, [rdi]
+    pandn       xmm8,xmm1
+    movdqa      xmm1, [rsp+60h]
+    paddw       xmm1,xmm2
+    movdqa      xmm2, [rbx]
+    psraw       xmm1,3
+    pand        xmm3,xmm1
+    movdqa      xmm1, [rbp]
+    movdqa      [rsp+0D0h],xmm3
+    pxor        xmm3,xmm3
+    punpckhbw   xmm0,xmm3
+    punpckhbw   xmm1,xmm3
+    punpckhbw   xmm13,xmm3
+    movdqa      [rsp+0C0h],xmm0
+    movdqa      xmm0,[r10+rbp]
+    movdqa      [rsp],xmm1
+    punpckhbw   xmm0,xmm3
+    punpckhbw   xmm2,xmm3
+    movdqa      [rsp+80h],xmm0
+    movdqa      xmm0,[rsi+rbp]
+    movdqa      [rsp+10h],xmm13
+    punpckhbw   xmm0,xmm3
+    movdqa      [rsp+50h],xmm0
+    movdqa      xmm0,xmm1
+    movdqa      xmm1,xmm13
+    psubw       xmm0,xmm13
+    psubw       xmm1,xmm2
+    pabsw       xmm3,xmm0
+    pabsw       xmm0,xmm1
+    movdqa      xmm1,[rsp]
+    movdqa      xmm13,[rsp+40h]
+    movdqa      [rsp+110h],xmm2
+    psubw       xmm1, [rsp+80h]
+    pcmpgtw     xmm13,xmm0
+    pcmpgtw     xmm11,xmm3
+    pabsw       xmm0,xmm1
+    pcmpgtw     xmm10,xmm3
+    movdqa      xmm1, [rsp+40h]
+    movdqa      xmm2,xmm1
+    movdqa      xmm3,xmm1
+    pcmpgtw     xmm2,xmm0
+    movdqa      xmm0, [rsp+10h]
+    pand        xmm13,xmm2
+    pand        xmm13,xmm11
+    movdqa      xmm11,[rsp+0C0h]
+    psubw       xmm0,xmm11
+    pabsw       xmm0,xmm0
+    pcmpgtw     xmm3,xmm0
+    pand        xmm3,xmm10
+    movdqa      xmm0,[rsp]
+    psubw       xmm0,[rsp+50h]
+    movdqa      xmm2,[rdx]
+    pabsw       xmm0,xmm0
+    por         xmm7,xmm9
+    movdqa      xmm9,[rsp+20h]
+    pcmpgtw     xmm1,xmm0
+    pand        xmm9,xmm7
+    movdqa      xmm7,[rsp+20h]
+    movdqa      xmm0,xmm7
+    pandn       xmm0,xmm12
+    movdqa      xmm12,[rsp+110h]
+    pand        xmm1,xmm10
+    movdqa      xmm10,[rsp+70h]
+    movdqa      [rsp+40h],xmm1
+    movdqa      xmm1,xmm13
+    por         xmm9,xmm0
+    pxor        xmm0,xmm0
+    por         xmm4,xmm6
+    movdqa      xmm6,xmm7
+    punpckhbw   xmm2,xmm0
+    por         xmm15,xmm5
+    movdqa      xmm5,[rsp+20h]
+    movdqa      xmm0,xmm3
+    psllw       xmm2,1
+    pandn       xmm0,xmm11
+    pand        xmm6,xmm4
+    movdqa      xmm4,[rsp]
+    paddw       xmm2,xmm11
+    pand        xmm5,xmm15
+    movdqa      xmm15,[rsp+20h]
+    paddw       xmm2,xmm11
+    paddw       xmm2,xmm11
+    paddw       xmm2,xmm12
+    paddw       xmm2,[rsp+10h]
+    paddw       xmm2,[rsp]
+    paddw       xmm2,xmm10
+    psraw       xmm2,3
+    pand        xmm2,xmm3
+    por         xmm2,xmm0
+    pand        xmm1,xmm2
+    movdqa      xmm0,xmm13
+    movdqa      xmm2,xmm11
+    pandn       xmm0,xmm11
+    paddw       xmm2,xmm12
+    por         xmm1,xmm0
+    packuswb    xmm9,xmm1
+    movdqa      xmm0,xmm7
+    movdqa      xmm7,[rsp+0A0h]
+    pandn       xmm0,[rsp+0F0h]
+    movdqa      xmm1,xmm3
+    por         xmm6,xmm0
+    movdqa      xmm0,[rsp+10h]
+    paddw       xmm0,xmm4
+    paddw       xmm2,xmm0
+    paddw       xmm2,xmm7
+    movdqa      xmm0,xmm3
+    pandn       xmm0,xmm12
+    psraw       xmm2,2
+    pand        xmm1,xmm2
+    por         xmm1,xmm0
+    movdqa      xmm2,xmm13
+    movdqa      xmm0,xmm13
+    pand        xmm2,xmm1
+    pandn       xmm0,xmm12
+    movdqa      xmm1,xmm12
+    paddw       xmm1,[rsp+10h]
+    por         xmm2,xmm0
+    movdqa      xmm0,xmm15
+    pandn       xmm0,[rsp+0B0h]
+    paddw       xmm1,xmm4
+    packuswb    xmm6,xmm2
+    movdqa      xmm2,xmm3
+    psllw       xmm1,1
+    por         xmm5,xmm0
+    movdqa      xmm0,[rsp+80h]
+    paddw       xmm0,xmm10
+    paddw       xmm1,xmm0
+    paddw       xmm11,xmm1
+    psraw       xmm11,3
+    movdqa      xmm1,xmm12
+    pand        xmm2,xmm11
+    paddw       xmm1,xmm12
+    movdqa      xmm11,[rsp+80h]
+    movdqa      xmm0, [rsp+10h]
+    por         xmm14,[rsp+0E0h]
+    paddw       xmm0,xmm11
+    movdqa      xmm4,xmm15
+    paddw       xmm1,xmm0
+    movdqa      xmm0,xmm13
+    paddw       xmm1,xmm7
+    psraw       xmm1,2
+    pandn       xmm3,xmm1
+    por         xmm2,xmm3
+    movdqa      xmm1,xmm13
+    movdqa      xmm3,[rsp+10h]
+    pandn       xmm0,xmm3
+    pand        xmm1,xmm2
+    movdqa      xmm2,xmm11
+    paddw       xmm2,[rsp]
+    por         xmm1,xmm0
+    movdqa      xmm0,[rsp+0D0h]
+    por         xmm0,xmm8
+    paddw       xmm2,xmm3
+    packuswb    xmm5,xmm1
+    movdqa      xmm8,[rsp+40h]
+    movdqa      xmm1,[rsp+50h]
+    movdqa      xmm3,xmm8
+    pand        xmm4,xmm0
+    psllw       xmm2,1
+    movdqa      xmm0,xmm15
+    pandn       xmm0,[rsp+90h]
+    por         xmm4,xmm0
+    movdqa      xmm0,xmm12
+    paddw       xmm0,xmm10
+    paddw       xmm2,xmm0
+    paddw       xmm1,xmm2
+    movdqa      xmm0,[rsp]
+    movdqa      xmm2,xmm11
+    paddw       xmm0,xmm12
+    movdqa      xmm12,[rsp]
+    paddw       xmm2,xmm11
+    paddw       xmm2,xmm0
+    psraw       xmm1,3
+    movdqa      xmm0,xmm8
+    pand        xmm3,xmm1
+    paddw       xmm2,xmm7
+    movdqa      xmm1,xmm13
+    psraw       xmm2,2
+    pandn       xmm0,xmm2
+    por         xmm3,xmm0
+    movdqa      xmm2,[rsp+50h]
+    movdqa      xmm0,xmm13
+    pandn       xmm0,xmm12
+    pand        xmm1,xmm3
+    paddw       xmm2,xmm11
+    movdqa      xmm3,xmm15
+    por         xmm1,xmm0
+    pand        xmm3,xmm14
+    movdqa      xmm14,[rsp+10h]
+    movdqa      xmm0,xmm15
+    pandn       xmm0,[rsp+30h]
+    packuswb    xmm4,xmm1
+    movdqa      xmm1,xmm8
+    por         xmm3,xmm0
+    movdqa      xmm0,xmm12
+    paddw       xmm0,xmm14
+    paddw       xmm2,xmm0
+    paddw       xmm2,xmm7
+    movdqa      xmm0,xmm8
+    pandn       xmm0,xmm11
+    psraw       xmm2,2
+    pand        xmm1,xmm2
+    por         xmm1,xmm0
+    movdqa      xmm2,xmm13
+    movdqa      xmm0,xmm13
+    pandn       xmm0,xmm11
+    pand        xmm2,xmm1
+    movdqa      xmm1,xmm15
+    por         xmm2,xmm0
+    packuswb    xmm3,xmm2
+    movdqa      xmm0,[rsp+100h]
+    por         xmm0,[rsp+120h]
+    pand        xmm1,xmm0
+    movdqa      xmm2,[rcx+rbp]
+    movdqa      xmm7,[rsp+50h]
+    pandn       xmm15,[rsp+60h]
+    lea         r11,[rsp+1D8h]
+    pxor        xmm0,xmm0
+    por         xmm1,xmm15
+    movaps      xmm15,[r11-0A8h]
+    movdqa      [rdi],xmm9
+    movaps      xmm9,[r11-48h]
+    punpckhbw   xmm2,xmm0
+    psllw       xmm2,1
+    paddw       xmm2,xmm7
+    paddw       xmm2,xmm7
+    movdqa      [rbx],xmm6
+    movaps      xmm6,[r11-18h]
+    paddw       xmm2,xmm7
+    paddw       xmm2,xmm11
+    movaps      xmm11,[r11-68h]
+    paddw       xmm2,xmm12
+    movaps      xmm12,[r11-78h]
+    paddw       xmm2,xmm14
+    paddw       xmm2,xmm10
+    psraw       xmm2,3
+    movaps      xmm10,[r11-58h]
+    movaps      xmm14,[r11-98h]
+    movdqa      xmm0,xmm13
+    pand        xmm2,xmm8
+    pandn       xmm8,xmm7
+    pandn       xmm13,xmm7
+    por         xmm2,xmm8
+    movaps      xmm7,[r11-28h]
+    movaps      xmm8,[r11-38h]
+    movdqa      [r8],xmm5
+    pand        xmm0,xmm2
+    por         xmm0,xmm13
+    packuswb    xmm1,xmm0
+    movaps      xmm13,[r11-88h]
+    movdqa      [rbp],xmm4
+    movdqa      [r10+rbp],xmm3
+    movdqa      [rsi+rbp],xmm1
+    mov         rsp,r11
+    pop         rdi
+    pop         rsi
+    pop         rbp
+    pop         rbx
+    ret
 
 
 WELS_EXTERN  DeblockChromaLt4V_ssse3
-  mov         rax,rsp
-  push        rbx
-  push        rdi
-  PUSH_XMM 16
-  sub         rsp,0C8h
-  mov         r10,qword [rax + 30h]  ; pTC
-  pxor        xmm1,xmm1
-  mov         rbx,rcx
-  movsxd      r11,r8d
-  movsx       ecx,byte [r10]
-  movsx       r8d,byte [r10+2]
-  mov         rdi,rdx
-  movq        xmm2,[rbx]
-  movq        xmm9,[r11+rbx]
-  movsx       edx,byte [r10+1]
-  mov         word [rsp+2],cx
-  mov         word [rsp],cx
-  movsx       eax,byte [r10+3]
-  mov         word [rsp+6],dx
-  mov         word [rsp+4],dx
-  movdqa      xmm11,xmm1
-  mov         word [rsp+0Eh],ax
-  mov         word [rsp+0Ch],ax
-  lea         eax,[r11+r11]
-  movsxd      rcx,eax
-  mov         rax,rbx
-  mov         rdx,rdi
-  sub         rax,rcx
-  mov         word [rsp+0Ah],r8w
-  mov         word [rsp+8],r8w
-  movdqa      xmm6,[rsp]
-  movdqa      xmm7,xmm6
-  movq        xmm13, [rax]
-  mov         rax,rdi
-  sub         rax,rcx
-  mov         rcx,rbx
-  pcmpgtw     xmm7,xmm1
-  psubw       xmm11,xmm6
-  sub         rcx,r11
-  sub         rdx,r11
-  movq        xmm0,[rax]
-  movsx       eax,r9w
-  movq        xmm15,[rcx]
-  punpcklqdq  xmm13,xmm0
-  movq        xmm0, [rdx]
-  movdqa      xmm4,xmm13
-  punpcklqdq  xmm15,xmm0
-  movq        xmm0, [rdi]
-  punpcklbw   xmm4,xmm1
-  movdqa      xmm12,xmm15
-  punpcklqdq  xmm2,xmm0
-  movq        xmm0, [r11+rdi]
-  punpcklbw   xmm12,xmm1
-  movdqa      xmm14,xmm2
-  punpcklqdq  xmm9,xmm0
-  punpckhbw   xmm2,xmm1
-  punpcklbw   xmm14,xmm1
-  movd        xmm0,eax
-  movsx       eax,word [rsp + 0C8h + 38h + 160] ; iBeta
-  punpckhbw   xmm13,xmm1
-  punpckhbw   xmm15,xmm1
-  movdqa      xmm3,xmm9
-  movdqa      [rsp+10h],xmm2
-  punpcklwd   xmm0,xmm0
-  punpckhbw   xmm9,xmm1
-  punpcklbw   xmm3,xmm1
-  movdqa      xmm1,xmm14
-  pshufd      xmm10,xmm0,0
-  movd        xmm0,eax
-  mov         eax,4
-  cwde
-  punpcklwd   xmm0,xmm0
-  pshufd      xmm8,xmm0,0
-  movd        xmm0,eax
-  punpcklwd   xmm0,xmm0
-  pshufd      xmm5,xmm0,0
-  psubw       xmm1,xmm12
-  movdqa      xmm2,xmm10
-  lea         r11,[rsp+0C8h]
-  psllw       xmm1,2
-  movdqa      xmm0,xmm4
-  psubw       xmm4,xmm12
-  psubw       xmm0,xmm3
-  psubw       xmm3,xmm14
-  paddw       xmm1,xmm0
-  paddw       xmm1,xmm5
-  movdqa      xmm0,xmm11
-  psraw       xmm1,3
-  pmaxsw      xmm0,xmm1
-  pminsw      xmm6,xmm0
-  movdqa      xmm1,xmm8
-  movdqa      xmm0,xmm12
-  psubw       xmm0,xmm14
-  pabsw       xmm0,xmm0
-  pcmpgtw     xmm2,xmm0
-  pabsw       xmm0,xmm4
-  pcmpgtw     xmm1,xmm0
-  pabsw       xmm0,xmm3
-  movdqa      xmm3,[rsp]
-  pand        xmm2,xmm1
-  movdqa      xmm1,xmm8
-  pcmpgtw     xmm1,xmm0
-  movdqa      xmm0,xmm13
-  pand        xmm2,xmm1
-  psubw       xmm0,xmm9
-  psubw       xmm13,xmm15
-  pand        xmm2,xmm7
-  pand        xmm6,xmm2
-  paddw       xmm12,xmm6
-  psubw       xmm14,xmm6
-  movdqa      xmm2,[rsp+10h]
-  movaps      xmm6,[r11-18h]
-  movdqa      xmm1,xmm2
-  psubw       xmm1,xmm15
-  psubw       xmm9,xmm2
-  psllw       xmm1,2
-  paddw       xmm1,xmm0
-  paddw       xmm1,xmm5
-  movdqa      xmm0,xmm15
-  psubw       xmm0,xmm2
-  psraw       xmm1,3
-  pmaxsw      xmm11,xmm1
-  pabsw       xmm0,xmm0
-  movdqa      xmm1,xmm8
-  pcmpgtw     xmm10,xmm0
-  pabsw       xmm0,xmm13
-  pminsw      xmm3,xmm11
-  movaps      xmm11,[r11-68h]
-  movaps      xmm13,[rsp+40h]
-  pcmpgtw     xmm1,xmm0
-  pabsw       xmm0,xmm9
-  movaps      xmm9, [r11-48h]
-  pand        xmm10,xmm1
-  pcmpgtw     xmm8,xmm0
-  pand        xmm10,xmm8
-  pand        xmm10,xmm7
-  movaps      xmm8,[r11-38h]
-  movaps      xmm7,[r11-28h]
-  pand        xmm3,xmm10
-  paddw       xmm15,xmm3
-  psubw       xmm2,xmm3
-  movaps      xmm10,[r11-58h]
-  packuswb    xmm12,xmm15
-  movaps      xmm15,[rsp+20h]
-  packuswb    xmm14,xmm2
-  movq        [rcx],xmm12
-  movq        [rbx],xmm14
-  psrldq      xmm12,8
-  psrldq      xmm14,8
-  movq        [rdx],xmm12
-  movaps      xmm12,[r11-78h]
-  movq        [rdi],xmm14
-  movaps      xmm14,[rsp+30h]
-  mov         rsp,r11
-  POP_XMM
-  pop         rdi
-  pop         rbx
-  ret
+    mov         rax,rsp
+    push        rbx
+    push        rdi
+    PUSH_XMM 16
+    sub         rsp,0C8h
+    mov         r10,qword [rax + 30h]  ; pTC
+    pxor        xmm1,xmm1
+    mov         rbx,rcx
+    movsxd      r11,r8d
+    movsx       ecx,byte [r10]
+    movsx       r8d,byte [r10+2]
+    mov         rdi,rdx
+    movq        xmm2,[rbx]
+    movq        xmm9,[r11+rbx]
+    movsx       edx,byte [r10+1]
+    mov         word [rsp+2],cx
+    mov         word [rsp],cx
+    movsx       eax,byte [r10+3]
+    mov         word [rsp+6],dx
+    mov         word [rsp+4],dx
+    movdqa      xmm11,xmm1
+    mov         word [rsp+0Eh],ax
+    mov         word [rsp+0Ch],ax
+    lea         eax,[r11+r11]
+    movsxd      rcx,eax
+    mov         rax,rbx
+    mov         rdx,rdi
+    sub         rax,rcx
+    mov         word [rsp+0Ah],r8w
+    mov         word [rsp+8],r8w
+    movdqa      xmm6,[rsp]
+    movdqa      xmm7,xmm6
+    movq        xmm13, [rax]
+    mov         rax,rdi
+    sub         rax,rcx
+    mov         rcx,rbx
+    pcmpgtw     xmm7,xmm1
+    psubw       xmm11,xmm6
+    sub         rcx,r11
+    sub         rdx,r11
+    movq        xmm0,[rax]
+    movsx       eax,r9w
+    movq        xmm15,[rcx]
+    punpcklqdq  xmm13,xmm0
+    movq        xmm0, [rdx]
+    movdqa      xmm4,xmm13
+    punpcklqdq  xmm15,xmm0
+    movq        xmm0, [rdi]
+    punpcklbw   xmm4,xmm1
+    movdqa      xmm12,xmm15
+    punpcklqdq  xmm2,xmm0
+    movq        xmm0, [r11+rdi]
+    punpcklbw   xmm12,xmm1
+    movdqa      xmm14,xmm2
+    punpcklqdq  xmm9,xmm0
+    punpckhbw   xmm2,xmm1
+    punpcklbw   xmm14,xmm1
+    movd        xmm0,eax
+    movsx       eax,word [rsp + 0C8h + 38h + 160] ; iBeta
+    punpckhbw   xmm13,xmm1
+    punpckhbw   xmm15,xmm1
+    movdqa      xmm3,xmm9
+    movdqa      [rsp+10h],xmm2
+    punpcklwd   xmm0,xmm0
+    punpckhbw   xmm9,xmm1
+    punpcklbw   xmm3,xmm1
+    movdqa      xmm1,xmm14
+    pshufd      xmm10,xmm0,0
+    movd        xmm0,eax
+    mov         eax,4
+    cwde
+    punpcklwd   xmm0,xmm0
+    pshufd      xmm8,xmm0,0
+    movd        xmm0,eax
+    punpcklwd   xmm0,xmm0
+    pshufd      xmm5,xmm0,0
+    psubw       xmm1,xmm12
+    movdqa      xmm2,xmm10
+    lea         r11,[rsp+0C8h]
+    psllw       xmm1,2
+    movdqa      xmm0,xmm4
+    psubw       xmm4,xmm12
+    psubw       xmm0,xmm3
+    psubw       xmm3,xmm14
+    paddw       xmm1,xmm0
+    paddw       xmm1,xmm5
+    movdqa      xmm0,xmm11
+    psraw       xmm1,3
+    pmaxsw      xmm0,xmm1
+    pminsw      xmm6,xmm0
+    movdqa      xmm1,xmm8
+    movdqa      xmm0,xmm12
+    psubw       xmm0,xmm14
+    pabsw       xmm0,xmm0
+    pcmpgtw     xmm2,xmm0
+    pabsw       xmm0,xmm4
+    pcmpgtw     xmm1,xmm0
+    pabsw       xmm0,xmm3
+    movdqa      xmm3,[rsp]
+    pand        xmm2,xmm1
+    movdqa      xmm1,xmm8
+    pcmpgtw     xmm1,xmm0
+    movdqa      xmm0,xmm13
+    pand        xmm2,xmm1
+    psubw       xmm0,xmm9
+    psubw       xmm13,xmm15
+    pand        xmm2,xmm7
+    pand        xmm6,xmm2
+    paddw       xmm12,xmm6
+    psubw       xmm14,xmm6
+    movdqa      xmm2,[rsp+10h]
+    movaps      xmm6,[r11-18h]
+    movdqa      xmm1,xmm2
+    psubw       xmm1,xmm15
+    psubw       xmm9,xmm2
+    psllw       xmm1,2
+    paddw       xmm1,xmm0
+    paddw       xmm1,xmm5
+    movdqa      xmm0,xmm15
+    psubw       xmm0,xmm2
+    psraw       xmm1,3
+    pmaxsw      xmm11,xmm1
+    pabsw       xmm0,xmm0
+    movdqa      xmm1,xmm8
+    pcmpgtw     xmm10,xmm0
+    pabsw       xmm0,xmm13
+    pminsw      xmm3,xmm11
+    movaps      xmm11,[r11-68h]
+    movaps      xmm13,[rsp+40h]
+    pcmpgtw     xmm1,xmm0
+    pabsw       xmm0,xmm9
+    movaps      xmm9, [r11-48h]
+    pand        xmm10,xmm1
+    pcmpgtw     xmm8,xmm0
+    pand        xmm10,xmm8
+    pand        xmm10,xmm7
+    movaps      xmm8,[r11-38h]
+    movaps      xmm7,[r11-28h]
+    pand        xmm3,xmm10
+    paddw       xmm15,xmm3
+    psubw       xmm2,xmm3
+    movaps      xmm10,[r11-58h]
+    packuswb    xmm12,xmm15
+    movaps      xmm15,[rsp+20h]
+    packuswb    xmm14,xmm2
+    movq        [rcx],xmm12
+    movq        [rbx],xmm14
+    psrldq      xmm12,8
+    psrldq      xmm14,8
+    movq        [rdx],xmm12
+    movaps      xmm12,[r11-78h]
+    movq        [rdi],xmm14
+    movaps      xmm14,[rsp+30h]
+    mov         rsp,r11
+    POP_XMM
+    pop         rdi
+    pop         rbx
+    ret
 
 
 WELS_EXTERN   DeblockChromaEq4V_ssse3
-  mov         rax,rsp
-  push        rbx
-  PUSH_XMM 15
-  sub         rsp,90h
-  pxor        xmm1,xmm1
-  mov         r11,rcx
-  mov         rbx,rdx
-  mov         r10d,r9d
-  movq        xmm13,[r11]
-  lea         eax,[r8+r8]
-  movsxd      r9,eax
-  mov         rax,rcx
-  sub         rax,r9
-  movq        xmm14,[rax]
-  mov         rax,rdx
-  sub         rax,r9
-  movq        xmm0,[rax]
-  movsxd      rax,r8d
-  sub         rcx,rax
-  sub         rdx,rax
-  movq        xmm12,[rax+r11]
-  movq        xmm10,[rcx]
-  punpcklqdq  xmm14,xmm0
-  movdqa      xmm8,xmm14
-  movq        xmm0,[rdx]
-  punpcklbw   xmm8,xmm1
-  punpckhbw   xmm14,xmm1
-  punpcklqdq  xmm10,xmm0
-  movq        xmm0,[rbx]
-  movdqa      xmm5,xmm10
-  punpcklqdq  xmm13,xmm0
-  movq        xmm0, [rax+rbx]
-  punpcklbw   xmm5,xmm1
-  movsx       eax,r10w
-  movdqa      xmm9,xmm13
-  punpcklqdq  xmm12,xmm0
-  punpcklbw   xmm9,xmm1
-  punpckhbw   xmm10,xmm1
-  movd        xmm0,eax
-  movsx       eax,word [rsp + 90h + 8h + 28h + 144]   ; iBeta
-  punpckhbw   xmm13,xmm1
-  movdqa      xmm7,xmm12
-  punpcklwd   xmm0,xmm0
-  punpckhbw   xmm12,xmm1
-  pshufd      xmm11,xmm0,0
-  punpcklbw   xmm7,xmm1
-  movd        xmm0,eax
-  movdqa      xmm1,xmm8
-  psubw       xmm1,xmm5
-  punpcklwd   xmm0,xmm0
-  movdqa      xmm6,xmm11
-  pshufd      xmm3,xmm0,0
-  movdqa      xmm0,xmm5
-  psubw       xmm0,xmm9
-  movdqa      xmm2,xmm3
-  pabsw       xmm0,xmm0
-  pcmpgtw     xmm6,xmm0
-  pabsw       xmm0,xmm1
-  movdqa      xmm1,xmm3
-  pcmpgtw     xmm2,xmm0
-  pand        xmm6,xmm2
-  movdqa      xmm0,xmm7
-  movdqa      xmm2,xmm3
-  psubw       xmm0,xmm9
-  pabsw       xmm0,xmm0
-  pcmpgtw     xmm1,xmm0
-  pand        xmm6,xmm1
-  movdqa      xmm0,xmm10
-  movdqa      xmm1,xmm14
-  psubw       xmm0,xmm13
-  psubw       xmm1,xmm10
-  pabsw       xmm0,xmm0
-  pcmpgtw     xmm11,xmm0
-  pabsw       xmm0,xmm1
-  pcmpgtw     xmm2,xmm0
-  pand        xmm11,xmm2
-  movdqa      xmm0,xmm12
-  movdqa      xmm4,xmm6
-  movdqa      xmm1,xmm8
-  mov         eax,2
-  cwde
-  paddw       xmm1,xmm8
-  psubw       xmm0,xmm13
-  paddw       xmm1,xmm5
-  pabsw       xmm0,xmm0
-  movdqa      xmm2,xmm14
-  paddw       xmm1,xmm7
-  pcmpgtw     xmm3,xmm0
-  paddw       xmm2,xmm14
-  movd        xmm0,eax
-  pand        xmm11,xmm3
-  paddw       xmm7,xmm7
-  paddw       xmm2,xmm10
-  punpcklwd   xmm0,xmm0
-  paddw       xmm2,xmm12
-  paddw       xmm12,xmm12
-  pshufd      xmm3,xmm0,0
-  paddw       xmm7,xmm9
-  paddw       xmm12,xmm13
-  movdqa      xmm0,xmm6
-  paddw       xmm1,xmm3
-  pandn       xmm0,xmm5
-  paddw       xmm7,xmm8
-  psraw       xmm1,2
-  paddw       xmm12,xmm14
-  paddw       xmm7,xmm3
-  movaps      xmm14,[rsp]
-  pand        xmm4,xmm1
-  paddw       xmm12,xmm3
-  psraw       xmm7,2
-  movdqa      xmm1,xmm11
-  por         xmm4,xmm0
-  psraw       xmm12,2
-  paddw       xmm2,xmm3
-  movdqa      xmm0,xmm11
-  pandn       xmm0,xmm10
-  psraw       xmm2,2
-  pand        xmm1,xmm2
-  por         xmm1,xmm0
-  packuswb    xmm4,xmm1
-  movdqa      xmm0,xmm11
-  movdqa      xmm1,xmm6
-  pand        xmm1,xmm7
-  movaps      xmm7,[rsp+70h]
-  movq        [rcx],xmm4
-  pandn       xmm6,xmm9
-  pandn       xmm11,xmm13
-  pand        xmm0,xmm12
-  por         xmm1,xmm6
-  por         xmm0,xmm11
-  psrldq      xmm4,8
-  packuswb    xmm1,xmm0
-  movq        [r11],xmm1
-  psrldq      xmm1,8
-  movq        [rdx],xmm4
-  lea         r11,[rsp+90h]
-  movaps      xmm6,[r11-10h]
-  movaps      xmm8,[r11-30h]
-  movaps      xmm9,[r11-40h]
-  movq        [rbx],xmm1
-  movaps      xmm10,[r11-50h]
-  movaps      xmm11,[r11-60h]
-  movaps      xmm12,[r11-70h]
-  movaps      xmm13,[r11-80h]
-  mov         rsp,r11
-  POP_XMM
-  pop         rbx
-  ret
+    mov         rax,rsp
+    push        rbx
+    PUSH_XMM 15
+    sub         rsp,90h
+    pxor        xmm1,xmm1
+    mov         r11,rcx
+    mov         rbx,rdx
+    mov         r10d,r9d
+    movq        xmm13,[r11]
+    lea         eax,[r8+r8]
+    movsxd      r9,eax
+    mov         rax,rcx
+    sub         rax,r9
+    movq        xmm14,[rax]
+    mov         rax,rdx
+    sub         rax,r9
+    movq        xmm0,[rax]
+    movsxd      rax,r8d
+    sub         rcx,rax
+    sub         rdx,rax
+    movq        xmm12,[rax+r11]
+    movq        xmm10,[rcx]
+    punpcklqdq  xmm14,xmm0
+    movdqa      xmm8,xmm14
+    movq        xmm0,[rdx]
+    punpcklbw   xmm8,xmm1
+    punpckhbw   xmm14,xmm1
+    punpcklqdq  xmm10,xmm0
+    movq        xmm0,[rbx]
+    movdqa      xmm5,xmm10
+    punpcklqdq  xmm13,xmm0
+    movq        xmm0, [rax+rbx]
+    punpcklbw   xmm5,xmm1
+    movsx       eax,r10w
+    movdqa      xmm9,xmm13
+    punpcklqdq  xmm12,xmm0
+    punpcklbw   xmm9,xmm1
+    punpckhbw   xmm10,xmm1
+    movd        xmm0,eax
+    movsx       eax,word [rsp + 90h + 8h + 28h + 144]   ; iBeta
+    punpckhbw   xmm13,xmm1
+    movdqa      xmm7,xmm12
+    punpcklwd   xmm0,xmm0
+    punpckhbw   xmm12,xmm1
+    pshufd      xmm11,xmm0,0
+    punpcklbw   xmm7,xmm1
+    movd        xmm0,eax
+    movdqa      xmm1,xmm8
+    psubw       xmm1,xmm5
+    punpcklwd   xmm0,xmm0
+    movdqa      xmm6,xmm11
+    pshufd      xmm3,xmm0,0
+    movdqa      xmm0,xmm5
+    psubw       xmm0,xmm9
+    movdqa      xmm2,xmm3
+    pabsw       xmm0,xmm0
+    pcmpgtw     xmm6,xmm0
+    pabsw       xmm0,xmm1
+    movdqa      xmm1,xmm3
+    pcmpgtw     xmm2,xmm0
+    pand        xmm6,xmm2
+    movdqa      xmm0,xmm7
+    movdqa      xmm2,xmm3
+    psubw       xmm0,xmm9
+    pabsw       xmm0,xmm0
+    pcmpgtw     xmm1,xmm0
+    pand        xmm6,xmm1
+    movdqa      xmm0,xmm10
+    movdqa      xmm1,xmm14
+    psubw       xmm0,xmm13
+    psubw       xmm1,xmm10
+    pabsw       xmm0,xmm0
+    pcmpgtw     xmm11,xmm0
+    pabsw       xmm0,xmm1
+    pcmpgtw     xmm2,xmm0
+    pand        xmm11,xmm2
+    movdqa      xmm0,xmm12
+    movdqa      xmm4,xmm6
+    movdqa      xmm1,xmm8
+    mov         eax,2
+    cwde
+    paddw       xmm1,xmm8
+    psubw       xmm0,xmm13
+    paddw       xmm1,xmm5
+    pabsw       xmm0,xmm0
+    movdqa      xmm2,xmm14
+    paddw       xmm1,xmm7
+    pcmpgtw     xmm3,xmm0
+    paddw       xmm2,xmm14
+    movd        xmm0,eax
+    pand        xmm11,xmm3
+    paddw       xmm7,xmm7
+    paddw       xmm2,xmm10
+    punpcklwd   xmm0,xmm0
+    paddw       xmm2,xmm12
+    paddw       xmm12,xmm12
+    pshufd      xmm3,xmm0,0
+    paddw       xmm7,xmm9
+    paddw       xmm12,xmm13
+    movdqa      xmm0,xmm6
+    paddw       xmm1,xmm3
+    pandn       xmm0,xmm5
+    paddw       xmm7,xmm8
+    psraw       xmm1,2
+    paddw       xmm12,xmm14
+    paddw       xmm7,xmm3
+    movaps      xmm14,[rsp]
+    pand        xmm4,xmm1
+    paddw       xmm12,xmm3
+    psraw       xmm7,2
+    movdqa      xmm1,xmm11
+    por         xmm4,xmm0
+    psraw       xmm12,2
+    paddw       xmm2,xmm3
+    movdqa      xmm0,xmm11
+    pandn       xmm0,xmm10
+    psraw       xmm2,2
+    pand        xmm1,xmm2
+    por         xmm1,xmm0
+    packuswb    xmm4,xmm1
+    movdqa      xmm0,xmm11
+    movdqa      xmm1,xmm6
+    pand        xmm1,xmm7
+    movaps      xmm7,[rsp+70h]
+    movq        [rcx],xmm4
+    pandn       xmm6,xmm9
+    pandn       xmm11,xmm13
+    pand        xmm0,xmm12
+    por         xmm1,xmm6
+    por         xmm0,xmm11
+    psrldq      xmm4,8
+    packuswb    xmm1,xmm0
+    movq        [r11],xmm1
+    psrldq      xmm1,8
+    movq        [rdx],xmm4
+    lea         r11,[rsp+90h]
+    movaps      xmm6,[r11-10h]
+    movaps      xmm8,[r11-30h]
+    movaps      xmm9,[r11-40h]
+    movq        [rbx],xmm1
+    movaps      xmm10,[r11-50h]
+    movaps      xmm11,[r11-60h]
+    movaps      xmm12,[r11-70h]
+    movaps      xmm13,[r11-80h]
+    mov         rsp,r11
+    POP_XMM
+    pop         rbx
+    ret
 
 
 
@@ -1089,548 +1089,548 @@
 
 
 WELS_EXTERN   DeblockChromaEq4H_ssse3
-  mov         rax,rsp
-  mov         [rax+20h],rbx
-  push        rdi
-  PUSH_XMM 16
-  sub         rsp,140h
-  mov         rdi,rdx
-  lea         eax,[r8*4]
-  movsxd      r10,eax
-  mov         eax,[rcx-2]
-  mov         [rsp+10h],eax
-  lea         rbx,[r10+rdx-2]
-  lea         r11,[r10+rcx-2]
-  movdqa      xmm5,[rsp+10h]
-  movsxd      r10,r8d
-  mov         eax,[r10+rcx-2]
-  lea         rdx,[r10+r10*2]
-  mov         [rsp+20h],eax
-  mov         eax,[rcx+r10*2-2]
-  mov         [rsp+30h],eax
-  mov         eax,[rdx+rcx-2]
-  movdqa      xmm2,[rsp+20h]
-  mov         [rsp+40h],eax
-  mov         eax, [rdi-2]
-  movdqa      xmm4,[rsp+30h]
-  mov         [rsp+50h],eax
-  mov         eax,[r10+rdi-2]
-  movdqa      xmm3,[rsp+40h]
-  mov         [rsp+60h],eax
-  mov         eax,[rdi+r10*2-2]
-  punpckldq   xmm5,[rsp+50h]
-  mov         [rsp+70h],eax
-  mov         eax, [rdx+rdi-2]
-  punpckldq   xmm2, [rsp+60h]
-  mov          [rsp+80h],eax
-  mov         eax,[r11]
-  punpckldq   xmm4, [rsp+70h]
-  mov         [rsp+50h],eax
-  mov         eax,[rbx]
-  punpckldq   xmm3,[rsp+80h]
-  mov         [rsp+60h],eax
-  mov         eax,[r10+r11]
-  movdqa      xmm0, [rsp+50h]
-  punpckldq   xmm0, [rsp+60h]
-  punpcklqdq  xmm5,xmm0
-  movdqa      [rsp+50h],xmm0
-  mov         [rsp+50h],eax
-  mov         eax,[r10+rbx]
-  movdqa      xmm0,[rsp+50h]
-  movdqa      xmm1,xmm5
-  mov         [rsp+60h],eax
-  mov         eax,[r11+r10*2]
-  punpckldq   xmm0, [rsp+60h]
-  punpcklqdq  xmm2,xmm0
-  punpcklbw   xmm1,xmm2
-  punpckhbw   xmm5,xmm2
-  movdqa      [rsp+50h],xmm0
-  mov         [rsp+50h],eax
-  mov         eax,[rbx+r10*2]
-  movdqa      xmm0,[rsp+50h]
-  mov         [rsp+60h],eax
-  mov         eax, [rdx+r11]
-  movdqa      xmm15,xmm1
-  punpckldq   xmm0,[rsp+60h]
-  punpcklqdq  xmm4,xmm0
-  movdqa      [rsp+50h],xmm0
-  mov         [rsp+50h],eax
-  mov         eax, [rdx+rbx]
-  movdqa      xmm0,[rsp+50h]
-  mov         [rsp+60h],eax
-  punpckldq   xmm0, [rsp+60h]
-  punpcklqdq  xmm3,xmm0
-  movdqa      xmm0,xmm4
-  punpcklbw   xmm0,xmm3
-  punpckhbw   xmm4,xmm3
-  punpcklwd   xmm15,xmm0
-  punpckhwd   xmm1,xmm0
-  movdqa      xmm0,xmm5
-  movdqa      xmm12,xmm15
-  punpcklwd   xmm0,xmm4
-  punpckhwd   xmm5,xmm4
-  punpckldq   xmm12,xmm0
-  punpckhdq   xmm15,xmm0
-  movdqa      xmm0,xmm1
-  movdqa      xmm11,xmm12
-  punpckldq   xmm0,xmm5
-  punpckhdq   xmm1,xmm5
-  punpcklqdq  xmm11,xmm0
-  punpckhqdq  xmm12,xmm0
-  movsx       eax,r9w
-  movdqa      xmm14,xmm15
-  punpcklqdq  xmm14,xmm1
-  punpckhqdq  xmm15,xmm1
-  pxor        xmm1,xmm1
-  movd        xmm0,eax
-  movdqa      xmm4,xmm12
-  movdqa      xmm8,xmm11
-  movsx       eax,word [rsp+170h + 160] ; iBeta
-  punpcklwd   xmm0,xmm0
-  punpcklbw   xmm4,xmm1
-  punpckhbw   xmm12,xmm1
-  movdqa      xmm9,xmm14
-  movdqa      xmm7,xmm15
-  movdqa      xmm10,xmm15
-  pshufd      xmm13,xmm0,0
-  punpcklbw   xmm9,xmm1
-  punpckhbw   xmm14,xmm1
-  movdqa      xmm6,xmm13
-  movd        xmm0,eax
-  movdqa      [rsp],xmm11
-  mov         eax,2
-  cwde
-  punpckhbw   xmm11,xmm1
-  punpckhbw   xmm10,xmm1
-  punpcklbw   xmm7,xmm1
-  punpcklwd   xmm0,xmm0
-  punpcklbw   xmm8,xmm1
-  pshufd      xmm3,xmm0,0
-  movdqa      xmm1,xmm8
-  movdqa      xmm0,xmm4
-  psubw       xmm0,xmm9
-  psubw       xmm1,xmm4
-  movdqa      xmm2,xmm3
-  pabsw       xmm0,xmm0
-  pcmpgtw     xmm6,xmm0
-  pabsw       xmm0,xmm1
-  movdqa      xmm1,xmm3
-  pcmpgtw     xmm2,xmm0
-  pand        xmm6,xmm2
-  movdqa      xmm0,xmm7
-  movdqa      xmm2,xmm3
-  psubw       xmm0,xmm9
-  pabsw       xmm0,xmm0
-  pcmpgtw     xmm1,xmm0
-  pand        xmm6,xmm1
-  movdqa      xmm0,xmm12
-  movdqa      xmm1,xmm11
-  psubw       xmm0,xmm14
-  psubw       xmm1,xmm12
-  movdqa      xmm5,xmm6
-  pabsw       xmm0,xmm0
-  pcmpgtw     xmm13,xmm0
-  pabsw       xmm0,xmm1
-  movdqa      xmm1,xmm8
-  pcmpgtw     xmm2,xmm0
-  paddw       xmm1,xmm8
-  movdqa      xmm0,xmm10
-  pand        xmm13,xmm2
-  psubw       xmm0,xmm14
-  paddw       xmm1,xmm4
-  movdqa      xmm2,xmm11
-  pabsw       xmm0,xmm0
-  paddw       xmm2,xmm11
-  paddw       xmm1,xmm7
-  pcmpgtw     xmm3,xmm0
-  paddw       xmm2,xmm12
-  movd        xmm0,eax
-  pand        xmm13,xmm3
-  paddw       xmm2,xmm10
-  punpcklwd   xmm0,xmm0
-  pshufd      xmm3,xmm0,0
-  movdqa      xmm0,xmm6
-  paddw       xmm1,xmm3
-  pandn       xmm0,xmm4
-  paddw       xmm2,xmm3
-  psraw       xmm1,2
-  pand        xmm5,xmm1
-  por         xmm5,xmm0
-  paddw       xmm7,xmm7
-  paddw       xmm10,xmm10
-  psraw       xmm2,2
-  movdqa      xmm1,xmm13
-  movdqa      xmm0,xmm13
-  pandn       xmm0,xmm12
-  pand        xmm1,xmm2
-  paddw       xmm7,xmm9
-  por         xmm1,xmm0
-  paddw       xmm10,xmm14
-  paddw       xmm7,xmm8
-  movdqa      xmm0,xmm13
-  packuswb    xmm5,xmm1
-  paddw       xmm7,xmm3
-  paddw       xmm10,xmm11
-  movdqa      xmm1,xmm6
-  paddw       xmm10,xmm3
-  pandn       xmm6,xmm9
-  psraw       xmm7,2
-  pand        xmm1,xmm7
-  psraw       xmm10,2
-  pandn       xmm13,xmm14
-  pand        xmm0,xmm10
-  por         xmm1,xmm6
-  movdqa      xmm6,[rsp]
-  movdqa      xmm4,xmm6
-  por         xmm0,xmm13
-  punpcklbw   xmm4,xmm5
-  punpckhbw   xmm6,xmm5
-  movdqa      xmm3,xmm4
-  packuswb    xmm1,xmm0
-  movdqa      xmm0,xmm1
-  punpckhbw   xmm1,xmm15
-  punpcklbw   xmm0,xmm15
-  punpcklwd   xmm3,xmm0
-  punpckhwd   xmm4,xmm0
-  movdqa      xmm0,xmm6
-  movdqa      xmm2,xmm3
-  punpcklwd   xmm0,xmm1
-  punpckhwd   xmm6,xmm1
-  movdqa      xmm1,xmm4
-  punpckldq   xmm2,xmm0
-  punpckhdq   xmm3,xmm0
-  punpckldq   xmm1,xmm6
-  movdqa      xmm0,xmm2
-  punpcklqdq  xmm0,xmm1
-  punpckhdq   xmm4,xmm6
-  punpckhqdq  xmm2,xmm1
-  movdqa      [rsp+10h],xmm0
-  movdqa      [rsp+60h],xmm2
-  movdqa      xmm0,xmm3
-  mov         eax,[rsp+10h]
-  mov         [rcx-2],eax
-  mov         eax,[rsp+60h]
-  punpcklqdq  xmm0,xmm4
-  punpckhqdq  xmm3,xmm4
-  mov         [r10+rcx-2],eax
-  movdqa      [rsp+20h],xmm0
-  mov         eax, [rsp+20h]
-  movdqa      [rsp+70h],xmm3
-  mov         [rcx+r10*2-2],eax
-  mov         eax,[rsp+70h]
-  mov         [rdx+rcx-2],eax
-  mov         eax,[rsp+18h]
-  mov         [r11],eax
-  mov         eax,[rsp+68h]
-  mov         [r10+r11],eax
-  mov         eax,[rsp+28h]
-  mov         [r11+r10*2],eax
-  mov         eax,[rsp+78h]
-  mov         [rdx+r11],eax
-  mov         eax,[rsp+14h]
-  mov         [rdi-2],eax
-  mov         eax,[rsp+64h]
-  mov         [r10+rdi-2],eax
-  mov         eax,[rsp+24h]
-  mov         [rdi+r10*2-2],eax
-  mov         eax, [rsp+74h]
-  mov         [rdx+rdi-2],eax
-  mov         eax, [rsp+1Ch]
-  mov         [rbx],eax
-  mov         eax, [rsp+6Ch]
-  mov         [r10+rbx],eax
-  mov         eax,[rsp+2Ch]
-  mov         [rbx+r10*2],eax
-  mov         eax,[rsp+7Ch]
-  mov         [rdx+rbx],eax
-  lea         rsp,[rsp+140h]
-  POP_XMM
-  mov         rbx, [rsp+28h]
-  pop         rdi
-  ret
+    mov         rax,rsp
+    mov         [rax+20h],rbx
+    push        rdi
+    PUSH_XMM 16
+    sub         rsp,140h
+    mov         rdi,rdx
+    lea         eax,[r8*4]
+    movsxd      r10,eax
+    mov         eax,[rcx-2]
+    mov         [rsp+10h],eax
+    lea         rbx,[r10+rdx-2]
+    lea         r11,[r10+rcx-2]
+    movdqa      xmm5,[rsp+10h]
+    movsxd      r10,r8d
+    mov         eax,[r10+rcx-2]
+    lea         rdx,[r10+r10*2]
+    mov         [rsp+20h],eax
+    mov         eax,[rcx+r10*2-2]
+    mov         [rsp+30h],eax
+    mov         eax,[rdx+rcx-2]
+    movdqa      xmm2,[rsp+20h]
+    mov         [rsp+40h],eax
+    mov         eax, [rdi-2]
+    movdqa      xmm4,[rsp+30h]
+    mov         [rsp+50h],eax
+    mov         eax,[r10+rdi-2]
+    movdqa      xmm3,[rsp+40h]
+    mov         [rsp+60h],eax
+    mov         eax,[rdi+r10*2-2]
+    punpckldq   xmm5,[rsp+50h]
+    mov         [rsp+70h],eax
+    mov         eax, [rdx+rdi-2]
+    punpckldq   xmm2, [rsp+60h]
+    mov          [rsp+80h],eax
+    mov         eax,[r11]
+    punpckldq   xmm4, [rsp+70h]
+    mov         [rsp+50h],eax
+    mov         eax,[rbx]
+    punpckldq   xmm3,[rsp+80h]
+    mov         [rsp+60h],eax
+    mov         eax,[r10+r11]
+    movdqa      xmm0, [rsp+50h]
+    punpckldq   xmm0, [rsp+60h]
+    punpcklqdq  xmm5,xmm0
+    movdqa      [rsp+50h],xmm0
+    mov         [rsp+50h],eax
+    mov         eax,[r10+rbx]
+    movdqa      xmm0,[rsp+50h]
+    movdqa      xmm1,xmm5
+    mov         [rsp+60h],eax
+    mov         eax,[r11+r10*2]
+    punpckldq   xmm0, [rsp+60h]
+    punpcklqdq  xmm2,xmm0
+    punpcklbw   xmm1,xmm2
+    punpckhbw   xmm5,xmm2
+    movdqa      [rsp+50h],xmm0
+    mov         [rsp+50h],eax
+    mov         eax,[rbx+r10*2]
+    movdqa      xmm0,[rsp+50h]
+    mov         [rsp+60h],eax
+    mov         eax, [rdx+r11]
+    movdqa      xmm15,xmm1
+    punpckldq   xmm0,[rsp+60h]
+    punpcklqdq  xmm4,xmm0
+    movdqa      [rsp+50h],xmm0
+    mov         [rsp+50h],eax
+    mov         eax, [rdx+rbx]
+    movdqa      xmm0,[rsp+50h]
+    mov         [rsp+60h],eax
+    punpckldq   xmm0, [rsp+60h]
+    punpcklqdq  xmm3,xmm0
+    movdqa      xmm0,xmm4
+    punpcklbw   xmm0,xmm3
+    punpckhbw   xmm4,xmm3
+    punpcklwd   xmm15,xmm0
+    punpckhwd   xmm1,xmm0
+    movdqa      xmm0,xmm5
+    movdqa      xmm12,xmm15
+    punpcklwd   xmm0,xmm4
+    punpckhwd   xmm5,xmm4
+    punpckldq   xmm12,xmm0
+    punpckhdq   xmm15,xmm0
+    movdqa      xmm0,xmm1
+    movdqa      xmm11,xmm12
+    punpckldq   xmm0,xmm5
+    punpckhdq   xmm1,xmm5
+    punpcklqdq  xmm11,xmm0
+    punpckhqdq  xmm12,xmm0
+    movsx       eax,r9w
+    movdqa      xmm14,xmm15
+    punpcklqdq  xmm14,xmm1
+    punpckhqdq  xmm15,xmm1
+    pxor        xmm1,xmm1
+    movd        xmm0,eax
+    movdqa      xmm4,xmm12
+    movdqa      xmm8,xmm11
+    movsx       eax,word [rsp+170h + 160] ; iBeta
+    punpcklwd   xmm0,xmm0
+    punpcklbw   xmm4,xmm1
+    punpckhbw   xmm12,xmm1
+    movdqa      xmm9,xmm14
+    movdqa      xmm7,xmm15
+    movdqa      xmm10,xmm15
+    pshufd      xmm13,xmm0,0
+    punpcklbw   xmm9,xmm1
+    punpckhbw   xmm14,xmm1
+    movdqa      xmm6,xmm13
+    movd        xmm0,eax
+    movdqa      [rsp],xmm11
+    mov         eax,2
+    cwde
+    punpckhbw   xmm11,xmm1
+    punpckhbw   xmm10,xmm1
+    punpcklbw   xmm7,xmm1
+    punpcklwd   xmm0,xmm0
+    punpcklbw   xmm8,xmm1
+    pshufd      xmm3,xmm0,0
+    movdqa      xmm1,xmm8
+    movdqa      xmm0,xmm4
+    psubw       xmm0,xmm9
+    psubw       xmm1,xmm4
+    movdqa      xmm2,xmm3
+    pabsw       xmm0,xmm0
+    pcmpgtw     xmm6,xmm0
+    pabsw       xmm0,xmm1
+    movdqa      xmm1,xmm3
+    pcmpgtw     xmm2,xmm0
+    pand        xmm6,xmm2
+    movdqa      xmm0,xmm7
+    movdqa      xmm2,xmm3
+    psubw       xmm0,xmm9
+    pabsw       xmm0,xmm0
+    pcmpgtw     xmm1,xmm0
+    pand        xmm6,xmm1
+    movdqa      xmm0,xmm12
+    movdqa      xmm1,xmm11
+    psubw       xmm0,xmm14
+    psubw       xmm1,xmm12
+    movdqa      xmm5,xmm6
+    pabsw       xmm0,xmm0
+    pcmpgtw     xmm13,xmm0
+    pabsw       xmm0,xmm1
+    movdqa      xmm1,xmm8
+    pcmpgtw     xmm2,xmm0
+    paddw       xmm1,xmm8
+    movdqa      xmm0,xmm10
+    pand        xmm13,xmm2
+    psubw       xmm0,xmm14
+    paddw       xmm1,xmm4
+    movdqa      xmm2,xmm11
+    pabsw       xmm0,xmm0
+    paddw       xmm2,xmm11
+    paddw       xmm1,xmm7
+    pcmpgtw     xmm3,xmm0
+    paddw       xmm2,xmm12
+    movd        xmm0,eax
+    pand        xmm13,xmm3
+    paddw       xmm2,xmm10
+    punpcklwd   xmm0,xmm0
+    pshufd      xmm3,xmm0,0
+    movdqa      xmm0,xmm6
+    paddw       xmm1,xmm3
+    pandn       xmm0,xmm4
+    paddw       xmm2,xmm3
+    psraw       xmm1,2
+    pand        xmm5,xmm1
+    por         xmm5,xmm0
+    paddw       xmm7,xmm7
+    paddw       xmm10,xmm10
+    psraw       xmm2,2
+    movdqa      xmm1,xmm13
+    movdqa      xmm0,xmm13
+    pandn       xmm0,xmm12
+    pand        xmm1,xmm2
+    paddw       xmm7,xmm9
+    por         xmm1,xmm0
+    paddw       xmm10,xmm14
+    paddw       xmm7,xmm8
+    movdqa      xmm0,xmm13
+    packuswb    xmm5,xmm1
+    paddw       xmm7,xmm3
+    paddw       xmm10,xmm11
+    movdqa      xmm1,xmm6
+    paddw       xmm10,xmm3
+    pandn       xmm6,xmm9
+    psraw       xmm7,2
+    pand        xmm1,xmm7
+    psraw       xmm10,2
+    pandn       xmm13,xmm14
+    pand        xmm0,xmm10
+    por         xmm1,xmm6
+    movdqa      xmm6,[rsp]
+    movdqa      xmm4,xmm6
+    por         xmm0,xmm13
+    punpcklbw   xmm4,xmm5
+    punpckhbw   xmm6,xmm5
+    movdqa      xmm3,xmm4
+    packuswb    xmm1,xmm0
+    movdqa      xmm0,xmm1
+    punpckhbw   xmm1,xmm15
+    punpcklbw   xmm0,xmm15
+    punpcklwd   xmm3,xmm0
+    punpckhwd   xmm4,xmm0
+    movdqa      xmm0,xmm6
+    movdqa      xmm2,xmm3
+    punpcklwd   xmm0,xmm1
+    punpckhwd   xmm6,xmm1
+    movdqa      xmm1,xmm4
+    punpckldq   xmm2,xmm0
+    punpckhdq   xmm3,xmm0
+    punpckldq   xmm1,xmm6
+    movdqa      xmm0,xmm2
+    punpcklqdq  xmm0,xmm1
+    punpckhdq   xmm4,xmm6
+    punpckhqdq  xmm2,xmm1
+    movdqa      [rsp+10h],xmm0
+    movdqa      [rsp+60h],xmm2
+    movdqa      xmm0,xmm3
+    mov         eax,[rsp+10h]
+    mov         [rcx-2],eax
+    mov         eax,[rsp+60h]
+    punpcklqdq  xmm0,xmm4
+    punpckhqdq  xmm3,xmm4
+    mov         [r10+rcx-2],eax
+    movdqa      [rsp+20h],xmm0
+    mov         eax, [rsp+20h]
+    movdqa      [rsp+70h],xmm3
+    mov         [rcx+r10*2-2],eax
+    mov         eax,[rsp+70h]
+    mov         [rdx+rcx-2],eax
+    mov         eax,[rsp+18h]
+    mov         [r11],eax
+    mov         eax,[rsp+68h]
+    mov         [r10+r11],eax
+    mov         eax,[rsp+28h]
+    mov         [r11+r10*2],eax
+    mov         eax,[rsp+78h]
+    mov         [rdx+r11],eax
+    mov         eax,[rsp+14h]
+    mov         [rdi-2],eax
+    mov         eax,[rsp+64h]
+    mov         [r10+rdi-2],eax
+    mov         eax,[rsp+24h]
+    mov         [rdi+r10*2-2],eax
+    mov         eax, [rsp+74h]
+    mov         [rdx+rdi-2],eax
+    mov         eax, [rsp+1Ch]
+    mov         [rbx],eax
+    mov         eax, [rsp+6Ch]
+    mov         [r10+rbx],eax
+    mov         eax,[rsp+2Ch]
+    mov         [rbx+r10*2],eax
+    mov         eax,[rsp+7Ch]
+    mov         [rdx+rbx],eax
+    lea         rsp,[rsp+140h]
+    POP_XMM
+    mov         rbx, [rsp+28h]
+    pop         rdi
+    ret
 
 
 
 WELS_EXTERN DeblockChromaLt4H_ssse3
-  mov         rax,rsp
-  push        rbx
-  push        rbp
-  push        rsi
-  push        rdi
-  push        r12
-  PUSH_XMM 16
-  sub         rsp,170h
+    mov         rax,rsp
+    push        rbx
+    push        rbp
+    push        rsi
+    push        rdi
+    push        r12
+    PUSH_XMM 16
+    sub         rsp,170h
 
-  movsxd      rsi,r8d
-  lea         eax,[r8*4]
-  mov         r11d,r9d
-  movsxd      r10,eax
-  mov         eax, [rcx-2]
-  mov         r12,rdx
-  mov         [rsp+40h],eax
-  mov         eax, [rsi+rcx-2]
-  lea         rbx,[r10+rcx-2]
-  movdqa      xmm5,[rsp+40h]
-  mov         [rsp+50h],eax
-  mov         eax, [rcx+rsi*2-2]
-  lea         rbp,[r10+rdx-2]
-  movdqa      xmm2, [rsp+50h]
-  mov         [rsp+60h],eax
-  lea         r10,[rsi+rsi*2]
-  mov         rdi,rcx
-  mov         eax,[r10+rcx-2]
-  movdqa      xmm4,[rsp+60h]
-  mov         [rsp+70h],eax
-  mov         eax,[rdx-2]
-  mov         [rsp+80h],eax
-  mov         eax, [rsi+rdx-2]
-  movdqa      xmm3,[rsp+70h]
-  mov         [rsp+90h],eax
-  mov         eax,[rdx+rsi*2-2]
-  punpckldq   xmm5,[rsp+80h]
-  mov         [rsp+0A0h],eax
-  mov         eax, [r10+rdx-2]
-  punpckldq   xmm2,[rsp+90h]
-  mov         [rsp+0B0h],eax
-  mov         eax, [rbx]
-  punpckldq   xmm4,[rsp+0A0h]
-  mov         [rsp+80h],eax
-  mov         eax,[rbp]
-  punpckldq   xmm3,[rsp+0B0h]
-  mov         [rsp+90h],eax
-  mov         eax,[rsi+rbx]
-  movdqa      xmm0,[rsp+80h]
-  punpckldq   xmm0,[rsp+90h]
-  punpcklqdq  xmm5,xmm0
-  movdqa      [rsp+80h],xmm0
-  mov         [rsp+80h],eax
-  mov         eax,[rsi+rbp]
-  movdqa      xmm0,[rsp+80h]
-  movdqa      xmm1,xmm5
-  mov         [rsp+90h],eax
-  mov         eax,[rbx+rsi*2]
-  punpckldq   xmm0,[rsp+90h]
-  punpcklqdq  xmm2,xmm0
-  punpcklbw   xmm1,xmm2
-  punpckhbw   xmm5,xmm2
-  movdqa      [rsp+80h],xmm0
-  mov         [rsp+80h],eax
-  mov         eax,[rbp+rsi*2]
-  movdqa      xmm0, [rsp+80h]
-  mov         [rsp+90h],eax
-  mov         eax,[r10+rbx]
-  movdqa      xmm7,xmm1
-  punpckldq   xmm0,[rsp+90h]
-  punpcklqdq  xmm4,xmm0
-  movdqa      [rsp+80h],xmm0
-  mov         [rsp+80h],eax
-  mov         eax, [r10+rbp]
-  movdqa      xmm0,[rsp+80h]
-  mov         [rsp+90h],eax
-  punpckldq   xmm0,[rsp+90h]
-  punpcklqdq  xmm3,xmm0
-  movdqa      xmm0,xmm4
-  punpcklbw   xmm0,xmm3
-  punpckhbw   xmm4,xmm3
-  punpcklwd   xmm7,xmm0
-  punpckhwd   xmm1,xmm0
-  movdqa      xmm0,xmm5
-  movdqa      xmm6,xmm7
-  punpcklwd   xmm0,xmm4
-  punpckhwd   xmm5,xmm4
-  punpckldq   xmm6,xmm0
-  punpckhdq   xmm7,xmm0
-  movdqa      xmm0,xmm1
-  punpckldq   xmm0,xmm5
-  mov         rax, [rsp+1C8h+160]    ; pTC
-  punpckhdq   xmm1,xmm5
-  movdqa      xmm9,xmm6
-  punpckhqdq  xmm6,xmm0
-  punpcklqdq  xmm9,xmm0
-  movdqa      xmm2,xmm7
-  movdqa      xmm13,xmm6
-  movdqa      xmm4,xmm9
-  movdqa      [rsp+10h],xmm9
-  punpcklqdq  xmm2,xmm1
-  punpckhqdq  xmm7,xmm1
-  pxor        xmm1,xmm1
-  movsx       ecx,byte [rax+3]
-  movsx       edx,byte [rax+2]
-  movsx       r8d,byte [rax+1]
-  movsx       r9d,byte [rax]
-  movdqa      xmm10,xmm1
-  movdqa      xmm15,xmm2
-  punpckhbw   xmm2,xmm1
-  punpckhbw   xmm6,xmm1
-  punpcklbw   xmm4,xmm1
-  movsx       eax,r11w
-  mov         word [rsp+0Eh],cx
-  mov         word [rsp+0Ch],cx
-  movdqa      xmm3,xmm7
-  movdqa      xmm8,xmm7
-  movdqa      [rsp+20h],xmm7
-  punpcklbw   xmm15,xmm1
-  punpcklbw   xmm13,xmm1
-  punpcklbw   xmm3,xmm1
-  mov         word [rsp+0Ah],dx
-  mov         word [rsp+8],dx
-  mov         word [rsp+6],r8w
-  movd        xmm0,eax
-  movdqa      [rsp+30h],xmm6
-  punpckhbw   xmm9,xmm1
-  punpckhbw   xmm8,xmm1
-  punpcklwd   xmm0,xmm0
-  movsx       eax,word [rsp+1C0h+160]   ; iBeta
-  mov         word [rsp+4],r8w
-  mov         word [rsp+2],r9w
-  pshufd      xmm12,xmm0,0
-  mov         word [rsp],r9w
-  movd        xmm0,eax
-  mov         eax,4
-  cwde
-  movdqa      xmm14, [rsp]
-  movdqa      [rsp],xmm2
-  movdqa      xmm2,xmm12
-  punpcklwd   xmm0,xmm0
-  pshufd      xmm11,xmm0,0
-  psubw       xmm10,xmm14
-  movd        xmm0,eax
-  movdqa      xmm7,xmm14
-  movdqa      xmm6,xmm14
-  pcmpgtw     xmm7,xmm1
-  punpcklwd   xmm0,xmm0
-  pshufd      xmm5,xmm0,0
-  movdqa      xmm0,xmm4
-  movdqa      xmm1,xmm15
-  psubw       xmm4,xmm13
-  psubw       xmm0,xmm3
-  psubw       xmm1,xmm13
-  psubw       xmm3,xmm15
-  psllw       xmm1,2
-  paddw       xmm1,xmm0
-  paddw       xmm1,xmm5
-  movdqa      xmm0,xmm10
-  psraw       xmm1,3
-  pmaxsw      xmm0,xmm1
-  pminsw      xmm6,xmm0
-  movdqa      xmm1,xmm11
-  movdqa      xmm0,xmm13
-  psubw       xmm0,xmm15
-  pabsw       xmm0,xmm0
-  pcmpgtw     xmm2,xmm0
-  pabsw       xmm0,xmm4
-  pcmpgtw     xmm1,xmm0
-  pabsw       xmm0,xmm3
-  pand        xmm2,xmm1
-  movdqa      xmm1,xmm11
-  movdqa      xmm3,[rsp+30h]
-  pcmpgtw     xmm1,xmm0
-  movdqa      xmm0,xmm9
-  pand        xmm2,xmm1
-  psubw       xmm0,xmm8
-  psubw       xmm9,xmm3
-  pand        xmm2,xmm7
-  pand        xmm6,xmm2
-  psubw       xmm15,xmm6
-  paddw       xmm13,xmm6
-  movdqa      xmm2,[rsp]
-  movdqa      xmm1,xmm2
-  psubw       xmm1,xmm3
-  psubw       xmm8,xmm2
-  psllw       xmm1,2
-  paddw       xmm1,xmm0
-  paddw       xmm1,xmm5
-  movdqa      xmm0,xmm3
-  movdqa      xmm5,[rsp+10h]
-  psubw       xmm0,xmm2
-  psraw       xmm1,3
-  movdqa      xmm4,xmm5
-  pabsw       xmm0,xmm0
-  pmaxsw      xmm10,xmm1
-  movdqa      xmm1,xmm11
-  pcmpgtw     xmm12,xmm0
-  pabsw       xmm0,xmm9
-  pminsw      xmm14,xmm10
-  pcmpgtw     xmm1,xmm0
-  pabsw       xmm0,xmm8
-  pcmpgtw     xmm11,xmm0
-  pand        xmm12,xmm1
-  movdqa      xmm1,[rsp+20h]
-  pand        xmm12,xmm11
-  pand        xmm12,xmm7
-  pand        xmm14,xmm12
-  paddw       xmm3,xmm14
-  psubw       xmm2,xmm14
-  packuswb    xmm13,xmm3
-  packuswb    xmm15,xmm2
-  punpcklbw   xmm4,xmm13
-  punpckhbw   xmm5,xmm13
-  movdqa      xmm0,xmm15
-  punpcklbw   xmm0,xmm1
-  punpckhbw   xmm15,xmm1
-  movdqa      xmm3,xmm4
-  punpcklwd   xmm3,xmm0
-  punpckhwd   xmm4,xmm0
-  movdqa      xmm0,xmm5
-  movdqa      xmm2,xmm3
-  movdqa      xmm1,xmm4
-  punpcklwd   xmm0,xmm15
-  punpckhwd   xmm5,xmm15
-  punpckldq   xmm2,xmm0
-  punpckhdq   xmm3,xmm0
-  punpckldq   xmm1,xmm5
-  movdqa      xmm0,xmm2
-  punpcklqdq  xmm0,xmm1
-  punpckhdq   xmm4,xmm5
-  punpckhqdq  xmm2,xmm1
-  movdqa      [rsp+40h],xmm0
-  movdqa      xmm0,xmm3
-  movdqa      [rsp+90h],xmm2
-  mov         eax,[rsp+40h]
-  mov         [rdi-2],eax
-  mov         eax, [rsp+90h]
-  punpcklqdq  xmm0,xmm4
-  punpckhqdq  xmm3,xmm4
-  mov         [rsi+rdi-2],eax
-  movdqa      [rsp+50h],xmm0
-  mov         eax,[rsp+50h]
-  movdqa      [rsp+0A0h],xmm3
-  mov         [rdi+rsi*2-2],eax
-  mov         eax,[rsp+0A0h]
-  mov         [r10+rdi-2],eax
-  mov         eax,[rsp+48h]
-  mov         [rbx],eax
-  mov         eax,[rsp+98h]
-  mov         [rsi+rbx],eax
-  mov         eax,[rsp+58h]
-  mov         [rbx+rsi*2],eax
-  mov         eax, [rsp+0A8h]
-  mov         [r10+rbx],eax
-  mov         eax, [rsp+44h]
-  mov         [r12-2],eax
-  mov         eax,[rsp+94h]
-  mov         [rsi+r12-2],eax
-  mov         eax,[rsp+54h]
-  mov         [r12+rsi*2-2],eax
-  mov         eax, [rsp+0A4h]
-  mov         [r10+r12-2],eax
-  mov         eax,[rsp+4Ch]
-  mov         [rbp],eax
-  mov         eax,[rsp+9Ch]
-  mov         [rsi+rbp],eax
-  mov         eax, [rsp+5Ch]
-  mov         [rbp+rsi*2],eax
-  mov         eax,[rsp+0ACh]
-  mov         [r10+rbp],eax
-  lea         r11,[rsp+170h]
-  mov         rsp,r11
-  POP_XMM
-  pop         r12
-  pop         rdi
-  pop         rsi
-  pop         rbp
-  pop         rbx
-  ret
+    movsxd      rsi,r8d
+    lea         eax,[r8*4]
+    mov         r11d,r9d
+    movsxd      r10,eax
+    mov         eax, [rcx-2]
+    mov         r12,rdx
+    mov         [rsp+40h],eax
+    mov         eax, [rsi+rcx-2]
+    lea         rbx,[r10+rcx-2]
+    movdqa      xmm5,[rsp+40h]
+    mov         [rsp+50h],eax
+    mov         eax, [rcx+rsi*2-2]
+    lea         rbp,[r10+rdx-2]
+    movdqa      xmm2, [rsp+50h]
+    mov         [rsp+60h],eax
+    lea         r10,[rsi+rsi*2]
+    mov         rdi,rcx
+    mov         eax,[r10+rcx-2]
+    movdqa      xmm4,[rsp+60h]
+    mov         [rsp+70h],eax
+    mov         eax,[rdx-2]
+    mov         [rsp+80h],eax
+    mov         eax, [rsi+rdx-2]
+    movdqa      xmm3,[rsp+70h]
+    mov         [rsp+90h],eax
+    mov         eax,[rdx+rsi*2-2]
+    punpckldq   xmm5,[rsp+80h]
+    mov         [rsp+0A0h],eax
+    mov         eax, [r10+rdx-2]
+    punpckldq   xmm2,[rsp+90h]
+    mov         [rsp+0B0h],eax
+    mov         eax, [rbx]
+    punpckldq   xmm4,[rsp+0A0h]
+    mov         [rsp+80h],eax
+    mov         eax,[rbp]
+    punpckldq   xmm3,[rsp+0B0h]
+    mov         [rsp+90h],eax
+    mov         eax,[rsi+rbx]
+    movdqa      xmm0,[rsp+80h]
+    punpckldq   xmm0,[rsp+90h]
+    punpcklqdq  xmm5,xmm0
+    movdqa      [rsp+80h],xmm0
+    mov         [rsp+80h],eax
+    mov         eax,[rsi+rbp]
+    movdqa      xmm0,[rsp+80h]
+    movdqa      xmm1,xmm5
+    mov         [rsp+90h],eax
+    mov         eax,[rbx+rsi*2]
+    punpckldq   xmm0,[rsp+90h]
+    punpcklqdq  xmm2,xmm0
+    punpcklbw   xmm1,xmm2
+    punpckhbw   xmm5,xmm2
+    movdqa      [rsp+80h],xmm0
+    mov         [rsp+80h],eax
+    mov         eax,[rbp+rsi*2]
+    movdqa      xmm0, [rsp+80h]
+    mov         [rsp+90h],eax
+    mov         eax,[r10+rbx]
+    movdqa      xmm7,xmm1
+    punpckldq   xmm0,[rsp+90h]
+    punpcklqdq  xmm4,xmm0
+    movdqa      [rsp+80h],xmm0
+    mov         [rsp+80h],eax
+    mov         eax, [r10+rbp]
+    movdqa      xmm0,[rsp+80h]
+    mov         [rsp+90h],eax
+    punpckldq   xmm0,[rsp+90h]
+    punpcklqdq  xmm3,xmm0
+    movdqa      xmm0,xmm4
+    punpcklbw   xmm0,xmm3
+    punpckhbw   xmm4,xmm3
+    punpcklwd   xmm7,xmm0
+    punpckhwd   xmm1,xmm0
+    movdqa      xmm0,xmm5
+    movdqa      xmm6,xmm7
+    punpcklwd   xmm0,xmm4
+    punpckhwd   xmm5,xmm4
+    punpckldq   xmm6,xmm0
+    punpckhdq   xmm7,xmm0
+    movdqa      xmm0,xmm1
+    punpckldq   xmm0,xmm5
+    mov         rax, [rsp+1C8h+160]    ; pTC
+    punpckhdq   xmm1,xmm5
+    movdqa      xmm9,xmm6
+    punpckhqdq  xmm6,xmm0
+    punpcklqdq  xmm9,xmm0
+    movdqa      xmm2,xmm7
+    movdqa      xmm13,xmm6
+    movdqa      xmm4,xmm9
+    movdqa      [rsp+10h],xmm9
+    punpcklqdq  xmm2,xmm1
+    punpckhqdq  xmm7,xmm1
+    pxor        xmm1,xmm1
+    movsx       ecx,byte [rax+3]
+    movsx       edx,byte [rax+2]
+    movsx       r8d,byte [rax+1]
+    movsx       r9d,byte [rax]
+    movdqa      xmm10,xmm1
+    movdqa      xmm15,xmm2
+    punpckhbw   xmm2,xmm1
+    punpckhbw   xmm6,xmm1
+    punpcklbw   xmm4,xmm1
+    movsx       eax,r11w
+    mov         word [rsp+0Eh],cx
+    mov         word [rsp+0Ch],cx
+    movdqa      xmm3,xmm7
+    movdqa      xmm8,xmm7
+    movdqa      [rsp+20h],xmm7
+    punpcklbw   xmm15,xmm1
+    punpcklbw   xmm13,xmm1
+    punpcklbw   xmm3,xmm1
+    mov         word [rsp+0Ah],dx
+    mov         word [rsp+8],dx
+    mov         word [rsp+6],r8w
+    movd        xmm0,eax
+    movdqa      [rsp+30h],xmm6
+    punpckhbw   xmm9,xmm1
+    punpckhbw   xmm8,xmm1
+    punpcklwd   xmm0,xmm0
+    movsx       eax,word [rsp+1C0h+160]   ; iBeta
+    mov         word [rsp+4],r8w
+    mov         word [rsp+2],r9w
+    pshufd      xmm12,xmm0,0
+    mov         word [rsp],r9w
+    movd        xmm0,eax
+    mov         eax,4
+    cwde
+    movdqa      xmm14, [rsp]
+    movdqa      [rsp],xmm2
+    movdqa      xmm2,xmm12
+    punpcklwd   xmm0,xmm0
+    pshufd      xmm11,xmm0,0
+    psubw       xmm10,xmm14
+    movd        xmm0,eax
+    movdqa      xmm7,xmm14
+    movdqa      xmm6,xmm14
+    pcmpgtw     xmm7,xmm1
+    punpcklwd   xmm0,xmm0
+    pshufd      xmm5,xmm0,0
+    movdqa      xmm0,xmm4
+    movdqa      xmm1,xmm15
+    psubw       xmm4,xmm13
+    psubw       xmm0,xmm3
+    psubw       xmm1,xmm13
+    psubw       xmm3,xmm15
+    psllw       xmm1,2
+    paddw       xmm1,xmm0
+    paddw       xmm1,xmm5
+    movdqa      xmm0,xmm10
+    psraw       xmm1,3
+    pmaxsw      xmm0,xmm1
+    pminsw      xmm6,xmm0
+    movdqa      xmm1,xmm11
+    movdqa      xmm0,xmm13
+    psubw       xmm0,xmm15
+    pabsw       xmm0,xmm0
+    pcmpgtw     xmm2,xmm0
+    pabsw       xmm0,xmm4
+    pcmpgtw     xmm1,xmm0
+    pabsw       xmm0,xmm3
+    pand        xmm2,xmm1
+    movdqa      xmm1,xmm11
+    movdqa      xmm3,[rsp+30h]
+    pcmpgtw     xmm1,xmm0
+    movdqa      xmm0,xmm9
+    pand        xmm2,xmm1
+    psubw       xmm0,xmm8
+    psubw       xmm9,xmm3
+    pand        xmm2,xmm7
+    pand        xmm6,xmm2
+    psubw       xmm15,xmm6
+    paddw       xmm13,xmm6
+    movdqa      xmm2,[rsp]
+    movdqa      xmm1,xmm2
+    psubw       xmm1,xmm3
+    psubw       xmm8,xmm2
+    psllw       xmm1,2
+    paddw       xmm1,xmm0
+    paddw       xmm1,xmm5
+    movdqa      xmm0,xmm3
+    movdqa      xmm5,[rsp+10h]
+    psubw       xmm0,xmm2
+    psraw       xmm1,3
+    movdqa      xmm4,xmm5
+    pabsw       xmm0,xmm0
+    pmaxsw      xmm10,xmm1
+    movdqa      xmm1,xmm11
+    pcmpgtw     xmm12,xmm0
+    pabsw       xmm0,xmm9
+    pminsw      xmm14,xmm10
+    pcmpgtw     xmm1,xmm0
+    pabsw       xmm0,xmm8
+    pcmpgtw     xmm11,xmm0
+    pand        xmm12,xmm1
+    movdqa      xmm1,[rsp+20h]
+    pand        xmm12,xmm11
+    pand        xmm12,xmm7
+    pand        xmm14,xmm12
+    paddw       xmm3,xmm14
+    psubw       xmm2,xmm14
+    packuswb    xmm13,xmm3
+    packuswb    xmm15,xmm2
+    punpcklbw   xmm4,xmm13
+    punpckhbw   xmm5,xmm13
+    movdqa      xmm0,xmm15
+    punpcklbw   xmm0,xmm1
+    punpckhbw   xmm15,xmm1
+    movdqa      xmm3,xmm4
+    punpcklwd   xmm3,xmm0
+    punpckhwd   xmm4,xmm0
+    movdqa      xmm0,xmm5
+    movdqa      xmm2,xmm3
+    movdqa      xmm1,xmm4
+    punpcklwd   xmm0,xmm15
+    punpckhwd   xmm5,xmm15
+    punpckldq   xmm2,xmm0
+    punpckhdq   xmm3,xmm0
+    punpckldq   xmm1,xmm5
+    movdqa      xmm0,xmm2
+    punpcklqdq  xmm0,xmm1
+    punpckhdq   xmm4,xmm5
+    punpckhqdq  xmm2,xmm1
+    movdqa      [rsp+40h],xmm0
+    movdqa      xmm0,xmm3
+    movdqa      [rsp+90h],xmm2
+    mov         eax,[rsp+40h]
+    mov         [rdi-2],eax
+    mov         eax, [rsp+90h]
+    punpcklqdq  xmm0,xmm4
+    punpckhqdq  xmm3,xmm4
+    mov         [rsi+rdi-2],eax
+    movdqa      [rsp+50h],xmm0
+    mov         eax,[rsp+50h]
+    movdqa      [rsp+0A0h],xmm3
+    mov         [rdi+rsi*2-2],eax
+    mov         eax,[rsp+0A0h]
+    mov         [r10+rdi-2],eax
+    mov         eax,[rsp+48h]
+    mov         [rbx],eax
+    mov         eax,[rsp+98h]
+    mov         [rsi+rbx],eax
+    mov         eax,[rsp+58h]
+    mov         [rbx+rsi*2],eax
+    mov         eax, [rsp+0A8h]
+    mov         [r10+rbx],eax
+    mov         eax, [rsp+44h]
+    mov         [r12-2],eax
+    mov         eax,[rsp+94h]
+    mov         [rsi+r12-2],eax
+    mov         eax,[rsp+54h]
+    mov         [r12+rsi*2-2],eax
+    mov         eax, [rsp+0A4h]
+    mov         [r10+r12-2],eax
+    mov         eax,[rsp+4Ch]
+    mov         [rbp],eax
+    mov         eax,[rsp+9Ch]
+    mov         [rsi+rbp],eax
+    mov         eax, [rsp+5Ch]
+    mov         [rbp+rsi*2],eax
+    mov         eax,[rsp+0ACh]
+    mov         [r10+rbp],eax
+    lea         r11,[rsp+170h]
+    mov         rsp,r11
+    POP_XMM
+    pop         r12
+    pop         rdi
+    pop         rsi
+    pop         rbp
+    pop         rbx
+    ret
 
 
 
@@ -1638,1591 +1638,1591 @@
 
 
 WELS_EXTERN   DeblockLumaLt4V_ssse3
-  push        rbp
-  mov         r11,r8  ; pTC
-  sub         rsp,1B0h
-  lea         rbp,[rsp+20h]
-  movd        xmm4,edx
-  movd        xmm2,ecx
-  mov         qword [rbp+180h],r12
-  mov         r10,rdi
-  movsxd      r12,esi
-  add         rsi,rsi
-  movsxd      rdx,esi
-  sub         r10,r12
-  movsx       r8d,byte [r11]
-  pxor        xmm3,xmm3
-  punpcklwd   xmm2,xmm2
-  movaps      [rbp+50h],xmm14
-  lea         rax,[r12+r12*2]
-  movdqa      xmm14,[rdx+rdi]
-  neg         rax
-  pshufd      xmm0,xmm2,0
-  movd        xmm2,r8d
-  movsx       rsi,byte [r11+1]
-  movsx       r8d,byte [r11+2]
-  movsx       r11d,byte [r11+3]
-  movaps      [rbp+70h],xmm12
-  movd        xmm1,esi
-  movaps      [rbp+80h],xmm11
-  movd        xmm12,r8d
-  movd        xmm11,r11d
-  movdqa      xmm5, [rax+rdi]
-  lea         rax,[r12+r12]
-  punpcklwd   xmm12,xmm12
-  neg         rax
-  punpcklwd   xmm11,xmm11
-  movaps      [rbp],xmm8
-  movdqa      xmm8, [r10]
-  punpcklwd   xmm2,xmm2
-  punpcklwd   xmm1,xmm1
-  punpcklqdq  xmm12,xmm12
-  punpcklqdq  xmm11,xmm11
-  punpcklqdq  xmm2,xmm2
-  punpcklqdq  xmm1,xmm1
-  shufps      xmm12,xmm11,88h
-  movdqa      xmm11,xmm8
-  movaps      [rbp+30h],xmm9
-  movdqa      xmm9,[rdi]
-  shufps      xmm2,xmm1,88h
-  movdqa      xmm1,xmm5
-  punpcklbw   xmm11,xmm3
-  movaps      [rbp+20h],xmm6
-  movaps      [rbp+60h],xmm13
-  movdqa      xmm13,xmm11
-  movaps      [rbp+90h],xmm10
-  movdqa      xmm10,xmm9
-  movdqa      xmm6,[rax+rdi]
-  punpcklbw   xmm1,xmm3
-  movaps      [rbp+0A0h],xmm12
-  psubw       xmm13,xmm1
-  movaps      [rbp+40h],xmm15
-  movdqa      xmm15,xmm14
-  movaps      [rbp+10h],xmm7
-  movdqa      xmm7,xmm6
-  punpcklbw   xmm10,xmm3
-  movdqa      xmm12,[r12+rdi]
-  punpcklbw   xmm7,xmm3
-  punpcklbw   xmm12,xmm3
-  punpcklbw   xmm15,xmm3
-  pabsw       xmm3,xmm13
-  movdqa      xmm13,xmm10
-  psubw       xmm13,xmm15
-  movdqa      [rbp+0F0h],xmm15
-  pabsw       xmm15,xmm13
-  movdqa      xmm13,xmm11
-  movdqa      [rbp+0B0h],xmm1
-  movdqa      xmm1,xmm0
-  pavgw       xmm13,xmm10
-  pcmpgtw     xmm1,xmm3
-  movdqa      [rbp+120h],xmm13
-  movaps      xmm13,xmm2
-  punpcklwd   xmm4,xmm4
-  movdqa      xmm3,xmm0
-  movdqa      [rbp+100h],xmm1
-  psubw       xmm13,xmm1
-  movdqa      xmm1,xmm10
-  pcmpgtw     xmm3,xmm15
-  pshufd      xmm4,xmm4,0
-  psubw       xmm1,xmm11
-  movdqa      [rbp+0D0h],xmm10
-  psubw       xmm13,xmm3
-  movdqa      [rbp+110h],xmm3
-  pabsw       xmm15,xmm1
-  movdqa      xmm3,xmm4
-  psubw       xmm10,xmm12
-  pcmpgtw     xmm3,xmm15
-  pabsw       xmm15,xmm10
-  movdqa      xmm10,xmm0
-  psllw       xmm1,2
-  movdqa      [rbp+0C0h],xmm11
-  psubw       xmm11,xmm7
-  pcmpgtw     xmm10,xmm15
-  pabsw       xmm11,xmm11
-  movdqa      xmm15,xmm0
-  pand        xmm3,xmm10
-  pcmpgtw     xmm15,xmm11
-  movaps      xmm11,xmm2
-  pxor        xmm10,xmm10
-  pand        xmm3,xmm15
-  pcmpgtw     xmm11,xmm10
-  pcmpeqw     xmm10,xmm2
-  por         xmm11,xmm10
-  pand        xmm3,xmm11
-  movdqa      xmm11,xmm7
-  psubw       xmm11,xmm12
-  pxor        xmm15,xmm15
-  paddw       xmm11,xmm1
-  psubw       xmm15,xmm13
-  movdqa      [rbp+0E0h],xmm12
-  paddw       xmm11,[FOUR_16B_SSE2]
-  pxor        xmm12,xmm12
-  psraw       xmm11,3
-  punpckhbw   xmm8,xmm12
-  pmaxsw      xmm15,xmm11
-  punpckhbw   xmm5,xmm12
-  movdqa      xmm11,xmm8
-  pminsw      xmm13,xmm15
-  psubw       xmm11,xmm5
-  punpckhbw   xmm9,xmm12
-  pand        xmm13,xmm3
-  movdqa      [rbp+130h],xmm13
-  pabsw       xmm13,xmm11
-  punpckhbw   xmm14,xmm12
-  movdqa      xmm11,xmm9
-  psubw       xmm11,xmm14
-  movdqa      xmm15,xmm0
-  movdqa      [rbp+140h],xmm14
-  pabsw       xmm14,xmm11
-  movdqa      xmm11,xmm8
-  pcmpgtw     xmm15,xmm14
-  movdqa      xmm1,[r12+rdi]
-  pavgw       xmm11,xmm9
-  movdqa      [rbp+170h],xmm11
-  movdqa      xmm10,xmm9
-  punpckhbw   xmm6,xmm12
-  psubw       xmm10,xmm8
-  punpckhbw   xmm1,xmm12
-  movdqa      xmm12,xmm0
-  movaps      xmm11,[rbp+0A0h]
-  pcmpgtw     xmm12,xmm13
-  movaps      xmm13,xmm11
-  psubw       xmm13,xmm12
-  movdqa      [rbp+160h],xmm15
-  psubw       xmm13,xmm15
-  movdqa      xmm15,xmm9
-  psubw       xmm15,xmm1
-  movdqa      [rbp+150h],xmm12
-  pabsw       xmm12,xmm10
-  pabsw       xmm14,xmm15
-  movdqa      xmm15,xmm8
-  pcmpgtw     xmm4,xmm12
-  movdqa      xmm12,xmm0
-  psubw       xmm15,xmm6
-  pcmpgtw     xmm12,xmm14
-  pabsw       xmm14,xmm15
-  psllw       xmm10,2
-  pcmpgtw     xmm0,xmm14
-  movdqa      xmm14,xmm6
-  psubw       xmm14,xmm1
-  pand        xmm4,xmm12
-  paddw       xmm14,xmm10
-  pand        xmm4,xmm0
-  paddw       xmm14,[FOUR_16B_SSE2]
-  pxor        xmm15,xmm15
-  movaps      xmm12,xmm11
-  psubw       xmm15,xmm13
-  pxor        xmm0,xmm0
-  psraw       xmm14,3
-  pcmpgtw     xmm12,xmm0
-  pcmpeqw     xmm0,xmm11
-  pmaxsw      xmm15,xmm14
-  por         xmm12,xmm0
-  movdqa      xmm0,[rbp+120h]
-  pminsw      xmm13,xmm15
-  movdqa      xmm15,[rbp+0B0h]
-  movdqa      xmm10,xmm7
-  pand        xmm4,xmm12
-  paddw       xmm15,xmm0
-  pxor        xmm12,xmm12
-  paddw       xmm10,xmm7
-  movdqa      xmm14,xmm12
-  psubw       xmm15,xmm10
-  psubw       xmm14,xmm2
-  psraw       xmm15,1
-  pmaxsw      xmm15,xmm14
-  movdqa      xmm10,xmm6
-  pminsw      xmm15,xmm2
-  paddw       xmm10,xmm6
-  pand        xmm15,xmm3
-  psubw       xmm12,xmm11
-  pand        xmm15,[rbp+100h]
-  pand        xmm13,xmm4
-  paddw       xmm7,xmm15
-  paddw       xmm8,xmm13
-  movdqa      xmm15,[rbp+170h]
-  psubw       xmm9,xmm13
-  paddw       xmm5,xmm15
-  psubw       xmm5,xmm10
-  psraw       xmm5,1
-  pmaxsw      xmm5,xmm12
-  pminsw      xmm5,xmm11
-  pand        xmm5,xmm4
-  pand        xmm5,[rbp+150h]
-  paddw       xmm6,xmm5
-  movdqa      xmm5,[rbp+0C0h]
-  packuswb    xmm7,xmm6
-  movdqa      xmm6,[rbp+130h]
-  paddw       xmm5,xmm6
-  packuswb    xmm5,xmm8
-  movdqa      xmm8,[rbp+0D0h]
-  psubw       xmm8,xmm6
-  movdqa      xmm6,[rbp+0F0h]
-  paddw       xmm6,xmm0
-  movdqa      xmm0,[rbp+0E0h]
-  packuswb    xmm8,xmm9
-  movdqa      xmm9,xmm0
-  paddw       xmm9,xmm0
-  psubw       xmm6,xmm9
-  psraw       xmm6,1
-  pmaxsw      xmm14,xmm6
-  pminsw      xmm2,xmm14
-  pand        xmm2,xmm3
-  pand        xmm2,[rbp+110h]
-  paddw       xmm0,xmm2
-  movdqa      xmm2,[rbp+140h]
-  paddw       xmm2,xmm15
-  movdqa      xmm15,xmm1
-  paddw       xmm15,xmm1
-  psubw       xmm2,xmm15
-  psraw       xmm2,1
-  pmaxsw      xmm12,xmm2
-  pminsw      xmm11,xmm12
-  pand        xmm11,xmm4
-  pand        xmm11,[rbp+160h]
-  paddw       xmm1,xmm11
-  movdqa      [rax+rdi],xmm7
-  movdqa      [r10],xmm5
-  packuswb    xmm0,xmm1
-  movdqa      [rdi],xmm8
-  movdqa      [r12+rdi],xmm0
-  mov         r12,qword [rbp+180h]
-  lea         rsp,[rbp+190h]
-  pop         rbp
-  ret
+    push        rbp
+    mov         r11,r8  ; pTC
+    sub         rsp,1B0h
+    lea         rbp,[rsp+20h]
+    movd        xmm4,edx
+    movd        xmm2,ecx
+    mov         qword [rbp+180h],r12
+    mov         r10,rdi
+    movsxd      r12,esi
+    add         rsi,rsi
+    movsxd      rdx,esi
+    sub         r10,r12
+    movsx       r8d,byte [r11]
+    pxor        xmm3,xmm3
+    punpcklwd   xmm2,xmm2
+    movaps      [rbp+50h],xmm14
+    lea         rax,[r12+r12*2]
+    movdqa      xmm14,[rdx+rdi]
+    neg         rax
+    pshufd      xmm0,xmm2,0
+    movd        xmm2,r8d
+    movsx       rsi,byte [r11+1]
+    movsx       r8d,byte [r11+2]
+    movsx       r11d,byte [r11+3]
+    movaps      [rbp+70h],xmm12
+    movd        xmm1,esi
+    movaps      [rbp+80h],xmm11
+    movd        xmm12,r8d
+    movd        xmm11,r11d
+    movdqa      xmm5, [rax+rdi]
+    lea         rax,[r12+r12]
+    punpcklwd   xmm12,xmm12
+    neg         rax
+    punpcklwd   xmm11,xmm11
+    movaps      [rbp],xmm8
+    movdqa      xmm8, [r10]
+    punpcklwd   xmm2,xmm2
+    punpcklwd   xmm1,xmm1
+    punpcklqdq  xmm12,xmm12
+    punpcklqdq  xmm11,xmm11
+    punpcklqdq  xmm2,xmm2
+    punpcklqdq  xmm1,xmm1
+    shufps      xmm12,xmm11,88h
+    movdqa      xmm11,xmm8
+    movaps      [rbp+30h],xmm9
+    movdqa      xmm9,[rdi]
+    shufps      xmm2,xmm1,88h
+    movdqa      xmm1,xmm5
+    punpcklbw   xmm11,xmm3
+    movaps      [rbp+20h],xmm6
+    movaps      [rbp+60h],xmm13
+    movdqa      xmm13,xmm11
+    movaps      [rbp+90h],xmm10
+    movdqa      xmm10,xmm9
+    movdqa      xmm6,[rax+rdi]
+    punpcklbw   xmm1,xmm3
+    movaps      [rbp+0A0h],xmm12
+    psubw       xmm13,xmm1
+    movaps      [rbp+40h],xmm15
+    movdqa      xmm15,xmm14
+    movaps      [rbp+10h],xmm7
+    movdqa      xmm7,xmm6
+    punpcklbw   xmm10,xmm3
+    movdqa      xmm12,[r12+rdi]
+    punpcklbw   xmm7,xmm3
+    punpcklbw   xmm12,xmm3
+    punpcklbw   xmm15,xmm3
+    pabsw       xmm3,xmm13
+    movdqa      xmm13,xmm10
+    psubw       xmm13,xmm15
+    movdqa      [rbp+0F0h],xmm15
+    pabsw       xmm15,xmm13
+    movdqa      xmm13,xmm11
+    movdqa      [rbp+0B0h],xmm1
+    movdqa      xmm1,xmm0
+    pavgw       xmm13,xmm10
+    pcmpgtw     xmm1,xmm3
+    movdqa      [rbp+120h],xmm13
+    movaps      xmm13,xmm2
+    punpcklwd   xmm4,xmm4
+    movdqa      xmm3,xmm0
+    movdqa      [rbp+100h],xmm1
+    psubw       xmm13,xmm1
+    movdqa      xmm1,xmm10
+    pcmpgtw     xmm3,xmm15
+    pshufd      xmm4,xmm4,0
+    psubw       xmm1,xmm11
+    movdqa      [rbp+0D0h],xmm10
+    psubw       xmm13,xmm3
+    movdqa      [rbp+110h],xmm3
+    pabsw       xmm15,xmm1
+    movdqa      xmm3,xmm4
+    psubw       xmm10,xmm12
+    pcmpgtw     xmm3,xmm15
+    pabsw       xmm15,xmm10
+    movdqa      xmm10,xmm0
+    psllw       xmm1,2
+    movdqa      [rbp+0C0h],xmm11
+    psubw       xmm11,xmm7
+    pcmpgtw     xmm10,xmm15
+    pabsw       xmm11,xmm11
+    movdqa      xmm15,xmm0
+    pand        xmm3,xmm10
+    pcmpgtw     xmm15,xmm11
+    movaps      xmm11,xmm2
+    pxor        xmm10,xmm10
+    pand        xmm3,xmm15
+    pcmpgtw     xmm11,xmm10
+    pcmpeqw     xmm10,xmm2
+    por         xmm11,xmm10
+    pand        xmm3,xmm11
+    movdqa      xmm11,xmm7
+    psubw       xmm11,xmm12
+    pxor        xmm15,xmm15
+    paddw       xmm11,xmm1
+    psubw       xmm15,xmm13
+    movdqa      [rbp+0E0h],xmm12
+    paddw       xmm11,[FOUR_16B_SSE2]
+    pxor        xmm12,xmm12
+    psraw       xmm11,3
+    punpckhbw   xmm8,xmm12
+    pmaxsw      xmm15,xmm11
+    punpckhbw   xmm5,xmm12
+    movdqa      xmm11,xmm8
+    pminsw      xmm13,xmm15
+    psubw       xmm11,xmm5
+    punpckhbw   xmm9,xmm12
+    pand        xmm13,xmm3
+    movdqa      [rbp+130h],xmm13
+    pabsw       xmm13,xmm11
+    punpckhbw   xmm14,xmm12
+    movdqa      xmm11,xmm9
+    psubw       xmm11,xmm14
+    movdqa      xmm15,xmm0
+    movdqa      [rbp+140h],xmm14
+    pabsw       xmm14,xmm11
+    movdqa      xmm11,xmm8
+    pcmpgtw     xmm15,xmm14
+    movdqa      xmm1,[r12+rdi]
+    pavgw       xmm11,xmm9
+    movdqa      [rbp+170h],xmm11
+    movdqa      xmm10,xmm9
+    punpckhbw   xmm6,xmm12
+    psubw       xmm10,xmm8
+    punpckhbw   xmm1,xmm12
+    movdqa      xmm12,xmm0
+    movaps      xmm11,[rbp+0A0h]
+    pcmpgtw     xmm12,xmm13
+    movaps      xmm13,xmm11
+    psubw       xmm13,xmm12
+    movdqa      [rbp+160h],xmm15
+    psubw       xmm13,xmm15
+    movdqa      xmm15,xmm9
+    psubw       xmm15,xmm1
+    movdqa      [rbp+150h],xmm12
+    pabsw       xmm12,xmm10
+    pabsw       xmm14,xmm15
+    movdqa      xmm15,xmm8
+    pcmpgtw     xmm4,xmm12
+    movdqa      xmm12,xmm0
+    psubw       xmm15,xmm6
+    pcmpgtw     xmm12,xmm14
+    pabsw       xmm14,xmm15
+    psllw       xmm10,2
+    pcmpgtw     xmm0,xmm14
+    movdqa      xmm14,xmm6
+    psubw       xmm14,xmm1
+    pand        xmm4,xmm12
+    paddw       xmm14,xmm10
+    pand        xmm4,xmm0
+    paddw       xmm14,[FOUR_16B_SSE2]
+    pxor        xmm15,xmm15
+    movaps      xmm12,xmm11
+    psubw       xmm15,xmm13
+    pxor        xmm0,xmm0
+    psraw       xmm14,3
+    pcmpgtw     xmm12,xmm0
+    pcmpeqw     xmm0,xmm11
+    pmaxsw      xmm15,xmm14
+    por         xmm12,xmm0
+    movdqa      xmm0,[rbp+120h]
+    pminsw      xmm13,xmm15
+    movdqa      xmm15,[rbp+0B0h]
+    movdqa      xmm10,xmm7
+    pand        xmm4,xmm12
+    paddw       xmm15,xmm0
+    pxor        xmm12,xmm12
+    paddw       xmm10,xmm7
+    movdqa      xmm14,xmm12
+    psubw       xmm15,xmm10
+    psubw       xmm14,xmm2
+    psraw       xmm15,1
+    pmaxsw      xmm15,xmm14
+    movdqa      xmm10,xmm6
+    pminsw      xmm15,xmm2
+    paddw       xmm10,xmm6
+    pand        xmm15,xmm3
+    psubw       xmm12,xmm11
+    pand        xmm15,[rbp+100h]
+    pand        xmm13,xmm4
+    paddw       xmm7,xmm15
+    paddw       xmm8,xmm13
+    movdqa      xmm15,[rbp+170h]
+    psubw       xmm9,xmm13
+    paddw       xmm5,xmm15
+    psubw       xmm5,xmm10
+    psraw       xmm5,1
+    pmaxsw      xmm5,xmm12
+    pminsw      xmm5,xmm11
+    pand        xmm5,xmm4
+    pand        xmm5,[rbp+150h]
+    paddw       xmm6,xmm5
+    movdqa      xmm5,[rbp+0C0h]
+    packuswb    xmm7,xmm6
+    movdqa      xmm6,[rbp+130h]
+    paddw       xmm5,xmm6
+    packuswb    xmm5,xmm8
+    movdqa      xmm8,[rbp+0D0h]
+    psubw       xmm8,xmm6
+    movdqa      xmm6,[rbp+0F0h]
+    paddw       xmm6,xmm0
+    movdqa      xmm0,[rbp+0E0h]
+    packuswb    xmm8,xmm9
+    movdqa      xmm9,xmm0
+    paddw       xmm9,xmm0
+    psubw       xmm6,xmm9
+    psraw       xmm6,1
+    pmaxsw      xmm14,xmm6
+    pminsw      xmm2,xmm14
+    pand        xmm2,xmm3
+    pand        xmm2,[rbp+110h]
+    paddw       xmm0,xmm2
+    movdqa      xmm2,[rbp+140h]
+    paddw       xmm2,xmm15
+    movdqa      xmm15,xmm1
+    paddw       xmm15,xmm1
+    psubw       xmm2,xmm15
+    psraw       xmm2,1
+    pmaxsw      xmm12,xmm2
+    pminsw      xmm11,xmm12
+    pand        xmm11,xmm4
+    pand        xmm11,[rbp+160h]
+    paddw       xmm1,xmm11
+    movdqa      [rax+rdi],xmm7
+    movdqa      [r10],xmm5
+    packuswb    xmm0,xmm1
+    movdqa      [rdi],xmm8
+    movdqa      [r12+rdi],xmm0
+    mov         r12,qword [rbp+180h]
+    lea         rsp,[rbp+190h]
+    pop         rbp
+    ret
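
The DeblockLumaLt4V_ssse3 body above vectorizes the normal-strength (bS < 4) H.264 luma edge filter over a whole edge at once: the clamped delta is built from psubw/psllw/paddw/psraw, the |p0-q0| < alpha and |p1-p0|, |q1-q0| < beta gates become pcmpgtw masks, and packuswb saturates the result back to bytes. A minimal scalar sketch of that per-pixel core, using spec-style names rather than the project's API (the conditional p1/q1 update is left out):

    /*
     * Illustrative scalar sketch of the bS<4 luma edge core (H.264 clause
     * 8.7.2.3); names and signature are assumptions, not openh264 code.
     */
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    static int Clip3 (int iLow, int iHigh, int iVal) {
        return iVal < iLow ? iLow : (iVal > iHigh ? iHigh : iVal);
    }

    /* pPix points at q0; the p-side samples lie at negative strides. */
    static void FilterEdgeLumaLt4 (uint8_t* pPix, int iStride,
                                   int iAlpha, int iBeta, int iTc) {
        int p1 = pPix[-2 * iStride], p0 = pPix[-iStride];
        int q0 = pPix[0],            q1 = pPix[iStride];
        if (abs (p0 - q0) >= iAlpha || abs (p1 - p0) >= iBeta || abs (q1 - q0) >= iBeta)
            return; /* the SIMD version keeps these gates as pcmpgtw masks, not a branch */
        int iDelta = Clip3 (-iTc, iTc, ((q0 - p0) * 4 + (p1 - q1) + 4) >> 3);
        pPix[-iStride] = (uint8_t) Clip3 (0, 255, p0 + iDelta); /* packuswb saturation */
        pPix[0]        = (uint8_t) Clip3 (0, 255, q0 - iDelta);
    }

    int main (void) {
        uint8_t aCol[4] = {70, 80, 110, 120};   /* p1, p0, q0, q1 down one column */
        FilterEdgeLumaLt4 (&aCol[2], 1, 40, 20, 4);
        printf ("p0'=%d q0'=%d\n", aCol[1], aCol[2]); /* prints p0'=84 q0'=106 */
        return 0;
    }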
 
 
 WELS_EXTERN DeblockLumaEq4V_ssse3
-  mov         rax,rsp
-  push        rbx
-  push        rbp
-  mov         r8,   rdx
-  mov         r9,   rcx
-  mov         rcx,  rdi
-  mov         rdx,  rsi
-  sub         rsp,1D8h
-  movaps      [rax-38h],xmm6
-  movaps      [rax-48h],xmm7
-  movaps      [rax-58h],xmm8
-  pxor        xmm1,xmm1
-  movsxd      r10,edx
-  mov         rbp,rcx
-  mov         r11d,r8d
-  mov         rdx,rcx
-  mov         rdi,rbp
-  mov         rbx,rbp
-  movdqa      xmm5,[rbp]
-  movaps      [rax-68h],xmm9
-  movaps      [rax-78h],xmm10
-  punpcklbw   xmm5,xmm1
-  movaps      [rax-88h],xmm11
-  movaps      [rax-98h],xmm12
-  movaps      [rax-0A8h],xmm13
-  movaps      [rax-0B8h],xmm14
-  movdqa      xmm14,[r10+rbp]
-  movaps      [rax-0C8h],xmm15
-  lea         eax,[r10*4]
-  movsxd      r8,eax
-  lea         eax,[r10+r10*2]
-  movsxd      rcx,eax
-  lea         eax,[r10+r10]
-  sub         rdx,r8
-  punpcklbw   xmm14,xmm1
-  movdqa      [rsp+90h],xmm5
-  movdqa      [rsp+30h],xmm14
-  movsxd      rsi,eax
-  movsx       eax,r11w
-  sub         rdi,rcx
-  sub         rbx,rsi
-  mov         r8,rbp
-  sub         r8,r10
-  movd        xmm0,eax
-  movsx       eax,r9w
-  movdqa      xmm12,[rdi]
-  movdqa      xmm6, [rsi+rbp]
-  movdqa      xmm13,[rbx]
-  punpcklwd   xmm0,xmm0
-  pshufd      xmm11,xmm0,0
-  punpcklbw   xmm13,xmm1
-  punpcklbw   xmm6,xmm1
-  movdqa      xmm8,[r8]
-  movd        xmm0,eax
-  movdqa      xmm10,xmm11
-  mov         eax,2
-  punpcklbw   xmm8,xmm1
-  punpcklbw   xmm12,xmm1
-  cwde
-  punpcklwd   xmm0,xmm0
-  psraw       xmm10,2
-  movdqa      xmm1,xmm8
-  movdqa      [rsp+0F0h],xmm13
-  movdqa      [rsp+0B0h],xmm8
-  pshufd      xmm7,xmm0,0
-  psubw       xmm1,xmm13
-  movdqa      xmm0,xmm5
-  movdqa      xmm4,xmm7
-  movdqa      xmm2,xmm7
-  psubw       xmm0,xmm8
-  pabsw       xmm3,xmm0
-  pabsw       xmm0,xmm1
-  movdqa      xmm1,xmm5
-  movdqa      [rsp+40h],xmm7
-  movdqa      [rsp+60h],xmm6
-  pcmpgtw     xmm4,xmm0
-  psubw       xmm1,xmm14
-  pabsw       xmm0,xmm1
-  pcmpgtw     xmm2,xmm0
-  pand        xmm4,xmm2
-  movdqa      xmm0,xmm11
-  pcmpgtw     xmm0,xmm3
-  pand        xmm4,xmm0
-  movd        xmm0,eax
-  movdqa      [rsp+20h],xmm4
-  punpcklwd   xmm0,xmm0
-  pshufd      xmm2,xmm0,0
-  paddw       xmm10,xmm2
-  movdqa      [rsp+0A0h],xmm2
-  movdqa      xmm15,xmm7
-  pxor        xmm4,xmm4
-  movdqa      xmm0,xmm8
-  psubw       xmm0,xmm12
-  mov         eax,4
-  pabsw       xmm0,xmm0
-  movdqa      xmm1,xmm10
-  cwde
-  pcmpgtw     xmm15,xmm0
-  pcmpgtw     xmm1,xmm3
-  movdqa      xmm3,xmm7
-  movdqa      xmm7,[rdx]
-  movdqa      xmm0,xmm5
-  psubw       xmm0,xmm6
-  pand        xmm15,xmm1
-  punpcklbw   xmm7,xmm4
-  movdqa      xmm9,xmm15
-  pabsw       xmm0,xmm0
-  psllw       xmm7,1
-  pandn       xmm9,xmm12
-  pcmpgtw     xmm3,xmm0
-  paddw       xmm7,xmm12
-  movd        xmm0,eax
-  pand        xmm3,xmm1
-  paddw       xmm7,xmm12
-  punpcklwd   xmm0,xmm0
-  paddw       xmm7,xmm12
-  pshufd      xmm1,xmm0,0
-  paddw       xmm7,xmm13
-  movdqa      xmm0,xmm3
-  pandn       xmm0,xmm6
-  paddw       xmm7,xmm8
-  movdqa      [rsp+70h],xmm1
-  paddw       xmm7,xmm5
-  movdqa      [rsp+120h],xmm0
-  movdqa      xmm0,[rcx+rbp]
-  punpcklbw   xmm0,xmm4
-  paddw       xmm7,xmm1
-  movdqa      xmm4,xmm15
-  psllw       xmm0,1
-  psraw       xmm7,3
-  paddw       xmm0,xmm6
-  pand        xmm7,xmm15
-  paddw       xmm0,xmm6
-  paddw       xmm0,xmm6
-  paddw       xmm0,xmm14
-  movdqa      xmm6,xmm15
-  paddw       xmm0,xmm5
-  pandn       xmm6,xmm13
-  paddw       xmm0,xmm8
-  paddw       xmm0,xmm1
-  psraw       xmm0,3
-  movdqa      xmm1,xmm12
-  paddw       xmm1,xmm13
-  pand        xmm0,xmm3
-  movdqa      [rsp+100h],xmm0
-  movdqa      xmm0,xmm8
-  paddw       xmm0,xmm5
-  paddw       xmm1,xmm0
-  movdqa      xmm0,xmm3
-  paddw       xmm1,xmm2
-  psraw       xmm1,2
-  pandn       xmm0,xmm14
-  pand        xmm4,xmm1
-  movdqa      [rsp+0E0h],xmm0
-  movdqa      xmm0,xmm5
-  paddw       xmm0,xmm8
-  movdqa      xmm1,[rsp+60h]
-  paddw       xmm1,xmm14
-  movdqa      xmm14,xmm3
-  paddw       xmm1,xmm0
-  movdqa      xmm0,xmm8
-  paddw       xmm0,[rsp+30h]
-  paddw       xmm1,xmm2
-  psraw       xmm1,2
-  pand        xmm14,xmm1
-  movdqa      xmm1,xmm13
-  paddw       xmm1,xmm13
-  paddw       xmm1,xmm0
-  paddw       xmm1,xmm2
-  psraw       xmm1,2
-  movdqa      xmm0,[rsp+30h]
-  movdqa      xmm2,xmm13
-  movdqa      xmm5,xmm15
-  paddw       xmm0,[rsp+70h]
-  pandn       xmm5,xmm1
-  paddw       xmm2,xmm8
-  movdqa      xmm8,[rsp+90h]
-  movdqa      xmm1,xmm12
-  paddw       xmm2,xmm8
-  psllw       xmm2,1
-  paddw       xmm2,xmm0
-  paddw       xmm1,xmm2
-  movdqa      xmm0,xmm8
-  movdqa      xmm8,xmm3
-  movdqa      xmm2,[rsp+30h]
-  paddw       xmm0,xmm13
-  psraw       xmm1,3
-  pand        xmm15,xmm1
-  movdqa      xmm1,xmm2
-  paddw       xmm1,xmm2
-  paddw       xmm2,[rsp+90h]
-  paddw       xmm2,[rsp+0B0h]
-  paddw       xmm1,xmm0
-  movdqa      xmm0,xmm13
-  movdqa      xmm13,[r8]
-  paddw       xmm0, [rsp+70h]
-  paddw       xmm1, [rsp+0A0h]
-  psllw       xmm2,1
-  paddw       xmm2,xmm0
-  psraw       xmm1,2
-  movdqa      xmm0, [rdi]
-  pandn       xmm8,xmm1
-  movdqa      xmm1, [rsp+60h]
-  paddw       xmm1,xmm2
-  movdqa      xmm2, [rbx]
-  psraw       xmm1,3
-  pand        xmm3,xmm1
-  movdqa      xmm1, [rbp]
-  movdqa      [rsp+0D0h],xmm3
-  pxor        xmm3,xmm3
-  punpckhbw   xmm0,xmm3
-  punpckhbw   xmm1,xmm3
-  punpckhbw   xmm13,xmm3
-  movdqa      [rsp+0C0h],xmm0
-  movdqa      xmm0,[r10+rbp]
-  movdqa      [rsp],xmm1
-  punpckhbw   xmm0,xmm3
-  punpckhbw   xmm2,xmm3
-  movdqa      [rsp+80h],xmm0
-  movdqa      xmm0,[rsi+rbp]
-  movdqa      [rsp+10h],xmm13
-  punpckhbw   xmm0,xmm3
-  movdqa      [rsp+50h],xmm0
-  movdqa      xmm0,xmm1
-  movdqa      xmm1,xmm13
-  psubw       xmm0,xmm13
-  psubw       xmm1,xmm2
-  pabsw       xmm3,xmm0
-  pabsw       xmm0,xmm1
-  movdqa      xmm1,[rsp]
-  movdqa      xmm13,[rsp+40h]
-  movdqa      [rsp+110h],xmm2
-  psubw       xmm1, [rsp+80h]
-  pcmpgtw     xmm13,xmm0
-  pcmpgtw     xmm11,xmm3
-  pabsw       xmm0,xmm1
-  pcmpgtw     xmm10,xmm3
-  movdqa      xmm1, [rsp+40h]
-  movdqa      xmm2,xmm1
-  movdqa      xmm3,xmm1
-  pcmpgtw     xmm2,xmm0
-  movdqa      xmm0, [rsp+10h]
-  pand        xmm13,xmm2
-  pand        xmm13,xmm11
-  movdqa      xmm11,[rsp+0C0h]
-  psubw       xmm0,xmm11
-  pabsw       xmm0,xmm0
-  pcmpgtw     xmm3,xmm0
-  pand        xmm3,xmm10
-  movdqa      xmm0,[rsp]
-  psubw       xmm0,[rsp+50h]
-  movdqa      xmm2,[rdx]
-  pabsw       xmm0,xmm0
-  por         xmm7,xmm9
-  movdqa      xmm9,[rsp+20h]
-  pcmpgtw     xmm1,xmm0
-  pand        xmm9,xmm7
-  movdqa      xmm7,[rsp+20h]
-  movdqa      xmm0,xmm7
-  pandn       xmm0,xmm12
-  movdqa      xmm12,[rsp+110h]
-  pand        xmm1,xmm10
-  movdqa      xmm10,[rsp+70h]
-  movdqa      [rsp+40h],xmm1
-  movdqa      xmm1,xmm13
-  por         xmm9,xmm0
-  pxor        xmm0,xmm0
-  por         xmm4,xmm6
-  movdqa      xmm6,xmm7
-  punpckhbw   xmm2,xmm0
-  por         xmm15,xmm5
-  movdqa      xmm5,[rsp+20h]
-  movdqa      xmm0,xmm3
-  psllw       xmm2,1
-  pandn       xmm0,xmm11
-  pand        xmm6,xmm4
-  movdqa      xmm4,[rsp]
-  paddw       xmm2,xmm11
-  pand        xmm5,xmm15
-  movdqa      xmm15,[rsp+20h]
-  paddw       xmm2,xmm11
-  paddw       xmm2,xmm11
-  paddw       xmm2,xmm12
-  paddw       xmm2,[rsp+10h]
-  paddw       xmm2,[rsp]
-  paddw       xmm2,xmm10
-  psraw       xmm2,3
-  pand        xmm2,xmm3
-  por         xmm2,xmm0
-  pand        xmm1,xmm2
-  movdqa      xmm0,xmm13
-  movdqa      xmm2,xmm11
-  pandn       xmm0,xmm11
-  paddw       xmm2,xmm12
-  por         xmm1,xmm0
-  packuswb    xmm9,xmm1
-  movdqa      xmm0,xmm7
-  movdqa      xmm7,[rsp+0A0h]
-  pandn       xmm0,[rsp+0F0h]
-  movdqa      xmm1,xmm3
-  por         xmm6,xmm0
-  movdqa      xmm0,[rsp+10h]
-  paddw       xmm0,xmm4
-  paddw       xmm2,xmm0
-  paddw       xmm2,xmm7
-  movdqa      xmm0,xmm3
-  pandn       xmm0,xmm12
-  psraw       xmm2,2
-  pand        xmm1,xmm2
-  por         xmm1,xmm0
-  movdqa      xmm2,xmm13
-  movdqa      xmm0,xmm13
-  pand        xmm2,xmm1
-  pandn       xmm0,xmm12
-  movdqa      xmm1,xmm12
-  paddw       xmm1,[rsp+10h]
-  por         xmm2,xmm0
-  movdqa      xmm0,xmm15
-  pandn       xmm0,[rsp+0B0h]
-  paddw       xmm1,xmm4
-  packuswb    xmm6,xmm2
-  movdqa      xmm2,xmm3
-  psllw       xmm1,1
-  por         xmm5,xmm0
-  movdqa      xmm0,[rsp+80h]
-  paddw       xmm0,xmm10
-  paddw       xmm1,xmm0
-  paddw       xmm11,xmm1
-  psraw       xmm11,3
-  movdqa      xmm1,xmm12
-  pand        xmm2,xmm11
-  paddw       xmm1,xmm12
-  movdqa      xmm11,[rsp+80h]
-  movdqa      xmm0, [rsp+10h]
-  por         xmm14,[rsp+0E0h]
-  paddw       xmm0,xmm11
-  movdqa      xmm4,xmm15
-  paddw       xmm1,xmm0
-  movdqa      xmm0,xmm13
-  paddw       xmm1,xmm7
-  psraw       xmm1,2
-  pandn       xmm3,xmm1
-  por         xmm2,xmm3
-  movdqa      xmm1,xmm13
-  movdqa      xmm3,[rsp+10h]
-  pandn       xmm0,xmm3
-  pand        xmm1,xmm2
-  movdqa      xmm2,xmm11
-  paddw       xmm2,[rsp]
-  por         xmm1,xmm0
-  movdqa      xmm0,[rsp+0D0h]
-  por         xmm0,xmm8
-  paddw       xmm2,xmm3
-  packuswb    xmm5,xmm1
-  movdqa      xmm8,[rsp+40h]
-  movdqa      xmm1,[rsp+50h]
-  movdqa      xmm3,xmm8
-  pand        xmm4,xmm0
-  psllw       xmm2,1
-  movdqa      xmm0,xmm15
-  pandn       xmm0,[rsp+90h]
-  por         xmm4,xmm0
-  movdqa      xmm0,xmm12
-  paddw       xmm0,xmm10
-  paddw       xmm2,xmm0
-  paddw       xmm1,xmm2
-  movdqa      xmm0,[rsp]
-  movdqa      xmm2,xmm11
-  paddw       xmm0,xmm12
-  movdqa      xmm12,[rsp]
-  paddw       xmm2,xmm11
-  paddw       xmm2,xmm0
-  psraw       xmm1,3
-  movdqa      xmm0,xmm8
-  pand        xmm3,xmm1
-  paddw       xmm2,xmm7
-  movdqa      xmm1,xmm13
-  psraw       xmm2,2
-  pandn       xmm0,xmm2
-  por         xmm3,xmm0
-  movdqa      xmm2,[rsp+50h]
-  movdqa      xmm0,xmm13
-  pandn       xmm0,xmm12
-  pand        xmm1,xmm3
-  paddw       xmm2,xmm11
-  movdqa      xmm3,xmm15
-  por         xmm1,xmm0
-  pand        xmm3,xmm14
-  movdqa      xmm14,[rsp+10h]
-  movdqa      xmm0,xmm15
-  pandn       xmm0,[rsp+30h]
-  packuswb    xmm4,xmm1
-  movdqa      xmm1,xmm8
-  por         xmm3,xmm0
-  movdqa      xmm0,xmm12
-  paddw       xmm0,xmm14
-  paddw       xmm2,xmm0
-  paddw       xmm2,xmm7
-  movdqa      xmm0,xmm8
-  pandn       xmm0,xmm11
-  psraw       xmm2,2
-  pand        xmm1,xmm2
-  por         xmm1,xmm0
-  movdqa      xmm2,xmm13
-  movdqa      xmm0,xmm13
-  pandn       xmm0,xmm11
-  pand        xmm2,xmm1
-  movdqa      xmm1,xmm15
-  por         xmm2,xmm0
-  packuswb    xmm3,xmm2
-  movdqa      xmm0,[rsp+100h]
-  por         xmm0,[rsp+120h]
-  pand        xmm1,xmm0
-  movdqa      xmm2,[rcx+rbp]
-  movdqa      xmm7,[rsp+50h]
-  pandn       xmm15,[rsp+60h]
-  lea         r11,[rsp+1D8h]
-  pxor        xmm0,xmm0
-  por         xmm1,xmm15
-  movaps      xmm15,[r11-0A8h]
-  movdqa      [rdi],xmm9
-  movaps      xmm9,[r11-48h]
-  punpckhbw   xmm2,xmm0
-  psllw       xmm2,1
-  paddw       xmm2,xmm7
-  paddw       xmm2,xmm7
-  movdqa      [rbx],xmm6
-  movaps      xmm6,[r11-18h]
-  paddw       xmm2,xmm7
-  paddw       xmm2,xmm11
-  movaps      xmm11,[r11-68h]
-  paddw       xmm2,xmm12
-  movaps      xmm12,[r11-78h]
-  paddw       xmm2,xmm14
-  paddw       xmm2,xmm10
-  psraw       xmm2,3
-  movaps      xmm10,[r11-58h]
-  movaps      xmm14,[r11-98h]
-  movdqa      xmm0,xmm13
-  pand        xmm2,xmm8
-  pandn       xmm8,xmm7
-  pandn       xmm13,xmm7
-  por         xmm2,xmm8
-  movaps      xmm7,[r11-28h]
-  movaps      xmm8,[r11-38h]
-  movdqa      [r8],xmm5
-  pand        xmm0,xmm2
-  por         xmm0,xmm13
-  packuswb    xmm1,xmm0
-  movaps      xmm13,[r11-88h]
-  movdqa      [rbp],xmm4
-  movdqa      [r10+rbp],xmm3
-  movdqa      [rsi+rbp],xmm1
-  mov         rsp,r11
-  pop         rbp
-  pop         rbx
-  ret
+    mov         rax,rsp
+    push        rbx
+    push        rbp
+    mov         r8,   rdx
+    mov         r9,   rcx
+    mov         rcx,  rdi
+    mov         rdx,  rsi
+    sub         rsp,1D8h
+    movaps      [rax-38h],xmm6
+    movaps      [rax-48h],xmm7
+    movaps      [rax-58h],xmm8
+    pxor        xmm1,xmm1
+    movsxd      r10,edx
+    mov         rbp,rcx
+    mov         r11d,r8d
+    mov         rdx,rcx
+    mov         rdi,rbp
+    mov         rbx,rbp
+    movdqa      xmm5,[rbp]
+    movaps      [rax-68h],xmm9
+    movaps      [rax-78h],xmm10
+    punpcklbw   xmm5,xmm1
+    movaps      [rax-88h],xmm11
+    movaps      [rax-98h],xmm12
+    movaps      [rax-0A8h],xmm13
+    movaps      [rax-0B8h],xmm14
+    movdqa      xmm14,[r10+rbp]
+    movaps      [rax-0C8h],xmm15
+    lea         eax,[r10*4]
+    movsxd      r8,eax
+    lea         eax,[r10+r10*2]
+    movsxd      rcx,eax
+    lea         eax,[r10+r10]
+    sub         rdx,r8
+    punpcklbw   xmm14,xmm1
+    movdqa      [rsp+90h],xmm5
+    movdqa      [rsp+30h],xmm14
+    movsxd      rsi,eax
+    movsx       eax,r11w
+    sub         rdi,rcx
+    sub         rbx,rsi
+    mov         r8,rbp
+    sub         r8,r10
+    movd        xmm0,eax
+    movsx       eax,r9w
+    movdqa      xmm12,[rdi]
+    movdqa      xmm6, [rsi+rbp]
+    movdqa      xmm13,[rbx]
+    punpcklwd   xmm0,xmm0
+    pshufd      xmm11,xmm0,0
+    punpcklbw   xmm13,xmm1
+    punpcklbw   xmm6,xmm1
+    movdqa      xmm8,[r8]
+    movd        xmm0,eax
+    movdqa      xmm10,xmm11
+    mov         eax,2
+    punpcklbw   xmm8,xmm1
+    punpcklbw   xmm12,xmm1
+    cwde
+    punpcklwd   xmm0,xmm0
+    psraw       xmm10,2
+    movdqa      xmm1,xmm8
+    movdqa      [rsp+0F0h],xmm13
+    movdqa      [rsp+0B0h],xmm8
+    pshufd      xmm7,xmm0,0
+    psubw       xmm1,xmm13
+    movdqa      xmm0,xmm5
+    movdqa      xmm4,xmm7
+    movdqa      xmm2,xmm7
+    psubw       xmm0,xmm8
+    pabsw       xmm3,xmm0
+    pabsw       xmm0,xmm1
+    movdqa      xmm1,xmm5
+    movdqa      [rsp+40h],xmm7
+    movdqa      [rsp+60h],xmm6
+    pcmpgtw     xmm4,xmm0
+    psubw       xmm1,xmm14
+    pabsw       xmm0,xmm1
+    pcmpgtw     xmm2,xmm0
+    pand        xmm4,xmm2
+    movdqa      xmm0,xmm11
+    pcmpgtw     xmm0,xmm3
+    pand        xmm4,xmm0
+    movd        xmm0,eax
+    movdqa      [rsp+20h],xmm4
+    punpcklwd   xmm0,xmm0
+    pshufd      xmm2,xmm0,0
+    paddw       xmm10,xmm2
+    movdqa      [rsp+0A0h],xmm2
+    movdqa      xmm15,xmm7
+    pxor        xmm4,xmm4
+    movdqa      xmm0,xmm8
+    psubw       xmm0,xmm12
+    mov         eax,4
+    pabsw       xmm0,xmm0
+    movdqa      xmm1,xmm10
+    cwde
+    pcmpgtw     xmm15,xmm0
+    pcmpgtw     xmm1,xmm3
+    movdqa      xmm3,xmm7
+    movdqa      xmm7,[rdx]
+    movdqa      xmm0,xmm5
+    psubw       xmm0,xmm6
+    pand        xmm15,xmm1
+    punpcklbw   xmm7,xmm4
+    movdqa      xmm9,xmm15
+    pabsw       xmm0,xmm0
+    psllw       xmm7,1
+    pandn       xmm9,xmm12
+    pcmpgtw     xmm3,xmm0
+    paddw       xmm7,xmm12
+    movd        xmm0,eax
+    pand        xmm3,xmm1
+    paddw       xmm7,xmm12
+    punpcklwd   xmm0,xmm0
+    paddw       xmm7,xmm12
+    pshufd      xmm1,xmm0,0
+    paddw       xmm7,xmm13
+    movdqa      xmm0,xmm3
+    pandn       xmm0,xmm6
+    paddw       xmm7,xmm8
+    movdqa      [rsp+70h],xmm1
+    paddw       xmm7,xmm5
+    movdqa      [rsp+120h],xmm0
+    movdqa      xmm0,[rcx+rbp]
+    punpcklbw   xmm0,xmm4
+    paddw       xmm7,xmm1
+    movdqa      xmm4,xmm15
+    psllw       xmm0,1
+    psraw       xmm7,3
+    paddw       xmm0,xmm6
+    pand        xmm7,xmm15
+    paddw       xmm0,xmm6
+    paddw       xmm0,xmm6
+    paddw       xmm0,xmm14
+    movdqa      xmm6,xmm15
+    paddw       xmm0,xmm5
+    pandn       xmm6,xmm13
+    paddw       xmm0,xmm8
+    paddw       xmm0,xmm1
+    psraw       xmm0,3
+    movdqa      xmm1,xmm12
+    paddw       xmm1,xmm13
+    pand        xmm0,xmm3
+    movdqa      [rsp+100h],xmm0
+    movdqa      xmm0,xmm8
+    paddw       xmm0,xmm5
+    paddw       xmm1,xmm0
+    movdqa      xmm0,xmm3
+    paddw       xmm1,xmm2
+    psraw       xmm1,2
+    pandn       xmm0,xmm14
+    pand        xmm4,xmm1
+    movdqa      [rsp+0E0h],xmm0
+    movdqa      xmm0,xmm5
+    paddw       xmm0,xmm8
+    movdqa      xmm1,[rsp+60h]
+    paddw       xmm1,xmm14
+    movdqa      xmm14,xmm3
+    paddw       xmm1,xmm0
+    movdqa      xmm0,xmm8
+    paddw       xmm0,[rsp+30h]
+    paddw       xmm1,xmm2
+    psraw       xmm1,2
+    pand        xmm14,xmm1
+    movdqa      xmm1,xmm13
+    paddw       xmm1,xmm13
+    paddw       xmm1,xmm0
+    paddw       xmm1,xmm2
+    psraw       xmm1,2
+    movdqa      xmm0,[rsp+30h]
+    movdqa      xmm2,xmm13
+    movdqa      xmm5,xmm15
+    paddw       xmm0,[rsp+70h]
+    pandn       xmm5,xmm1
+    paddw       xmm2,xmm8
+    movdqa      xmm8,[rsp+90h]
+    movdqa      xmm1,xmm12
+    paddw       xmm2,xmm8
+    psllw       xmm2,1
+    paddw       xmm2,xmm0
+    paddw       xmm1,xmm2
+    movdqa      xmm0,xmm8
+    movdqa      xmm8,xmm3
+    movdqa      xmm2,[rsp+30h]
+    paddw       xmm0,xmm13
+    psraw       xmm1,3
+    pand        xmm15,xmm1
+    movdqa      xmm1,xmm2
+    paddw       xmm1,xmm2
+    paddw       xmm2,[rsp+90h]
+    paddw       xmm2,[rsp+0B0h]
+    paddw       xmm1,xmm0
+    movdqa      xmm0,xmm13
+    movdqa      xmm13,[r8]
+    paddw       xmm0, [rsp+70h]
+    paddw       xmm1, [rsp+0A0h]
+    psllw       xmm2,1
+    paddw       xmm2,xmm0
+    psraw       xmm1,2
+    movdqa      xmm0, [rdi]
+    pandn       xmm8,xmm1
+    movdqa      xmm1, [rsp+60h]
+    paddw       xmm1,xmm2
+    movdqa      xmm2, [rbx]
+    psraw       xmm1,3
+    pand        xmm3,xmm1
+    movdqa      xmm1, [rbp]
+    movdqa      [rsp+0D0h],xmm3
+    pxor        xmm3,xmm3
+    punpckhbw   xmm0,xmm3
+    punpckhbw   xmm1,xmm3
+    punpckhbw   xmm13,xmm3
+    movdqa      [rsp+0C0h],xmm0
+    movdqa      xmm0,[r10+rbp]
+    movdqa      [rsp],xmm1
+    punpckhbw   xmm0,xmm3
+    punpckhbw   xmm2,xmm3
+    movdqa      [rsp+80h],xmm0
+    movdqa      xmm0,[rsi+rbp]
+    movdqa      [rsp+10h],xmm13
+    punpckhbw   xmm0,xmm3
+    movdqa      [rsp+50h],xmm0
+    movdqa      xmm0,xmm1
+    movdqa      xmm1,xmm13
+    psubw       xmm0,xmm13
+    psubw       xmm1,xmm2
+    pabsw       xmm3,xmm0
+    pabsw       xmm0,xmm1
+    movdqa      xmm1,[rsp]
+    movdqa      xmm13,[rsp+40h]
+    movdqa      [rsp+110h],xmm2
+    psubw       xmm1, [rsp+80h]
+    pcmpgtw     xmm13,xmm0
+    pcmpgtw     xmm11,xmm3
+    pabsw       xmm0,xmm1
+    pcmpgtw     xmm10,xmm3
+    movdqa      xmm1, [rsp+40h]
+    movdqa      xmm2,xmm1
+    movdqa      xmm3,xmm1
+    pcmpgtw     xmm2,xmm0
+    movdqa      xmm0, [rsp+10h]
+    pand        xmm13,xmm2
+    pand        xmm13,xmm11
+    movdqa      xmm11,[rsp+0C0h]
+    psubw       xmm0,xmm11
+    pabsw       xmm0,xmm0
+    pcmpgtw     xmm3,xmm0
+    pand        xmm3,xmm10
+    movdqa      xmm0,[rsp]
+    psubw       xmm0,[rsp+50h]
+    movdqa      xmm2,[rdx]
+    pabsw       xmm0,xmm0
+    por         xmm7,xmm9
+    movdqa      xmm9,[rsp+20h]
+    pcmpgtw     xmm1,xmm0
+    pand        xmm9,xmm7
+    movdqa      xmm7,[rsp+20h]
+    movdqa      xmm0,xmm7
+    pandn       xmm0,xmm12
+    movdqa      xmm12,[rsp+110h]
+    pand        xmm1,xmm10
+    movdqa      xmm10,[rsp+70h]
+    movdqa      [rsp+40h],xmm1
+    movdqa      xmm1,xmm13
+    por         xmm9,xmm0
+    pxor        xmm0,xmm0
+    por         xmm4,xmm6
+    movdqa      xmm6,xmm7
+    punpckhbw   xmm2,xmm0
+    por         xmm15,xmm5
+    movdqa      xmm5,[rsp+20h]
+    movdqa      xmm0,xmm3
+    psllw       xmm2,1
+    pandn       xmm0,xmm11
+    pand        xmm6,xmm4
+    movdqa      xmm4,[rsp]
+    paddw       xmm2,xmm11
+    pand        xmm5,xmm15
+    movdqa      xmm15,[rsp+20h]
+    paddw       xmm2,xmm11
+    paddw       xmm2,xmm11
+    paddw       xmm2,xmm12
+    paddw       xmm2,[rsp+10h]
+    paddw       xmm2,[rsp]
+    paddw       xmm2,xmm10
+    psraw       xmm2,3
+    pand        xmm2,xmm3
+    por         xmm2,xmm0
+    pand        xmm1,xmm2
+    movdqa      xmm0,xmm13
+    movdqa      xmm2,xmm11
+    pandn       xmm0,xmm11
+    paddw       xmm2,xmm12
+    por         xmm1,xmm0
+    packuswb    xmm9,xmm1
+    movdqa      xmm0,xmm7
+    movdqa      xmm7,[rsp+0A0h]
+    pandn       xmm0,[rsp+0F0h]
+    movdqa      xmm1,xmm3
+    por         xmm6,xmm0
+    movdqa      xmm0,[rsp+10h]
+    paddw       xmm0,xmm4
+    paddw       xmm2,xmm0
+    paddw       xmm2,xmm7
+    movdqa      xmm0,xmm3
+    pandn       xmm0,xmm12
+    psraw       xmm2,2
+    pand        xmm1,xmm2
+    por         xmm1,xmm0
+    movdqa      xmm2,xmm13
+    movdqa      xmm0,xmm13
+    pand        xmm2,xmm1
+    pandn       xmm0,xmm12
+    movdqa      xmm1,xmm12
+    paddw       xmm1,[rsp+10h]
+    por         xmm2,xmm0
+    movdqa      xmm0,xmm15
+    pandn       xmm0,[rsp+0B0h]
+    paddw       xmm1,xmm4
+    packuswb    xmm6,xmm2
+    movdqa      xmm2,xmm3
+    psllw       xmm1,1
+    por         xmm5,xmm0
+    movdqa      xmm0,[rsp+80h]
+    paddw       xmm0,xmm10
+    paddw       xmm1,xmm0
+    paddw       xmm11,xmm1
+    psraw       xmm11,3
+    movdqa      xmm1,xmm12
+    pand        xmm2,xmm11
+    paddw       xmm1,xmm12
+    movdqa      xmm11,[rsp+80h]
+    movdqa      xmm0, [rsp+10h]
+    por         xmm14,[rsp+0E0h]
+    paddw       xmm0,xmm11
+    movdqa      xmm4,xmm15
+    paddw       xmm1,xmm0
+    movdqa      xmm0,xmm13
+    paddw       xmm1,xmm7
+    psraw       xmm1,2
+    pandn       xmm3,xmm1
+    por         xmm2,xmm3
+    movdqa      xmm1,xmm13
+    movdqa      xmm3,[rsp+10h]
+    pandn       xmm0,xmm3
+    pand        xmm1,xmm2
+    movdqa      xmm2,xmm11
+    paddw       xmm2,[rsp]
+    por         xmm1,xmm0
+    movdqa      xmm0,[rsp+0D0h]
+    por         xmm0,xmm8
+    paddw       xmm2,xmm3
+    packuswb    xmm5,xmm1
+    movdqa      xmm8,[rsp+40h]
+    movdqa      xmm1,[rsp+50h]
+    movdqa      xmm3,xmm8
+    pand        xmm4,xmm0
+    psllw       xmm2,1
+    movdqa      xmm0,xmm15
+    pandn       xmm0,[rsp+90h]
+    por         xmm4,xmm0
+    movdqa      xmm0,xmm12
+    paddw       xmm0,xmm10
+    paddw       xmm2,xmm0
+    paddw       xmm1,xmm2
+    movdqa      xmm0,[rsp]
+    movdqa      xmm2,xmm11
+    paddw       xmm0,xmm12
+    movdqa      xmm12,[rsp]
+    paddw       xmm2,xmm11
+    paddw       xmm2,xmm0
+    psraw       xmm1,3
+    movdqa      xmm0,xmm8
+    pand        xmm3,xmm1
+    paddw       xmm2,xmm7
+    movdqa      xmm1,xmm13
+    psraw       xmm2,2
+    pandn       xmm0,xmm2
+    por         xmm3,xmm0
+    movdqa      xmm2,[rsp+50h]
+    movdqa      xmm0,xmm13
+    pandn       xmm0,xmm12
+    pand        xmm1,xmm3
+    paddw       xmm2,xmm11
+    movdqa      xmm3,xmm15
+    por         xmm1,xmm0
+    pand        xmm3,xmm14
+    movdqa      xmm14,[rsp+10h]
+    movdqa      xmm0,xmm15
+    pandn       xmm0,[rsp+30h]
+    packuswb    xmm4,xmm1
+    movdqa      xmm1,xmm8
+    por         xmm3,xmm0
+    movdqa      xmm0,xmm12
+    paddw       xmm0,xmm14
+    paddw       xmm2,xmm0
+    paddw       xmm2,xmm7
+    movdqa      xmm0,xmm8
+    pandn       xmm0,xmm11
+    psraw       xmm2,2
+    pand        xmm1,xmm2
+    por         xmm1,xmm0
+    movdqa      xmm2,xmm13
+    movdqa      xmm0,xmm13
+    pandn       xmm0,xmm11
+    pand        xmm2,xmm1
+    movdqa      xmm1,xmm15
+    por         xmm2,xmm0
+    packuswb    xmm3,xmm2
+    movdqa      xmm0,[rsp+100h]
+    por         xmm0,[rsp+120h]
+    pand        xmm1,xmm0
+    movdqa      xmm2,[rcx+rbp]
+    movdqa      xmm7,[rsp+50h]
+    pandn       xmm15,[rsp+60h]
+    lea         r11,[rsp+1D8h]
+    pxor        xmm0,xmm0
+    por         xmm1,xmm15
+    movaps      xmm15,[r11-0A8h]
+    movdqa      [rdi],xmm9
+    movaps      xmm9,[r11-48h]
+    punpckhbw   xmm2,xmm0
+    psllw       xmm2,1
+    paddw       xmm2,xmm7
+    paddw       xmm2,xmm7
+    movdqa      [rbx],xmm6
+    movaps      xmm6,[r11-18h]
+    paddw       xmm2,xmm7
+    paddw       xmm2,xmm11
+    movaps      xmm11,[r11-68h]
+    paddw       xmm2,xmm12
+    movaps      xmm12,[r11-78h]
+    paddw       xmm2,xmm14
+    paddw       xmm2,xmm10
+    psraw       xmm2,3
+    movaps      xmm10,[r11-58h]
+    movaps      xmm14,[r11-98h]
+    movdqa      xmm0,xmm13
+    pand        xmm2,xmm8
+    pandn       xmm8,xmm7
+    pandn       xmm13,xmm7
+    por         xmm2,xmm8
+    movaps      xmm7,[r11-28h]
+    movaps      xmm8,[r11-38h]
+    movdqa      [r8],xmm5
+    pand        xmm0,xmm2
+    por         xmm0,xmm13
+    packuswb    xmm1,xmm0
+    movaps      xmm13,[r11-88h]
+    movdqa      [rbp],xmm4
+    movdqa      [r10+rbp],xmm3
+    movdqa      [rsi+rbp],xmm1
+    mov         rsp,r11
+    pop         rbp
+    pop         rbx
+    ret
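
DeblockLumaEq4V_ssse3 above is the bS = 4 (strong) counterpart: per column it selects, via pand/pandn masks, between a stronger multi-tap smoothing and a 3-tap fallback depending on whether |p0 - q0| < (alpha >> 2) + 2 and the neighbour activity stays under beta. A hedged p-side sketch with spec-style names (the q side is symmetric, and the usual alpha/beta edge gate is assumed to have passed):

    /*
     * Illustrative scalar sketch of the bS==4 p-side smoothing (H.264
     * clause 8.7.2.4); identifiers are assumptions, not the project's API.
     */
    #include <stdint.h>
    #include <stdlib.h>

    /* pPix points at q0; writes p2', p1', p0' into pOut[0..2]. */
    void FilterPSideEq4 (const uint8_t* pPix, int iStride,
                         int iAlpha, int iBeta, uint8_t* pOut) {
        int p3 = pPix[-4 * iStride], p2 = pPix[-3 * iStride];
        int p1 = pPix[-2 * iStride], p0 = pPix[-iStride];
        int q0 = pPix[0],            q1 = pPix[iStride];
        if (abs (p0 - q0) < ((iAlpha >> 2) + 2) && abs (p2 - p0) < iBeta) {
            /* stronger smoothing reaching out to p3 */
            pOut[2] = (uint8_t) ((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3); /* p0' */
            pOut[1] = (uint8_t) ((p2 + p1 + p0 + q0 + 2) >> 2);                  /* p1' */
            pOut[0] = (uint8_t) ((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3);     /* p2' */
        } else {
            /* 3-tap fallback; p1 and p2 stay unchanged */
            pOut[2] = (uint8_t) ((2 * p1 + p0 + q1 + 2) >> 2);
            pOut[1] = (uint8_t) p1;
            pOut[0] = (uint8_t) p2;
        }
    }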
 
 WELS_EXTERN  DeblockChromaLt4V_ssse3
-  mov         rax,rsp
-  push        rbx
-  push        rbp
-  mov         r10,  rdx
-  mov         r11,  rcx
-  mov         rcx,  rdi
-  mov         rdx,  rsi
-  mov         rsi,  r10
-  mov         r10,  r9
-  mov         rbp,  r8
-  mov         r8,   rsi
-  mov         r9,   r11
-  sub         rsp,0C8h
-  pxor        xmm1,xmm1
-  mov         rbx,rcx
-  movsxd      r11,r8d
-  movsx       ecx,byte [r10]
-  movsx       r8d,byte [r10+2]
-  mov         rdi,rdx
-  movq        xmm2,[rbx]
-  movq        xmm9,[r11+rbx]
-  movsx       edx,byte [r10+1]
-  mov         word [rsp+2],cx
-  mov         word [rsp],cx
-  movsx       eax,byte [r10+3]
-  mov         word [rsp+6],dx
-  mov         word [rsp+4],dx
-  movdqa      xmm11,xmm1
-  mov         word [rsp+0Eh],ax
-  mov         word [rsp+0Ch],ax
-  lea         eax,[r11+r11]
-  movsxd      rcx,eax
-  mov         rax,rbx
-  mov         rdx,rdi
-  sub         rax,rcx
-  mov         word [rsp+0Ah],r8w
-  mov         word [rsp+8],r8w
-  movdqa      xmm6,[rsp]
-  movdqa      xmm7,xmm6
-  movq        xmm13, [rax]
-  mov         rax,rdi
-  sub         rax,rcx
-  mov         rcx,rbx
-  pcmpgtw     xmm7,xmm1
-  psubw       xmm11,xmm6
-  sub         rcx,r11
-  sub         rdx,r11
-  movq        xmm0,[rax]
-  movsx       eax,r9w
-  movq        xmm15,[rcx]
-  punpcklqdq  xmm13,xmm0
-  movq        xmm0, [rdx]
-  movdqa      xmm4,xmm13
-  punpcklqdq  xmm15,xmm0
-  movq        xmm0, [rdi]
-  punpcklbw   xmm4,xmm1
-  movdqa      xmm12,xmm15
-  punpcklqdq  xmm2,xmm0
-  movq        xmm0, [r11+rdi]
-  punpcklbw   xmm12,xmm1
-  movdqa      xmm14,xmm2
-  punpcklqdq  xmm9,xmm0
-  punpckhbw   xmm2,xmm1
-  punpcklbw   xmm14,xmm1
-  movd        xmm0,eax
-  mov         eax, ebp ; iBeta
-  punpckhbw   xmm13,xmm1
-  punpckhbw   xmm15,xmm1
-  movdqa      xmm3,xmm9
-  movdqa      [rsp+10h],xmm2
-  punpcklwd   xmm0,xmm0
-  punpckhbw   xmm9,xmm1
-  punpcklbw   xmm3,xmm1
-  movdqa      xmm1,xmm14
-  pshufd      xmm10,xmm0,0
-  movd        xmm0,eax
-  mov         eax,4
-  cwde
-  punpcklwd   xmm0,xmm0
-  pshufd      xmm8,xmm0,0
-  movd        xmm0,eax
-  punpcklwd   xmm0,xmm0
-  pshufd      xmm5,xmm0,0
-  psubw       xmm1,xmm12
-  movdqa      xmm2,xmm10
-  lea         r11,[rsp+0C8h]
-  psllw       xmm1,2
-  movdqa      xmm0,xmm4
-  psubw       xmm4,xmm12
-  psubw       xmm0,xmm3
-  psubw       xmm3,xmm14
-  paddw       xmm1,xmm0
-  paddw       xmm1,xmm5
-  movdqa      xmm0,xmm11
-  psraw       xmm1,3
-  pmaxsw      xmm0,xmm1
-  pminsw      xmm6,xmm0
-  movdqa      xmm1,xmm8
-  movdqa      xmm0,xmm12
-  psubw       xmm0,xmm14
-  pabsw       xmm0,xmm0
-  pcmpgtw     xmm2,xmm0
-  pabsw       xmm0,xmm4
-  pcmpgtw     xmm1,xmm0
-  pabsw       xmm0,xmm3
-  movdqa      xmm3,[rsp]
-  pand        xmm2,xmm1
-  movdqa      xmm1,xmm8
-  pcmpgtw     xmm1,xmm0
-  movdqa      xmm0,xmm13
-  pand        xmm2,xmm1
-  psubw       xmm0,xmm9
-  psubw       xmm13,xmm15
-  pand        xmm2,xmm7
-  pand        xmm6,xmm2
-  paddw       xmm12,xmm6
-  psubw       xmm14,xmm6
-  movdqa      xmm2,[rsp+10h]
-  movaps      xmm6,[r11-18h]
-  movdqa      xmm1,xmm2
-  psubw       xmm1,xmm15
-  psubw       xmm9,xmm2
-  psllw       xmm1,2
-  paddw       xmm1,xmm0
-  paddw       xmm1,xmm5
-  movdqa      xmm0,xmm15
-  psubw       xmm0,xmm2
-  psraw       xmm1,3
-  pmaxsw      xmm11,xmm1
-  pabsw       xmm0,xmm0
-  movdqa      xmm1,xmm8
-  pcmpgtw     xmm10,xmm0
-  pabsw       xmm0,xmm13
-  pminsw      xmm3,xmm11
-  movaps      xmm11,[r11-68h]
-  movaps      xmm13,[rsp+40h]
-  pcmpgtw     xmm1,xmm0
-  pabsw       xmm0,xmm9
-  movaps      xmm9, [r11-48h]
-  pand        xmm10,xmm1
-  pcmpgtw     xmm8,xmm0
-  pand        xmm10,xmm8
-  pand        xmm10,xmm7
-  movaps      xmm8,[r11-38h]
-  movaps      xmm7,[r11-28h]
-  pand        xmm3,xmm10
-  paddw       xmm15,xmm3
-  psubw       xmm2,xmm3
-  movaps      xmm10,[r11-58h]
-  packuswb    xmm12,xmm15
-  movaps      xmm15,[rsp+20h]
-  packuswb    xmm14,xmm2
-  movq        [rcx],xmm12
-  movq        [rbx],xmm14
-  psrldq      xmm12,8
-  psrldq      xmm14,8
-  movq        [rdx],xmm12
-  movaps      xmm12,[r11-78h]
-  movq        [rdi],xmm14
-  movaps      xmm14,[rsp+30h]
-  mov         rsp,r11
-  pop         rbp
-  pop         rbx
-  ret
+    mov         rax,rsp
+    push        rbx
+    push        rbp
+    mov         r10,  rdx
+    mov         r11,  rcx
+    mov         rcx,  rdi
+    mov         rdx,  rsi
+    mov         rsi,  r10
+    mov         r10,  r9
+    mov         rbp,  r8
+    mov         r8,   rsi
+    mov         r9,   r11
+    sub         rsp,0C8h
+    pxor        xmm1,xmm1
+    mov         rbx,rcx
+    movsxd      r11,r8d
+    movsx       ecx,byte [r10]
+    movsx       r8d,byte [r10+2]
+    mov         rdi,rdx
+    movq        xmm2,[rbx]
+    movq        xmm9,[r11+rbx]
+    movsx       edx,byte [r10+1]
+    mov         word [rsp+2],cx
+    mov         word [rsp],cx
+    movsx       eax,byte [r10+3]
+    mov         word [rsp+6],dx
+    mov         word [rsp+4],dx
+    movdqa      xmm11,xmm1
+    mov         word [rsp+0Eh],ax
+    mov         word [rsp+0Ch],ax
+    lea         eax,[r11+r11]
+    movsxd      rcx,eax
+    mov         rax,rbx
+    mov         rdx,rdi
+    sub         rax,rcx
+    mov         word [rsp+0Ah],r8w
+    mov         word [rsp+8],r8w
+    movdqa      xmm6,[rsp]
+    movdqa      xmm7,xmm6
+    movq        xmm13, [rax]
+    mov         rax,rdi
+    sub         rax,rcx
+    mov         rcx,rbx
+    pcmpgtw     xmm7,xmm1
+    psubw       xmm11,xmm6
+    sub         rcx,r11
+    sub         rdx,r11
+    movq        xmm0,[rax]
+    movsx       eax,r9w
+    movq        xmm15,[rcx]
+    punpcklqdq  xmm13,xmm0
+    movq        xmm0, [rdx]
+    movdqa      xmm4,xmm13
+    punpcklqdq  xmm15,xmm0
+    movq        xmm0, [rdi]
+    punpcklbw   xmm4,xmm1
+    movdqa      xmm12,xmm15
+    punpcklqdq  xmm2,xmm0
+    movq        xmm0, [r11+rdi]
+    punpcklbw   xmm12,xmm1
+    movdqa      xmm14,xmm2
+    punpcklqdq  xmm9,xmm0
+    punpckhbw   xmm2,xmm1
+    punpcklbw   xmm14,xmm1
+    movd        xmm0,eax
+    mov         eax, ebp ; iBeta
+    punpckhbw   xmm13,xmm1
+    punpckhbw   xmm15,xmm1
+    movdqa      xmm3,xmm9
+    movdqa      [rsp+10h],xmm2
+    punpcklwd   xmm0,xmm0
+    punpckhbw   xmm9,xmm1
+    punpcklbw   xmm3,xmm1
+    movdqa      xmm1,xmm14
+    pshufd      xmm10,xmm0,0
+    movd        xmm0,eax
+    mov         eax,4
+    cwde
+    punpcklwd   xmm0,xmm0
+    pshufd      xmm8,xmm0,0
+    movd        xmm0,eax
+    punpcklwd   xmm0,xmm0
+    pshufd      xmm5,xmm0,0
+    psubw       xmm1,xmm12
+    movdqa      xmm2,xmm10
+    lea         r11,[rsp+0C8h]
+    psllw       xmm1,2
+    movdqa      xmm0,xmm4
+    psubw       xmm4,xmm12
+    psubw       xmm0,xmm3
+    psubw       xmm3,xmm14
+    paddw       xmm1,xmm0
+    paddw       xmm1,xmm5
+    movdqa      xmm0,xmm11
+    psraw       xmm1,3
+    pmaxsw      xmm0,xmm1
+    pminsw      xmm6,xmm0
+    movdqa      xmm1,xmm8
+    movdqa      xmm0,xmm12
+    psubw       xmm0,xmm14
+    pabsw       xmm0,xmm0
+    pcmpgtw     xmm2,xmm0
+    pabsw       xmm0,xmm4
+    pcmpgtw     xmm1,xmm0
+    pabsw       xmm0,xmm3
+    movdqa      xmm3,[rsp]
+    pand        xmm2,xmm1
+    movdqa      xmm1,xmm8
+    pcmpgtw     xmm1,xmm0
+    movdqa      xmm0,xmm13
+    pand        xmm2,xmm1
+    psubw       xmm0,xmm9
+    psubw       xmm13,xmm15
+    pand        xmm2,xmm7
+    pand        xmm6,xmm2
+    paddw       xmm12,xmm6
+    psubw       xmm14,xmm6
+    movdqa      xmm2,[rsp+10h]
+    movaps      xmm6,[r11-18h]
+    movdqa      xmm1,xmm2
+    psubw       xmm1,xmm15
+    psubw       xmm9,xmm2
+    psllw       xmm1,2
+    paddw       xmm1,xmm0
+    paddw       xmm1,xmm5
+    movdqa      xmm0,xmm15
+    psubw       xmm0,xmm2
+    psraw       xmm1,3
+    pmaxsw      xmm11,xmm1
+    pabsw       xmm0,xmm0
+    movdqa      xmm1,xmm8
+    pcmpgtw     xmm10,xmm0
+    pabsw       xmm0,xmm13
+    pminsw      xmm3,xmm11
+    movaps      xmm11,[r11-68h]
+    movaps      xmm13,[rsp+40h]
+    pcmpgtw     xmm1,xmm0
+    pabsw       xmm0,xmm9
+    movaps      xmm9, [r11-48h]
+    pand        xmm10,xmm1
+    pcmpgtw     xmm8,xmm0
+    pand        xmm10,xmm8
+    pand        xmm10,xmm7
+    movaps      xmm8,[r11-38h]
+    movaps      xmm7,[r11-28h]
+    pand        xmm3,xmm10
+    paddw       xmm15,xmm3
+    psubw       xmm2,xmm3
+    movaps      xmm10,[r11-58h]
+    packuswb    xmm12,xmm15
+    movaps      xmm15,[rsp+20h]
+    packuswb    xmm14,xmm2
+    movq        [rcx],xmm12
+    movq        [rbx],xmm14
+    psrldq      xmm12,8
+    psrldq      xmm14,8
+    movq        [rdx],xmm12
+    movaps      xmm12,[r11-78h]
+    movq        [rdi],xmm14
+    movaps      xmm14,[rsp+30h]
+    mov         rsp,r11
+    pop         rbp
+    pop         rbx
+    ret
 
 WELS_EXTERN DeblockChromaEq4V_ssse3
-  mov         rax,rsp
-  push        rbx
-  push        rbp
+    mov         rax,rsp
+    push        rbx
+    push        rbp
 
-  mov         rbp, r8
-  mov         r8, rdx
-  mov         r9, rcx
-  mov         rcx, rdi
-  mov         rdx, rsi
+    mov         rbp, r8
+    mov         r8, rdx
+    mov         r9, rcx
+    mov         rcx, rdi
+    mov         rdx, rsi
 
-  sub         rsp,90h
-  pxor        xmm1,xmm1
-  mov         r11,rcx
-  mov         rbx,rdx
-  mov         r10d,r9d
-  movq        xmm13,[r11]
-  lea         eax,[r8+r8]
-  movsxd      r9,eax
-  mov         rax,rcx
-  sub         rax,r9
-  movq        xmm14,[rax]
-  mov         rax,rdx
-  sub         rax,r9
-  movq        xmm0,[rax]
-  movsxd      rax,r8d
-  sub         rcx,rax
-  sub         rdx,rax
-  movq        xmm12,[rax+r11]
-  movq        xmm10,[rcx]
-  punpcklqdq  xmm14,xmm0
-  movdqa      xmm8,xmm14
-  movq        xmm0,[rdx]
-  punpcklbw   xmm8,xmm1
-  punpckhbw   xmm14,xmm1
-  punpcklqdq  xmm10,xmm0
-  movq        xmm0,[rbx]
-  movdqa      xmm5,xmm10
-  punpcklqdq  xmm13,xmm0
-  movq        xmm0, [rax+rbx]
-  punpcklbw   xmm5,xmm1
-  movsx       eax,r10w
-  movdqa      xmm9,xmm13
-  punpcklqdq  xmm12,xmm0
-  punpcklbw   xmm9,xmm1
-  punpckhbw   xmm10,xmm1
-  movd        xmm0,eax
-  mov         eax, ebp   ; iBeta
-  punpckhbw   xmm13,xmm1
-  movdqa      xmm7,xmm12
-  punpcklwd   xmm0,xmm0
-  punpckhbw   xmm12,xmm1
-  pshufd      xmm11,xmm0,0
-  punpcklbw   xmm7,xmm1
-  movd        xmm0,eax
-  movdqa      xmm1,xmm8
-  psubw       xmm1,xmm5
-  punpcklwd   xmm0,xmm0
-  movdqa      xmm6,xmm11
-  pshufd      xmm3,xmm0,0
-  movdqa      xmm0,xmm5
-  psubw       xmm0,xmm9
-  movdqa      xmm2,xmm3
-  pabsw       xmm0,xmm0
-  pcmpgtw     xmm6,xmm0
-  pabsw       xmm0,xmm1
-  movdqa      xmm1,xmm3
-  pcmpgtw     xmm2,xmm0
-  pand        xmm6,xmm2
-  movdqa      xmm0,xmm7
-  movdqa      xmm2,xmm3
-  psubw       xmm0,xmm9
-  pabsw       xmm0,xmm0
-  pcmpgtw     xmm1,xmm0
-  pand        xmm6,xmm1
-  movdqa      xmm0,xmm10
-  movdqa      xmm1,xmm14
-  psubw       xmm0,xmm13
-  psubw       xmm1,xmm10
-  pabsw       xmm0,xmm0
-  pcmpgtw     xmm11,xmm0
-  pabsw       xmm0,xmm1
-  pcmpgtw     xmm2,xmm0
-  pand        xmm11,xmm2
-  movdqa      xmm0,xmm12
-  movdqa      xmm4,xmm6
-  movdqa      xmm1,xmm8
-  mov         eax,2
-  cwde
-  paddw       xmm1,xmm8
-  psubw       xmm0,xmm13
-  paddw       xmm1,xmm5
-  pabsw       xmm0,xmm0
-  movdqa      xmm2,xmm14
-  paddw       xmm1,xmm7
-  pcmpgtw     xmm3,xmm0
-  paddw       xmm2,xmm14
-  movd        xmm0,eax
-  pand        xmm11,xmm3
-  paddw       xmm7,xmm7
-  paddw       xmm2,xmm10
-  punpcklwd   xmm0,xmm0
-  paddw       xmm2,xmm12
-  paddw       xmm12,xmm12
-  pshufd      xmm3,xmm0,0
-  paddw       xmm7,xmm9
-  paddw       xmm12,xmm13
-  movdqa      xmm0,xmm6
-  paddw       xmm1,xmm3
-  pandn       xmm0,xmm5
-  paddw       xmm7,xmm8
-  psraw       xmm1,2
-  paddw       xmm12,xmm14
-  paddw       xmm7,xmm3
-  ;movaps      xmm14,[rsp]
-  pand        xmm4,xmm1
-  paddw       xmm12,xmm3
-  psraw       xmm7,2
-  movdqa      xmm1,xmm11
-  por         xmm4,xmm0
-  psraw       xmm12,2
-  paddw       xmm2,xmm3
-  movdqa      xmm0,xmm11
-  pandn       xmm0,xmm10
-  psraw       xmm2,2
-  pand        xmm1,xmm2
-  por         xmm1,xmm0
-  packuswb    xmm4,xmm1
-  movdqa      xmm0,xmm11
-  movdqa      xmm1,xmm6
-  pand        xmm1,xmm7
-  movq        [rcx],xmm4
-  pandn       xmm6,xmm9
-  pandn       xmm11,xmm13
-  pand        xmm0,xmm12
-  por         xmm1,xmm6
-  por         xmm0,xmm11
-  psrldq      xmm4,8
-  packuswb    xmm1,xmm0
-  movq        [r11],xmm1
-  psrldq      xmm1,8
-  movq        [rdx],xmm4
-  lea         r11,[rsp+90h]
-  movq        [rbx],xmm1
-  mov         rsp,r11
-  pop         rbp
-  pop         rbx
-  ret
+    sub         rsp,90h
+    pxor        xmm1,xmm1
+    mov         r11,rcx
+    mov         rbx,rdx
+    mov         r10d,r9d
+    movq        xmm13,[r11]
+    lea         eax,[r8+r8]
+    movsxd      r9,eax
+    mov         rax,rcx
+    sub         rax,r9
+    movq        xmm14,[rax]
+    mov         rax,rdx
+    sub         rax,r9
+    movq        xmm0,[rax]
+    movsxd      rax,r8d
+    sub         rcx,rax
+    sub         rdx,rax
+    movq        xmm12,[rax+r11]
+    movq        xmm10,[rcx]
+    punpcklqdq  xmm14,xmm0
+    movdqa      xmm8,xmm14
+    movq        xmm0,[rdx]
+    punpcklbw   xmm8,xmm1
+    punpckhbw   xmm14,xmm1
+    punpcklqdq  xmm10,xmm0
+    movq        xmm0,[rbx]
+    movdqa      xmm5,xmm10
+    punpcklqdq  xmm13,xmm0
+    movq        xmm0, [rax+rbx]
+    punpcklbw   xmm5,xmm1
+    movsx       eax,r10w
+    movdqa      xmm9,xmm13
+    punpcklqdq  xmm12,xmm0
+    punpcklbw   xmm9,xmm1
+    punpckhbw   xmm10,xmm1
+    movd        xmm0,eax
+    mov         eax, ebp   ; iBeta
+    punpckhbw   xmm13,xmm1
+    movdqa      xmm7,xmm12
+    punpcklwd   xmm0,xmm0
+    punpckhbw   xmm12,xmm1
+    pshufd      xmm11,xmm0,0
+    punpcklbw   xmm7,xmm1
+    movd        xmm0,eax
+    movdqa      xmm1,xmm8
+    psubw       xmm1,xmm5
+    punpcklwd   xmm0,xmm0
+    movdqa      xmm6,xmm11
+    pshufd      xmm3,xmm0,0
+    movdqa      xmm0,xmm5
+    psubw       xmm0,xmm9
+    movdqa      xmm2,xmm3
+    pabsw       xmm0,xmm0
+    pcmpgtw     xmm6,xmm0
+    pabsw       xmm0,xmm1
+    movdqa      xmm1,xmm3
+    pcmpgtw     xmm2,xmm0
+    pand        xmm6,xmm2
+    movdqa      xmm0,xmm7
+    movdqa      xmm2,xmm3
+    psubw       xmm0,xmm9
+    pabsw       xmm0,xmm0
+    pcmpgtw     xmm1,xmm0
+    pand        xmm6,xmm1
+    movdqa      xmm0,xmm10
+    movdqa      xmm1,xmm14
+    psubw       xmm0,xmm13
+    psubw       xmm1,xmm10
+    pabsw       xmm0,xmm0
+    pcmpgtw     xmm11,xmm0
+    pabsw       xmm0,xmm1
+    pcmpgtw     xmm2,xmm0
+    pand        xmm11,xmm2
+    movdqa      xmm0,xmm12
+    movdqa      xmm4,xmm6
+    movdqa      xmm1,xmm8
+    mov         eax,2
+    cwde
+    paddw       xmm1,xmm8
+    psubw       xmm0,xmm13
+    paddw       xmm1,xmm5
+    pabsw       xmm0,xmm0
+    movdqa      xmm2,xmm14
+    paddw       xmm1,xmm7
+    pcmpgtw     xmm3,xmm0
+    paddw       xmm2,xmm14
+    movd        xmm0,eax
+    pand        xmm11,xmm3
+    paddw       xmm7,xmm7
+    paddw       xmm2,xmm10
+    punpcklwd   xmm0,xmm0
+    paddw       xmm2,xmm12
+    paddw       xmm12,xmm12
+    pshufd      xmm3,xmm0,0
+    paddw       xmm7,xmm9
+    paddw       xmm12,xmm13
+    movdqa      xmm0,xmm6
+    paddw       xmm1,xmm3
+    pandn       xmm0,xmm5
+    paddw       xmm7,xmm8
+    psraw       xmm1,2
+    paddw       xmm12,xmm14
+    paddw       xmm7,xmm3
+    ;movaps      xmm14,[rsp]
+    pand        xmm4,xmm1
+    paddw       xmm12,xmm3
+    psraw       xmm7,2
+    movdqa      xmm1,xmm11
+    por         xmm4,xmm0
+    psraw       xmm12,2
+    paddw       xmm2,xmm3
+    movdqa      xmm0,xmm11
+    pandn       xmm0,xmm10
+    psraw       xmm2,2
+    pand        xmm1,xmm2
+    por         xmm1,xmm0
+    packuswb    xmm4,xmm1
+    movdqa      xmm0,xmm11
+    movdqa      xmm1,xmm6
+    pand        xmm1,xmm7
+    movq        [rcx],xmm4
+    pandn       xmm6,xmm9
+    pandn       xmm11,xmm13
+    pand        xmm0,xmm12
+    por         xmm1,xmm6
+    por         xmm0,xmm11
+    psrldq      xmm4,8
+    packuswb    xmm1,xmm0
+    movq        [r11],xmm1
+    psrldq      xmm1,8
+    movq        [rdx],xmm4
+    lea         r11,[rsp+90h]
+    movq        [rbx],xmm1
+    mov         rsp,r11
+    pop         rbp
+    pop         rbx
+    ret
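
The chroma bS = 4 path is simpler: only p0 and q0 are rewritten, each as a 3-tap average of their neighbours, which is why DeblockChromaEq4V_ssse3 above needs no tc clipping at all. A minimal sketch with spec-style names, assuming the alpha/beta edge gate (the pcmpgtw masks in the asm) has passed:

    /* Illustrative chroma bS==4 core (H.264 clause 8.7.2.4); names are assumptions. */
    #include <stdint.h>

    /* pPix points at q0 along one edge column. */
    void FilterChromaEq4 (uint8_t* pPix, int iStride) {
        int p1 = pPix[-2 * iStride], p0 = pPix[-iStride];
        int q0 = pPix[0],            q1 = pPix[iStride];
        pPix[-iStride] = (uint8_t) ((2 * p1 + p0 + q1 + 2) >> 2); /* p0' */
        pPix[0]        = (uint8_t) ((2 * q1 + q0 + p1 + 2) >> 2); /* q0' */
    }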
 
 WELS_EXTERN DeblockChromaEq4H_ssse3
-  mov         rax,rsp
-  push        rbx
-  push        rbp
-  push        r12
+    mov         rax,rsp
+    push        rbx
+    push        rbp
+    push        r12
 
-  mov         rbp,   r8
-  mov         r8,    rdx
-  mov         r9,    rcx
-  mov         rcx,   rdi
-  mov         rdx,   rsi
-  mov         rdi,   rdx
+    mov         rbp,   r8
+    mov         r8,    rdx
+    mov         r9,    rcx
+    mov         rcx,   rdi
+    mov         rdx,   rsi
+    mov         rdi,   rdx
 
-  sub         rsp,140h
-  lea         eax,[r8*4]
-  movsxd      r10,eax
-  mov         eax,[rcx-2]
-  mov         [rsp+10h],eax
-  lea         rbx,[r10+rdx-2]
-  lea         r11,[r10+rcx-2]
+    sub         rsp,140h
+    lea         eax,[r8*4]
+    movsxd      r10,eax
+    mov         eax,[rcx-2]
+    mov         [rsp+10h],eax
+    lea         rbx,[r10+rdx-2]
+    lea         r11,[r10+rcx-2]
 
-  movdqa      xmm5,[rsp+10h]
-  movsxd      r10,r8d
-  mov         eax,[r10+rcx-2]
-  lea         rdx,[r10+r10*2]
-  mov         [rsp+20h],eax
-  mov         eax,[rcx+r10*2-2]
-  mov         [rsp+30h],eax
-  mov         eax,[rdx+rcx-2]
-  movdqa      xmm2,[rsp+20h]
-  mov         [rsp+40h],eax
-  mov         eax, [rdi-2]
-  movdqa      xmm4,[rsp+30h]
-  mov         [rsp+50h],eax
-  mov         eax,[r10+rdi-2]
-  movdqa      xmm3,[rsp+40h]
-  mov         [rsp+60h],eax
-  mov         eax,[rdi+r10*2-2]
-  punpckldq   xmm5,[rsp+50h]
-  mov         [rsp+70h],eax
-  mov         eax, [rdx+rdi-2]
-  punpckldq   xmm2, [rsp+60h]
-  mov          [rsp+80h],eax
-  mov         eax,[r11]
-  punpckldq   xmm4, [rsp+70h]
-  mov         [rsp+50h],eax
-  mov         eax,[rbx]
-  punpckldq   xmm3,[rsp+80h]
-  mov         [rsp+60h],eax
-  mov         eax,[r10+r11]
-  movdqa      xmm0, [rsp+50h]
-  punpckldq   xmm0, [rsp+60h]
-  punpcklqdq  xmm5,xmm0
-  movdqa      [rsp+50h],xmm0
-  mov         [rsp+50h],eax
-  mov         eax,[r10+rbx]
-  movdqa      xmm0,[rsp+50h]
-  movdqa      xmm1,xmm5
-  mov         [rsp+60h],eax
-  mov         eax,[r11+r10*2]
-  punpckldq   xmm0, [rsp+60h]
-  punpcklqdq  xmm2,xmm0
-  punpcklbw   xmm1,xmm2
-  punpckhbw   xmm5,xmm2
-  movdqa      [rsp+50h],xmm0
-  mov         [rsp+50h],eax
-  mov         eax,[rbx+r10*2]
-  movdqa      xmm0,[rsp+50h]
-  mov         [rsp+60h],eax
-  mov         eax, [rdx+r11]
-  movdqa      xmm15,xmm1
-  punpckldq   xmm0,[rsp+60h]
-  punpcklqdq  xmm4,xmm0
-  movdqa      [rsp+50h],xmm0
-  mov         [rsp+50h],eax
-  mov         eax, [rdx+rbx]
-  movdqa      xmm0,[rsp+50h]
-  mov         [rsp+60h],eax
-  punpckldq   xmm0, [rsp+60h]
-  punpcklqdq  xmm3,xmm0
-  movdqa      xmm0,xmm4
-  punpcklbw   xmm0,xmm3
-  punpckhbw   xmm4,xmm3
-  punpcklwd   xmm15,xmm0
-  punpckhwd   xmm1,xmm0
-  movdqa      xmm0,xmm5
-  movdqa      xmm12,xmm15
-  punpcklwd   xmm0,xmm4
-  punpckhwd   xmm5,xmm4
-  punpckldq   xmm12,xmm0
-  punpckhdq   xmm15,xmm0
-  movdqa      xmm0,xmm1
-  movdqa      xmm11,xmm12
-  punpckldq   xmm0,xmm5
-  punpckhdq   xmm1,xmm5
-  punpcklqdq  xmm11,xmm0
-  punpckhqdq  xmm12,xmm0
-  movsx       eax,r9w
-  movdqa      xmm14,xmm15
-  punpcklqdq  xmm14,xmm1
-  punpckhqdq  xmm15,xmm1
-  pxor        xmm1,xmm1
-  movd        xmm0,eax
-  movdqa      xmm4,xmm12
-  movdqa      xmm8,xmm11
-  mov         eax, ebp ; iBeta
-  punpcklwd   xmm0,xmm0
-  punpcklbw   xmm4,xmm1
-  punpckhbw   xmm12,xmm1
-  movdqa      xmm9,xmm14
-  movdqa      xmm7,xmm15
-  movdqa      xmm10,xmm15
-  pshufd      xmm13,xmm0,0
-  punpcklbw   xmm9,xmm1
-  punpckhbw   xmm14,xmm1
-  movdqa      xmm6,xmm13
-  movd        xmm0,eax
-  movdqa      [rsp],xmm11
-  mov         eax,2
-  cwde
-  punpckhbw   xmm11,xmm1
-  punpckhbw   xmm10,xmm1
-  punpcklbw   xmm7,xmm1
-  punpcklwd   xmm0,xmm0
-  punpcklbw   xmm8,xmm1
-  pshufd      xmm3,xmm0,0
-  movdqa      xmm1,xmm8
-  movdqa      xmm0,xmm4
-  psubw       xmm0,xmm9
-  psubw       xmm1,xmm4
-  movdqa      xmm2,xmm3
-  pabsw       xmm0,xmm0
-  pcmpgtw     xmm6,xmm0
-  pabsw       xmm0,xmm1
-  movdqa      xmm1,xmm3
-  pcmpgtw     xmm2,xmm0
-  pand        xmm6,xmm2
-  movdqa      xmm0,xmm7
-  movdqa      xmm2,xmm3
-  psubw       xmm0,xmm9
-  pabsw       xmm0,xmm0
-  pcmpgtw     xmm1,xmm0
-  pand        xmm6,xmm1
-  movdqa      xmm0,xmm12
-  movdqa      xmm1,xmm11
-  psubw       xmm0,xmm14
-  psubw       xmm1,xmm12
-  movdqa      xmm5,xmm6
-  pabsw       xmm0,xmm0
-  pcmpgtw     xmm13,xmm0
-  pabsw       xmm0,xmm1
-  movdqa      xmm1,xmm8
-  pcmpgtw     xmm2,xmm0
-  paddw       xmm1,xmm8
-  movdqa      xmm0,xmm10
-  pand        xmm13,xmm2
-  psubw       xmm0,xmm14
-  paddw       xmm1,xmm4
-  movdqa      xmm2,xmm11
-  pabsw       xmm0,xmm0
-  paddw       xmm2,xmm11
-  paddw       xmm1,xmm7
-  pcmpgtw     xmm3,xmm0
-  paddw       xmm2,xmm12
-  movd        xmm0,eax
-  pand        xmm13,xmm3
-  paddw       xmm2,xmm10
-  punpcklwd   xmm0,xmm0
-  pshufd      xmm3,xmm0,0
-  movdqa      xmm0,xmm6
-  paddw       xmm1,xmm3
-  pandn       xmm0,xmm4
-  paddw       xmm2,xmm3
-  psraw       xmm1,2
-  pand        xmm5,xmm1
-  por         xmm5,xmm0
-  paddw       xmm7,xmm7
-  paddw       xmm10,xmm10
-  psraw       xmm2,2
-  movdqa      xmm1,xmm13
-  movdqa      xmm0,xmm13
-  pandn       xmm0,xmm12
-  pand        xmm1,xmm2
-  paddw       xmm7,xmm9
-  por         xmm1,xmm0
-  paddw       xmm10,xmm14
-  paddw       xmm7,xmm8
-  movdqa      xmm0,xmm13
-  packuswb    xmm5,xmm1
-  paddw       xmm7,xmm3
-  paddw       xmm10,xmm11
-  movdqa      xmm1,xmm6
-  paddw       xmm10,xmm3
-  pandn       xmm6,xmm9
-  psraw       xmm7,2
-  pand        xmm1,xmm7
-  psraw       xmm10,2
-  pandn       xmm13,xmm14
-  pand        xmm0,xmm10
-  por         xmm1,xmm6
-  movdqa      xmm6,[rsp]
-  movdqa      xmm4,xmm6
-  por         xmm0,xmm13
-  punpcklbw   xmm4,xmm5
-  punpckhbw   xmm6,xmm5
-  movdqa      xmm3,xmm4
-  packuswb    xmm1,xmm0
-  movdqa      xmm0,xmm1
-  punpckhbw   xmm1,xmm15
-  punpcklbw   xmm0,xmm15
-  punpcklwd   xmm3,xmm0
-  punpckhwd   xmm4,xmm0
-  movdqa      xmm0,xmm6
-  movdqa      xmm2,xmm3
-  punpcklwd   xmm0,xmm1
-  punpckhwd   xmm6,xmm1
-  movdqa      xmm1,xmm4
-  punpckldq   xmm2,xmm0
-  punpckhdq   xmm3,xmm0
-  punpckldq   xmm1,xmm6
-  movdqa      xmm0,xmm2
-  punpcklqdq  xmm0,xmm1
-  punpckhdq   xmm4,xmm6
-  punpckhqdq  xmm2,xmm1
-  movdqa      [rsp+10h],xmm0
-  movdqa      [rsp+60h],xmm2
-  movdqa      xmm0,xmm3
-  mov         eax,[rsp+10h]
-  mov         [rcx-2],eax
-  mov         eax,[rsp+60h]
-  punpcklqdq  xmm0,xmm4
-  punpckhqdq  xmm3,xmm4
-  mov         [r10+rcx-2],eax
-  movdqa      [rsp+20h],xmm0
-  mov         eax, [rsp+20h]
-  movdqa      [rsp+70h],xmm3
-  mov         [rcx+r10*2-2],eax
-  mov         eax,[rsp+70h]
-  mov         [rdx+rcx-2],eax
-  mov         eax,[rsp+18h]
-  mov         [r11],eax
-  mov         eax,[rsp+68h]
-  mov         [r10+r11],eax
-  mov         eax,[rsp+28h]
-  mov         [r11+r10*2],eax
-  mov         eax,[rsp+78h]
-  mov         [rdx+r11],eax
-  mov         eax,[rsp+14h]
-  mov         [rdi-2],eax
-  mov         eax,[rsp+64h]
-  mov         [r10+rdi-2],eax
-  mov         eax,[rsp+24h]
-  mov         [rdi+r10*2-2],eax
-  mov         eax, [rsp+74h]
-  mov         [rdx+rdi-2],eax
-  mov         eax, [rsp+1Ch]
-  mov         [rbx],eax
-  mov         eax, [rsp+6Ch]
-  mov         [r10+rbx],eax
-  mov         eax,[rsp+2Ch]
-  mov         [rbx+r10*2],eax
-  mov         eax,[rsp+7Ch]
-  mov         [rdx+rbx],eax
-  lea         r11,[rsp+140h]
-  mov         rbx, [r11+28h]
-  mov         rsp,r11
-  pop         r12
-  pop         rbp
-  pop         rbx
-  ret
+    movdqa      xmm5,[rsp+10h]
+    movsxd      r10,r8d
+    mov         eax,[r10+rcx-2]
+    lea         rdx,[r10+r10*2]
+    mov         [rsp+20h],eax
+    mov         eax,[rcx+r10*2-2]
+    mov         [rsp+30h],eax
+    mov         eax,[rdx+rcx-2]
+    movdqa      xmm2,[rsp+20h]
+    mov         [rsp+40h],eax
+    mov         eax, [rdi-2]
+    movdqa      xmm4,[rsp+30h]
+    mov         [rsp+50h],eax
+    mov         eax,[r10+rdi-2]
+    movdqa      xmm3,[rsp+40h]
+    mov         [rsp+60h],eax
+    mov         eax,[rdi+r10*2-2]
+    punpckldq   xmm5,[rsp+50h]
+    mov         [rsp+70h],eax
+    mov         eax, [rdx+rdi-2]
+    punpckldq   xmm2, [rsp+60h]
+    mov          [rsp+80h],eax
+    mov         eax,[r11]
+    punpckldq   xmm4, [rsp+70h]
+    mov         [rsp+50h],eax
+    mov         eax,[rbx]
+    punpckldq   xmm3,[rsp+80h]
+    mov         [rsp+60h],eax
+    mov         eax,[r10+r11]
+    movdqa      xmm0, [rsp+50h]
+    punpckldq   xmm0, [rsp+60h]
+    punpcklqdq  xmm5,xmm0
+    movdqa      [rsp+50h],xmm0
+    mov         [rsp+50h],eax
+    mov         eax,[r10+rbx]
+    movdqa      xmm0,[rsp+50h]
+    movdqa      xmm1,xmm5
+    mov         [rsp+60h],eax
+    mov         eax,[r11+r10*2]
+    punpckldq   xmm0, [rsp+60h]
+    punpcklqdq  xmm2,xmm0
+    punpcklbw   xmm1,xmm2
+    punpckhbw   xmm5,xmm2
+    movdqa      [rsp+50h],xmm0
+    mov         [rsp+50h],eax
+    mov         eax,[rbx+r10*2]
+    movdqa      xmm0,[rsp+50h]
+    mov         [rsp+60h],eax
+    mov         eax, [rdx+r11]
+    movdqa      xmm15,xmm1
+    punpckldq   xmm0,[rsp+60h]
+    punpcklqdq  xmm4,xmm0
+    movdqa      [rsp+50h],xmm0
+    mov         [rsp+50h],eax
+    mov         eax, [rdx+rbx]
+    movdqa      xmm0,[rsp+50h]
+    mov         [rsp+60h],eax
+    punpckldq   xmm0, [rsp+60h]
+    punpcklqdq  xmm3,xmm0
+    movdqa      xmm0,xmm4
+    punpcklbw   xmm0,xmm3
+    punpckhbw   xmm4,xmm3
+    punpcklwd   xmm15,xmm0
+    punpckhwd   xmm1,xmm0
+    movdqa      xmm0,xmm5
+    movdqa      xmm12,xmm15
+    punpcklwd   xmm0,xmm4
+    punpckhwd   xmm5,xmm4
+    punpckldq   xmm12,xmm0
+    punpckhdq   xmm15,xmm0
+    movdqa      xmm0,xmm1
+    movdqa      xmm11,xmm12
+    punpckldq   xmm0,xmm5
+    punpckhdq   xmm1,xmm5
+    punpcklqdq  xmm11,xmm0
+    punpckhqdq  xmm12,xmm0
+    movsx       eax,r9w
+    movdqa      xmm14,xmm15
+    punpcklqdq  xmm14,xmm1
+    punpckhqdq  xmm15,xmm1
+    pxor        xmm1,xmm1
+    movd        xmm0,eax
+    movdqa      xmm4,xmm12
+    movdqa      xmm8,xmm11
+    mov         eax, ebp ; iBeta
+    punpcklwd   xmm0,xmm0
+    punpcklbw   xmm4,xmm1
+    punpckhbw   xmm12,xmm1
+    movdqa      xmm9,xmm14
+    movdqa      xmm7,xmm15
+    movdqa      xmm10,xmm15
+    pshufd      xmm13,xmm0,0
+    punpcklbw   xmm9,xmm1
+    punpckhbw   xmm14,xmm1
+    movdqa      xmm6,xmm13
+    movd        xmm0,eax
+    movdqa      [rsp],xmm11
+    mov         eax,2
+    cwde
+    punpckhbw   xmm11,xmm1
+    punpckhbw   xmm10,xmm1
+    punpcklbw   xmm7,xmm1
+    punpcklwd   xmm0,xmm0
+    punpcklbw   xmm8,xmm1
+    pshufd      xmm3,xmm0,0
+    movdqa      xmm1,xmm8
+    movdqa      xmm0,xmm4
+    psubw       xmm0,xmm9
+    psubw       xmm1,xmm4
+    movdqa      xmm2,xmm3
+    pabsw       xmm0,xmm0
+    pcmpgtw     xmm6,xmm0
+    pabsw       xmm0,xmm1
+    movdqa      xmm1,xmm3
+    pcmpgtw     xmm2,xmm0
+    pand        xmm6,xmm2
+    movdqa      xmm0,xmm7
+    movdqa      xmm2,xmm3
+    psubw       xmm0,xmm9
+    pabsw       xmm0,xmm0
+    pcmpgtw     xmm1,xmm0
+    pand        xmm6,xmm1
+    movdqa      xmm0,xmm12
+    movdqa      xmm1,xmm11
+    psubw       xmm0,xmm14
+    psubw       xmm1,xmm12
+    movdqa      xmm5,xmm6
+    pabsw       xmm0,xmm0
+    pcmpgtw     xmm13,xmm0
+    pabsw       xmm0,xmm1
+    movdqa      xmm1,xmm8
+    pcmpgtw     xmm2,xmm0
+    paddw       xmm1,xmm8
+    movdqa      xmm0,xmm10
+    pand        xmm13,xmm2
+    psubw       xmm0,xmm14
+    paddw       xmm1,xmm4
+    movdqa      xmm2,xmm11
+    pabsw       xmm0,xmm0
+    paddw       xmm2,xmm11
+    paddw       xmm1,xmm7
+    pcmpgtw     xmm3,xmm0
+    paddw       xmm2,xmm12
+    movd        xmm0,eax
+    pand        xmm13,xmm3
+    paddw       xmm2,xmm10
+    punpcklwd   xmm0,xmm0
+    pshufd      xmm3,xmm0,0
+    movdqa      xmm0,xmm6
+    paddw       xmm1,xmm3
+    pandn       xmm0,xmm4
+    paddw       xmm2,xmm3
+    psraw       xmm1,2
+    pand        xmm5,xmm1
+    por         xmm5,xmm0
+    paddw       xmm7,xmm7
+    paddw       xmm10,xmm10
+    psraw       xmm2,2
+    movdqa      xmm1,xmm13
+    movdqa      xmm0,xmm13
+    pandn       xmm0,xmm12
+    pand        xmm1,xmm2
+    paddw       xmm7,xmm9
+    por         xmm1,xmm0
+    paddw       xmm10,xmm14
+    paddw       xmm7,xmm8
+    movdqa      xmm0,xmm13
+    packuswb    xmm5,xmm1
+    paddw       xmm7,xmm3
+    paddw       xmm10,xmm11
+    movdqa      xmm1,xmm6
+    paddw       xmm10,xmm3
+    pandn       xmm6,xmm9
+    psraw       xmm7,2
+    pand        xmm1,xmm7
+    psraw       xmm10,2
+    pandn       xmm13,xmm14
+    pand        xmm0,xmm10
+    por         xmm1,xmm6
+    movdqa      xmm6,[rsp]
+    movdqa      xmm4,xmm6
+    por         xmm0,xmm13
+    punpcklbw   xmm4,xmm5
+    punpckhbw   xmm6,xmm5
+    movdqa      xmm3,xmm4
+    packuswb    xmm1,xmm0
+    movdqa      xmm0,xmm1
+    punpckhbw   xmm1,xmm15
+    punpcklbw   xmm0,xmm15
+    punpcklwd   xmm3,xmm0
+    punpckhwd   xmm4,xmm0
+    movdqa      xmm0,xmm6
+    movdqa      xmm2,xmm3
+    punpcklwd   xmm0,xmm1
+    punpckhwd   xmm6,xmm1
+    movdqa      xmm1,xmm4
+    punpckldq   xmm2,xmm0
+    punpckhdq   xmm3,xmm0
+    punpckldq   xmm1,xmm6
+    movdqa      xmm0,xmm2
+    punpcklqdq  xmm0,xmm1
+    punpckhdq   xmm4,xmm6
+    punpckhqdq  xmm2,xmm1
+    movdqa      [rsp+10h],xmm0
+    movdqa      [rsp+60h],xmm2
+    movdqa      xmm0,xmm3
+    mov         eax,[rsp+10h]
+    mov         [rcx-2],eax
+    mov         eax,[rsp+60h]
+    punpcklqdq  xmm0,xmm4
+    punpckhqdq  xmm3,xmm4
+    mov         [r10+rcx-2],eax
+    movdqa      [rsp+20h],xmm0
+    mov         eax, [rsp+20h]
+    movdqa      [rsp+70h],xmm3
+    mov         [rcx+r10*2-2],eax
+    mov         eax,[rsp+70h]
+    mov         [rdx+rcx-2],eax
+    mov         eax,[rsp+18h]
+    mov         [r11],eax
+    mov         eax,[rsp+68h]
+    mov         [r10+r11],eax
+    mov         eax,[rsp+28h]
+    mov         [r11+r10*2],eax
+    mov         eax,[rsp+78h]
+    mov         [rdx+r11],eax
+    mov         eax,[rsp+14h]
+    mov         [rdi-2],eax
+    mov         eax,[rsp+64h]
+    mov         [r10+rdi-2],eax
+    mov         eax,[rsp+24h]
+    mov         [rdi+r10*2-2],eax
+    mov         eax, [rsp+74h]
+    mov         [rdx+rdi-2],eax
+    mov         eax, [rsp+1Ch]
+    mov         [rbx],eax
+    mov         eax, [rsp+6Ch]
+    mov         [r10+rbx],eax
+    mov         eax,[rsp+2Ch]
+    mov         [rbx+r10*2],eax
+    mov         eax,[rsp+7Ch]
+    mov         [rdx+rbx],eax
+    lea         r11,[rsp+140h]
+    mov         rbx, [r11+28h]
+    mov         rsp,r11
+    pop         r12
+    pop         rbp
+    pop         rbx
+    ret
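The DeblockChromaEq4H_ssse3 body above first transposes 4-byte columns read at an offset of -2 from the edge so the vertical edge lies along the register lanes, applies the bS==4 chroma filter with the paddw/psraw-by-2 sequences, then transposes back and scatters the rows. A scalar sketch of that arithmetic, assuming the standard H.264 chroma strong filter in which only p0 and q0 are replaced (the function name is illustrative):

    #include <stdint.h>

    /* Illustrative scalar form of the bS==4 chroma filter evaluated by the
       paddw/psraw,2 sequences above: p0 and q0 become rounded averages,
       p1 and q1 are left untouched. */
    static void wels_chroma_eq4_sample(uint8_t p1, uint8_t *p0,
                                       uint8_t *q0, uint8_t q1) {
        uint8_t iP0 = (uint8_t)((2 * p1 + *p0 + q1 + 2) >> 2);
        uint8_t iQ0 = (uint8_t)((2 * q1 + *q0 + p1 + 2) >> 2);
        *p0 = iP0;
        *q0 = iQ0;
    }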
 
 
 WELS_EXTERN DeblockChromaLt4H_ssse3
-  mov         rax,rsp
-  push        rbx
-  push        rbp
-  push        r12
-  push        r13
-  push        r14
-  sub         rsp,170h
+    mov         rax,rsp
+    push        rbx
+    push        rbp
+    push        r12
+    push        r13
+    push        r14
+    sub         rsp,170h
 
-  mov         r13,   r8
-  mov         r14,   r9
-  mov         r8,    rdx
-  mov         r9,    rcx
-  mov         rdx,   rdi
-  mov         rcx,   rsi
+    mov         r13,   r8
+    mov         r14,   r9
+    mov         r8,    rdx
+    mov         r9,    rcx
+    mov         rdx,   rdi
+    mov         rcx,   rsi
 
-  movsxd      rsi,r8d
-  lea         eax,[r8*4]
-  mov         r11d,r9d
-  movsxd      r10,eax
-  mov         eax, [rcx-2]
-  mov         r12,rdx
-  mov         [rsp+40h],eax
-  mov         eax, [rsi+rcx-2]
-  lea         rbx,[r10+rcx-2]
-  movdqa      xmm5,[rsp+40h]
-  mov         [rsp+50h],eax
-  mov         eax, [rcx+rsi*2-2]
-  lea         rbp,[r10+rdx-2]
-  movdqa      xmm2, [rsp+50h]
-  mov         [rsp+60h],eax
-  lea         r10,[rsi+rsi*2]
-  mov         rdi,rcx
-  mov         eax,[r10+rcx-2]
-  movdqa      xmm4,[rsp+60h]
-  mov         [rsp+70h],eax
-  mov         eax,[rdx-2]
-  mov         [rsp+80h],eax
-  mov         eax, [rsi+rdx-2]
-  movdqa      xmm3,[rsp+70h]
-  mov         [rsp+90h],eax
-  mov         eax,[rdx+rsi*2-2]
-  punpckldq   xmm5,[rsp+80h]
-  mov         [rsp+0A0h],eax
-  mov         eax, [r10+rdx-2]
-  punpckldq   xmm2,[rsp+90h]
-  mov         [rsp+0B0h],eax
-  mov         eax, [rbx]
-  punpckldq   xmm4,[rsp+0A0h]
-  mov         [rsp+80h],eax
-  mov         eax,[rbp]
-  punpckldq   xmm3,[rsp+0B0h]
-  mov         [rsp+90h],eax
-  mov         eax,[rsi+rbx]
-  movdqa      xmm0,[rsp+80h]
-  punpckldq   xmm0,[rsp+90h]
-  punpcklqdq  xmm5,xmm0
-  movdqa      [rsp+80h],xmm0
-  mov         [rsp+80h],eax
-  mov         eax,[rsi+rbp]
-  movdqa      xmm0,[rsp+80h]
-  movdqa      xmm1,xmm5
-  mov         [rsp+90h],eax
-  mov         eax,[rbx+rsi*2]
-  punpckldq   xmm0,[rsp+90h]
-  punpcklqdq  xmm2,xmm0
-  punpcklbw   xmm1,xmm2
-  punpckhbw   xmm5,xmm2
-  movdqa      [rsp+80h],xmm0
-  mov         [rsp+80h],eax
-  mov         eax,[rbp+rsi*2]
-  movdqa      xmm0, [rsp+80h]
-  mov         [rsp+90h],eax
-  mov         eax,[r10+rbx]
-  movdqa      xmm7,xmm1
-  punpckldq   xmm0,[rsp+90h]
-  punpcklqdq  xmm4,xmm0
-  movdqa      [rsp+80h],xmm0
-  mov         [rsp+80h],eax
-  mov         eax, [r10+rbp]
-  movdqa      xmm0,[rsp+80h]
-  mov         [rsp+90h],eax
-  punpckldq   xmm0,[rsp+90h]
-  punpcklqdq  xmm3,xmm0
-  movdqa      xmm0,xmm4
-  punpcklbw   xmm0,xmm3
-  punpckhbw   xmm4,xmm3
-  punpcklwd   xmm7,xmm0
-  punpckhwd   xmm1,xmm0
-  movdqa      xmm0,xmm5
-  movdqa      xmm6,xmm7
-  punpcklwd   xmm0,xmm4
-  punpckhwd   xmm5,xmm4
-  punpckldq   xmm6,xmm0
-  punpckhdq   xmm7,xmm0
-  movdqa      xmm0,xmm1
-  punpckldq   xmm0,xmm5
-  mov         rax, r14    ; pTC
-  punpckhdq   xmm1,xmm5
-  movdqa      xmm9,xmm6
-  punpckhqdq  xmm6,xmm0
-  punpcklqdq  xmm9,xmm0
-  movdqa      xmm2,xmm7
-  movdqa      xmm13,xmm6
-  movdqa      xmm4,xmm9
-  movdqa      [rsp+10h],xmm9
-  punpcklqdq  xmm2,xmm1
-  punpckhqdq  xmm7,xmm1
-  pxor        xmm1,xmm1
-  movsx       ecx,byte [rax+3]
-  movsx       edx,byte [rax+2]
-  movsx       r8d,byte [rax+1]
-  movsx       r9d,byte [rax]
-  movdqa      xmm10,xmm1
-  movdqa      xmm15,xmm2
-  punpckhbw   xmm2,xmm1
-  punpckhbw   xmm6,xmm1
-  punpcklbw   xmm4,xmm1
-  movsx       eax,r11w
-  mov         word [rsp+0Eh],cx
-  mov         word [rsp+0Ch],cx
-  movdqa      xmm3,xmm7
-  movdqa      xmm8,xmm7
-  movdqa      [rsp+20h],xmm7
-  punpcklbw   xmm15,xmm1
-  punpcklbw   xmm13,xmm1
-  punpcklbw   xmm3,xmm1
-  mov         word [rsp+0Ah],dx
-  mov         word [rsp+8],dx
-  mov         word [rsp+6],r8w
-  movd        xmm0,eax
-  movdqa      [rsp+30h],xmm6
-  punpckhbw   xmm9,xmm1
-  punpckhbw   xmm8,xmm1
-  punpcklwd   xmm0,xmm0
-  mov         eax, r13d   ; iBeta
-  mov         word [rsp+4],r8w
-  mov         word [rsp+2],r9w
-  pshufd      xmm12,xmm0,0
-  mov         word [rsp],r9w
-  movd        xmm0,eax
-  mov         eax,4
-  cwde
-  movdqa      xmm14, [rsp]
-  movdqa      [rsp],xmm2
-  movdqa      xmm2,xmm12
-  punpcklwd   xmm0,xmm0
-  pshufd      xmm11,xmm0,0
-  psubw       xmm10,xmm14
-  movd        xmm0,eax
-  movdqa      xmm7,xmm14
-  movdqa      xmm6,xmm14
-  pcmpgtw     xmm7,xmm1
-  punpcklwd   xmm0,xmm0
-  pshufd      xmm5,xmm0,0
-  movdqa      xmm0,xmm4
-  movdqa      xmm1,xmm15
-  psubw       xmm4,xmm13
-  psubw       xmm0,xmm3
-  psubw       xmm1,xmm13
-  psubw       xmm3,xmm15
-  psllw       xmm1,2
-  paddw       xmm1,xmm0
-  paddw       xmm1,xmm5
-  movdqa      xmm0,xmm10
-  psraw       xmm1,3
-  pmaxsw      xmm0,xmm1
-  pminsw      xmm6,xmm0
-  movdqa      xmm1,xmm11
-  movdqa      xmm0,xmm13
-  psubw       xmm0,xmm15
-  pabsw       xmm0,xmm0
-  pcmpgtw     xmm2,xmm0
-  pabsw       xmm0,xmm4
-  pcmpgtw     xmm1,xmm0
-  pabsw       xmm0,xmm3
-  pand        xmm2,xmm1
-  movdqa      xmm1,xmm11
-  movdqa      xmm3,[rsp+30h]
-  pcmpgtw     xmm1,xmm0
-  movdqa      xmm0,xmm9
-  pand        xmm2,xmm1
-  psubw       xmm0,xmm8
-  psubw       xmm9,xmm3
-  pand        xmm2,xmm7
-  pand        xmm6,xmm2
-  psubw       xmm15,xmm6
-  paddw       xmm13,xmm6
-  movdqa      xmm2,[rsp]
-  movdqa      xmm1,xmm2
-  psubw       xmm1,xmm3
-  psubw       xmm8,xmm2
-  psllw       xmm1,2
-  paddw       xmm1,xmm0
-  paddw       xmm1,xmm5
-  movdqa      xmm0,xmm3
-  movdqa      xmm5,[rsp+10h]
-  psubw       xmm0,xmm2
-  psraw       xmm1,3
-  movdqa      xmm4,xmm5
-  pabsw       xmm0,xmm0
-  pmaxsw      xmm10,xmm1
-  movdqa      xmm1,xmm11
-  pcmpgtw     xmm12,xmm0
-  pabsw       xmm0,xmm9
-  pminsw      xmm14,xmm10
-  pcmpgtw     xmm1,xmm0
-  pabsw       xmm0,xmm8
-  pcmpgtw     xmm11,xmm0
-  pand        xmm12,xmm1
-  movdqa      xmm1,[rsp+20h]
-  pand        xmm12,xmm11
-  pand        xmm12,xmm7
-  pand        xmm14,xmm12
-  paddw       xmm3,xmm14
-  psubw       xmm2,xmm14
-  packuswb    xmm13,xmm3
-  packuswb    xmm15,xmm2
-  punpcklbw   xmm4,xmm13
-  punpckhbw   xmm5,xmm13
-  movdqa      xmm0,xmm15
-  punpcklbw   xmm0,xmm1
-  punpckhbw   xmm15,xmm1
-  movdqa      xmm3,xmm4
-  punpcklwd   xmm3,xmm0
-  punpckhwd   xmm4,xmm0
-  movdqa      xmm0,xmm5
-  movdqa      xmm2,xmm3
-  movdqa      xmm1,xmm4
-  punpcklwd   xmm0,xmm15
-  punpckhwd   xmm5,xmm15
-  punpckldq   xmm2,xmm0
-  punpckhdq   xmm3,xmm0
-  punpckldq   xmm1,xmm5
-  movdqa      xmm0,xmm2
-  punpcklqdq  xmm0,xmm1
-  punpckhdq   xmm4,xmm5
-  punpckhqdq  xmm2,xmm1
-  movdqa      [rsp+40h],xmm0
-  movdqa      xmm0,xmm3
-  movdqa      [rsp+90h],xmm2
-  mov         eax,[rsp+40h]
-  mov         [rdi-2],eax
-  mov         eax, [rsp+90h]
-  punpcklqdq  xmm0,xmm4
-  punpckhqdq  xmm3,xmm4
-  mov         [rsi+rdi-2],eax
-  movdqa      [rsp+50h],xmm0
-  mov         eax,[rsp+50h]
-  movdqa      [rsp+0A0h],xmm3
-  mov         [rdi+rsi*2-2],eax
-  mov         eax,[rsp+0A0h]
-  mov         [r10+rdi-2],eax
-  mov         eax,[rsp+48h]
-  mov         [rbx],eax
-  mov         eax,[rsp+98h]
-  mov         [rsi+rbx],eax
-  mov         eax,[rsp+58h]
-  mov         [rbx+rsi*2],eax
-  mov         eax, [rsp+0A8h]
-  mov         [r10+rbx],eax
-  mov         eax, [rsp+44h]
-  mov         [r12-2],eax
-  mov         eax,[rsp+94h]
-  mov         [rsi+r12-2],eax
-  mov         eax,[rsp+54h]
-  mov         [r12+rsi*2-2],eax
-  mov         eax, [rsp+0A4h]
-  mov         [r10+r12-2],eax
-  mov         eax,[rsp+4Ch]
-  mov         [rbp],eax
-  mov         eax,[rsp+9Ch]
-  mov         [rsi+rbp],eax
-  mov         eax, [rsp+5Ch]
-  mov         [rbp+rsi*2],eax
-  mov         eax,[rsp+0ACh]
-  mov         [r10+rbp],eax
-  lea         r11,[rsp+170h]
-  mov         rsp,r11
-  pop         r14
-  pop         r13
-  pop         r12
-  pop         rbp
-  pop         rbx
-  ret
+    movsxd      rsi,r8d
+    lea         eax,[r8*4]
+    mov         r11d,r9d
+    movsxd      r10,eax
+    mov         eax, [rcx-2]
+    mov         r12,rdx
+    mov         [rsp+40h],eax
+    mov         eax, [rsi+rcx-2]
+    lea         rbx,[r10+rcx-2]
+    movdqa      xmm5,[rsp+40h]
+    mov         [rsp+50h],eax
+    mov         eax, [rcx+rsi*2-2]
+    lea         rbp,[r10+rdx-2]
+    movdqa      xmm2, [rsp+50h]
+    mov         [rsp+60h],eax
+    lea         r10,[rsi+rsi*2]
+    mov         rdi,rcx
+    mov         eax,[r10+rcx-2]
+    movdqa      xmm4,[rsp+60h]
+    mov         [rsp+70h],eax
+    mov         eax,[rdx-2]
+    mov         [rsp+80h],eax
+    mov         eax, [rsi+rdx-2]
+    movdqa      xmm3,[rsp+70h]
+    mov         [rsp+90h],eax
+    mov         eax,[rdx+rsi*2-2]
+    punpckldq   xmm5,[rsp+80h]
+    mov         [rsp+0A0h],eax
+    mov         eax, [r10+rdx-2]
+    punpckldq   xmm2,[rsp+90h]
+    mov         [rsp+0B0h],eax
+    mov         eax, [rbx]
+    punpckldq   xmm4,[rsp+0A0h]
+    mov         [rsp+80h],eax
+    mov         eax,[rbp]
+    punpckldq   xmm3,[rsp+0B0h]
+    mov         [rsp+90h],eax
+    mov         eax,[rsi+rbx]
+    movdqa      xmm0,[rsp+80h]
+    punpckldq   xmm0,[rsp+90h]
+    punpcklqdq  xmm5,xmm0
+    movdqa      [rsp+80h],xmm0
+    mov         [rsp+80h],eax
+    mov         eax,[rsi+rbp]
+    movdqa      xmm0,[rsp+80h]
+    movdqa      xmm1,xmm5
+    mov         [rsp+90h],eax
+    mov         eax,[rbx+rsi*2]
+    punpckldq   xmm0,[rsp+90h]
+    punpcklqdq  xmm2,xmm0
+    punpcklbw   xmm1,xmm2
+    punpckhbw   xmm5,xmm2
+    movdqa      [rsp+80h],xmm0
+    mov         [rsp+80h],eax
+    mov         eax,[rbp+rsi*2]
+    movdqa      xmm0, [rsp+80h]
+    mov         [rsp+90h],eax
+    mov         eax,[r10+rbx]
+    movdqa      xmm7,xmm1
+    punpckldq   xmm0,[rsp+90h]
+    punpcklqdq  xmm4,xmm0
+    movdqa      [rsp+80h],xmm0
+    mov         [rsp+80h],eax
+    mov         eax, [r10+rbp]
+    movdqa      xmm0,[rsp+80h]
+    mov         [rsp+90h],eax
+    punpckldq   xmm0,[rsp+90h]
+    punpcklqdq  xmm3,xmm0
+    movdqa      xmm0,xmm4
+    punpcklbw   xmm0,xmm3
+    punpckhbw   xmm4,xmm3
+    punpcklwd   xmm7,xmm0
+    punpckhwd   xmm1,xmm0
+    movdqa      xmm0,xmm5
+    movdqa      xmm6,xmm7
+    punpcklwd   xmm0,xmm4
+    punpckhwd   xmm5,xmm4
+    punpckldq   xmm6,xmm0
+    punpckhdq   xmm7,xmm0
+    movdqa      xmm0,xmm1
+    punpckldq   xmm0,xmm5
+    mov         rax, r14    ; pTC
+    punpckhdq   xmm1,xmm5
+    movdqa      xmm9,xmm6
+    punpckhqdq  xmm6,xmm0
+    punpcklqdq  xmm9,xmm0
+    movdqa      xmm2,xmm7
+    movdqa      xmm13,xmm6
+    movdqa      xmm4,xmm9
+    movdqa      [rsp+10h],xmm9
+    punpcklqdq  xmm2,xmm1
+    punpckhqdq  xmm7,xmm1
+    pxor        xmm1,xmm1
+    movsx       ecx,byte [rax+3]
+    movsx       edx,byte [rax+2]
+    movsx       r8d,byte [rax+1]
+    movsx       r9d,byte [rax]
+    movdqa      xmm10,xmm1
+    movdqa      xmm15,xmm2
+    punpckhbw   xmm2,xmm1
+    punpckhbw   xmm6,xmm1
+    punpcklbw   xmm4,xmm1
+    movsx       eax,r11w
+    mov         word [rsp+0Eh],cx
+    mov         word [rsp+0Ch],cx
+    movdqa      xmm3,xmm7
+    movdqa      xmm8,xmm7
+    movdqa      [rsp+20h],xmm7
+    punpcklbw   xmm15,xmm1
+    punpcklbw   xmm13,xmm1
+    punpcklbw   xmm3,xmm1
+    mov         word [rsp+0Ah],dx
+    mov         word [rsp+8],dx
+    mov         word [rsp+6],r8w
+    movd        xmm0,eax
+    movdqa      [rsp+30h],xmm6
+    punpckhbw   xmm9,xmm1
+    punpckhbw   xmm8,xmm1
+    punpcklwd   xmm0,xmm0
+    mov         eax, r13d   ; iBeta
+    mov         word [rsp+4],r8w
+    mov         word [rsp+2],r9w
+    pshufd      xmm12,xmm0,0
+    mov         word [rsp],r9w
+    movd        xmm0,eax
+    mov         eax,4
+    cwde
+    movdqa      xmm14, [rsp]
+    movdqa      [rsp],xmm2
+    movdqa      xmm2,xmm12
+    punpcklwd   xmm0,xmm0
+    pshufd      xmm11,xmm0,0
+    psubw       xmm10,xmm14
+    movd        xmm0,eax
+    movdqa      xmm7,xmm14
+    movdqa      xmm6,xmm14
+    pcmpgtw     xmm7,xmm1
+    punpcklwd   xmm0,xmm0
+    pshufd      xmm5,xmm0,0
+    movdqa      xmm0,xmm4
+    movdqa      xmm1,xmm15
+    psubw       xmm4,xmm13
+    psubw       xmm0,xmm3
+    psubw       xmm1,xmm13
+    psubw       xmm3,xmm15
+    psllw       xmm1,2
+    paddw       xmm1,xmm0
+    paddw       xmm1,xmm5
+    movdqa      xmm0,xmm10
+    psraw       xmm1,3
+    pmaxsw      xmm0,xmm1
+    pminsw      xmm6,xmm0
+    movdqa      xmm1,xmm11
+    movdqa      xmm0,xmm13
+    psubw       xmm0,xmm15
+    pabsw       xmm0,xmm0
+    pcmpgtw     xmm2,xmm0
+    pabsw       xmm0,xmm4
+    pcmpgtw     xmm1,xmm0
+    pabsw       xmm0,xmm3
+    pand        xmm2,xmm1
+    movdqa      xmm1,xmm11
+    movdqa      xmm3,[rsp+30h]
+    pcmpgtw     xmm1,xmm0
+    movdqa      xmm0,xmm9
+    pand        xmm2,xmm1
+    psubw       xmm0,xmm8
+    psubw       xmm9,xmm3
+    pand        xmm2,xmm7
+    pand        xmm6,xmm2
+    psubw       xmm15,xmm6
+    paddw       xmm13,xmm6
+    movdqa      xmm2,[rsp]
+    movdqa      xmm1,xmm2
+    psubw       xmm1,xmm3
+    psubw       xmm8,xmm2
+    psllw       xmm1,2
+    paddw       xmm1,xmm0
+    paddw       xmm1,xmm5
+    movdqa      xmm0,xmm3
+    movdqa      xmm5,[rsp+10h]
+    psubw       xmm0,xmm2
+    psraw       xmm1,3
+    movdqa      xmm4,xmm5
+    pabsw       xmm0,xmm0
+    pmaxsw      xmm10,xmm1
+    movdqa      xmm1,xmm11
+    pcmpgtw     xmm12,xmm0
+    pabsw       xmm0,xmm9
+    pminsw      xmm14,xmm10
+    pcmpgtw     xmm1,xmm0
+    pabsw       xmm0,xmm8
+    pcmpgtw     xmm11,xmm0
+    pand        xmm12,xmm1
+    movdqa      xmm1,[rsp+20h]
+    pand        xmm12,xmm11
+    pand        xmm12,xmm7
+    pand        xmm14,xmm12
+    paddw       xmm3,xmm14
+    psubw       xmm2,xmm14
+    packuswb    xmm13,xmm3
+    packuswb    xmm15,xmm2
+    punpcklbw   xmm4,xmm13
+    punpckhbw   xmm5,xmm13
+    movdqa      xmm0,xmm15
+    punpcklbw   xmm0,xmm1
+    punpckhbw   xmm15,xmm1
+    movdqa      xmm3,xmm4
+    punpcklwd   xmm3,xmm0
+    punpckhwd   xmm4,xmm0
+    movdqa      xmm0,xmm5
+    movdqa      xmm2,xmm3
+    movdqa      xmm1,xmm4
+    punpcklwd   xmm0,xmm15
+    punpckhwd   xmm5,xmm15
+    punpckldq   xmm2,xmm0
+    punpckhdq   xmm3,xmm0
+    punpckldq   xmm1,xmm5
+    movdqa      xmm0,xmm2
+    punpcklqdq  xmm0,xmm1
+    punpckhdq   xmm4,xmm5
+    punpckhqdq  xmm2,xmm1
+    movdqa      [rsp+40h],xmm0
+    movdqa      xmm0,xmm3
+    movdqa      [rsp+90h],xmm2
+    mov         eax,[rsp+40h]
+    mov         [rdi-2],eax
+    mov         eax, [rsp+90h]
+    punpcklqdq  xmm0,xmm4
+    punpckhqdq  xmm3,xmm4
+    mov         [rsi+rdi-2],eax
+    movdqa      [rsp+50h],xmm0
+    mov         eax,[rsp+50h]
+    movdqa      [rsp+0A0h],xmm3
+    mov         [rdi+rsi*2-2],eax
+    mov         eax,[rsp+0A0h]
+    mov         [r10+rdi-2],eax
+    mov         eax,[rsp+48h]
+    mov         [rbx],eax
+    mov         eax,[rsp+98h]
+    mov         [rsi+rbx],eax
+    mov         eax,[rsp+58h]
+    mov         [rbx+rsi*2],eax
+    mov         eax, [rsp+0A8h]
+    mov         [r10+rbx],eax
+    mov         eax, [rsp+44h]
+    mov         [r12-2],eax
+    mov         eax,[rsp+94h]
+    mov         [rsi+r12-2],eax
+    mov         eax,[rsp+54h]
+    mov         [r12+rsi*2-2],eax
+    mov         eax, [rsp+0A4h]
+    mov         [r10+r12-2],eax
+    mov         eax,[rsp+4Ch]
+    mov         [rbp],eax
+    mov         eax,[rsp+9Ch]
+    mov         [rsi+rbp],eax
+    mov         eax, [rsp+5Ch]
+    mov         [rbp+rsi*2],eax
+    mov         eax,[rsp+0ACh]
+    mov         [r10+rbp],eax
+    lea         r11,[rsp+170h]
+    mov         rsp,r11
+    pop         r14
+    pop         r13
+    pop         r12
+    pop         rbp
+    pop         rbx
+    ret
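DeblockChromaLt4H_ssse3 above follows the same transpose pattern but applies the bS<4 filter: the psllw/paddw/psraw-by-3 sequence forms the delta, pmaxsw/pminsw clamp it to the per-edge bound read from pTC, and packuswb performs the final 0..255 clamp. A scalar sketch under the assumption that pTC already carries the chroma clipping bound (tC0+1 in spec terms, with 0 meaning the edge is skipped); helper names are illustrative:

    #include <stdint.h>

    static int wels_clip3(int iLow, int iHigh, int iV) {
        return iV < iLow ? iLow : (iV > iHigh ? iHigh : iV);
    }

    /* Illustrative scalar form of the bS<4 chroma delta filter vectorized
       above: a clipped delta is added to p0 and subtracted from q0. */
    static void wels_chroma_lt4_sample(uint8_t p1, uint8_t *p0,
                                       uint8_t *q0, uint8_t q1, int iTc) {
        int iDelta = wels_clip3(-iTc, iTc,
                                ((*q0 - *p0) * 4 + (p1 - q1) + 4) >> 3);
        *p0 = (uint8_t)wels_clip3(0, 255, *p0 + iDelta);
        *q0 = (uint8_t)wels_clip3(0, 255, *q0 - iDelta);
    }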
 
 
 
@@ -3233,166 +3233,166 @@
 ;                             int32_t iAlpha, int32_t iBeta)
 ;********************************************************************************
 WELS_EXTERN   DeblockChromaEq4V_ssse3
-  push        ebp
-  mov         ebp,esp
-  and         esp,0FFFFFFF0h
-  sub         esp,68h
-  mov         edx,[ebp+10h]      ;  iStride
-  mov         eax,[ebp+8]        ;  pPixCb
-  mov         ecx,[ebp+0Ch]      ;  pPixCr
-  movq        xmm4,[ecx]
-  movq        xmm5,[edx+ecx]
-  push        esi
-  push        edi
-  lea         esi,[edx+edx]
-  mov         edi,eax
-  sub         edi,esi
-  movq        xmm1,[edi]
-  mov         edi,ecx
-  sub         edi,esi
-  movq        xmm2,[edi]
-  punpcklqdq  xmm1,xmm2
-  mov         esi,eax
-  sub         esi,edx
-  movq        xmm2,[esi]
-  mov         edi,ecx
-  sub         edi,edx
-  movq        xmm3,[edi]
-  punpcklqdq  xmm2,xmm3
-  movq        xmm3,[eax]
-  punpcklqdq  xmm3,xmm4
-  movq        xmm4,[edx+eax]
-  mov       edx, [ebp + 14h]
-  punpcklqdq  xmm4,xmm5
-  movd        xmm5,edx
-  mov       edx, [ebp + 18h]
-  pxor        xmm0,xmm0
-  movdqa      xmm6,xmm5
-  punpcklwd   xmm6,xmm5
-  pshufd      xmm5,xmm6,0
-  movd        xmm6,edx
-  movdqa      xmm7,xmm6
-  punpcklwd   xmm7,xmm6
-  pshufd      xmm6,xmm7,0
-  movdqa      xmm7,xmm1
-  punpckhbw   xmm1,xmm0
-  punpcklbw   xmm7,xmm0
-  movdqa      [esp+40h],xmm1
-  movdqa      [esp+60h],xmm7
-  movdqa      xmm7,xmm2
-  punpcklbw   xmm7,xmm0
-  movdqa      [esp+10h],xmm7
-  movdqa      xmm7,xmm3
-  punpcklbw   xmm7,xmm0
-  punpckhbw   xmm3,xmm0
-  movdqa      [esp+50h],xmm7
-  movdqa      xmm7,xmm4
-  punpckhbw   xmm4,xmm0
-  punpckhbw   xmm2,xmm0
-  punpcklbw   xmm7,xmm0
-  movdqa      [esp+30h],xmm3
-  movdqa      xmm3,[esp+10h]
-  movdqa      xmm1,xmm3
-  psubw       xmm1,[esp+50h]
-  pabsw       xmm1,xmm1
-  movdqa      [esp+20h],xmm4
-  movdqa      xmm0,xmm5
-  pcmpgtw     xmm0,xmm1
-  movdqa      xmm1,[esp+60h]
-  psubw       xmm1,xmm3
-  pabsw       xmm1,xmm1
-  movdqa      xmm4,xmm6
-  pcmpgtw     xmm4,xmm1
-  pand        xmm0,xmm4
-  movdqa      xmm1,xmm7
-  psubw       xmm1,[esp+50h]
-  pabsw       xmm1,xmm1
-  movdqa      xmm4,xmm6
-  pcmpgtw     xmm4,xmm1
-  movdqa      xmm1,xmm2
-  psubw       xmm1,[esp+30h]
-  pabsw       xmm1,xmm1
-  pcmpgtw     xmm5,xmm1
-  movdqa      xmm1,[esp+40h]
-  pand        xmm0,xmm4
-  psubw       xmm1,xmm2
-  pabsw       xmm1,xmm1
-  movdqa      xmm4,xmm6
-  pcmpgtw     xmm4,xmm1
-  movdqa      xmm1,[esp+20h]
-  psubw       xmm1,[esp+30h]
-  pand        xmm5,xmm4
-  pabsw       xmm1,xmm1
-  pcmpgtw     xmm6,xmm1
-  pand        xmm5,xmm6
-  mov         edx,2
-  movsx       edx,dx
-  movd        xmm1,edx
-  movdqa      xmm4,xmm1
-  punpcklwd   xmm4,xmm1
-  pshufd      xmm1,xmm4,0
-  movdqa      xmm4,[esp+60h]
-  movdqa      xmm6,xmm4
-  paddw       xmm6,xmm4
-  paddw       xmm6,xmm3
-  paddw       xmm6,xmm7
-  movdqa      [esp+10h],xmm1
-  paddw       xmm6,[esp+10h]
-  psraw       xmm6,2
-  movdqa      xmm4,xmm0
-  pandn       xmm4,xmm3
-  movdqa      xmm3,[esp+40h]
-  movdqa      xmm1,xmm0
-  pand        xmm1,xmm6
-  por         xmm1,xmm4
-  movdqa      xmm6,xmm3
-  paddw       xmm6,xmm3
-  movdqa      xmm3,[esp+10h]
-  paddw       xmm6,xmm2
-  paddw       xmm6,[esp+20h]
-  paddw       xmm6,xmm3
-  psraw       xmm6,2
-  movdqa      xmm4,xmm5
-  pand        xmm4,xmm6
-  movdqa      xmm6,xmm5
-  pandn       xmm6,xmm2
-  por         xmm4,xmm6
-  packuswb    xmm1,xmm4
-  movdqa      xmm4,[esp+50h]
-  movdqa      xmm6,xmm7
-  paddw       xmm6,xmm7
-  paddw       xmm6,xmm4
-  paddw       xmm6,[esp+60h]
-  paddw       xmm6,xmm3
-  psraw       xmm6,2
-  movdqa      xmm2,xmm0
-  pand        xmm2,xmm6
-  pandn       xmm0,xmm4
-  por         xmm2,xmm0
-  movdqa      xmm0,[esp+20h]
-  movdqa      xmm6,xmm0
-  paddw       xmm6,xmm0
-  movdqa      xmm0,[esp+30h]
-  paddw       xmm6,xmm0
-  paddw       xmm6,[esp+40h]
-  movdqa      xmm4,xmm5
-  paddw       xmm6,xmm3
-  movq        [esi],xmm1
-  psraw       xmm6,2
-  pand        xmm4,xmm6
-  pandn       xmm5,xmm0
-  por         xmm4,xmm5
-  packuswb    xmm2,xmm4
-  movq        [eax],xmm2
-  psrldq      xmm1,8
-  movq        [edi],xmm1
-  pop         edi
-  psrldq      xmm2,8
-  movq        [ecx],xmm2
-  pop         esi
-  mov         esp,ebp
-  pop         ebp
-  ret
+    push        ebp
+    mov         ebp,esp
+    and         esp,0FFFFFFF0h
+    sub         esp,68h
+    mov         edx,[ebp+10h]      ;  iStride
+    mov         eax,[ebp+8]        ;  pPixCb
+    mov         ecx,[ebp+0Ch]      ;  pPixCr
+    movq        xmm4,[ecx]
+    movq        xmm5,[edx+ecx]
+    push        esi
+    push        edi
+    lea         esi,[edx+edx]
+    mov         edi,eax
+    sub         edi,esi
+    movq        xmm1,[edi]
+    mov         edi,ecx
+    sub         edi,esi
+    movq        xmm2,[edi]
+    punpcklqdq  xmm1,xmm2
+    mov         esi,eax
+    sub         esi,edx
+    movq        xmm2,[esi]
+    mov         edi,ecx
+    sub         edi,edx
+    movq        xmm3,[edi]
+    punpcklqdq  xmm2,xmm3
+    movq        xmm3,[eax]
+    punpcklqdq  xmm3,xmm4
+    movq        xmm4,[edx+eax]
+    mov       edx, [ebp + 14h]
+    punpcklqdq  xmm4,xmm5
+    movd        xmm5,edx
+    mov       edx, [ebp + 18h]
+    pxor        xmm0,xmm0
+    movdqa      xmm6,xmm5
+    punpcklwd   xmm6,xmm5
+    pshufd      xmm5,xmm6,0
+    movd        xmm6,edx
+    movdqa      xmm7,xmm6
+    punpcklwd   xmm7,xmm6
+    pshufd      xmm6,xmm7,0
+    movdqa      xmm7,xmm1
+    punpckhbw   xmm1,xmm0
+    punpcklbw   xmm7,xmm0
+    movdqa      [esp+40h],xmm1
+    movdqa      [esp+60h],xmm7
+    movdqa      xmm7,xmm2
+    punpcklbw   xmm7,xmm0
+    movdqa      [esp+10h],xmm7
+    movdqa      xmm7,xmm3
+    punpcklbw   xmm7,xmm0
+    punpckhbw   xmm3,xmm0
+    movdqa      [esp+50h],xmm7
+    movdqa      xmm7,xmm4
+    punpckhbw   xmm4,xmm0
+    punpckhbw   xmm2,xmm0
+    punpcklbw   xmm7,xmm0
+    movdqa      [esp+30h],xmm3
+    movdqa      xmm3,[esp+10h]
+    movdqa      xmm1,xmm3
+    psubw       xmm1,[esp+50h]
+    pabsw       xmm1,xmm1
+    movdqa      [esp+20h],xmm4
+    movdqa      xmm0,xmm5
+    pcmpgtw     xmm0,xmm1
+    movdqa      xmm1,[esp+60h]
+    psubw       xmm1,xmm3
+    pabsw       xmm1,xmm1
+    movdqa      xmm4,xmm6
+    pcmpgtw     xmm4,xmm1
+    pand        xmm0,xmm4
+    movdqa      xmm1,xmm7
+    psubw       xmm1,[esp+50h]
+    pabsw       xmm1,xmm1
+    movdqa      xmm4,xmm6
+    pcmpgtw     xmm4,xmm1
+    movdqa      xmm1,xmm2
+    psubw       xmm1,[esp+30h]
+    pabsw       xmm1,xmm1
+    pcmpgtw     xmm5,xmm1
+    movdqa      xmm1,[esp+40h]
+    pand        xmm0,xmm4
+    psubw       xmm1,xmm2
+    pabsw       xmm1,xmm1
+    movdqa      xmm4,xmm6
+    pcmpgtw     xmm4,xmm1
+    movdqa      xmm1,[esp+20h]
+    psubw       xmm1,[esp+30h]
+    pand        xmm5,xmm4
+    pabsw       xmm1,xmm1
+    pcmpgtw     xmm6,xmm1
+    pand        xmm5,xmm6
+    mov         edx,2
+    movsx       edx,dx
+    movd        xmm1,edx
+    movdqa      xmm4,xmm1
+    punpcklwd   xmm4,xmm1
+    pshufd      xmm1,xmm4,0
+    movdqa      xmm4,[esp+60h]
+    movdqa      xmm6,xmm4
+    paddw       xmm6,xmm4
+    paddw       xmm6,xmm3
+    paddw       xmm6,xmm7
+    movdqa      [esp+10h],xmm1
+    paddw       xmm6,[esp+10h]
+    psraw       xmm6,2
+    movdqa      xmm4,xmm0
+    pandn       xmm4,xmm3
+    movdqa      xmm3,[esp+40h]
+    movdqa      xmm1,xmm0
+    pand        xmm1,xmm6
+    por         xmm1,xmm4
+    movdqa      xmm6,xmm3
+    paddw       xmm6,xmm3
+    movdqa      xmm3,[esp+10h]
+    paddw       xmm6,xmm2
+    paddw       xmm6,[esp+20h]
+    paddw       xmm6,xmm3
+    psraw       xmm6,2
+    movdqa      xmm4,xmm5
+    pand        xmm4,xmm6
+    movdqa      xmm6,xmm5
+    pandn       xmm6,xmm2
+    por         xmm4,xmm6
+    packuswb    xmm1,xmm4
+    movdqa      xmm4,[esp+50h]
+    movdqa      xmm6,xmm7
+    paddw       xmm6,xmm7
+    paddw       xmm6,xmm4
+    paddw       xmm6,[esp+60h]
+    paddw       xmm6,xmm3
+    psraw       xmm6,2
+    movdqa      xmm2,xmm0
+    pand        xmm2,xmm6
+    pandn       xmm0,xmm4
+    por         xmm2,xmm0
+    movdqa      xmm0,[esp+20h]
+    movdqa      xmm6,xmm0
+    paddw       xmm6,xmm0
+    movdqa      xmm0,[esp+30h]
+    paddw       xmm6,xmm0
+    paddw       xmm6,[esp+40h]
+    movdqa      xmm4,xmm5
+    paddw       xmm6,xmm3
+    movq        [esi],xmm1
+    psraw       xmm6,2
+    pand        xmm4,xmm6
+    pandn       xmm5,xmm0
+    por         xmm4,xmm5
+    packuswb    xmm2,xmm4
+    movq        [eax],xmm2
+    psrldq      xmm1,8
+    movq        [edi],xmm1
+    pop         edi
+    psrldq      xmm2,8
+    movq        [ecx],xmm2
+    pop         esi
+    mov         esp,ebp
+    pop         ebp
+    ret
 
 ;******************************************************************************
 ; void DeblockChromaLt4V_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
@@ -3400,200 +3400,200 @@
 ;*******************************************************************************
 
 WELS_EXTERN  DeblockChromaLt4V_ssse3
-  push        ebp
-  mov         ebp,esp
-  and         esp,0FFFFFFF0h
-  sub         esp,0E4h
-  push        ebx
-  push        esi
-  mov         esi, [ebp+1Ch]      ;  pTC
-  movsx       ebx, byte [esi+2]
-  push        edi
-  movsx       di,byte [esi+3]
-  mov         word [esp+0Ch],bx
-  movsx       bx,byte  [esi+1]
-  movsx       esi,byte  [esi]
-  mov         word  [esp+0Eh],si
-  movzx       esi,di
-  movd        xmm1,esi
-  movzx       esi,di
-  movd        xmm2,esi
-  mov         si,word  [esp+0Ch]
-  mov         edx, [ebp + 10h]
-  mov         eax, [ebp + 08h]
-  movzx       edi,si
-  movzx       esi,si
-  mov         ecx, [ebp + 0Ch]
-  movd        xmm4,esi
-  movzx       esi,bx
-  movd        xmm5,esi
-  movd        xmm3,edi
-  movzx       esi,bx
-  movd        xmm6,esi
-  mov         si,word [esp+0Eh]
-  movzx       edi,si
-  movzx       esi,si
-  punpcklwd   xmm6,xmm2
-  pxor        xmm0,xmm0
-  movdqa      [esp+40h],xmm0
-  movd        xmm7,edi
-  movd        xmm0,esi
-  lea         esi,[edx+edx]
-  mov         edi,eax
-  sub         edi,esi
-  punpcklwd   xmm5,xmm1
-  movdqa      xmm1,[esp+40h]
-  punpcklwd   xmm0,xmm4
-  movq        xmm4,[edx+ecx]
-  punpcklwd   xmm7,xmm3
-  movq        xmm3,[eax]
-  punpcklwd   xmm0,xmm6
-  movq        xmm6,[edi]
-  punpcklwd   xmm7,xmm5
-  punpcklwd   xmm0,xmm7
-  mov         edi,ecx
-  sub         edi,esi
-  movdqa      xmm2,xmm1
-  psubw       xmm2,xmm0
-  movdqa      [esp+60h],xmm2
-  movq        xmm2, [edi]
-  punpcklqdq  xmm6,xmm2
-  mov         esi,eax
-  sub         esi,edx
-  movq        xmm7,[esi]
-  mov         edi,ecx
-  sub         edi,edx
-  movq        xmm2,[edi]
-  punpcklqdq  xmm7,xmm2
-  movq        xmm2,[ecx]
-  punpcklqdq  xmm3,xmm2
-  movq        xmm2,[edx+eax]
-  movsx       edx,word [ebp + 14h]
-  punpcklqdq  xmm2,xmm4
-  movdqa      [esp+0E0h],xmm2
-  movd        xmm2,edx
-  movsx       edx,word [ebp + 18h]
-  movdqa      xmm4,xmm2
-  punpcklwd   xmm4,xmm2
-  movd        xmm2,edx
-  movdqa      xmm5,xmm2
-  punpcklwd   xmm5,xmm2
-  pshufd      xmm2,xmm5,0
-  movdqa      [esp+50h],xmm2
-  movdqa      xmm2,xmm6
-  punpcklbw   xmm2,xmm1
-  movdqa      [esp+0D0h],xmm3
-  pshufd      xmm4,xmm4,0
-  movdqa      [esp+30h],xmm2
-  punpckhbw   xmm6,xmm1
-  movdqa      [esp+80h],xmm6
-  movdqa      xmm6,[esp+0D0h]
-  punpckhbw   xmm6,xmm1
-  movdqa      [esp+70h],xmm6
-  movdqa      xmm6, [esp+0E0h]
-  punpckhbw   xmm6,xmm1
-  movdqa     [esp+90h],xmm6
-  movdqa      xmm5, [esp+0E0h]
-  movdqa      xmm2,xmm7
-  punpckhbw   xmm7,xmm1
-  punpcklbw   xmm5,xmm1
-  movdqa       [esp+0A0h],xmm7
-  punpcklbw   xmm3,xmm1
-  mov         edx,4
-  punpcklbw   xmm2,xmm1
-  movsx       edx,dx
-  movd        xmm6,edx
-  movdqa      xmm7,xmm6
-  punpcklwd   xmm7,xmm6
-  pshufd      xmm6,xmm7,0
-  movdqa      xmm7,[esp+30h]
-  movdqa      [esp+20h],xmm6
-  psubw       xmm7,xmm5
-  movdqa      xmm6,xmm0
-  pcmpgtw     xmm6,xmm1
-  movdqa      xmm1,[esp+60h]
-  movdqa      [esp+40h],xmm6
-  movdqa      xmm6,xmm3
-  psubw       xmm6,xmm2
-  psllw       xmm6,2
-  paddw       xmm6,xmm7
-  paddw       xmm6, [esp+20h]
-  movdqa      xmm7, [esp+50h]
-  psraw       xmm6,3
-  pmaxsw      xmm1,xmm6
-  movdqa      [esp+10h],xmm0
-  movdqa      xmm6, [esp+10h]
-  pminsw      xmm6,xmm1
-  movdqa      [esp+10h],xmm6
-  movdqa      xmm1,xmm2
-  psubw       xmm1,xmm3
-  pabsw       xmm1,xmm1
-  movdqa      xmm6,xmm4
-  pcmpgtw     xmm6,xmm1
-  movdqa      xmm1, [esp+30h]
-  psubw       xmm1,xmm2
-  pabsw       xmm1,xmm1
-  pcmpgtw     xmm7,xmm1
-  movdqa      xmm1,[esp+50h]
-  pand        xmm6,xmm7
-  movdqa      xmm7,[esp+50h]
-  psubw       xmm5,xmm3
-  pabsw       xmm5,xmm5
-  pcmpgtw     xmm1,xmm5
-  movdqa      xmm5,[esp+80h]
-  psubw       xmm5,[esp+90h]
-  pand        xmm6,xmm1
-  pand        xmm6,[esp+40h]
-  movdqa      xmm1,[esp+10h]
-  pand        xmm1,xmm6
-  movdqa      xmm6,[esp+70h]
-  movdqa      [esp+30h],xmm1
-  movdqa      xmm1,[esp+0A0h]
-  psubw       xmm6,xmm1
-  psllw       xmm6,2
-  paddw       xmm6,xmm5
-  paddw       xmm6,[esp+20h]
-  movdqa      xmm5,[esp+60h]
-  psraw       xmm6,3
-  pmaxsw      xmm5,xmm6
-  pminsw      xmm0,xmm5
-  movdqa      xmm5,[esp+70h]
-  movdqa      xmm6,xmm1
-  psubw       xmm6,xmm5
-  pabsw       xmm6,xmm6
-  pcmpgtw     xmm4,xmm6
-  movdqa      xmm6,[esp+80h]
-  psubw       xmm6,xmm1
-  pabsw       xmm6,xmm6
-  pcmpgtw     xmm7,xmm6
-  movdqa      xmm6,[esp+90h]
-  pand        xmm4,xmm7
-  movdqa      xmm7,[esp+50h]
-  psubw       xmm6,xmm5
-  pabsw       xmm6,xmm6
-  pcmpgtw     xmm7,xmm6
-  pand        xmm4,xmm7
-  pand        xmm4,[esp+40h]
-  pand        xmm0,xmm4
-  movdqa      xmm4,[esp+30h]
-  paddw       xmm2,xmm4
-  paddw       xmm1,xmm0
-  packuswb    xmm2,xmm1
-  movq        [esi],xmm2
-  psubw       xmm3,xmm4
-  psubw       xmm5,xmm0
-  packuswb    xmm3,xmm5
-  movq        [eax],xmm3
-  psrldq      xmm2,8
-  movq        [edi],xmm2
-  pop         edi
-  pop         esi
-  psrldq      xmm3,8
-  movq        [ecx],xmm3
-  pop         ebx
-  mov         esp,ebp
-  pop         ebp
-  ret
+    push        ebp
+    mov         ebp,esp
+    and         esp,0FFFFFFF0h
+    sub         esp,0E4h
+    push        ebx
+    push        esi
+    mov         esi, [ebp+1Ch]      ;  pTC
+    movsx       ebx, byte [esi+2]
+    push        edi
+    movsx       di,byte [esi+3]
+    mov         word [esp+0Ch],bx
+    movsx       bx,byte  [esi+1]
+    movsx       esi,byte  [esi]
+    mov         word  [esp+0Eh],si
+    movzx       esi,di
+    movd        xmm1,esi
+    movzx       esi,di
+    movd        xmm2,esi
+    mov         si,word  [esp+0Ch]
+    mov         edx, [ebp + 10h]
+    mov         eax, [ebp + 08h]
+    movzx       edi,si
+    movzx       esi,si
+    mov         ecx, [ebp + 0Ch]
+    movd        xmm4,esi
+    movzx       esi,bx
+    movd        xmm5,esi
+    movd        xmm3,edi
+    movzx       esi,bx
+    movd        xmm6,esi
+    mov         si,word [esp+0Eh]
+    movzx       edi,si
+    movzx       esi,si
+    punpcklwd   xmm6,xmm2
+    pxor        xmm0,xmm0
+    movdqa      [esp+40h],xmm0
+    movd        xmm7,edi
+    movd        xmm0,esi
+    lea         esi,[edx+edx]
+    mov         edi,eax
+    sub         edi,esi
+    punpcklwd   xmm5,xmm1
+    movdqa      xmm1,[esp+40h]
+    punpcklwd   xmm0,xmm4
+    movq        xmm4,[edx+ecx]
+    punpcklwd   xmm7,xmm3
+    movq        xmm3,[eax]
+    punpcklwd   xmm0,xmm6
+    movq        xmm6,[edi]
+    punpcklwd   xmm7,xmm5
+    punpcklwd   xmm0,xmm7
+    mov         edi,ecx
+    sub         edi,esi
+    movdqa      xmm2,xmm1
+    psubw       xmm2,xmm0
+    movdqa      [esp+60h],xmm2
+    movq        xmm2, [edi]
+    punpcklqdq  xmm6,xmm2
+    mov         esi,eax
+    sub         esi,edx
+    movq        xmm7,[esi]
+    mov         edi,ecx
+    sub         edi,edx
+    movq        xmm2,[edi]
+    punpcklqdq  xmm7,xmm2
+    movq        xmm2,[ecx]
+    punpcklqdq  xmm3,xmm2
+    movq        xmm2,[edx+eax]
+    movsx       edx,word [ebp + 14h]
+    punpcklqdq  xmm2,xmm4
+    movdqa      [esp+0E0h],xmm2
+    movd        xmm2,edx
+    movsx       edx,word [ebp + 18h]
+    movdqa      xmm4,xmm2
+    punpcklwd   xmm4,xmm2
+    movd        xmm2,edx
+    movdqa      xmm5,xmm2
+    punpcklwd   xmm5,xmm2
+    pshufd      xmm2,xmm5,0
+    movdqa      [esp+50h],xmm2
+    movdqa      xmm2,xmm6
+    punpcklbw   xmm2,xmm1
+    movdqa      [esp+0D0h],xmm3
+    pshufd      xmm4,xmm4,0
+    movdqa      [esp+30h],xmm2
+    punpckhbw   xmm6,xmm1
+    movdqa      [esp+80h],xmm6
+    movdqa      xmm6,[esp+0D0h]
+    punpckhbw   xmm6,xmm1
+    movdqa      [esp+70h],xmm6
+    movdqa      xmm6, [esp+0E0h]
+    punpckhbw   xmm6,xmm1
+    movdqa     [esp+90h],xmm6
+    movdqa      xmm5, [esp+0E0h]
+    movdqa      xmm2,xmm7
+    punpckhbw   xmm7,xmm1
+    punpcklbw   xmm5,xmm1
+    movdqa       [esp+0A0h],xmm7
+    punpcklbw   xmm3,xmm1
+    mov         edx,4
+    punpcklbw   xmm2,xmm1
+    movsx       edx,dx
+    movd        xmm6,edx
+    movdqa      xmm7,xmm6
+    punpcklwd   xmm7,xmm6
+    pshufd      xmm6,xmm7,0
+    movdqa      xmm7,[esp+30h]
+    movdqa      [esp+20h],xmm6
+    psubw       xmm7,xmm5
+    movdqa      xmm6,xmm0
+    pcmpgtw     xmm6,xmm1
+    movdqa      xmm1,[esp+60h]
+    movdqa      [esp+40h],xmm6
+    movdqa      xmm6,xmm3
+    psubw       xmm6,xmm2
+    psllw       xmm6,2
+    paddw       xmm6,xmm7
+    paddw       xmm6, [esp+20h]
+    movdqa      xmm7, [esp+50h]
+    psraw       xmm6,3
+    pmaxsw      xmm1,xmm6
+    movdqa      [esp+10h],xmm0
+    movdqa      xmm6, [esp+10h]
+    pminsw      xmm6,xmm1
+    movdqa      [esp+10h],xmm6
+    movdqa      xmm1,xmm2
+    psubw       xmm1,xmm3
+    pabsw       xmm1,xmm1
+    movdqa      xmm6,xmm4
+    pcmpgtw     xmm6,xmm1
+    movdqa      xmm1, [esp+30h]
+    psubw       xmm1,xmm2
+    pabsw       xmm1,xmm1
+    pcmpgtw     xmm7,xmm1
+    movdqa      xmm1,[esp+50h]
+    pand        xmm6,xmm7
+    movdqa      xmm7,[esp+50h]
+    psubw       xmm5,xmm3
+    pabsw       xmm5,xmm5
+    pcmpgtw     xmm1,xmm5
+    movdqa      xmm5,[esp+80h]
+    psubw       xmm5,[esp+90h]
+    pand        xmm6,xmm1
+    pand        xmm6,[esp+40h]
+    movdqa      xmm1,[esp+10h]
+    pand        xmm1,xmm6
+    movdqa      xmm6,[esp+70h]
+    movdqa      [esp+30h],xmm1
+    movdqa      xmm1,[esp+0A0h]
+    psubw       xmm6,xmm1
+    psllw       xmm6,2
+    paddw       xmm6,xmm5
+    paddw       xmm6,[esp+20h]
+    movdqa      xmm5,[esp+60h]
+    psraw       xmm6,3
+    pmaxsw      xmm5,xmm6
+    pminsw      xmm0,xmm5
+    movdqa      xmm5,[esp+70h]
+    movdqa      xmm6,xmm1
+    psubw       xmm6,xmm5
+    pabsw       xmm6,xmm6
+    pcmpgtw     xmm4,xmm6
+    movdqa      xmm6,[esp+80h]
+    psubw       xmm6,xmm1
+    pabsw       xmm6,xmm6
+    pcmpgtw     xmm7,xmm6
+    movdqa      xmm6,[esp+90h]
+    pand        xmm4,xmm7
+    movdqa      xmm7,[esp+50h]
+    psubw       xmm6,xmm5
+    pabsw       xmm6,xmm6
+    pcmpgtw     xmm7,xmm6
+    pand        xmm4,xmm7
+    pand        xmm4,[esp+40h]
+    pand        xmm0,xmm4
+    movdqa      xmm4,[esp+30h]
+    paddw       xmm2,xmm4
+    paddw       xmm1,xmm0
+    packuswb    xmm2,xmm1
+    movq        [esi],xmm2
+    psubw       xmm3,xmm4
+    psubw       xmm5,xmm0
+    packuswb    xmm3,xmm5
+    movq        [eax],xmm3
+    psrldq      xmm2,8
+    movq        [edi],xmm2
+    pop         edi
+    pop         esi
+    psrldq      xmm3,8
+    movq        [ecx],xmm3
+    pop         ebx
+    mov         esp,ebp
+    pop         ebp
+    ret
 
 ;***************************************************************************
 ;  void DeblockChromaEq4H_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
@@ -3601,280 +3601,280 @@
 ;***************************************************************************
 
 WELS_EXTERN     DeblockChromaEq4H_ssse3
-  push        ebp
-  mov         ebp,esp
-  and         esp,0FFFFFFF0h
-  sub         esp,0C8h
-  mov         ecx,dword [ebp+8]
-  mov         edx,dword [ebp+0Ch]
-  mov         eax,dword [ebp+10h]
-  sub         ecx,2
-  sub         edx,2
-  push        esi
-  lea         esi,[eax+eax*2]
-  mov         dword [esp+18h],ecx
-  mov         dword [esp+4],edx
-  lea         ecx,[ecx+eax*4]
-  lea         edx,[edx+eax*4]
-  lea         eax,[esp+7Ch]
-  push        edi
-  mov         dword [esp+14h],esi
-  mov         dword [esp+18h],ecx
-  mov         dword [esp+0Ch],edx
-  mov         dword [esp+10h],eax
-  mov         esi,dword [esp+1Ch]
-  mov         ecx,dword [ebp+10h]
-  mov         edx,dword [esp+14h]
-  movd        xmm0,dword [esi]
-  movd        xmm1,dword [esi+ecx]
-  movd        xmm2,dword [esi+ecx*2]
-  movd        xmm3,dword [esi+edx]
-  mov         esi,dword  [esp+8]
-  movd        xmm4,dword [esi]
-  movd        xmm5,dword [esi+ecx]
-  movd        xmm6,dword [esi+ecx*2]
-  movd        xmm7,dword [esi+edx]
-  punpckldq   xmm0,xmm4
-  punpckldq   xmm1,xmm5
-  punpckldq   xmm2,xmm6
-  punpckldq   xmm3,xmm7
-  mov         esi,dword [esp+18h]
-  mov         edi,dword [esp+0Ch]
-  movd        xmm4,dword [esi]
-  movd        xmm5,dword [edi]
-  punpckldq   xmm4,xmm5
-  punpcklqdq  xmm0,xmm4
-  movd        xmm4,dword [esi+ecx]
-  movd        xmm5,dword [edi+ecx]
-  punpckldq   xmm4,xmm5
-  punpcklqdq  xmm1,xmm4
-  movd        xmm4,dword [esi+ecx*2]
-  movd        xmm5,dword [edi+ecx*2]
-  punpckldq   xmm4,xmm5
-  punpcklqdq  xmm2,xmm4
-  movd        xmm4,dword [esi+edx]
-  movd        xmm5,dword [edi+edx]
-  punpckldq   xmm4,xmm5
-  punpcklqdq  xmm3,xmm4
-  movdqa      xmm6,xmm0
-  punpcklbw   xmm0,xmm1
-  punpckhbw   xmm6,xmm1
-  movdqa      xmm7,xmm2
-  punpcklbw   xmm2,xmm3
-  punpckhbw   xmm7,xmm3
-  movdqa      xmm4,xmm0
-  movdqa      xmm5,xmm6
-  punpcklwd   xmm0,xmm2
-  punpckhwd   xmm4,xmm2
-  punpcklwd   xmm6,xmm7
-  punpckhwd   xmm5,xmm7
-  movdqa      xmm1,xmm0
-  movdqa      xmm2,xmm4
-  punpckldq   xmm0,xmm6
-  punpckhdq   xmm1,xmm6
-  punpckldq   xmm4,xmm5
-  punpckhdq   xmm2,xmm5
-  movdqa      xmm5,xmm0
-  movdqa      xmm6,xmm1
-  punpcklqdq  xmm0,xmm4
-  punpckhqdq  xmm5,xmm4
-  punpcklqdq  xmm1,xmm2
-  punpckhqdq  xmm6,xmm2
-  mov         edi,dword [esp+10h]
-  movdqa      [edi],xmm0
-  movdqa      [edi+10h],xmm5
-  movdqa      [edi+20h],xmm1
-  movdqa      [edi+30h],xmm6
-  movsx       ecx,word [ebp+14h]
-  movsx       edx,word [ebp+18h]
-  movdqa      xmm6,[esp+80h]
-  movdqa      xmm4,[esp+90h]
-  movdqa      xmm5,[esp+0A0h]
-  movdqa      xmm7,[esp+0B0h]
-  pxor        xmm0,xmm0
-  movd        xmm1,ecx
-  movdqa      xmm2,xmm1
-  punpcklwd   xmm2,xmm1
-  pshufd      xmm1,xmm2,0
-  movd        xmm2,edx
-  movdqa      xmm3,xmm2
-  punpcklwd   xmm3,xmm2
-  pshufd      xmm2,xmm3,0
-  movdqa      xmm3,xmm6
-  punpckhbw   xmm6,xmm0
-  movdqa      [esp+60h],xmm6
-  movdqa      xmm6,[esp+90h]
-  punpckhbw   xmm6,xmm0
-  movdqa      [esp+30h],xmm6
-  movdqa      xmm6,[esp+0A0h]
-  punpckhbw   xmm6,xmm0
-  movdqa      [esp+40h],xmm6
-  movdqa      xmm6,[esp+0B0h]
-  punpckhbw   xmm6,xmm0
-  movdqa      [esp+70h],xmm6
-  punpcklbw   xmm7,xmm0
-  punpcklbw   xmm4,xmm0
-  punpcklbw   xmm5,xmm0
-  punpcklbw   xmm3,xmm0
-  movdqa      [esp+50h],xmm7
-  movdqa      xmm6,xmm4
-  psubw       xmm6,xmm5
-  pabsw       xmm6,xmm6
-  movdqa      xmm0,xmm1
-  pcmpgtw     xmm0,xmm6
-  movdqa      xmm6,xmm3
-  psubw       xmm6,xmm4
-  pabsw       xmm6,xmm6
-  movdqa      xmm7,xmm2
-  pcmpgtw     xmm7,xmm6
-  movdqa      xmm6,[esp+50h]
-  psubw       xmm6,xmm5
-  pabsw       xmm6,xmm6
-  pand        xmm0,xmm7
-  movdqa      xmm7,xmm2
-  pcmpgtw     xmm7,xmm6
-  movdqa      xmm6,[esp+30h]
-  psubw       xmm6,[esp+40h]
-  pabsw       xmm6,xmm6
-  pcmpgtw     xmm1,xmm6
-  movdqa      xmm6,[esp+60h]
-  psubw       xmm6,[esp+30h]
-  pabsw       xmm6,xmm6
-  pand        xmm0,xmm7
-  movdqa      xmm7,xmm2
-  pcmpgtw     xmm7,xmm6
-  movdqa      xmm6,[esp+70h]
-  psubw       xmm6,[esp+40h]
-  pabsw       xmm6,xmm6
-  pand        xmm1,xmm7
-  pcmpgtw     xmm2,xmm6
-  pand        xmm1,xmm2
-  mov         eax,2
-  movsx       ecx,ax
-  movd        xmm2,ecx
-  movdqa      xmm6,xmm2
-  punpcklwd   xmm6,xmm2
-  pshufd      xmm2,xmm6,0
-  movdqa      [esp+20h],xmm2
-  movdqa      xmm2,xmm3
-  paddw       xmm2,xmm3
-  paddw       xmm2,xmm4
-  paddw       xmm2,[esp+50h]
-  paddw       xmm2,[esp+20h]
-  psraw       xmm2,2
-  movdqa      xmm6,xmm0
-  pand        xmm6,xmm2
-  movdqa      xmm2,xmm0
-  pandn       xmm2,xmm4
-  por         xmm6,xmm2
-  movdqa      xmm2,[esp+60h]
-  movdqa      xmm7,xmm2
-  paddw       xmm7,xmm2
-  paddw       xmm7,[esp+30h]
-  paddw       xmm7,[esp+70h]
-  paddw       xmm7,[esp+20h]
-  movdqa      xmm4,xmm1
-  movdqa      xmm2,xmm1
-  pandn       xmm2,[esp+30h]
-  psraw       xmm7,2
-  pand        xmm4,xmm7
-  por         xmm4,xmm2
-  movdqa      xmm2,[esp+50h]
-  packuswb    xmm6,xmm4
-  movdqa      [esp+90h],xmm6
-  movdqa      xmm6,xmm2
-  paddw       xmm6,xmm2
-  movdqa      xmm2,[esp+20h]
-  paddw       xmm6,xmm5
-  paddw       xmm6,xmm3
-  movdqa      xmm4,xmm0
-  pandn       xmm0,xmm5
-  paddw       xmm6,xmm2
-  psraw       xmm6,2
-  pand        xmm4,xmm6
-  por         xmm4,xmm0
-  movdqa      xmm0,[esp+70h]
-  movdqa      xmm5,xmm0
-  paddw       xmm5,xmm0
-  movdqa      xmm0,[esp+40h]
-  paddw       xmm5,xmm0
-  paddw       xmm5,[esp+60h]
-  movdqa      xmm3,xmm1
-  paddw       xmm5,xmm2
-  psraw       xmm5,2
-  pand        xmm3,xmm5
-  pandn       xmm1,xmm0
-  por         xmm3,xmm1
-  packuswb    xmm4,xmm3
-  movdqa      [esp+0A0h],xmm4
-  mov         esi,dword [esp+10h]
-  movdqa      xmm0,[esi]
-  movdqa      xmm1,[esi+10h]
-  movdqa      xmm2,[esi+20h]
-  movdqa      xmm3,[esi+30h]
-  movdqa      xmm6,xmm0
-  punpcklbw   xmm0,xmm1
-  punpckhbw   xmm6,xmm1
-  movdqa      xmm7,xmm2
-  punpcklbw   xmm2,xmm3
-  punpckhbw   xmm7,xmm3
-  movdqa      xmm4,xmm0
-  movdqa      xmm5,xmm6
-  punpcklwd   xmm0,xmm2
-  punpckhwd   xmm4,xmm2
-  punpcklwd   xmm6,xmm7
-  punpckhwd   xmm5,xmm7
-  movdqa      xmm1,xmm0
-  movdqa      xmm2,xmm4
-  punpckldq   xmm0,xmm6
-  punpckhdq   xmm1,xmm6
-  punpckldq   xmm4,xmm5
-  punpckhdq   xmm2,xmm5
-  movdqa      xmm5,xmm0
-  movdqa      xmm6,xmm1
-  punpcklqdq  xmm0,xmm4
-  punpckhqdq  xmm5,xmm4
-  punpcklqdq  xmm1,xmm2
-  punpckhqdq  xmm6,xmm2
-  mov         esi,dword [esp+1Ch]
-  mov         ecx,dword [ebp+10h]
-  mov         edx,dword [esp+14h]
-  mov         edi,dword [esp+8]
-  movd        dword [esi],xmm0
-  movd        dword [esi+ecx],xmm5
-  movd        dword [esi+ecx*2],xmm1
-  movd        dword [esi+edx],xmm6
-  psrldq      xmm0,4
-  psrldq      xmm5,4
-  psrldq      xmm1,4
-  psrldq      xmm6,4
-  mov         esi,dword [esp+18h]
-  movd        dword [edi],xmm0
-  movd        dword [edi+ecx],xmm5
-  movd        dword [edi+ecx*2],xmm1
-  movd        dword [edi+edx],xmm6
-  psrldq      xmm0,4
-  psrldq      xmm5,4
-  psrldq      xmm1,4
-  psrldq      xmm6,4
-  movd        dword [esi],xmm0
-  movd        dword [esi+ecx],xmm5
-  movd        dword [esi+ecx*2],xmm1
-  movd        dword [esi+edx],xmm6
-  psrldq      xmm0,4
-  psrldq      xmm5,4
-  psrldq      xmm1,4
-  psrldq      xmm6,4
-  mov         edi,dword [esp+0Ch]
-  movd        dword [edi],xmm0
-  movd        dword [edi+ecx],xmm5
-  movd        dword [edi+ecx*2],xmm1
-  movd        dword [edi+edx],xmm6
-  pop         edi
-  pop         esi
-  mov         esp,ebp
-  pop         ebp
-  ret
+    push        ebp
+    mov         ebp,esp
+    and         esp,0FFFFFFF0h
+    sub         esp,0C8h
+    mov         ecx,dword [ebp+8]
+    mov         edx,dword [ebp+0Ch]
+    mov         eax,dword [ebp+10h]
+    sub         ecx,2
+    sub         edx,2
+    push        esi
+    lea         esi,[eax+eax*2]
+    mov         dword [esp+18h],ecx
+    mov         dword [esp+4],edx
+    lea         ecx,[ecx+eax*4]
+    lea         edx,[edx+eax*4]
+    lea         eax,[esp+7Ch]
+    push        edi
+    mov         dword [esp+14h],esi
+    mov         dword [esp+18h],ecx
+    mov         dword [esp+0Ch],edx
+    mov         dword [esp+10h],eax
+    mov         esi,dword [esp+1Ch]
+    mov         ecx,dword [ebp+10h]
+    mov         edx,dword [esp+14h]
+    movd        xmm0,dword [esi]
+    movd        xmm1,dword [esi+ecx]
+    movd        xmm2,dword [esi+ecx*2]
+    movd        xmm3,dword [esi+edx]
+    mov         esi,dword  [esp+8]
+    movd        xmm4,dword [esi]
+    movd        xmm5,dword [esi+ecx]
+    movd        xmm6,dword [esi+ecx*2]
+    movd        xmm7,dword [esi+edx]
+    punpckldq   xmm0,xmm4
+    punpckldq   xmm1,xmm5
+    punpckldq   xmm2,xmm6
+    punpckldq   xmm3,xmm7
+    mov         esi,dword [esp+18h]
+    mov         edi,dword [esp+0Ch]
+    movd        xmm4,dword [esi]
+    movd        xmm5,dword [edi]
+    punpckldq   xmm4,xmm5
+    punpcklqdq  xmm0,xmm4
+    movd        xmm4,dword [esi+ecx]
+    movd        xmm5,dword [edi+ecx]
+    punpckldq   xmm4,xmm5
+    punpcklqdq  xmm1,xmm4
+    movd        xmm4,dword [esi+ecx*2]
+    movd        xmm5,dword [edi+ecx*2]
+    punpckldq   xmm4,xmm5
+    punpcklqdq  xmm2,xmm4
+    movd        xmm4,dword [esi+edx]
+    movd        xmm5,dword [edi+edx]
+    punpckldq   xmm4,xmm5
+    punpcklqdq  xmm3,xmm4
+    movdqa      xmm6,xmm0
+    punpcklbw   xmm0,xmm1
+    punpckhbw   xmm6,xmm1
+    movdqa      xmm7,xmm2
+    punpcklbw   xmm2,xmm3
+    punpckhbw   xmm7,xmm3
+    movdqa      xmm4,xmm0
+    movdqa      xmm5,xmm6
+    punpcklwd   xmm0,xmm2
+    punpckhwd   xmm4,xmm2
+    punpcklwd   xmm6,xmm7
+    punpckhwd   xmm5,xmm7
+    movdqa      xmm1,xmm0
+    movdqa      xmm2,xmm4
+    punpckldq   xmm0,xmm6
+    punpckhdq   xmm1,xmm6
+    punpckldq   xmm4,xmm5
+    punpckhdq   xmm2,xmm5
+    movdqa      xmm5,xmm0
+    movdqa      xmm6,xmm1
+    punpcklqdq  xmm0,xmm4
+    punpckhqdq  xmm5,xmm4
+    punpcklqdq  xmm1,xmm2
+    punpckhqdq  xmm6,xmm2
+    mov         edi,dword [esp+10h]
+    movdqa      [edi],xmm0
+    movdqa      [edi+10h],xmm5
+    movdqa      [edi+20h],xmm1
+    movdqa      [edi+30h],xmm6
+    movsx       ecx,word [ebp+14h]
+    movsx       edx,word [ebp+18h]
+    movdqa      xmm6,[esp+80h]
+    movdqa      xmm4,[esp+90h]
+    movdqa      xmm5,[esp+0A0h]
+    movdqa      xmm7,[esp+0B0h]
+    pxor        xmm0,xmm0
+    movd        xmm1,ecx
+    movdqa      xmm2,xmm1
+    punpcklwd   xmm2,xmm1
+    pshufd      xmm1,xmm2,0
+    movd        xmm2,edx
+    movdqa      xmm3,xmm2
+    punpcklwd   xmm3,xmm2
+    pshufd      xmm2,xmm3,0
+    movdqa      xmm3,xmm6
+    punpckhbw   xmm6,xmm0
+    movdqa      [esp+60h],xmm6
+    movdqa      xmm6,[esp+90h]
+    punpckhbw   xmm6,xmm0
+    movdqa      [esp+30h],xmm6
+    movdqa      xmm6,[esp+0A0h]
+    punpckhbw   xmm6,xmm0
+    movdqa      [esp+40h],xmm6
+    movdqa      xmm6,[esp+0B0h]
+    punpckhbw   xmm6,xmm0
+    movdqa      [esp+70h],xmm6
+    punpcklbw   xmm7,xmm0
+    punpcklbw   xmm4,xmm0
+    punpcklbw   xmm5,xmm0
+    punpcklbw   xmm3,xmm0
+    movdqa      [esp+50h],xmm7
+    movdqa      xmm6,xmm4
+    psubw       xmm6,xmm5
+    pabsw       xmm6,xmm6
+    movdqa      xmm0,xmm1
+    pcmpgtw     xmm0,xmm6
+    movdqa      xmm6,xmm3
+    psubw       xmm6,xmm4
+    pabsw       xmm6,xmm6
+    movdqa      xmm7,xmm2
+    pcmpgtw     xmm7,xmm6
+    movdqa      xmm6,[esp+50h]
+    psubw       xmm6,xmm5
+    pabsw       xmm6,xmm6
+    pand        xmm0,xmm7
+    movdqa      xmm7,xmm2
+    pcmpgtw     xmm7,xmm6
+    movdqa      xmm6,[esp+30h]
+    psubw       xmm6,[esp+40h]
+    pabsw       xmm6,xmm6
+    pcmpgtw     xmm1,xmm6
+    movdqa      xmm6,[esp+60h]
+    psubw       xmm6,[esp+30h]
+    pabsw       xmm6,xmm6
+    pand        xmm0,xmm7
+    movdqa      xmm7,xmm2
+    pcmpgtw     xmm7,xmm6
+    movdqa      xmm6,[esp+70h]
+    psubw       xmm6,[esp+40h]
+    pabsw       xmm6,xmm6
+    pand        xmm1,xmm7
+    pcmpgtw     xmm2,xmm6
+    pand        xmm1,xmm2
+    mov         eax,2
+    movsx       ecx,ax
+    movd        xmm2,ecx
+    movdqa      xmm6,xmm2
+    punpcklwd   xmm6,xmm2
+    pshufd      xmm2,xmm6,0
+    movdqa      [esp+20h],xmm2
+    movdqa      xmm2,xmm3
+    paddw       xmm2,xmm3
+    paddw       xmm2,xmm4
+    paddw       xmm2,[esp+50h]
+    paddw       xmm2,[esp+20h]
+    psraw       xmm2,2
+    movdqa      xmm6,xmm0
+    pand        xmm6,xmm2
+    movdqa      xmm2,xmm0
+    pandn       xmm2,xmm4
+    por         xmm6,xmm2
+    movdqa      xmm2,[esp+60h]
+    movdqa      xmm7,xmm2
+    paddw       xmm7,xmm2
+    paddw       xmm7,[esp+30h]
+    paddw       xmm7,[esp+70h]
+    paddw       xmm7,[esp+20h]
+    movdqa      xmm4,xmm1
+    movdqa      xmm2,xmm1
+    pandn       xmm2,[esp+30h]
+    psraw       xmm7,2
+    pand        xmm4,xmm7
+    por         xmm4,xmm2
+    movdqa      xmm2,[esp+50h]
+    packuswb    xmm6,xmm4
+    movdqa      [esp+90h],xmm6
+    movdqa      xmm6,xmm2
+    paddw       xmm6,xmm2
+    movdqa      xmm2,[esp+20h]
+    paddw       xmm6,xmm5
+    paddw       xmm6,xmm3
+    movdqa      xmm4,xmm0
+    pandn       xmm0,xmm5
+    paddw       xmm6,xmm2
+    psraw       xmm6,2
+    pand        xmm4,xmm6
+    por         xmm4,xmm0
+    movdqa      xmm0,[esp+70h]
+    movdqa      xmm5,xmm0
+    paddw       xmm5,xmm0
+    movdqa      xmm0,[esp+40h]
+    paddw       xmm5,xmm0
+    paddw       xmm5,[esp+60h]
+    movdqa      xmm3,xmm1
+    paddw       xmm5,xmm2
+    psraw       xmm5,2
+    pand        xmm3,xmm5
+    pandn       xmm1,xmm0
+    por         xmm3,xmm1
+    packuswb    xmm4,xmm3
+    movdqa      [esp+0A0h],xmm4
+    mov         esi,dword [esp+10h]
+    movdqa      xmm0,[esi]
+    movdqa      xmm1,[esi+10h]
+    movdqa      xmm2,[esi+20h]
+    movdqa      xmm3,[esi+30h]
+    movdqa      xmm6,xmm0
+    punpcklbw   xmm0,xmm1
+    punpckhbw   xmm6,xmm1
+    movdqa      xmm7,xmm2
+    punpcklbw   xmm2,xmm3
+    punpckhbw   xmm7,xmm3
+    movdqa      xmm4,xmm0
+    movdqa      xmm5,xmm6
+    punpcklwd   xmm0,xmm2
+    punpckhwd   xmm4,xmm2
+    punpcklwd   xmm6,xmm7
+    punpckhwd   xmm5,xmm7
+    movdqa      xmm1,xmm0
+    movdqa      xmm2,xmm4
+    punpckldq   xmm0,xmm6
+    punpckhdq   xmm1,xmm6
+    punpckldq   xmm4,xmm5
+    punpckhdq   xmm2,xmm5
+    movdqa      xmm5,xmm0
+    movdqa      xmm6,xmm1
+    punpcklqdq  xmm0,xmm4
+    punpckhqdq  xmm5,xmm4
+    punpcklqdq  xmm1,xmm2
+    punpckhqdq  xmm6,xmm2
+    mov         esi,dword [esp+1Ch]
+    mov         ecx,dword [ebp+10h]
+    mov         edx,dword [esp+14h]
+    mov         edi,dword [esp+8]
+    movd        dword [esi],xmm0
+    movd        dword [esi+ecx],xmm5
+    movd        dword [esi+ecx*2],xmm1
+    movd        dword [esi+edx],xmm6
+    psrldq      xmm0,4
+    psrldq      xmm5,4
+    psrldq      xmm1,4
+    psrldq      xmm6,4
+    mov         esi,dword [esp+18h]
+    movd        dword [edi],xmm0
+    movd        dword [edi+ecx],xmm5
+    movd        dword [edi+ecx*2],xmm1
+    movd        dword [edi+edx],xmm6
+    psrldq      xmm0,4
+    psrldq      xmm5,4
+    psrldq      xmm1,4
+    psrldq      xmm6,4
+    movd        dword [esi],xmm0
+    movd        dword [esi+ecx],xmm5
+    movd        dword [esi+ecx*2],xmm1
+    movd        dword [esi+edx],xmm6
+    psrldq      xmm0,4
+    psrldq      xmm5,4
+    psrldq      xmm1,4
+    psrldq      xmm6,4
+    mov         edi,dword [esp+0Ch]
+    movd        dword [edi],xmm0
+    movd        dword [edi+ecx],xmm5
+    movd        dword [edi+ecx*2],xmm1
+    movd        dword [edi+edx],xmm6
+    pop         edi
+    pop         esi
+    mov         esp,ebp
+    pop         ebp
+    ret
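
The routine above implements the H.264 chroma strong filter (bS == 4): the paddw/paddw/psraw 2 sequences compute the averaged replacement samples and the pand/pandn/por triples select between filtered and original pixels. Per pixel it reduces to roughly the C sketch below; this is illustrative only, the helper name is an assumption and not code from this repository:

    #include <stdint.h>

    /* Strong (bS == 4) chroma filtering of one edge position, per the H.264 spec.
       p1/p0 sit on one side of the edge, q0/q1 on the other; the caller applies
       this only where |p0-q0| < alpha and |p1-p0|, |q1-q0| < beta, which is what
       the pabsw/pcmpgtw mask building in the assembly computes. */
    static void chroma_strong_filter(uint8_t *p0, uint8_t *q0,
                                     uint8_t p1, uint8_t q1) {
        uint8_t p0_in = *p0, q0_in = *q0;
        *p0 = (uint8_t)((2 * p1 + p0_in + q1 + 2) >> 2);
        *q0 = (uint8_t)((2 * q1 + q0_in + p1 + 2) >> 2);
    }

The branchless select in the assembly (pand with the condition mask, pandn with the original sample, por) is what stands in for the "apply only where the tests pass" condition of the sketch.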
 
 ;*******************************************************************************
 ;    void DeblockChromaLt4H_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
@@ -3882,308 +3882,308 @@
 ;*******************************************************************************
 
 WELS_EXTERN  DeblockChromaLt4H_ssse3
-  push        ebp
-  mov         ebp,esp
-  and         esp,0FFFFFFF0h
-  sub         esp,108h
-  mov         ecx,dword [ebp+8]
-  mov         edx,dword [ebp+0Ch]
-  mov         eax,dword [ebp+10h]
-  sub         ecx,2
-  sub         edx,2
-  push        esi
-  lea         esi,[eax+eax*2]
-  mov         dword [esp+10h],ecx
-  mov         dword [esp+4],edx
-  lea         ecx,[ecx+eax*4]
-  lea         edx,[edx+eax*4]
-  lea         eax,[esp+6Ch]
-  push        edi
-  mov         dword [esp+0Ch],esi
-  mov         dword [esp+18h],ecx
-  mov         dword [esp+10h],edx
-  mov         dword [esp+1Ch],eax
-  mov         esi,dword [esp+14h]
-  mov         ecx,dword [ebp+10h]
-  mov         edx,dword [esp+0Ch]
-  movd        xmm0,dword [esi]
-  movd        xmm1,dword [esi+ecx]
-  movd        xmm2,dword [esi+ecx*2]
-  movd        xmm3,dword [esi+edx]
-  mov         esi,dword [esp+8]
-  movd        xmm4,dword [esi]
-  movd        xmm5,dword [esi+ecx]
-  movd        xmm6,dword [esi+ecx*2]
-  movd        xmm7,dword [esi+edx]
-  punpckldq   xmm0,xmm4
-  punpckldq   xmm1,xmm5
-  punpckldq   xmm2,xmm6
-  punpckldq   xmm3,xmm7
-  mov         esi,dword [esp+18h]
-  mov         edi,dword [esp+10h]
-  movd        xmm4,dword [esi]
-  movd        xmm5,dword [edi]
-  punpckldq   xmm4,xmm5
-  punpcklqdq  xmm0,xmm4
-  movd        xmm4,dword [esi+ecx]
-  movd        xmm5,dword [edi+ecx]
-  punpckldq   xmm4,xmm5
-  punpcklqdq  xmm1,xmm4
-  movd        xmm4,dword [esi+ecx*2]
-  movd        xmm5,dword [edi+ecx*2]
-  punpckldq   xmm4,xmm5
-  punpcklqdq  xmm2,xmm4
-  movd        xmm4,dword [esi+edx]
-  movd        xmm5,dword [edi+edx]
-  punpckldq   xmm4,xmm5
-  punpcklqdq  xmm3,xmm4
-  movdqa      xmm6,xmm0
-  punpcklbw   xmm0,xmm1
-  punpckhbw   xmm6,xmm1
-  movdqa      xmm7,xmm2
-  punpcklbw   xmm2,xmm3
-  punpckhbw   xmm7,xmm3
-  movdqa      xmm4,xmm0
-  movdqa      xmm5,xmm6
-  punpcklwd   xmm0,xmm2
-  punpckhwd   xmm4,xmm2
-  punpcklwd   xmm6,xmm7
-  punpckhwd   xmm5,xmm7
-  movdqa      xmm1,xmm0
-  movdqa      xmm2,xmm4
-  punpckldq   xmm0,xmm6
-  punpckhdq   xmm1,xmm6
-  punpckldq   xmm4,xmm5
-  punpckhdq   xmm2,xmm5
-  movdqa      xmm5,xmm0
-  movdqa      xmm6,xmm1
-  punpcklqdq  xmm0,xmm4
-  punpckhqdq  xmm5,xmm4
-  punpcklqdq  xmm1,xmm2
-  punpckhqdq  xmm6,xmm2
-  mov         edi,dword [esp+1Ch]
-  movdqa      [edi],xmm0
-  movdqa      [edi+10h],xmm5
-  movdqa      [edi+20h],xmm1
-  movdqa      [edi+30h],xmm6
-  mov         eax,dword [ebp+1Ch]
-  movsx       cx,byte [eax+3]
-  movsx       dx,byte [eax+2]
-  movsx       si,byte [eax+1]
-  movsx       ax,byte [eax]
-  movzx       edi,cx
-  movzx       ecx,cx
-  movd        xmm2,ecx
-  movzx       ecx,dx
-  movzx       edx,dx
-  movd        xmm3,ecx
-  movd        xmm4,edx
-  movzx       ecx,si
-  movzx       edx,si
-  movd        xmm5,ecx
-  pxor        xmm0,xmm0
-  movd        xmm6,edx
-  movzx       ecx,ax
-  movdqa      [esp+60h],xmm0
-  movzx       edx,ax
-  movsx       eax,word [ebp+14h]
-  punpcklwd   xmm6,xmm2
-  movd        xmm1,edi
-  movd        xmm7,ecx
-  movsx       ecx,word [ebp+18h]
-  movd        xmm0,edx
-  punpcklwd   xmm7,xmm3
-  punpcklwd   xmm5,xmm1
-  movdqa      xmm1,[esp+60h]
-  punpcklwd   xmm7,xmm5
-  movdqa      xmm5,[esp+0A0h]
-  punpcklwd   xmm0,xmm4
-  punpcklwd   xmm0,xmm6
-  movdqa      xmm6, [esp+70h]
-  punpcklwd   xmm0,xmm7
-  movdqa      xmm7,[esp+80h]
-  movdqa      xmm2,xmm1
-  psubw       xmm2,xmm0
-  movdqa      [esp+0D0h],xmm2
-  movd        xmm2,eax
-  movdqa      xmm3,xmm2
-  punpcklwd   xmm3,xmm2
-  pshufd      xmm4,xmm3,0
-  movd        xmm2,ecx
-  movdqa      xmm3,xmm2
-  punpcklwd   xmm3,xmm2
-  pshufd      xmm2,xmm3,0
-  movdqa      xmm3, [esp+90h]
-  movdqa      [esp+50h],xmm2
-  movdqa      xmm2,xmm6
-  punpcklbw   xmm2,xmm1
-  punpckhbw   xmm6,xmm1
-  movdqa      [esp+40h],xmm2
-  movdqa      [esp+0B0h],xmm6
-  movdqa      xmm6,[esp+90h]
-  movdqa      xmm2,xmm7
-  punpckhbw   xmm7,xmm1
-  punpckhbw   xmm6,xmm1
-  punpcklbw   xmm2,xmm1
-  punpcklbw   xmm3,xmm1
-  punpcklbw   xmm5,xmm1
-  movdqa      [esp+0F0h],xmm7
-  movdqa      [esp+0C0h],xmm6
-  movdqa      xmm6, [esp+0A0h]
-  punpckhbw   xmm6,xmm1
-  movdqa      [esp+0E0h],xmm6
-  mov         edx,4
-  movsx       eax,dx
-  movd        xmm6,eax
-  movdqa      xmm7,xmm6
-  punpcklwd   xmm7,xmm6
-  pshufd      xmm6,xmm7,0
-  movdqa      [esp+30h],xmm6
-  movdqa      xmm7, [esp+40h]
-  psubw       xmm7,xmm5
-  movdqa      xmm6,xmm0
-  pcmpgtw     xmm6,xmm1
-  movdqa      [esp+60h],xmm6
-  movdqa      xmm1, [esp+0D0h]
-  movdqa      xmm6,xmm3
-  psubw       xmm6,xmm2
-  psllw       xmm6,2
-  paddw       xmm6,xmm7
-  paddw       xmm6,[esp+30h]
-  psraw       xmm6,3
-  pmaxsw      xmm1,xmm6
-  movdqa      xmm7,[esp+50h]
-  movdqa      [esp+20h],xmm0
-  movdqa      xmm6, [esp+20h]
-  pminsw      xmm6,xmm1
-  movdqa      [esp+20h],xmm6
-  movdqa      xmm6,xmm4
-  movdqa      xmm1,xmm2
-  psubw       xmm1,xmm3
-  pabsw       xmm1,xmm1
-  pcmpgtw     xmm6,xmm1
-  movdqa      xmm1, [esp+40h]
-  psubw       xmm1,xmm2
-  pabsw       xmm1,xmm1
-  pcmpgtw     xmm7,xmm1
-  movdqa      xmm1, [esp+50h]
-  pand        xmm6,xmm7
-  movdqa      xmm7, [esp+50h]
-  psubw       xmm5,xmm3
-  pabsw       xmm5,xmm5
-  pcmpgtw     xmm1,xmm5
-  movdqa      xmm5, [esp+0B0h]
-  psubw       xmm5,[esp+0E0h]
-  pand        xmm6,xmm1
-  pand        xmm6, [esp+60h]
-  movdqa      xmm1, [esp+20h]
-  pand        xmm1,xmm6
-  movdqa      xmm6, [esp+0C0h]
-  movdqa      [esp+40h],xmm1
-  movdqa      xmm1, [esp+0F0h]
-  psubw       xmm6,xmm1
-  psllw       xmm6,2
-  paddw       xmm6,xmm5
-  paddw       xmm6, [esp+30h]
-  movdqa      xmm5, [esp+0D0h]
-  psraw       xmm6,3
-  pmaxsw      xmm5,xmm6
-  pminsw      xmm0,xmm5
-  movdqa      xmm5,[esp+0C0h]
-  movdqa      xmm6,xmm1
-  psubw       xmm6,xmm5
-  pabsw       xmm6,xmm6
-  pcmpgtw     xmm4,xmm6
-  movdqa      xmm6,[esp+0B0h]
-  psubw       xmm6,xmm1
-  pabsw       xmm6,xmm6
-  pcmpgtw     xmm7,xmm6
-  movdqa      xmm6, [esp+0E0h]
-  pand        xmm4,xmm7
-  movdqa      xmm7, [esp+50h]
-  psubw       xmm6,xmm5
-  pabsw       xmm6,xmm6
-  pcmpgtw     xmm7,xmm6
-  pand        xmm4,xmm7
-  pand        xmm4,[esp+60h]
-  pand        xmm0,xmm4
-  movdqa      xmm4, [esp+40h]
-  paddw       xmm2,xmm4
-  paddw       xmm1,xmm0
-  psubw       xmm3,xmm4
-  psubw       xmm5,xmm0
-  packuswb    xmm2,xmm1
-  packuswb    xmm3,xmm5
-  movdqa      [esp+80h],xmm2
-  movdqa      [esp+90h],xmm3
-  mov         esi,dword [esp+1Ch]
-  movdqa      xmm0, [esi]
-  movdqa      xmm1, [esi+10h]
-  movdqa      xmm2, [esi+20h]
-  movdqa      xmm3, [esi+30h]
-  movdqa      xmm6,xmm0
-  punpcklbw   xmm0,xmm1
-  punpckhbw   xmm6,xmm1
-  movdqa      xmm7,xmm2
-  punpcklbw   xmm2,xmm3
-  punpckhbw   xmm7,xmm3
-  movdqa      xmm4,xmm0
-  movdqa      xmm5,xmm6
-  punpcklwd   xmm0,xmm2
-  punpckhwd   xmm4,xmm2
-  punpcklwd   xmm6,xmm7
-  punpckhwd   xmm5,xmm7
-  movdqa      xmm1,xmm0
-  movdqa      xmm2,xmm4
-  punpckldq   xmm0,xmm6
-  punpckhdq   xmm1,xmm6
-  punpckldq   xmm4,xmm5
-  punpckhdq   xmm2,xmm5
-  movdqa      xmm5,xmm0
-  movdqa      xmm6,xmm1
-  punpcklqdq  xmm0,xmm4
-  punpckhqdq  xmm5,xmm4
-  punpcklqdq  xmm1,xmm2
-  punpckhqdq  xmm6,xmm2
-  mov         esi,dword [esp+14h]
-  mov         ecx,dword [ebp+10h]
-  mov         edx,dword [esp+0Ch]
-  mov         edi,dword [esp+8]
-  movd        dword [esi],xmm0
-  movd        dword [esi+ecx],xmm5
-  movd        dword [esi+ecx*2],xmm1
-  movd        dword [esi+edx],xmm6
-  psrldq      xmm0,4
-  psrldq      xmm5,4
-  psrldq      xmm1,4
-  psrldq      xmm6,4
-  mov         esi,dword [esp+18h]
-  movd        dword [edi],xmm0
-  movd        dword [edi+ecx],xmm5
-  movd        dword [edi+ecx*2],xmm1
-  movd        dword [edi+edx],xmm6
-  psrldq      xmm0,4
-  psrldq      xmm5,4
-  psrldq      xmm1,4
-  psrldq      xmm6,4
-  movd        dword [esi],xmm0
-  movd        dword [esi+ecx],xmm5
-  movd        dword [esi+ecx*2],xmm1
-  movd        dword [esi+edx],xmm6
-  psrldq      xmm0,4
-  psrldq      xmm5,4
-  psrldq      xmm1,4
-  psrldq      xmm6,4
-  mov         edi,dword [esp+10h]
-  movd        dword [edi],xmm0
-  movd        dword [edi+ecx],xmm5
-  movd        dword [edi+ecx*2],xmm1
-  movd        dword [edi+edx],xmm6
-  pop         edi
-  pop         esi
-  mov         esp,ebp
-  pop         ebp
-  ret
+    push        ebp
+    mov         ebp,esp
+    and         esp,0FFFFFFF0h
+    sub         esp,108h
+    mov         ecx,dword [ebp+8]
+    mov         edx,dword [ebp+0Ch]
+    mov         eax,dword [ebp+10h]
+    sub         ecx,2
+    sub         edx,2
+    push        esi
+    lea         esi,[eax+eax*2]
+    mov         dword [esp+10h],ecx
+    mov         dword [esp+4],edx
+    lea         ecx,[ecx+eax*4]
+    lea         edx,[edx+eax*4]
+    lea         eax,[esp+6Ch]
+    push        edi
+    mov         dword [esp+0Ch],esi
+    mov         dword [esp+18h],ecx
+    mov         dword [esp+10h],edx
+    mov         dword [esp+1Ch],eax
+    mov         esi,dword [esp+14h]
+    mov         ecx,dword [ebp+10h]
+    mov         edx,dword [esp+0Ch]
+    movd        xmm0,dword [esi]
+    movd        xmm1,dword [esi+ecx]
+    movd        xmm2,dword [esi+ecx*2]
+    movd        xmm3,dword [esi+edx]
+    mov         esi,dword [esp+8]
+    movd        xmm4,dword [esi]
+    movd        xmm5,dword [esi+ecx]
+    movd        xmm6,dword [esi+ecx*2]
+    movd        xmm7,dword [esi+edx]
+    punpckldq   xmm0,xmm4
+    punpckldq   xmm1,xmm5
+    punpckldq   xmm2,xmm6
+    punpckldq   xmm3,xmm7
+    mov         esi,dword [esp+18h]
+    mov         edi,dword [esp+10h]
+    movd        xmm4,dword [esi]
+    movd        xmm5,dword [edi]
+    punpckldq   xmm4,xmm5
+    punpcklqdq  xmm0,xmm4
+    movd        xmm4,dword [esi+ecx]
+    movd        xmm5,dword [edi+ecx]
+    punpckldq   xmm4,xmm5
+    punpcklqdq  xmm1,xmm4
+    movd        xmm4,dword [esi+ecx*2]
+    movd        xmm5,dword [edi+ecx*2]
+    punpckldq   xmm4,xmm5
+    punpcklqdq  xmm2,xmm4
+    movd        xmm4,dword [esi+edx]
+    movd        xmm5,dword [edi+edx]
+    punpckldq   xmm4,xmm5
+    punpcklqdq  xmm3,xmm4
+    movdqa      xmm6,xmm0
+    punpcklbw   xmm0,xmm1
+    punpckhbw   xmm6,xmm1
+    movdqa      xmm7,xmm2
+    punpcklbw   xmm2,xmm3
+    punpckhbw   xmm7,xmm3
+    movdqa      xmm4,xmm0
+    movdqa      xmm5,xmm6
+    punpcklwd   xmm0,xmm2
+    punpckhwd   xmm4,xmm2
+    punpcklwd   xmm6,xmm7
+    punpckhwd   xmm5,xmm7
+    movdqa      xmm1,xmm0
+    movdqa      xmm2,xmm4
+    punpckldq   xmm0,xmm6
+    punpckhdq   xmm1,xmm6
+    punpckldq   xmm4,xmm5
+    punpckhdq   xmm2,xmm5
+    movdqa      xmm5,xmm0
+    movdqa      xmm6,xmm1
+    punpcklqdq  xmm0,xmm4
+    punpckhqdq  xmm5,xmm4
+    punpcklqdq  xmm1,xmm2
+    punpckhqdq  xmm6,xmm2
+    mov         edi,dword [esp+1Ch]
+    movdqa      [edi],xmm0
+    movdqa      [edi+10h],xmm5
+    movdqa      [edi+20h],xmm1
+    movdqa      [edi+30h],xmm6
+    mov         eax,dword [ebp+1Ch]
+    movsx       cx,byte [eax+3]
+    movsx       dx,byte [eax+2]
+    movsx       si,byte [eax+1]
+    movsx       ax,byte [eax]
+    movzx       edi,cx
+    movzx       ecx,cx
+    movd        xmm2,ecx
+    movzx       ecx,dx
+    movzx       edx,dx
+    movd        xmm3,ecx
+    movd        xmm4,edx
+    movzx       ecx,si
+    movzx       edx,si
+    movd        xmm5,ecx
+    pxor        xmm0,xmm0
+    movd        xmm6,edx
+    movzx       ecx,ax
+    movdqa      [esp+60h],xmm0
+    movzx       edx,ax
+    movsx       eax,word [ebp+14h]
+    punpcklwd   xmm6,xmm2
+    movd        xmm1,edi
+    movd        xmm7,ecx
+    movsx       ecx,word [ebp+18h]
+    movd        xmm0,edx
+    punpcklwd   xmm7,xmm3
+    punpcklwd   xmm5,xmm1
+    movdqa      xmm1,[esp+60h]
+    punpcklwd   xmm7,xmm5
+    movdqa      xmm5,[esp+0A0h]
+    punpcklwd   xmm0,xmm4
+    punpcklwd   xmm0,xmm6
+    movdqa      xmm6, [esp+70h]
+    punpcklwd   xmm0,xmm7
+    movdqa      xmm7,[esp+80h]
+    movdqa      xmm2,xmm1
+    psubw       xmm2,xmm0
+    movdqa      [esp+0D0h],xmm2
+    movd        xmm2,eax
+    movdqa      xmm3,xmm2
+    punpcklwd   xmm3,xmm2
+    pshufd      xmm4,xmm3,0
+    movd        xmm2,ecx
+    movdqa      xmm3,xmm2
+    punpcklwd   xmm3,xmm2
+    pshufd      xmm2,xmm3,0
+    movdqa      xmm3, [esp+90h]
+    movdqa      [esp+50h],xmm2
+    movdqa      xmm2,xmm6
+    punpcklbw   xmm2,xmm1
+    punpckhbw   xmm6,xmm1
+    movdqa      [esp+40h],xmm2
+    movdqa      [esp+0B0h],xmm6
+    movdqa      xmm6,[esp+90h]
+    movdqa      xmm2,xmm7
+    punpckhbw   xmm7,xmm1
+    punpckhbw   xmm6,xmm1
+    punpcklbw   xmm2,xmm1
+    punpcklbw   xmm3,xmm1
+    punpcklbw   xmm5,xmm1
+    movdqa      [esp+0F0h],xmm7
+    movdqa      [esp+0C0h],xmm6
+    movdqa      xmm6, [esp+0A0h]
+    punpckhbw   xmm6,xmm1
+    movdqa      [esp+0E0h],xmm6
+    mov         edx,4
+    movsx       eax,dx
+    movd        xmm6,eax
+    movdqa      xmm7,xmm6
+    punpcklwd   xmm7,xmm6
+    pshufd      xmm6,xmm7,0
+    movdqa      [esp+30h],xmm6
+    movdqa      xmm7, [esp+40h]
+    psubw       xmm7,xmm5
+    movdqa      xmm6,xmm0
+    pcmpgtw     xmm6,xmm1
+    movdqa      [esp+60h],xmm6
+    movdqa      xmm1, [esp+0D0h]
+    movdqa      xmm6,xmm3
+    psubw       xmm6,xmm2
+    psllw       xmm6,2
+    paddw       xmm6,xmm7
+    paddw       xmm6,[esp+30h]
+    psraw       xmm6,3
+    pmaxsw      xmm1,xmm6
+    movdqa      xmm7,[esp+50h]
+    movdqa      [esp+20h],xmm0
+    movdqa      xmm6, [esp+20h]
+    pminsw      xmm6,xmm1
+    movdqa      [esp+20h],xmm6
+    movdqa      xmm6,xmm4
+    movdqa      xmm1,xmm2
+    psubw       xmm1,xmm3
+    pabsw       xmm1,xmm1
+    pcmpgtw     xmm6,xmm1
+    movdqa      xmm1, [esp+40h]
+    psubw       xmm1,xmm2
+    pabsw       xmm1,xmm1
+    pcmpgtw     xmm7,xmm1
+    movdqa      xmm1, [esp+50h]
+    pand        xmm6,xmm7
+    movdqa      xmm7, [esp+50h]
+    psubw       xmm5,xmm3
+    pabsw       xmm5,xmm5
+    pcmpgtw     xmm1,xmm5
+    movdqa      xmm5, [esp+0B0h]
+    psubw       xmm5,[esp+0E0h]
+    pand        xmm6,xmm1
+    pand        xmm6, [esp+60h]
+    movdqa      xmm1, [esp+20h]
+    pand        xmm1,xmm6
+    movdqa      xmm6, [esp+0C0h]
+    movdqa      [esp+40h],xmm1
+    movdqa      xmm1, [esp+0F0h]
+    psubw       xmm6,xmm1
+    psllw       xmm6,2
+    paddw       xmm6,xmm5
+    paddw       xmm6, [esp+30h]
+    movdqa      xmm5, [esp+0D0h]
+    psraw       xmm6,3
+    pmaxsw      xmm5,xmm6
+    pminsw      xmm0,xmm5
+    movdqa      xmm5,[esp+0C0h]
+    movdqa      xmm6,xmm1
+    psubw       xmm6,xmm5
+    pabsw       xmm6,xmm6
+    pcmpgtw     xmm4,xmm6
+    movdqa      xmm6,[esp+0B0h]
+    psubw       xmm6,xmm1
+    pabsw       xmm6,xmm6
+    pcmpgtw     xmm7,xmm6
+    movdqa      xmm6, [esp+0E0h]
+    pand        xmm4,xmm7
+    movdqa      xmm7, [esp+50h]
+    psubw       xmm6,xmm5
+    pabsw       xmm6,xmm6
+    pcmpgtw     xmm7,xmm6
+    pand        xmm4,xmm7
+    pand        xmm4,[esp+60h]
+    pand        xmm0,xmm4
+    movdqa      xmm4, [esp+40h]
+    paddw       xmm2,xmm4
+    paddw       xmm1,xmm0
+    psubw       xmm3,xmm4
+    psubw       xmm5,xmm0
+    packuswb    xmm2,xmm1
+    packuswb    xmm3,xmm5
+    movdqa      [esp+80h],xmm2
+    movdqa      [esp+90h],xmm3
+    mov         esi,dword [esp+1Ch]
+    movdqa      xmm0, [esi]
+    movdqa      xmm1, [esi+10h]
+    movdqa      xmm2, [esi+20h]
+    movdqa      xmm3, [esi+30h]
+    movdqa      xmm6,xmm0
+    punpcklbw   xmm0,xmm1
+    punpckhbw   xmm6,xmm1
+    movdqa      xmm7,xmm2
+    punpcklbw   xmm2,xmm3
+    punpckhbw   xmm7,xmm3
+    movdqa      xmm4,xmm0
+    movdqa      xmm5,xmm6
+    punpcklwd   xmm0,xmm2
+    punpckhwd   xmm4,xmm2
+    punpcklwd   xmm6,xmm7
+    punpckhwd   xmm5,xmm7
+    movdqa      xmm1,xmm0
+    movdqa      xmm2,xmm4
+    punpckldq   xmm0,xmm6
+    punpckhdq   xmm1,xmm6
+    punpckldq   xmm4,xmm5
+    punpckhdq   xmm2,xmm5
+    movdqa      xmm5,xmm0
+    movdqa      xmm6,xmm1
+    punpcklqdq  xmm0,xmm4
+    punpckhqdq  xmm5,xmm4
+    punpcklqdq  xmm1,xmm2
+    punpckhqdq  xmm6,xmm2
+    mov         esi,dword [esp+14h]
+    mov         ecx,dword [ebp+10h]
+    mov         edx,dword [esp+0Ch]
+    mov         edi,dword [esp+8]
+    movd        dword [esi],xmm0
+    movd        dword [esi+ecx],xmm5
+    movd        dword [esi+ecx*2],xmm1
+    movd        dword [esi+edx],xmm6
+    psrldq      xmm0,4
+    psrldq      xmm5,4
+    psrldq      xmm1,4
+    psrldq      xmm6,4
+    mov         esi,dword [esp+18h]
+    movd        dword [edi],xmm0
+    movd        dword [edi+ecx],xmm5
+    movd        dword [edi+ecx*2],xmm1
+    movd        dword [edi+edx],xmm6
+    psrldq      xmm0,4
+    psrldq      xmm5,4
+    psrldq      xmm1,4
+    psrldq      xmm6,4
+    movd        dword [esi],xmm0
+    movd        dword [esi+ecx],xmm5
+    movd        dword [esi+ecx*2],xmm1
+    movd        dword [esi+edx],xmm6
+    psrldq      xmm0,4
+    psrldq      xmm5,4
+    psrldq      xmm1,4
+    psrldq      xmm6,4
+    mov         edi,dword [esp+10h]
+    movd        dword [edi],xmm0
+    movd        dword [edi+ecx],xmm5
+    movd        dword [edi+ecx*2],xmm1
+    movd        dword [edi+edx],xmm6
+    pop         edi
+    pop         esi
+    mov         esp,ebp
+    pop         ebp
+    ret
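
In the bS < 4 chroma path above, the psllw 2 / paddw / psraw 3 chain followed by pmaxsw/pminsw is the spec's clipped weak-filter delta. A minimal scalar sketch, with hypothetical helper names (not code from this repository):

    #include <stdint.h>

    static int32_t clip3(int32_t lo, int32_t hi, int32_t v) {
        return v < lo ? lo : (v > hi ? hi : v);
    }

    /* Weak (bS < 4) chroma filtering of one edge position, per the H.264 spec:
       the delta is clamped to [-tc, tc] before being applied to p0 and q0.
       The >> 3 assumes an arithmetic shift, matching psraw. */
    static void chroma_weak_filter(uint8_t *p0, uint8_t *q0,
                                   uint8_t p1, uint8_t q1, int32_t tc) {
        int32_t delta = clip3(-tc, tc,
                              (((*q0 - *p0) << 2) + (p1 - q1) + 4) >> 3);
        *p0 = (uint8_t)clip3(0, 255, *p0 + delta);
        *q0 = (uint8_t)clip3(0, 255, *q0 - delta);
    }

The pmaxsw against the negated tc vector plus the pminsw against tc corresponds to the clip3 call, and the final pand with the alpha/beta comparison masks leaves columns that fail the filter conditions unchanged.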
 
 
 
@@ -4194,385 +4194,385 @@
 
 
 WELS_EXTERN  DeblockLumaLt4V_ssse3
-    push	ebp
-	mov	ebp, esp
-	and	esp, -16				; fffffff0H
-	sub	esp, 420				; 000001a4H
-	mov	eax, dword [ebp+8]
-	mov	ecx, dword [ebp+12]
+    push    ebp
+    mov ebp, esp
+    and esp, -16                ; fffffff0H
+    sub esp, 420                ; 000001a4H
+    mov eax, dword [ebp+8]
+    mov ecx, dword [ebp+12]
 
-	pxor	xmm0, xmm0
-	push	ebx
-	mov	edx, dword [ebp+24]
-	movdqa	[esp+424-384], xmm0
-	push	esi
+    pxor    xmm0, xmm0
+    push    ebx
+    mov edx, dword [ebp+24]
+    movdqa  [esp+424-384], xmm0
+    push    esi
 
-	lea	esi, [ecx+ecx*2]
-	push	edi
-	mov	edi, eax
-	sub	edi, esi
-	movdqa	xmm0, [edi]
+    lea esi, [ecx+ecx*2]
+    push    edi
+    mov edi, eax
+    sub edi, esi
+    movdqa  xmm0, [edi]
 
-	lea	esi, [ecx+ecx]
-	movdqa	[esp+432-208], xmm0
-	mov	edi, eax
-	sub	edi, esi
-	movdqa	xmm0, [edi]
-	movdqa	[esp+448-208], xmm0
+    lea esi, [ecx+ecx]
+    movdqa  [esp+432-208], xmm0
+    mov edi, eax
+    sub edi, esi
+    movdqa  xmm0, [edi]
+    movdqa  [esp+448-208], xmm0
 
-	mov	ebx, eax
-	sub	ebx, ecx
-	movdqa	xmm0, [ebx]
-	movdqa	[esp+464-208], xmm0
+    mov ebx, eax
+    sub ebx, ecx
+    movdqa  xmm0, [ebx]
+    movdqa  [esp+464-208], xmm0
 
-	movdqa	xmm0, [eax]
+    movdqa  xmm0, [eax]
 
-	add	ecx, eax
-	movdqa	[esp+480-208], xmm0
-	movdqa	xmm0, [ecx]
-	mov	dword [esp+432-404], ecx
+    add ecx, eax
+    movdqa  [esp+480-208], xmm0
+    movdqa  xmm0, [ecx]
+    mov dword [esp+432-404], ecx
 
-	movsx	ecx, word [ebp+16]
-	movdqa	[esp+496-208], xmm0
-	movdqa	xmm0, [esi+eax]
+    movsx   ecx, word [ebp+16]
+    movdqa  [esp+496-208], xmm0
+    movdqa  xmm0, [esi+eax]
 
-	movsx	si, byte [edx]
-	movdqa	[esp+512-208], xmm0
-	movd	xmm0, ecx
-	movsx	ecx, word [ebp+20]
-	movdqa	xmm1, xmm0
-	punpcklwd xmm1, xmm0
-	pshufd	xmm0, xmm1, 0
-	movdqa	[esp+432-112], xmm0
-	movd	xmm0, ecx
-	movsx	cx, byte [edx+1]
-	movdqa	xmm1, xmm0
-	punpcklwd xmm1, xmm0
-	mov	dword [esp+432-408], ebx
-	movzx	ebx, cx
-	pshufd	xmm0, xmm1, 0
-	movd	xmm1, ebx
-	movzx	ebx, cx
-	movd	xmm2, ebx
-	movzx	ebx, cx
-	movzx	ecx, cx
-	movd	xmm4, ecx
-	movzx	ecx, si
-	movd	xmm5, ecx
-	movzx	ecx, si
-	movd	xmm6, ecx
-	movzx	ecx, si
-	movd	xmm7, ecx
-	movzx	ecx, si
-	movdqa	[esp+432-336], xmm0
-	movd	xmm0, ecx
+    movsx   si, byte [edx]
+    movdqa  [esp+512-208], xmm0
+    movd    xmm0, ecx
+    movsx   ecx, word [ebp+20]
+    movdqa  xmm1, xmm0
+    punpcklwd xmm1, xmm0
+    pshufd  xmm0, xmm1, 0
+    movdqa  [esp+432-112], xmm0
+    movd    xmm0, ecx
+    movsx   cx, byte [edx+1]
+    movdqa  xmm1, xmm0
+    punpcklwd xmm1, xmm0
+    mov dword [esp+432-408], ebx
+    movzx   ebx, cx
+    pshufd  xmm0, xmm1, 0
+    movd    xmm1, ebx
+    movzx   ebx, cx
+    movd    xmm2, ebx
+    movzx   ebx, cx
+    movzx   ecx, cx
+    movd    xmm4, ecx
+    movzx   ecx, si
+    movd    xmm5, ecx
+    movzx   ecx, si
+    movd    xmm6, ecx
+    movzx   ecx, si
+    movd    xmm7, ecx
+    movzx   ecx, si
+    movdqa  [esp+432-336], xmm0
+    movd    xmm0, ecx
 
-	movsx	cx, byte [edx+3]
-	movsx	dx, byte [edx+2]
-	movd	xmm3, ebx
-	punpcklwd xmm0, xmm4
-	movzx	esi, cx
-	punpcklwd xmm6, xmm2
-	punpcklwd xmm5, xmm1
-	punpcklwd xmm0, xmm6
-	punpcklwd xmm7, xmm3
-	punpcklwd xmm7, xmm5
-	punpcklwd xmm0, xmm7
-	movdqa	[esp+432-400], xmm0
-	movd	xmm0, esi
-	movzx	esi, cx
-	movd	xmm2, esi
-	movzx	esi, cx
-	movzx	ecx, cx
-	movd	xmm4, ecx
-	movzx	ecx, dx
-	movd	xmm3, esi
-	movd	xmm5, ecx
-	punpcklwd xmm5, xmm0
+    movsx   cx, byte [edx+3]
+    movsx   dx, byte [edx+2]
+    movd    xmm3, ebx
+    punpcklwd xmm0, xmm4
+    movzx   esi, cx
+    punpcklwd xmm6, xmm2
+    punpcklwd xmm5, xmm1
+    punpcklwd xmm0, xmm6
+    punpcklwd xmm7, xmm3
+    punpcklwd xmm7, xmm5
+    punpcklwd xmm0, xmm7
+    movdqa  [esp+432-400], xmm0
+    movd    xmm0, esi
+    movzx   esi, cx
+    movd    xmm2, esi
+    movzx   esi, cx
+    movzx   ecx, cx
+    movd    xmm4, ecx
+    movzx   ecx, dx
+    movd    xmm3, esi
+    movd    xmm5, ecx
+    punpcklwd xmm5, xmm0
 
-	movdqa	xmm0, [esp+432-384]
-	movzx	ecx, dx
-	movd	xmm6, ecx
-	movzx	ecx, dx
-	movzx	edx, dx
-	punpcklwd xmm6, xmm2
-	movd	xmm7, ecx
-	movd	xmm1, edx
+    movdqa  xmm0, [esp+432-384]
+    movzx   ecx, dx
+    movd    xmm6, ecx
+    movzx   ecx, dx
+    movzx   edx, dx
+    punpcklwd xmm6, xmm2
+    movd    xmm7, ecx
+    movd    xmm1, edx
 
-	movdqa	xmm2, [esp+448-208]
-	punpcklbw xmm2, xmm0
+    movdqa  xmm2, [esp+448-208]
+    punpcklbw xmm2, xmm0
 
-	mov	ecx, 4
-	movsx	edx, cx
-	punpcklwd xmm7, xmm3
-	punpcklwd xmm7, xmm5
-	movdqa	xmm5, [esp+496-208]
-	movdqa	xmm3, [esp+464-208]
-	punpcklbw xmm5, xmm0
-	movdqa	[esp+432-240], xmm5
-	movdqa	xmm5, [esp+512-208]
-	punpcklbw xmm5, xmm0
-	movdqa	[esp+432-352], xmm5
-	punpcklwd xmm1, xmm4
-	movdqa	xmm4, [esp+432-208]
-	punpcklwd xmm1, xmm6
-	movdqa	xmm6, [esp+480-208]
-	punpcklwd xmm1, xmm7
-	punpcklbw xmm6, xmm0
-	punpcklbw xmm3, xmm0
-	punpcklbw xmm4, xmm0
-	movdqa	xmm7, xmm3
-	psubw	xmm7, xmm4
-	pabsw	xmm7, xmm7
-	movdqa	[esp+432-272], xmm4
-	movdqa	xmm4, [esp+432-336]
-	movdqa	xmm5, xmm4
-	pcmpgtw	xmm5, xmm7
-	movdqa	[esp+432-288], xmm5
-	movdqa	xmm7, xmm6
-	psubw	xmm7, [esp+432-352]
-	pabsw	xmm7, xmm7
-	movdqa	xmm5, xmm4
-	pcmpgtw	xmm5, xmm7
-	movdqa	[esp+432-256], xmm5
-	movdqa	xmm5, xmm3
-	pavgw	xmm5, xmm6
-	movdqa	[esp+432-304], xmm5
-	movdqa	xmm5, [esp+432-400]
-	psubw	xmm5, [esp+432-288]
-	psubw	xmm5, [esp+432-256]
-	movdqa	[esp+432-224], xmm5
-	movdqa	xmm5, xmm6
-	psubw	xmm5, xmm3
-	movdqa	[esp+432-32], xmm6
-	psubw	xmm6, [esp+432-240]
-	movdqa	xmm7, xmm5
-	movdqa	[esp+432-384], xmm5
-	movdqa	xmm5, [esp+432-112]
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm5, xmm7
-	pabsw	xmm6, xmm6
-	movdqa	xmm7, xmm4
-	pcmpgtw	xmm7, xmm6
+    mov ecx, 4
+    movsx   edx, cx
+    punpcklwd xmm7, xmm3
+    punpcklwd xmm7, xmm5
+    movdqa  xmm5, [esp+496-208]
+    movdqa  xmm3, [esp+464-208]
+    punpcklbw xmm5, xmm0
+    movdqa  [esp+432-240], xmm5
+    movdqa  xmm5, [esp+512-208]
+    punpcklbw xmm5, xmm0
+    movdqa  [esp+432-352], xmm5
+    punpcklwd xmm1, xmm4
+    movdqa  xmm4, [esp+432-208]
+    punpcklwd xmm1, xmm6
+    movdqa  xmm6, [esp+480-208]
+    punpcklwd xmm1, xmm7
+    punpcklbw xmm6, xmm0
+    punpcklbw xmm3, xmm0
+    punpcklbw xmm4, xmm0
+    movdqa  xmm7, xmm3
+    psubw   xmm7, xmm4
+    pabsw   xmm7, xmm7
+    movdqa  [esp+432-272], xmm4
+    movdqa  xmm4, [esp+432-336]
+    movdqa  xmm5, xmm4
+    pcmpgtw xmm5, xmm7
+    movdqa  [esp+432-288], xmm5
+    movdqa  xmm7, xmm6
+    psubw   xmm7, [esp+432-352]
+    pabsw   xmm7, xmm7
+    movdqa  xmm5, xmm4
+    pcmpgtw xmm5, xmm7
+    movdqa  [esp+432-256], xmm5
+    movdqa  xmm5, xmm3
+    pavgw   xmm5, xmm6
+    movdqa  [esp+432-304], xmm5
+    movdqa  xmm5, [esp+432-400]
+    psubw   xmm5, [esp+432-288]
+    psubw   xmm5, [esp+432-256]
+    movdqa  [esp+432-224], xmm5
+    movdqa  xmm5, xmm6
+    psubw   xmm5, xmm3
+    movdqa  [esp+432-32], xmm6
+    psubw   xmm6, [esp+432-240]
+    movdqa  xmm7, xmm5
+    movdqa  [esp+432-384], xmm5
+    movdqa  xmm5, [esp+432-112]
+    pabsw   xmm7, xmm7
+    pcmpgtw xmm5, xmm7
+    pabsw   xmm6, xmm6
+    movdqa  xmm7, xmm4
+    pcmpgtw xmm7, xmm6
 
-	pand	xmm5, xmm7
-	movdqa	xmm6, xmm3
-	psubw	xmm6, xmm2
-	pabsw	xmm6, xmm6
-	movdqa	xmm7, xmm4
-	pcmpgtw	xmm7, xmm6
-	movdqa	xmm6, [esp+432-400]
-	pand	xmm5, xmm7
-	movdqa	xmm7, xmm6
-	pcmpeqw	xmm6, xmm0
-	pcmpgtw	xmm7, xmm0
-	por	xmm7, xmm6
-	pand	xmm5, xmm7
-	movdqa	[esp+432-320], xmm5
-	movd	xmm5, edx
-	movdqa	xmm6, xmm5
-	punpcklwd xmm6, xmm5
-	pshufd	xmm5, xmm6, 0
-	movdqa	[esp+432-336], xmm5
-	movdqa	xmm5, [esp+432-224]
-	movdqa	[esp+432-368], xmm5
-	movdqa	xmm6, xmm0
-	psubw	xmm6, xmm5
-	movdqa	xmm5, [esp+432-384]
-	psllw	xmm5, 2
-	movdqa	xmm7, xmm2
-	psubw	xmm7, [esp+432-240]
-	paddw	xmm7, xmm5
-	paddw	xmm7, [esp+432-336]
-	movdqa	xmm5, [esp+432-368]
-	psraw	xmm7, 3
-	pmaxsw	xmm6, xmm7
-	pminsw	xmm5, xmm6
+    pand    xmm5, xmm7
+    movdqa  xmm6, xmm3
+    psubw   xmm6, xmm2
+    pabsw   xmm6, xmm6
+    movdqa  xmm7, xmm4
+    pcmpgtw xmm7, xmm6
+    movdqa  xmm6, [esp+432-400]
+    pand    xmm5, xmm7
+    movdqa  xmm7, xmm6
+    pcmpeqw xmm6, xmm0
+    pcmpgtw xmm7, xmm0
+    por xmm7, xmm6
+    pand    xmm5, xmm7
+    movdqa  [esp+432-320], xmm5
+    movd    xmm5, edx
+    movdqa  xmm6, xmm5
+    punpcklwd xmm6, xmm5
+    pshufd  xmm5, xmm6, 0
+    movdqa  [esp+432-336], xmm5
+    movdqa  xmm5, [esp+432-224]
+    movdqa  [esp+432-368], xmm5
+    movdqa  xmm6, xmm0
+    psubw   xmm6, xmm5
+    movdqa  xmm5, [esp+432-384]
+    psllw   xmm5, 2
+    movdqa  xmm7, xmm2
+    psubw   xmm7, [esp+432-240]
+    paddw   xmm7, xmm5
+    paddw   xmm7, [esp+432-336]
+    movdqa  xmm5, [esp+432-368]
+    psraw   xmm7, 3
+    pmaxsw  xmm6, xmm7
+    pminsw  xmm5, xmm6
 
-	pand	xmm5, [esp+432-320]
-	movdqa	xmm6, [esp+432-400]
-	movdqa	[esp+432-64], xmm5
-	movdqa	[esp+432-384], xmm6
-	movdqa	xmm5, xmm0
-	psubw	xmm5, xmm6
-	movdqa	[esp+432-368], xmm5
-	movdqa	xmm6, xmm5
-	movdqa	xmm5, [esp+432-272]
-	paddw	xmm5, [esp+432-304]
-	movdqa	xmm7, xmm2
-	paddw	xmm7, xmm2
-	psubw	xmm5, xmm7
-	psraw	xmm5, 1
-	pmaxsw	xmm6, xmm5
-	movdqa	xmm5, [esp+432-384]
-	pminsw	xmm5, xmm6
+    pand    xmm5, [esp+432-320]
+    movdqa  xmm6, [esp+432-400]
+    movdqa  [esp+432-64], xmm5
+    movdqa  [esp+432-384], xmm6
+    movdqa  xmm5, xmm0
+    psubw   xmm5, xmm6
+    movdqa  [esp+432-368], xmm5
+    movdqa  xmm6, xmm5
+    movdqa  xmm5, [esp+432-272]
+    paddw   xmm5, [esp+432-304]
+    movdqa  xmm7, xmm2
+    paddw   xmm7, xmm2
+    psubw   xmm5, xmm7
+    psraw   xmm5, 1
+    pmaxsw  xmm6, xmm5
+    movdqa  xmm5, [esp+432-384]
+    pminsw  xmm5, xmm6
 
-	pand	xmm5, [esp+432-320]
-	pand	xmm5, [esp+432-288]
-	movdqa	xmm6, [esp+432-240]
-	movdqa	[esp+432-96], xmm5
-	movdqa	xmm5, [esp+432-352]
-	paddw	xmm5, [esp+432-304]
-	movdqa	xmm7, xmm6
-	paddw	xmm7, xmm6
-	movdqa	xmm6, [esp+432-368]
-	psubw	xmm5, xmm7
+    pand    xmm5, [esp+432-320]
+    pand    xmm5, [esp+432-288]
+    movdqa  xmm6, [esp+432-240]
+    movdqa  [esp+432-96], xmm5
+    movdqa  xmm5, [esp+432-352]
+    paddw   xmm5, [esp+432-304]
+    movdqa  xmm7, xmm6
+    paddw   xmm7, xmm6
+    movdqa  xmm6, [esp+432-368]
+    psubw   xmm5, xmm7
 
-	movdqa	xmm7, [esp+496-208]
-	psraw	xmm5, 1
-	pmaxsw	xmm6, xmm5
-	movdqa	xmm5, [esp+432-400]
-	pminsw	xmm5, xmm6
-	pand	xmm5, [esp+432-320]
-	pand	xmm5, [esp+432-256]
-	movdqa	xmm6, [esp+448-208]
-	punpckhbw xmm7, xmm0
-	movdqa	[esp+432-352], xmm7
+    movdqa  xmm7, [esp+496-208]
+    psraw   xmm5, 1
+    pmaxsw  xmm6, xmm5
+    movdqa  xmm5, [esp+432-400]
+    pminsw  xmm5, xmm6
+    pand    xmm5, [esp+432-320]
+    pand    xmm5, [esp+432-256]
+    movdqa  xmm6, [esp+448-208]
+    punpckhbw xmm7, xmm0
+    movdqa  [esp+432-352], xmm7
 
-	movdqa	xmm7, [esp+512-208]
-	punpckhbw xmm6, xmm0
-	movdqa	[esp+432-48], xmm5
-	movdqa	xmm5, [esp+432-208]
-	movdqa	[esp+432-368], xmm6
-	movdqa	xmm6, [esp+464-208]
-	punpckhbw xmm7, xmm0
-	punpckhbw xmm5, xmm0
-	movdqa	[esp+432-384], xmm7
-	punpckhbw xmm6, xmm0
-	movdqa	[esp+432-400], xmm6
+    movdqa  xmm7, [esp+512-208]
+    punpckhbw xmm6, xmm0
+    movdqa  [esp+432-48], xmm5
+    movdqa  xmm5, [esp+432-208]
+    movdqa  [esp+432-368], xmm6
+    movdqa  xmm6, [esp+464-208]
+    punpckhbw xmm7, xmm0
+    punpckhbw xmm5, xmm0
+    movdqa  [esp+432-384], xmm7
+    punpckhbw xmm6, xmm0
+    movdqa  [esp+432-400], xmm6
 
-	movdqa	xmm7, [esp+432-400]
-	movdqa	xmm6, [esp+480-208]
-	psubw	xmm7, xmm5
-	movdqa	[esp+432-16], xmm5
-	pabsw	xmm7, xmm7
-	punpckhbw xmm6, xmm0
-	movdqa	xmm5, xmm4
-	pcmpgtw	xmm5, xmm7
-	movdqa	[esp+432-288], xmm5
+    movdqa  xmm7, [esp+432-400]
+    movdqa  xmm6, [esp+480-208]
+    psubw   xmm7, xmm5
+    movdqa  [esp+432-16], xmm5
+    pabsw   xmm7, xmm7
+    punpckhbw xmm6, xmm0
+    movdqa  xmm5, xmm4
+    pcmpgtw xmm5, xmm7
+    movdqa  [esp+432-288], xmm5
 
-	movdqa	xmm7, xmm6
-	psubw	xmm7, [esp+432-384]
-	pabsw	xmm7, xmm7
-	movdqa	xmm5, xmm4
-	pcmpgtw	xmm5, xmm7
-	movdqa	[esp+432-256], xmm5
+    movdqa  xmm7, xmm6
+    psubw   xmm7, [esp+432-384]
+    pabsw   xmm7, xmm7
+    movdqa  xmm5, xmm4
+    pcmpgtw xmm5, xmm7
+    movdqa  [esp+432-256], xmm5
 
-	movdqa	xmm5, [esp+432-400]
-	movdqa	[esp+432-80], xmm6
-	pavgw	xmm5, xmm6
-	movdqa	[esp+432-304], xmm5
+    movdqa  xmm5, [esp+432-400]
+    movdqa  [esp+432-80], xmm6
+    pavgw   xmm5, xmm6
+    movdqa  [esp+432-304], xmm5
 
-	movdqa	xmm5, xmm1
-	psubw	xmm5, [esp+432-288]
-	psubw	xmm5, [esp+432-256]
-	movdqa	[esp+432-224], xmm5
-	movdqa	xmm5, xmm6
-	psubw	xmm5, [esp+432-400]
-	psubw	xmm6, [esp+432-352]
-	movdqa	[esp+432-272], xmm5
-	movdqa	xmm7, xmm5
-	movdqa	xmm5, [esp+432-112]
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm5, xmm7
-	movdqa	xmm7, xmm4
-	pabsw	xmm6, xmm6
-	pcmpgtw	xmm7, xmm6
-	movdqa	xmm6, [esp+432-368]
+    movdqa  xmm5, xmm1
+    psubw   xmm5, [esp+432-288]
+    psubw   xmm5, [esp+432-256]
+    movdqa  [esp+432-224], xmm5
+    movdqa  xmm5, xmm6
+    psubw   xmm5, [esp+432-400]
+    psubw   xmm6, [esp+432-352]
+    movdqa  [esp+432-272], xmm5
+    movdqa  xmm7, xmm5
+    movdqa  xmm5, [esp+432-112]
+    pabsw   xmm7, xmm7
+    pcmpgtw xmm5, xmm7
+    movdqa  xmm7, xmm4
+    pabsw   xmm6, xmm6
+    pcmpgtw xmm7, xmm6
+    movdqa  xmm6, [esp+432-368]
 
-	pand	xmm5, xmm7
-	movdqa	xmm7, [esp+432-400]
-	psubw	xmm7, xmm6
-	psubw	xmm6, [esp+432-352]
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm4, xmm7
-	pand	xmm5, xmm4
+    pand    xmm5, xmm7
+    movdqa  xmm7, [esp+432-400]
+    psubw   xmm7, xmm6
+    psubw   xmm6, [esp+432-352]
+    pabsw   xmm7, xmm7
+    pcmpgtw xmm4, xmm7
+    pand    xmm5, xmm4
 
-	paddw	xmm2, [esp+432-96]
-	movdqa	xmm4, xmm1
-	pcmpgtw	xmm4, xmm0
-	movdqa	xmm7, xmm1
-	pcmpeqw	xmm7, xmm0
-	por	xmm4, xmm7
-	pand	xmm5, xmm4
-	movdqa	xmm4, [esp+432-224]
-	movdqa	[esp+432-320], xmm5
-	movdqa	xmm5, [esp+432-272]
-	movdqa	xmm7, xmm0
-	psubw	xmm7, xmm4
-	psubw	xmm0, xmm1
-	psllw	xmm5, 2
-	paddw	xmm6, xmm5
-	paddw	xmm6, [esp+432-336]
-	movdqa	xmm5, [esp+432-368]
-	movdqa	[esp+432-336], xmm0
-	psraw	xmm6, 3
-	pmaxsw	xmm7, xmm6
-	pminsw	xmm4, xmm7
-	pand	xmm4, [esp+432-320]
-	movdqa	xmm6, xmm0
-	movdqa	xmm0, [esp+432-16]
-	paddw	xmm0, [esp+432-304]
-	movdqa	[esp+432-272], xmm4
-	movdqa	xmm4, [esp+432-368]
-	paddw	xmm4, xmm4
-	psubw	xmm0, xmm4
+    paddw   xmm2, [esp+432-96]
+    movdqa  xmm4, xmm1
+    pcmpgtw xmm4, xmm0
+    movdqa  xmm7, xmm1
+    pcmpeqw xmm7, xmm0
+    por xmm4, xmm7
+    pand    xmm5, xmm4
+    movdqa  xmm4, [esp+432-224]
+    movdqa  [esp+432-320], xmm5
+    movdqa  xmm5, [esp+432-272]
+    movdqa  xmm7, xmm0
+    psubw   xmm7, xmm4
+    psubw   xmm0, xmm1
+    psllw   xmm5, 2
+    paddw   xmm6, xmm5
+    paddw   xmm6, [esp+432-336]
+    movdqa  xmm5, [esp+432-368]
+    movdqa  [esp+432-336], xmm0
+    psraw   xmm6, 3
+    pmaxsw  xmm7, xmm6
+    pminsw  xmm4, xmm7
+    pand    xmm4, [esp+432-320]
+    movdqa  xmm6, xmm0
+    movdqa  xmm0, [esp+432-16]
+    paddw   xmm0, [esp+432-304]
+    movdqa  [esp+432-272], xmm4
+    movdqa  xmm4, [esp+432-368]
+    paddw   xmm4, xmm4
+    psubw   xmm0, xmm4
 
-	movdqa	xmm4, [esp+432-64]
-	psraw	xmm0, 1
-	pmaxsw	xmm6, xmm0
-	movdqa	xmm0, [esp+432-400]
-	movdqa	xmm7, xmm1
-	pminsw	xmm7, xmm6
-	movdqa	xmm6, [esp+432-320]
-	pand	xmm7, xmm6
-	pand	xmm7, [esp+432-288]
-	paddw	xmm5, xmm7
-	packuswb xmm2, xmm5
-	movdqa	xmm5, [esp+432-272]
-	paddw	xmm0, xmm5
-	paddw	xmm3, xmm4
-	packuswb xmm3, xmm0
+    movdqa  xmm4, [esp+432-64]
+    psraw   xmm0, 1
+    pmaxsw  xmm6, xmm0
+    movdqa  xmm0, [esp+432-400]
+    movdqa  xmm7, xmm1
+    pminsw  xmm7, xmm6
+    movdqa  xmm6, [esp+432-320]
+    pand    xmm7, xmm6
+    pand    xmm7, [esp+432-288]
+    paddw   xmm5, xmm7
+    packuswb xmm2, xmm5
+    movdqa  xmm5, [esp+432-272]
+    paddw   xmm0, xmm5
+    paddw   xmm3, xmm4
+    packuswb xmm3, xmm0
 
-	movdqa	xmm0, [esp+432-32]
-	psubw	xmm0, xmm4
-	movdqa	xmm4, [esp+432-80]
-	psubw	xmm4, xmm5
+    movdqa  xmm0, [esp+432-32]
+    psubw   xmm0, xmm4
+    movdqa  xmm4, [esp+432-80]
+    psubw   xmm4, xmm5
 
-	movdqa	xmm5, [esp+432-240]
-	paddw	xmm5, [esp+432-48]
-	packuswb xmm0, xmm4
-	movdqa	xmm4, [esp+432-384]
-	paddw	xmm4, [esp+432-304]
-	movdqa	[esp+480-208], xmm0
-	movdqa	xmm0, [esp+432-352]
-	movdqa	xmm7, xmm0
-	paddw	xmm0, xmm0
+    movdqa  xmm5, [esp+432-240]
+    paddw   xmm5, [esp+432-48]
+    packuswb xmm0, xmm4
+    movdqa  xmm4, [esp+432-384]
+    paddw   xmm4, [esp+432-304]
+    movdqa  [esp+480-208], xmm0
+    movdqa  xmm0, [esp+432-352]
+    movdqa  xmm7, xmm0
+    paddw   xmm0, xmm0
 
-	mov	ecx, dword [esp+432-408]
+    mov ecx, dword [esp+432-408]
 
-	mov	edx, dword [esp+432-404]
-	psubw	xmm4, xmm0
-	movdqa	xmm0, [esp+432-336]
-	movdqa	[edi], xmm2
-	psraw	xmm4, 1
-	pmaxsw	xmm0, xmm4
-	pminsw	xmm1, xmm0
-	movdqa	xmm0, [esp+480-208]
+    mov edx, dword [esp+432-404]
+    psubw   xmm4, xmm0
+    movdqa  xmm0, [esp+432-336]
+    movdqa  [edi], xmm2
+    psraw   xmm4, 1
+    pmaxsw  xmm0, xmm4
+    pminsw  xmm1, xmm0
+    movdqa  xmm0, [esp+480-208]
 
-	pop	edi
-	pand	xmm1, xmm6
-	pand	xmm1, [esp+428-256]
-	movdqa	[ecx], xmm3
-	paddw	xmm7, xmm1
-	pop	esi
-	packuswb xmm5, xmm7
-	movdqa	[eax], xmm0
-	movdqa	[edx], xmm5
-	pop	ebx
-	mov	esp, ebp
-	pop	ebp
-	ret
+    pop edi
+    pand    xmm1, xmm6
+    pand    xmm1, [esp+428-256]
+    movdqa  [ecx], xmm3
+    paddw   xmm7, xmm1
+    pop esi
+    packuswb xmm5, xmm7
+    movdqa  [eax], xmm0
+    movdqa  [edx], xmm5
+    pop ebx
+    mov esp, ebp
+    pop ebp
+    ret
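
A note on the Lt4 luma routine above: besides the same clipped p0/q0 delta as the chroma case, it also adjusts p1 and q1, which is where the pavgw and psraw 1 come from. Roughly, in scalar form (again a sketch under the same hedging; names are assumptions):

    #include <stdint.h>

    static int32_t clip3(int32_t lo, int32_t hi, int32_t v) {
        return v < lo ? lo : (v > hi ? hi : v);
    }

    /* p1 update of the bS < 4 luma filter, applied only where |p2 - p0| < beta;
       the q1 update is the mirror image with the p and q samples swapped. */
    static uint8_t luma_weak_p1(uint8_t p2, uint8_t p1, uint8_t p0,
                                uint8_t q0, int32_t tc0) {
        int32_t avg   = (p0 + q0 + 1) >> 1;               /* pavgw             */
        int32_t delta = clip3(-tc0, tc0, (p2 + avg - 2 * p1) >> 1);
        return (uint8_t)(p1 + delta);
    }

In the vector code the result is masked with the |p2 - p0| < beta comparison (and the overall filter mask) before being added back, so positions that fail the test keep the original p1.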
 
 
 ;*******************************************************************************
@@ -4583,542 +4583,542 @@
 
 WELS_EXTERN  DeblockLumaEq4V_ssse3
 
-	push	ebp
-	mov	ebp, esp
-	and	esp, -16				; fffffff0H
-	sub	esp, 628				; 00000274H
-	mov	eax, dword [ebp+8]
-	mov	ecx, dword [ebp+12]
-	push	ebx
-	push	esi
+    push    ebp
+    mov ebp, esp
+    and esp, -16                ; fffffff0H
+    sub esp, 628                ; 00000274H
+    mov eax, dword [ebp+8]
+    mov ecx, dword [ebp+12]
+    push    ebx
+    push    esi
 
-	lea	edx, [ecx*4]
-	pxor	xmm0, xmm0
-	movdqa	xmm2, xmm0
+    lea edx, [ecx*4]
+    pxor    xmm0, xmm0
+    movdqa  xmm2, xmm0
 
-	movdqa	xmm0, [ecx+eax]
-	mov	esi, eax
-	sub	esi, edx
-	movdqa	xmm3, [esi]
-	movdqa	xmm5, [eax]
-	push	edi
-	lea	edi, [ecx+ecx]
-	lea	ebx, [ecx+ecx*2]
-	mov	dword [esp+640-600], edi
-	mov	esi, eax
-	sub	esi, edi
-	movdqa	xmm1, [esi]
-	movdqa	 [esp+720-272], xmm0
-	mov	edi, eax
-	sub	edi, ecx
-	movdqa	xmm4, [edi]
-	add	ecx, eax
-	mov	dword [esp+640-596], ecx
+    movdqa  xmm0, [ecx+eax]
+    mov esi, eax
+    sub esi, edx
+    movdqa  xmm3, [esi]
+    movdqa  xmm5, [eax]
+    push    edi
+    lea edi, [ecx+ecx]
+    lea ebx, [ecx+ecx*2]
+    mov dword [esp+640-600], edi
+    mov esi, eax
+    sub esi, edi
+    movdqa  xmm1, [esi]
+    movdqa   [esp+720-272], xmm0
+    mov edi, eax
+    sub edi, ecx
+    movdqa  xmm4, [edi]
+    add ecx, eax
+    mov dword [esp+640-596], ecx
 
-	mov	ecx, dword [esp+640-600]
-	movdqa	xmm0, [ecx+eax]
-	movdqa	 [esp+736-272], xmm0
+    mov ecx, dword [esp+640-600]
+    movdqa  xmm0, [ecx+eax]
+    movdqa   [esp+736-272], xmm0
 
-	movdqa	xmm0, [eax+ebx]
-	mov	edx, eax
-	sub	edx, ebx
+    movdqa  xmm0, [eax+ebx]
+    mov edx, eax
+    sub edx, ebx
 
-	movsx	ebx, word [ebp+16]
-	movdqa	xmm6, [edx]
-	add	ecx, eax
-	movdqa	 [esp+752-272], xmm0
-	movd	xmm0, ebx
+    movsx   ebx, word [ebp+16]
+    movdqa  xmm6, [edx]
+    add ecx, eax
+    movdqa   [esp+752-272], xmm0
+    movd    xmm0, ebx
 
-	movsx	ebx, word [ebp+20]
-	movdqa	xmm7, xmm0
-	punpcklwd xmm7, xmm0
-	pshufd	xmm0, xmm7, 0
-	movdqa	 [esp+640-320], xmm0
-	movd	xmm0, ebx
-	movdqa	xmm7, xmm0
-	punpcklwd xmm7, xmm0
-	pshufd	xmm0, xmm7, 0
+    movsx   ebx, word [ebp+20]
+    movdqa  xmm7, xmm0
+    punpcklwd xmm7, xmm0
+    pshufd  xmm0, xmm7, 0
+    movdqa   [esp+640-320], xmm0
+    movd    xmm0, ebx
+    movdqa  xmm7, xmm0
+    punpcklwd xmm7, xmm0
+    pshufd  xmm0, xmm7, 0
 
-	movdqa	xmm7, [esp+736-272]
-	punpcklbw xmm7, xmm2
-	movdqa	 [esp+640-416], xmm7
-	movdqa	 [esp+640-512], xmm0
-	movdqa	xmm0, xmm1
-	movdqa	 [esp+672-272], xmm1
-	movdqa	xmm1, xmm4
-	movdqa	 [esp+704-272], xmm5
-	punpcklbw xmm5, xmm2
-	punpcklbw xmm1, xmm2
+    movdqa  xmm7, [esp+736-272]
+    punpcklbw xmm7, xmm2
+    movdqa   [esp+640-416], xmm7
+    movdqa   [esp+640-512], xmm0
+    movdqa  xmm0, xmm1
+    movdqa   [esp+672-272], xmm1
+    movdqa  xmm1, xmm4
+    movdqa   [esp+704-272], xmm5
+    punpcklbw xmm5, xmm2
+    punpcklbw xmm1, xmm2
 
-	movdqa	xmm7, xmm5
-	psubw	xmm7, xmm1
-	pabsw	xmm7, xmm7
-	movdqa	 [esp+640-560], xmm7
-	punpcklbw xmm0, xmm2
-	movdqa	 [esp+688-272], xmm4
-	movdqa	xmm4, [esp+720-272]
-	movdqa	 [esp+640-480], xmm0
+    movdqa  xmm7, xmm5
+    psubw   xmm7, xmm1
+    pabsw   xmm7, xmm7
+    movdqa   [esp+640-560], xmm7
+    punpcklbw xmm0, xmm2
+    movdqa   [esp+688-272], xmm4
+    movdqa  xmm4, [esp+720-272]
+    movdqa   [esp+640-480], xmm0
 
-	movdqa	xmm7, xmm1
-	psubw	xmm7, xmm0
+    movdqa  xmm7, xmm1
+    psubw   xmm7, xmm0
 
-	movdqa	xmm0, [esp+640-512]
-	pabsw	xmm7, xmm7
-	punpcklbw xmm4, xmm2
-	pcmpgtw	xmm0, xmm7
-	movdqa	 [esp+640-384], xmm4
-	movdqa	xmm7, xmm5
-	psubw	xmm7, xmm4
-	movdqa	xmm4, [esp+640-512]
-	movdqa	 [esp+656-272], xmm6
-	punpcklbw xmm6, xmm2
-	pabsw	xmm7, xmm7
-	movdqa	 [esp+640-48], xmm2
-	movdqa	 [esp+640-368], xmm6
-	movdqa	 [esp+640-144], xmm1
-	movdqa	 [esp+640-400], xmm5
-	pcmpgtw	xmm4, xmm7
-	pand	xmm0, xmm4
-	movdqa	xmm4, [esp+640-320]
-	pcmpgtw	xmm4, [esp+640-560]
-	pand	xmm0, xmm4
+    movdqa  xmm0, [esp+640-512]
+    pabsw   xmm7, xmm7
+    punpcklbw xmm4, xmm2
+    pcmpgtw xmm0, xmm7
+    movdqa   [esp+640-384], xmm4
+    movdqa  xmm7, xmm5
+    psubw   xmm7, xmm4
+    movdqa  xmm4, [esp+640-512]
+    movdqa   [esp+656-272], xmm6
+    punpcklbw xmm6, xmm2
+    pabsw   xmm7, xmm7
+    movdqa   [esp+640-48], xmm2
+    movdqa   [esp+640-368], xmm6
+    movdqa   [esp+640-144], xmm1
+    movdqa   [esp+640-400], xmm5
+    pcmpgtw xmm4, xmm7
+    pand    xmm0, xmm4
+    movdqa  xmm4, [esp+640-320]
+    pcmpgtw xmm4, [esp+640-560]
+    pand    xmm0, xmm4
 
-	mov	ebx, 2
-	movsx	ebx, bx
-	movd	xmm4, ebx
-	movdqa	xmm7, xmm4
-	punpcklwd xmm7, xmm4
-	movdqa	xmm4, [esp+640-320]
-	psraw	xmm4, 2
-	pshufd	xmm7, xmm7, 0
-	paddw	xmm4, xmm7
-	movdqa	 [esp+640-576], xmm4
-	pcmpgtw	xmm4, [esp+640-560]
-	movdqa	 [esp+640-560], xmm4
+    mov ebx, 2
+    movsx   ebx, bx
+    movd    xmm4, ebx
+    movdqa  xmm7, xmm4
+    punpcklwd xmm7, xmm4
+    movdqa  xmm4, [esp+640-320]
+    psraw   xmm4, 2
+    pshufd  xmm7, xmm7, 0
+    paddw   xmm4, xmm7
+    movdqa   [esp+640-576], xmm4
+    pcmpgtw xmm4, [esp+640-560]
+    movdqa   [esp+640-560], xmm4
 
-	movdqa	xmm4, [esp+640-512]
-	movdqa	 [esp+640-624], xmm7
-	movdqa	xmm7, xmm1
-	psubw	xmm7, xmm6
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm4, xmm7
+    movdqa  xmm4, [esp+640-512]
+    movdqa   [esp+640-624], xmm7
+    movdqa  xmm7, xmm1
+    psubw   xmm7, xmm6
+    pabsw   xmm7, xmm7
+    pcmpgtw xmm4, xmm7
 
-	pand	xmm4, [esp+640-560]
-	movdqa	 [esp+640-544], xmm4
-	movdqa	xmm4, [esp+640-512]
-	movdqa	xmm7, xmm5
-	psubw	xmm7, [esp+640-416]
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm4, xmm7
+    pand    xmm4, [esp+640-560]
+    movdqa   [esp+640-544], xmm4
+    movdqa  xmm4, [esp+640-512]
+    movdqa  xmm7, xmm5
+    psubw   xmm7, [esp+640-416]
+    pabsw   xmm7, xmm7
+    pcmpgtw xmm4, xmm7
 
-	pand	xmm4, [esp+640-560]
-	movdqa	 [esp+640-560], xmm4
+    pand    xmm4, [esp+640-560]
+    movdqa   [esp+640-560], xmm4
 
-	movdqa	xmm4, [esp+640-544]
-	pandn	xmm4, xmm6
-	movdqa	 [esp+640-16], xmm4
-	mov	ebx, 4
-	movsx	ebx, bx
-	movd	xmm4, ebx
-	movdqa	xmm7, xmm4
-	punpcklwd xmm7, xmm4
-	movdqa	xmm4, xmm3
-	punpcklbw xmm4, xmm2
-	psllw	xmm4, 1
-	paddw	xmm4, xmm6
-	paddw	xmm4, xmm6
-	paddw	xmm4, xmm6
-	paddw	xmm4, [esp+640-480]
+    movdqa  xmm4, [esp+640-544]
+    pandn   xmm4, xmm6
+    movdqa   [esp+640-16], xmm4
+    mov ebx, 4
+    movsx   ebx, bx
+    movd    xmm4, ebx
+    movdqa  xmm7, xmm4
+    punpcklwd xmm7, xmm4
+    movdqa  xmm4, xmm3
+    punpcklbw xmm4, xmm2
+    psllw   xmm4, 1
+    paddw   xmm4, xmm6
+    paddw   xmm4, xmm6
+    paddw   xmm4, xmm6
+    paddw   xmm4, [esp+640-480]
 
-	movdqa	xmm6, [esp+640-560]
-	pshufd	xmm7, xmm7, 0
-	paddw	xmm4, xmm1
-	movdqa	 [esp+640-592], xmm7
-	paddw	xmm4, xmm5
-	paddw	xmm4, xmm7
-	movdqa	xmm7, [esp+640-416]
-	pandn	xmm6, xmm7
-	movdqa	 [esp+640-80], xmm6
-	movdqa	xmm6, [esp+752-272]
-	punpcklbw xmm6, xmm2
-	psllw	xmm6, 1
-	paddw	xmm6, xmm7
-	paddw	xmm6, xmm7
-	paddw	xmm6, xmm7
-	paddw	xmm6, [esp+640-384]
+    movdqa  xmm6, [esp+640-560]
+    pshufd  xmm7, xmm7, 0
+    paddw   xmm4, xmm1
+    movdqa   [esp+640-592], xmm7
+    paddw   xmm4, xmm5
+    paddw   xmm4, xmm7
+    movdqa  xmm7, [esp+640-416]
+    pandn   xmm6, xmm7
+    movdqa   [esp+640-80], xmm6
+    movdqa  xmm6, [esp+752-272]
+    punpcklbw xmm6, xmm2
+    psllw   xmm6, 1
+    paddw   xmm6, xmm7
+    paddw   xmm6, xmm7
+    paddw   xmm6, xmm7
+    paddw   xmm6, [esp+640-384]
 
-	movdqa	xmm7, [esp+640-480]
-	paddw	xmm6, xmm5
-	paddw	xmm6, xmm1
-	paddw	xmm6, [esp+640-592]
-	psraw	xmm6, 3
-	pand	xmm6, [esp+640-560]
-	movdqa	 [esp+640-112], xmm6
-	movdqa	xmm6, [esp+640-544]
-	pandn	xmm6, xmm7
-	movdqa	 [esp+640-336], xmm6
-	movdqa	xmm6, [esp+640-544]
-	movdqa	 [esp+640-528], xmm6
-	movdqa	xmm6, [esp+640-368]
-	paddw	xmm6, xmm7
-	movdqa	xmm7, xmm1
-	psraw	xmm4, 3
-	pand	xmm4, [esp+640-544]
-	paddw	xmm7, xmm5
-	paddw	xmm6, xmm7
-	paddw	xmm6, [esp+640-624]
-	movdqa	xmm7, [esp+640-528]
+    movdqa  xmm7, [esp+640-480]
+    paddw   xmm6, xmm5
+    paddw   xmm6, xmm1
+    paddw   xmm6, [esp+640-592]
+    psraw   xmm6, 3
+    pand    xmm6, [esp+640-560]
+    movdqa   [esp+640-112], xmm6
+    movdqa  xmm6, [esp+640-544]
+    pandn   xmm6, xmm7
+    movdqa   [esp+640-336], xmm6
+    movdqa  xmm6, [esp+640-544]
+    movdqa   [esp+640-528], xmm6
+    movdqa  xmm6, [esp+640-368]
+    paddw   xmm6, xmm7
+    movdqa  xmm7, xmm1
+    psraw   xmm4, 3
+    pand    xmm4, [esp+640-544]
+    paddw   xmm7, xmm5
+    paddw   xmm6, xmm7
+    paddw   xmm6, [esp+640-624]
+    movdqa  xmm7, [esp+640-528]
 
-	paddw	xmm5, xmm1
-	psraw	xmm6, 2
-	pand	xmm7, xmm6
+    paddw   xmm5, xmm1
+    psraw   xmm6, 2
+    pand    xmm7, xmm6
 
-	movdqa	xmm6, [esp+640-384]
-	movdqa	 [esp+640-64], xmm7
-	movdqa	xmm7, [esp+640-560]
-	pandn	xmm7, xmm6
-	movdqa	 [esp+640-304], xmm7
-	movdqa	xmm7, [esp+640-560]
-	movdqa	 [esp+640-528], xmm7
-	movdqa	xmm7, [esp+640-416]
-	paddw	xmm7, xmm6
-	paddw	xmm7, xmm5
-	paddw	xmm7, [esp+640-624]
-	movdqa	xmm5, [esp+640-528]
-	psraw	xmm7, 2
-	pand	xmm5, xmm7
-	movdqa	 [esp+640-32], xmm5
+    movdqa  xmm6, [esp+640-384]
+    movdqa   [esp+640-64], xmm7
+    movdqa  xmm7, [esp+640-560]
+    pandn   xmm7, xmm6
+    movdqa   [esp+640-304], xmm7
+    movdqa  xmm7, [esp+640-560]
+    movdqa   [esp+640-528], xmm7
+    movdqa  xmm7, [esp+640-416]
+    paddw   xmm7, xmm6
+    paddw   xmm7, xmm5
+    paddw   xmm7, [esp+640-624]
+    movdqa  xmm5, [esp+640-528]
+    psraw   xmm7, 2
+    pand    xmm5, xmm7
+    movdqa   [esp+640-32], xmm5
 
-	movdqa	xmm5, [esp+640-544]
-	movdqa	 [esp+640-528], xmm5
-	movdqa	xmm5, [esp+640-480]
-	movdqa	xmm7, xmm5
-	paddw	xmm7, xmm5
-	movdqa	xmm5, xmm1
-	paddw	xmm5, xmm6
-	paddw	xmm6, [esp+640-592]
-	paddw	xmm7, xmm5
-	paddw	xmm7, [esp+640-624]
-	movdqa	xmm5, [esp+640-528]
-	psraw	xmm7, 2
-	pandn	xmm5, xmm7
-	movdqa	xmm7, [esp+640-480]
-	paddw	xmm7, xmm1
-	paddw	xmm7, [esp+640-400]
-	movdqa	xmm1, [esp+640-544]
-	movdqa	 [esp+640-352], xmm5
-	movdqa	xmm5, [esp+640-368]
-	psllw	xmm7, 1
-	paddw	xmm7, xmm6
-	paddw	xmm5, xmm7
+    movdqa  xmm5, [esp+640-544]
+    movdqa   [esp+640-528], xmm5
+    movdqa  xmm5, [esp+640-480]
+    movdqa  xmm7, xmm5
+    paddw   xmm7, xmm5
+    movdqa  xmm5, xmm1
+    paddw   xmm5, xmm6
+    paddw   xmm6, [esp+640-592]
+    paddw   xmm7, xmm5
+    paddw   xmm7, [esp+640-624]
+    movdqa  xmm5, [esp+640-528]
+    psraw   xmm7, 2
+    pandn   xmm5, xmm7
+    movdqa  xmm7, [esp+640-480]
+    paddw   xmm7, xmm1
+    paddw   xmm7, [esp+640-400]
+    movdqa  xmm1, [esp+640-544]
+    movdqa   [esp+640-352], xmm5
+    movdqa  xmm5, [esp+640-368]
+    psllw   xmm7, 1
+    paddw   xmm7, xmm6
+    paddw   xmm5, xmm7
 
-	movdqa	xmm7, [esp+640-400]
-	psraw	xmm5, 3
-	pand	xmm1, xmm5
-	movdqa	xmm5, [esp+640-480]
-	movdqa	 [esp+640-96], xmm1
-	movdqa	xmm1, [esp+640-560]
-	movdqa	 [esp+640-528], xmm1
-	movdqa	xmm1, [esp+640-384]
-	movdqa	xmm6, xmm1
-	paddw	xmm6, xmm1
-	paddw	xmm1, [esp+640-400]
-	paddw	xmm1, [esp+640-144]
-	paddw	xmm7, xmm5
-	paddw	xmm5, [esp+640-592]
-	paddw	xmm6, xmm7
-	paddw	xmm6, [esp+640-624]
-	movdqa	xmm7, [esp+640-528]
-	psraw	xmm6, 2
-	psllw	xmm1, 1
-	paddw	xmm1, xmm5
+    movdqa  xmm7, [esp+640-400]
+    psraw   xmm5, 3
+    pand    xmm1, xmm5
+    movdqa  xmm5, [esp+640-480]
+    movdqa   [esp+640-96], xmm1
+    movdqa  xmm1, [esp+640-560]
+    movdqa   [esp+640-528], xmm1
+    movdqa  xmm1, [esp+640-384]
+    movdqa  xmm6, xmm1
+    paddw   xmm6, xmm1
+    paddw   xmm1, [esp+640-400]
+    paddw   xmm1, [esp+640-144]
+    paddw   xmm7, xmm5
+    paddw   xmm5, [esp+640-592]
+    paddw   xmm6, xmm7
+    paddw   xmm6, [esp+640-624]
+    movdqa  xmm7, [esp+640-528]
+    psraw   xmm6, 2
+    psllw   xmm1, 1
+    paddw   xmm1, xmm5
 
-	movdqa	xmm5, [esp+656-272]
-	pandn	xmm7, xmm6
-	movdqa	xmm6, [esp+640-416]
-	paddw	xmm6, xmm1
-	movdqa	xmm1, [esp+640-560]
-	psraw	xmm6, 3
-	pand	xmm1, xmm6
+    movdqa  xmm5, [esp+656-272]
+    pandn   xmm7, xmm6
+    movdqa  xmm6, [esp+640-416]
+    paddw   xmm6, xmm1
+    movdqa  xmm1, [esp+640-560]
+    psraw   xmm6, 3
+    pand    xmm1, xmm6
 
-	movdqa	xmm6, [esp+704-272]
-	movdqa	 [esp+640-128], xmm1
-	movdqa	xmm1, [esp+672-272]
-	punpckhbw xmm1, xmm2
-	movdqa	 [esp+640-448], xmm1
-	movdqa	xmm1, [esp+688-272]
-	punpckhbw xmm1, xmm2
-	punpckhbw xmm6, xmm2
-	movdqa	 [esp+640-288], xmm7
-	punpckhbw xmm5, xmm2
-	movdqa	 [esp+640-496], xmm1
-	movdqa	 [esp+640-432], xmm6
+    movdqa  xmm6, [esp+704-272]
+    movdqa   [esp+640-128], xmm1
+    movdqa  xmm1, [esp+672-272]
+    punpckhbw xmm1, xmm2
+    movdqa   [esp+640-448], xmm1
+    movdqa  xmm1, [esp+688-272]
+    punpckhbw xmm1, xmm2
+    punpckhbw xmm6, xmm2
+    movdqa   [esp+640-288], xmm7
+    punpckhbw xmm5, xmm2
+    movdqa   [esp+640-496], xmm1
+    movdqa   [esp+640-432], xmm6
 
-	movdqa	xmm7, [esp+720-272]
-	punpckhbw xmm7, xmm2
-	movdqa	 [esp+640-464], xmm7
+    movdqa  xmm7, [esp+720-272]
+    punpckhbw xmm7, xmm2
+    movdqa   [esp+640-464], xmm7
 
-	movdqa	xmm7, [esp+736-272]
-	punpckhbw xmm7, xmm2
-	movdqa	 [esp+640-528], xmm7
+    movdqa  xmm7, [esp+736-272]
+    punpckhbw xmm7, xmm2
+    movdqa   [esp+640-528], xmm7
 
-	movdqa	xmm7, xmm6
+    movdqa  xmm7, xmm6
 
-	psubw	xmm6, [esp+640-464]
-	psubw	xmm7, xmm1
-	pabsw	xmm7, xmm7
-	movdqa	 [esp+640-560], xmm7
-	por	xmm4, [esp+640-16]
-	pabsw	xmm6, xmm6
-	movdqa	xmm7, xmm1
-	psubw	xmm7, [esp+640-448]
+    psubw   xmm6, [esp+640-464]
+    psubw   xmm7, xmm1
+    pabsw   xmm7, xmm7
+    movdqa   [esp+640-560], xmm7
+    por xmm4, [esp+640-16]
+    pabsw   xmm6, xmm6
+    movdqa  xmm7, xmm1
+    psubw   xmm7, [esp+640-448]
 
-	movdqa	xmm1, [esp+640-512]
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm1, xmm7
-	movdqa	xmm7, [esp+640-512]
-	pcmpgtw	xmm7, xmm6
-	movdqa	xmm6, [esp+640-320]
-	pand	xmm1, xmm7
-	movdqa	xmm7, [esp+640-560]
-	pcmpgtw	xmm6, xmm7
-	pand	xmm1, xmm6
+    movdqa  xmm1, [esp+640-512]
+    pabsw   xmm7, xmm7
+    pcmpgtw xmm1, xmm7
+    movdqa  xmm7, [esp+640-512]
+    pcmpgtw xmm7, xmm6
+    movdqa  xmm6, [esp+640-320]
+    pand    xmm1, xmm7
+    movdqa  xmm7, [esp+640-560]
+    pcmpgtw xmm6, xmm7
+    pand    xmm1, xmm6
 
-	movdqa	xmm6, [esp+640-576]
-	pcmpgtw	xmm6, xmm7
+    movdqa  xmm6, [esp+640-576]
+    pcmpgtw xmm6, xmm7
 
-	movdqa	xmm7, [esp+640-496]
-	punpckhbw xmm3, xmm2
-	movdqa	 [esp+640-560], xmm6
-	movdqa	xmm6, [esp+640-512]
-	psubw	xmm7, xmm5
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm6, xmm7
+    movdqa  xmm7, [esp+640-496]
+    punpckhbw xmm3, xmm2
+    movdqa   [esp+640-560], xmm6
+    movdqa  xmm6, [esp+640-512]
+    psubw   xmm7, xmm5
+    pabsw   xmm7, xmm7
+    pcmpgtw xmm6, xmm7
 
-	pand	xmm6, [esp+640-560]
-	movdqa	xmm7, [esp+640-432]
-	psubw	xmm7, [esp+640-528]
+    pand    xmm6, [esp+640-560]
+    movdqa  xmm7, [esp+640-432]
+    psubw   xmm7, [esp+640-528]
 
-	psllw	xmm3, 1
-	movdqa	 [esp+640-544], xmm6
-	movdqa	xmm6, [esp+640-512]
+    psllw   xmm3, 1
+    movdqa   [esp+640-544], xmm6
+    movdqa  xmm6, [esp+640-512]
 
-	movdqa	xmm2, [esp+640-544]
-	paddw	xmm3, xmm5
-	paddw	xmm3, xmm5
-	paddw	xmm3, xmm5
-	paddw	xmm3, [esp+640-448]
-	paddw	xmm3, [esp+640-496]
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm6, xmm7
-	pand	xmm6, [esp+640-560]
-	movdqa	 [esp+640-560], xmm6
+    movdqa  xmm2, [esp+640-544]
+    paddw   xmm3, xmm5
+    paddw   xmm3, xmm5
+    paddw   xmm3, xmm5
+    paddw   xmm3, [esp+640-448]
+    paddw   xmm3, [esp+640-496]
+    pabsw   xmm7, xmm7
+    pcmpgtw xmm6, xmm7
+    pand    xmm6, [esp+640-560]
+    movdqa   [esp+640-560], xmm6
 
-	movdqa	xmm6, xmm0
-	pand	xmm6, xmm4
-	movdqa	xmm4, xmm0
-	pandn	xmm4, [esp+640-368]
-	por	xmm6, xmm4
-	movdqa	xmm4, [esp+640-432]
-	paddw	xmm3, xmm4
-	paddw	xmm3, [esp+640-592]
-	psraw	xmm3, 3
-	pand	xmm3, xmm2
-	pandn	xmm2, xmm5
-	por	xmm3, xmm2
-	movdqa	xmm7, xmm1
-	pand	xmm7, xmm3
-	movdqa	xmm3, [esp+640-64]
-	por	xmm3, [esp+640-336]
-	movdqa	xmm2, xmm1
-	pandn	xmm2, xmm5
-	por	xmm7, xmm2
+    movdqa  xmm6, xmm0
+    pand    xmm6, xmm4
+    movdqa  xmm4, xmm0
+    pandn   xmm4, [esp+640-368]
+    por xmm6, xmm4
+    movdqa  xmm4, [esp+640-432]
+    paddw   xmm3, xmm4
+    paddw   xmm3, [esp+640-592]
+    psraw   xmm3, 3
+    pand    xmm3, xmm2
+    pandn   xmm2, xmm5
+    por xmm3, xmm2
+    movdqa  xmm7, xmm1
+    pand    xmm7, xmm3
+    movdqa  xmm3, [esp+640-64]
+    por xmm3, [esp+640-336]
+    movdqa  xmm2, xmm1
+    pandn   xmm2, xmm5
+    por xmm7, xmm2
 
-	movdqa	xmm2, xmm0
-	pand	xmm2, xmm3
-	movdqa	xmm3, xmm0
-	pandn	xmm3, [esp+640-480]
-	por	xmm2, xmm3
-	packuswb xmm6, xmm7
-	movdqa	 [esp+640-336], xmm2
-	movdqa	 [esp+656-272], xmm6
-	movdqa	xmm6, [esp+640-544]
-	movdqa	xmm2, xmm5
-	paddw	xmm2, [esp+640-448]
-	movdqa	xmm3, xmm1
-	movdqa	xmm7, [esp+640-496]
-	paddw	xmm7, xmm4
-	paddw	xmm2, xmm7
-	paddw	xmm2, [esp+640-624]
-	movdqa	xmm7, [esp+640-544]
-	psraw	xmm2, 2
-	pand	xmm6, xmm2
-	movdqa	xmm2, [esp+640-448]
-	pandn	xmm7, xmm2
-	por	xmm6, xmm7
-	pand	xmm3, xmm6
-	movdqa	xmm6, xmm1
-	pandn	xmm6, xmm2
-	paddw	xmm2, [esp+640-496]
-	paddw	xmm2, xmm4
-	por	xmm3, xmm6
-	movdqa	xmm6, [esp+640-336]
-	packuswb xmm6, xmm3
-	psllw	xmm2, 1
-	movdqa	 [esp+672-272], xmm6
-	movdqa	xmm6, [esp+640-96]
-	por	xmm6, [esp+640-352]
+    movdqa  xmm2, xmm0
+    pand    xmm2, xmm3
+    movdqa  xmm3, xmm0
+    pandn   xmm3, [esp+640-480]
+    por xmm2, xmm3
+    packuswb xmm6, xmm7
+    movdqa   [esp+640-336], xmm2
+    movdqa   [esp+656-272], xmm6
+    movdqa  xmm6, [esp+640-544]
+    movdqa  xmm2, xmm5
+    paddw   xmm2, [esp+640-448]
+    movdqa  xmm3, xmm1
+    movdqa  xmm7, [esp+640-496]
+    paddw   xmm7, xmm4
+    paddw   xmm2, xmm7
+    paddw   xmm2, [esp+640-624]
+    movdqa  xmm7, [esp+640-544]
+    psraw   xmm2, 2
+    pand    xmm6, xmm2
+    movdqa  xmm2, [esp+640-448]
+    pandn   xmm7, xmm2
+    por xmm6, xmm7
+    pand    xmm3, xmm6
+    movdqa  xmm6, xmm1
+    pandn   xmm6, xmm2
+    paddw   xmm2, [esp+640-496]
+    paddw   xmm2, xmm4
+    por xmm3, xmm6
+    movdqa  xmm6, [esp+640-336]
+    packuswb xmm6, xmm3
+    psllw   xmm2, 1
+    movdqa   [esp+672-272], xmm6
+    movdqa  xmm6, [esp+640-96]
+    por xmm6, [esp+640-352]
 
-	movdqa	xmm3, xmm0
-	pand	xmm3, xmm6
-	movdqa	xmm6, xmm0
-	pandn	xmm6, [esp+640-144]
-	por	xmm3, xmm6
-	movdqa	xmm6, [esp+640-544]
-	movdqa	 [esp+640-352], xmm3
-	movdqa	xmm3, [esp+640-464]
-	paddw	xmm3, [esp+640-592]
-	paddw	xmm2, xmm3
-	movdqa	xmm3, [esp+640-448]
-	paddw	xmm5, xmm2
-	movdqa	xmm2, [esp+640-496]
-	psraw	xmm5, 3
-	pand	xmm6, xmm5
-	movdqa	xmm5, [esp+640-464]
-	paddw	xmm2, xmm5
-	paddw	xmm5, [esp+640-432]
-	movdqa	xmm4, xmm3
-	paddw	xmm4, xmm3
-	paddw	xmm4, xmm2
-	paddw	xmm4, [esp+640-624]
-	movdqa	xmm2, [esp+640-544]
-	paddw	xmm3, [esp+640-592]
-	psraw	xmm4, 2
-	pandn	xmm2, xmm4
-	por	xmm6, xmm2
-	movdqa	xmm7, xmm1
-	pand	xmm7, xmm6
-	movdqa	xmm6, [esp+640-496]
-	movdqa	xmm2, xmm1
-	pandn	xmm2, xmm6
-	por	xmm7, xmm2
-	movdqa	xmm2, [esp+640-352]
-	packuswb xmm2, xmm7
-	movdqa	 [esp+688-272], xmm2
-	movdqa	xmm2, [esp+640-128]
-	por	xmm2, [esp+640-288]
+    movdqa  xmm3, xmm0
+    pand    xmm3, xmm6
+    movdqa  xmm6, xmm0
+    pandn   xmm6, [esp+640-144]
+    por xmm3, xmm6
+    movdqa  xmm6, [esp+640-544]
+    movdqa   [esp+640-352], xmm3
+    movdqa  xmm3, [esp+640-464]
+    paddw   xmm3, [esp+640-592]
+    paddw   xmm2, xmm3
+    movdqa  xmm3, [esp+640-448]
+    paddw   xmm5, xmm2
+    movdqa  xmm2, [esp+640-496]
+    psraw   xmm5, 3
+    pand    xmm6, xmm5
+    movdqa  xmm5, [esp+640-464]
+    paddw   xmm2, xmm5
+    paddw   xmm5, [esp+640-432]
+    movdqa  xmm4, xmm3
+    paddw   xmm4, xmm3
+    paddw   xmm4, xmm2
+    paddw   xmm4, [esp+640-624]
+    movdqa  xmm2, [esp+640-544]
+    paddw   xmm3, [esp+640-592]
+    psraw   xmm4, 2
+    pandn   xmm2, xmm4
+    por xmm6, xmm2
+    movdqa  xmm7, xmm1
+    pand    xmm7, xmm6
+    movdqa  xmm6, [esp+640-496]
+    movdqa  xmm2, xmm1
+    pandn   xmm2, xmm6
+    por xmm7, xmm2
+    movdqa  xmm2, [esp+640-352]
+    packuswb xmm2, xmm7
+    movdqa   [esp+688-272], xmm2
+    movdqa  xmm2, [esp+640-128]
+    por xmm2, [esp+640-288]
 
-	movdqa	xmm4, xmm0
-	pand	xmm4, xmm2
-	paddw	xmm5, xmm6
-	movdqa	xmm2, xmm0
-	pandn	xmm2, [esp+640-400]
-	por	xmm4, xmm2
-	movdqa	xmm2, [esp+640-528]
-	psllw	xmm5, 1
-	paddw	xmm5, xmm3
-	movdqa	xmm3, [esp+640-560]
-	paddw	xmm2, xmm5
-	psraw	xmm2, 3
-	movdqa	 [esp+640-288], xmm4
-	movdqa	xmm4, [esp+640-560]
-	pand	xmm4, xmm2
-	movdqa	xmm2, [esp+640-464]
-	movdqa	xmm5, xmm2
-	paddw	xmm5, xmm2
-	movdqa	xmm2, [esp+640-432]
-	paddw	xmm2, [esp+640-448]
-	movdqa	xmm7, xmm1
-	paddw	xmm5, xmm2
-	paddw	xmm5, [esp+640-624]
-	movdqa	xmm6, [esp+640-560]
-	psraw	xmm5, 2
-	pandn	xmm3, xmm5
-	por	xmm4, xmm3
-	movdqa	xmm3, [esp+640-32]
-	por	xmm3, [esp+640-304]
-	pand	xmm7, xmm4
-	movdqa	xmm4, [esp+640-432]
-	movdqa	xmm5, [esp+640-464]
-	movdqa	xmm2, xmm1
-	pandn	xmm2, xmm4
-	paddw	xmm4, [esp+640-496]
-	por	xmm7, xmm2
-	movdqa	xmm2, [esp+640-288]
-	packuswb xmm2, xmm7
-	movdqa	 [esp+704-272], xmm2
+    movdqa  xmm4, xmm0
+    pand    xmm4, xmm2
+    paddw   xmm5, xmm6
+    movdqa  xmm2, xmm0
+    pandn   xmm2, [esp+640-400]
+    por xmm4, xmm2
+    movdqa  xmm2, [esp+640-528]
+    psllw   xmm5, 1
+    paddw   xmm5, xmm3
+    movdqa  xmm3, [esp+640-560]
+    paddw   xmm2, xmm5
+    psraw   xmm2, 3
+    movdqa   [esp+640-288], xmm4
+    movdqa  xmm4, [esp+640-560]
+    pand    xmm4, xmm2
+    movdqa  xmm2, [esp+640-464]
+    movdqa  xmm5, xmm2
+    paddw   xmm5, xmm2
+    movdqa  xmm2, [esp+640-432]
+    paddw   xmm2, [esp+640-448]
+    movdqa  xmm7, xmm1
+    paddw   xmm5, xmm2
+    paddw   xmm5, [esp+640-624]
+    movdqa  xmm6, [esp+640-560]
+    psraw   xmm5, 2
+    pandn   xmm3, xmm5
+    por xmm4, xmm3
+    movdqa  xmm3, [esp+640-32]
+    por xmm3, [esp+640-304]
+    pand    xmm7, xmm4
+    movdqa  xmm4, [esp+640-432]
+    movdqa  xmm5, [esp+640-464]
+    movdqa  xmm2, xmm1
+    pandn   xmm2, xmm4
+    paddw   xmm4, [esp+640-496]
+    por xmm7, xmm2
+    movdqa  xmm2, [esp+640-288]
+    packuswb xmm2, xmm7
+    movdqa   [esp+704-272], xmm2
 
-	movdqa	xmm2, xmm0
-	pand	xmm2, xmm3
-	movdqa	xmm3, xmm0
-	pandn	xmm3, [esp+640-384]
-	por	xmm2, xmm3
-	movdqa	 [esp+640-304], xmm2
-	movdqa	xmm2, [esp+640-528]
-	movdqa	xmm3, xmm2
-	paddw	xmm3, [esp+640-464]
-	paddw	xmm3, xmm4
-	paddw	xmm3, [esp+640-624]
-	psraw	xmm3, 2
-	pand	xmm6, xmm3
-	movdqa	xmm3, [esp+640-560]
-	movdqa	xmm4, xmm3
-	pandn	xmm4, xmm5
-	por	xmm6, xmm4
-	movdqa	xmm7, xmm1
-	pand	xmm7, xmm6
-	movdqa	xmm6, [esp+640-304]
-	movdqa	xmm4, xmm1
-	pandn	xmm4, xmm5
-	por	xmm7, xmm4
+    movdqa  xmm2, xmm0
+    pand    xmm2, xmm3
+    movdqa  xmm3, xmm0
+    pandn   xmm3, [esp+640-384]
+    por xmm2, xmm3
+    movdqa   [esp+640-304], xmm2
+    movdqa  xmm2, [esp+640-528]
+    movdqa  xmm3, xmm2
+    paddw   xmm3, [esp+640-464]
+    paddw   xmm3, xmm4
+    paddw   xmm3, [esp+640-624]
+    psraw   xmm3, 2
+    pand    xmm6, xmm3
+    movdqa  xmm3, [esp+640-560]
+    movdqa  xmm4, xmm3
+    pandn   xmm4, xmm5
+    por xmm6, xmm4
+    movdqa  xmm7, xmm1
+    pand    xmm7, xmm6
+    movdqa  xmm6, [esp+640-304]
+    movdqa  xmm4, xmm1
+    pandn   xmm4, xmm5
+    por xmm7, xmm4
 
-	movdqa	xmm4, xmm0
-	pandn	xmm0, [esp+640-416]
-	packuswb xmm6, xmm7
-	movdqa	xmm7, [esp+640-112]
-	por	xmm7, [esp+640-80]
-	pand	xmm4, xmm7
-	por	xmm4, xmm0
-	movdqa	xmm0, [esp+752-272]
-	punpckhbw xmm0, [esp+640-48]
-	psllw	xmm0, 1
-	paddw	xmm0, xmm2
-	paddw	xmm0, xmm2
-	paddw	xmm0, xmm2
-	paddw	xmm0, xmm5
-	paddw	xmm0, [esp+640-432]
-	paddw	xmm0, [esp+640-496]
-	paddw	xmm0, [esp+640-592]
-	psraw	xmm0, 3
-	pand	xmm0, xmm3
-	movdqa	xmm7, xmm1
-	pandn	xmm3, xmm2
-	por	xmm0, xmm3
-	pand	xmm7, xmm0
+    movdqa  xmm4, xmm0
+    pandn   xmm0, [esp+640-416]
+    packuswb xmm6, xmm7
+    movdqa  xmm7, [esp+640-112]
+    por xmm7, [esp+640-80]
+    pand    xmm4, xmm7
+    por xmm4, xmm0
+    movdqa  xmm0, [esp+752-272]
+    punpckhbw xmm0, [esp+640-48]
+    psllw   xmm0, 1
+    paddw   xmm0, xmm2
+    paddw   xmm0, xmm2
+    paddw   xmm0, xmm2
+    paddw   xmm0, xmm5
+    paddw   xmm0, [esp+640-432]
+    paddw   xmm0, [esp+640-496]
+    paddw   xmm0, [esp+640-592]
+    psraw   xmm0, 3
+    pand    xmm0, xmm3
+    movdqa  xmm7, xmm1
+    pandn   xmm3, xmm2
+    por xmm0, xmm3
+    pand    xmm7, xmm0
 
-	movdqa	xmm0, [esp+656-272]
-	movdqa	 [edx], xmm0
+    movdqa  xmm0, [esp+656-272]
+    movdqa   [edx], xmm0
 
-	movdqa	xmm0, [esp+672-272]
+    movdqa  xmm0, [esp+672-272]
 
-	mov	edx, dword [esp+640-596]
-	movdqa	 [esi], xmm0
-	movdqa	xmm0, [esp+688-272]
-	movdqa	 [edi], xmm0
-	movdqa	xmm0, [esp+704-272]
+    mov edx, dword [esp+640-596]
+    movdqa   [esi], xmm0
+    movdqa  xmm0, [esp+688-272]
+    movdqa   [edi], xmm0
+    movdqa  xmm0, [esp+704-272]
 
-	pop	edi
-	pandn	xmm1, xmm2
-	movdqa	 [eax], xmm0
-	por	xmm7, xmm1
-	pop	esi
-	packuswb xmm4, xmm7
-	movdqa	 [edx], xmm6
-	movdqa	 [ecx], xmm4
-	pop	ebx
-	mov	esp, ebp
-	pop	ebp
-	ret
+    pop edi
+    pandn   xmm1, xmm2
+    movdqa   [eax], xmm0
+    por xmm7, xmm1
+    pop esi
+    packuswb xmm4, xmm7
+    movdqa   [edx], xmm6
+    movdqa   [ecx], xmm4
+    pop ebx
+    mov esp, ebp
+    pop ebp
+    ret
 
 %endif
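
The block above is the tail end of what appears to be the strong (bS = 4) luma deblocking path: the psllw/paddw chains followed by psraw 2 and psraw 3 with rounding constants of 2 and 4 match the standard H.264 strong-filter averages, and the six movdqa stores at the end write the filtered rows back. A minimal C sketch of those averages, assuming the conventional p3..p0/q0..q1 edge-pixel naming (the names and interface are illustrative, not the routine's actual layout):

    /* H.264 strong deblocking averages for the p-side of one edge position.
     * Pure reference arithmetic; the SSE2 code computes the same sums on
     * eight 16-bit lanes at a time and blends with the weak-filter result. */
    static void deblock_strong_p(int p3, int p2, int p1, int p0,
                                 int q0, int q1,
                                 int *p0f, int *p1f, int *p2f)
    {
        *p0f = (p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3;
        *p1f = (p2 + p1 + p0 + q0 + 2) >> 2;
        *p2f = (2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3;
    }

The q-side uses the mirrored sums with p and q swapped.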
 
--- a/codec/common/x86/expand_picture.asm
+++ b/codec/common/x86/expand_picture.asm
@@ -77,280 +77,280 @@
 ;cccc|ceeeeeeeeeeeeeeeed|dddd
 ;cccc|ceeeeeeeeeeeeeeeed|dddd
 
-%macro mov_line_8x4_mmx		3	; dst, stride, mm?
-	movq [%1], %3
-	movq [%1+%2], %3
-	lea %1, [%1+2*%2]
-	movq [%1], %3
-	movq [%1+%2], %3
-	lea %1, [%1+2*%2]
+%macro mov_line_8x4_mmx     3   ; dst, stride, mm?
+    movq [%1], %3
+    movq [%1+%2], %3
+    lea %1, [%1+2*%2]
+    movq [%1], %3
+    movq [%1+%2], %3
+    lea %1, [%1+2*%2]
 %endmacro
 
-%macro mov_line_end8x4_mmx		3	; dst, stride, mm?
-	movq [%1], %3
-	movq [%1+%2], %3
-	lea %1, [%1+2*%2]
-	movq [%1], %3
-	movq [%1+%2], %3
-	lea %1, [%1+%2]
+%macro mov_line_end8x4_mmx      3   ; dst, stride, mm?
+    movq [%1], %3
+    movq [%1+%2], %3
+    lea %1, [%1+2*%2]
+    movq [%1], %3
+    movq [%1+%2], %3
+    lea %1, [%1+%2]
 %endmacro
 
-%macro mov_line_16x4_sse2	4	; dst, stride, xmm?, u/a
-	movdq%4 [%1], %3 		; top(bottom)_0
-	movdq%4 [%1+%2], %3		; top(bottom)_1
-	lea %1, [%1+2*%2]
-	movdq%4 [%1], %3 		; top(bottom)_2
-	movdq%4 [%1+%2], %3		; top(bottom)_3
-	lea %1, [%1+2*%2]
+%macro mov_line_16x4_sse2   4   ; dst, stride, xmm?, u/a
+    movdq%4 [%1], %3        ; top(bottom)_0
+    movdq%4 [%1+%2], %3     ; top(bottom)_1
+    lea %1, [%1+2*%2]
+    movdq%4 [%1], %3        ; top(bottom)_2
+    movdq%4 [%1+%2], %3     ; top(bottom)_3
+    lea %1, [%1+2*%2]
 %endmacro
 
-%macro mov_line_end16x4_sse2	4	; dst, stride, xmm?, u/a
-	movdq%4 [%1], %3 		; top(bottom)_0
-	movdq%4 [%1+%2], %3		; top(bottom)_1
-	lea %1, [%1+2*%2]
-	movdq%4 [%1], %3 		; top(bottom)_2
-	movdq%4 [%1+%2], %3		; top(bottom)_3
-	lea %1, [%1+%2]
+%macro mov_line_end16x4_sse2    4   ; dst, stride, xmm?, u/a
+    movdq%4 [%1], %3        ; top(bottom)_0
+    movdq%4 [%1+%2], %3     ; top(bottom)_1
+    lea %1, [%1+2*%2]
+    movdq%4 [%1], %3        ; top(bottom)_2
+    movdq%4 [%1+%2], %3     ; top(bottom)_3
+    lea %1, [%1+%2]
 %endmacro
 
-%macro mov_line_32x4_sse2	3	; dst, stride, xmm?
-	movdqa [%1], %3 		; top(bottom)_0
-	movdqa [%1+16], %3 		; top(bottom)_0
-	movdqa [%1+%2], %3		; top(bottom)_1
-	movdqa [%1+%2+16], %3		; top(bottom)_1
-	lea %1, [%1+2*%2]
-	movdqa [%1], %3 		; top(bottom)_2
-	movdqa [%1+16], %3 		; top(bottom)_2
-	movdqa [%1+%2], %3		; top(bottom)_3
-	movdqa [%1+%2+16], %3		; top(bottom)_3
-	lea %1, [%1+2*%2]
+%macro mov_line_32x4_sse2   3   ; dst, stride, xmm?
+    movdqa [%1], %3         ; top(bottom)_0
+    movdqa [%1+16], %3      ; top(bottom)_0
+    movdqa [%1+%2], %3      ; top(bottom)_1
+    movdqa [%1+%2+16], %3       ; top(bottom)_1
+    lea %1, [%1+2*%2]
+    movdqa [%1], %3         ; top(bottom)_2
+    movdqa [%1+16], %3      ; top(bottom)_2
+    movdqa [%1+%2], %3      ; top(bottom)_3
+    movdqa [%1+%2+16], %3       ; top(bottom)_3
+    lea %1, [%1+2*%2]
 %endmacro
 
-%macro mov_line_end32x4_sse2	3	; dst, stride, xmm?
-	movdqa [%1], %3 		; top(bottom)_0
-	movdqa [%1+16], %3 		; top(bottom)_0
-	movdqa [%1+%2], %3		; top(bottom)_1
-	movdqa [%1+%2+16], %3		; top(bottom)_1
-	lea %1, [%1+2*%2]
-	movdqa [%1], %3 		; top(bottom)_2
-	movdqa [%1+16], %3 		; top(bottom)_2
-	movdqa [%1+%2], %3		; top(bottom)_3
-	movdqa [%1+%2+16], %3		; top(bottom)_3
-	lea %1, [%1+%2]
+%macro mov_line_end32x4_sse2    3   ; dst, stride, xmm?
+    movdqa [%1], %3         ; top(bottom)_0
+    movdqa [%1+16], %3      ; top(bottom)_0
+    movdqa [%1+%2], %3      ; top(bottom)_1
+    movdqa [%1+%2+16], %3       ; top(bottom)_1
+    lea %1, [%1+2*%2]
+    movdqa [%1], %3         ; top(bottom)_2
+    movdqa [%1+16], %3      ; top(bottom)_2
+    movdqa [%1+%2], %3      ; top(bottom)_3
+    movdqa [%1+%2+16], %3       ; top(bottom)_3
+    lea %1, [%1+%2]
 %endmacro
 
-%macro exp_top_bottom_sse2	1	; iPaddingSize [luma(32)/chroma(16)]
+%macro exp_top_bottom_sse2  1   ; iPaddingSize [luma(32)/chroma(16)]
     ;r2 [width/16(8)]
     ;r0 [pSrc +0], r5 [pSrc -width] r1[-stride], 32(16) ;top
     ;r3 [pSrc +(h-1)*stride], r4 [pSrc + (h+31)*stride],32(16); bottom
 
-%if %1 == 32		; for luma
-	sar r2, 04h 	; width / 16(8) pixels
+%if %1 == 32        ; for luma
+    sar r2, 04h     ; width / 16(8) pixels
 .top_bottom_loops:
-	; top
-	movdqa xmm0, [r0]		; first line of picture pData
-	mov_line_16x4_sse2 r5, r1, xmm0, a	; dst, stride, xmm?
-	mov_line_16x4_sse2 r5, r1, xmm0, a
-	mov_line_16x4_sse2 r5, r1, xmm0, a
-	mov_line_16x4_sse2 r5, r1, xmm0, a
-	mov_line_16x4_sse2 r5, r1, xmm0, a	; dst, stride, xmm?
-	mov_line_16x4_sse2 r5, r1, xmm0, a
-	mov_line_16x4_sse2 r5, r1, xmm0, a
-	mov_line_end16x4_sse2 r5, r1, xmm0, a
+    ; top
+    movdqa xmm0, [r0]       ; first line of picture pData
+    mov_line_16x4_sse2 r5, r1, xmm0, a  ; dst, stride, xmm?
+    mov_line_16x4_sse2 r5, r1, xmm0, a
+    mov_line_16x4_sse2 r5, r1, xmm0, a
+    mov_line_16x4_sse2 r5, r1, xmm0, a
+    mov_line_16x4_sse2 r5, r1, xmm0, a  ; dst, stride, xmm?
+    mov_line_16x4_sse2 r5, r1, xmm0, a
+    mov_line_16x4_sse2 r5, r1, xmm0, a
+    mov_line_end16x4_sse2 r5, r1, xmm0, a
 
-	; bottom
-	movdqa xmm1, [r3] 		; last line of picture pData
-	mov_line_16x4_sse2 r4, r1, xmm1, a	; dst, stride, xmm?
-	mov_line_16x4_sse2 r4, r1, xmm1, a
-	mov_line_16x4_sse2 r4, r1, xmm1, a
-	mov_line_16x4_sse2 r4, r1, xmm1, a
-	mov_line_16x4_sse2 r4, r1, xmm1, a	; dst, stride, xmm?
-	mov_line_16x4_sse2 r4, r1, xmm1, a
-	mov_line_16x4_sse2 r4, r1, xmm1, a
-	mov_line_end16x4_sse2 r4, r1, xmm1, a
+    ; bottom
+    movdqa xmm1, [r3]       ; last line of picture pData
+    mov_line_16x4_sse2 r4, r1, xmm1, a  ; dst, stride, xmm?
+    mov_line_16x4_sse2 r4, r1, xmm1, a
+    mov_line_16x4_sse2 r4, r1, xmm1, a
+    mov_line_16x4_sse2 r4, r1, xmm1, a
+    mov_line_16x4_sse2 r4, r1, xmm1, a  ; dst, stride, xmm?
+    mov_line_16x4_sse2 r4, r1, xmm1, a
+    mov_line_16x4_sse2 r4, r1, xmm1, a
+    mov_line_end16x4_sse2 r4, r1, xmm1, a
 
-	lea r0, [r0+16]		; top pSrc
-	lea r5, [r5+16]		; top dst
-	lea r3, [r3+16]		; bottom pSrc
-	lea r4, [r4+16]		; bottom dst
-	neg r1 			; positive/negative stride need for next loop?
+    lea r0, [r0+16]     ; top pSrc
+    lea r5, [r5+16]     ; top dst
+    lea r3, [r3+16]     ; bottom pSrc
+    lea r4, [r4+16]     ; bottom dst
+    neg r1          ; positive/negative stride need for next loop?
 
-	dec r2
-	jnz near .top_bottom_loops
-%elif %1 == 16	; for chroma ??
-	mov r6, r2
-	sar r2, 04h 	; (width / 16) pixels
+    dec r2
+    jnz near .top_bottom_loops
+%elif %1 == 16  ; for chroma ??
+    mov r6, r2
+    sar r2, 04h     ; (width / 16) pixels
 .top_bottom_loops:
-	; top
-	movdqa xmm0, [r0]		; first line of picture pData
-	mov_line_16x4_sse2 r5, r1, xmm0, a	; dst, stride, xmm?
-	mov_line_16x4_sse2 r5, r1, xmm0, a
-	mov_line_16x4_sse2 r5, r1, xmm0, a
-	mov_line_end16x4_sse2 r5, r1, xmm0, a
+    ; top
+    movdqa xmm0, [r0]       ; first line of picture pData
+    mov_line_16x4_sse2 r5, r1, xmm0, a  ; dst, stride, xmm?
+    mov_line_16x4_sse2 r5, r1, xmm0, a
+    mov_line_16x4_sse2 r5, r1, xmm0, a
+    mov_line_end16x4_sse2 r5, r1, xmm0, a
 
-	; bottom
-	movdqa xmm1, [r3] 		; last line of picture pData
-	mov_line_16x4_sse2 r4, r1, xmm1, a	; dst, stride, xmm?
-	mov_line_16x4_sse2 r4, r1, xmm1, a
-	mov_line_16x4_sse2 r4, r1, xmm1, a
-	mov_line_end16x4_sse2 r4, r1, xmm1, a
+    ; bottom
+    movdqa xmm1, [r3]       ; last line of picture pData
+    mov_line_16x4_sse2 r4, r1, xmm1, a  ; dst, stride, xmm?
+    mov_line_16x4_sse2 r4, r1, xmm1, a
+    mov_line_16x4_sse2 r4, r1, xmm1, a
+    mov_line_end16x4_sse2 r4, r1, xmm1, a
 
-	lea r0, [r0+16]		; top pSrc
-	lea r5, [r5+16]		; top dst
-	lea r3, [r3+16]		; bottom pSrc
-	lea r4, [r4+16]		; bottom dst
-	neg r1 			; positive/negative stride need for next loop?
+    lea r0, [r0+16]     ; top pSrc
+    lea r5, [r5+16]     ; top dst
+    lea r3, [r3+16]     ; bottom pSrc
+    lea r4, [r4+16]     ; bottom dst
+    neg r1          ; positive/negative stride need for next loop?
 
-	dec r2
-	jnz near .top_bottom_loops
+    dec r2
+    jnz near .top_bottom_loops
 
-	; for remaining 8 bytes
-	and r6, 0fh		; any 8 bytes left?
-	test r6, r6
-	jz near .to_be_continued	; no left to exit here
+    ; for remaining 8 bytes
+    and r6, 0fh     ; any 8 bytes left?
+    test r6, r6
+    jz near .to_be_continued    ; no left to exit here
 
-	; top
-	movq mm0, [r0]		; remained 8 byte
-	mov_line_8x4_mmx r5, r1, mm0	; dst, stride, mm?
-	mov_line_8x4_mmx r5, r1, mm0	; dst, stride, mm?
-	mov_line_8x4_mmx r5, r1, mm0	; dst, stride, mm?
-	mov_line_end8x4_mmx r5, r1, mm0	; dst, stride, mm?
-	; bottom
-	movq mm1, [r3]
-	mov_line_8x4_mmx r4, r1, mm1	; dst, stride, mm?
-	mov_line_8x4_mmx r4, r1, mm1	; dst, stride, mm?
-	mov_line_8x4_mmx r4, r1, mm1	; dst, stride, mm?
-	mov_line_end8x4_mmx r4, r1, mm1	; dst, stride, mm?
-	WELSEMMS
+    ; top
+    movq mm0, [r0]      ; remained 8 byte
+    mov_line_8x4_mmx r5, r1, mm0    ; dst, stride, mm?
+    mov_line_8x4_mmx r5, r1, mm0    ; dst, stride, mm?
+    mov_line_8x4_mmx r5, r1, mm0    ; dst, stride, mm?
+    mov_line_end8x4_mmx r5, r1, mm0 ; dst, stride, mm?
+    ; bottom
+    movq mm1, [r3]
+    mov_line_8x4_mmx r4, r1, mm1    ; dst, stride, mm?
+    mov_line_8x4_mmx r4, r1, mm1    ; dst, stride, mm?
+    mov_line_8x4_mmx r4, r1, mm1    ; dst, stride, mm?
+    mov_line_end8x4_mmx r4, r1, mm1 ; dst, stride, mm?
+    WELSEMMS
 
 .to_be_continued:
 %endif
 %endmacro
 
-%macro exp_left_right_sse2	2	; iPaddingSize [luma(32)/chroma(16)], u/a
+%macro exp_left_right_sse2  2   ; iPaddingSize [luma(32)/chroma(16)], u/a
     ;r6 [height]
     ;r0 [pSrc+0]  r5[pSrc-32] r1[stride]
     ;r3 [pSrc+(w-1)] r4[pSrc+w]
 
-%if %1 == 32		; for luma
+%if %1 == 32        ; for luma
 .left_right_loops:
-	; left
-	movzx r2d, byte [r0]		; pixel pData for left border
-	SSE2_Copy16Times	xmm0, r2d				; dst, tmp, pSrc [generic register name: a/b/c/d]
-	movdqa [r5], xmm0
-	movdqa [r5+16], xmm0
+    ; left
+    movzx r2d, byte [r0]        ; pixel pData for left border
+    SSE2_Copy16Times    xmm0, r2d               ; dst, tmp, pSrc [generic register name: a/b/c/d]
+    movdqa [r5], xmm0
+    movdqa [r5+16], xmm0
 
-	; right
-	movzx r2d, byte [r3]
-	SSE2_Copy16Times	xmm1, r2d				; dst, tmp, pSrc [generic register name: a/b/c/d]
-	movdqa [r4], xmm1
-	movdqa [r4+16], xmm1
+    ; right
+    movzx r2d, byte [r3]
+    SSE2_Copy16Times    xmm1, r2d               ; dst, tmp, pSrc [generic register name: a/b/c/d]
+    movdqa [r4], xmm1
+    movdqa [r4+16], xmm1
 
-	lea r0, [r0+r1]		; left pSrc
-	lea r5, [r5+r1]		; left dst
-	lea r3, [r3+r1]		; right pSrc
-	lea r4, [r4+r1]		; right dst
+    lea r0, [r0+r1]     ; left pSrc
+    lea r5, [r5+r1]     ; left dst
+    lea r3, [r3+r1]     ; right pSrc
+    lea r4, [r4+r1]     ; right dst
 
-	dec r6
-	jnz near .left_right_loops
-%elif %1 == 16	; for chroma ??
+    dec r6
+    jnz near .left_right_loops
+%elif %1 == 16  ; for chroma ??
 .left_right_loops:
-	; left
-	movzx r2d, byte [r0]		; pixel pData for left border
-	SSE2_Copy16Times	xmm0, r2d				; dst, tmp, pSrc [generic register name: a/b/c/d]
-	movdqa [r5], xmm0
+    ; left
+    movzx r2d, byte [r0]        ; pixel pData for left border
+    SSE2_Copy16Times    xmm0, r2d               ; dst, tmp, pSrc [generic register name: a/b/c/d]
+    movdqa [r5], xmm0
 
-	; right
-	movzx r2d, byte [r3]
-	SSE2_Copy16Times	xmm1, r2d				; dst, tmp, pSrc [generic register name: a/b/c/d]
-	movdq%2 [r4], xmm1								; might not be aligned 16 bytes in case chroma planes
+    ; right
+    movzx r2d, byte [r3]
+    SSE2_Copy16Times    xmm1, r2d               ; dst, tmp, pSrc [generic register name: a/b/c/d]
+    movdq%2 [r4], xmm1                              ; might not be aligned 16 bytes in case chroma planes
 
-	lea r0, [r0+r1]		; left pSrc
-	lea r5, [r5+r1]		; left dst
-	lea r3, [r3+r1]		; right pSrc
-	lea r4, [r4+r1]		; right dst
+    lea r0, [r0+r1]     ; left pSrc
+    lea r5, [r5+r1]     ; left dst
+    lea r3, [r3+r1]     ; right pSrc
+    lea r4, [r4+r1]     ; right dst
 
-	dec r6
-	jnz near .left_right_loops
+    dec r6
+    jnz near .left_right_loops
 %endif
 %endmacro
 
-%macro exp_cross_sse2	2	; iPaddingSize [luma(32)/chroma(16)], u/a
-	; top-left: (x)mm3, top-right: (x)mm4, bottom-left: (x)mm5, bottom-right: (x)mm6
-	; edi: TL, ebp: TR, eax: BL, ebx: BR, ecx, -stride
+%macro exp_cross_sse2   2   ; iPaddingSize [luma(32)/chroma(16)], u/a
+    ; top-left: (x)mm3, top-right: (x)mm4, bottom-left: (x)mm5, bottom-right: (x)mm6
+    ; edi: TL, ebp: TR, eax: BL, ebx: BR, ecx, -stride
     ;r3:TL ,r4:TR,r5:BL,r6:BR r1:-stride
-%if %1 == 32		; luma
-	; TL
-	mov_line_32x4_sse2	r3, r1, xmm3	; dst, stride, xmm?
-	mov_line_32x4_sse2	r3, r1, xmm3	; dst, stride, xmm?
-	mov_line_32x4_sse2	r3, r1, xmm3	; dst, stride, xmm?
-	mov_line_32x4_sse2	r3, r1, xmm3	; dst, stride, xmm?
-	mov_line_32x4_sse2	r3, r1, xmm3	; dst, stride, xmm?
-	mov_line_32x4_sse2	r3, r1, xmm3	; dst, stride, xmm?
-	mov_line_32x4_sse2	r3, r1, xmm3	; dst, stride, xmm?
-	mov_line_end32x4_sse2	r3, r1, xmm3	; dst, stride, xmm?
+%if %1 == 32        ; luma
+    ; TL
+    mov_line_32x4_sse2  r3, r1, xmm3    ; dst, stride, xmm?
+    mov_line_32x4_sse2  r3, r1, xmm3    ; dst, stride, xmm?
+    mov_line_32x4_sse2  r3, r1, xmm3    ; dst, stride, xmm?
+    mov_line_32x4_sse2  r3, r1, xmm3    ; dst, stride, xmm?
+    mov_line_32x4_sse2  r3, r1, xmm3    ; dst, stride, xmm?
+    mov_line_32x4_sse2  r3, r1, xmm3    ; dst, stride, xmm?
+    mov_line_32x4_sse2  r3, r1, xmm3    ; dst, stride, xmm?
+    mov_line_end32x4_sse2   r3, r1, xmm3    ; dst, stride, xmm?
 
-	; TR
-	mov_line_32x4_sse2	r4, r1, xmm4	; dst, stride, xmm?
-	mov_line_32x4_sse2	r4, r1, xmm4	; dst, stride, xmm?
-	mov_line_32x4_sse2	r4, r1, xmm4	; dst, stride, xmm?
-	mov_line_32x4_sse2	r4, r1, xmm4	; dst, stride, xmm?
-	mov_line_32x4_sse2	r4, r1, xmm4	; dst, stride, xmm?
-	mov_line_32x4_sse2	r4, r1, xmm4	; dst, stride, xmm?
-	mov_line_32x4_sse2	r4, r1, xmm4	; dst, stride, xmm?
-	mov_line_end32x4_sse2	r4, r1, xmm4	; dst, stride, xmm?
+    ; TR
+    mov_line_32x4_sse2  r4, r1, xmm4    ; dst, stride, xmm?
+    mov_line_32x4_sse2  r4, r1, xmm4    ; dst, stride, xmm?
+    mov_line_32x4_sse2  r4, r1, xmm4    ; dst, stride, xmm?
+    mov_line_32x4_sse2  r4, r1, xmm4    ; dst, stride, xmm?
+    mov_line_32x4_sse2  r4, r1, xmm4    ; dst, stride, xmm?
+    mov_line_32x4_sse2  r4, r1, xmm4    ; dst, stride, xmm?
+    mov_line_32x4_sse2  r4, r1, xmm4    ; dst, stride, xmm?
+    mov_line_end32x4_sse2   r4, r1, xmm4    ; dst, stride, xmm?
 
-	; BL
-	mov_line_32x4_sse2	r5, r1, xmm5	; dst, stride, xmm?
-	mov_line_32x4_sse2	r5, r1, xmm5	; dst, stride, xmm?
-	mov_line_32x4_sse2	r5, r1, xmm5	; dst, stride, xmm?
-	mov_line_32x4_sse2	r5, r1, xmm5	; dst, stride, xmm?
-	mov_line_32x4_sse2	r5, r1, xmm5	; dst, stride, xmm?
-	mov_line_32x4_sse2	r5, r1, xmm5	; dst, stride, xmm?
-	mov_line_32x4_sse2	r5, r1, xmm5	; dst, stride, xmm?
-	mov_line_end32x4_sse2	r5, r1, xmm5	; dst, stride, xmm?
+    ; BL
+    mov_line_32x4_sse2  r5, r1, xmm5    ; dst, stride, xmm?
+    mov_line_32x4_sse2  r5, r1, xmm5    ; dst, stride, xmm?
+    mov_line_32x4_sse2  r5, r1, xmm5    ; dst, stride, xmm?
+    mov_line_32x4_sse2  r5, r1, xmm5    ; dst, stride, xmm?
+    mov_line_32x4_sse2  r5, r1, xmm5    ; dst, stride, xmm?
+    mov_line_32x4_sse2  r5, r1, xmm5    ; dst, stride, xmm?
+    mov_line_32x4_sse2  r5, r1, xmm5    ; dst, stride, xmm?
+    mov_line_end32x4_sse2   r5, r1, xmm5    ; dst, stride, xmm?
 
-	; BR
-	mov_line_32x4_sse2	r6, r1, xmm6	; dst, stride, xmm?
-	mov_line_32x4_sse2	r6, r1, xmm6	; dst, stride, xmm?
-	mov_line_32x4_sse2	r6, r1, xmm6	; dst, stride, xmm?
-	mov_line_32x4_sse2	r6, r1, xmm6	; dst, stride, xmm?
-	mov_line_32x4_sse2	r6, r1, xmm6	; dst, stride, xmm?
-	mov_line_32x4_sse2	r6, r1, xmm6	; dst, stride, xmm?
-	mov_line_32x4_sse2	r6, r1, xmm6	; dst, stride, xmm?
-	mov_line_end32x4_sse2	r6, r1, xmm6	; dst, stride, xmm?
-%elif %1 == 16	; chroma
-	; TL
-	mov_line_16x4_sse2	r3, r1, xmm3, a	; dst, stride, xmm?
-	mov_line_16x4_sse2	r3, r1, xmm3, a	; dst, stride, xmm?
-	mov_line_16x4_sse2	r3, r1, xmm3, a	; dst, stride, xmm?
-	mov_line_end16x4_sse2	r3, r1, xmm3, a	; dst, stride, xmm?
+    ; BR
+    mov_line_32x4_sse2  r6, r1, xmm6    ; dst, stride, xmm?
+    mov_line_32x4_sse2  r6, r1, xmm6    ; dst, stride, xmm?
+    mov_line_32x4_sse2  r6, r1, xmm6    ; dst, stride, xmm?
+    mov_line_32x4_sse2  r6, r1, xmm6    ; dst, stride, xmm?
+    mov_line_32x4_sse2  r6, r1, xmm6    ; dst, stride, xmm?
+    mov_line_32x4_sse2  r6, r1, xmm6    ; dst, stride, xmm?
+    mov_line_32x4_sse2  r6, r1, xmm6    ; dst, stride, xmm?
+    mov_line_end32x4_sse2   r6, r1, xmm6    ; dst, stride, xmm?
+%elif %1 == 16  ; chroma
+    ; TL
+    mov_line_16x4_sse2  r3, r1, xmm3, a ; dst, stride, xmm?
+    mov_line_16x4_sse2  r3, r1, xmm3, a ; dst, stride, xmm?
+    mov_line_16x4_sse2  r3, r1, xmm3, a ; dst, stride, xmm?
+    mov_line_end16x4_sse2   r3, r1, xmm3, a ; dst, stride, xmm?
 
-	; TR
-	mov_line_16x4_sse2	r4, r1, xmm4, %2	; dst, stride, xmm?
-	mov_line_16x4_sse2	r4, r1, xmm4, %2	; dst, stride, xmm?
-	mov_line_16x4_sse2	r4, r1, xmm4, %2	; dst, stride, xmm?
-	mov_line_end16x4_sse2 r4, r1, xmm4, %2	; dst, stride, xmm?
+    ; TR
+    mov_line_16x4_sse2  r4, r1, xmm4, %2    ; dst, stride, xmm?
+    mov_line_16x4_sse2  r4, r1, xmm4, %2    ; dst, stride, xmm?
+    mov_line_16x4_sse2  r4, r1, xmm4, %2    ; dst, stride, xmm?
+    mov_line_end16x4_sse2 r4, r1, xmm4, %2  ; dst, stride, xmm?
 
-	; BL
-	mov_line_16x4_sse2	r5, r1, xmm5, a	; dst, stride, xmm?
-	mov_line_16x4_sse2	r5, r1, xmm5, a	; dst, stride, xmm?
-	mov_line_16x4_sse2	r5, r1, xmm5, a	; dst, stride, xmm?
-	mov_line_end16x4_sse2	r5, r1, xmm5, a	; dst, stride, xmm?
+    ; BL
+    mov_line_16x4_sse2  r5, r1, xmm5, a ; dst, stride, xmm?
+    mov_line_16x4_sse2  r5, r1, xmm5, a ; dst, stride, xmm?
+    mov_line_16x4_sse2  r5, r1, xmm5, a ; dst, stride, xmm?
+    mov_line_end16x4_sse2   r5, r1, xmm5, a ; dst, stride, xmm?
 
-	; BR
-	mov_line_16x4_sse2	r6, r1, xmm6, %2	; dst, stride, xmm?
-	mov_line_16x4_sse2	r6, r1, xmm6, %2	; dst, stride, xmm?
-	mov_line_16x4_sse2	r6, r1, xmm6, %2	; dst, stride, xmm?
-	mov_line_end16x4_sse2	r6, r1, xmm6, %2	; dst, stride, xmm?
+    ; BR
+    mov_line_16x4_sse2  r6, r1, xmm6, %2    ; dst, stride, xmm?
+    mov_line_16x4_sse2  r6, r1, xmm6, %2    ; dst, stride, xmm?
+    mov_line_16x4_sse2  r6, r1, xmm6, %2    ; dst, stride, xmm?
+    mov_line_end16x4_sse2   r6, r1, xmm6, %2    ; dst, stride, xmm?
 %endif
 %endmacro
 
 ;***********************************************************************----------------
-; void ExpandPictureLuma_sse2(	uint8_t *pDst,
-;									const int32_t iStride,
-;									const int32_t iWidth,
-;									const int32_t iHeight	);
+; void ExpandPictureLuma_sse2(  uint8_t *pDst,
+;                                   const int32_t iStride,
+;                                   const int32_t iWidth,
+;                                   const int32_t iHeight   );
 ;***********************************************************************----------------
 WELS_EXTERN ExpandPictureLuma_sse2
 
@@ -403,8 +403,8 @@
 
     exp_top_bottom_sse2 32
 
-	; for both left and right border
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+    ; for both left and right border
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
     pop r2
     pop r1
@@ -416,8 +416,8 @@
     lea r4,[r3+1]                           ;right border dst
 
     ;prepare for cross border data: top-rigth with xmm4
-     movzx r6d,byte [r3]                         ;top -rigth
-     SSE2_Copy16Times xmm4,r6d
+    movzx r6d,byte [r3]                         ;top -rigth
+    SSE2_Copy16Times xmm4,r6d
 
     neg r1   ;r1 = stride
 
@@ -438,8 +438,8 @@
     pop r1
     pop r0
 
-	; for cross border [top-left, top-right, bottom-left, bottom-right]
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+    ; for cross border [top-left, top-right, bottom-left, bottom-right]
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
     ; have done xmm3,..,xmm6 cross pData initialization above, perform pading as below, To be continued..
 
     neg r1  ;r1 = -stride
@@ -472,13 +472,13 @@
     %assign push_num 0
 
 
-	ret
+    ret
 
 ;***********************************************************************----------------
-; void ExpandPictureChromaAlign_sse2(	uint8_t *pDst,
-;										const int32_t iStride,
-;										const int32_t iWidth,
-;										const int32_t iHeight	);
+; void ExpandPictureChromaAlign_sse2(   uint8_t *pDst,
+;                                       const int32_t iStride,
+;                                       const int32_t iWidth,
+;                                       const int32_t iHeight   );
 ;***********************************************************************----------------
 WELS_EXTERN ExpandPictureChromaAlign_sse2
 
@@ -531,8 +531,8 @@
 
     exp_top_bottom_sse2 16
 
-	; for both left and right border
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+    ; for both left and right border
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
     pop r2
     pop r1
@@ -557,7 +557,7 @@
     push r0
     push r1
     push r2
-	push r6
+    push r6
     exp_left_right_sse2 16,a
 
     pop r6
@@ -565,8 +565,8 @@
     pop r1
     pop r0
 
-	; for cross border [top-left, top-right, bottom-left, bottom-right]
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+    ; for cross border [top-left, top-right, bottom-left, bottom-right]
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
     ; have done xmm3,..,xmm6 cross pData initialization above, perform pading as below, To be continued..
 
     neg r1  ;r1 = -stride
@@ -599,16 +599,16 @@
     %assign push_num 0
 
 
-	ret
+    ret
 
 ;***********************************************************************----------------
-; void ExpandPictureChromaUnalign_sse2(	uint8_t *pDst,
-;										const int32_t iStride,
-;										const int32_t iWidth,
-;										const int32_t iHeight	);
+; void ExpandPictureChromaUnalign_sse2( uint8_t *pDst,
+;                                       const int32_t iStride,
+;                                       const int32_t iWidth,
+;                                       const int32_t iHeight   );
 ;***********************************************************************----------------
 WELS_EXTERN ExpandPictureChromaUnalign_sse2
-	push r4
+    push r4
     push r5
     push r6
 
@@ -657,8 +657,8 @@
 
     exp_top_bottom_sse2 16
 
-	; for both left and right border
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+    ; for both left and right border
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
     pop r2
     pop r1
@@ -683,7 +683,7 @@
     push r0
     push r1
     push r2
-	push r6
+    push r6
     exp_left_right_sse2 16,u
 
     pop r6
@@ -691,8 +691,8 @@
     pop r1
     pop r0
 
-	; for cross border [top-left, top-right, bottom-left, bottom-right]
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+    ; for cross border [top-left, top-right, bottom-left, bottom-right]
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
     ; have done xmm3,..,xmm6 cross pData initialization above, perform pading as below, To be continued..
 
     neg r1  ;r1 = -stride
@@ -725,4 +725,4 @@
     %assign push_num 0
 
 
-	ret
+    ret
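
The expand_picture.asm changes above reindent the ExpandPictureLuma_sse2 / ExpandPictureChroma*_sse2 routines and their helper macros, which replicate the outermost picture pixels into a 32-pixel (luma) or 16-pixel (chroma) border on all four sides plus the corners, so that motion compensation can safely read past the picture edges. A minimal C sketch of the same border replication, assuming a plain pPic/iStride/iWidth/iHeight/iPad interface (the names are illustrative):

    #include <stdint.h>
    #include <string.h>

    /* Replicate edge pixels into a border of iPad pixels on every side.
     * Assumes iStride covers the padded width; the SSE2 code does the same
     * work with 16/32-byte wide stores per line. */
    static void expand_border_c(uint8_t *pPic, int iStride,
                                int iWidth, int iHeight, int iPad)
    {
        /* left and right columns */
        for (int y = 0; y < iHeight; y++) {
            uint8_t *row = pPic + y * iStride;
            memset(row - iPad, row[0], iPad);
            memset(row + iWidth, row[iWidth - 1], iPad);
        }
        /* top and bottom rows; corners come for free since each source row
         * has already been padded on the left and right */
        for (int y = 1; y <= iPad; y++) {
            memcpy(pPic - y * iStride - iPad, pPic - iPad, iWidth + 2 * iPad);
            memcpy(pPic + (iHeight - 1 + y) * iStride - iPad,
                   pPic + (iHeight - 1) * iStride - iPad, iWidth + 2 * iPad);
        }
    }
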
--- a/codec/common/x86/mb_copy.asm
+++ b/codec/common/x86/mb_copy.asm
@@ -36,9 +36,9 @@
 ;*
 ;*  History
 ;*      15/09/2009 Created
-;*		12/28/2009 Modified with larger throughput
-;*		12/29/2011 Tuned WelsCopy16x16NotAligned_sse2, added UpdateMbMv_sse2 WelsCopy16x8NotAligned_sse2,
-;*				   WelsCopy16x8_mmx, WelsCopy8x16_mmx etc;
+;*      12/28/2009 Modified with larger throughput
+;*      12/29/2011 Tuned WelsCopy16x16NotAligned_sse2, added UpdateMbMv_sse2 WelsCopy16x8NotAligned_sse2,
+;*                 WelsCopy16x8_mmx, WelsCopy8x16_mmx etc;
 ;*
 ;*
 ;*********************************************************************************************/
@@ -56,174 +56,174 @@
 
 
 ;***********************************************************************
-; void WelsCopy16x16_sse2(	uint8_t* Dst,
-;							int32_t  iStrideD,
-;							uint8_t* Src,
-;							int32_t  iStrideS )
+; void WelsCopy16x16_sse2(  uint8_t* Dst,
+;                           int32_t  iStrideD,
+;                           uint8_t* Src,
+;                           int32_t  iStrideS )
 ;***********************************************************************
 WELS_EXTERN WelsCopy16x16_sse2
 
-	push r4
-	push r5
-	%assign  push_num 2
+    push r4
+    push r5
+    %assign  push_num 2
     LOAD_4_PARA
     PUSH_XMM 8
 
-	lea r4, [r1+2*r1]	;ebx, [eax+2*eax]	; x3
-	lea r5, [r3+2*r3]	;edx, [ecx+2*ecx]	; x3
+    lea r4, [r1+2*r1]   ;ebx, [eax+2*eax]   ; x3
+    lea r5, [r3+2*r3]   ;edx, [ecx+2*ecx]   ; x3
 
-	movdqa xmm0, [r2]
-	movdqa xmm1, [r2+r3]
-	movdqa xmm2, [r2+2*r3]
-	movdqa xmm3, [r2+r5]
-	lea r2, [r2+4*r3]
-	movdqa xmm4, [r2]
-	movdqa xmm5, [r2+r3]
-	movdqa xmm6, [r2+2*r3]
-	movdqa xmm7, [r2+r5]
-	lea r2, [r2+4*r3]
+    movdqa xmm0, [r2]
+    movdqa xmm1, [r2+r3]
+    movdqa xmm2, [r2+2*r3]
+    movdqa xmm3, [r2+r5]
+    lea r2, [r2+4*r3]
+    movdqa xmm4, [r2]
+    movdqa xmm5, [r2+r3]
+    movdqa xmm6, [r2+2*r3]
+    movdqa xmm7, [r2+r5]
+    lea r2, [r2+4*r3]
 
-	movdqa [r0], xmm0
-	movdqa [r0+r1], xmm1
-	movdqa [r0+2*r1], xmm2
-	movdqa [r0+r4], xmm3
-	lea r0, [r0+4*r1]
-	movdqa [r0], xmm4
-	movdqa [r0+r1], xmm5
-	movdqa [r0+2*r1], xmm6
-	movdqa [r0+r4], xmm7
-	lea r0, [r0+4*r1]
+    movdqa [r0], xmm0
+    movdqa [r0+r1], xmm1
+    movdqa [r0+2*r1], xmm2
+    movdqa [r0+r4], xmm3
+    lea r0, [r0+4*r1]
+    movdqa [r0], xmm4
+    movdqa [r0+r1], xmm5
+    movdqa [r0+2*r1], xmm6
+    movdqa [r0+r4], xmm7
+    lea r0, [r0+4*r1]
 
-	movdqa xmm0, [r2]
-	movdqa xmm1, [r2+r3]
-	movdqa xmm2, [r2+2*r3]
-	movdqa xmm3, [r2+r5]
-	lea r2, [r2+4*r3]
-	movdqa xmm4, [r2]
-	movdqa xmm5, [r2+r3]
-	movdqa xmm6, [r2+2*r3]
-	movdqa xmm7, [r2+r5]
+    movdqa xmm0, [r2]
+    movdqa xmm1, [r2+r3]
+    movdqa xmm2, [r2+2*r3]
+    movdqa xmm3, [r2+r5]
+    lea r2, [r2+4*r3]
+    movdqa xmm4, [r2]
+    movdqa xmm5, [r2+r3]
+    movdqa xmm6, [r2+2*r3]
+    movdqa xmm7, [r2+r5]
 
-	movdqa [r0], xmm0
-	movdqa [r0+r1], xmm1
-	movdqa [r0+2*r1], xmm2
-	movdqa [r0+r4], xmm3
-	lea r0, [r0+4*r1]
-	movdqa [r0], xmm4
-	movdqa [r0+r1], xmm5
-	movdqa [r0+2*r1], xmm6
-	movdqa [r0+r4], xmm7
-	POP_XMM
-	LOAD_4_PARA_POP
-	pop r5
-	pop r4
-	ret
+    movdqa [r0], xmm0
+    movdqa [r0+r1], xmm1
+    movdqa [r0+2*r1], xmm2
+    movdqa [r0+r4], xmm3
+    lea r0, [r0+4*r1]
+    movdqa [r0], xmm4
+    movdqa [r0+r1], xmm5
+    movdqa [r0+2*r1], xmm6
+    movdqa [r0+r4], xmm7
+    POP_XMM
+    LOAD_4_PARA_POP
+    pop r5
+    pop r4
+    ret
 
 ;***********************************************************************
-; void WelsCopy16x16NotAligned_sse2(	uint8_t* Dst,
-;							int32_t  iStrideD,
-;							uint8_t* Src,
-;							int32_t  iStrideS )
+; void WelsCopy16x16NotAligned_sse2(    uint8_t* Dst,
+;                           int32_t  iStrideD,
+;                           uint8_t* Src,
+;                           int32_t  iStrideS )
 ;***********************************************************************
 ; dst can be align with 16 bytes, but not sure about pSrc, 12/29/2011
 WELS_EXTERN WelsCopy16x16NotAligned_sse2
-	push r4
-	push r5
-	%assign  push_num 2
+    push r4
+    push r5
+    %assign  push_num 2
     LOAD_4_PARA
     PUSH_XMM 8
 
-	lea r4, [r1+2*r1]	;ebx, [eax+2*eax]	; x3
-	lea r5, [r3+2*r3]	;edx, [ecx+2*ecx]	; x3
+    lea r4, [r1+2*r1]   ;ebx, [eax+2*eax]   ; x3
+    lea r5, [r3+2*r3]   ;edx, [ecx+2*ecx]   ; x3
 
-	movdqu xmm0, [r2]
-	movdqu xmm1, [r2+r3]
-	movdqu xmm2, [r2+2*r3]
-	movdqu xmm3, [r2+r5]
-	lea r2, [r2+4*r3]
-	movdqu xmm4, [r2]
-	movdqu xmm5, [r2+r3]
-	movdqu xmm6, [r2+2*r3]
-	movdqu xmm7, [r2+r5]
-	lea r2, [r2+4*r3]
+    movdqu xmm0, [r2]
+    movdqu xmm1, [r2+r3]
+    movdqu xmm2, [r2+2*r3]
+    movdqu xmm3, [r2+r5]
+    lea r2, [r2+4*r3]
+    movdqu xmm4, [r2]
+    movdqu xmm5, [r2+r3]
+    movdqu xmm6, [r2+2*r3]
+    movdqu xmm7, [r2+r5]
+    lea r2, [r2+4*r3]
 
-	movdqa [r0], xmm0
-	movdqa [r0+r1], xmm1
-	movdqa [r0+2*r1], xmm2
-	movdqa [r0+r4], xmm3
-	lea r0, [r0+4*r1]
-	movdqa [r0], xmm4
-	movdqa [r0+r1], xmm5
-	movdqa [r0+2*r1], xmm6
-	movdqa [r0+r4], xmm7
-	lea r0, [r0+4*r1]
+    movdqa [r0], xmm0
+    movdqa [r0+r1], xmm1
+    movdqa [r0+2*r1], xmm2
+    movdqa [r0+r4], xmm3
+    lea r0, [r0+4*r1]
+    movdqa [r0], xmm4
+    movdqa [r0+r1], xmm5
+    movdqa [r0+2*r1], xmm6
+    movdqa [r0+r4], xmm7
+    lea r0, [r0+4*r1]
 
-	movdqu xmm0, [r2]
-	movdqu xmm1, [r2+r3]
-	movdqu xmm2, [r2+2*r3]
-	movdqu xmm3, [r2+r5]
-	lea r2, [r2+4*r3]
-	movdqu xmm4, [r2]
-	movdqu xmm5, [r2+r3]
-	movdqu xmm6, [r2+2*r3]
-	movdqu xmm7, [r2+r5]
+    movdqu xmm0, [r2]
+    movdqu xmm1, [r2+r3]
+    movdqu xmm2, [r2+2*r3]
+    movdqu xmm3, [r2+r5]
+    lea r2, [r2+4*r3]
+    movdqu xmm4, [r2]
+    movdqu xmm5, [r2+r3]
+    movdqu xmm6, [r2+2*r3]
+    movdqu xmm7, [r2+r5]
 
-	movdqa [r0], xmm0
-	movdqa [r0+r1], xmm1
-	movdqa [r0+2*r1], xmm2
-	movdqa [r0+r4], xmm3
-	lea r0, [r0+4*r1]
-	movdqa [r0], xmm4
-	movdqa [r0+r1], xmm5
-	movdqa [r0+2*r1], xmm6
-	movdqa [r0+r4], xmm7
-	POP_XMM
-	LOAD_4_PARA_POP
-	pop r5
-	pop r4
-	ret
+    movdqa [r0], xmm0
+    movdqa [r0+r1], xmm1
+    movdqa [r0+2*r1], xmm2
+    movdqa [r0+r4], xmm3
+    lea r0, [r0+4*r1]
+    movdqa [r0], xmm4
+    movdqa [r0+r1], xmm5
+    movdqa [r0+2*r1], xmm6
+    movdqa [r0+r4], xmm7
+    POP_XMM
+    LOAD_4_PARA_POP
+    pop r5
+    pop r4
+    ret
 
 ; , 12/29/2011
 ;***********************************************************************
 ; void WelsCopy16x8NotAligned_sse2(uint8_t* Dst,
-;							int32_t  iStrideD,
-;							uint8_t* Src,
-;							int32_t  iStrideS )
+;                           int32_t  iStrideD,
+;                           uint8_t* Src,
+;                           int32_t  iStrideS )
 ;***********************************************************************
 WELS_EXTERN WelsCopy16x8NotAligned_sse2
-	push r4
-	push r5
-	%assign  push_num 2
+    push r4
+    push r5
+    %assign  push_num 2
     LOAD_4_PARA
     PUSH_XMM 8
 
-	lea r4, [r1+2*r1]	;ebx, [eax+2*eax]	; x3
-	lea r5, [r3+2*r3]	;edx, [ecx+2*ecx]	; x3
+    lea r4, [r1+2*r1]   ;ebx, [eax+2*eax]   ; x3
+    lea r5, [r3+2*r3]   ;edx, [ecx+2*ecx]   ; x3
 
-	movdqu xmm0, [r2]
-	movdqu xmm1, [r2+r3]
-	movdqu xmm2, [r2+2*r3]
-	movdqu xmm3, [r2+r5]
-	lea r2, [r2+4*r3]
-	movdqu xmm4, [r2]
-	movdqu xmm5, [r2+r3]
-	movdqu xmm6, [r2+2*r3]
-	movdqu xmm7, [r2+r5]
+    movdqu xmm0, [r2]
+    movdqu xmm1, [r2+r3]
+    movdqu xmm2, [r2+2*r3]
+    movdqu xmm3, [r2+r5]
+    lea r2, [r2+4*r3]
+    movdqu xmm4, [r2]
+    movdqu xmm5, [r2+r3]
+    movdqu xmm6, [r2+2*r3]
+    movdqu xmm7, [r2+r5]
 
-	movdqa [r0], xmm0
-	movdqa [r0+r1], xmm1
-	movdqa [r0+2*r1], xmm2
-	movdqa [r0+r4], xmm3
-	lea r0, [r0+4*r1]
-	movdqa [r0], xmm4
-	movdqa [r0+r1], xmm5
-	movdqa [r0+2*r1], xmm6
-	movdqa [r0+r4], xmm7
-	POP_XMM
-	LOAD_4_PARA_POP
-	pop r5
-	pop r4
-	ret
+    movdqa [r0], xmm0
+    movdqa [r0+r1], xmm1
+    movdqa [r0+2*r1], xmm2
+    movdqa [r0+r4], xmm3
+    lea r0, [r0+4*r1]
+    movdqa [r0], xmm4
+    movdqa [r0+r1], xmm5
+    movdqa [r0+2*r1], xmm6
+    movdqa [r0+r4], xmm7
+    POP_XMM
+    LOAD_4_PARA_POP
+    pop r5
+    pop r4
+    ret
 
 
 ;***********************************************************************
@@ -233,62 +233,62 @@
 ;                       int32_t  iStrideS )
 ;***********************************************************************
 WELS_EXTERN WelsCopy8x16_mmx
-	%assign  push_num 0
+    %assign  push_num 0
     LOAD_4_PARA
 
-	movq mm0, [r2]
-	movq mm1, [r2+r3]
-	lea r2, [r2+2*r3]
-	movq mm2, [r2]
-	movq mm3, [r2+r3]
-	lea r2, [r2+2*r3]
-	movq mm4, [r2]
-	movq mm5, [r2+r3]
-	lea r2, [r2+2*r3]
-	movq mm6, [r2]
-	movq mm7, [r2+r3]
-	lea r2, [r2+2*r3]
+    movq mm0, [r2]
+    movq mm1, [r2+r3]
+    lea r2, [r2+2*r3]
+    movq mm2, [r2]
+    movq mm3, [r2+r3]
+    lea r2, [r2+2*r3]
+    movq mm4, [r2]
+    movq mm5, [r2+r3]
+    lea r2, [r2+2*r3]
+    movq mm6, [r2]
+    movq mm7, [r2+r3]
+    lea r2, [r2+2*r3]
 
-	movq [r0], mm0
-	movq [r0+r1], mm1
-	lea r0, [r0+2*r1]
-	movq [r0], mm2
-	movq [r0+r1], mm3
-	lea r0, [r0+2*r1]
-	movq [r0], mm4
-	movq [r0+r1], mm5
-	lea r0, [r0+2*r1]
-	movq [r0], mm6
-	movq [r0+r1], mm7
-	lea r0, [r0+2*r1]
+    movq [r0], mm0
+    movq [r0+r1], mm1
+    lea r0, [r0+2*r1]
+    movq [r0], mm2
+    movq [r0+r1], mm3
+    lea r0, [r0+2*r1]
+    movq [r0], mm4
+    movq [r0+r1], mm5
+    lea r0, [r0+2*r1]
+    movq [r0], mm6
+    movq [r0+r1], mm7
+    lea r0, [r0+2*r1]
 
-	movq mm0, [r2]
-	movq mm1, [r2+r3]
-	lea r2, [r2+2*r3]
-	movq mm2, [r2]
-	movq mm3, [r2+r3]
-	lea r2, [r2+2*r3]
-	movq mm4, [r2]
-	movq mm5, [r2+r3]
-	lea r2, [r2+2*r3]
-	movq mm6, [r2]
-	movq mm7, [r2+r3]
+    movq mm0, [r2]
+    movq mm1, [r2+r3]
+    lea r2, [r2+2*r3]
+    movq mm2, [r2]
+    movq mm3, [r2+r3]
+    lea r2, [r2+2*r3]
+    movq mm4, [r2]
+    movq mm5, [r2+r3]
+    lea r2, [r2+2*r3]
+    movq mm6, [r2]
+    movq mm7, [r2+r3]
 
-	movq [r0], mm0
-	movq [r0+r1], mm1
-	lea r0, [r0+2*r1]
-	movq [r0], mm2
-	movq [r0+r1], mm3
-	lea r0, [r0+2*r1]
-	movq [r0], mm4
-	movq [r0+r1], mm5
-	lea r0, [r0+2*r1]
-	movq [r0], mm6
-	movq [r0+r1], mm7
+    movq [r0], mm0
+    movq [r0+r1], mm1
+    lea r0, [r0+2*r1]
+    movq [r0], mm2
+    movq [r0+r1], mm3
+    lea r0, [r0+2*r1]
+    movq [r0], mm4
+    movq [r0+r1], mm5
+    lea r0, [r0+2*r1]
+    movq [r0], mm6
+    movq [r0+r1], mm7
 
-	WELSEMMS
-	LOAD_4_PARA_POP
-	ret
+    WELSEMMS
+    LOAD_4_PARA_POP
+    ret
 
 ;***********************************************************************
 ; void WelsCopy8x8_mmx(  uint8_t* Dst,
@@ -297,48 +297,48 @@
 ;                        int32_t  iStrideS )
 ;***********************************************************************
 WELS_EXTERN WelsCopy8x8_mmx
-	push r4
-	%assign  push_num 1
+    push r4
+    %assign  push_num 1
     LOAD_4_PARA
-	lea r4, [r3+2*r3]	;edx, [ebx+2*ebx]
+    lea r4, [r3+2*r3]   ;edx, [ebx+2*ebx]
 
-	; to prefetch next loop
-	prefetchnta [r2+2*r3]
-	prefetchnta [r2+r4]
-	movq mm0, [r2]
-	movq mm1, [r2+r3]
-	lea r2, [r2+2*r3]
-	; to prefetch next loop
-	prefetchnta [r2+2*r3]
-	prefetchnta [r2+r4]
-	movq mm2, [r2]
-	movq mm3, [r2+r3]
-	lea r2, [r2+2*r3]
-	; to prefetch next loop
-	prefetchnta [r2+2*r3]
-	prefetchnta [r2+r4]
-	movq mm4, [r2]
-	movq mm5, [r2+r3]
-	lea r2, [r2+2*r3]
-	movq mm6, [r2]
-	movq mm7, [r2+r3]
+    ; to prefetch next loop
+    prefetchnta [r2+2*r3]
+    prefetchnta [r2+r4]
+    movq mm0, [r2]
+    movq mm1, [r2+r3]
+    lea r2, [r2+2*r3]
+    ; to prefetch next loop
+    prefetchnta [r2+2*r3]
+    prefetchnta [r2+r4]
+    movq mm2, [r2]
+    movq mm3, [r2+r3]
+    lea r2, [r2+2*r3]
+    ; to prefetch next loop
+    prefetchnta [r2+2*r3]
+    prefetchnta [r2+r4]
+    movq mm4, [r2]
+    movq mm5, [r2+r3]
+    lea r2, [r2+2*r3]
+    movq mm6, [r2]
+    movq mm7, [r2+r3]
 
-	movq [r0], mm0
-	movq [r0+r1], mm1
-	lea r0, [r0+2*r1]
-	movq [r0], mm2
-	movq [r0+r1], mm3
-	lea r0, [r0+2*r1]
-	movq [r0], mm4
-	movq [r0+r1], mm5
-	lea r0, [r0+2*r1]
-	movq [r0], mm6
-	movq [r0+r1], mm7
+    movq [r0], mm0
+    movq [r0+r1], mm1
+    lea r0, [r0+2*r1]
+    movq [r0], mm2
+    movq [r0+r1], mm3
+    lea r0, [r0+2*r1]
+    movq [r0], mm4
+    movq [r0+r1], mm5
+    lea r0, [r0+2*r1]
+    movq [r0], mm6
+    movq [r0+r1], mm7
 
-	WELSEMMS
-	LOAD_4_PARA_POP
-	pop r4
-	ret
+    WELSEMMS
+    LOAD_4_PARA_POP
+    pop r4
+    ret
 
 ; (dunhuang@cisco), 12/21/2011
 ;***********************************************************************
@@ -349,13 +349,13 @@
     %assign  push_num 0
     LOAD_2_PARA
 
-	movd xmm0, r1d	; _mv
-	pshufd xmm1, xmm0, $00
-	movdqa [r0     ], xmm1
-	movdqa [r0+0x10], xmm1
-	movdqa [r0+0x20], xmm1
-	movdqa [r0+0x30], xmm1
-	ret
+    movd xmm0, r1d  ; _mv
+    pshufd xmm1, xmm0, $00
+    movdqa [r0     ], xmm1
+    movdqa [r0+0x10], xmm1
+    movdqa [r0+0x20], xmm1
+    movdqa [r0+0x30], xmm1
+    ret
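
UpdateMbMv_sse2 above simply broadcasts one 32-bit motion vector into all sixteen 4x4-block slots of a macroblock: movd loads the value, pshufd with an immediate of 0 replicates it across the register, and four movdqa stores fill the 64-byte array. A minimal C sketch, assuming a flat array of sixteen packed MVs (illustrative names):

    #include <stdint.h>

    /* Fill all 16 per-4x4 motion-vector slots of one macroblock with mv. */
    static void update_mb_mv_c(uint32_t *pMv, uint32_t mv)
    {
        for (int i = 0; i < 16; i++)
            pMv[i] = mv;
    }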
 
 ;*******************************************************************************
 ; Macros and other preprocessor constants
@@ -381,14 +381,14 @@
     %assign  push_num 0
     LOAD_7_PARA
 
-	SIGN_EXTENSION	r1, r1d
-	SIGN_EXTENSION	r3, r3d
-	SIGN_EXTENSION	r5, r5d
-	SIGN_EXTENSION	r6, r6d
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
+    SIGN_EXTENSION  r5, r5d
+    SIGN_EXTENSION  r6, r6d
 
 ALIGN 4
 .height_loop:
-	movd        mm0, [r4]
+    movd        mm0, [r4]
     pavgb       mm0, [r2]
     movd        [r0], mm0
 
@@ -398,8 +398,8 @@
     lea         r4, [r4+r5]
     jne         .height_loop
 
-	WELSEMMS
-	LOAD_7_PARA_POP
+    WELSEMMS
+    LOAD_7_PARA_POP
     ret
 
 
@@ -413,29 +413,29 @@
     %assign  push_num 0
     LOAD_7_PARA
 
-	SIGN_EXTENSION	r1, r1d
-	SIGN_EXTENSION	r3, r3d
-	SIGN_EXTENSION	r5, r5d
-	SIGN_EXTENSION	r6, r6d
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
+    SIGN_EXTENSION  r5, r5d
+    SIGN_EXTENSION  r6, r6d
 
 ALIGN 4
 .height_loop:
-	movq        mm0, [r2]
+    movq        mm0, [r2]
     pavgb       mm0, [r4]
     movq        [r0], mm0
     movq        mm0, [r2+r3]
     pavgb       mm0, [r4+r5]
-    movq		[r0+r1], mm0
+    movq        [r0+r1], mm0
 
-    lea			r2,  [r2+2*r3]
-    lea			r4,  [r4+2*r5]
-    lea			r0,  [r0+2*r1]
+    lea         r2,  [r2+2*r3]
+    lea         r4,  [r4+2*r5]
+    lea         r0,  [r0+2*r1]
 
     sub         r6, 2
     jnz         .height_loop
 
-	WELSEMMS
-	LOAD_7_PARA_POP
+    WELSEMMS
+    LOAD_7_PARA_POP
     ret
 
 
@@ -450,46 +450,46 @@
 
     %assign  push_num 0
     LOAD_7_PARA
-	SIGN_EXTENSION	r1, r1d
-	SIGN_EXTENSION	r3, r3d
-	SIGN_EXTENSION	r5, r5d
-	SIGN_EXTENSION	r6, r6d
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
+    SIGN_EXTENSION  r5, r5d
+    SIGN_EXTENSION  r6, r6d
 ALIGN 4
 .height_loop:
-	movdqu      xmm0, [r2]
-	movdqu	    xmm1, [r4]
-	pavgb	    xmm0, xmm1
-	;pavgb       xmm0, [r4]
+    movdqu      xmm0, [r2]
+    movdqu      xmm1, [r4]
+    pavgb       xmm0, xmm1
+    ;pavgb       xmm0, [r4]
     movdqu      [r0], xmm0
 
-	movdqu      xmm0, [r2+r3]
-	movdqu      xmm1, [r4+r5]
-	pavgb	    xmm0, xmm1
+    movdqu      xmm0, [r2+r3]
+    movdqu      xmm1, [r4+r5]
+    pavgb       xmm0, xmm1
     movdqu      [r0+r1], xmm0
 
-	movdqu      xmm0, [r2+2*r3]
-	movdqu       xmm1, [r4+2*r5]
-	pavgb	    xmm0, xmm1
+    movdqu      xmm0, [r2+2*r3]
+    movdqu       xmm1, [r4+2*r5]
+    pavgb       xmm0, xmm1
     movdqu      [r0+2*r1], xmm0
 
     lea         r2, [r2+2*r3]
-    lea			r4, [r4+2*r5]
-    lea			r0, [r0+2*r1]
+    lea         r4, [r4+2*r5]
+    lea         r0, [r0+2*r1]
 
-	movdqu      xmm0, [r2+r3]
-	movdqu      xmm1, [r4+r5]
-	pavgb	    xmm0, xmm1
+    movdqu      xmm0, [r2+r3]
+    movdqu      xmm1, [r4+r5]
+    pavgb       xmm0, xmm1
     movdqu      [r0+r1], xmm0
 
     lea         r2, [r2+2*r3]
-    lea			r4, [r4+2*r5]
-    lea			r0, [r0+2*r1]
+    lea         r4, [r4+2*r5]
+    lea         r0, [r0+2*r1]
 
     sub         r6, 4
     jne         .height_loop
 
-	WELSEMMS
-	LOAD_7_PARA_POP
+    WELSEMMS
+    LOAD_7_PARA_POP
     ret
 
 ;*******************************************************************************
@@ -497,26 +497,26 @@
 ;                          uint8_t *pDst, int iDstStride, int iHeight )
 ;*******************************************************************************
 WELS_EXTERN McCopyWidthEq4_mmx
-    push	r5
+    push    r5
     %assign  push_num 1
     LOAD_5_PARA
 
-	SIGN_EXTENSION	r1, r1d
-	SIGN_EXTENSION	r3, r3d
-	SIGN_EXTENSION	r4, r4d
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
+    SIGN_EXTENSION  r4, r4d
 
 ALIGN 4
 .height_loop:
-	mov r5d, [r0]
-	mov [r2], r5d
+    mov r5d, [r0]
+    mov [r2], r5d
 
-	add r0, r1
-	add r2, r3
-	dec r4
-	jnz .height_loop
-	WELSEMMS
+    add r0, r1
+    add r2, r3
+    dec r4
+    jnz .height_loop
+    WELSEMMS
     LOAD_5_PARA_POP
-    pop	   r5
+    pop    r5
     ret
 
 ;*******************************************************************************
@@ -527,21 +527,21 @@
     %assign  push_num 0
     LOAD_5_PARA
 
-	SIGN_EXTENSION	r1, r1d
-	SIGN_EXTENSION	r3, r3d
-	SIGN_EXTENSION	r4, r4d
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
+    SIGN_EXTENSION  r4, r4d
 
 ALIGN 4
 .height_loop:
-	movq mm0, [r0]
-	movq [r2], mm0
-	add r0, r1
-	add r2, r3
-	dec r4
-	jnz .height_loop
+    movq mm0, [r0]
+    movq [r2], mm0
+    add r0, r1
+    add r2, r3
+    dec r4
+    jnz .height_loop
 
-	WELSEMMS
-	LOAD_5_PARA_POP
+    WELSEMMS
+    LOAD_5_PARA_POP
     ret
 
 
@@ -550,32 +550,32 @@
 ;*******************************************************************************
 ;read unaligned memory
 %macro SSE_READ_UNA 2
-	movq	%1, [%2]
-	movhps	%1,	[%2+8]
+    movq    %1, [%2]
+    movhps  %1, [%2+8]
 %endmacro
 
 ;write unaligned memory
 %macro SSE_WRITE_UNA 2
-	movq	[%1],	%2
-	movhps	[%1+8], %2
+    movq    [%1],   %2
+    movhps  [%1+8], %2
 %endmacro
 WELS_EXTERN McCopyWidthEq16_sse2
     %assign  push_num 0
     LOAD_5_PARA
-	SIGN_EXTENSION	r1, r1d
-	SIGN_EXTENSION	r3, r3d
-	SIGN_EXTENSION	r4, r4d
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
+    SIGN_EXTENSION  r4, r4d
 ALIGN 4
 .height_loop:
-    SSE_READ_UNA	xmm0, r0
-    SSE_READ_UNA	xmm1, r0+r1
-    SSE_WRITE_UNA	r2, xmm0
-    SSE_WRITE_UNA	r2+r3, xmm1
+    SSE_READ_UNA    xmm0, r0
+    SSE_READ_UNA    xmm1, r0+r1
+    SSE_WRITE_UNA   r2, xmm0
+    SSE_WRITE_UNA   r2+r3, xmm1
 
-	sub		r4,	2
+    sub     r4, 2
     lea     r0, [r0+r1*2]
     lea     r2, [r2+r3*2]
     jnz     .height_loop
 
-	LOAD_5_PARA_POP
+    LOAD_5_PARA_POP
     ret
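
Besides the plain block copies, the mb_copy.asm hunks above include pavgb-based averaging routines; pavgb computes a per-byte rounding average, (a + b + 1) >> 1, which is what bi-prediction and half-pel interpolation need. A minimal C sketch of such an averaging loop, assuming a generic width/height interface (all names are illustrative):

    #include <stdint.h>

    /* Per-pixel rounding average of two blocks, matching pavgb byte-wise. */
    static void pixel_avg_c(uint8_t *pDst, int iDstStride,
                            const uint8_t *pSrcA, int iStrideA,
                            const uint8_t *pSrcB, int iStrideB,
                            int iWidth, int iHeight)
    {
        for (int y = 0; y < iHeight; y++) {
            for (int x = 0; x < iWidth; x++)
                pDst[x] = (uint8_t)((pSrcA[x] + pSrcB[x] + 1) >> 1);
            pDst  += iDstStride;
            pSrcA += iStrideA;
            pSrcB += iStrideB;
        }
    }
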
--- a/codec/common/x86/mc_chroma.asm
+++ b/codec/common/x86/mc_chroma.asm
@@ -53,10 +53,10 @@
 
 ALIGN 16
 h264_d0x20_sse2:
-	dw 32,32,32,32,32,32,32,32
+    dw 32,32,32,32,32,32,32,32
 ALIGN 16
 h264_d0x20_mmx:
-	dw 32,32,32,32
+    dw 32,32,32,32
 
 
 ;=============================================================================
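
The 32s defined above are the rounding term of the H.264 chroma interpolation: each output pixel in the routines below is a weighted sum of a 2x2 neighbourhood, offset by 32 and shifted right by 6 (the psrlw 6 after adding h264_d0x20). The four weights arrive through pABCD and are, in the usual formulation, (8-dx)*(8-dy), dx*(8-dy), (8-dx)*dy and dx*dy. A minimal C sketch of one output sample, with illustrative names:

    #include <stdint.h>

    /* One chroma MC output sample: bilinear blend of the 2x2 neighbourhood
     * A B / C D with eighth-pel weights, +32 rounding, then >>6. */
    static inline uint8_t mc_chroma_sample(uint8_t A, uint8_t B,
                                           uint8_t C, uint8_t D,
                                           const uint8_t w[4])
    {
        return (uint8_t)((w[0] * A + w[1] * B + w[2] * C + w[3] * D + 32) >> 6);
    }
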
@@ -67,152 +67,152 @@
 
 ;*******************************************************************************
 ; void McChromaWidthEq4_mmx( const uint8_t *src,
-;							int32_t iSrcStride,
-;							uint8_t *pDst,
-;							int32_t iDstStride,
-;							const uint8_t *pABCD,
-;							int32_t iHeigh );
+;                           int32_t iSrcStride,
+;                           uint8_t *pDst,
+;                           int32_t iDstStride,
+;                           const uint8_t *pABCD,
+;                           int32_t iHeigh );
 ;*******************************************************************************
 WELS_EXTERN McChromaWidthEq4_mmx
-	%assign  push_num 0
-	LOAD_6_PARA
-	SIGN_EXTENSION	r1, r1d
-	SIGN_EXTENSION	r3, r3d
-	SIGN_EXTENSION	r5, r5d
+    %assign  push_num 0
+    LOAD_6_PARA
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
+    SIGN_EXTENSION  r5, r5d
 
-	movd mm3, [r4];	[eax]
-	WELS_Zero mm7
-	punpcklbw mm3, mm3
-	movq      mm4, mm3
-	punpcklwd mm3, mm3
-	punpckhwd mm4, mm4
+    movd mm3, [r4]; [eax]
+    WELS_Zero mm7
+    punpcklbw mm3, mm3
+    movq      mm4, mm3
+    punpcklwd mm3, mm3
+    punpckhwd mm4, mm4
 
-	movq	  mm5, mm3
-	punpcklbw mm3, mm7
-	punpckhbw mm5, mm7
+    movq      mm5, mm3
+    punpcklbw mm3, mm7
+    punpckhbw mm5, mm7
 
-	movq	  mm6, mm4
-	punpcklbw mm4, mm7
-	punpckhbw mm6, mm7
+    movq      mm6, mm4
+    punpcklbw mm4, mm7
+    punpckhbw mm6, mm7
 
-	lea r4, [r0 + r1] ;lea ebx, [esi + eax]
-	movd mm0, [r0]
-	movd mm1, [r0+1]
-	punpcklbw mm0, mm7
-	punpcklbw mm1, mm7
+    lea r4, [r0 + r1] ;lea ebx, [esi + eax]
+    movd mm0, [r0]
+    movd mm1, [r0+1]
+    punpcklbw mm0, mm7
+    punpcklbw mm1, mm7
 .xloop:
 
-	pmullw mm0, mm3
-	pmullw mm1, mm5
-	paddw  mm0, mm1
+    pmullw mm0, mm3
+    pmullw mm1, mm5
+    paddw  mm0, mm1
 
-	movd  mm1, [r4]
-	punpcklbw mm1, mm7
-	movq mm2, mm1
-	pmullw mm1, mm4
-	paddw mm0, mm1
+    movd  mm1, [r4]
+    punpcklbw mm1, mm7
+    movq mm2, mm1
+    pmullw mm1, mm4
+    paddw mm0, mm1
 
-	movd mm1, [r4+1]
-	punpcklbw mm1, mm7
-	movq mm7, mm1
-	pmullw mm1,mm6
-	paddw mm0, mm1
-	movq mm1,mm7
+    movd mm1, [r4+1]
+    punpcklbw mm1, mm7
+    movq mm7, mm1
+    pmullw mm1,mm6
+    paddw mm0, mm1
+    movq mm1,mm7
 
-	paddw mm0, [h264_d0x20_mmx]
-	psrlw mm0, 6
+    paddw mm0, [h264_d0x20_mmx]
+    psrlw mm0, 6
 
-	WELS_Zero mm7
-	packuswb mm0, mm7
-	movd [r2], mm0
+    WELS_Zero mm7
+    packuswb mm0, mm7
+    movd [r2], mm0
 
-	movq mm0, mm2
+    movq mm0, mm2
 
-	lea r2, [r2 + r3]
-	lea r4, [r4 + r1]
+    lea r2, [r2 + r3]
+    lea r4, [r4 + r1]
 
-	dec r5
-	jnz near .xloop
-	WELSEMMS
-	LOAD_6_PARA_POP
-	ret
+    dec r5
+    jnz near .xloop
+    WELSEMMS
+    LOAD_6_PARA_POP
+    ret
 
 
 ;*******************************************************************************
 ; void McChromaWidthEq8_sse2( const uint8_t *pSrc,
-;						int32_t iSrcStride,
-;						uint8_t *pDst,
-;						int32_t iDstStride,
-;						const uint8_t *pABCD,
-;						int32_t iheigh );
+;                       int32_t iSrcStride,
+;                       uint8_t *pDst,
+;                       int32_t iDstStride,
+;                       const uint8_t *pABCD,
+;                       int32_t iheigh );
 ;*******************************************************************************
 WELS_EXTERN McChromaWidthEq8_sse2
-	%assign  push_num 0
-	LOAD_6_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION	r1, r1d
-	SIGN_EXTENSION	r3, r3d
-	SIGN_EXTENSION	r5, r5d
+    %assign  push_num 0
+    LOAD_6_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
+    SIGN_EXTENSION  r5, r5d
 
-	movd xmm3, [r4]
-	WELS_Zero xmm7
-	punpcklbw  xmm3, xmm3
-	punpcklwd  xmm3, xmm3
+    movd xmm3, [r4]
+    WELS_Zero xmm7
+    punpcklbw  xmm3, xmm3
+    punpcklwd  xmm3, xmm3
 
-	movdqa	   xmm4, xmm3
-	punpckldq  xmm3, xmm3
-	punpckhdq  xmm4, xmm4
-	movdqa     xmm5, xmm3
-	movdqa	   xmm6, xmm4
+    movdqa     xmm4, xmm3
+    punpckldq  xmm3, xmm3
+    punpckhdq  xmm4, xmm4
+    movdqa     xmm5, xmm3
+    movdqa     xmm6, xmm4
 
-	punpcklbw  xmm3, xmm7
-	punpckhbw  xmm5, xmm7
-	punpcklbw  xmm4, xmm7
-	punpckhbw  xmm6, xmm7
+    punpcklbw  xmm3, xmm7
+    punpckhbw  xmm5, xmm7
+    punpcklbw  xmm4, xmm7
+    punpckhbw  xmm6, xmm7
 
-	lea r4, [r0 + r1] ;lea ebx, [esi + eax]
-	movq xmm0, [r0]
-	movq xmm1, [r0+1]
-	punpcklbw xmm0, xmm7
-	punpcklbw xmm1, xmm7
+    lea r4, [r0 + r1] ;lea ebx, [esi + eax]
+    movq xmm0, [r0]
+    movq xmm1, [r0+1]
+    punpcklbw xmm0, xmm7
+    punpcklbw xmm1, xmm7
 .xloop:
 
-	pmullw xmm0, xmm3
-	pmullw xmm1, xmm5
-	paddw  xmm0, xmm1
+    pmullw xmm0, xmm3
+    pmullw xmm1, xmm5
+    paddw  xmm0, xmm1
 
-	movq  xmm1, [r4]
-	punpcklbw xmm1, xmm7
-	movdqa xmm2, xmm1
-	pmullw xmm1, xmm4
-	paddw xmm0, xmm1
+    movq  xmm1, [r4]
+    punpcklbw xmm1, xmm7
+    movdqa xmm2, xmm1
+    pmullw xmm1, xmm4
+    paddw xmm0, xmm1
 
-	movq xmm1, [r4+1]
-	punpcklbw xmm1, xmm7
-	movdqa xmm7, xmm1
-	pmullw xmm1, xmm6
-	paddw xmm0, xmm1
-	movdqa xmm1,xmm7
+    movq xmm1, [r4+1]
+    punpcklbw xmm1, xmm7
+    movdqa xmm7, xmm1
+    pmullw xmm1, xmm6
+    paddw xmm0, xmm1
+    movdqa xmm1,xmm7
 
-	paddw xmm0, [h264_d0x20_sse2]
-	psrlw xmm0, 6
+    paddw xmm0, [h264_d0x20_sse2]
+    psrlw xmm0, 6
 
-	WELS_Zero xmm7
-	packuswb xmm0, xmm7
-	movq [r2], xmm0
+    WELS_Zero xmm7
+    packuswb xmm0, xmm7
+    movq [r2], xmm0
 
-	movdqa xmm0, xmm2
+    movdqa xmm0, xmm2
 
-	lea r2, [r2 + r3]
-	lea r4, [r4 + r1]
+    lea r2, [r2 + r3]
+    lea r4, [r4 + r1]
 
-	dec r5
-	jnz near .xloop
+    dec r5
+    jnz near .xloop
 
-	POP_XMM
-	LOAD_6_PARA_POP
+    POP_XMM
+    LOAD_6_PARA_POP
 
-	ret
+    ret
 
 
 
@@ -219,19 +219,19 @@
 
 ;***********************************************************************
 ; void McChromaWidthEq8_ssse3( const uint8_t *pSrc,
-;						 int32_t iSrcStride,
+;                        int32_t iSrcStride,
 ;                        uint8_t *pDst,
 ;                        int32_t iDstStride,
 ;                        const uint8_t *pABCD,
-;					     int32_t iHeigh);
+;                        int32_t iHeigh);
 ;***********************************************************************
 WELS_EXTERN McChromaWidthEq8_ssse3
-	%assign  push_num 0
-	LOAD_6_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION	r1, r1d
-	SIGN_EXTENSION	r3, r3d
-	SIGN_EXTENSION	r5, r5d
+    %assign  push_num 0
+    LOAD_6_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
+    SIGN_EXTENSION  r5, r5d
 
     pxor      xmm7, xmm7
     movd   xmm5, [r4]
@@ -243,27 +243,27 @@
 
     sub r2, r3 ;sub esi, edi
     sub r2, r3
-	movdqa xmm7, [h264_d0x20_sse2]
+    movdqa xmm7, [h264_d0x20_sse2]
 
-	movdqu xmm0, [r0]
-	movdqa xmm1, xmm0
-	psrldq xmm1, 1
-	punpcklbw xmm0, xmm1
+    movdqu xmm0, [r0]
+    movdqa xmm1, xmm0
+    psrldq xmm1, 1
+    punpcklbw xmm0, xmm1
 
 .hloop_chroma:
-	lea	r2, [r2+2*r3]
+    lea r2, [r2+2*r3]
 
-	movdqu xmm2, [r0+r1]
-	movdqa xmm3, xmm2
-	psrldq xmm3, 1
-	punpcklbw xmm2, xmm3
-	movdqa      xmm4, xmm2
+    movdqu xmm2, [r0+r1]
+    movdqa xmm3, xmm2
+    psrldq xmm3, 1
+    punpcklbw xmm2, xmm3
+    movdqa      xmm4, xmm2
 
     pmaddubsw  xmm0, xmm5
     pmaddubsw  xmm2, xmm6
     paddw      xmm0, xmm2
     paddw      xmm0, xmm7
-	psrlw      xmm0, 6
+    psrlw      xmm0, 6
     packuswb   xmm0, xmm0
     movq       [r2],xmm0
 
@@ -278,16 +278,16 @@
     pmaddubsw  xmm2, xmm6
     paddw      xmm4, xmm2
     paddw      xmm4, xmm7
-	psrlw      xmm4, 6
+    psrlw      xmm4, 6
     packuswb   xmm4, xmm4
     movq       [r2+r3],xmm4
 
-	sub r5, 2
-	jnz .hloop_chroma
+    sub r5, 2
+    jnz .hloop_chroma
 
-	POP_XMM
-	LOAD_6_PARA_POP
+    POP_XMM
+    LOAD_6_PARA_POP
 
-	ret
+    ret
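
McChromaWidthEq8_ssse3 gets the same result with fewer unpacks: each source row is interleaved with its right neighbour at byte granularity (the movdqa / psrldq 1 / punpcklbw sequence), so a single pmaddubsw against packed (A,B) or (C,D) weight pairs performs both multiplies and the pairwise add at once. A hedged intrinsics sketch of that inner step; the code that fills xmm5/xmm6 from pABCD is in a hunk not shown in this excerpt, so the weight layout below is an assumption:

    #include <tmmintrin.h>   /* SSSE3 */

    /* top = bytes s0,s1, s1,s2, s2,s3, ...  (row interleaved with its right
     * neighbour); bot = the same for the next row.  wAB repeats the byte pair
     * (A,B), wCD repeats (C,D).  pmaddubsw multiplies the unsigned pixels by
     * the small (<= 64) signed weights and adds each pair into a 16-bit lane. */
    static __m128i ChromaRowSsse3_sketch(__m128i top, __m128i bot,
                                         __m128i wAB, __m128i wCD) {
        __m128i acc = _mm_maddubs_epi16(top, wAB);
        acc = _mm_add_epi16(acc, _mm_maddubs_epi16(bot, wCD));
        acc = _mm_add_epi16(acc, _mm_set1_epi16(32));   /* h264_d0x20 rounding */
        acc = _mm_srli_epi16(acc, 6);
        return _mm_packus_epi16(acc, acc);              /* low 8 bytes = output row */
    }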
 
 
--- a/codec/common/x86/mc_luma.asm
+++ b/codec/common/x86/mc_luma.asm
@@ -52,13 +52,13 @@
 
 ALIGN 16
 h264_w0x10:
-	dw 16, 16, 16, 16
+    dw 16, 16, 16, 16
 ALIGN 16
 h264_w0x10_1:
-	dw 16, 16, 16, 16, 16, 16, 16, 16
+    dw 16, 16, 16, 16, 16, 16, 16, 16
 ALIGN 16
 h264_mc_hc_32:
-	dw 32, 32, 32, 32, 32, 32, 32, 32
+    dw 32, 32, 32, 32, 32, 32, 32, 32
 
 
 ;*******************************************************************************
@@ -72,55 +72,55 @@
 ;*******************************************************************************
 ; void McHorVer20WidthEq4_mmx( const uint8_t *pSrc,
 ;                       int iSrcStride,
-;						uint8_t *pDst,
-;						int iDstStride,
-;						int iHeight)
+;                       uint8_t *pDst,
+;                       int iDstStride,
+;                       int iHeight)
 ;*******************************************************************************
 WELS_EXTERN McHorVer20WidthEq4_mmx
     %assign  push_num 0
     LOAD_5_PARA
-	SIGN_EXTENSION	r1, r1d
-	SIGN_EXTENSION	r3, r3d
-	SIGN_EXTENSION	r4, r4d
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
+    SIGN_EXTENSION  r4, r4d
 
-	sub r0, 2
-	WELS_Zero mm7
-	movq mm6, [h264_w0x10]
+    sub r0, 2
+    WELS_Zero mm7
+    movq mm6, [h264_w0x10]
 .height_loop:
-	movd mm0, [r0]
-	punpcklbw mm0, mm7
-	movd mm1, [r0+5]
-	punpcklbw mm1, mm7
-	movd mm2, [r0+1]
-	punpcklbw mm2, mm7
-	movd mm3, [r0+4]
-	punpcklbw mm3, mm7
-	movd mm4, [r0+2]
-	punpcklbw mm4, mm7
-	movd mm5, [r0+3]
-	punpcklbw mm5, mm7
+    movd mm0, [r0]
+    punpcklbw mm0, mm7
+    movd mm1, [r0+5]
+    punpcklbw mm1, mm7
+    movd mm2, [r0+1]
+    punpcklbw mm2, mm7
+    movd mm3, [r0+4]
+    punpcklbw mm3, mm7
+    movd mm4, [r0+2]
+    punpcklbw mm4, mm7
+    movd mm5, [r0+3]
+    punpcklbw mm5, mm7
 
-	paddw mm2, mm3
-	paddw mm4, mm5
-	psllw mm4, 2
-	psubw mm4, mm2
-	paddw mm0, mm1
-	paddw mm0, mm4
-	psllw mm4, 2
-	paddw mm0, mm4
-	paddw mm0, mm6
-	psraw mm0, 5
-	packuswb mm0, mm7
-	movd [r2], mm0
+    paddw mm2, mm3
+    paddw mm4, mm5
+    psllw mm4, 2
+    psubw mm4, mm2
+    paddw mm0, mm1
+    paddw mm0, mm4
+    psllw mm4, 2
+    paddw mm0, mm4
+    paddw mm0, mm6
+    psraw mm0, 5
+    packuswb mm0, mm7
+    movd [r2], mm0
 
-	add r0, r1
-	add r2, r3
-	dec r4
-	jnz .height_loop
+    add r0, r1
+    add r2, r3
+    dec r4
+    jnz .height_loop
 
-	WELSEMMS
-	LOAD_5_PARA_POP
-	ret
+    WELSEMMS
+    LOAD_5_PARA_POP
+    ret
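
McHorVer20WidthEq4_mmx, and the WidthEq8/WidthEq16 SSE2 versions further down, apply the standard H.264 six-tap half-pel filter (1, -5, 20, 20, -5, 1). The shift/subtract sequence in the loop is a strength-reduced form of that sum: with m = 4*(c+d) - (b+e), the value (a+f) + m + 4*m equals a - 5b + 20c + 20d - 5e + f without any multiplies. A scalar sketch (helper name is illustrative):

    #include <stdint.h>

    static void McHorVer20WidthEq4_ref(const uint8_t *src, int32_t srcStride,
                                       uint8_t *dst, int32_t dstStride, int32_t height) {
        src -= 2;                                   /* same as the "sub r0, 2" above */
        for (int y = 0; y < height; y++) {
            for (int x = 0; x < 4; x++) {
                const uint8_t *s = src + x;
                int m   = 4 * (s[2] + s[3]) - (s[1] + s[4]);
                int acc = (s[0] + s[5]) + m + 4 * m;     /* six-tap sum, no multiplies */
                int v   = (acc + 16) >> 5;               /* h264_w0x10 rounding        */
                dst[x]  = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);   /* packuswb clamp */
            }
            src += srcStride;
            dst += dstStride;
        }
    }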
 
 ;*******************************************************************************
 ; Macros and other preprocessor constants
@@ -128,26 +128,26 @@
 
 
 %macro SSE_LOAD_8P 3
-	movq %1, %3
-	punpcklbw %1, %2
+    movq %1, %3
+    punpcklbw %1, %2
 %endmacro
 
 %macro FILTER_HV_W8 9
-	paddw	%1, %6
-	movdqa	%8, %3
-	movdqa	%7, %2
-	paddw	%1, [h264_w0x10_1]
-	paddw	%8, %4
-	paddw	%7, %5
-	psllw	%8, 2
-	psubw	%8, %7
-	paddw	%1, %8
-	psllw	%8, 2
-	paddw	%1, %8
-	psraw   %1, 5
-	WELS_Zero %8
-	packuswb %1, %8
-	movq    %9, %1
+    paddw   %1, %6
+    movdqa  %8, %3
+    movdqa  %7, %2
+    paddw   %1, [h264_w0x10_1]
+    paddw   %8, %4
+    paddw   %7, %5
+    psllw   %8, 2
+    psubw   %8, %7
+    paddw   %1, %8
+    psllw   %8, 2
+    paddw   %1, %8
+    psraw   %1, 5
+    WELS_Zero %8
+    packuswb %1, %8
+    movq    %9, %1
 %endmacro
 
 ;*******************************************************************************
@@ -159,192 +159,192 @@
 ;***********************************************************************
 ; void McHorVer22Width8HorFirst_sse2(const int16_t *pSrc,
 ;                       int16_t iSrcStride,
-;						uint8_t *pDst,
-;						int32_t iDstStride
-;						int32_t iHeight
+;                       uint8_t *pDst,
+;                       int32_t iDstStride
+;                       int32_t iHeight
 ;                       )
 ;***********************************************************************
 WELS_EXTERN McHorVer22Width8HorFirst_sse2
-	%assign  push_num 0
+    %assign  push_num 0
     LOAD_5_PARA
     PUSH_XMM 8
-	SIGN_EXTENSION	r1, r1d
-	SIGN_EXTENSION	r3, r3d
-	SIGN_EXTENSION	r4, r4d
-	pxor xmm7, xmm7
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
+    SIGN_EXTENSION  r4, r4d
+    pxor xmm7, xmm7
 
-	sub r0, r1				;;;;;;;;need more 5 lines.
-	sub r0, r1
+    sub r0, r1              ;;;;;;;;need more 5 lines.
+    sub r0, r1
 
 .yloop_width_8:
-	movq xmm0, [r0]
-	punpcklbw xmm0, xmm7
-	movq xmm1, [r0+5]
-	punpcklbw xmm1, xmm7
-	movq xmm2, [r0+1]
-	punpcklbw xmm2, xmm7
-	movq xmm3, [r0+4]
-	punpcklbw xmm3, xmm7
-	movq xmm4, [r0+2]
-	punpcklbw xmm4, xmm7
-	movq xmm5, [r0+3]
-	punpcklbw xmm5, xmm7
+    movq xmm0, [r0]
+    punpcklbw xmm0, xmm7
+    movq xmm1, [r0+5]
+    punpcklbw xmm1, xmm7
+    movq xmm2, [r0+1]
+    punpcklbw xmm2, xmm7
+    movq xmm3, [r0+4]
+    punpcklbw xmm3, xmm7
+    movq xmm4, [r0+2]
+    punpcklbw xmm4, xmm7
+    movq xmm5, [r0+3]
+    punpcklbw xmm5, xmm7
 
-	paddw xmm2, xmm3
-	paddw xmm4, xmm5
-	psllw xmm4, 2
-	psubw xmm4, xmm2
-	paddw xmm0, xmm1
-	paddw xmm0, xmm4
-	psllw xmm4, 2
-	paddw xmm0, xmm4
-	movdqa [r2], xmm0
+    paddw xmm2, xmm3
+    paddw xmm4, xmm5
+    psllw xmm4, 2
+    psubw xmm4, xmm2
+    paddw xmm0, xmm1
+    paddw xmm0, xmm4
+    psllw xmm4, 2
+    paddw xmm0, xmm4
+    movdqa [r2], xmm0
 
-	add r0, r1
-	add r2, r3
-	dec r4
-	jnz .yloop_width_8
-	POP_XMM
-	LOAD_5_PARA_POP
-	ret
+    add r0, r1
+    add r2, r3
+    dec r4
+    jnz .yloop_width_8
+    POP_XMM
+    LOAD_5_PARA_POP
+    ret
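
McHorVer22Width8HorFirst_sse2 is the first half of the centre (half-pel in both x and y) case: it runs the same horizontal taps but writes the raw 16-bit sums to a temporary tap buffer (the movdqa [r2] store) instead of rounding and packing, so the vertical pass can finish at full precision. In scalar terms each stored tap is the following, assuming the layout sketched here:

    #include <stdint.h>

    /* One intermediate tap of the 2D half-pel path; s points two pixels left
     * of the centre sample.  The value stays within int16_t (about -2550..10710). */
    static inline int16_t HorTap6_sketch(const uint8_t *s) {
        int m = 4 * (s[2] + s[3]) - (s[1] + s[4]);
        return (int16_t)((s[0] + s[5]) + m + 4 * m);
    }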
 
 ;*******************************************************************************
 ; void McHorVer20WidthEq8_sse2(  const uint8_t *pSrc,
 ;                       int iSrcStride,
-;												uint8_t *pDst,
-;												int iDstStride,
-;												int iHeight,
+;                                               uint8_t *pDst,
+;                                               int iDstStride,
+;                                               int iHeight,
 ;                      );
 ;*******************************************************************************
 WELS_EXTERN McHorVer20WidthEq8_sse2
-	%assign  push_num 0
+    %assign  push_num 0
     LOAD_5_PARA
     PUSH_XMM 8
-	SIGN_EXTENSION	r1, r1d
-	SIGN_EXTENSION	r3, r3d
-	SIGN_EXTENSION	r4, r4d
-	lea r0, [r0-2]            ;pSrc -= 2;
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
+    SIGN_EXTENSION  r4, r4d
+    lea r0, [r0-2]            ;pSrc -= 2;
 
-	pxor xmm7, xmm7
-	movdqa xmm6, [h264_w0x10_1]
+    pxor xmm7, xmm7
+    movdqa xmm6, [h264_w0x10_1]
 .y_loop:
-	movq xmm0, [r0]
-	punpcklbw xmm0, xmm7
-	movq xmm1, [r0+5]
-	punpcklbw xmm1, xmm7
-	movq xmm2, [r0+1]
-	punpcklbw xmm2, xmm7
-	movq xmm3, [r0+4]
-	punpcklbw xmm3, xmm7
-	movq xmm4, [r0+2]
-	punpcklbw xmm4, xmm7
-	movq xmm5, [r0+3]
-	punpcklbw xmm5, xmm7
+    movq xmm0, [r0]
+    punpcklbw xmm0, xmm7
+    movq xmm1, [r0+5]
+    punpcklbw xmm1, xmm7
+    movq xmm2, [r0+1]
+    punpcklbw xmm2, xmm7
+    movq xmm3, [r0+4]
+    punpcklbw xmm3, xmm7
+    movq xmm4, [r0+2]
+    punpcklbw xmm4, xmm7
+    movq xmm5, [r0+3]
+    punpcklbw xmm5, xmm7
 
-	paddw xmm2, xmm3
-	paddw xmm4, xmm5
-	psllw xmm4, 2
-	psubw xmm4, xmm2
-	paddw xmm0, xmm1
-	paddw xmm0, xmm4
-	psllw xmm4, 2
-	paddw xmm0, xmm4
-	paddw xmm0, xmm6
-	psraw xmm0, 5
+    paddw xmm2, xmm3
+    paddw xmm4, xmm5
+    psllw xmm4, 2
+    psubw xmm4, xmm2
+    paddw xmm0, xmm1
+    paddw xmm0, xmm4
+    psllw xmm4, 2
+    paddw xmm0, xmm4
+    paddw xmm0, xmm6
+    psraw xmm0, 5
 
-	packuswb xmm0, xmm7
-	movq [r2], xmm0
+    packuswb xmm0, xmm7
+    movq [r2], xmm0
 
-	lea r2, [r2+r3]
-	lea r0, [r0+r1]
-	dec r4
-	jnz near .y_loop
+    lea r2, [r2+r3]
+    lea r0, [r0+r1]
+    dec r4
+    jnz near .y_loop
 
-	POP_XMM
-	LOAD_5_PARA_POP
-	ret
+    POP_XMM
+    LOAD_5_PARA_POP
+    ret
 
 ;*******************************************************************************
 ; void McHorVer20WidthEq16_sse2(  const uint8_t *pSrc,
 ;                       int iSrcStride,
-;												uint8_t *pDst,
-;												int iDstStride,
-;												int iHeight,
+;                                               uint8_t *pDst,
+;                                               int iDstStride,
+;                                               int iHeight,
 ;                      );
 ;*******************************************************************************
 WELS_EXTERN McHorVer20WidthEq16_sse2
-	%assign  push_num 0
+    %assign  push_num 0
     LOAD_5_PARA
     PUSH_XMM 8
-	SIGN_EXTENSION	r1, r1d
-	SIGN_EXTENSION	r3, r3d
-	SIGN_EXTENSION	r4, r4d
-	lea r0, [r0-2]            ;pSrc -= 2;
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
+    SIGN_EXTENSION  r4, r4d
+    lea r0, [r0-2]            ;pSrc -= 2;
 
-	pxor xmm7, xmm7
-	movdqa xmm6, [h264_w0x10_1]
+    pxor xmm7, xmm7
+    movdqa xmm6, [h264_w0x10_1]
 .y_loop:
 
-	movq xmm0, [r0]
-	punpcklbw xmm0, xmm7
-	movq xmm1, [r0+5]
-	punpcklbw xmm1, xmm7
-	movq xmm2, [r0+1]
-	punpcklbw xmm2, xmm7
-	movq xmm3, [r0+4]
-	punpcklbw xmm3, xmm7
-	movq xmm4, [r0+2]
-	punpcklbw xmm4, xmm7
-	movq xmm5, [r0+3]
-	punpcklbw xmm5, xmm7
+    movq xmm0, [r0]
+    punpcklbw xmm0, xmm7
+    movq xmm1, [r0+5]
+    punpcklbw xmm1, xmm7
+    movq xmm2, [r0+1]
+    punpcklbw xmm2, xmm7
+    movq xmm3, [r0+4]
+    punpcklbw xmm3, xmm7
+    movq xmm4, [r0+2]
+    punpcklbw xmm4, xmm7
+    movq xmm5, [r0+3]
+    punpcklbw xmm5, xmm7
 
-	paddw xmm2, xmm3
-	paddw xmm4, xmm5
-	psllw xmm4, 2
-	psubw xmm4, xmm2
-	paddw xmm0, xmm1
-	paddw xmm0, xmm4
-	psllw xmm4, 2
-	paddw xmm0, xmm4
-	paddw xmm0, xmm6
-	psraw xmm0, 5
-	packuswb xmm0, xmm7
-	movq [r2], xmm0
+    paddw xmm2, xmm3
+    paddw xmm4, xmm5
+    psllw xmm4, 2
+    psubw xmm4, xmm2
+    paddw xmm0, xmm1
+    paddw xmm0, xmm4
+    psllw xmm4, 2
+    paddw xmm0, xmm4
+    paddw xmm0, xmm6
+    psraw xmm0, 5
+    packuswb xmm0, xmm7
+    movq [r2], xmm0
 
-	movq xmm0, [r0+8]
-	punpcklbw xmm0, xmm7
-	movq xmm1, [r0+5+8]
-	punpcklbw xmm1, xmm7
-	movq xmm2, [r0+1+8]
-	punpcklbw xmm2, xmm7
-	movq xmm3, [r0+4+8]
-	punpcklbw xmm3, xmm7
-	movq xmm4, [r0+2+8]
-	punpcklbw xmm4, xmm7
-	movq xmm5, [r0+3+8]
-	punpcklbw xmm5, xmm7
+    movq xmm0, [r0+8]
+    punpcklbw xmm0, xmm7
+    movq xmm1, [r0+5+8]
+    punpcklbw xmm1, xmm7
+    movq xmm2, [r0+1+8]
+    punpcklbw xmm2, xmm7
+    movq xmm3, [r0+4+8]
+    punpcklbw xmm3, xmm7
+    movq xmm4, [r0+2+8]
+    punpcklbw xmm4, xmm7
+    movq xmm5, [r0+3+8]
+    punpcklbw xmm5, xmm7
 
-	paddw xmm2, xmm3
-	paddw xmm4, xmm5
-	psllw xmm4, 2
-	psubw xmm4, xmm2
-	paddw xmm0, xmm1
-	paddw xmm0, xmm4
-	psllw xmm4, 2
-	paddw xmm0, xmm4
-	paddw xmm0, xmm6
-	psraw xmm0, 5
-	packuswb xmm0, xmm7
-	movq [r2+8], xmm0
+    paddw xmm2, xmm3
+    paddw xmm4, xmm5
+    psllw xmm4, 2
+    psubw xmm4, xmm2
+    paddw xmm0, xmm1
+    paddw xmm0, xmm4
+    psllw xmm4, 2
+    paddw xmm0, xmm4
+    paddw xmm0, xmm6
+    psraw xmm0, 5
+    packuswb xmm0, xmm7
+    movq [r2+8], xmm0
 
-	lea r2, [r2+r3]
-	lea r0, [r0+r1]
-	dec r4
-	jnz near .y_loop
+    lea r2, [r2+r3]
+    lea r0, [r0+r1]
+    dec r4
+    jnz near .y_loop
 
-	POP_XMM
-	LOAD_5_PARA_POP
-	ret
+    POP_XMM
+    LOAD_5_PARA_POP
+    ret
 
 
 ;*******************************************************************************
@@ -355,81 +355,81 @@
 ;                       int iHeight )
 ;*******************************************************************************
 WELS_EXTERN McHorVer02WidthEq8_sse2
-	%assign  push_num 0
+    %assign  push_num 0
     LOAD_5_PARA
     PUSH_XMM 8
-	SIGN_EXTENSION	r1, r1d
-	SIGN_EXTENSION	r3, r3d
-	SIGN_EXTENSION	r4, r4d
-	sub r0, r1
-	sub r0, r1
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
+    SIGN_EXTENSION  r4, r4d
+    sub r0, r1
+    sub r0, r1
 
-	WELS_Zero xmm7
+    WELS_Zero xmm7
 
-	SSE_LOAD_8P xmm0, xmm7, [r0]
-	SSE_LOAD_8P xmm1, xmm7, [r0+r1]
-	lea r0, [r0+2*r1]
-	SSE_LOAD_8P xmm2, xmm7, [r0]
-	SSE_LOAD_8P xmm3, xmm7, [r0+r1]
-	lea r0, [r0+2*r1]
-	SSE_LOAD_8P xmm4, xmm7, [r0]
-	SSE_LOAD_8P xmm5, xmm7, [r0+r1]
+    SSE_LOAD_8P xmm0, xmm7, [r0]
+    SSE_LOAD_8P xmm1, xmm7, [r0+r1]
+    lea r0, [r0+2*r1]
+    SSE_LOAD_8P xmm2, xmm7, [r0]
+    SSE_LOAD_8P xmm3, xmm7, [r0+r1]
+    lea r0, [r0+2*r1]
+    SSE_LOAD_8P xmm4, xmm7, [r0]
+    SSE_LOAD_8P xmm5, xmm7, [r0+r1]
 
 .start:
-	FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
-	dec r4
-	jz near .xx_exit
+    FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+    dec r4
+    jz near .xx_exit
 
-	lea r0, [r0+2*r1]
-	SSE_LOAD_8P xmm6, xmm7, [r0]
-	FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
-	dec r4
-	jz near .xx_exit
+    lea r0, [r0+2*r1]
+    SSE_LOAD_8P xmm6, xmm7, [r0]
+    FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
+    dec r4
+    jz near .xx_exit
 
-	lea r2, [r2+2*r3]
-	SSE_LOAD_8P xmm7, xmm0, [r0+r1]
-	FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
-	dec r4
-	jz near .xx_exit
+    lea r2, [r2+2*r3]
+    SSE_LOAD_8P xmm7, xmm0, [r0+r1]
+    FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
+    dec r4
+    jz near .xx_exit
 
-	lea r0, [r0+2*r1]
-	SSE_LOAD_8P xmm0, xmm1, [r0]
-	FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
-	dec r4
-	jz near .xx_exit
+    lea r0, [r0+2*r1]
+    SSE_LOAD_8P xmm0, xmm1, [r0]
+    FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
+    dec r4
+    jz near .xx_exit
 
-	lea r2, [r2+2*r3]
-	SSE_LOAD_8P xmm1, xmm2, [r0+r1]
-	FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
-	dec r4
-	jz near .xx_exit
+    lea r2, [r2+2*r3]
+    SSE_LOAD_8P xmm1, xmm2, [r0+r1]
+    FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
+    dec r4
+    jz near .xx_exit
 
-	lea r0, [r0+2*r1]
-	SSE_LOAD_8P xmm2, xmm3, [r0]
-	FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
-	dec r4
-	jz near .xx_exit
+    lea r0, [r0+2*r1]
+    SSE_LOAD_8P xmm2, xmm3, [r0]
+    FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
+    dec r4
+    jz near .xx_exit
 
-	lea r2, [r2+2*r3]
-	SSE_LOAD_8P xmm3, xmm4, [r0+r1]
-	FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
-	dec r4
-	jz near .xx_exit
+    lea r2, [r2+2*r3]
+    SSE_LOAD_8P xmm3, xmm4, [r0+r1]
+    FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
+    dec r4
+    jz near .xx_exit
 
-	lea r0, [r0+2*r1]
-	SSE_LOAD_8P xmm4, xmm5, [r0]
-	FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
-	dec r4
-	jz near .xx_exit
+    lea r0, [r0+2*r1]
+    SSE_LOAD_8P xmm4, xmm5, [r0]
+    FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
+    dec r4
+    jz near .xx_exit
 
-	lea r2, [r2+2*r3]
-	SSE_LOAD_8P xmm5, xmm6, [r0+r1]
-	jmp near .start
+    lea r2, [r2+2*r3]
+    SSE_LOAD_8P xmm5, xmm6, [r0+r1]
+    jmp near .start
 
 .xx_exit:
-	POP_XMM
-	LOAD_5_PARA_POP
-	ret
+    POP_XMM
+    LOAD_5_PARA_POP
+    ret
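
McHorVer02WidthEq8_sse2 keeps the six most recent rows in xmm0..xmm5 and, for every output row, loads just one new row and rotates which register plays which tap; the eight-way unrolled .start loop above is that rotation written out, so no register shuffling is needed in the steady state. A rough scalar equivalent of the same sliding window (illustrative):

    #include <stdint.h>

    static void McHorVer02WidthEq8_ref(const uint8_t *src, int32_t srcStride,
                                       uint8_t *dst, int32_t dstStride, int32_t height) {
        const uint8_t *row[6];
        for (int i = 0; i < 6; i++)                 /* two rows above, three below */
            row[i] = src + (i - 2) * srcStride;
        for (int y = 0; y < height; y++) {
            for (int x = 0; x < 8; x++) {
                int m   = 4 * (row[2][x] + row[3][x]) - (row[1][x] + row[4][x]);
                int acc = (row[0][x] + row[5][x]) + m + 4 * m;
                int v   = (acc + 16) >> 5;
                dst[x]  = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
            }
            for (int i = 0; i < 5; i++)             /* slide the six-row window */
                row[i] = row[i + 1];
            row[5] += srcStride;
            dst += dstStride;
        }
    }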
 
 ;***********************************************************************
 ; Code
@@ -440,725 +440,725 @@
 
 
 ;***********************************************************************
-; void McHorVer02Height9Or17_sse2(	const uint8_t *pSrc,
+; void McHorVer02Height9Or17_sse2(  const uint8_t *pSrc,
 ;                       int32_t iSrcStride,
 ;                       uint8_t *pDst,
 ;                       int32_t iDstStride,
-;						int32_t iWidth,
+;                       int32_t iWidth,
 ;                       int32_t iHeight )
 ;***********************************************************************
 WELS_EXTERN McHorVer02Height9Or17_sse2
-	%assign  push_num 0
+    %assign  push_num 0
     LOAD_6_PARA
     PUSH_XMM 8
-	SIGN_EXTENSION	r1, r1d
-	SIGN_EXTENSION	r3, r3d
-	SIGN_EXTENSION	r4, r4d
-	SIGN_EXTENSION	r5, r5d
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
+    SIGN_EXTENSION  r4, r4d
+    SIGN_EXTENSION  r5, r5d
 
 %ifndef X86_32
-	push r12
-	push r13
-	push r14
-	mov  r12, r0
-	mov	 r13, r2
-	mov	 r14, r5
+    push r12
+    push r13
+    push r14
+    mov  r12, r0
+    mov  r13, r2
+    mov  r14, r5
 %endif
 
-	shr r4, 3
-	sub r0, r1
-	sub r0, r1
+    shr r4, 3
+    sub r0, r1
+    sub r0, r1
 
 .xloop:
-	WELS_Zero xmm7
-	SSE_LOAD_8P xmm0, xmm7, [r0]
-	SSE_LOAD_8P xmm1, xmm7, [r0+r1]
-	lea r0, [r0+2*r1]
-	SSE_LOAD_8P xmm2, xmm7, [r0]
-	SSE_LOAD_8P xmm3, xmm7, [r0+r1]
-	lea r0, [r0+2*r1]
-	SSE_LOAD_8P xmm4, xmm7, [r0]
-	SSE_LOAD_8P xmm5, xmm7, [r0+r1]
+    WELS_Zero xmm7
+    SSE_LOAD_8P xmm0, xmm7, [r0]
+    SSE_LOAD_8P xmm1, xmm7, [r0+r1]
+    lea r0, [r0+2*r1]
+    SSE_LOAD_8P xmm2, xmm7, [r0]
+    SSE_LOAD_8P xmm3, xmm7, [r0+r1]
+    lea r0, [r0+2*r1]
+    SSE_LOAD_8P xmm4, xmm7, [r0]
+    SSE_LOAD_8P xmm5, xmm7, [r0+r1]
 
-	FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
-	dec r5
-	lea r0, [r0+2*r1]
-	SSE_LOAD_8P xmm6, xmm7, [r0]
-	movdqa xmm0,xmm1
-	movdqa xmm1,xmm2
-	movdqa xmm2,xmm3
-	movdqa xmm3,xmm4
-	movdqa xmm4,xmm5
-	movdqa xmm5,xmm6
-	add r2, r3
-	sub r0, r1
+    FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+    dec r5
+    lea r0, [r0+2*r1]
+    SSE_LOAD_8P xmm6, xmm7, [r0]
+    movdqa xmm0,xmm1
+    movdqa xmm1,xmm2
+    movdqa xmm2,xmm3
+    movdqa xmm3,xmm4
+    movdqa xmm4,xmm5
+    movdqa xmm5,xmm6
+    add r2, r3
+    sub r0, r1
 
 .start:
-	FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
-	dec r5
-	jz near .x_loop_dec
+    FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+    dec r5
+    jz near .x_loop_dec
 
-	lea r0, [r0+2*r1]
-	SSE_LOAD_8P xmm6, xmm7, [r0]
-	FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
-	dec r5
-	jz near .x_loop_dec
+    lea r0, [r0+2*r1]
+    SSE_LOAD_8P xmm6, xmm7, [r0]
+    FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
+    dec r5
+    jz near .x_loop_dec
 
-	lea r2, [r2+2*r3]
-	SSE_LOAD_8P xmm7, xmm0, [r0+r1]
-	FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
-	dec r5
-	jz near .x_loop_dec
+    lea r2, [r2+2*r3]
+    SSE_LOAD_8P xmm7, xmm0, [r0+r1]
+    FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
+    dec r5
+    jz near .x_loop_dec
 
-	lea r0, [r0+2*r1]
-	SSE_LOAD_8P xmm0, xmm1, [r0]
-	FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
-	dec r5
-	jz near .x_loop_dec
+    lea r0, [r0+2*r1]
+    SSE_LOAD_8P xmm0, xmm1, [r0]
+    FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
+    dec r5
+    jz near .x_loop_dec
 
-	lea r2, [r2+2*r3]
-	SSE_LOAD_8P xmm1, xmm2, [r0+r1]
-	FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
-	dec r5
-	jz near .x_loop_dec
+    lea r2, [r2+2*r3]
+    SSE_LOAD_8P xmm1, xmm2, [r0+r1]
+    FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
+    dec r5
+    jz near .x_loop_dec
 
-	lea r0, [r0+2*r1]
-	SSE_LOAD_8P xmm2, xmm3, [r0]
-	FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
-	dec r5
-	jz near .x_loop_dec
+    lea r0, [r0+2*r1]
+    SSE_LOAD_8P xmm2, xmm3, [r0]
+    FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
+    dec r5
+    jz near .x_loop_dec
 
-	lea r2, [r2+2*r3]
-	SSE_LOAD_8P xmm3, xmm4, [r0+r1]
-	FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
-	dec r5
-	jz near .x_loop_dec
+    lea r2, [r2+2*r3]
+    SSE_LOAD_8P xmm3, xmm4, [r0+r1]
+    FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
+    dec r5
+    jz near .x_loop_dec
 
-	lea r0, [r0+2*r1]
-	SSE_LOAD_8P xmm4, xmm5, [r0]
-	FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
-	dec r5
-	jz near .x_loop_dec
+    lea r0, [r0+2*r1]
+    SSE_LOAD_8P xmm4, xmm5, [r0]
+    FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
+    dec r5
+    jz near .x_loop_dec
 
-	lea r2, [r2+2*r3]
-	SSE_LOAD_8P xmm5, xmm6, [r0+r1]
-	jmp near .start
+    lea r2, [r2+2*r3]
+    SSE_LOAD_8P xmm5, xmm6, [r0+r1]
+    jmp near .start
 
 .x_loop_dec:
-	dec r4
-	jz  near .xx_exit
+    dec r4
+    jz  near .xx_exit
 %ifdef X86_32
-	mov	r0, arg1
-	mov r2, arg3
-	mov r5, arg6
+    mov r0, arg1
+    mov r2, arg3
+    mov r5, arg6
 %else
-	mov r0, r12
-	mov r2, r13
-	mov r5, r14
+    mov r0, r12
+    mov r2, r13
+    mov r5, r14
 %endif
-	sub r0, r1
-	sub r0, r1
-	add r0, 8
-	add r2, 8
-	jmp near .xloop
+    sub r0, r1
+    sub r0, r1
+    add r0, 8
+    add r2, 8
+    jmp near .xloop
 
 .xx_exit:
 %ifndef X86_32
-	pop r14
-	pop r13
-	pop r12
+    pop r14
+    pop r13
+    pop r12
 %endif
-	POP_XMM
-	LOAD_6_PARA_POP
-	ret
+    POP_XMM
+    LOAD_6_PARA_POP
+    ret
 
 
 ;***********************************************************************
-; void McHorVer20Width9Or17_sse2(		const uint8_t *pSrc,
+; void McHorVer20Width9Or17_sse2(       const uint8_t *pSrc,
 ;                       int32_t iSrcStride,
-;						uint8_t *pDst,
-;						int32_t iDstStride,
-;						int32_t iWidth,
-;						int32_t iHeight
+;                       uint8_t *pDst,
+;                       int32_t iDstStride,
+;                       int32_t iWidth,
+;                       int32_t iHeight
 ;                      );
 ;***********************************************************************
 WELS_EXTERN McHorVer20Width9Or17_sse2
-	%assign  push_num 0
+    %assign  push_num 0
     LOAD_6_PARA
     PUSH_XMM 8
-	SIGN_EXTENSION	r1, r1d
-	SIGN_EXTENSION	r3, r3d
-	SIGN_EXTENSION	r4, r4d
-	SIGN_EXTENSION	r5, r5d
-	sub r0, 2
-	pxor xmm7, xmm7
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
+    SIGN_EXTENSION  r4, r4d
+    SIGN_EXTENSION  r5, r5d
+    sub r0, 2
+    pxor xmm7, xmm7
 
-	cmp r4, 9
-	jne near .width_17
+    cmp r4, 9
+    jne near .width_17
 
 .yloop_width_9:
-	movq xmm0, [r0]
-	punpcklbw xmm0, xmm7
-	movq xmm1, [r0+5]
-	punpcklbw xmm1, xmm7
-	movq xmm2, [r0+1]
-	punpcklbw xmm2, xmm7
-	movq xmm3, [r0+4]
-	punpcklbw xmm3, xmm7
-	movq xmm4, [r0+2]
-	punpcklbw xmm4, xmm7
-	movq xmm5, [r0+3]
-	punpcklbw xmm5, xmm7
+    movq xmm0, [r0]
+    punpcklbw xmm0, xmm7
+    movq xmm1, [r0+5]
+    punpcklbw xmm1, xmm7
+    movq xmm2, [r0+1]
+    punpcklbw xmm2, xmm7
+    movq xmm3, [r0+4]
+    punpcklbw xmm3, xmm7
+    movq xmm4, [r0+2]
+    punpcklbw xmm4, xmm7
+    movq xmm5, [r0+3]
+    punpcklbw xmm5, xmm7
 
-	movdqa xmm7, xmm2
-	paddw   xmm7, xmm3
-	movdqa xmm6, xmm4
-	paddw   xmm6, xmm5
-	psllw xmm6, 2
-	psubw xmm6, xmm7
-	paddw xmm0, xmm1
-	paddw xmm0, xmm6
-	psllw xmm6, 2
-	paddw xmm0, xmm6
-	paddw xmm0, [h264_w0x10_1]
-	psraw  xmm0, 5
-	packuswb xmm0, xmm0
-	movd [r2], xmm0
+    movdqa xmm7, xmm2
+    paddw   xmm7, xmm3
+    movdqa xmm6, xmm4
+    paddw   xmm6, xmm5
+    psllw xmm6, 2
+    psubw xmm6, xmm7
+    paddw xmm0, xmm1
+    paddw xmm0, xmm6
+    psllw xmm6, 2
+    paddw xmm0, xmm6
+    paddw xmm0, [h264_w0x10_1]
+    psraw  xmm0, 5
+    packuswb xmm0, xmm0
+    movd [r2], xmm0
 
-	pxor  xmm7, xmm7
-	movq xmm0, [r0+6]
-	punpcklbw xmm0, xmm7
+    pxor  xmm7, xmm7
+    movq xmm0, [r0+6]
+    punpcklbw xmm0, xmm7
 
-	paddw xmm4, xmm1
-	paddw xmm5, xmm3
-	psllw xmm5, 2
-	psubw xmm5, xmm4
-	paddw xmm2, xmm0
-	paddw xmm2, xmm5
-	psllw xmm5, 2
-	paddw xmm2, xmm5
-	paddw xmm2, [h264_w0x10_1]
-	psraw  xmm2, 5
-	packuswb xmm2, xmm2
-	movq [r2+1], xmm2
+    paddw xmm4, xmm1
+    paddw xmm5, xmm3
+    psllw xmm5, 2
+    psubw xmm5, xmm4
+    paddw xmm2, xmm0
+    paddw xmm2, xmm5
+    psllw xmm5, 2
+    paddw xmm2, xmm5
+    paddw xmm2, [h264_w0x10_1]
+    psraw  xmm2, 5
+    packuswb xmm2, xmm2
+    movq [r2+1], xmm2
 
-	add r0, r1
-	add r2, r3
-	dec r5
-	jnz .yloop_width_9
-	POP_XMM
-	LOAD_6_PARA_POP
-	ret
+    add r0, r1
+    add r2, r3
+    dec r5
+    jnz .yloop_width_9
+    POP_XMM
+    LOAD_6_PARA_POP
+    ret
 
 
 .width_17:
 .yloop_width_17:
-	movq xmm0, [r0]
-	punpcklbw xmm0, xmm7
-	movq xmm1, [r0+5]
-	punpcklbw xmm1, xmm7
-	movq xmm2, [r0+1]
-	punpcklbw xmm2, xmm7
-	movq xmm3, [r0+4]
-	punpcklbw xmm3, xmm7
-	movq xmm4, [r0+2]
-	punpcklbw xmm4, xmm7
-	movq xmm5, [r0+3]
-	punpcklbw xmm5, xmm7
+    movq xmm0, [r0]
+    punpcklbw xmm0, xmm7
+    movq xmm1, [r0+5]
+    punpcklbw xmm1, xmm7
+    movq xmm2, [r0+1]
+    punpcklbw xmm2, xmm7
+    movq xmm3, [r0+4]
+    punpcklbw xmm3, xmm7
+    movq xmm4, [r0+2]
+    punpcklbw xmm4, xmm7
+    movq xmm5, [r0+3]
+    punpcklbw xmm5, xmm7
 
-	paddw xmm2, xmm3
-	paddw xmm4, xmm5
-	psllw xmm4, 2
-	psubw xmm4, xmm2
-	paddw xmm0, xmm1
-	paddw xmm0, xmm4
-	psllw xmm4, 2
-	paddw xmm0, xmm4
-	paddw xmm0, [h264_w0x10_1]
-	psraw  xmm0, 5
-	packuswb xmm0, xmm0
-	movq [r2], xmm0
+    paddw xmm2, xmm3
+    paddw xmm4, xmm5
+    psllw xmm4, 2
+    psubw xmm4, xmm2
+    paddw xmm0, xmm1
+    paddw xmm0, xmm4
+    psllw xmm4, 2
+    paddw xmm0, xmm4
+    paddw xmm0, [h264_w0x10_1]
+    psraw  xmm0, 5
+    packuswb xmm0, xmm0
+    movq [r2], xmm0
 
-	movq xmm0, [r0+8]
-	punpcklbw xmm0, xmm7
-	movq xmm1, [r0+5+8]
-	punpcklbw xmm1, xmm7
-	movq xmm2, [r0+1+8]
-	punpcklbw xmm2, xmm7
-	movq xmm3, [r0+4+8]
-	punpcklbw xmm3, xmm7
-	movq xmm4, [r0+2+8]
-	punpcklbw xmm4, xmm7
-	movq xmm5, [r0+3+8]
-	punpcklbw xmm5, xmm7
+    movq xmm0, [r0+8]
+    punpcklbw xmm0, xmm7
+    movq xmm1, [r0+5+8]
+    punpcklbw xmm1, xmm7
+    movq xmm2, [r0+1+8]
+    punpcklbw xmm2, xmm7
+    movq xmm3, [r0+4+8]
+    punpcklbw xmm3, xmm7
+    movq xmm4, [r0+2+8]
+    punpcklbw xmm4, xmm7
+    movq xmm5, [r0+3+8]
+    punpcklbw xmm5, xmm7
 
-	movdqa xmm7, xmm2
-	paddw   xmm7, xmm3
-	movdqa xmm6, xmm4
-	paddw   xmm6, xmm5
-	psllw xmm6, 2
-	psubw xmm6, xmm7
-	paddw xmm0, xmm1
-	paddw xmm0, xmm6
-	psllw xmm6, 2
-	paddw xmm0, xmm6
-	paddw xmm0, [h264_w0x10_1]
-	psraw  xmm0, 5
-	packuswb xmm0, xmm0
-	movd [r2+8], xmm0
+    movdqa xmm7, xmm2
+    paddw   xmm7, xmm3
+    movdqa xmm6, xmm4
+    paddw   xmm6, xmm5
+    psllw xmm6, 2
+    psubw xmm6, xmm7
+    paddw xmm0, xmm1
+    paddw xmm0, xmm6
+    psllw xmm6, 2
+    paddw xmm0, xmm6
+    paddw xmm0, [h264_w0x10_1]
+    psraw  xmm0, 5
+    packuswb xmm0, xmm0
+    movd [r2+8], xmm0
 
 
-	pxor  xmm7, xmm7
-	movq xmm0, [r0+6+8]
-	punpcklbw xmm0, xmm7
+    pxor  xmm7, xmm7
+    movq xmm0, [r0+6+8]
+    punpcklbw xmm0, xmm7
 
-	paddw xmm4, xmm1
-	paddw xmm5, xmm3
-	psllw xmm5, 2
-	psubw xmm5, xmm4
-	paddw xmm2, xmm0
-	paddw xmm2, xmm5
-	psllw xmm5, 2
-	paddw xmm2, xmm5
-	paddw xmm2, [h264_w0x10_1]
-	psraw  xmm2, 5
-	packuswb xmm2, xmm2
-	movq [r2+9], xmm2
-	add r0, r1
-	add r2, r3
-	dec r5
-	jnz .yloop_width_17
-	POP_XMM
-	LOAD_6_PARA_POP
-	ret
+    paddw xmm4, xmm1
+    paddw xmm5, xmm3
+    psllw xmm5, 2
+    psubw xmm5, xmm4
+    paddw xmm2, xmm0
+    paddw xmm2, xmm5
+    psllw xmm5, 2
+    paddw xmm2, xmm5
+    paddw xmm2, [h264_w0x10_1]
+    psraw  xmm2, 5
+    packuswb xmm2, xmm2
+    movq [r2+9], xmm2
+    add r0, r1
+    add r2, r3
+    dec r5
+    jnz .yloop_width_17
+    POP_XMM
+    LOAD_6_PARA_POP
+    ret
 
 
 
 ;***********************************************************************
 ;void McHorVer22HorFirst_sse2
-;							(const uint8_t *pSrc,
-;							int32_t iSrcStride,
-;							uint8_t * pTap,
-;							int32_t iTapStride,
-;							int32_t iWidth,int32_t iHeight);
+;                           (const uint8_t *pSrc,
+;                           int32_t iSrcStride,
+;                           uint8_t * pTap,
+;                           int32_t iTapStride,
+;                           int32_t iWidth,int32_t iHeight);
 ;***********************************************************************
 WELS_EXTERN McHorVer22HorFirst_sse2
-	%assign  push_num 0
+    %assign  push_num 0
     LOAD_6_PARA
     PUSH_XMM 8
-	SIGN_EXTENSION	r1, r1d
-	SIGN_EXTENSION	r3, r3d
-	SIGN_EXTENSION	r4, r4d
-	SIGN_EXTENSION	r5, r5d
-	pxor xmm7, xmm7
-	sub r0, r1				;;;;;;;;need more 5 lines.
-	sub r0, r1
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
+    SIGN_EXTENSION  r4, r4d
+    SIGN_EXTENSION  r5, r5d
+    pxor xmm7, xmm7
+    sub r0, r1              ;;;;;;;;need more 5 lines.
+    sub r0, r1
 
-	cmp r4, 9
-	jne near .width_17
+    cmp r4, 9
+    jne near .width_17
 
 .yloop_width_9:
-	movq xmm0, [r0]
-	punpcklbw xmm0, xmm7
-	movq xmm1, [r0+5]
-	punpcklbw xmm1, xmm7
-	movq xmm2, [r0+1]
-	punpcklbw xmm2, xmm7
-	movq xmm3, [r0+4]
-	punpcklbw xmm3, xmm7
-	movq xmm4, [r0+2]
-	punpcklbw xmm4, xmm7
-	movq xmm5, [r0+3]
-	punpcklbw xmm5, xmm7
+    movq xmm0, [r0]
+    punpcklbw xmm0, xmm7
+    movq xmm1, [r0+5]
+    punpcklbw xmm1, xmm7
+    movq xmm2, [r0+1]
+    punpcklbw xmm2, xmm7
+    movq xmm3, [r0+4]
+    punpcklbw xmm3, xmm7
+    movq xmm4, [r0+2]
+    punpcklbw xmm4, xmm7
+    movq xmm5, [r0+3]
+    punpcklbw xmm5, xmm7
 
-	movdqa xmm7, xmm2
-	paddw   xmm7, xmm3
-	movdqa xmm6, xmm4
-	paddw   xmm6, xmm5
-	psllw xmm6, 2
-	psubw xmm6, xmm7
-	paddw xmm0, xmm1
-	paddw xmm0, xmm6
-	psllw xmm6, 2
-	paddw xmm0, xmm6
-	movd [r2], xmm0
+    movdqa xmm7, xmm2
+    paddw   xmm7, xmm3
+    movdqa xmm6, xmm4
+    paddw   xmm6, xmm5
+    psllw xmm6, 2
+    psubw xmm6, xmm7
+    paddw xmm0, xmm1
+    paddw xmm0, xmm6
+    psllw xmm6, 2
+    paddw xmm0, xmm6
+    movd [r2], xmm0
 
-	pxor  xmm7, xmm7
-	movq xmm0, [r0+6]
-	punpcklbw xmm0, xmm7
+    pxor  xmm7, xmm7
+    movq xmm0, [r0+6]
+    punpcklbw xmm0, xmm7
 
-	paddw xmm4, xmm1
-	paddw xmm5, xmm3
-	psllw xmm5, 2
-	psubw xmm5, xmm4
-	paddw xmm2, xmm0
-	paddw xmm2, xmm5
-	psllw xmm5, 2
-	paddw xmm2, xmm5
-	movq [r2+2], xmm2
-	movhps [r2+2+8], xmm2
+    paddw xmm4, xmm1
+    paddw xmm5, xmm3
+    psllw xmm5, 2
+    psubw xmm5, xmm4
+    paddw xmm2, xmm0
+    paddw xmm2, xmm5
+    psllw xmm5, 2
+    paddw xmm2, xmm5
+    movq [r2+2], xmm2
+    movhps [r2+2+8], xmm2
 
-	add r0, r1
-	add r2, r3
-	dec r5
-	jnz .yloop_width_9
-	POP_XMM
-	LOAD_6_PARA_POP
-	ret
+    add r0, r1
+    add r2, r3
+    dec r5
+    jnz .yloop_width_9
+    POP_XMM
+    LOAD_6_PARA_POP
+    ret
 
 
 .width_17:
 .yloop_width_17:
-	movq xmm0, [r0]
-	punpcklbw xmm0, xmm7
-	movq xmm1, [r0+5]
-	punpcklbw xmm1, xmm7
-	movq xmm2, [r0+1]
-	punpcklbw xmm2, xmm7
-	movq xmm3, [r0+4]
-	punpcklbw xmm3, xmm7
-	movq xmm4, [r0+2]
-	punpcklbw xmm4, xmm7
-	movq xmm5, [r0+3]
-	punpcklbw xmm5, xmm7
+    movq xmm0, [r0]
+    punpcklbw xmm0, xmm7
+    movq xmm1, [r0+5]
+    punpcklbw xmm1, xmm7
+    movq xmm2, [r0+1]
+    punpcklbw xmm2, xmm7
+    movq xmm3, [r0+4]
+    punpcklbw xmm3, xmm7
+    movq xmm4, [r0+2]
+    punpcklbw xmm4, xmm7
+    movq xmm5, [r0+3]
+    punpcklbw xmm5, xmm7
 
-	paddw xmm2, xmm3
-	paddw xmm4, xmm5
-	psllw xmm4, 2
-	psubw xmm4, xmm2
-	paddw xmm0, xmm1
-	paddw xmm0, xmm4
-	psllw xmm4, 2
-	paddw xmm0, xmm4
-	movdqa [r2], xmm0
+    paddw xmm2, xmm3
+    paddw xmm4, xmm5
+    psllw xmm4, 2
+    psubw xmm4, xmm2
+    paddw xmm0, xmm1
+    paddw xmm0, xmm4
+    psllw xmm4, 2
+    paddw xmm0, xmm4
+    movdqa [r2], xmm0
 
-	movq xmm0, [r0+8]
-	punpcklbw xmm0, xmm7
-	movq xmm1, [r0+5+8]
-	punpcklbw xmm1, xmm7
-	movq xmm2, [r0+1+8]
-	punpcklbw xmm2, xmm7
-	movq xmm3, [r0+4+8]
-	punpcklbw xmm3, xmm7
-	movq xmm4, [r0+2+8]
-	punpcklbw xmm4, xmm7
-	movq xmm5, [r0+3+8]
-	punpcklbw xmm5, xmm7
+    movq xmm0, [r0+8]
+    punpcklbw xmm0, xmm7
+    movq xmm1, [r0+5+8]
+    punpcklbw xmm1, xmm7
+    movq xmm2, [r0+1+8]
+    punpcklbw xmm2, xmm7
+    movq xmm3, [r0+4+8]
+    punpcklbw xmm3, xmm7
+    movq xmm4, [r0+2+8]
+    punpcklbw xmm4, xmm7
+    movq xmm5, [r0+3+8]
+    punpcklbw xmm5, xmm7
 
-	movdqa xmm7, xmm2
-	paddw   xmm7, xmm3
-	movdqa xmm6, xmm4
-	paddw   xmm6, xmm5
-	psllw xmm6, 2
-	psubw xmm6, xmm7
-	paddw xmm0, xmm1
-	paddw xmm0, xmm6
-	psllw xmm6, 2
-	paddw xmm0, xmm6
-	movd [r2+16], xmm0
+    movdqa xmm7, xmm2
+    paddw   xmm7, xmm3
+    movdqa xmm6, xmm4
+    paddw   xmm6, xmm5
+    psllw xmm6, 2
+    psubw xmm6, xmm7
+    paddw xmm0, xmm1
+    paddw xmm0, xmm6
+    psllw xmm6, 2
+    paddw xmm0, xmm6
+    movd [r2+16], xmm0
 
 
-	pxor  xmm7, xmm7
-	movq xmm0, [r0+6+8]
-	punpcklbw xmm0, xmm7
+    pxor  xmm7, xmm7
+    movq xmm0, [r0+6+8]
+    punpcklbw xmm0, xmm7
 
-	paddw xmm4, xmm1
-	paddw xmm5, xmm3
-	psllw xmm5, 2
-	psubw xmm5, xmm4
-	paddw xmm2, xmm0
-	paddw xmm2, xmm5
-	psllw xmm5, 2
-	paddw xmm2, xmm5
-	movq [r2+18], xmm2
-	movhps [r2+18+8], xmm2
+    paddw xmm4, xmm1
+    paddw xmm5, xmm3
+    psllw xmm5, 2
+    psubw xmm5, xmm4
+    paddw xmm2, xmm0
+    paddw xmm2, xmm5
+    psllw xmm5, 2
+    paddw xmm2, xmm5
+    movq [r2+18], xmm2
+    movhps [r2+18+8], xmm2
 
-	add r0, r1
-	add r2, r3
-	dec r5
-	jnz .yloop_width_17
-	POP_XMM
-	LOAD_6_PARA_POP
-	ret
+    add r0, r1
+    add r2, r3
+    dec r5
+    jnz .yloop_width_17
+    POP_XMM
+    LOAD_6_PARA_POP
+    ret
 
 
 %macro FILTER_VER 9
-	paddw  %1, %6
-	movdqa %7, %2
-	movdqa %8, %3
+    paddw  %1, %6
+    movdqa %7, %2
+    movdqa %8, %3
 
 
-	paddw %7, %5
-	paddw %8, %4
+    paddw %7, %5
+    paddw %8, %4
 
-	psubw  %1, %7
-	psraw   %1, 2
-	paddw  %1, %8
-	psubw  %1, %7
-	psraw   %1, 2
-	paddw  %8, %1
-	paddw  %8, [h264_mc_hc_32]
-	psraw   %8, 6
-	packuswb %8, %8
-	movq %9, %8
+    psubw  %1, %7
+    psraw   %1, 2
+    paddw  %1, %8
+    psubw  %1, %7
+    psraw   %1, 2
+    paddw  %8, %1
+    paddw  %8, [h264_mc_hc_32]
+    psraw   %8, 6
+    packuswb %8, %8
+    movq %9, %8
 %endmacro
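
FILTER_VER consumes the unrounded 16-bit horizontal taps produced by the HorFirst pass. Instead of widening to 32 bits, it reaches the final (t + 512) >> 10 normalisation (up to the truncation introduced by the staged shifts) with two intermediate arithmetic shifts by 2 and a closing +32, >> 6, so every value stays inside a 16-bit lane. A scalar reading of the macro, with the six taps renamed a..f for clarity and arithmetic right shift assumed, matching psraw:

    #include <stdint.h>

    /* a..f are six vertically adjacent 16-bit taps feeding one output pixel. */
    static inline uint8_t VerTap6_sketch(int a, int b, int c, int d, int e, int f) {
        int t = (a + f) - (b + e);
        t >>= 2;                          /* first psraw %1, 2  */
        t += (c + d) - (b + e);
        t >>= 2;                          /* second psraw %1, 2 */
        int v = (c + d) + t;              /* ~ (a - 5b + 20c + 20d - 5e + f) / 16 */
        v = (v + 32) >> 6;                /* h264_mc_hc_32 rounding, then /64     */
        return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
    }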
 ;***********************************************************************
 ;void McHorVer22Width8VerLastAlign_sse2(
-;											const uint8_t *pTap,
-;											int32_t iTapStride,
-;											uint8_t * pDst,
-;											int32_t iDstStride,
-;											int32_t iWidth,
-;											int32_t iHeight);
+;                                           const uint8_t *pTap,
+;                                           int32_t iTapStride,
+;                                           uint8_t * pDst,
+;                                           int32_t iDstStride,
+;                                           int32_t iWidth,
+;                                           int32_t iHeight);
 ;***********************************************************************
 
 WELS_EXTERN McHorVer22Width8VerLastAlign_sse2
-	%assign  push_num 0
+    %assign  push_num 0
     LOAD_6_PARA
     PUSH_XMM 8
-	SIGN_EXTENSION	r1, r1d
-	SIGN_EXTENSION	r3, r3d
-	SIGN_EXTENSION	r4, r4d
-	SIGN_EXTENSION	r5, r5d
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
+    SIGN_EXTENSION  r4, r4d
+    SIGN_EXTENSION  r5, r5d
 %ifndef X86_32
-	push r12
-	push r13
-	push r14
-	mov  r12, r0
-	mov	 r13, r2
-	mov	 r14, r5
+    push r12
+    push r13
+    push r14
+    mov  r12, r0
+    mov  r13, r2
+    mov  r14, r5
 %endif
 
-	shr r4, 3
+    shr r4, 3
 
 .width_loop:
-	movdqa xmm0, [r0]
-	movdqa xmm1, [r0+r1]
-	lea r0, [r0+2*r1]
-	movdqa xmm2, [r0]
-	movdqa xmm3, [r0+r1]
-	lea r0, [r0+2*r1]
-	movdqa xmm4, [r0]
-	movdqa xmm5, [r0+r1]
+    movdqa xmm0, [r0]
+    movdqa xmm1, [r0+r1]
+    lea r0, [r0+2*r1]
+    movdqa xmm2, [r0]
+    movdqa xmm3, [r0+r1]
+    lea r0, [r0+2*r1]
+    movdqa xmm4, [r0]
+    movdqa xmm5, [r0+r1]
 
-	FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
-	dec r5
-	lea r0, [r0+2*r1]
-	movdqa xmm6, [r0]
+    FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+    dec r5
+    lea r0, [r0+2*r1]
+    movdqa xmm6, [r0]
 
-	movdqa xmm0, xmm1
-	movdqa xmm1, xmm2
-	movdqa xmm2, xmm3
-	movdqa xmm3, xmm4
-	movdqa xmm4, xmm5
-	movdqa xmm5, xmm6
+    movdqa xmm0, xmm1
+    movdqa xmm1, xmm2
+    movdqa xmm2, xmm3
+    movdqa xmm3, xmm4
+    movdqa xmm4, xmm5
+    movdqa xmm5, xmm6
 
-	add r2, r3
-	sub r0, r1
+    add r2, r3
+    sub r0, r1
 
 .start:
-	FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
-	dec r5
-	jz near .x_loop_dec
+    FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+    dec r5
+    jz near .x_loop_dec
 
-	lea r0, [r0+2*r1]
-	movdqa xmm6, [r0]
-	FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[r2+r3]
-	dec r5
-	jz near .x_loop_dec
+    lea r0, [r0+2*r1]
+    movdqa xmm6, [r0]
+    FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[r2+r3]
+    dec r5
+    jz near .x_loop_dec
 
-	lea r2, [r2+2*r3]
-	movdqa xmm7, [r0+r1]
-	FILTER_VER  xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
-	dec r5
-	jz near .x_loop_dec
+    lea r2, [r2+2*r3]
+    movdqa xmm7, [r0+r1]
+    FILTER_VER  xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
+    dec r5
+    jz near .x_loop_dec
 
-	lea r0, [r0+2*r1]
-	movdqa xmm0, [r0]
-	FILTER_VER  xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[r2+r3]
-	dec r5
-	jz near .x_loop_dec
+    lea r0, [r0+2*r1]
+    movdqa xmm0, [r0]
+    FILTER_VER  xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[r2+r3]
+    dec r5
+    jz near .x_loop_dec
 
-	lea r2, [r2+2*r3]
-	movdqa xmm1, [r0+r1]
-	FILTER_VER  xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[r2]
-	dec r5
-	jz near .x_loop_dec
+    lea r2, [r2+2*r3]
+    movdqa xmm1, [r0+r1]
+    FILTER_VER  xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[r2]
+    dec r5
+    jz near .x_loop_dec
 
-	lea r0, [r0+2*r1]
-	movdqa xmm2, [r0]
-	FILTER_VER  xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[r2+r3]
-	dec r5
-	jz near .x_loop_dec
+    lea r0, [r0+2*r1]
+    movdqa xmm2, [r0]
+    FILTER_VER  xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[r2+r3]
+    dec r5
+    jz near .x_loop_dec
 
-	lea r2, [r2+2*r3]
-	movdqa xmm3, [r0+r1]
-	FILTER_VER  xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[r2]
-	dec r5
-	jz near .x_loop_dec
+    lea r2, [r2+2*r3]
+    movdqa xmm3, [r0+r1]
+    FILTER_VER  xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[r2]
+    dec r5
+    jz near .x_loop_dec
 
-	lea r0, [r0+2*r1]
-	movdqa xmm4, [r0]
-	FILTER_VER  xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [r2+r3]
-	dec r5
-	jz near .x_loop_dec
+    lea r0, [r0+2*r1]
+    movdqa xmm4, [r0]
+    FILTER_VER  xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [r2+r3]
+    dec r5
+    jz near .x_loop_dec
 
-	lea r2, [r2+2*r3]
-	movdqa xmm5, [r0+r1]
-	jmp near .start
+    lea r2, [r2+2*r3]
+    movdqa xmm5, [r0+r1]
+    jmp near .start
 
 .x_loop_dec:
-	dec r4
-	jz near .exit
+    dec r4
+    jz near .exit
 %ifdef X86_32
-	mov	r0, arg1
-	mov r2, arg3
-	mov r5, arg6
+    mov r0, arg1
+    mov r2, arg3
+    mov r5, arg6
 %else
-	mov r0, r12
-	mov r2, r13
-	mov r5, r14
+    mov r0, r12
+    mov r2, r13
+    mov r5, r14
 %endif
-	add r0, 16
-	add r2, 8
-	jmp .width_loop
+    add r0, 16
+    add r2, 8
+    jmp .width_loop
 
 .exit:
 %ifndef X86_32
-	pop r14
-	pop r13
-	pop r12
+    pop r14
+    pop r13
+    pop r12
 %endif
-	POP_XMM
-	LOAD_6_PARA_POP
-	ret
+    POP_XMM
+    LOAD_6_PARA_POP
+    ret
 
 ;***********************************************************************
 ;void McHorVer22Width8VerLastUnAlign_sse2(
-;											const uint8_t *pTap,
-;											int32_t iTapStride,
-;											uint8_t * pDst,
-;											int32_t iDstStride,
-;											int32_t iWidth,
-;											int32_t iHeight);
+;                                           const uint8_t *pTap,
+;                                           int32_t iTapStride,
+;                                           uint8_t * pDst,
+;                                           int32_t iDstStride,
+;                                           int32_t iWidth,
+;                                           int32_t iHeight);
 ;***********************************************************************
 
 WELS_EXTERN McHorVer22Width8VerLastUnAlign_sse2
-	%assign  push_num 0
+    %assign  push_num 0
     LOAD_6_PARA
     PUSH_XMM 8
-	SIGN_EXTENSION	r1, r1d
-	SIGN_EXTENSION	r3, r3d
-	SIGN_EXTENSION	r4, r4d
-	SIGN_EXTENSION	r5, r5d
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
+    SIGN_EXTENSION  r4, r4d
+    SIGN_EXTENSION  r5, r5d
 %ifndef X86_32
-	push r12
-	push r13
-	push r14
-	mov  r12, r0
-	mov	 r13, r2
-	mov	 r14, r5
+    push r12
+    push r13
+    push r14
+    mov  r12, r0
+    mov  r13, r2
+    mov  r14, r5
 %endif
-	shr r4, 3
+    shr r4, 3
 
 .width_loop:
-	movdqu xmm0, [r0]
-	movdqu xmm1, [r0+r1]
-	lea r0, [r0+2*r1]
-	movdqu xmm2, [r0]
-	movdqu xmm3, [r0+r1]
-	lea r0, [r0+2*r1]
-	movdqu xmm4, [r0]
-	movdqu xmm5, [r0+r1]
+    movdqu xmm0, [r0]
+    movdqu xmm1, [r0+r1]
+    lea r0, [r0+2*r1]
+    movdqu xmm2, [r0]
+    movdqu xmm3, [r0+r1]
+    lea r0, [r0+2*r1]
+    movdqu xmm4, [r0]
+    movdqu xmm5, [r0+r1]
 
-	FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
-	dec r5
-	lea r0, [r0+2*r1]
-	movdqu xmm6, [r0]
+    FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+    dec r5
+    lea r0, [r0+2*r1]
+    movdqu xmm6, [r0]
 
-	movdqa xmm0, xmm1
-	movdqa xmm1, xmm2
-	movdqa xmm2, xmm3
-	movdqa xmm3, xmm4
-	movdqa xmm4, xmm5
-	movdqa xmm5, xmm6
+    movdqa xmm0, xmm1
+    movdqa xmm1, xmm2
+    movdqa xmm2, xmm3
+    movdqa xmm3, xmm4
+    movdqa xmm4, xmm5
+    movdqa xmm5, xmm6
 
-	add r2, r3
-	sub r0, r1
+    add r2, r3
+    sub r0, r1
 
 .start:
-	FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
-	dec r5
-	jz near .x_loop_dec
+    FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+    dec r5
+    jz near .x_loop_dec
 
-	lea r0, [r0+2*r1]
-	movdqu xmm6, [r0]
-	FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[r2+r3]
-	dec r5
-	jz near .x_loop_dec
+    lea r0, [r0+2*r1]
+    movdqu xmm6, [r0]
+    FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[r2+r3]
+    dec r5
+    jz near .x_loop_dec
 
-	lea r2, [r2+2*r3]
-	movdqu xmm7, [r0+r1]
-	FILTER_VER  xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
-	dec r5
-	jz near .x_loop_dec
+    lea r2, [r2+2*r3]
+    movdqu xmm7, [r0+r1]
+    FILTER_VER  xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
+    dec r5
+    jz near .x_loop_dec
 
-	lea r0, [r0+2*r1]
-	movdqu xmm0, [r0]
-	FILTER_VER  xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[r2+r3]
-	dec r5
-	jz near .x_loop_dec
+    lea r0, [r0+2*r1]
+    movdqu xmm0, [r0]
+    FILTER_VER  xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[r2+r3]
+    dec r5
+    jz near .x_loop_dec
 
-	lea r2, [r2+2*r3]
-	movdqu xmm1, [r0+r1]
-	FILTER_VER  xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[r2]
-	dec r5
-	jz near .x_loop_dec
+    lea r2, [r2+2*r3]
+    movdqu xmm1, [r0+r1]
+    FILTER_VER  xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[r2]
+    dec r5
+    jz near .x_loop_dec
 
-	lea r0, [r0+2*r1]
-	movdqu xmm2, [r0]
-	FILTER_VER  xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[r2+r3]
-	dec r5
-	jz near .x_loop_dec
+    lea r0, [r0+2*r1]
+    movdqu xmm2, [r0]
+    FILTER_VER  xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[r2+r3]
+    dec r5
+    jz near .x_loop_dec
 
-	lea r2, [r2+2*r3]
-	movdqu xmm3, [r0+r1]
-	FILTER_VER  xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[r2]
-	dec r5
-	jz near .x_loop_dec
+    lea r2, [r2+2*r3]
+    movdqu xmm3, [r0+r1]
+    FILTER_VER  xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[r2]
+    dec r5
+    jz near .x_loop_dec
 
-	lea r0, [r0+2*r1]
-	movdqu xmm4, [r0]
-	FILTER_VER  xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [r2+r3]
-	dec r5
-	jz near .x_loop_dec
+    lea r0, [r0+2*r1]
+    movdqu xmm4, [r0]
+    FILTER_VER  xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [r2+r3]
+    dec r5
+    jz near .x_loop_dec
 
-	lea r2, [r2+2*r3]
-	movdqu xmm5, [r0+r1]
-	jmp near .start
+    lea r2, [r2+2*r3]
+    movdqu xmm5, [r0+r1]
+    jmp near .start
 
 .x_loop_dec:
-	dec r4
-	jz near .exit
+    dec r4
+    jz near .exit
 %ifdef X86_32
-	mov	r0, arg1
-	mov r2, arg3
-	mov r5, arg6
+    mov r0, arg1
+    mov r2, arg3
+    mov r5, arg6
 %else
-	mov r0, r12
-	mov r2, r13
-	mov r5, r14
+    mov r0, r12
+    mov r2, r13
+    mov r5, r14
 %endif
-	add r0, 16
-	add r2, 8
-	jmp .width_loop
+    add r0, 16
+    add r2, 8
+    jmp .width_loop
 
 .exit:
 %ifndef X86_32
-	pop r14
-	pop r13
-	pop r12
+    pop r14
+    pop r13
+    pop r12
 %endif
-	POP_XMM
-	LOAD_6_PARA_POP
-	ret
+    POP_XMM
+    LOAD_6_PARA_POP
+    ret
--- a/codec/common/x86/satd_sad.asm
+++ b/codec/common/x86/satd_sad.asm
@@ -77,77 +77,77 @@
 ;
 ;***********************************************************************
 %macro MMX_DW_1_2REG 2
-      pxor %1, %1
-      pcmpeqw %2, %2
-      psubw %1, %2
+    pxor %1, %1
+    pcmpeqw %2, %2
+    psubw %1, %2
 %endmacro
 
 %macro SSE2_SumWHorizon1 2
-	movdqa      %2, %1
-	psrldq      %2, 8
-	paddusw     %1, %2
-	movdqa      %2, %1
-	psrldq      %2, 4
-	paddusw     %1, %2
-	movdqa      %2, %1
-	psrldq      %2, 2
-	paddusw     %1, %2
+    movdqa      %2, %1
+    psrldq      %2, 8
+    paddusw     %1, %2
+    movdqa      %2, %1
+    psrldq      %2, 4
+    paddusw     %1, %2
+    movdqa      %2, %1
+    psrldq      %2, 2
+    paddusw     %1, %2
 %endmacro
 
 %macro SSE2_HDMTwo4x4 5 ;in: xmm1,xmm2,xmm3,xmm4  pOut: xmm4,xmm2,xmm1,xmm3
-   SSE2_SumSub %1, %2, %5
-   SSE2_SumSub %3, %4, %5
-   SSE2_SumSub %2, %4, %5
-   SSE2_SumSub %1, %3, %5
+    SSE2_SumSub %1, %2, %5
+    SSE2_SumSub %3, %4, %5
+    SSE2_SumSub %2, %4, %5
+    SSE2_SumSub %1, %3, %5
 %endmacro
 
 %macro SSE2_SumAbs4 7
-	WELS_AbsW %1, %3
-	WELS_AbsW %2, %3
-	WELS_AbsW %4, %6
-	WELS_AbsW %5, %6
-	paddusw       %1, %2
-	paddusw       %4, %5
-	paddusw       %7, %1
-	paddusw       %7, %4
+    WELS_AbsW %1, %3
+    WELS_AbsW %2, %3
+    WELS_AbsW %4, %6
+    WELS_AbsW %5, %6
+    paddusw       %1, %2
+    paddusw       %4, %5
+    paddusw       %7, %1
+    paddusw       %7, %4
 %endmacro
 
 %macro SSE2_SumWHorizon 3
-	movhlps		%2, %1			; x2 = xx xx xx xx d7 d6 d5 d4
-	paddw		%1, %2			; x1 = xx xx xx xx d37 d26 d15 d04
-	punpcklwd	%1, %3			; x1 =  d37  d26 d15 d04
-	movhlps		%2, %1			; x2 = xxxx xxxx d37 d26
-	paddd		%1, %2			; x1 = xxxx xxxx d1357 d0246
-	pshuflw		%2, %1, 0x4e	; x2 = xxxx xxxx d0246 d1357
-	paddd		%1, %2			; x1 = xxxx xxxx xxxx  d01234567
+    movhlps     %2, %1          ; x2 = xx xx xx xx d7 d6 d5 d4
+    paddw       %1, %2          ; x1 = xx xx xx xx d37 d26 d15 d04
+    punpcklwd   %1, %3          ; x1 =  d37  d26 d15 d04
+    movhlps     %2, %1          ; x2 = xxxx xxxx d37 d26
+    paddd       %1, %2          ; x1 = xxxx xxxx d1357 d0246
+    pshuflw     %2, %1, 0x4e    ; x2 = xxxx xxxx d0246 d1357
+    paddd       %1, %2          ; x1 = xxxx xxxx xxxx  d01234567
 %endmacro
 
 %macro SSE2_GetSatd8x8 0
-	SSE2_LoadDiff8P    xmm0,xmm4,xmm7,[r0],[r2]
-	SSE2_LoadDiff8P    xmm1,xmm5,xmm7,[r0+r1],[r2+r3]
-	lea                 r0, [r0+2*r1]
-	lea                 r2, [r2+2*r3]
-	SSE2_LoadDiff8P    xmm2,xmm4,xmm7,[r0],[r2]
-	SSE2_LoadDiff8P    xmm3,xmm5,xmm7,[r0+r1],[r2+r3]
+    SSE2_LoadDiff8P    xmm0,xmm4,xmm7,[r0],[r2]
+    SSE2_LoadDiff8P    xmm1,xmm5,xmm7,[r0+r1],[r2+r3]
+    lea                 r0, [r0+2*r1]
+    lea                 r2, [r2+2*r3]
+    SSE2_LoadDiff8P    xmm2,xmm4,xmm7,[r0],[r2]
+    SSE2_LoadDiff8P    xmm3,xmm5,xmm7,[r0+r1],[r2+r3]
 
-	SSE2_HDMTwo4x4       xmm0,xmm1,xmm2,xmm3,xmm4
-	SSE2_TransTwo4x4W     xmm3,xmm1,xmm0,xmm2,xmm4
-	SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm4,xmm5
-	SSE2_SumAbs4         xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
+    SSE2_HDMTwo4x4       xmm0,xmm1,xmm2,xmm3,xmm4
+    SSE2_TransTwo4x4W     xmm3,xmm1,xmm0,xmm2,xmm4
+    SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm4,xmm5
+    SSE2_SumAbs4         xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
 
-	lea					r0,    [r0+2*r1]
-    lea					r2,    [r2+2*r3]
-	SSE2_LoadDiff8P    xmm0,xmm4,xmm7,[r0],[r2]
-	SSE2_LoadDiff8P    xmm1,xmm5,xmm7,[r0+r1],[r2+r3]
-	lea                 r0, [r0+2*r1]
-	lea                 r2, [r2+2*r3]
-	SSE2_LoadDiff8P    xmm2,xmm4,xmm7,[r0],[r2]
-	SSE2_LoadDiff8P    xmm3,xmm5,xmm7,[r0+r1],[r2+r3]
+    lea                 r0,    [r0+2*r1]
+    lea                 r2,    [r2+2*r3]
+    SSE2_LoadDiff8P    xmm0,xmm4,xmm7,[r0],[r2]
+    SSE2_LoadDiff8P    xmm1,xmm5,xmm7,[r0+r1],[r2+r3]
+    lea                 r0, [r0+2*r1]
+    lea                 r2, [r2+2*r3]
+    SSE2_LoadDiff8P    xmm2,xmm4,xmm7,[r0],[r2]
+    SSE2_LoadDiff8P    xmm3,xmm5,xmm7,[r0+r1],[r2+r3]
 
-	SSE2_HDMTwo4x4       xmm0,xmm1,xmm2,xmm3,xmm4
-	SSE2_TransTwo4x4W     xmm3,xmm1,xmm0,xmm2,xmm4
-	SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm4,xmm5
-	SSE2_SumAbs4         xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
+    SSE2_HDMTwo4x4       xmm0,xmm1,xmm2,xmm3,xmm4
+    SSE2_TransTwo4x4W     xmm3,xmm1,xmm0,xmm2,xmm4
+    SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm4,xmm5
+    SSE2_SumAbs4         xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
 %endmacro
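
SSE2_GetSatd8x8 and the WelsSampleSatd* entry points below compute a Hadamard-domain SAD: take the residual block, apply a 4x4 Hadamard transform along rows and columns (the SSE2_HDMTwo4x4 / SSE2_TransTwo4x4W pairs), sum the absolute coefficients, and halve the total at the end (the psrlw xmm6, 1 and shr retrd, 1 steps; the per-column sums are even, as the comment near WelsSampleSatd16x16 notes, so nothing is lost). A compact scalar 4x4 reference with an illustrative name:

    #include <stdint.h>
    #include <stdlib.h>

    static int32_t Satd4x4_ref(const uint8_t *p1, int32_t stride1,
                               const uint8_t *p2, int32_t stride2) {
        int d[4][4], t[4][4];
        for (int i = 0; i < 4; i++)
            for (int j = 0; j < 4; j++)
                d[i][j] = p1[i * stride1 + j] - p2[i * stride2 + j];
        for (int i = 0; i < 4; i++) {              /* horizontal 4-point Hadamard */
            int a = d[i][0] + d[i][2], b = d[i][0] - d[i][2];
            int c = d[i][1] + d[i][3], e = d[i][1] - d[i][3];
            t[i][0] = a + c; t[i][1] = b + e; t[i][2] = a - c; t[i][3] = b - e;
        }
        int32_t sum = 0;
        for (int j = 0; j < 4; j++) {              /* vertical Hadamard + |.| sum */
            int a = t[0][j] + t[2][j], b = t[0][j] - t[2][j];
            int c = t[1][j] + t[3][j], e = t[1][j] - t[3][j];
            sum += abs(a + c) + abs(b + e) + abs(a - c) + abs(b - e);
        }
        return sum >> 1;                           /* matches the final shift by 1 */
    }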
 
 ;***********************************************************************
@@ -156,11 +156,11 @@
 ;
 ;***********************************************************************
 WELS_EXTERN WelsSampleSatd4x4_sse2
-	%assign  push_num 0
-	LOAD_4_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
+    %assign  push_num 0
+    LOAD_4_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
     movd      xmm0, [r0]
     movd      xmm1, [r0+r1]
     lea       r0 , [r0+2*r1]
@@ -199,14 +199,14 @@
     punpcklwd      xmm0, xmm4
     punpckhwd      xmm4, xmm2
 
-	SSE2_XSawp     dq,  xmm0, xmm4, xmm3
-	SSE2_XSawp     qdq, xmm0, xmm3, xmm5
+    SSE2_XSawp     dq,  xmm0, xmm4, xmm3
+    SSE2_XSawp     qdq, xmm0, xmm3, xmm5
 
     movdqa         xmm7, xmm0
     paddw          xmm0, xmm5
     psubw          xmm7, xmm5
 
-	SSE2_XSawp     qdq,  xmm0, xmm7, xmm1
+    SSE2_XSawp     qdq,  xmm0, xmm7, xmm1
 
     movdqa         xmm2, xmm0
     paddw          xmm0, xmm1
@@ -214,15 +214,15 @@
 
     WELS_AbsW  xmm0, xmm3
     paddusw        xmm6, xmm0
-	WELS_AbsW  xmm2, xmm4
+    WELS_AbsW  xmm2, xmm4
     paddusw        xmm6, xmm2
     SSE2_SumWHorizon1  xmm6, xmm4
-	movd           retrd,  xmm6
+    movd           retrd,  xmm6
     and            retrd,  0xffff
     shr            retrd,  1
-	POP_XMM
-	LOAD_4_PARA_POP
-	ret
+    POP_XMM
+    LOAD_4_PARA_POP
+    ret
 
  ;***********************************************************************
  ;
@@ -230,20 +230,20 @@
  ;
  ;***********************************************************************
 WELS_EXTERN WelsSampleSatd8x8_sse2
-	%assign  push_num 0
-	LOAD_4_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
-	pxor   xmm6,   xmm6
+    %assign  push_num 0
+    LOAD_4_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
+    pxor   xmm6,   xmm6
     pxor   xmm7,   xmm7
     SSE2_GetSatd8x8
     psrlw   xmm6,  1
-	SSE2_SumWHorizon   xmm6,xmm4,xmm7
-	movd    retrd,   xmm6
-	POP_XMM
-	LOAD_4_PARA_POP
-	ret
+    SSE2_SumWHorizon   xmm6,xmm4,xmm7
+    movd    retrd,   xmm6
+    POP_XMM
+    LOAD_4_PARA_POP
+    ret
 
  ;***********************************************************************
  ;
@@ -251,25 +251,25 @@
  ;
  ;***********************************************************************
 WELS_EXTERN WelsSampleSatd8x16_sse2
-	 %assign  push_num 0
-	 LOAD_4_PARA
-	 PUSH_XMM 8
-	 SIGN_EXTENSION r1, r1d
-	 SIGN_EXTENSION r3, r3d
-	 pxor   xmm6,   xmm6
-     pxor   xmm7,   xmm7
+    %assign  push_num 0
+    LOAD_4_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
+    pxor   xmm6,   xmm6
+    pxor   xmm7,   xmm7
 
-	 SSE2_GetSatd8x8
-     lea    r0,    [r0+2*r1]
-     lea    r2,    [r2+2*r3]
-	 SSE2_GetSatd8x8
+    SSE2_GetSatd8x8
+    lea    r0,    [r0+2*r1]
+    lea    r2,    [r2+2*r3]
+    SSE2_GetSatd8x8
 
-	 psrlw   xmm6,  1
-	 SSE2_SumWHorizon   xmm6,xmm4,xmm7
-	 movd    retrd,   xmm6
-	 POP_XMM
-	 LOAD_4_PARA_POP
-	 ret
+    psrlw   xmm6,  1
+    SSE2_SumWHorizon   xmm6,xmm4,xmm7
+    movd    retrd,   xmm6
+    POP_XMM
+    LOAD_4_PARA_POP
+    ret
 
 ;***********************************************************************
 ;
@@ -277,30 +277,30 @@
 ;
 ;***********************************************************************
 WELS_EXTERN WelsSampleSatd16x8_sse2
-	%assign  push_num 0
-	LOAD_4_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
-	push r0
-	push r2
-	pxor   xmm6,   xmm6
+    %assign  push_num 0
+    LOAD_4_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
+    push r0
+    push r2
+    pxor   xmm6,   xmm6
     pxor   xmm7,   xmm7
 
-	SSE2_GetSatd8x8
+    SSE2_GetSatd8x8
 
-	pop r2
-	pop r0
+    pop r2
+    pop r0
     add    r0,    8
     add    r2,    8
-	SSE2_GetSatd8x8
+    SSE2_GetSatd8x8
 
-	psrlw   xmm6,  1
-	SSE2_SumWHorizon   xmm6,xmm4,xmm7
-	movd    retrd,   xmm6
-	POP_XMM
-	LOAD_4_PARA_POP
-	ret
+    psrlw   xmm6,  1
+    SSE2_SumWHorizon   xmm6,xmm4,xmm7
+    movd    retrd,   xmm6
+    POP_XMM
+    LOAD_4_PARA_POP
+    ret
 
 ;***********************************************************************
 ;
@@ -308,38 +308,38 @@
 ;
 ;***********************************************************************
 WELS_EXTERN WelsSampleSatd16x16_sse2
-	%assign  push_num 0
-	LOAD_4_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
-	push r0
-	push r2
-	pxor   xmm6,   xmm6
+    %assign  push_num 0
+    LOAD_4_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
+    push r0
+    push r2
+    pxor   xmm6,   xmm6
     pxor   xmm7,   xmm7
 
-	SSE2_GetSatd8x8
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	SSE2_GetSatd8x8
+    SSE2_GetSatd8x8
+    lea    r0,    [r0+2*r1]
+    lea    r2,    [r2+2*r3]
+    SSE2_GetSatd8x8
 
-	pop r2
-	pop r0
-	add    r0,    8
-	add    r2,    8
+    pop r2
+    pop r0
+    add    r0,    8
+    add    r2,    8
 
-	SSE2_GetSatd8x8
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	SSE2_GetSatd8x8
+    SSE2_GetSatd8x8
+    lea    r0,    [r0+2*r1]
+    lea    r2,    [r2+2*r3]
+    SSE2_GetSatd8x8
 
  ; each column sum of SATD is necessarily even, so we don't lose any precision by shifting first.
     psrlw   xmm6,  1
-	SSE2_SumWHorizon   xmm6,xmm4,xmm7
-	movd    retrd,   xmm6
-	POP_XMM
-	LOAD_4_PARA_POP
-	ret
+    SSE2_SumWHorizon   xmm6,xmm4,xmm7
+    movd    retrd,   xmm6
+    POP_XMM
+    LOAD_4_PARA_POP
+    ret
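
The WelsSampleSatd*_sse2 routines above all reduce to the same per-block quantity: Hadamard-transform the source/reference residual, sum the absolute coefficients, and halve the total (the shr retrd, 1 / psrlw xmm6, 1 steps). A scalar sketch of that quantity, with a hypothetical helper name, is:

    #include <stdint.h>
    #include <stdlib.h>

    /* Scalar model of one 4x4 SATD as accumulated by the SSE2 code above. */
    static int32_t Satd4x4_c (const uint8_t* pSrc, int32_t iSrcStride,
                              const uint8_t* pRef, int32_t iRefStride) {
        int32_t d[16], m[16];
        for (int i = 0; i < 4; i++)            /* residual = source - reference */
            for (int j = 0; j < 4; j++)
                d[i * 4 + j] = pSrc[i * iSrcStride + j] - pRef[i * iRefStride + j];
        for (int i = 0; i < 4; i++) {          /* horizontal 4-point Hadamard */
            int32_t s01 = d[i*4+0] + d[i*4+1], d01 = d[i*4+0] - d[i*4+1];
            int32_t s23 = d[i*4+2] + d[i*4+3], d23 = d[i*4+2] - d[i*4+3];
            m[i*4+0] = s01 + s23;  m[i*4+1] = s01 - s23;
            m[i*4+2] = d01 + d23;  m[i*4+3] = d01 - d23;
        }
        int32_t iSatd = 0;
        for (int i = 0; i < 4; i++) {          /* vertical pass, sum |coefficients| */
            int32_t s01 = m[0+i] + m[4+i],  d01 = m[0+i] - m[4+i];
            int32_t s23 = m[8+i] + m[12+i], d23 = m[8+i] - m[12+i];
            iSatd += abs (s01 + s23) + abs (s01 - s23) + abs (d01 + d23) + abs (d01 - d23);
        }
        return iSatd >> 1;                     /* the shr retrd, 1 / psrlw xmm6, 1 above */
    }

Larger block sizes simply accumulate this over their 4x4 (or 8x4) sub-blocks before the final horizontal sum.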
 
 ;***********************************************************************
 ;
@@ -355,9 +355,9 @@
 
 
 %macro SSE_DB_1_2REG 2
-      pxor %1, %1
-      pcmpeqw %2, %2
-      psubb %1, %2
+    pxor %1, %1
+    pcmpeqw %2, %2
+    psubb %1, %2
 %endmacro
 
 ;***********************************************************************
@@ -369,668 +369,668 @@
 WELS_EXTERN WelsSampleSatdThree4x4_sse2
 
 %ifdef X86_32
-	push r3
-	push r4
-	push r5
-	push r6
-	%assign  push_num 4
+    push r3
+    push r4
+    push r5
+    push r6
+    %assign  push_num 4
 %else
-	%assign  push_num 0
+    %assign  push_num 0
 %endif
-	PUSH_XMM 8
+    PUSH_XMM 8
 
-	mov  r2, arg3
-	mov  r3, arg4
-	SIGN_EXTENSION r3, r3d
+    mov  r2, arg3
+    mov  r3, arg4
+    SIGN_EXTENSION r3, r3d
 
-	; load source 4x4 samples and Hadamard transform
-	movd      xmm0, [r2]
-	movd      xmm1, [r2+r3]
-	lea       r2 , [r2+2*r3]
-	movd      xmm2, [r2]
-	movd      xmm3, [r2+r3]
-	punpckldq xmm0, xmm2
-	punpckldq xmm1, xmm3
+    ; load source 4x4 samples and Hadamard transform
+    movd      xmm0, [r2]
+    movd      xmm1, [r2+r3]
+    lea       r2 , [r2+2*r3]
+    movd      xmm2, [r2]
+    movd      xmm3, [r2+r3]
+    punpckldq xmm0, xmm2
+    punpckldq xmm1, xmm3
 
-	pxor      xmm6, xmm6
-	punpcklbw xmm0, xmm6
-	punpcklbw xmm1, xmm6
+    pxor      xmm6, xmm6
+    punpcklbw xmm0, xmm6
+    punpcklbw xmm1, xmm6
 
-	movdqa    xmm2, xmm0
-	paddw     xmm0, xmm1
-	psubw     xmm2, xmm1
-	SSE2_XSawp  qdq, xmm0, xmm2, xmm3
+    movdqa    xmm2, xmm0
+    paddw     xmm0, xmm1
+    psubw     xmm2, xmm1
+    SSE2_XSawp  qdq, xmm0, xmm2, xmm3
 
-	movdqa    xmm4, xmm0
-	paddw     xmm0, xmm3
-	psubw     xmm4, xmm3
+    movdqa    xmm4, xmm0
+    paddw     xmm0, xmm3
+    psubw     xmm4, xmm3
 
-	movdqa    xmm2, xmm0
-	punpcklwd xmm0, xmm4
-	punpckhwd xmm4, xmm2
+    movdqa    xmm2, xmm0
+    punpcklwd xmm0, xmm4
+    punpckhwd xmm4, xmm2
 
-	SSE2_XSawp  dq,  xmm0, xmm4, xmm3
-	SSE2_XSawp  qdq, xmm0, xmm3, xmm5
+    SSE2_XSawp  dq,  xmm0, xmm4, xmm3
+    SSE2_XSawp  qdq, xmm0, xmm3, xmm5
 
-	movdqa    xmm7, xmm0
-	paddw     xmm0, xmm5
-	psubw     xmm7, xmm5
+    movdqa    xmm7, xmm0
+    paddw     xmm0, xmm5
+    psubw     xmm7, xmm5
 
-	SSE2_XSawp  qdq,  xmm0, xmm7, xmm1
+    SSE2_XSawp  qdq,  xmm0, xmm7, xmm1
 
-	; Hadamard transform results are saved in xmm0 and xmm2
-	movdqa    xmm2, xmm0
-	paddw     xmm0, xmm1
-	psubw     xmm2, xmm1
+    ; Hadamard transform results are saved in xmm0 and xmm2
+    movdqa    xmm2, xmm0
+    paddw     xmm0, xmm1
+    psubw     xmm2, xmm1
 
-	;load top boundary samples: [a b c d]
-	mov r0, arg1
-	mov r1, arg2
-	SIGN_EXTENSION r1, r1d
-	sub r0, r1
+    ;load top boundary samples: [a b c d]
+    mov r0, arg1
+    mov r1, arg2
+    SIGN_EXTENSION r1, r1d
+    sub r0, r1
 %ifdef UNIX64
-	push r4
-	push r5
+    push r4
+    push r5
 %endif
 
-	movzx     r2d,  byte [r0]
-	movzx     r3d,  byte [r0+1]
-	movzx     r4d,  byte [r0+2]
-	movzx     r5d,  byte [r0+3]
+    movzx     r2d,  byte [r0]
+    movzx     r3d,  byte [r0+1]
+    movzx     r4d,  byte [r0+2]
+    movzx     r5d,  byte [r0+3]
 
-	; get the transform results of top boundary samples: [a b c d]
-	add       r3d, r2d ; r3d = a + b
-	add       r5d, r4d ; r5d = c + d
-	add       r2d, r2d ; r2d = a + a
-	add       r4d, r4d ; r4d = c + c
-	sub       r2d, r3d ; r2d = a + a - a - b = a - b
-	sub       r4d, r5d ; r4d = c + c - c - d = c - d
-	add       r5d, r3d ; r5d = (a + b) + (c + d)
-	add       r3d, r3d
-	sub       r3d, r5d ; r3d = (a + b) - (c + d)
-	add       r4d, r2d ; r4d = (a - b) + (c - d)
-	add       r2d, r2d
-	sub       r2d, r4d ; r2d = (a - b) - (c - d) ; [r5d r3d r2d r4d]
+    ; get the transform results of top boundary samples: [a b c d]
+    add       r3d, r2d ; r3d = a + b
+    add       r5d, r4d ; r5d = c + d
+    add       r2d, r2d ; r2d = a + a
+    add       r4d, r4d ; r4d = c + c
+    sub       r2d, r3d ; r2d = a + a - a - b = a - b
+    sub       r4d, r5d ; r4d = c + c - c - d = c - d
+    add       r5d, r3d ; r5d = (a + b) + (c + d)
+    add       r3d, r3d
+    sub       r3d, r5d ; r3d = (a + b) - (c + d)
+    add       r4d, r2d ; r4d = (a - b) + (c - d)
+    add       r2d, r2d
+    sub       r2d, r4d ; r2d = (a - b) - (c - d) ; [r5d r3d r2d r4d]
 
-	movdqa    xmm6, xmm0
-	movdqa    xmm7, xmm2
-	movd      xmm5, r5d ; store the edi for DC mode
-	pxor      xmm3, xmm3
-	pxor      xmm4, xmm4
-	pinsrw    xmm3, r5d, 0
-	pinsrw    xmm3, r4d, 4
-	psllw     xmm3, 2
-	pinsrw    xmm4, r3d, 0
-	pinsrw    xmm4, r2d, 4
-	psllw     xmm4, 2
+    movdqa    xmm6, xmm0
+    movdqa    xmm7, xmm2
+    movd      xmm5, r5d ; store the edi for DC mode
+    pxor      xmm3, xmm3
+    pxor      xmm4, xmm4
+    pinsrw    xmm3, r5d, 0
+    pinsrw    xmm3, r4d, 4
+    psllw     xmm3, 2
+    pinsrw    xmm4, r3d, 0
+    pinsrw    xmm4, r2d, 4
+    psllw     xmm4, 2
 
-	; get the satd of H
-	psubw     xmm0, xmm3
-	psubw     xmm2, xmm4
+    ; get the satd of H
+    psubw     xmm0, xmm3
+    psubw     xmm2, xmm4
 
-	WELS_AbsW  xmm0, xmm1
-	WELS_AbsW  xmm2, xmm1
-	paddusw        xmm0, xmm2
-	SSE2_SumWHorizon1  xmm0, xmm1 ; satd of V is stored in xmm0
+    WELS_AbsW  xmm0, xmm1
+    WELS_AbsW  xmm2, xmm1
+    paddusw        xmm0, xmm2
+    SSE2_SumWHorizon1  xmm0, xmm1 ; satd of V is stored in xmm0
 
-	;load left boundary samples: [a b c d]'
-	add r0, r1
+    ;load left boundary samples: [a b c d]'
+    add r0, r1
 
-	movzx     r2d,  byte [r0-1]
-	movzx     r3d,  byte [r0+r1-1]
-	lea       r0 , [r0+2*r1]
-	movzx     r4d,  byte [r0-1]
-	movzx     r5d,  byte [r0+r1-1]
+    movzx     r2d,  byte [r0-1]
+    movzx     r3d,  byte [r0+r1-1]
+    lea       r0 , [r0+2*r1]
+    movzx     r4d,  byte [r0-1]
+    movzx     r5d,  byte [r0+r1-1]
 
-	; get the transform results of left boundary samples: [a b c d]'
-	add       r3d, r2d ; r3d = a + b
-	add       r5d, r4d ; r5d = c + d
-	add       r2d, r2d ; r2d = a + a
-	add       r4d, r4d ; r4d = c + c
-	sub       r2d, r3d ; r2d = a + a - a - b = a - b
-	sub       r4d, r5d ; r4d = c + c - c - d = c - d
-	add       r5d, r3d ; r5d = (a + b) + (c + d)
-	add       r3d, r3d
-	sub       r3d, r5d ; r3d = (a + b) - (c + d)
-	add       r4d, r2d ; r4d = (a - b) + (c - d)
-	add       r2d, r2d
-	sub       r2d, r4d ; r2d = (a - b) - (c - d) ; [r5d r3d r2d r4d]
+    ; get the transform results of left boundary samples: [a b c d]'
+    add       r3d, r2d ; r3d = a + b
+    add       r5d, r4d ; r5d = c + d
+    add       r2d, r2d ; r2d = a + a
+    add       r4d, r4d ; r4d = c + c
+    sub       r2d, r3d ; r2d = a + a - a - b = a - b
+    sub       r4d, r5d ; r4d = c + c - c - d = c - d
+    add       r5d, r3d ; r5d = (a + b) + (c + d)
+    add       r3d, r3d
+    sub       r3d, r5d ; r3d = (a + b) - (c + d)
+    add       r4d, r2d ; r4d = (a - b) + (c - d)
+    add       r2d, r2d
+    sub       r2d, r4d ; r2d = (a - b) - (c - d) ; [r5d r3d r2d r4d]
 
-	; store the transform results in xmm3
-	movd      xmm3, r5d
-	pinsrw    xmm3, r3d, 1
-	pinsrw    xmm3, r2d, 2
-	pinsrw    xmm3, r4d, 3
-	psllw     xmm3, 2
+    ; store the transform results in xmm3
+    movd      xmm3, r5d
+    pinsrw    xmm3, r3d, 1
+    pinsrw    xmm3, r2d, 2
+    pinsrw    xmm3, r4d, 3
+    psllw     xmm3, 2
 
-	; get the satd of V
-	movdqa    xmm2, xmm6
-	movdqa    xmm4, xmm7
-	psubw     xmm2, xmm3
-	WELS_AbsW  xmm2, xmm1
-	WELS_AbsW  xmm4, xmm1
-	paddusw        xmm2, xmm4
-	SSE2_SumWHorizon1  xmm2, xmm1 ; satd of H is stored in xmm2
+    ; get the satd of V
+    movdqa    xmm2, xmm6
+    movdqa    xmm4, xmm7
+    psubw     xmm2, xmm3
+    WELS_AbsW  xmm2, xmm1
+    WELS_AbsW  xmm4, xmm1
+    paddusw        xmm2, xmm4
+    SSE2_SumWHorizon1  xmm2, xmm1 ; satd of H is stored in xmm2
 
-	; DC result is stored in xmm1
-	add       r5d, 4
-	movd      xmm1, r5d
-	paddw     xmm1, xmm5
-	psrlw     xmm1, 3
-	movdqa    xmm5, xmm1
-	psllw     xmm1, 4
+    ; DC result is stored in xmm1
+    add       r5d, 4
+    movd      xmm1, r5d
+    paddw     xmm1, xmm5
+    psrlw     xmm1, 3
+    movdqa    xmm5, xmm1
+    psllw     xmm1, 4
 
-	; get the satd of DC
-	psubw          xmm6, xmm1
-	WELS_AbsW  xmm6, xmm1
-	WELS_AbsW  xmm7, xmm1
-	paddusw        xmm6, xmm7
-	SSE2_SumWHorizon1  xmm6, xmm1 ; satd of DC is stored in xmm6
+    ; get the satd of DC
+    psubw          xmm6, xmm1
+    WELS_AbsW  xmm6, xmm1
+    WELS_AbsW  xmm7, xmm1
+    paddusw        xmm6, xmm7
+    SSE2_SumWHorizon1  xmm6, xmm1 ; satd of DC is stored in xmm6
 %ifdef UNIX64
-	pop r5
-	pop r4
+    pop r5
+    pop r4
 %endif
-	; comparing order: DC H V
+    ; comparing order: DC H V
 
-	mov  r4, arg5
-	movd      r2d, xmm6
-	movd      r3d, xmm2
-	movd      r6d, xmm0
+    mov  r4, arg5
+    movd      r2d, xmm6
+    movd      r3d, xmm2
+    movd      r6d, xmm0
 
-	and       r2d, 0xffff
-	shr       r2d, 1
-	and       r3d, 0xffff
-	shr       r3d, 1
-	and       r6d, 0xffff
-	shr       r6d, 1
-	add       r2d, dword arg7
-	add       r3d, dword arg8
-	add       r6d, dword arg9
-	cmp       r2w, r3w
-	jg near   not_dc
-	cmp       r2w, r6w
-	jg near   not_dc_h
+    and       r2d, 0xffff
+    shr       r2d, 1
+    and       r3d, 0xffff
+    shr       r3d, 1
+    and       r6d, 0xffff
+    shr       r6d, 1
+    add       r2d, dword arg7
+    add       r3d, dword arg8
+    add       r6d, dword arg9
+    cmp       r2w, r3w
+    jg near   not_dc
+    cmp       r2w, r6w
+    jg near   not_dc_h
 
-	; for DC mode
-	movd      r3d, xmm5
-	imul      r3d, 0x01010101
-	movd	  xmm5, r3d
-	pshufd    xmm5, xmm5, 0
-	movdqa    [r4], xmm5
-	mov r5, arg6
-	mov       dword [r5], 0x02
-	mov retrd, r2d
-	POP_XMM
+    ; for DC mode
+    movd      r3d, xmm5
+    imul      r3d, 0x01010101
+    movd      xmm5, r3d
+    pshufd    xmm5, xmm5, 0
+    movdqa    [r4], xmm5
+    mov r5, arg6
+    mov       dword [r5], 0x02
+    mov retrd, r2d
+    POP_XMM
 %ifdef X86_32
-	pop r6
-	pop r5
-	pop r4
-	pop r3
+    pop r6
+    pop r5
+    pop r4
+    pop r3
 %endif
-	ret
+    ret
 
 not_dc:
-	cmp       r3w, r6w
-	jg near   not_dc_h
+    cmp       r3w, r6w
+    jg near   not_dc_h
 
-	; for H mode
-	SSE_DB_1_2REG  xmm6, xmm7
-	sub        r0, r1
-	sub        r0, r1
-	movzx      r6d,  byte [r0-1]
-	movd       xmm0, r6d
-	pmuludq    xmm0, xmm6
+    ; for H mode
+    SSE_DB_1_2REG  xmm6, xmm7
+    sub        r0, r1
+    sub        r0, r1
+    movzx      r6d,  byte [r0-1]
+    movd       xmm0, r6d
+    pmuludq    xmm0, xmm6
 
-	movzx     r6d,  byte [r0+r1-1]
-	movd      xmm1, r6d
-	pmuludq   xmm1, xmm6
-	punpckldq xmm0, xmm1
+    movzx     r6d,  byte [r0+r1-1]
+    movd      xmm1, r6d
+    pmuludq   xmm1, xmm6
+    punpckldq xmm0, xmm1
 
-	lea       r0,	[r0+r1*2]
-	movzx	  r6d,	byte [r0-1]
-	movd	  xmm2,	r6d
-	pmuludq   xmm2, xmm6
+    lea       r0,   [r0+r1*2]
+    movzx     r6d,  byte [r0-1]
+    movd      xmm2, r6d
+    pmuludq   xmm2, xmm6
 
-	movzx	  r6d,	byte [r0+r1-1]
-	movd	  xmm3,	r6d
-	pmuludq   xmm3, xmm6
-	punpckldq  xmm2, xmm3
-	punpcklqdq xmm0, xmm2
+    movzx     r6d,  byte [r0+r1-1]
+    movd      xmm3, r6d
+    pmuludq   xmm3, xmm6
+    punpckldq  xmm2, xmm3
+    punpcklqdq xmm0, xmm2
 
-	movdqa	  [r4],xmm0
+    movdqa    [r4],xmm0
 
-	mov       retrd, r3d
-	mov r5, arg6
-	mov       dword [r5], 0x01
-	POP_XMM
+    mov       retrd, r3d
+    mov r5, arg6
+    mov       dword [r5], 0x01
+    POP_XMM
 %ifdef X86_32
-	pop r6
-	pop r5
-	pop r4
-	pop r3
+    pop r6
+    pop r5
+    pop r4
+    pop r3
 %endif
-	ret
+    ret
 not_dc_h:
-	sub        r0, r1
-	sub        r0, r1
-	sub        r0, r1
-	movd	  xmm0,	[r0]
-	pshufd	  xmm0,	xmm0, 0
-	movdqa	  [r4],xmm0
-	mov       retrd, r6d
-	mov r5, arg6
-	mov       dword [r5], 0x00
-	POP_XMM
+    sub        r0, r1
+    sub        r0, r1
+    sub        r0, r1
+    movd      xmm0, [r0]
+    pshufd    xmm0, xmm0, 0
+    movdqa    [r4],xmm0
+    mov       retrd, r6d
+    mov r5, arg6
+    mov       dword [r5], 0x00
+    POP_XMM
 %ifdef X86_32
-	pop r6
-	pop r5
-	pop r4
-	pop r3
+    pop r6
+    pop r5
+    pop r4
+    pop r3
 %endif
-	ret
+    ret
 
 
 %macro SSE41_I16x16Get8WSumSub 3 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3
-	pmaddubsw    %1, xmm5
-	movdqa       %2, %1
-	pmaddwd      %1, xmm7
-	pmaddwd      %2, xmm6
-	movdqa       %3, %1
-	punpckldq    %1, %2
-	punpckhdq    %2, %3
-	movdqa       %3, %1
-	punpcklqdq   %1, %2
-	punpckhqdq   %3, %2
-	paddd        xmm4, %1 ;for dc
-	paddd        xmm4, %3 ;for dc
-	packssdw     %1, %3
-	psllw        %1, 2
+    pmaddubsw    %1, xmm5
+    movdqa       %2, %1
+    pmaddwd      %1, xmm7
+    pmaddwd      %2, xmm6
+    movdqa       %3, %1
+    punpckldq    %1, %2
+    punpckhdq    %2, %3
+    movdqa       %3, %1
+    punpcklqdq   %1, %2
+    punpckhqdq   %3, %2
+    paddd        xmm4, %1 ;for dc
+    paddd        xmm4, %3 ;for dc
+    packssdw     %1, %3
+    psllw        %1, 2
 %endmacro
 %macro SSE41_ChromaGet8WSumSub 4 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3 : %4 tempsse2
-	pmaddubsw    %1, xmm5
-	movdqa       %2, %1
-	pmaddwd      %1, xmm7
-	pmaddwd      %2, xmm6
-	movdqa       %3, %1
-	punpckldq    %1, %2
-	punpckhdq    %2, %3
-	movdqa       %3, %1
-	punpcklqdq   %1, %2
-	punpckhqdq   %3, %2
+    pmaddubsw    %1, xmm5
+    movdqa       %2, %1
+    pmaddwd      %1, xmm7
+    pmaddwd      %2, xmm6
+    movdqa       %3, %1
+    punpckldq    %1, %2
+    punpckhdq    %2, %3
+    movdqa       %3, %1
+    punpcklqdq   %1, %2
+    punpckhqdq   %3, %2
 ;    paddd        xmm4, %1 ;for dc
-;	 paddd        xmm4, %3 ;for dc
-	movdqa       %4, %1
-	punpcklqdq   %4, %3
-	packssdw     %1, %3
-	psllw        %1, 2
+;    paddd        xmm4, %3 ;for dc
+    movdqa       %4, %1
+    punpcklqdq   %4, %3
+    packssdw     %1, %3
+    psllw        %1, 2
 %endmacro
 
 %macro SSE41_GetX38x4SatdDec 0
-	pxor        xmm7,   xmm7
-	movq        xmm0,   [r2]
-	movq        xmm1,   [r2+r3]
-	lea         r2,    [r2+2*r3]
-	movq        xmm2,   [r2]
-	movq        xmm3,   [r2+r3]
-	lea         r2,    [r2+2*r3]
-	punpcklbw   xmm0,   xmm7
-	punpcklbw   xmm1,   xmm7
-	punpcklbw   xmm2,   xmm7
-	punpcklbw   xmm3,   xmm7
-	SSE2_HDMTwo4x4       xmm0,xmm1,xmm2,xmm3,xmm7
-	SSE2_TransTwo4x4W     xmm3,xmm1,xmm0,xmm2,xmm7
-	SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm7,xmm0 ;pOut xmm7,xmm1,xmm3,xmm2
-	;doesn't need another transpose
+    pxor        xmm7,   xmm7
+    movq        xmm0,   [r2]
+    movq        xmm1,   [r2+r3]
+    lea         r2,    [r2+2*r3]
+    movq        xmm2,   [r2]
+    movq        xmm3,   [r2+r3]
+    lea         r2,    [r2+2*r3]
+    punpcklbw   xmm0,   xmm7
+    punpcklbw   xmm1,   xmm7
+    punpcklbw   xmm2,   xmm7
+    punpcklbw   xmm3,   xmm7
+    SSE2_HDMTwo4x4       xmm0,xmm1,xmm2,xmm3,xmm7
+    SSE2_TransTwo4x4W     xmm3,xmm1,xmm0,xmm2,xmm7
+    SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm7,xmm0 ;pOut xmm7,xmm1,xmm3,xmm2
+    ;doesn't need another transpose
 %endmacro
 
 %macro SSE41_GetX38x4SatdV 2
-	pxor        xmm0,   xmm0
-	pinsrw      xmm0,   word[r6+%2],   0
-	pinsrw      xmm0,   word[r6+%2+8], 4
-	psubsw      xmm0,   xmm7
-	pabsw       xmm0,   xmm0
-	paddw       xmm4,   xmm0
-	pxor        xmm0,   xmm0
-	pinsrw      xmm0,   word[r6+%2+2],  0
-	pinsrw      xmm0,   word[r6+%2+10], 4
-	psubsw      xmm0,   xmm1
-	pabsw       xmm0,   xmm0
-	paddw       xmm4,   xmm0
-	pxor        xmm0,   xmm0
-	pinsrw      xmm0,   word[r6+%2+4],  0
-	pinsrw      xmm0,   word[r6+%2+12], 4
-	psubsw      xmm0,   xmm3
-	pabsw       xmm0,   xmm0
-	paddw       xmm4,   xmm0
-	pxor        xmm0,   xmm0
-	pinsrw      xmm0,   word[r6+%2+6],  0
-	pinsrw      xmm0,   word[r6+%2+14], 4
-	psubsw      xmm0,   xmm2
-	pabsw       xmm0,   xmm0
-	paddw       xmm4,   xmm0
+    pxor        xmm0,   xmm0
+    pinsrw      xmm0,   word[r6+%2],   0
+    pinsrw      xmm0,   word[r6+%2+8], 4
+    psubsw      xmm0,   xmm7
+    pabsw       xmm0,   xmm0
+    paddw       xmm4,   xmm0
+    pxor        xmm0,   xmm0
+    pinsrw      xmm0,   word[r6+%2+2],  0
+    pinsrw      xmm0,   word[r6+%2+10], 4
+    psubsw      xmm0,   xmm1
+    pabsw       xmm0,   xmm0
+    paddw       xmm4,   xmm0
+    pxor        xmm0,   xmm0
+    pinsrw      xmm0,   word[r6+%2+4],  0
+    pinsrw      xmm0,   word[r6+%2+12], 4
+    psubsw      xmm0,   xmm3
+    pabsw       xmm0,   xmm0
+    paddw       xmm4,   xmm0
+    pxor        xmm0,   xmm0
+    pinsrw      xmm0,   word[r6+%2+6],  0
+    pinsrw      xmm0,   word[r6+%2+14], 4
+    psubsw      xmm0,   xmm2
+    pabsw       xmm0,   xmm0
+    paddw       xmm4,   xmm0
 %endmacro
 %macro SSE41_GetX38x4SatdH  3
-	movq        xmm0,   [r6+%3+8*%1]
-	punpcklqdq  xmm0,   xmm0
-	psubsw      xmm0,   xmm7
-	pabsw       xmm0,   xmm0
-	paddw       xmm5,   xmm0
-	pabsw       xmm1,   xmm1
-	pabsw       xmm2,   xmm2
-	pabsw       xmm3,   xmm3
-	paddw       xmm2,   xmm1;for DC
-	paddw       xmm2,   xmm3;for DC
-	paddw       xmm5,   xmm2
+    movq        xmm0,   [r6+%3+8*%1]
+    punpcklqdq  xmm0,   xmm0
+    psubsw      xmm0,   xmm7
+    pabsw       xmm0,   xmm0
+    paddw       xmm5,   xmm0
+    pabsw       xmm1,   xmm1
+    pabsw       xmm2,   xmm2
+    pabsw       xmm3,   xmm3
+    paddw       xmm2,   xmm1;for DC
+    paddw       xmm2,   xmm3;for DC
+    paddw       xmm5,   xmm2
 %endmacro
 %macro SSE41_I16X16GetX38x4SatdDC 0
-	pxor        xmm0,   xmm0
-	movq2dq     xmm0,   mm4
-	punpcklqdq  xmm0,   xmm0
-	psubsw      xmm0,   xmm7
-	pabsw       xmm0,   xmm0
-	paddw       xmm6,   xmm0
-	paddw       xmm6,   xmm2
+    pxor        xmm0,   xmm0
+    movq2dq     xmm0,   mm4
+    punpcklqdq  xmm0,   xmm0
+    psubsw      xmm0,   xmm7
+    pabsw       xmm0,   xmm0
+    paddw       xmm6,   xmm0
+    paddw       xmm6,   xmm2
 %endmacro
 %macro SSE41_ChromaGetX38x4SatdDC 1
-	shl         %1,     4
-	movdqa      xmm0,   [r6+32+%1]
-	psubsw      xmm0,   xmm7
-	pabsw       xmm0,   xmm0
-	paddw       xmm6,   xmm0
-	paddw       xmm6,   xmm2
+    shl         %1,     4
+    movdqa      xmm0,   [r6+32+%1]
+    psubsw      xmm0,   xmm7
+    pabsw       xmm0,   xmm0
+    paddw       xmm6,   xmm0
+    paddw       xmm6,   xmm2
 %endmacro
 %macro SSE41_I16x16GetX38x4Satd 2
-	SSE41_GetX38x4SatdDec
-	SSE41_GetX38x4SatdV   %1, %2
-	SSE41_GetX38x4SatdH   %1, %2, 32
-	SSE41_I16X16GetX38x4SatdDC
+    SSE41_GetX38x4SatdDec
+    SSE41_GetX38x4SatdV   %1, %2
+    SSE41_GetX38x4SatdH   %1, %2, 32
+    SSE41_I16X16GetX38x4SatdDC
 %endmacro
 %macro SSE41_ChromaGetX38x4Satd 2
-	SSE41_GetX38x4SatdDec
-	SSE41_GetX38x4SatdV   %1, %2
-	SSE41_GetX38x4SatdH   %1, %2, 16
-	SSE41_ChromaGetX38x4SatdDC %1
+    SSE41_GetX38x4SatdDec
+    SSE41_GetX38x4SatdV   %1, %2
+    SSE41_GetX38x4SatdH   %1, %2, 16
+    SSE41_ChromaGetX38x4SatdDC %1
 %endmacro
 %macro SSE41_HSum8W 3
-	pmaddwd     %1, %2
-	movhlps     %3, %1
-	paddd       %1, %3
-	pshuflw     %3, %1,0Eh
-	paddd       %1, %3
+    pmaddwd     %1, %2
+    movhlps     %3, %1
+    paddd       %1, %3
+    pshuflw     %3, %1,0Eh
+    paddd       %1, %3
 %endmacro
 
 WELS_EXTERN WelsIntra16x16Combined3Satd_sse41
-	%assign  push_num 0
-	LOAD_7_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
-	SIGN_EXTENSION r5, r5d
+    %assign  push_num 0
+    LOAD_7_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
+    SIGN_EXTENSION r5, r5d
 
 %ifndef X86_32
-	push r12
-	mov  r12, r2
+    push r12
+    mov  r12, r2
 %endif
 
-	pxor        xmm4,   xmm4
-	movdqa      xmm5,   [HSumSubDB1]
-	movdqa      xmm6,   [HSumSubDW1]
-	movdqa      xmm7,   [PDW1]
-	sub         r0,    r1
-	movdqu		xmm0,   [r0]
-	movhlps		xmm1,   xmm0
-	punpcklqdq  xmm0,   xmm0
-	punpcklqdq  xmm1,   xmm1
-	SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
-	SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
-	movdqa      [r6],  xmm0 ;V
-	movdqa      [r6+16], xmm1
-	add         r0,    r1
-	pinsrb      xmm0,   byte[r0-1], 0
-	pinsrb      xmm0,   byte[r0+r1-1], 1
-	lea         r0,    [r0+2*r1]
-	pinsrb      xmm0,   byte[r0-1],     2
-	pinsrb      xmm0,   byte[r0+r1-1], 3
-	lea         r0,    [r0+2*r1]
-	pinsrb      xmm0,   byte[r0-1],     4
-	pinsrb      xmm0,   byte[r0+r1-1], 5
-	lea         r0,    [r0+2*r1]
-	pinsrb      xmm0,   byte[r0-1],     6
-	pinsrb      xmm0,   byte[r0+r1-1], 7
-	lea         r0,    [r0+2*r1]
-	pinsrb      xmm0,   byte[r0-1],     8
-	pinsrb      xmm0,   byte[r0+r1-1], 9
-	lea         r0,    [r0+2*r1]
-	pinsrb      xmm0,   byte[r0-1],     10
-	pinsrb      xmm0,   byte[r0+r1-1], 11
-	lea         r0,    [r0+2*r1]
-	pinsrb      xmm0,   byte[r0-1],     12
-	pinsrb      xmm0,   byte[r0+r1-1], 13
-	lea         r0,    [r0+2*r1]
-	pinsrb      xmm0,   byte[r0-1],     14
-	pinsrb      xmm0,   byte[r0+r1-1], 15
-	movhlps		xmm1,   xmm0
-	punpcklqdq  xmm0,   xmm0
-	punpcklqdq  xmm1,   xmm1
-	SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
-	SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
-	movdqa      [r6+32], xmm0 ;H
-	movdqa      [r6+48], xmm1
-	movd        r0d,    xmm4 ;dc
-	add         r0d,    16   ;(sum+16)
-	shr         r0d,    5    ;((sum+16)>>5)
-	shl         r0d,    4    ;
-	movd        mm4,    r0d  ; mm4 copy DC
-	pxor        xmm4,   xmm4 ;V
-	pxor        xmm5,   xmm5 ;H
-	pxor        xmm6,   xmm6 ;DC
+    pxor        xmm4,   xmm4
+    movdqa      xmm5,   [HSumSubDB1]
+    movdqa      xmm6,   [HSumSubDW1]
+    movdqa      xmm7,   [PDW1]
+    sub         r0,    r1
+    movdqu      xmm0,   [r0]
+    movhlps     xmm1,   xmm0
+    punpcklqdq  xmm0,   xmm0
+    punpcklqdq  xmm1,   xmm1
+    SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
+    SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
+    movdqa      [r6],  xmm0 ;V
+    movdqa      [r6+16], xmm1
+    add         r0,    r1
+    pinsrb      xmm0,   byte[r0-1], 0
+    pinsrb      xmm0,   byte[r0+r1-1], 1
+    lea         r0,    [r0+2*r1]
+    pinsrb      xmm0,   byte[r0-1],     2
+    pinsrb      xmm0,   byte[r0+r1-1], 3
+    lea         r0,    [r0+2*r1]
+    pinsrb      xmm0,   byte[r0-1],     4
+    pinsrb      xmm0,   byte[r0+r1-1], 5
+    lea         r0,    [r0+2*r1]
+    pinsrb      xmm0,   byte[r0-1],     6
+    pinsrb      xmm0,   byte[r0+r1-1], 7
+    lea         r0,    [r0+2*r1]
+    pinsrb      xmm0,   byte[r0-1],     8
+    pinsrb      xmm0,   byte[r0+r1-1], 9
+    lea         r0,    [r0+2*r1]
+    pinsrb      xmm0,   byte[r0-1],     10
+    pinsrb      xmm0,   byte[r0+r1-1], 11
+    lea         r0,    [r0+2*r1]
+    pinsrb      xmm0,   byte[r0-1],     12
+    pinsrb      xmm0,   byte[r0+r1-1], 13
+    lea         r0,    [r0+2*r1]
+    pinsrb      xmm0,   byte[r0-1],     14
+    pinsrb      xmm0,   byte[r0+r1-1], 15
+    movhlps     xmm1,   xmm0
+    punpcklqdq  xmm0,   xmm0
+    punpcklqdq  xmm1,   xmm1
+    SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
+    SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
+    movdqa      [r6+32], xmm0 ;H
+    movdqa      [r6+48], xmm1
+    movd        r0d,    xmm4 ;dc
+    add         r0d,    16   ;(sum+16)
+    shr         r0d,    5    ;((sum+16)>>5)
+    shl         r0d,    4    ;
+    movd        mm4,    r0d  ; mm4 copy DC
+    pxor        xmm4,   xmm4 ;V
+    pxor        xmm5,   xmm5 ;H
+    pxor        xmm6,   xmm6 ;DC
 %ifdef UNIX64
-	push r4
+    push r4
 %endif
-	mov         r0,    0
-	mov         r4,    0
+    mov         r0,    0
+    mov         r4,    0
 
 .loop16x16_get_satd:
 .loopStart1:
-	SSE41_I16x16GetX38x4Satd r0, r4
-	inc          r0
-	cmp         r0, 4
-	jl          .loopStart1
-	cmp         r4, 16
-	je          .loop16x16_get_satd_end
+    SSE41_I16x16GetX38x4Satd r0, r4
+    inc          r0
+    cmp         r0, 4
+    jl          .loopStart1
+    cmp         r4, 16
+    je          .loop16x16_get_satd_end
 %ifdef X86_32
-	mov r2, arg3
+    mov r2, arg3
 %else
-	mov r2, r12
+    mov r2, r12
 %endif
-	add         r2, 8
-	mov         r0, 0
-	add         r4, 16
-	jmp         .loop16x16_get_satd
+    add         r2, 8
+    mov         r0, 0
+    add         r4, 16
+    jmp         .loop16x16_get_satd
  .loop16x16_get_satd_end:
-	MMX_DW_1_2REG    xmm0, xmm1
-	psrlw       xmm4, 1 ;/2
-	psrlw       xmm5, 1 ;/2
-	psrlw       xmm6, 1 ;/2
-	SSE41_HSum8W     xmm4, xmm0, xmm1
-	SSE41_HSum8W     xmm5, xmm0, xmm1
-	SSE41_HSum8W     xmm6, xmm0, xmm1
+    MMX_DW_1_2REG    xmm0, xmm1
+    psrlw       xmm4, 1 ;/2
+    psrlw       xmm5, 1 ;/2
+    psrlw       xmm6, 1 ;/2
+    SSE41_HSum8W     xmm4, xmm0, xmm1
+    SSE41_HSum8W     xmm5, xmm0, xmm1
+    SSE41_HSum8W     xmm6, xmm0, xmm1
 
 %ifdef UNIX64
-	pop r4
+    pop r4
 %endif
-	; comparing order: DC H V
-	movd      r3d, xmm6 ;DC
-	movd      r1d, xmm5 ;H
-	movd      r0d, xmm4 ;V
+    ; comparing order: DC H V
+    movd      r3d, xmm6 ;DC
+    movd      r1d, xmm5 ;H
+    movd      r0d, xmm4 ;V
 %ifndef X86_32
-	pop r12
+    pop r12
 %endif
-	shl       r5d, 1
-	add       r1d, r5d
-	add       r3d, r5d
-	mov       r4, arg5
-	cmp       r3d, r1d
-	jge near   not_dc_16x16
-	cmp        r3d, r0d
-	jge near   not_dc_h_16x16
+    shl       r5d, 1
+    add       r1d, r5d
+    add       r3d, r5d
+    mov       r4, arg5
+    cmp       r3d, r1d
+    jge near   not_dc_16x16
+    cmp        r3d, r0d
+    jge near   not_dc_h_16x16
 
-	; for DC mode
-	mov       dword[r4], 2;I16_PRED_DC
-	mov       retrd, r3d
-	jmp near return_satd_intra_16x16_x3
+    ; for DC mode
+    mov       dword[r4], 2;I16_PRED_DC
+    mov       retrd, r3d
+    jmp near return_satd_intra_16x16_x3
 not_dc_16x16:
-	; for H mode
-	cmp       r1d, r0d
-	jge near   not_dc_h_16x16
-	mov       dword[r4], 1;I16_PRED_H
-	mov       retrd, r1d
-	jmp near return_satd_intra_16x16_x3
+    ; for H mode
+    cmp       r1d, r0d
+    jge near   not_dc_h_16x16
+    mov       dword[r4], 1;I16_PRED_H
+    mov       retrd, r1d
+    jmp near return_satd_intra_16x16_x3
 not_dc_h_16x16:
-	; for V mode
-	mov       dword[r4], 0;I16_PRED_V
-	mov       retrd, r0d
+    ; for V mode
+    mov       dword[r4], 0;I16_PRED_V
+    mov       retrd, r0d
 return_satd_intra_16x16_x3:
-	WELSEMMS
-	POP_XMM
-	LOAD_7_PARA_POP
+    WELSEMMS
+    POP_XMM
+    LOAD_7_PARA_POP
 ret
 
 %macro SSE41_ChromaGetX38x8Satd 0
-	movdqa      xmm5,   [HSumSubDB1]
-	movdqa      xmm6,   [HSumSubDW1]
-	movdqa      xmm7,   [PDW1]
-	sub         r0,    r1
-	movq		xmm0,   [r0]
-	punpcklqdq  xmm0,   xmm0
-	SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm4
-	movdqa      [r6],  xmm0 ;V
-	add         r0,    r1
-	pinsrb      xmm0,   byte[r0-1], 0
-	pinsrb      xmm0,   byte[r0+r1-1], 1
-	lea         r0,    [r0+2*r1]
-	pinsrb      xmm0,   byte[r0-1],     2
-	pinsrb      xmm0,   byte[r0+r1-1], 3
-	lea         r0,    [r0+2*r1]
-	pinsrb      xmm0,   byte[r0-1],     4
-	pinsrb      xmm0,   byte[r0+r1-1], 5
-	lea         r0,    [r0+2*r1]
-	pinsrb      xmm0,   byte[r0-1],     6
-	pinsrb      xmm0,   byte[r0+r1-1], 7
-	punpcklqdq  xmm0,   xmm0
-	SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm1
-	movdqa      [r6+16], xmm0 ;H
+    movdqa      xmm5,   [HSumSubDB1]
+    movdqa      xmm6,   [HSumSubDW1]
+    movdqa      xmm7,   [PDW1]
+    sub         r0,    r1
+    movq        xmm0,   [r0]
+    punpcklqdq  xmm0,   xmm0
+    SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm4
+    movdqa      [r6],  xmm0 ;V
+    add         r0,    r1
+    pinsrb      xmm0,   byte[r0-1], 0
+    pinsrb      xmm0,   byte[r0+r1-1], 1
+    lea         r0,    [r0+2*r1]
+    pinsrb      xmm0,   byte[r0-1],     2
+    pinsrb      xmm0,   byte[r0+r1-1], 3
+    lea         r0,    [r0+2*r1]
+    pinsrb      xmm0,   byte[r0-1],     4
+    pinsrb      xmm0,   byte[r0+r1-1], 5
+    lea         r0,    [r0+2*r1]
+    pinsrb      xmm0,   byte[r0-1],     6
+    pinsrb      xmm0,   byte[r0+r1-1], 7
+    punpcklqdq  xmm0,   xmm0
+    SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm1
+    movdqa      [r6+16], xmm0 ;H
 ;(sum+2)>>2
-	movdqa      xmm6,   [PDQ2]
-	movdqa      xmm5,   xmm4
-	punpckhqdq  xmm5,   xmm1
-	paddd       xmm5,   xmm6
-	psrld       xmm5,   2
+    movdqa      xmm6,   [PDQ2]
+    movdqa      xmm5,   xmm4
+    punpckhqdq  xmm5,   xmm1
+    paddd       xmm5,   xmm6
+    psrld       xmm5,   2
 ;(sum1+sum2+4)>>3
-	paddd       xmm6,   xmm6
-	paddd       xmm4,   xmm1
-	paddd       xmm4,   xmm6
-	psrld       xmm4,   3
+    paddd       xmm6,   xmm6
+    paddd       xmm4,   xmm1
+    paddd       xmm4,   xmm6
+    psrld       xmm4,   3
 ;satd *16
-	pslld       xmm5,   4
-	pslld       xmm4,   4
+    pslld       xmm5,   4
+    pslld       xmm4,   4
 ;temp satd
-	movdqa      xmm6,   xmm4
-	punpcklqdq  xmm4,   xmm5
-	psllq       xmm4,   32
-	psrlq       xmm4,   32
-	movdqa      [r6+32], xmm4
-	punpckhqdq  xmm5,   xmm6
-	psllq       xmm5,   32
-	psrlq       xmm5,   32
-	movdqa      [r6+48], xmm5
+    movdqa      xmm6,   xmm4
+    punpcklqdq  xmm4,   xmm5
+    psllq       xmm4,   32
+    psrlq       xmm4,   32
+    movdqa      [r6+32], xmm4
+    punpckhqdq  xmm5,   xmm6
+    psllq       xmm5,   32
+    psrlq       xmm5,   32
+    movdqa      [r6+48], xmm5
 
-	pxor        xmm4,   xmm4 ;V
-	pxor        xmm5,   xmm5 ;H
-	pxor        xmm6,   xmm6 ;DC
-	mov         r0,    0
-	SSE41_ChromaGetX38x4Satd r0, 0
-	inc             r0
-	SSE41_ChromaGetX38x4Satd r0, 0
+    pxor        xmm4,   xmm4 ;V
+    pxor        xmm5,   xmm5 ;H
+    pxor        xmm6,   xmm6 ;DC
+    mov         r0,    0
+    SSE41_ChromaGetX38x4Satd r0, 0
+    inc             r0
+    SSE41_ChromaGetX38x4Satd r0, 0
 %endmacro
 
 %macro SSEReg2MMX 3
-	movdq2q     %2, %1
-	movhlps     %1, %1
-	movdq2q     %3, %1
+    movdq2q     %2, %1
+    movhlps     %1, %1
+    movdq2q     %3, %1
 %endmacro
 %macro MMXReg2SSE 4
-	movq2dq     %1, %3
-	movq2dq     %2, %4
-	punpcklqdq  %1, %2
+    movq2dq     %1, %3
+    movq2dq     %2, %4
+    punpcklqdq  %1, %2
 %endmacro
 ;for reduce the code size of WelsIntraChroma8x8Combined3Satd_sse41
 
 WELS_EXTERN WelsIntraChroma8x8Combined3Satd_sse41
-	%assign  push_num 0
-	LOAD_7_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
-	SIGN_EXTENSION r5, r5d
+    %assign  push_num 0
+    LOAD_7_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
+    SIGN_EXTENSION r5, r5d
 loop_chroma_satdx3:
-	SSE41_ChromaGetX38x8Satd
-	SSEReg2MMX  xmm4, mm0,mm1
-	SSEReg2MMX  xmm5, mm2,mm3
-	SSEReg2MMX  xmm6, mm5,mm6
-	mov r0,     arg8
-	mov r2,     arg9
+    SSE41_ChromaGetX38x8Satd
+    SSEReg2MMX  xmm4, mm0,mm1
+    SSEReg2MMX  xmm5, mm2,mm3
+    SSEReg2MMX  xmm6, mm5,mm6
+    mov r0,     arg8
+    mov r2,     arg9
 
-	SSE41_ChromaGetX38x8Satd
+    SSE41_ChromaGetX38x8Satd
 
-	MMXReg2SSE  xmm0, xmm3, mm0, mm1
-	MMXReg2SSE  xmm1, xmm3, mm2, mm3
-	MMXReg2SSE  xmm2, xmm3, mm5, mm6
+    MMXReg2SSE  xmm0, xmm3, mm0, mm1
+    MMXReg2SSE  xmm1, xmm3, mm2, mm3
+    MMXReg2SSE  xmm2, xmm3, mm5, mm6
 
-	paddw       xmm4, xmm0
-	paddw       xmm5, xmm1
-	paddw       xmm6, xmm2
+    paddw       xmm4, xmm0
+    paddw       xmm5, xmm1
+    paddw       xmm6, xmm2
 
-	MMX_DW_1_2REG    xmm0, xmm1
-	psrlw       xmm4, 1 ;/2
-	psrlw       xmm5, 1 ;/2
-	psrlw       xmm6, 1 ;/2
-	SSE41_HSum8W     xmm4, xmm0, xmm1
-	SSE41_HSum8W     xmm5, xmm0, xmm1
-	SSE41_HSum8W     xmm6, xmm0, xmm1
-	; comparing order: DC H V
-	movd      r3d, xmm6 ;DC
-	movd      r1d, xmm5 ;H
-	movd      r0d, xmm4 ;V
+    MMX_DW_1_2REG    xmm0, xmm1
+    psrlw       xmm4, 1 ;/2
+    psrlw       xmm5, 1 ;/2
+    psrlw       xmm6, 1 ;/2
+    SSE41_HSum8W     xmm4, xmm0, xmm1
+    SSE41_HSum8W     xmm5, xmm0, xmm1
+    SSE41_HSum8W     xmm6, xmm0, xmm1
+    ; comparing order: DC H V
+    movd      r3d, xmm6 ;DC
+    movd      r1d, xmm5 ;H
+    movd      r0d, xmm4 ;V
 
 
-	shl       r5d, 1
-	add       r1d, r5d
-	add       r0d, r5d
-	cmp       r3d, r1d
-	jge near   not_dc_8x8
-	cmp        r3d, r0d
-	jge near   not_dc_h_8x8
+    shl       r5d, 1
+    add       r1d, r5d
+    add       r0d, r5d
+    cmp       r3d, r1d
+    jge near   not_dc_8x8
+    cmp        r3d, r0d
+    jge near   not_dc_h_8x8
 
-	; for DC mode
-	mov       dword[r4], 0;I8_PRED_DC
-	mov       retrd, r3d
-	jmp near return_satd_intra_8x8_x3
+    ; for DC mode
+    mov       dword[r4], 0;I8_PRED_DC
+    mov       retrd, r3d
+    jmp near return_satd_intra_8x8_x3
 not_dc_8x8:
-	; for H mode
-	cmp       r1d, r0d
-	jge near   not_dc_h_8x8
-	mov       dword[r4], 1;I8_PRED_H
-	mov       retrd, r1d
-	jmp near return_satd_intra_8x8_x3
+    ; for H mode
+    cmp       r1d, r0d
+    jge near   not_dc_h_8x8
+    mov       dword[r4], 1;I8_PRED_H
+    mov       retrd, r1d
+    jmp near return_satd_intra_8x8_x3
 not_dc_h_8x8:
-	; for V mode
-	mov       dword[r4], 2;I8_PRED_V
-	mov       retrd, r0d
+    ; for V mode
+    mov       dword[r4], 2;I8_PRED_V
+    mov       retrd, r0d
 return_satd_intra_8x8_x3:
-	WELSEMMS
-	POP_XMM
-	LOAD_7_PARA_POP
+    WELSEMMS
+    POP_XMM
+    LOAD_7_PARA_POP
 ret
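
The tail of both Combined3 SATD routines above is a three-way cost comparison in the order DC, H, V, with a 2*lambda bias added before comparing (to H and DC in the 16x16 luma case shown here; the chroma routine biases H and V instead and uses the I8_PRED_* numbering 0/1/2 for DC/H/V). A sketch of that decision, with assumed input names and the prediction-block write-out omitted:

    #include <stdint.h>

    /* Mode pick matching the jg/jge chains above; ties go to the later mode. */
    static int32_t PickIntra16x16Mode (int32_t iSatdDc, int32_t iSatdH,
                                       int32_t iSatdV, int32_t iLambda,
                                       int32_t* pBestMode) {
        int32_t iCostDc = iSatdDc + 2 * iLambda; /* add r3d, r5d after shl r5d, 1 */
        int32_t iCostH  = iSatdH  + 2 * iLambda; /* add r1d, r5d */
        int32_t iCostV  = iSatdV;                /* no bias on V in the 16x16 case */
        if (iCostDc < iCostH && iCostDc < iCostV) {
            *pBestMode = 2; /* I16_PRED_DC */ return iCostDc;
        }
        if (iCostH < iCostV) {
            *pBestMode = 1; /* I16_PRED_H */  return iCostH;
        }
        *pBestMode = 0;     /* I16_PRED_V */  return iCostV;
    }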
 
 
@@ -1040,22 +1040,22 @@
 ;
 ;***********************************************************************
 %macro SSSE3_Get16BSadHVDC 2
-  movd        xmm6,%1
-  pshufb      xmm6,xmm1
-  movdqa      %1,  xmm6
-  movdqa      xmm0,%2
-  psadbw      xmm0,xmm7
-  paddw       xmm4,xmm0
-  movdqa      xmm0,%2
-  psadbw      xmm0,xmm5
-  paddw       xmm2,xmm0
-  psadbw      xmm6,%2
-  paddw       xmm3,xmm6
+    movd        xmm6,%1
+    pshufb      xmm6,xmm1
+    movdqa      %1,  xmm6
+    movdqa      xmm0,%2
+    psadbw      xmm0,xmm7
+    paddw       xmm4,xmm0
+    movdqa      xmm0,%2
+    psadbw      xmm0,xmm5
+    paddw       xmm2,xmm0
+    psadbw      xmm6,%2
+    paddw       xmm3,xmm6
 %endmacro
 %macro WelsAddDCValue 4
-  movzx   %2, byte %1
-  mov    %3, %2
-  add     %4, %2
+    movzx   %2, byte %1
+    mov    %3, %2
+    add     %4, %2
 %endmacro
 
 ;***********************************************************************
@@ -1064,138 +1064,138 @@
 ;
 ;***********************************************************************
 WELS_EXTERN WelsIntra16x16Combined3Sad_ssse3
-	%assign  push_num 0
-	LOAD_7_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
-	SIGN_EXTENSION r5, r5d
+    %assign  push_num 0
+    LOAD_7_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
+    SIGN_EXTENSION r5, r5d
 
-	push  r5
-	push  r4
-	push  r3
+    push  r5
+    push  r4
+    push  r3
 
-	sub    r0,    r1
-	movdqa      xmm5,[r0]
-	pxor        xmm0,xmm0
-	psadbw      xmm0,xmm5
-	movhlps     xmm1,xmm0
-	paddw       xmm0,xmm1
-	movd        r5d, xmm0
+    sub    r0,    r1
+    movdqa      xmm5,[r0]
+    pxor        xmm0,xmm0
+    psadbw      xmm0,xmm5
+    movhlps     xmm1,xmm0
+    paddw       xmm0,xmm1
+    movd        r5d, xmm0
 
-	add         r0,r1
-	lea         r3,[r1+2*r1]    ;ebx r3
-	WelsAddDCValue [r0-1     ], r4d, [r6   ], r5d    ; esi r4d, eax r5d
-	WelsAddDCValue [r0-1+r1  ], r4d, [r6+16], r5d
-	WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
-	WelsAddDCValue [r0-1+r3  ], r4d, [r6+48], r5d
-	lea         r0, [r0+4*r1]
-	add         r6, 64
-	WelsAddDCValue [r0-1     ], r4d, [r6   ], r5d
-	WelsAddDCValue [r0-1+r1  ], r4d, [r6+16], r5d
-	WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
-	WelsAddDCValue [r0-1+r3  ], r4d, [r6+48], r5d
-	lea         r0, [r0+4*r1]
-	add         r6, 64
-	WelsAddDCValue [r0-1     ], r4d, [r6   ], r5d
-	WelsAddDCValue [r0-1+r1  ], r4d, [r6+16], r5d
-	WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
-	WelsAddDCValue [r0-1+r3  ], r4d, [r6+48], r5d
-	lea         r0, [r0+4*r1]
-	add         r6, 64
-	WelsAddDCValue [r0-1     ], r4d, [r6   ], r5d
-	WelsAddDCValue [r0-1+r1  ], r4d, [r6+16], r5d
-	WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
-	WelsAddDCValue [r0-1+r3  ], r4d, [r6+48], r5d
-	sub         r6, 192
-	add         r5d,10h
-	shr         r5d,5
-	movd        xmm7,r5d
-	pxor        xmm1,xmm1
-	pshufb      xmm7,xmm1
-	pxor        xmm4,xmm4
-	pxor        xmm3,xmm3
-	pxor        xmm2,xmm2
-	;sad begin
-	pop   r3
-	lea         r4, [r3+2*r3] ;esi r4
-	SSSE3_Get16BSadHVDC [r6], [r2]
-	SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
-	SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
-	SSSE3_Get16BSadHVDC [r6+48], [r2+r4]
-	add         r6, 64
-	lea         r2, [r2+4*r3]
-	SSSE3_Get16BSadHVDC [r6], [r2]
-	SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
-	SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
-	SSSE3_Get16BSadHVDC [r6+48], [r2+r4]
-	add         r6, 64
-	lea         r2, [r2+4*r3]
-	SSSE3_Get16BSadHVDC [r6], [r2]
-	SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
-	SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
-	SSSE3_Get16BSadHVDC [r6+48], [r2+r4]
-	add         r6, 64
-	lea         r2, [r2+4*r3]
-	SSSE3_Get16BSadHVDC [r6], [r2]
-	SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
-	SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
-	SSSE3_Get16BSadHVDC [r6+48], [r2+r4]
+    add         r0,r1
+    lea         r3,[r1+2*r1]    ;ebx r3
+    WelsAddDCValue [r0-1     ], r4d, [r6   ], r5d    ; esi r4d, eax r5d
+    WelsAddDCValue [r0-1+r1  ], r4d, [r6+16], r5d
+    WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
+    WelsAddDCValue [r0-1+r3  ], r4d, [r6+48], r5d
+    lea         r0, [r0+4*r1]
+    add         r6, 64
+    WelsAddDCValue [r0-1     ], r4d, [r6   ], r5d
+    WelsAddDCValue [r0-1+r1  ], r4d, [r6+16], r5d
+    WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
+    WelsAddDCValue [r0-1+r3  ], r4d, [r6+48], r5d
+    lea         r0, [r0+4*r1]
+    add         r6, 64
+    WelsAddDCValue [r0-1     ], r4d, [r6   ], r5d
+    WelsAddDCValue [r0-1+r1  ], r4d, [r6+16], r5d
+    WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
+    WelsAddDCValue [r0-1+r3  ], r4d, [r6+48], r5d
+    lea         r0, [r0+4*r1]
+    add         r6, 64
+    WelsAddDCValue [r0-1     ], r4d, [r6   ], r5d
+    WelsAddDCValue [r0-1+r1  ], r4d, [r6+16], r5d
+    WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
+    WelsAddDCValue [r0-1+r3  ], r4d, [r6+48], r5d
+    sub         r6, 192
+    add         r5d,10h
+    shr         r5d,5
+    movd        xmm7,r5d
+    pxor        xmm1,xmm1
+    pshufb      xmm7,xmm1
+    pxor        xmm4,xmm4
+    pxor        xmm3,xmm3
+    pxor        xmm2,xmm2
+    ;sad begin
+    pop   r3
+    lea         r4, [r3+2*r3] ;esi r4
+    SSSE3_Get16BSadHVDC [r6], [r2]
+    SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
+    SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
+    SSSE3_Get16BSadHVDC [r6+48], [r2+r4]
+    add         r6, 64
+    lea         r2, [r2+4*r3]
+    SSSE3_Get16BSadHVDC [r6], [r2]
+    SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
+    SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
+    SSSE3_Get16BSadHVDC [r6+48], [r2+r4]
+    add         r6, 64
+    lea         r2, [r2+4*r3]
+    SSSE3_Get16BSadHVDC [r6], [r2]
+    SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
+    SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
+    SSSE3_Get16BSadHVDC [r6+48], [r2+r4]
+    add         r6, 64
+    lea         r2, [r2+4*r3]
+    SSSE3_Get16BSadHVDC [r6], [r2]
+    SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
+    SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
+    SSSE3_Get16BSadHVDC [r6+48], [r2+r4]
 
-	pop r4
-	pop r5
-	pslldq      xmm3,4
-	por         xmm3,xmm2
-	movhlps     xmm1,xmm3
-	paddw       xmm3,xmm1
-	movhlps     xmm0,xmm4
-	paddw       xmm4,xmm0
-	; comparing order: DC H V
-	movd        r1d, xmm4 ;DC   ;ebx r1d
-	movd        r0d, xmm3 ;V    ;ecx r0d
-	psrldq      xmm3, 4
-	movd        r2d, xmm3 ;H    ;esi r2d
+    pop r4
+    pop r5
+    pslldq      xmm3,4
+    por         xmm3,xmm2
+    movhlps     xmm1,xmm3
+    paddw       xmm3,xmm1
+    movhlps     xmm0,xmm4
+    paddw       xmm4,xmm0
+    ; comparing order: DC H V
+    movd        r1d, xmm4 ;DC   ;ebx r1d
+    movd        r0d, xmm3 ;V    ;ecx r0d
+    psrldq      xmm3, 4
+    movd        r2d, xmm3 ;H    ;esi r2d
 
-	;mov         eax, [esp+36] ;lamda ;eax r5
-	shl         r5d, 1
-	add         r2d, r5d
-	add         r1d, r5d
-	;mov         edx, [esp+32]  ;edx r4
-	cmp         r1d, r2d
-	jge near   not_dc_16x16_sad
-	cmp        r1d, r0d
-	jge near   not_dc_h_16x16_sad
-	; for DC mode
-	mov       dword[r4], 2;I16_PRED_DC
-	mov       retrd, r1d
-	sub        r6, 192
+    ;mov         eax, [esp+36] ;lamda ;eax r5
+    shl         r5d, 1
+    add         r2d, r5d
+    add         r1d, r5d
+    ;mov         edx, [esp+32]  ;edx r4
+    cmp         r1d, r2d
+    jge near   not_dc_16x16_sad
+    cmp        r1d, r0d
+    jge near   not_dc_h_16x16_sad
+    ; for DC mode
+    mov       dword[r4], 2;I16_PRED_DC
+    mov       retrd, r1d
+    sub        r6, 192
 %assign x 0
 %rep 16
-	movdqa    [r6+16*x], xmm7
+    movdqa    [r6+16*x], xmm7
 %assign x x+1
 %endrep
-	jmp near return_sad_intra_16x16_x3
+    jmp near return_sad_intra_16x16_x3
 not_dc_16x16_sad:
-	; for H mode
-	cmp       r2d, r0d
-	jge near   not_dc_h_16x16_sad
-	mov       dword[r4], 1;I16_PRED_H
-	mov       retrd, r2d
-	jmp near return_sad_intra_16x16_x3
+    ; for H mode
+    cmp       r2d, r0d
+    jge near   not_dc_h_16x16_sad
+    mov       dword[r4], 1;I16_PRED_H
+    mov       retrd, r2d
+    jmp near return_sad_intra_16x16_x3
 not_dc_h_16x16_sad:
-	; for V mode
-	mov       dword[r4], 0;I16_PRED_V
-	mov       retrd, r0d
-	sub       r6, 192
+    ; for V mode
+    mov       dword[r4], 0;I16_PRED_V
+    mov       retrd, r0d
+    sub       r6, 192
 %assign x 0
 %rep 16
-	movdqa    [r6+16*x], xmm5
+    movdqa    [r6+16*x], xmm5
 %assign x x+1
 %endrep
 return_sad_intra_16x16_x3:
-	POP_XMM
-	LOAD_7_PARA_POP
-	ret
+    POP_XMM
+    LOAD_7_PARA_POP
+    ret
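
The SAD-based routine above derives its DC predictor from the 32 neighbouring samples: psadbw sums the 16 top neighbours, WelsAddDCValue accumulates the 16 left neighbours, and the rounded average is broadcast when DC wins. A scalar sketch, with assumed pointer names:

    #include <stdint.h>

    /* DC predictor value: average of the 16 top and 16 left neighbours. */
    static uint8_t Intra16x16DcVal (const uint8_t* pTop, const uint8_t* pLeft) {
        uint32_t uiSum = 0;
        for (int i = 0; i < 16; i++)
            uiSum += pTop[i] + pLeft[i];
        return (uint8_t) ((uiSum + 16) >> 5); /* add r5d, 10h ; shr r5d, 5 */
    }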
 
 ;***********************************************************************
 ;
@@ -1210,63 +1210,63 @@
 
 ;SSE4.1
 %macro SSE41_GetSatd8x4 0
-	movq             xmm0, [r0]
-	punpcklqdq       xmm0, xmm0
-	pmaddubsw        xmm0, xmm7
-	movq             xmm1, [r0+r1]
-	punpcklqdq       xmm1, xmm1
-	pmaddubsw        xmm1, xmm7
-	movq             xmm2, [r2]
-	punpcklqdq       xmm2, xmm2
-	pmaddubsw        xmm2, xmm7
-	movq             xmm3, [r2+r3]
-	punpcklqdq       xmm3, xmm3
-	pmaddubsw        xmm3, xmm7
-	psubsw           xmm0, xmm2
-	psubsw           xmm1, xmm3
-	movq             xmm2, [r0+2*r1]
-	punpcklqdq       xmm2, xmm2
-	pmaddubsw        xmm2, xmm7
-	movq             xmm3, [r0+r4]
-	punpcklqdq       xmm3, xmm3
-	pmaddubsw        xmm3, xmm7
-	movq             xmm4, [r2+2*r3]
-	punpcklqdq       xmm4, xmm4
-	pmaddubsw        xmm4, xmm7
-	movq             xmm5, [r2+r5]
-	punpcklqdq       xmm5, xmm5
-	pmaddubsw        xmm5, xmm7
-	psubsw           xmm2, xmm4
-	psubsw           xmm3, xmm5
-	SSE2_HDMTwo4x4   xmm0, xmm1, xmm2, xmm3, xmm4
-	pabsw            xmm0, xmm0
-	pabsw            xmm2, xmm2
-	pabsw            xmm1, xmm1
-	pabsw            xmm3, xmm3
-	movdqa           xmm4, xmm3
-	pblendw          xmm3, xmm1, 0xAA
-	pslld            xmm1, 16
-	psrld            xmm4, 16
-	por              xmm1, xmm4
-	pmaxuw           xmm1, xmm3
-	paddw            xmm6, xmm1
-	movdqa           xmm4, xmm0
-	pblendw          xmm0, xmm2, 0xAA
-	pslld            xmm2, 16
-	psrld            xmm4, 16
-	por              xmm2, xmm4
-	pmaxuw           xmm0, xmm2
-	paddw            xmm6, xmm0
+    movq             xmm0, [r0]
+    punpcklqdq       xmm0, xmm0
+    pmaddubsw        xmm0, xmm7
+    movq             xmm1, [r0+r1]
+    punpcklqdq       xmm1, xmm1
+    pmaddubsw        xmm1, xmm7
+    movq             xmm2, [r2]
+    punpcklqdq       xmm2, xmm2
+    pmaddubsw        xmm2, xmm7
+    movq             xmm3, [r2+r3]
+    punpcklqdq       xmm3, xmm3
+    pmaddubsw        xmm3, xmm7
+    psubsw           xmm0, xmm2
+    psubsw           xmm1, xmm3
+    movq             xmm2, [r0+2*r1]
+    punpcklqdq       xmm2, xmm2
+    pmaddubsw        xmm2, xmm7
+    movq             xmm3, [r0+r4]
+    punpcklqdq       xmm3, xmm3
+    pmaddubsw        xmm3, xmm7
+    movq             xmm4, [r2+2*r3]
+    punpcklqdq       xmm4, xmm4
+    pmaddubsw        xmm4, xmm7
+    movq             xmm5, [r2+r5]
+    punpcklqdq       xmm5, xmm5
+    pmaddubsw        xmm5, xmm7
+    psubsw           xmm2, xmm4
+    psubsw           xmm3, xmm5
+    SSE2_HDMTwo4x4   xmm0, xmm1, xmm2, xmm3, xmm4
+    pabsw            xmm0, xmm0
+    pabsw            xmm2, xmm2
+    pabsw            xmm1, xmm1
+    pabsw            xmm3, xmm3
+    movdqa           xmm4, xmm3
+    pblendw          xmm3, xmm1, 0xAA
+    pslld            xmm1, 16
+    psrld            xmm4, 16
+    por              xmm1, xmm4
+    pmaxuw           xmm1, xmm3
+    paddw            xmm6, xmm1
+    movdqa           xmm4, xmm0
+    pblendw          xmm0, xmm2, 0xAA
+    pslld            xmm2, 16
+    psrld            xmm4, 16
+    por              xmm2, xmm4
+    pmaxuw           xmm0, xmm2
+    paddw            xmm6, xmm0
 %endmacro
 
 %macro SSSE3_SumWHorizon 4 ;eax, srcSSE, tempSSE, tempSSE
-	MMX_DW_1_2REG    %3, %4
-	pmaddwd     %2, %3
-	movhlps     %4, %2
-	paddd       %2, %4
-	pshuflw     %4, %2,0Eh
-	paddd       %2, %4
-	movd		%1, %2
+    MMX_DW_1_2REG    %3, %4
+    pmaddwd     %2, %3
+    movhlps     %4, %2
+    paddd       %2, %4
+    pshuflw     %4, %2,0Eh
+    paddd       %2, %4
+    movd        %1, %2
 %endmacro
 ;***********************************************************************
 ;
@@ -1274,53 +1274,53 @@
 ;
 ;***********************************************************************
 WELS_EXTERN WelsSampleSatd4x4_sse41
-	%assign  push_num 0
-	LOAD_4_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
-	movdqa      xmm4,[HSwapSumSubDB1]
-	movd        xmm2,[r2]
-	movd        xmm5,[r2+r3]
-	shufps      xmm2,xmm5,0
-	movd        xmm3,[r2+r3*2]
-	lea         r2, [r3*2+r2]
-	movd        xmm5,[r2+r3]
-	shufps      xmm3,xmm5,0
-	movd        xmm0,[r0]
-	movd        xmm5,[r0+r1]
-	shufps      xmm0,xmm5,0
-	movd        xmm1,[r0+r1*2]
-	lea         r0, [r1*2+r0]
-	movd        xmm5,[r0+r1]
-	shufps      xmm1,xmm5,0
-	pmaddubsw   xmm0,xmm4
-	pmaddubsw   xmm1,xmm4
-	pmaddubsw   xmm2,xmm4
-	pmaddubsw   xmm3,xmm4
-	psubw       xmm0,xmm2
-	psubw       xmm1,xmm3
-	movdqa      xmm2,xmm0
-	paddw       xmm0,xmm1
-	psubw       xmm1,xmm2
-	movdqa      xmm2,xmm0
-	punpcklqdq  xmm0,xmm1
-	punpckhqdq  xmm2,xmm1
-	movdqa      xmm1,xmm0
-	paddw       xmm0,xmm2
-	psubw       xmm2,xmm1
-	movdqa      xmm1,xmm0
-	pblendw     xmm0,xmm2,0AAh
-	pslld       xmm2,16
-	psrld       xmm1,16
-	por         xmm2,xmm1
-	pabsw       xmm0,xmm0
-	pabsw       xmm2,xmm2
-	pmaxsw      xmm0,xmm2
-	SSSE3_SumWHorizon retrd, xmm0, xmm5, xmm7
-	POP_XMM
-	LOAD_4_PARA_POP
-	ret
+    %assign  push_num 0
+    LOAD_4_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
+    movdqa      xmm4,[HSwapSumSubDB1]
+    movd        xmm2,[r2]
+    movd        xmm5,[r2+r3]
+    shufps      xmm2,xmm5,0
+    movd        xmm3,[r2+r3*2]
+    lea         r2, [r3*2+r2]
+    movd        xmm5,[r2+r3]
+    shufps      xmm3,xmm5,0
+    movd        xmm0,[r0]
+    movd        xmm5,[r0+r1]
+    shufps      xmm0,xmm5,0
+    movd        xmm1,[r0+r1*2]
+    lea         r0, [r1*2+r0]
+    movd        xmm5,[r0+r1]
+    shufps      xmm1,xmm5,0
+    pmaddubsw   xmm0,xmm4
+    pmaddubsw   xmm1,xmm4
+    pmaddubsw   xmm2,xmm4
+    pmaddubsw   xmm3,xmm4
+    psubw       xmm0,xmm2
+    psubw       xmm1,xmm3
+    movdqa      xmm2,xmm0
+    paddw       xmm0,xmm1
+    psubw       xmm1,xmm2
+    movdqa      xmm2,xmm0
+    punpcklqdq  xmm0,xmm1
+    punpckhqdq  xmm2,xmm1
+    movdqa      xmm1,xmm0
+    paddw       xmm0,xmm2
+    psubw       xmm2,xmm1
+    movdqa      xmm1,xmm0
+    pblendw     xmm0,xmm2,0AAh
+    pslld       xmm2,16
+    psrld       xmm1,16
+    por         xmm2,xmm1
+    pabsw       xmm0,xmm0
+    pabsw       xmm2,xmm2
+    pmaxsw      xmm0,xmm2
+    SSSE3_SumWHorizon retrd, xmm0, xmm5, xmm7
+    POP_XMM
+    LOAD_4_PARA_POP
+    ret
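
The SSE4.1 SATD path above leans on pmaddubsw with the HSumSubDB1 / HSwapSumSubDB1 constants (defined elsewhere in this file) to fuse byte-to-word widening with the first Hadamard butterfly: with the signed operand holding pairs of (+1,+1) or (+1,-1), every output word is a pixel-pair sum or difference. A scalar model of one pmaddubsw output lane:

    #include <stdint.h>

    /* One 16-bit lane of PMADDUBSW: unsigned * signed, pairwise add, saturate. */
    static int16_t MaddUbSw (uint8_t u0, int8_t s0, uint8_t u1, int8_t s1) {
        int32_t r = (int32_t) u0 * s0 + (int32_t) u1 * s1;
        if (r >  32767) r =  32767;
        if (r < -32768) r = -32768;
        return (int16_t) r;
    }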
 
 ;***********************************************************************
 ;
@@ -1329,30 +1329,30 @@
 ;***********************************************************************
 WELS_EXTERN WelsSampleSatd8x8_sse41
 %ifdef X86_32
-	push  r4
-	push  r5
+    push  r4
+    push  r5
 %endif
-	%assign  push_num 2
-	LOAD_4_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
-	movdqa      xmm7, [HSumSubDB1]
-	lea         r4,  [r1+r1*2]
-	lea         r5,  [r3+r3*2]
-	pxor		xmm6, xmm6
-	SSE41_GetSatd8x4
-	lea			r0,	 [r0+4*r1]
-	lea			r2,  [r2+4*r3]
-	SSE41_GetSatd8x4
-	SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
-	POP_XMM
-	LOAD_4_PARA_POP
+    %assign  push_num 2
+    LOAD_4_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
+    movdqa      xmm7, [HSumSubDB1]
+    lea         r4,  [r1+r1*2]
+    lea         r5,  [r3+r3*2]
+    pxor        xmm6, xmm6
+    SSE41_GetSatd8x4
+    lea         r0,  [r0+4*r1]
+    lea         r2,  [r2+4*r3]
+    SSE41_GetSatd8x4
+    SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
+    POP_XMM
+    LOAD_4_PARA_POP
 %ifdef X86_32
-	pop  r5
-	pop  r4
+    pop  r5
+    pop  r4
 %endif
-	ret
+    ret
 
 ;***********************************************************************
 ;
@@ -1361,36 +1361,36 @@
 ;***********************************************************************
 WELS_EXTERN WelsSampleSatd8x16_sse41
 %ifdef X86_32
-	push  r4
-	push  r5
-	push  r6
+    push  r4
+    push  r5
+    push  r6
 %endif
-	%assign  push_num 3
-	LOAD_4_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
-	movdqa      xmm7, [HSumSubDB1]
-	lea         r4,  [r1+r1*2]
-	lea         r5,  [r3+r3*2]
-	pxor        xmm6, xmm6
-	mov         r6,    0
+    %assign  push_num 3
+    LOAD_4_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
+    movdqa      xmm7, [HSumSubDB1]
+    lea         r4,  [r1+r1*2]
+    lea         r5,  [r3+r3*2]
+    pxor        xmm6, xmm6
+    mov         r6,    0
 loop_get_satd_8x16:
-	SSE41_GetSatd8x4
-	lea			r0,  [r0+4*r1]
-	lea			r2,  [r2+4*r3]
-	inc         r6
-	cmp         r6,  4
-	jl          loop_get_satd_8x16
-	SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
-	POP_XMM
-	LOAD_4_PARA_POP
+    SSE41_GetSatd8x4
+    lea         r0,  [r0+4*r1]
+    lea         r2,  [r2+4*r3]
+    inc         r6
+    cmp         r6,  4
+    jl          loop_get_satd_8x16
+    SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
+    POP_XMM
+    LOAD_4_PARA_POP
 %ifdef X86_32
-	pop  r6
-	pop  r5
-	pop  r4
+    pop  r6
+    pop  r5
+    pop  r4
 %endif
-	ret
+    ret
 
 ;***********************************************************************
 ;
@@ -1399,42 +1399,42 @@
 ;***********************************************************************
 WELS_EXTERN WelsSampleSatd16x8_sse41
 %ifdef X86_32
-	push  r4
-	push  r5
+    push  r4
+    push  r5
 %endif
-	%assign  push_num 2
-	LOAD_4_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
-	push  r0
-	push  r2
+    %assign  push_num 2
+    LOAD_4_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
+    push  r0
+    push  r2
 
-	movdqa      xmm7, [HSumSubDB1]
-	lea         r4,  [r1+r1*2]
-	lea         r5,  [r3+r3*2]
-	pxor		xmm6,   xmm6
-	SSE41_GetSatd8x4
-	lea			r0,  [r0+4*r1]
-	lea			r2,  [r2+4*r3]
-	SSE41_GetSatd8x4
+    movdqa      xmm7, [HSumSubDB1]
+    lea         r4,  [r1+r1*2]
+    lea         r5,  [r3+r3*2]
+    pxor        xmm6,   xmm6
+    SSE41_GetSatd8x4
+    lea         r0,  [r0+4*r1]
+    lea         r2,  [r2+4*r3]
+    SSE41_GetSatd8x4
 
-	pop  r2
-	pop  r0
-	add			r0,    8
-	add			r2,    8
-	SSE41_GetSatd8x4
-	lea			r0,  [r0+4*r1]
-	lea			r2,  [r2+4*r3]
-	SSE41_GetSatd8x4
-	SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
-	POP_XMM
-	LOAD_4_PARA_POP
+    pop  r2
+    pop  r0
+    add         r0,    8
+    add         r2,    8
+    SSE41_GetSatd8x4
+    lea         r0,  [r0+4*r1]
+    lea         r2,  [r2+4*r3]
+    SSE41_GetSatd8x4
+    SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
+    POP_XMM
+    LOAD_4_PARA_POP
 %ifdef X86_32
-	pop  r5
-	pop  r4
+    pop  r5
+    pop  r4
 %endif
-	ret
+    ret
 
 ;***********************************************************************
 ;
@@ -1444,53 +1444,53 @@
 
 WELS_EXTERN WelsSampleSatd16x16_sse41
 %ifdef X86_32
-	push  r4
-	push  r5
-	push  r6
+    push  r4
+    push  r5
+    push  r6
 %endif
-	%assign  push_num 3
-	LOAD_4_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
+    %assign  push_num 3
+    LOAD_4_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
 
-	push  r0
-	push  r2
+    push  r0
+    push  r2
 
-	movdqa      xmm7, [HSumSubDB1]
-	lea         r4,  [r1+r1*2]
-	lea         r5,  [r3+r3*2]
-	pxor		xmm6,   xmm6
-	mov         r6,    0
+    movdqa      xmm7, [HSumSubDB1]
+    lea         r4,  [r1+r1*2]
+    lea         r5,  [r3+r3*2]
+    pxor        xmm6,   xmm6
+    mov         r6,    0
 loop_get_satd_16x16_left:
-	SSE41_GetSatd8x4
-	lea			r0,  [r0+4*r1]
-	lea			r2,  [r2+4*r3]
-	inc         r6
-	cmp         r6,  4
-	jl          loop_get_satd_16x16_left
+    SSE41_GetSatd8x4
+    lea         r0,  [r0+4*r1]
+    lea         r2,  [r2+4*r3]
+    inc         r6
+    cmp         r6,  4
+    jl          loop_get_satd_16x16_left
 
-	pop  r2
-	pop  r0
-	add			r0,    8
-	add			r2,    8
-	mov         r6,    0
+    pop  r2
+    pop  r0
+    add         r0,    8
+    add         r2,    8
+    mov         r6,    0
 loop_get_satd_16x16_right:
-	SSE41_GetSatd8x4
-	lea			r0,  [r0+4*r1]
-	lea			r2,  [r2+4*r3]
-	inc         r6
-	cmp         r6,  4
-	jl          loop_get_satd_16x16_right
-	SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
-	POP_XMM
-	LOAD_4_PARA_POP
+    SSE41_GetSatd8x4
+    lea         r0,  [r0+4*r1]
+    lea         r2,  [r2+4*r3]
+    inc         r6
+    cmp         r6,  4
+    jl          loop_get_satd_16x16_right
+    SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
+    POP_XMM
+    LOAD_4_PARA_POP
 %ifdef X86_32
-	pop  r6
-	pop  r5
-	pop  r4
+    pop  r6
+    pop  r5
+    pop  r4
 %endif
-	ret
+    ret
 
 ;***********************************************************************
 ;
@@ -1505,55 +1505,55 @@
 ;***********************************************************************
 
 %macro SSE2_GetSad2x16 0
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqu xmm1,   [r2]
-	MOVDQ  xmm2,   [r0];[eax] must aligned 16
-	psadbw xmm1,   xmm2
-	paddw  xmm0,   xmm1
-	movdqu xmm1,   [r2+r3]
-	MOVDQ  xmm2,   [r0+r1]
-	psadbw xmm1,   xmm2
-	paddw  xmm0,   xmm1
+    lea    r0,    [r0+2*r1]
+    lea    r2,    [r2+2*r3]
+    movdqu xmm1,   [r2]
+    MOVDQ  xmm2,   [r0];[eax] must aligned 16
+    psadbw xmm1,   xmm2
+    paddw  xmm0,   xmm1
+    movdqu xmm1,   [r2+r3]
+    MOVDQ  xmm2,   [r0+r1]
+    psadbw xmm1,   xmm2
+    paddw  xmm0,   xmm1
 %endmacro
 
 
 %macro SSE2_GetSad4x16 0
-	movdqu xmm0,   [r2]
-	MOVDQ  xmm2,   [r0]
-	psadbw xmm0,   xmm2
-	paddw  xmm7,   xmm0
-	movdqu xmm1,   [r2+r3]
-	MOVDQ  xmm2,   [r0+r1]
-	psadbw xmm1,   xmm2
-	paddw  xmm7,   xmm1
-	movdqu xmm1,   [r2+2*r3]
-	MOVDQ  xmm2,   [r0+2*r1];[eax] must aligned 16
-	psadbw xmm1,   xmm2
-	paddw  xmm7,   xmm1
-	movdqu xmm1,   [r2+r5]
-	MOVDQ  xmm2,   [r0+r4]
-	psadbw xmm1,   xmm2
-	paddw  xmm7,   xmm1
+    movdqu xmm0,   [r2]
+    MOVDQ  xmm2,   [r0]
+    psadbw xmm0,   xmm2
+    paddw  xmm7,   xmm0
+    movdqu xmm1,   [r2+r3]
+    MOVDQ  xmm2,   [r0+r1]
+    psadbw xmm1,   xmm2
+    paddw  xmm7,   xmm1
+    movdqu xmm1,   [r2+2*r3]
+    MOVDQ  xmm2,   [r0+2*r1];[eax] must aligned 16
+    psadbw xmm1,   xmm2
+    paddw  xmm7,   xmm1
+    movdqu xmm1,   [r2+r5]
+    MOVDQ  xmm2,   [r0+r4]
+    psadbw xmm1,   xmm2
+    paddw  xmm7,   xmm1
 %endmacro
 
 
 %macro SSE2_GetSad8x4 0
-	movq   xmm0,   [r0]
-	movq   xmm1,   [r0+r1]
-	lea    r0,     [r0+2*r1]
-	movhps xmm0,   [r0]
-	movhps xmm1,   [r0+r1]
+    movq   xmm0,   [r0]
+    movq   xmm1,   [r0+r1]
+    lea    r0,     [r0+2*r1]
+    movhps xmm0,   [r0]
+    movhps xmm1,   [r0+r1]
 
-	movq   xmm2,   [r2]
-	movq   xmm3,   [r2+r3]
-	lea    r2,     [r2+2*r3]
-	movhps xmm2,   [r2]
-	movhps xmm3,   [r2+r3]
-	psadbw xmm0,   xmm2
-	psadbw xmm1,   xmm3
-	paddw  xmm6,   xmm0
-	paddw  xmm6,   xmm1
+    movq   xmm2,   [r2]
+    movq   xmm3,   [r2+r3]
+    lea    r2,     [r2+2*r3]
+    movhps xmm2,   [r2]
+    movhps xmm3,   [r2+r3]
+    psadbw xmm0,   xmm2
+    psadbw xmm1,   xmm3
+    paddw  xmm6,   xmm0
+    paddw  xmm6,   xmm1
 %endmacro
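
The SSE2_GetSad* helpers above are all built on psadbw, which sums |src - ref| over each group of 8 bytes; the macros then add those partial sums row by row. A scalar reference for the full 16x16 case, with a hypothetical helper name:

    #include <stdint.h>

    /* Plain sum of absolute differences over a 16x16 block. */
    static int32_t Sad16x16_c (const uint8_t* pSrc, int32_t iSrcStride,
                               const uint8_t* pRef, int32_t iRefStride) {
        int32_t iSad = 0;
        for (int y = 0; y < 16; y++) {
            for (int x = 0; x < 16; x++) {
                int32_t iDiff = pSrc[x] - pRef[x];
                iSad += iDiff >= 0 ? iDiff : -iDiff;
            }
            pSrc += iSrcStride;
            pRef += iRefStride;
        }
        return iSad;
    }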
 
 ;***********************************************************************
@@ -1565,39 +1565,39 @@
 ;***********************************************************************
 WELS_EXTERN WelsSampleSad16x16_sse2
 %ifdef X86_32
-	push  r4
-	push  r5
+    push  r4
+    push  r5
 %endif
 
-	%assign  push_num 2
-	LOAD_4_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
-	lea r4, [3*r1]
-	lea r5, [3*r3]
+    %assign  push_num 2
+    LOAD_4_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
+    lea r4, [3*r1]
+    lea r5, [3*r3]
 
-	pxor   xmm7,   xmm7
-	SSE2_GetSad4x16
-	lea	   r0,  [r0+4*r1]
-	lea	   r2,  [r2+4*r3]
-	SSE2_GetSad4x16
-	lea	   r0,  [r0+4*r1]
-	lea	   r2,  [r2+4*r3]
-	SSE2_GetSad4x16
-	lea	   r0,  [r0+4*r1]
-	lea	   r2,  [r2+4*r3]
-	SSE2_GetSad4x16
-	movhlps xmm0, xmm7
-	paddw xmm0, xmm7
-	movd retrd, xmm0
-	POP_XMM
-	LOAD_4_PARA_POP
+    pxor   xmm7,   xmm7
+    SSE2_GetSad4x16
+    lea    r0,  [r0+4*r1]
+    lea    r2,  [r2+4*r3]
+    SSE2_GetSad4x16
+    lea    r0,  [r0+4*r1]
+    lea    r2,  [r2+4*r3]
+    SSE2_GetSad4x16
+    lea    r0,  [r0+4*r1]
+    lea    r2,  [r2+4*r3]
+    SSE2_GetSad4x16
+    movhlps xmm0, xmm7
+    paddw xmm0, xmm7
+    movd retrd, xmm0
+    POP_XMM
+    LOAD_4_PARA_POP
 %ifdef X86_32
-	pop  r5
-	pop  r4
+    pop  r5
+    pop  r4
 %endif
-	ret
+    ret
 
 ;***********************************************************************
 ;
@@ -1607,55 +1607,55 @@
 ;
 ;***********************************************************************
 WELS_EXTERN WelsSampleSad16x8_sse2
-	%assign  push_num 0
-	LOAD_4_PARA
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
-	movdqu xmm0,   [r2]
-	MOVDQ  xmm2,   [r0]
-	psadbw xmm0,   xmm2
-	movdqu xmm1,   [r2+r3]
-	MOVDQ  xmm2,   [r0+r1]
-	psadbw xmm1,   xmm2
-	paddw  xmm0,   xmm1
+    %assign  push_num 0
+    LOAD_4_PARA
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
+    movdqu xmm0,   [r2]
+    MOVDQ  xmm2,   [r0]
+    psadbw xmm0,   xmm2
+    movdqu xmm1,   [r2+r3]
+    MOVDQ  xmm2,   [r0+r1]
+    psadbw xmm1,   xmm2
+    paddw  xmm0,   xmm1
 
-	SSE2_GetSad2x16
-	SSE2_GetSad2x16
-	SSE2_GetSad2x16
+    SSE2_GetSad2x16
+    SSE2_GetSad2x16
+    SSE2_GetSad2x16
 
-	movhlps     xmm1, xmm0
-	paddw       xmm0, xmm1
-	movd        retrd,  xmm0
-	LOAD_4_PARA_POP
-	ret
+    movhlps     xmm1, xmm0
+    paddw       xmm0, xmm1
+    movd        retrd,  xmm0
+    LOAD_4_PARA_POP
+    ret
 
 
 
 WELS_EXTERN WelsSampleSad8x16_sse2
-	%assign  push_num 0
-	LOAD_4_PARA
-	PUSH_XMM 7
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
+    %assign  push_num 0
+    LOAD_4_PARA
+    PUSH_XMM 7
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
     pxor   xmm6,   xmm6
 
-	SSE2_GetSad8x4
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
     SSE2_GetSad8x4
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	SSE2_GetSad8x4
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
+    lea    r0,    [r0+2*r1]
+    lea    r2,    [r2+2*r3]
     SSE2_GetSad8x4
+    lea    r0,    [r0+2*r1]
+    lea    r2,    [r2+2*r3]
+    SSE2_GetSad8x4
+    lea    r0,    [r0+2*r1]
+    lea    r2,    [r2+2*r3]
+    SSE2_GetSad8x4
 
     movhlps    xmm0, xmm6
-	paddw      xmm0, xmm6
-	movd       retrd,  xmm0
-	POP_XMM
-	LOAD_4_PARA_POP
-	ret
+    paddw      xmm0, xmm6
+    movd       retrd,  xmm0
+    POP_XMM
+    LOAD_4_PARA_POP
+    ret
 
 
 %macro CACHE_SPLIT_CHECK 3 ; address, width, cacheline
@@ -1664,22 +1664,22 @@
 %endmacro
 
 WELS_EXTERN WelsSampleSad8x8_sse21
-	%assign  push_num 0
-	mov		r2,  arg3
-	push	r2
-	CACHE_SPLIT_CHECK r2, 8, 64
-	jle    near   .pixel_sad_8x8_nsplit
-	pop		r2
+    %assign  push_num 0
+    mov     r2,  arg3
+    push    r2
+    CACHE_SPLIT_CHECK r2, 8, 64
+    jle    near   .pixel_sad_8x8_nsplit
+    pop     r2
 %ifdef X86_32
-	push	r3
-	push	r4
-	push	r5
+    push    r3
+    push    r4
+    push    r5
 %endif
-	%assign  push_num 3
-	PUSH_XMM 8
-	mov		r0,  arg1
-	mov		r1,  arg2
-	SIGN_EXTENSION r1, r1d
+    %assign  push_num 3
+    PUSH_XMM 8
+    mov     r0,  arg1
+    mov     r1,  arg2
+    SIGN_EXTENSION r1, r1d
     pxor   xmm7,   xmm7
 
     ;ecx r2, edx r4, edi r5
@@ -1694,109 +1694,109 @@
     shl    r4,    3
     movd   xmm5,   r5d
     movd   xmm6,   r4d
-	mov    r5,    8
-	add    r5,    r2
+    mov    r5,    8
+    add    r5,    r2
     mov    r3,    arg4
-	SIGN_EXTENSION r3, r3d
+    SIGN_EXTENSION r3, r3d
     movq   xmm0,   [r0]
-	movhps xmm0,   [r0+r1]
+    movhps xmm0,   [r0+r1]
 
-	movq   xmm1,   [r2]
-	movq   xmm2,   [r5]
-	movhps xmm1,   [r2+r3]
-	movhps xmm2,   [r5+r3]
-	psrlq  xmm1,   xmm5
-	psllq  xmm2,   xmm6
-	por    xmm1,   xmm2
+    movq   xmm1,   [r2]
+    movq   xmm2,   [r5]
+    movhps xmm1,   [r2+r3]
+    movhps xmm2,   [r5+r3]
+    psrlq  xmm1,   xmm5
+    psllq  xmm2,   xmm6
+    por    xmm1,   xmm2
 
-	psadbw xmm0,   xmm1
-	paddw  xmm7,   xmm0
+    psadbw xmm0,   xmm1
+    paddw  xmm7,   xmm0
 
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	lea    r5,    [r5+2*r3]
+    lea    r0,    [r0+2*r1]
+    lea    r2,    [r2+2*r3]
+    lea    r5,    [r5+2*r3]
 
     movq   xmm0,   [r0]
-	movhps xmm0,   [r0+r1]
+    movhps xmm0,   [r0+r1]
 
-	movq   xmm1,   [r2]
-	movq   xmm2,   [r5]
-	movhps xmm1,   [r2+r3]
-	movhps xmm2,   [r5+r3]
-	psrlq  xmm1,   xmm5
-	psllq  xmm2,   xmm6
-	por    xmm1,   xmm2
+    movq   xmm1,   [r2]
+    movq   xmm2,   [r5]
+    movhps xmm1,   [r2+r3]
+    movhps xmm2,   [r5+r3]
+    psrlq  xmm1,   xmm5
+    psllq  xmm2,   xmm6
+    por    xmm1,   xmm2
 
-	psadbw xmm0,   xmm1
-	paddw  xmm7,   xmm0
+    psadbw xmm0,   xmm1
+    paddw  xmm7,   xmm0
 
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	lea    r5,    [r5+2*r3]
+    lea    r0,    [r0+2*r1]
+    lea    r2,    [r2+2*r3]
+    lea    r5,    [r5+2*r3]
 
     movq   xmm0,   [r0]
-	movhps xmm0,   [r0+r1]
+    movhps xmm0,   [r0+r1]
 
-	movq   xmm1,   [r2]
-	movq   xmm2,   [r5]
-	movhps xmm1,   [r2+r3]
-	movhps xmm2,   [r5+r3]
-	psrlq  xmm1,   xmm5
-	psllq  xmm2,   xmm6
-	por    xmm1,   xmm2
+    movq   xmm1,   [r2]
+    movq   xmm2,   [r5]
+    movhps xmm1,   [r2+r3]
+    movhps xmm2,   [r5+r3]
+    psrlq  xmm1,   xmm5
+    psllq  xmm2,   xmm6
+    por    xmm1,   xmm2
 
-	psadbw xmm0,   xmm1
-	paddw  xmm7,   xmm0
+    psadbw xmm0,   xmm1
+    paddw  xmm7,   xmm0
 
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	lea    r5,    [r5+2*r3]
+    lea    r0,    [r0+2*r1]
+    lea    r2,    [r2+2*r3]
+    lea    r5,    [r5+2*r3]
 
     movq   xmm0,   [r0]
-	movhps xmm0,   [r0+r1]
+    movhps xmm0,   [r0+r1]
 
-	movq   xmm1,   [r2]
-	movq   xmm2,   [r5]
-	movhps xmm1,   [r2+r3]
-	movhps xmm2,   [r5+r3]
-	psrlq  xmm1,   xmm5
-	psllq  xmm2,   xmm6
-	por    xmm1,   xmm2
+    movq   xmm1,   [r2]
+    movq   xmm2,   [r5]
+    movhps xmm1,   [r2+r3]
+    movhps xmm2,   [r5+r3]
+    psrlq  xmm1,   xmm5
+    psllq  xmm2,   xmm6
+    por    xmm1,   xmm2
 
-	psadbw xmm0,   xmm1
-	paddw  xmm7,   xmm0
+    psadbw xmm0,   xmm1
+    paddw  xmm7,   xmm0
 
     movhlps    xmm0, xmm7
-	paddw      xmm0, xmm7
-	movd       retrd,  xmm0
-	POP_XMM
+    paddw      xmm0, xmm7
+    movd       retrd,  xmm0
+    POP_XMM
 %ifdef X86_32
-	pop	 r5
-	pop	 r4
-	pop	 r3
+    pop  r5
+    pop  r4
+    pop  r3
 %endif
-	jmp        .return
+    jmp        .return
 
 .pixel_sad_8x8_nsplit:
 
-	pop r2
-	%assign  push_num 0
-	LOAD_4_PARA
-	PUSH_XMM 7
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
-	pxor   xmm6,   xmm6
-	SSE2_GetSad8x4
+    pop r2
+    %assign  push_num 0
+    LOAD_4_PARA
+    PUSH_XMM 7
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
+    pxor   xmm6,   xmm6
+    SSE2_GetSad8x4
     lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
+    lea    r2,    [r2+2*r3]
     SSE2_GetSad8x4
     movhlps    xmm0, xmm6
-	paddw      xmm0, xmm6
-	movd       retrd,  xmm0
-	POP_XMM
-	LOAD_4_PARA_POP
+    paddw      xmm0, xmm6
+    movd       retrd,  xmm0
+    POP_XMM
+    LOAD_4_PARA_POP
 .return:
-	ret
+    ret
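WelsSampleSad8x8_sse21 above first runs CACHE_SPLIT_CHECK on the reference pointer: when an unaligned 8-byte access would straddle a 64-byte cache line, it takes the split path, loading two aligned halves and stitching them together with psrlq/psllq/por using shift counts derived from the misalignment. The macro body is not part of this hunk, so the test sketched below is an assumption about its intent, not its literal form:

    #include <stdint.h>

    /* Assumed form of the check: does an iWidth-byte access starting at
       pRef cross a cache line of iCacheLineSize bytes (power of two)? */
    static int CrossesCacheLine(const uint8_t *pRef, int iWidth, int iCacheLineSize) {
        uintptr_t uiOffset = (uintptr_t)pRef & (uintptr_t)(iCacheLineSize - 1);
        return (int)uiOffset + iWidth > iCacheLineSize;
    }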
 
 
 ;***********************************************************************
@@ -1814,624 +1814,624 @@
 
 
 %macro SSE2_Get4LW16Sad 5 ;s-1l, s, s+1l, d, address
-	psadbw %1,   %4
-	paddw  xmm5, %1
-	psadbw %4,   %3
-	paddw  xmm4, %4
-	movdqu %4,   [%5-1]
-	psadbw %4,   %2
-	paddw  xmm6, %4
-	movdqu %4,   [%5+1]
-	psadbw %4,   %2
-	paddw  xmm7, %4
+    psadbw %1,   %4
+    paddw  xmm5, %1
+    psadbw %4,   %3
+    paddw  xmm4, %4
+    movdqu %4,   [%5-1]
+    psadbw %4,   %2
+    paddw  xmm6, %4
+    movdqu %4,   [%5+1]
+    psadbw %4,   %2
+    paddw  xmm7, %4
 %endmacro
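SSE2_Get4LW16Sad and the WelsSampleSadFour* routines that follow compute four SADs in one pass: the source block against the reference shifted up one line, down one line, left one pixel and right one pixel, with the four totals packed and stored through r4 at the end. A scalar sketch of the idea (result ordering and names are assumptions):

    #include <stdint.h>
    #include <stdlib.h>

    /* Sketch: four SADs around pRef (up/down/left/right by one sample). */
    static void SadFourWxH_ref(const uint8_t *pSrc, int32_t iSrcStride,
                               const uint8_t *pRef, int32_t iRefStride,
                               int32_t iWidth, int32_t iHeight, int32_t *pSad) {
        const uint8_t *pNeighbors[4] = {
            pRef - iRefStride, pRef + iRefStride, pRef - 1, pRef + 1
        };
        for (int k = 0; k < 4; k++) {
            int32_t iSad = 0;
            const uint8_t *p0 = pSrc, *p1 = pNeighbors[k];
            for (int y = 0; y < iHeight; y++) {
                for (int x = 0; x < iWidth; x++)
                    iSad += abs(p0[x] - p1[x]);
                p0 += iSrcStride;
                p1 += iRefStride;
            }
            pSad[k] = iSad;
        }
    }
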
 WELS_EXTERN WelsSampleSadFour16x16_sse2
-	%assign  push_num 0
-	LOAD_5_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
-	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
-	pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
-	pxor   xmm6,   xmm6    ;sad pRefMb-1
-	pxor   xmm7,   xmm7    ;sad pRefMb+1
-	movdqa xmm0,   [r0]
-	sub    r2,    r3
-	movdqu xmm3,   [r2]
-	psadbw xmm3,   xmm0
-	paddw  xmm4,   xmm3
+    %assign  push_num 0
+    LOAD_5_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
+    pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
+    pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
+    pxor   xmm6,   xmm6    ;sad pRefMb-1
+    pxor   xmm7,   xmm7    ;sad pRefMb+1
+    movdqa xmm0,   [r0]
+    sub    r2,    r3
+    movdqu xmm3,   [r2]
+    psadbw xmm3,   xmm0
+    paddw  xmm4,   xmm3
 
-	movdqa xmm1,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	psadbw xmm3,   xmm1
-	paddw  xmm4,   xmm3
+    movdqa xmm1,   [r0+r1]
+    movdqu xmm3,   [r2+r3]
+    psadbw xmm3,   xmm1
+    paddw  xmm4,   xmm3
 
-	movdqu xmm2,   [r2+r3-1]
-	psadbw xmm2,   xmm0
-	paddw  xmm6,   xmm2
+    movdqu xmm2,   [r2+r3-1]
+    psadbw xmm2,   xmm0
+    paddw  xmm6,   xmm2
 
-	movdqu xmm3,   [r2+r3+1]
-	psadbw xmm3,   xmm0
-	paddw  xmm7,   xmm3
+    movdqu xmm3,   [r2+r3+1]
+    psadbw xmm3,   xmm0
+    paddw  xmm7,   xmm3
 
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqa xmm2,   [r0]
-	movdqu xmm3,   [r2]
-	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
-	movdqa xmm0,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqa xmm1,   [r0]
-	movdqu xmm3,   [r2]
-	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
-	movdqa xmm2,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqa xmm0,   [r0]
-	movdqu xmm3,   [r2]
-	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
-	movdqa xmm1,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqa xmm2,   [r0]
-	movdqu xmm3,   [r2]
-	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
-	movdqa xmm0,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqa xmm1,   [r0]
-	movdqu xmm3,   [r2]
-	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
-	movdqa xmm2,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqa xmm0,   [r0]
-	movdqu xmm3,   [r2]
-	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
-	movdqa xmm1,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqa xmm2,   [r0]
-	movdqu xmm3,   [r2]
-	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
-	movdqa xmm0,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
-	lea    r2,    [r2+2*r3]
-	movdqu xmm3,   [r2]
-	psadbw xmm2,   xmm3
-	paddw xmm5,   xmm2
+    lea    r0,    [r0+2*r1]
+    lea    r2,    [r2+2*r3]
+    movdqa xmm2,   [r0]
+    movdqu xmm3,   [r2]
+    SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
+    movdqa xmm0,   [r0+r1]
+    movdqu xmm3,   [r2+r3]
+    SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
+    lea    r0,    [r0+2*r1]
+    lea    r2,    [r2+2*r3]
+    movdqa xmm1,   [r0]
+    movdqu xmm3,   [r2]
+    SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
+    movdqa xmm2,   [r0+r1]
+    movdqu xmm3,   [r2+r3]
+    SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
+    lea    r0,    [r0+2*r1]
+    lea    r2,    [r2+2*r3]
+    movdqa xmm0,   [r0]
+    movdqu xmm3,   [r2]
+    SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
+    movdqa xmm1,   [r0+r1]
+    movdqu xmm3,   [r2+r3]
+    SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
+    lea    r0,    [r0+2*r1]
+    lea    r2,    [r2+2*r3]
+    movdqa xmm2,   [r0]
+    movdqu xmm3,   [r2]
+    SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
+    movdqa xmm0,   [r0+r1]
+    movdqu xmm3,   [r2+r3]
+    SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
+    lea    r0,    [r0+2*r1]
+    lea    r2,    [r2+2*r3]
+    movdqa xmm1,   [r0]
+    movdqu xmm3,   [r2]
+    SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
+    movdqa xmm2,   [r0+r1]
+    movdqu xmm3,   [r2+r3]
+    SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
+    lea    r0,    [r0+2*r1]
+    lea    r2,    [r2+2*r3]
+    movdqa xmm0,   [r0]
+    movdqu xmm3,   [r2]
+    SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
+    movdqa xmm1,   [r0+r1]
+    movdqu xmm3,   [r2+r3]
+    SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
+    lea    r0,    [r0+2*r1]
+    lea    r2,    [r2+2*r3]
+    movdqa xmm2,   [r0]
+    movdqu xmm3,   [r2]
+    SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
+    movdqa xmm0,   [r0+r1]
+    movdqu xmm3,   [r2+r3]
+    SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
+    lea    r2,    [r2+2*r3]
+    movdqu xmm3,   [r2]
+    psadbw xmm2,   xmm3
+    paddw xmm5,   xmm2
 
-	movdqu xmm2,   [r2-1]
-	psadbw xmm2,   xmm0
-	paddw xmm6,   xmm2
+    movdqu xmm2,   [r2-1]
+    psadbw xmm2,   xmm0
+    paddw xmm6,   xmm2
 
-	movdqu xmm3,   [r2+1]
-	psadbw xmm3,   xmm0
-	paddw xmm7,   xmm3
+    movdqu xmm3,   [r2+1]
+    psadbw xmm3,   xmm0
+    paddw xmm7,   xmm3
 
-	movdqu xmm3,   [r2+r3]
-	psadbw xmm0,   xmm3
-	paddw xmm5,   xmm0
+    movdqu xmm3,   [r2+r3]
+    psadbw xmm0,   xmm3
+    paddw xmm5,   xmm0
 
-	movhlps    xmm0, xmm4
-	paddw      xmm4, xmm0
-	movhlps    xmm0, xmm5
-	paddw      xmm5, xmm0
-	movhlps    xmm0, xmm6
-	paddw      xmm6, xmm0
-	movhlps    xmm0, xmm7
-	paddw      xmm7, xmm0
-	punpckldq  xmm4, xmm5
-	punpckldq  xmm6, xmm7
-	punpcklqdq xmm4, xmm6
-	movdqa     [r4],xmm4
-	POP_XMM
-	LOAD_5_PARA_POP
-	ret
+    movhlps    xmm0, xmm4
+    paddw      xmm4, xmm0
+    movhlps    xmm0, xmm5
+    paddw      xmm5, xmm0
+    movhlps    xmm0, xmm6
+    paddw      xmm6, xmm0
+    movhlps    xmm0, xmm7
+    paddw      xmm7, xmm0
+    punpckldq  xmm4, xmm5
+    punpckldq  xmm6, xmm7
+    punpcklqdq xmm4, xmm6
+    movdqa     [r4],xmm4
+    POP_XMM
+    LOAD_5_PARA_POP
+    ret
 
 
 WELS_EXTERN WelsSampleSadFour16x8_sse2
-	%assign  push_num 0
-	LOAD_5_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
-	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
-	pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
-	pxor   xmm6,   xmm6    ;sad pRefMb-1
-	pxor   xmm7,   xmm7    ;sad pRefMb+1
-	movdqa xmm0,   [r0]
-	sub    r2,    r3
-	movdqu xmm3,   [r2]
-	psadbw xmm3,   xmm0
-	paddw xmm4,   xmm3
+    %assign  push_num 0
+    LOAD_5_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
+    pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
+    pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
+    pxor   xmm6,   xmm6    ;sad pRefMb-1
+    pxor   xmm7,   xmm7    ;sad pRefMb+1
+    movdqa xmm0,   [r0]
+    sub    r2,    r3
+    movdqu xmm3,   [r2]
+    psadbw xmm3,   xmm0
+    paddw xmm4,   xmm3
 
-	movdqa xmm1,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	psadbw xmm3,   xmm1
-	paddw xmm4,   xmm3
+    movdqa xmm1,   [r0+r1]
+    movdqu xmm3,   [r2+r3]
+    psadbw xmm3,   xmm1
+    paddw xmm4,   xmm3
 
-	movdqu xmm2,   [r2+r3-1]
-	psadbw xmm2,   xmm0
-	paddw xmm6,   xmm2
+    movdqu xmm2,   [r2+r3-1]
+    psadbw xmm2,   xmm0
+    paddw xmm6,   xmm2
 
-	movdqu xmm3,   [r2+r3+1]
-	psadbw xmm3,   xmm0
-	paddw xmm7,   xmm3
+    movdqu xmm3,   [r2+r3+1]
+    psadbw xmm3,   xmm0
+    paddw xmm7,   xmm3
 
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqa xmm2,   [r0]
-	movdqu xmm3,   [r2]
-	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
-	movdqa xmm0,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqa xmm1,   [r0]
-	movdqu xmm3,   [r2]
-	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
-	movdqa xmm2,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqa xmm0,   [r0]
-	movdqu xmm3,   [r2]
-	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
-	movdqa xmm1,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
-	lea    r2,    [r2+2*r3]
-	movdqu xmm3,   [r2]
-	psadbw xmm0,   xmm3
-	paddw xmm5,   xmm0
+    lea    r0,    [r0+2*r1]
+    lea    r2,    [r2+2*r3]
+    movdqa xmm2,   [r0]
+    movdqu xmm3,   [r2]
+    SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
+    movdqa xmm0,   [r0+r1]
+    movdqu xmm3,   [r2+r3]
+    SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
+    lea    r0,    [r0+2*r1]
+    lea    r2,    [r2+2*r3]
+    movdqa xmm1,   [r0]
+    movdqu xmm3,   [r2]
+    SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
+    movdqa xmm2,   [r0+r1]
+    movdqu xmm3,   [r2+r3]
+    SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
+    lea    r0,    [r0+2*r1]
+    lea    r2,    [r2+2*r3]
+    movdqa xmm0,   [r0]
+    movdqu xmm3,   [r2]
+    SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
+    movdqa xmm1,   [r0+r1]
+    movdqu xmm3,   [r2+r3]
+    SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
+    lea    r2,    [r2+2*r3]
+    movdqu xmm3,   [r2]
+    psadbw xmm0,   xmm3
+    paddw xmm5,   xmm0
 
-	movdqu xmm0,   [r2-1]
-	psadbw xmm0,   xmm1
-	paddw xmm6,   xmm0
+    movdqu xmm0,   [r2-1]
+    psadbw xmm0,   xmm1
+    paddw xmm6,   xmm0
 
-	movdqu xmm3,   [r2+1]
-	psadbw xmm3,   xmm1
-	paddw xmm7,   xmm3
+    movdqu xmm3,   [r2+1]
+    psadbw xmm3,   xmm1
+    paddw xmm7,   xmm3
 
-	movdqu xmm3,   [r2+r3]
-	psadbw xmm1,   xmm3
-	paddw xmm5,   xmm1
+    movdqu xmm3,   [r2+r3]
+    psadbw xmm1,   xmm3
+    paddw xmm5,   xmm1
 
-	movhlps    xmm0, xmm4
-	paddw      xmm4, xmm0
-	movhlps    xmm0, xmm5
-	paddw      xmm5, xmm0
-	movhlps    xmm0, xmm6
-	paddw      xmm6, xmm0
-	movhlps    xmm0, xmm7
-	paddw      xmm7, xmm0
-	punpckldq  xmm4, xmm5
-	punpckldq  xmm6, xmm7
-	punpcklqdq xmm4, xmm6
-	movdqa     [r4],xmm4
-	POP_XMM
-	LOAD_5_PARA_POP
-	ret
+    movhlps    xmm0, xmm4
+    paddw      xmm4, xmm0
+    movhlps    xmm0, xmm5
+    paddw      xmm5, xmm0
+    movhlps    xmm0, xmm6
+    paddw      xmm6, xmm0
+    movhlps    xmm0, xmm7
+    paddw      xmm7, xmm0
+    punpckldq  xmm4, xmm5
+    punpckldq  xmm6, xmm7
+    punpcklqdq xmm4, xmm6
+    movdqa     [r4],xmm4
+    POP_XMM
+    LOAD_5_PARA_POP
+    ret
 
 WELS_EXTERN WelsSampleSadFour8x16_sse2
-	%assign  push_num 0
-	LOAD_5_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
-	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
-	pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
-	pxor   xmm6,   xmm6    ;sad pRefMb-1
-	pxor   xmm7,   xmm7    ;sad pRefMb+1
-	movq   xmm0,   [r0]
-	movhps xmm0,   [r0+r1]
-	sub    r2,    r3
-	movq   xmm3,   [r2]
-	movhps xmm3,   [r2+r3]
-	psadbw xmm3,   xmm0
-	paddw  xmm4,   xmm3
+    %assign  push_num 0
+    LOAD_5_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
+    pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
+    pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
+    pxor   xmm6,   xmm6    ;sad pRefMb-1
+    pxor   xmm7,   xmm7    ;sad pRefMb+1
+    movq   xmm0,   [r0]
+    movhps xmm0,   [r0+r1]
+    sub    r2,    r3
+    movq   xmm3,   [r2]
+    movhps xmm3,   [r2+r3]
+    psadbw xmm3,   xmm0
+    paddw  xmm4,   xmm3
 
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
+    movq   xmm1,  [r2+r3-1]
+    movq   xmm3,  [r2+r3+1]
 
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
+    lea    r0,    [r0+2*r1]
+    lea    r2,    [r2+2*r3]
+    movhps xmm1,  [r2-1]
+    movhps xmm3,  [r2+1]
+    psadbw xmm1,  xmm0
+    paddw  xmm6,  xmm1
+    psadbw xmm3,  xmm0
+    paddw  xmm7,  xmm3
 
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
+    movq   xmm3,  [r2]
+    movhps xmm3,  [r2+r3]
+    psadbw xmm0,  xmm3
+    paddw  xmm5,  xmm0
 
-	movq   xmm0,  [r0]
-	movhps xmm0,  [r0+r1]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
+    movq   xmm0,  [r0]
+    movhps xmm0,  [r0+r1]
+    psadbw xmm3,  xmm0
+    paddw  xmm4,  xmm3
 
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
+    movq   xmm1,  [r2+r3-1]
+    movq   xmm3,  [r2+r3+1]
 
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
+    lea    r0,    [r0+2*r1]
+    lea    r2,    [r2+2*r3]
+    movhps xmm1,  [r2-1]
+    movhps xmm3,  [r2+1]
 
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
+    psadbw xmm1,  xmm0
+    paddw  xmm6,  xmm1
+    psadbw xmm3,  xmm0
+    paddw  xmm7,  xmm3
 
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
+    movq   xmm3,  [r2]
+    movhps xmm3,  [r2+r3]
+    psadbw xmm0,  xmm3
+    paddw  xmm5,  xmm0
 
-	movq   xmm0,  [r0]
-	movhps xmm0,  [r0+r1]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
+    movq   xmm0,  [r0]
+    movhps xmm0,  [r0+r1]
+    psadbw xmm3,  xmm0
+    paddw  xmm4,  xmm3
 
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
+    movq   xmm1,  [r2+r3-1]
+    movq   xmm3,  [r2+r3+1]
 
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
+    lea    r0,    [r0+2*r1]
+    lea    r2,    [r2+2*r3]
+    movhps xmm1,  [r2-1]
+    movhps xmm3,  [r2+1]
 
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
+    psadbw xmm1,  xmm0
+    paddw  xmm6,  xmm1
+    psadbw xmm3,  xmm0
+    paddw  xmm7,  xmm3
 
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
+    movq   xmm3,  [r2]
+    movhps xmm3,  [r2+r3]
+    psadbw xmm0,  xmm3
+    paddw  xmm5,  xmm0
 
-	movq   xmm0,  [r0]
-	movhps xmm0,  [r0+r1]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
+    movq   xmm0,  [r0]
+    movhps xmm0,  [r0+r1]
+    psadbw xmm3,  xmm0
+    paddw  xmm4,  xmm3
 
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
+    movq   xmm1,  [r2+r3-1]
+    movq   xmm3,  [r2+r3+1]
 
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
+    lea    r0,    [r0+2*r1]
+    lea    r2,    [r2+2*r3]
+    movhps xmm1,  [r2-1]
+    movhps xmm3,  [r2+1]
 
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
+    psadbw xmm1,  xmm0
+    paddw  xmm6,  xmm1
+    psadbw xmm3,  xmm0
+    paddw  xmm7,  xmm3
 
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
+    movq   xmm3,  [r2]
+    movhps xmm3,  [r2+r3]
+    psadbw xmm0,  xmm3
+    paddw  xmm5,  xmm0
 
-	movq   xmm0,  [r0]
-	movhps xmm0,  [r0+r1]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
+    movq   xmm0,  [r0]
+    movhps xmm0,  [r0+r1]
+    psadbw xmm3,  xmm0
+    paddw  xmm4,  xmm3
 
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
+    movq   xmm1,  [r2+r3-1]
+    movq   xmm3,  [r2+r3+1]
 
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
+    lea    r0,    [r0+2*r1]
+    lea    r2,    [r2+2*r3]
+    movhps xmm1,  [r2-1]
+    movhps xmm3,  [r2+1]
 
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
+    psadbw xmm1,  xmm0
+    paddw  xmm6,  xmm1
+    psadbw xmm3,  xmm0
+    paddw  xmm7,  xmm3
 
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
+    movq   xmm3,  [r2]
+    movhps xmm3,  [r2+r3]
+    psadbw xmm0,  xmm3
+    paddw  xmm5,  xmm0
 
-	movq   xmm0,  [r0]
-	movhps xmm0,  [r0+r1]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
+    movq   xmm0,  [r0]
+    movhps xmm0,  [r0+r1]
+    psadbw xmm3,  xmm0
+    paddw  xmm4,  xmm3
 
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
+    movq   xmm1,  [r2+r3-1]
+    movq   xmm3,  [r2+r3+1]
 
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
+    lea    r0,    [r0+2*r1]
+    lea    r2,    [r2+2*r3]
+    movhps xmm1,  [r2-1]
+    movhps xmm3,  [r2+1]
 
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
+    psadbw xmm1,  xmm0
+    paddw  xmm6,  xmm1
+    psadbw xmm3,  xmm0
+    paddw  xmm7,  xmm3
 
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
+    movq   xmm3,  [r2]
+    movhps xmm3,  [r2+r3]
+    psadbw xmm0,  xmm3
+    paddw  xmm5,  xmm0
 
-	movq   xmm0,  [r0]
-	movhps xmm0,  [r0+r1]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
+    movq   xmm0,  [r0]
+    movhps xmm0,  [r0+r1]
+    psadbw xmm3,  xmm0
+    paddw  xmm4,  xmm3
 
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
+    movq   xmm1,  [r2+r3-1]
+    movq   xmm3,  [r2+r3+1]
 
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
+    lea    r0,    [r0+2*r1]
+    lea    r2,    [r2+2*r3]
+    movhps xmm1,  [r2-1]
+    movhps xmm3,  [r2+1]
 
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
+    psadbw xmm1,  xmm0
+    paddw  xmm6,  xmm1
+    psadbw xmm3,  xmm0
+    paddw  xmm7,  xmm3
 
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
+    movq   xmm3,  [r2]
+    movhps xmm3,  [r2+r3]
+    psadbw xmm0,  xmm3
+    paddw  xmm5,  xmm0
 
-	movq   xmm0,  [r0]
-	movhps xmm0,  [r0+r1]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
+    movq   xmm0,  [r0]
+    movhps xmm0,  [r0+r1]
+    psadbw xmm3,  xmm0
+    paddw  xmm4,  xmm3
 
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
+    movq   xmm1,  [r2+r3-1]
+    movq   xmm3,  [r2+r3+1]
 
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
+    lea    r0,    [r0+2*r1]
+    lea    r2,    [r2+2*r3]
+    movhps xmm1,  [r2-1]
+    movhps xmm3,  [r2+1]
 
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
+    psadbw xmm1,  xmm0
+    paddw  xmm6,  xmm1
+    psadbw xmm3,  xmm0
+    paddw  xmm7,  xmm3
 
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
+    movq   xmm3,  [r2]
+    movhps xmm3,  [r2+r3]
+    psadbw xmm0,  xmm3
+    paddw  xmm5,  xmm0
 
-	movhlps    xmm0, xmm4
-	paddw      xmm4, xmm0
-	movhlps    xmm0, xmm5
-	paddw      xmm5, xmm0
-	movhlps    xmm0, xmm6
-	paddw      xmm6, xmm0
-	movhlps    xmm0, xmm7
-	paddw      xmm7, xmm0
-	punpckldq  xmm4, xmm5
-	punpckldq  xmm6, xmm7
-	punpcklqdq xmm4, xmm6
-	movdqa     [r4],xmm4
-	POP_XMM
-	LOAD_5_PARA_POP
-	ret
+    movhlps    xmm0, xmm4
+    paddw      xmm4, xmm0
+    movhlps    xmm0, xmm5
+    paddw      xmm5, xmm0
+    movhlps    xmm0, xmm6
+    paddw      xmm6, xmm0
+    movhlps    xmm0, xmm7
+    paddw      xmm7, xmm0
+    punpckldq  xmm4, xmm5
+    punpckldq  xmm6, xmm7
+    punpcklqdq xmm4, xmm6
+    movdqa     [r4],xmm4
+    POP_XMM
+    LOAD_5_PARA_POP
+    ret
 
 
 WELS_EXTERN WelsSampleSadFour8x8_sse2
-	%assign  push_num 0
-	LOAD_5_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
-	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
-	pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
-	pxor   xmm6,   xmm6    ;sad pRefMb-1
-	pxor   xmm7,   xmm7    ;sad pRefMb+1
-	movq   xmm0,   [r0]
-	movhps xmm0,   [r0+r1]
-	sub    r2,    r3
-	movq   xmm3,   [r2]
-	movhps xmm3,   [r2+r3]
-	psadbw xmm3,   xmm0
-	paddw  xmm4,   xmm3
+    %assign  push_num 0
+    LOAD_5_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
+    pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
+    pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
+    pxor   xmm6,   xmm6    ;sad pRefMb-1
+    pxor   xmm7,   xmm7    ;sad pRefMb+1
+    movq   xmm0,   [r0]
+    movhps xmm0,   [r0+r1]
+    sub    r2,    r3
+    movq   xmm3,   [r2]
+    movhps xmm3,   [r2+r3]
+    psadbw xmm3,   xmm0
+    paddw  xmm4,   xmm3
 
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
+    movq   xmm1,  [r2+r3-1]
+    movq   xmm3,  [r2+r3+1]
 
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
+    lea    r0,    [r0+2*r1]
+    lea    r2,    [r2+2*r3]
+    movhps xmm1,  [r2-1]
+    movhps xmm3,  [r2+1]
+    psadbw xmm1,  xmm0
+    paddw  xmm6,  xmm1
+    psadbw xmm3,  xmm0
+    paddw  xmm7,  xmm3
 
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
+    movq   xmm3,  [r2]
+    movhps xmm3,  [r2+r3]
+    psadbw xmm0,  xmm3
+    paddw  xmm5,  xmm0
 
-	movq   xmm0,  [r0]
-	movhps xmm0,  [r0+r1]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
+    movq   xmm0,  [r0]
+    movhps xmm0,  [r0+r1]
+    psadbw xmm3,  xmm0
+    paddw  xmm4,  xmm3
 
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
+    movq   xmm1,  [r2+r3-1]
+    movq   xmm3,  [r2+r3+1]
 
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
+    lea    r0,    [r0+2*r1]
+    lea    r2,    [r2+2*r3]
+    movhps xmm1,  [r2-1]
+    movhps xmm3,  [r2+1]
 
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
+    psadbw xmm1,  xmm0
+    paddw  xmm6,  xmm1
+    psadbw xmm3,  xmm0
+    paddw  xmm7,  xmm3
 
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
+    movq   xmm3,  [r2]
+    movhps xmm3,  [r2+r3]
+    psadbw xmm0,  xmm3
+    paddw  xmm5,  xmm0
 
-	movq   xmm0,  [r0]
-	movhps xmm0,  [r0+r1]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
+    movq   xmm0,  [r0]
+    movhps xmm0,  [r0+r1]
+    psadbw xmm3,  xmm0
+    paddw  xmm4,  xmm3
 
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
+    movq   xmm1,  [r2+r3-1]
+    movq   xmm3,  [r2+r3+1]
 
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
+    lea    r0,    [r0+2*r1]
+    lea    r2,    [r2+2*r3]
+    movhps xmm1,  [r2-1]
+    movhps xmm3,  [r2+1]
 
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
+    psadbw xmm1,  xmm0
+    paddw  xmm6,  xmm1
+    psadbw xmm3,  xmm0
+    paddw  xmm7,  xmm3
 
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
+    movq   xmm3,  [r2]
+    movhps xmm3,  [r2+r3]
+    psadbw xmm0,  xmm3
+    paddw  xmm5,  xmm0
 
-	movq   xmm0,  [r0]
-	movhps xmm0,  [r0+r1]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
+    movq   xmm0,  [r0]
+    movhps xmm0,  [r0+r1]
+    psadbw xmm3,  xmm0
+    paddw  xmm4,  xmm3
 
 
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
+    movq   xmm1,  [r2+r3-1]
+    movq   xmm3,  [r2+r3+1]
 
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
+    lea    r0,    [r0+2*r1]
+    lea    r2,    [r2+2*r3]
+    movhps xmm1,  [r2-1]
+    movhps xmm3,  [r2+1]
 
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
+    psadbw xmm1,  xmm0
+    paddw  xmm6,  xmm1
+    psadbw xmm3,  xmm0
+    paddw  xmm7,  xmm3
 
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
+    movq   xmm3,  [r2]
+    movhps xmm3,  [r2+r3]
+    psadbw xmm0,  xmm3
+    paddw  xmm5,  xmm0
 
-	movhlps    xmm0, xmm4
-	paddw      xmm4, xmm0
-	movhlps    xmm0, xmm5
-	paddw      xmm5, xmm0
-	movhlps    xmm0, xmm6
-	paddw      xmm6, xmm0
-	movhlps    xmm0, xmm7
-	paddw      xmm7, xmm0
-	punpckldq  xmm4, xmm5
-	punpckldq  xmm6, xmm7
-	punpcklqdq xmm4, xmm6
-	movdqa     [r4],xmm4
-	POP_XMM
-	LOAD_5_PARA_POP
-	ret
+    movhlps    xmm0, xmm4
+    paddw      xmm4, xmm0
+    movhlps    xmm0, xmm5
+    paddw      xmm5, xmm0
+    movhlps    xmm0, xmm6
+    paddw      xmm6, xmm0
+    movhlps    xmm0, xmm7
+    paddw      xmm7, xmm0
+    punpckldq  xmm4, xmm5
+    punpckldq  xmm6, xmm7
+    punpcklqdq xmm4, xmm6
+    movdqa     [r4],xmm4
+    POP_XMM
+    LOAD_5_PARA_POP
+    ret
 
 WELS_EXTERN WelsSampleSadFour4x4_sse2
-	%assign  push_num 0
-	LOAD_5_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
-	movd   xmm0,   [r0]
-	movd   xmm1,   [r0+r1]
-	lea        r0,    [r0+2*r1]
-	movd       xmm2,   [r0]
-	movd       xmm3,   [r0+r1]
-	punpckldq  xmm0, xmm1
-	punpckldq  xmm2, xmm3
-	punpcklqdq xmm0, xmm2
-	sub        r2,  r3
-	movd       xmm1, [r2]
-	movd       xmm2, [r2+r3]
-	punpckldq  xmm1, xmm2
-	movd       xmm2, [r2+r3-1]
-	movd       xmm3, [r2+r3+1]
+    %assign  push_num 0
+    LOAD_5_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
+    movd   xmm0,   [r0]
+    movd   xmm1,   [r0+r1]
+    lea        r0,    [r0+2*r1]
+    movd       xmm2,   [r0]
+    movd       xmm3,   [r0+r1]
+    punpckldq  xmm0, xmm1
+    punpckldq  xmm2, xmm3
+    punpcklqdq xmm0, xmm2
+    sub        r2,  r3
+    movd       xmm1, [r2]
+    movd       xmm2, [r2+r3]
+    punpckldq  xmm1, xmm2
+    movd       xmm2, [r2+r3-1]
+    movd       xmm3, [r2+r3+1]
 
-	lea        r2,  [r2+2*r3]
+    lea        r2,  [r2+2*r3]
 
-	movd       xmm4, [r2]
-	movd       xmm5, [r2-1]
-	punpckldq  xmm2, xmm5
-	movd       xmm5, [r2+1]
-	punpckldq  xmm3, xmm5
+    movd       xmm4, [r2]
+    movd       xmm5, [r2-1]
+    punpckldq  xmm2, xmm5
+    movd       xmm5, [r2+1]
+    punpckldq  xmm3, xmm5
 
-	movd       xmm5, [r2+r3]
-	punpckldq  xmm4, xmm5
+    movd       xmm5, [r2+r3]
+    punpckldq  xmm4, xmm5
 
-	punpcklqdq xmm1, xmm4 ;-L
+    punpcklqdq xmm1, xmm4 ;-L
 
-	movd       xmm5, [r2+r3-1]
-	movd       xmm6, [r2+r3+1]
+    movd       xmm5, [r2+r3-1]
+    movd       xmm6, [r2+r3+1]
 
-	lea        r2,  [r2+2*r3]
-	movd       xmm7, [r2-1]
-	punpckldq  xmm5, xmm7
-	punpcklqdq xmm2, xmm5 ;-1
-	movd       xmm7, [r2+1]
-	punpckldq  xmm6, xmm7
-	punpcklqdq xmm3, xmm6 ;+1
-	movd       xmm6, [r2]
-	movd       xmm7, [r2+r3]
-	punpckldq  xmm6, xmm7
-	punpcklqdq xmm4, xmm6 ;+L
-	psadbw     xmm1, xmm0
-	psadbw     xmm2, xmm0
-	psadbw     xmm3, xmm0
-	psadbw     xmm4, xmm0
+    lea        r2,  [r2+2*r3]
+    movd       xmm7, [r2-1]
+    punpckldq  xmm5, xmm7
+    punpcklqdq xmm2, xmm5 ;-1
+    movd       xmm7, [r2+1]
+    punpckldq  xmm6, xmm7
+    punpcklqdq xmm3, xmm6 ;+1
+    movd       xmm6, [r2]
+    movd       xmm7, [r2+r3]
+    punpckldq  xmm6, xmm7
+    punpcklqdq xmm4, xmm6 ;+L
+    psadbw     xmm1, xmm0
+    psadbw     xmm2, xmm0
+    psadbw     xmm3, xmm0
+    psadbw     xmm4, xmm0
 
-	movhlps    xmm0, xmm1
-	paddw      xmm1, xmm0
-	movhlps    xmm0, xmm2
-	paddw      xmm2, xmm0
-	movhlps    xmm0, xmm3
-	paddw      xmm3, xmm0
-	movhlps    xmm0, xmm4
-	paddw      xmm4, xmm0
-	punpckldq  xmm1, xmm4
-	punpckldq  xmm2, xmm3
-	punpcklqdq xmm1, xmm2
-	movdqa     [r4],xmm1
-	POP_XMM
-	LOAD_5_PARA_POP
-	ret
+    movhlps    xmm0, xmm1
+    paddw      xmm1, xmm0
+    movhlps    xmm0, xmm2
+    paddw      xmm2, xmm0
+    movhlps    xmm0, xmm3
+    paddw      xmm3, xmm0
+    movhlps    xmm0, xmm4
+    paddw      xmm4, xmm0
+    punpckldq  xmm1, xmm4
+    punpckldq  xmm2, xmm3
+    punpcklqdq xmm1, xmm2
+    movdqa     [r4],xmm1
+    POP_XMM
+    LOAD_5_PARA_POP
+    ret
 
 ;***********************************************************************
 ;
@@ -2444,33 +2444,33 @@
 ;***********************************************************************
 WELS_EXTERN WelsSampleSad4x4_mmx
     %assign  push_num 0
-	LOAD_4_PARA
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
-	movd	  mm0, [r0]
-	movd	  mm1, [r0+r1]
-	punpckldq mm0, mm1
+    LOAD_4_PARA
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
+    movd      mm0, [r0]
+    movd      mm1, [r0+r1]
+    punpckldq mm0, mm1
 
-	movd      mm3, [r2]
-	movd      mm4, [r2+r3]
-	punpckldq mm3, mm4
-	psadbw    mm0, mm3
+    movd      mm3, [r2]
+    movd      mm4, [r2+r3]
+    punpckldq mm3, mm4
+    psadbw    mm0, mm3
 
-	lea       r0, [r0+2*r1]
-	lea       r2, [r2+2*r3]
+    lea       r0, [r0+2*r1]
+    lea       r2, [r2+2*r3]
 
-	movd      mm1, [r0]
-	movd      mm2, [r0+r1]
-	punpckldq mm1, mm2
+    movd      mm1, [r0]
+    movd      mm2, [r0+r1]
+    punpckldq mm1, mm2
 
-	movd      mm3, [r2]
-	movd      mm4, [r2+r3]
-	punpckldq mm3, mm4
-	psadbw    mm1, mm3
-	paddw     mm0, mm1
+    movd      mm3, [r2]
+    movd      mm4, [r2+r3]
+    punpckldq mm3, mm4
+    psadbw    mm1, mm3
+    paddw     mm0, mm1
 
     movd      retrd, mm0
 
-	WELSEMMS
+    WELSEMMS
     LOAD_4_PARA_POP
     ret
--- a/codec/common/x86/vaa.asm
+++ b/codec/common/x86/vaa.asm
@@ -29,16 +29,16 @@
 ;*     POSSIBILITY OF SUCH DAMAGE.
 ;*
 ;*
-;*	vaa.asm
+;*  vaa.asm
 ;*
-;*	Abstract
+;*  Abstract
 ;*      sse2 for pVaa routines
 ;*
 ;*  History
-;*      04/14/2010	Created
-;*		06/07/2010	Added AnalysisVaaInfoIntra_sse2(ssse3)
-;*		06/10/2010	Tune rc_sad_frame_sse2 and got about 40% improvement
-;*		08/11/2010	Added abs_difference_mbrow_sse2 & sum_sqrsum_mbrow_sse2
+;*      04/14/2010  Created
+;*      06/07/2010  Added AnalysisVaaInfoIntra_sse2(ssse3)
+;*      06/10/2010  Tune rc_sad_frame_sse2 and got about 40% improvement
+;*      08/11/2010  Added abs_difference_mbrow_sse2 & sum_sqrsum_mbrow_sse2
 ;*
 ;*************************************************************************/
 %include "asm_inc.asm"
@@ -49,87 +49,87 @@
 ;***********************************************************************
 
 ; by comparing it outperforms than phaddw(SSSE3) sets
-%macro SUM_WORD_8x2_SSE2	2	; dst(pSrc), tmp
-	; @sum_8x2 begin
-	pshufd %2, %1, 04Eh	; 01001110 B
-	paddw %1, %2
-	pshuflw %2, %1, 04Eh	; 01001110 B
-	paddw %1, %2
-	pshuflw %2, %1, 0B1h	; 10110001 B
-	paddw %1, %2
-	; end of @sum_8x2
-%endmacro	; END of SUM_WORD_8x2_SSE2
+%macro SUM_WORD_8x2_SSE2    2   ; dst(pSrc), tmp
+    ; @sum_8x2 begin
+    pshufd %2, %1, 04Eh ; 01001110 B
+    paddw %1, %2
+    pshuflw %2, %1, 04Eh    ; 01001110 B
+    paddw %1, %2
+    pshuflw %2, %1, 0B1h    ; 10110001 B
+    paddw %1, %2
+    ; end of @sum_8x2
+%endmacro   ; END of SUM_WORD_8x2_SSE2
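SUM_WORD_8x2_SSE2 reduces the eight 16-bit lanes of an XMM register to one word in three shuffle/add steps: swap the 64-bit halves, swap the 32-bit halves of the low quadword, then swap adjacent words. The same reduction written with SSE2 intrinsics, as a sketch:

    #include <emmintrin.h>

    /* Sum the eight 16-bit lanes of v; the total ends up in the low lane. */
    static inline __m128i SumWord8x2(__m128i v) {
        __m128i t;
        t = _mm_shuffle_epi32(v, 0x4E);    /* 01001110b: swap 64-bit halves */
        v = _mm_add_epi16(v, t);
        t = _mm_shufflelo_epi16(v, 0x4E);  /* swap 32-bit halves of low quad */
        v = _mm_add_epi16(v, t);
        t = _mm_shufflelo_epi16(v, 0xB1);  /* 10110001b: swap adjacent words */
        v = _mm_add_epi16(v, t);
        return v;
    }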
 
 
 %macro VAA_AVG_BLOCK_SSE2 6 ; dst, t0, t1, t2, t3, t4
-	movdqa %1, [r0    ]	; line 0
-	movdqa %2, [r0+r1]	; line 1
-	movdqa %3, %1
-	punpcklbw %1, xmm7
-	punpckhbw %3, xmm7
-	movdqa %4, %2
-	punpcklbw %4, xmm7
-	punpckhbw %2, xmm7
-	paddw %1, %4
-	paddw %2, %3
-	movdqa %3, [r0+r2]	; line 2
-	movdqa %4, [r0+r3]	; line 3
-	movdqa %5, %3
-	punpcklbw %3, xmm7
-	punpckhbw %5, xmm7
-	movdqa %6, %4
-	punpcklbw %6, xmm7
-	punpckhbw %4, xmm7
-	paddw %3, %6
-	paddw %4, %5
-	paddw %1, %3	; block 0, 1
-	paddw %2, %4	; block 2, 3
-	pshufd %3, %1, 0B1h
-	pshufd %4, %2, 0B1h
-	paddw %1, %3
-	paddw %2, %4
-	movdqa %3, %1
-	movdqa %4, %2
-	pshuflw %5, %1, 0B1h
-	pshufhw %6, %3, 0B1h
-	paddw %1, %5
-	paddw %3, %6
-	pshuflw %5, %2, 0B1h
-	pshufhw %6, %4, 0B1h
-	paddw %2, %5
-	paddw %4, %6
-	punpcklwd %1, %2
-	punpckhwd %3, %4
-	punpcklwd %1, %3
-	psraw %1, $04
+    movdqa %1, [r0    ] ; line 0
+    movdqa %2, [r0+r1]  ; line 1
+    movdqa %3, %1
+    punpcklbw %1, xmm7
+    punpckhbw %3, xmm7
+    movdqa %4, %2
+    punpcklbw %4, xmm7
+    punpckhbw %2, xmm7
+    paddw %1, %4
+    paddw %2, %3
+    movdqa %3, [r0+r2]  ; line 2
+    movdqa %4, [r0+r3]  ; line 3
+    movdqa %5, %3
+    punpcklbw %3, xmm7
+    punpckhbw %5, xmm7
+    movdqa %6, %4
+    punpcklbw %6, xmm7
+    punpckhbw %4, xmm7
+    paddw %3, %6
+    paddw %4, %5
+    paddw %1, %3    ; block 0, 1
+    paddw %2, %4    ; block 2, 3
+    pshufd %3, %1, 0B1h
+    pshufd %4, %2, 0B1h
+    paddw %1, %3
+    paddw %2, %4
+    movdqa %3, %1
+    movdqa %4, %2
+    pshuflw %5, %1, 0B1h
+    pshufhw %6, %3, 0B1h
+    paddw %1, %5
+    paddw %3, %6
+    pshuflw %5, %2, 0B1h
+    pshufhw %6, %4, 0B1h
+    paddw %2, %5
+    paddw %4, %6
+    punpcklwd %1, %2
+    punpckhwd %3, %4
+    punpcklwd %1, %3
+    psraw %1, $04
 %endmacro
 
 %macro VAA_AVG_BLOCK_SSSE3 6 ; dst, t0, t1, t2, t3, t4
-	movdqa %1, [r0    ]	; line 0
-	movdqa %2, [r0+r1]	; line 1
-	movdqa %3, %1
-	punpcklbw %1, xmm7
-	punpckhbw %3, xmm7
-	movdqa %4, %2
-	punpcklbw %4, xmm7
-	punpckhbw %2, xmm7
-	paddw %1, %4
-	paddw %2, %3
-	movdqa %3, [r0+r2]	; line 2
-	movdqa %4, [r0+r3]	; line 3
-	movdqa %5, %3
-	punpcklbw %3, xmm7
-	punpckhbw %5, xmm7
-	movdqa %6, %4
-	punpcklbw %6, xmm7
-	punpckhbw %4, xmm7
-	paddw %3, %6
-	paddw %4, %5
-	paddw %1, %3	; block 0, 1
-	paddw %2, %4	; block 2, 3
-	phaddw %1, %2	; block[0]: 0-15, 16-31; block[1]: 32-47, 48-63; ..
-	phaddw %1, xmm7	; block[0]: 0-15; block[1]: 16-31; block[2]: 32-47; block[3]: 48-63; ....
-	psraw %1, $04
+    movdqa %1, [r0    ] ; line 0
+    movdqa %2, [r0+r1]  ; line 1
+    movdqa %3, %1
+    punpcklbw %1, xmm7
+    punpckhbw %3, xmm7
+    movdqa %4, %2
+    punpcklbw %4, xmm7
+    punpckhbw %2, xmm7
+    paddw %1, %4
+    paddw %2, %3
+    movdqa %3, [r0+r2]  ; line 2
+    movdqa %4, [r0+r3]  ; line 3
+    movdqa %5, %3
+    punpcklbw %3, xmm7
+    punpckhbw %5, xmm7
+    movdqa %6, %4
+    punpcklbw %6, xmm7
+    punpckhbw %4, xmm7
+    paddw %3, %6
+    paddw %4, %5
+    paddw %1, %3    ; block 0, 1
+    paddw %2, %4    ; block 2, 3
+    phaddw %1, %2   ; block[0]: 0-15, 16-31; block[1]: 32-47, 48-63; ..
+    phaddw %1, xmm7 ; block[0]: 0-15; block[1]: 16-31; block[2]: 32-47; block[3]: 48-63; ....
+    psraw %1, $04
 %endmacro
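Both VAA_AVG_BLOCK variants collapse a 16x4 strip of luma into four values: the mean of each 4x4 sub-block (sum of 16 samples, arithmetic shift right by 4). The Analysis routines below invoke the macro four times to cover a 16x16 macroblock, giving sixteen block means. A scalar sketch with illustrative names:

    #include <stdint.h>

    /* Mean of each of the four 4x4 blocks in a 16x4 strip (>>4 == /16). */
    static void Avg4x4Blocks_ref(const uint8_t *pDataY, int32_t iLineSize,
                                 uint16_t pAvg[4]) {
        for (int b = 0; b < 4; b++) {
            uint16_t uiSum = 0;
            for (int y = 0; y < 4; y++)
                for (int x = 0; x < 4; x++)
                    uiSum += pDataY[y * iLineSize + 4 * b + x];
            pAvg[b] = uiSum >> 4;
        }
    }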
 
 
@@ -143,7 +143,7 @@
 ; , 6/7/2010
 
 ;***********************************************************************
-;	int32_t AnalysisVaaInfoIntra_sse2(	uint8_t *pDataY, const int32_t iLineSize );
+;   int32_t AnalysisVaaInfoIntra_sse2(  uint8_t *pDataY, const int32_t iLineSize );
 ;***********************************************************************
 WELS_EXTERN AnalysisVaaInfoIntra_sse2
 
@@ -174,71 +174,71 @@
     mov r4,r2
     sal r4,$01   ;r4 = 4*iLineSize
 
-	pxor xmm7, xmm7
+    pxor xmm7, xmm7
 
-	; loops
-	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [r7], xmm0
+    ; loops
+    VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+    movq [r7], xmm0
 
-	lea r0, [r0+r4]
-	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [r7+8], xmm0
+    lea r0, [r0+r4]
+    VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+    movq [r7+8], xmm0
 
-	lea r0, [r0+r4]
-	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [r7+16], xmm0
+    lea r0, [r0+r4]
+    VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+    movq [r7+16], xmm0
 
-	lea r0, [r0+r4]
-	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [r7+24], xmm0
+    lea r0, [r0+r4]
+    VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+    movq [r7+24], xmm0
 
-	movdqa xmm0, [r7]		; block 0~7
-	movdqa xmm1, [r7+16]	; block 8~15
-	movdqa xmm2, xmm0
-	paddw xmm0, xmm1
-	SUM_WORD_8x2_SSE2 xmm0, xmm3
+    movdqa xmm0, [r7]       ; block 0~7
+    movdqa xmm1, [r7+16]    ; block 8~15
+    movdqa xmm2, xmm0
+    paddw xmm0, xmm1
+    SUM_WORD_8x2_SSE2 xmm0, xmm3
 
-	pmullw xmm1, xmm1
-	pmullw xmm2, xmm2
-	movdqa xmm3, xmm1
-	movdqa xmm4, xmm2
-	punpcklwd xmm1, xmm7
-	punpckhwd xmm3, xmm7
-	punpcklwd xmm2, xmm7
-	punpckhwd xmm4, xmm7
-	paddd xmm1, xmm2
-	paddd xmm3, xmm4
-	paddd xmm1, xmm3
-	pshufd xmm2, xmm1, 01Bh
-	paddd xmm1, xmm2
-	pshufd xmm2, xmm1, 0B1h
-	paddd xmm1, xmm2
+    pmullw xmm1, xmm1
+    pmullw xmm2, xmm2
+    movdqa xmm3, xmm1
+    movdqa xmm4, xmm2
+    punpcklwd xmm1, xmm7
+    punpckhwd xmm3, xmm7
+    punpcklwd xmm2, xmm7
+    punpckhwd xmm4, xmm7
+    paddd xmm1, xmm2
+    paddd xmm3, xmm4
+    paddd xmm1, xmm3
+    pshufd xmm2, xmm1, 01Bh
+    paddd xmm1, xmm2
+    pshufd xmm2, xmm1, 0B1h
+    paddd xmm1, xmm2
 
 
 
-	movd r2d, xmm0
-	and r2, 0ffffh		; effective low work truncated
-	mov r3, r2
-	imul r2, r3
-	sar r2, $04
-	movd retrd, xmm1
-	sub retrd, r2d
+    movd r2d, xmm0
+    and r2, 0ffffh      ; keep only the effective low word
+    mov r3, r2
+    imul r2, r3
+    sar r2, $04
+    movd retrd, xmm1
+    sub retrd, r2d
 
-	add r7,32
-	add r7,r5
+    add r7,32
+    add r7,r5
 
 %ifdef X86_32
-	pop r6
-	pop r5
-	pop r4
-	pop r3
+    pop r6
+    pop r5
+    pop r4
+    pop r3
 %endif
-	POP_XMM
+    POP_XMM
 
-	ret
+    ret
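Read from the assembly above, AnalysisVaaInfoIntra_sse2 ends up returning sum(avg_i^2) - (sum(avg_i))^2/16 over the sixteen 4x4 block means, i.e. sixteen times their variance. A scalar sketch of that final step (the function name is illustrative):

    #include <stdint.h>

    /* Variance-style measure over the sixteen 4x4 block means of one MB:
       sum of squares minus (sum squared >> 4). */
    static int32_t VaaIntraMeasure_ref(const uint16_t pAvg[16]) {
        int32_t iSum = 0, iSqSum = 0;
        for (int i = 0; i < 16; i++) {
            iSum   += pAvg[i];
            iSqSum += pAvg[i] * pAvg[i];
        }
        return iSqSum - ((iSum * iSum) >> 4);
    }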
 
 ;***********************************************************************
-;	int32_t AnalysisVaaInfoIntra_ssse3(	uint8_t *pDataY, const int32_t iLineSize );
+;   int32_t AnalysisVaaInfoIntra_ssse3( uint8_t *pDataY, const int32_t iLineSize );
 ;***********************************************************************
 WELS_EXTERN AnalysisVaaInfoIntra_ssse3
 
@@ -269,47 +269,47 @@
     mov r4,r2
     sal r4,$01   ;r4 = 4*iLineSize
 
-	pxor xmm7, xmm7
+    pxor xmm7, xmm7
 
-	; loops
-	VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+    ; loops
+    VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
     movq [r7],xmm0
 
-	lea r0,[r0+r4]
-	VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
+    lea r0,[r0+r4]
+    VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
     movq [r7+8],xmm1
 
 
-	lea r0,[r0+r4]
-	VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+    lea r0,[r0+r4]
+    VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
     movq [r7+16],xmm0
 
-	lea r0,[r0+r4]
-	VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
+    lea r0,[r0+r4]
+    VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
     movq [r7+24],xmm1
 
 
-	movdqa xmm0,[r7]
-	movdqa xmm1,[r7+16]
-	movdqa xmm2, xmm0
-	paddw xmm0, xmm1
-	SUM_WORD_8x2_SSE2 xmm0, xmm3	; better performance than that of phaddw sets
+    movdqa xmm0,[r7]
+    movdqa xmm1,[r7+16]
+    movdqa xmm2, xmm0
+    paddw xmm0, xmm1
+    SUM_WORD_8x2_SSE2 xmm0, xmm3    ; better performance than that of phaddw sets
 
-	pmullw xmm1, xmm1
-	pmullw xmm2, xmm2
-	movdqa xmm3, xmm1
-	movdqa xmm4, xmm2
-	punpcklwd xmm1, xmm7
-	punpckhwd xmm3, xmm7
-	punpcklwd xmm2, xmm7
-	punpckhwd xmm4, xmm7
-	paddd xmm1, xmm2
-	paddd xmm3, xmm4
-	paddd xmm1, xmm3
-	pshufd xmm2, xmm1, 01Bh
-	paddd xmm1, xmm2
-	pshufd xmm2, xmm1, 0B1h
-	paddd xmm1, xmm2
+    pmullw xmm1, xmm1
+    pmullw xmm2, xmm2
+    movdqa xmm3, xmm1
+    movdqa xmm4, xmm2
+    punpcklwd xmm1, xmm7
+    punpckhwd xmm3, xmm7
+    punpcklwd xmm2, xmm7
+    punpckhwd xmm4, xmm7
+    paddd xmm1, xmm2
+    paddd xmm3, xmm4
+    paddd xmm1, xmm3
+    pshufd xmm2, xmm1, 01Bh
+    paddd xmm1, xmm2
+    pshufd xmm2, xmm1, 0B1h
+    paddd xmm1, xmm2
 
 
     movd r2d, xmm0
@@ -318,94 +318,94 @@
     imul r2, r3
     sar r2, $04
     movd retrd, xmm1
-	sub retrd, r2d
+    sub retrd, r2d
 
-	add r7,32
-	add r7,r5
+    add r7,32
+    add r7,r5
 %ifdef X86_32
-	pop r6
-	pop r5
-	pop r4
-	pop r3
+    pop r6
+    pop r5
+    pop r4
+    pop r3
 %endif
-	POP_XMM
+    POP_XMM
 
-	ret
+    ret
 
 ;***********************************************************************
-;	uint8_t MdInterAnalysisVaaInfo_sse41( int32_t *pSad8x8 )
+;   uint8_t MdInterAnalysisVaaInfo_sse41( int32_t *pSad8x8 )
 ;***********************************************************************
 WELS_EXTERN MdInterAnalysisVaaInfo_sse41
-	%assign push_num 0
-	LOAD_1_PARA
-	movdqa xmm0,[r0]
-	pshufd xmm1, xmm0, 01Bh
-	paddd xmm1, xmm0
-	pshufd xmm2, xmm1, 0B1h
-	paddd xmm1, xmm2
-	psrad xmm1, 02h		; iAverageSad
-	movdqa xmm2, xmm1
-	psrad xmm2, 06h
-	movdqa xmm3, xmm0	; iSadBlock
-	psrad xmm3, 06h
-	psubd xmm3, xmm2
-	pmulld xmm3, xmm3	; [comment]: pmulld from SSE4.1 instruction sets
-	pshufd xmm4, xmm3, 01Bh
-	paddd xmm4, xmm3
-	pshufd xmm3, xmm4, 0B1h
-	paddd xmm3, xmm4
-	movd r0d, xmm3
-	cmp r0d, 20	; INTER_VARIANCE_SAD_THRESHOLD
+    %assign push_num 0
+    LOAD_1_PARA
+    movdqa xmm0,[r0]
+    pshufd xmm1, xmm0, 01Bh
+    paddd xmm1, xmm0
+    pshufd xmm2, xmm1, 0B1h
+    paddd xmm1, xmm2
+    psrad xmm1, 02h     ; iAverageSad
+    movdqa xmm2, xmm1
+    psrad xmm2, 06h
+    movdqa xmm3, xmm0   ; iSadBlock
+    psrad xmm3, 06h
+    psubd xmm3, xmm2
+    pmulld xmm3, xmm3   ; [comment]: pmulld from SSE4.1 instruction sets
+    pshufd xmm4, xmm3, 01Bh
+    paddd xmm4, xmm3
+    pshufd xmm3, xmm4, 0B1h
+    paddd xmm3, xmm4
+    movd r0d, xmm3
+    cmp r0d, 20 ; INTER_VARIANCE_SAD_THRESHOLD
 
-	jb near .threshold_exit
-	pshufd xmm0, xmm0, 01Bh
-	pcmpgtd xmm0, xmm1	; iSadBlock > iAverageSad
-	movmskps retrd, xmm0
-	ret
+    jb near .threshold_exit
+    pshufd xmm0, xmm0, 01Bh
+    pcmpgtd xmm0, xmm1  ; iSadBlock > iAverageSad
+    movmskps retrd, xmm0
+    ret
 .threshold_exit:
-	mov retrd, 15
-	ret
+    mov retrd, 15
+    ret
 
 ;***********************************************************************
-;	uint8_t MdInterAnalysisVaaInfo_sse2( int32_t *pSad8x8 )
+;   uint8_t MdInterAnalysisVaaInfo_sse2( int32_t *pSad8x8 )
 ;***********************************************************************
 WELS_EXTERN MdInterAnalysisVaaInfo_sse2
-	%assign push_num 0
-	LOAD_1_PARA
-	movdqa xmm0, [r0]
-	pshufd xmm1, xmm0, 01Bh
-	paddd xmm1, xmm0
-	pshufd xmm2, xmm1, 0B1h
-	paddd xmm1, xmm2
-	psrad xmm1, 02h		; iAverageSad
-	movdqa xmm2, xmm1
-	psrad xmm2, 06h
-	movdqa xmm3, xmm0	; iSadBlock
-	psrad xmm3, 06h
-	psubd xmm3, xmm2
+    %assign push_num 0
+    LOAD_1_PARA
+    movdqa xmm0, [r0]
+    pshufd xmm1, xmm0, 01Bh
+    paddd xmm1, xmm0
+    pshufd xmm2, xmm1, 0B1h
+    paddd xmm1, xmm2
+    psrad xmm1, 02h     ; iAverageSad
+    movdqa xmm2, xmm1
+    psrad xmm2, 06h
+    movdqa xmm3, xmm0   ; iSadBlock
+    psrad xmm3, 06h
+    psubd xmm3, xmm2
 
-	; to replace pmulld functionality as below
-	movdqa xmm2, xmm3
-	pmuludq xmm2, xmm3
-	pshufd xmm4, xmm3, 0B1h
-	pmuludq xmm4, xmm4
-	movdqa xmm5, xmm2
-	punpckldq xmm5, xmm4
-	punpckhdq xmm2, xmm4
-	punpcklqdq xmm5, xmm2
+    ; to replace pmulld functionality as below
+    movdqa xmm2, xmm3
+    pmuludq xmm2, xmm3
+    pshufd xmm4, xmm3, 0B1h
+    pmuludq xmm4, xmm4
+    movdqa xmm5, xmm2
+    punpckldq xmm5, xmm4
+    punpckhdq xmm2, xmm4
+    punpcklqdq xmm5, xmm2
 
-	pshufd xmm4, xmm5, 01Bh
-	paddd xmm4, xmm5
-	pshufd xmm5, xmm4, 0B1h
-	paddd xmm5, xmm4
+    pshufd xmm4, xmm5, 01Bh
+    paddd xmm4, xmm5
+    pshufd xmm5, xmm4, 0B1h
+    paddd xmm5, xmm4
 
-	movd r0d, xmm5
-	cmp r0d, 20	; INTER_VARIANCE_SAD_THRESHOLD
-	jb near .threshold_exit
-	pshufd xmm0, xmm0, 01Bh
-	pcmpgtd xmm0, xmm1	; iSadBlock > iAverageSad
-	movmskps retrd, xmm0
-	ret
+    movd r0d, xmm5
+    cmp r0d, 20 ; INTER_VARIANCE_SAD_THRESHOLD
+    jb near .threshold_exit
+    pshufd xmm0, xmm0, 01Bh
+    pcmpgtd xmm0, xmm1  ; iSadBlock > iAverageSad
+    movmskps retrd, xmm0
+    ret
 .threshold_exit:
-	mov retrd, 15
-	ret
+    mov retrd, 15
+    ret
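Both MdInterAnalysisVaaInfo variants average the four 8x8 SADs of a macroblock and measure their spread (squared differences of the SADs scaled down by 64). Below INTER_VARIANCE_SAD_THRESHOLD they return 15; otherwise they return a 4-bit mask of the blocks whose SAD exceeds the average. A scalar sketch; the pshufd before the compare reverses lane order, so the exact bit ordering below is an assumption:

    #include <stdint.h>

    #define INTER_VARIANCE_SAD_THRESHOLD 20   /* value taken from the asm above */

    static uint8_t MdInterAnalysisVaaInfo_ref(const int32_t pSad8x8[4]) {
        int32_t iAverageSad = (pSad8x8[0] + pSad8x8[1] + pSad8x8[2] + pSad8x8[3]) >> 2;
        int32_t iVariance = 0;
        for (int i = 0; i < 4; i++) {
            int32_t iDiff = (pSad8x8[i] >> 6) - (iAverageSad >> 6);
            iVariance += iDiff * iDiff;
        }
        if (iVariance < INTER_VARIANCE_SAD_THRESHOLD)
            return 15;
        uint8_t uiMask = 0;
        for (int i = 0; i < 4; i++)           /* bit order is assumed */
            if (pSad8x8[i] > iAverageSad)
                uiMask |= (uint8_t)(1 << i);
        return uiMask;
    }
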
--- a/codec/decoder/core/arm/block_add_neon.S
+++ b/codec/decoder/core/arm/block_add_neon.S
@@ -36,67 +36,67 @@
 #ifdef __APPLE__
 
 .macro ROW_TRANSFORM_1_STEP
-//	{	//	input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
-    vaddl.s16		$4, $0, $2			//int32 e[i][0] = src[0] + src[2];
-    vsubl.s16		$5, $0, $2			//int32 e[i][1] = src[0] - src[2];
-    vshr.s16		$8, $1, #1
-    vshr.s16		$9, $3, #1
-    vsubl.s16		$6, $8, $3			//int32 e[i][2] = (src[1]>>1)-src[3];
-    vaddl.s16		$7, $1, $9			//int32 e[i][3] = src[1] + (src[3]>>1);
-//	}
+//  {   //  input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
+    vaddl.s16       $4, $0, $2          //int32 e[i][0] = src[0] + src[2];
+    vsubl.s16       $5, $0, $2          //int32 e[i][1] = src[0] - src[2];
+    vshr.s16        $8, $1, #1
+    vshr.s16        $9, $3, #1
+    vsubl.s16       $6, $8, $3          //int32 e[i][2] = (src[1]>>1)-src[3];
+    vaddl.s16       $7, $1, $9          //int32 e[i][3] = src[1] + (src[3]>>1);
+//  }
 .endm
 
-.macro TRANSFORM_4BYTES	// both row & col transform used
-//	{	//	output: f_q[0]~[3], input: e_q[0]~[3];
-    vadd.s32		$0, $4, $7			//int16 f[i][0] = e[i][0] + e[i][3];
-    vadd.s32		$1, $5, $6			//int16 f[i][1] = e[i][1] + e[i][2];
-    vsub.s32		$2, $5, $6			//int16 f[i][2] = e[i][1] - e[i][2];
-    vsub.s32		$3, $4, $7			//int16 f[i][3] = e[i][0] - e[i][3];
-//	}
+.macro TRANSFORM_4BYTES // both row & col transform used
+//  {   //  output: f_q[0]~[3], input: e_q[0]~[3];
+    vadd.s32        $0, $4, $7          //int16 f[i][0] = e[i][0] + e[i][3];
+    vadd.s32        $1, $5, $6          //int16 f[i][1] = e[i][1] + e[i][2];
+    vsub.s32        $2, $5, $6          //int16 f[i][2] = e[i][1] - e[i][2];
+    vsub.s32        $3, $4, $7          //int16 f[i][3] = e[i][0] - e[i][3];
+//  }
 .endm
 
 .macro COL_TRANSFORM_1_STEP
-//	{	//	input: src_q[0]~[3], output: e_q[0]~[3];
-    vadd.s32		$4, $0, $2			//int32 e[0][j] = f[0][j] + f[2][j];
-    vsub.s32		$5, $0, $2			//int32 e[1][j] = f[0][j] - f[2][j];
-    vshr.s32		$6, $1, #1
-    vshr.s32		$7, $3, #1
-    vsub.s32		$6, $6, $3			//int32 e[2][j] = (f[1][j]>>1) - f[3][j];
-    vadd.s32		$7, $1, $7			//int32 e[3][j] = f[1][j] + (f[3][j]>>1);
-//	}
+//  {   //  input: src_q[0]~[3], output: e_q[0]~[3];
+    vadd.s32        $4, $0, $2          //int32 e[0][j] = f[0][j] + f[2][j];
+    vsub.s32        $5, $0, $2          //int32 e[1][j] = f[0][j] - f[2][j];
+    vshr.s32        $6, $1, #1
+    vshr.s32        $7, $3, #1
+    vsub.s32        $6, $6, $3          //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
+    vadd.s32        $7, $1, $7          //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
+//  }
 .endm
 
 #else
 
 .macro ROW_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
-//	{	//	input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
-    vaddl.s16		\arg4, \arg0, \arg2			//int32 e[i][0] = src[0] + src[2];
-    vsubl.s16		\arg5, \arg0, \arg2			//int32 e[i][1] = src[0] - src[2];
-    vshr.s16		\arg8, \arg1, #1
-    vshr.s16		\arg9, \arg3, #1
-    vsubl.s16		\arg6, \arg8, \arg3			//int32 e[i][2] = (src[1]>>1)-src[3];
-    vaddl.s16		\arg7, \arg1, \arg9			//int32 e[i][3] = src[1] + (src[3]>>1);
-//	}
+//  {   //  input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
+    vaddl.s16       \arg4, \arg0, \arg2         //int32 e[i][0] = src[0] + src[2];
+    vsubl.s16       \arg5, \arg0, \arg2         //int32 e[i][1] = src[0] - src[2];
+    vshr.s16        \arg8, \arg1, #1
+    vshr.s16        \arg9, \arg3, #1
+    vsubl.s16       \arg6, \arg8, \arg3         //int32 e[i][2] = (src[1]>>1)-src[3];
+    vaddl.s16       \arg7, \arg1, \arg9         //int32 e[i][3] = src[1] + (src[3]>>1);
+//  }
 .endm
 
 .macro TRANSFORM_4BYTES  arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 // both row & col transform used
-//	{	//	output: f_q[0]~[3], input: e_q[0]~[3];
-    vadd.s32		\arg0, \arg4, \arg7			//int16 f[i][0] = e[i][0] + e[i][3];
-    vadd.s32		\arg1, \arg5, \arg6			//int16 f[i][1] = e[i][1] + e[i][2];
-    vsub.s32		\arg2, \arg5, \arg6			//int16 f[i][2] = e[i][1] - e[i][2];
-    vsub.s32		\arg3, \arg4, \arg7			//int16 f[i][3] = e[i][0] - e[i][3];
-//	}
+//  {   //  output: f_q[0]~[3], input: e_q[0]~[3];
+    vadd.s32        \arg0, \arg4, \arg7         //int16 f[i][0] = e[i][0] + e[i][3];
+    vadd.s32        \arg1, \arg5, \arg6         //int16 f[i][1] = e[i][1] + e[i][2];
+    vsub.s32        \arg2, \arg5, \arg6         //int16 f[i][2] = e[i][1] - e[i][2];
+    vsub.s32        \arg3, \arg4, \arg7         //int16 f[i][3] = e[i][0] - e[i][3];
+//  }
 .endm
 
 .macro COL_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
-//	{	//	input: src_q[0]~[3], output: e_q[0]~[3];
-    vadd.s32		\arg4, \arg0, \arg2			//int32 e[0][j] = f[0][j] + f[2][j];
-    vsub.s32		\arg5, \arg0, \arg2			//int32 e[1][j] = f[0][j] - f[2][j];
-    vshr.s32		\arg6, \arg1, #1
-    vshr.s32		\arg7, \arg3, #1
-    vsub.s32		\arg6, \arg6, \arg3			//int32 e[2][j] = (f[1][j]>>1) - f[3][j];
-    vadd.s32		\arg7, \arg1, \arg7			//int32 e[3][j] = f[1][j] + (f[3][j]>>1);
-//	}
+//  {   //  input: src_q[0]~[3], output: e_q[0]~[3];
+    vadd.s32        \arg4, \arg0, \arg2         //int32 e[0][j] = f[0][j] + f[2][j];
+    vsub.s32        \arg5, \arg0, \arg2         //int32 e[1][j] = f[0][j] - f[2][j];
+    vshr.s32        \arg6, \arg1, #1
+    vshr.s32        \arg7, \arg3, #1
+    vsub.s32        \arg6, \arg6, \arg3         //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
+    vadd.s32        \arg7, \arg1, \arg7         //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
+//  }
 .endm
 #endif
 // r0    int16_t* block,
@@ -103,61 +103,61 @@
 // r1    int8_t* non_zero_count,
 WELS_ASM_FUNC_BEGIN SetNonZeroCount_neon
 
-	vld1.64	{d0-d2}, [r1]
+    vld1.64 {d0-d2}, [r1]
 
-	vceq.s8	q0, q0, #0
-	vceq.s8	d2, d2, #0
-	vmvn	q0, q0
-	vmvn	d2, d2
-	vabs.s8	q0, q0
-	vabs.s8	d2, d2
+    vceq.s8 q0, q0, #0
+    vceq.s8 d2, d2, #0
+    vmvn    q0, q0
+    vmvn    d2, d2
+    vabs.s8 q0, q0
+    vabs.s8 d2, d2
 
-	vst1.64	{d0-d2}, [r1]
+    vst1.64 {d0-d2}, [r1]
 WELS_ASM_FUNC_END
 
 
-//	uint8_t *pred, const int32_t stride, int16_t *rs
+//  uint8_t *pred, const int32_t stride, int16_t *rs
 WELS_ASM_FUNC_BEGIN IdctResAddPred_neon
 
-	vld4.s16		{d0, d1, d2, d3}, [r2]		// cost 3 cycles!
+    vld4.s16        {d0, d1, d2, d3}, [r2]      // cost 3 cycles!
 
-	ROW_TRANSFORM_1_STEP		d0, d1, d2, d3, q8, q9, q10, q11, d4, d5
+    ROW_TRANSFORM_1_STEP        d0, d1, d2, d3, q8, q9, q10, q11, d4, d5
 
-	TRANSFORM_4BYTES		q0, q1, q2, q3, q8, q9, q10, q11
+    TRANSFORM_4BYTES        q0, q1, q2, q3, q8, q9, q10, q11
 
-	// transform element 32bits
-	vtrn.s32		q0, q1				//[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
-	vtrn.s32		q2, q3				//[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
-	vswp			d1, d4				//[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
-	vswp			d3, d6				//[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
+    // transform element 32bits
+    vtrn.s32        q0, q1              //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
+    vtrn.s32        q2, q3              //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
+    vswp            d1, d4              //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
+    vswp            d3, d6              //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
 
-	COL_TRANSFORM_1_STEP		q0, q1, q2, q3, q8, q9, q10, q11
+    COL_TRANSFORM_1_STEP        q0, q1, q2, q3, q8, q9, q10, q11
 
-	TRANSFORM_4BYTES		q0, q1, q2, q3, q8, q9, q10, q11
+    TRANSFORM_4BYTES        q0, q1, q2, q3, q8, q9, q10, q11
 
-	//after clip_table[MAX_NEG_CROP] into [0, 255]
-	mov			r2, r0
-	vld1.32		{d20[0]},[r0],r1
-	vld1.32		{d20[1]},[r0],r1
-	vld1.32		{d22[0]},[r0],r1
-	vld1.32		{d22[1]},[r0]
+    //after clip_table[MAX_NEG_CROP] into [0, 255]
+    mov         r2, r0
+    vld1.32     {d20[0]},[r0],r1
+    vld1.32     {d20[1]},[r0],r1
+    vld1.32     {d22[0]},[r0],r1
+    vld1.32     {d22[1]},[r0]
 
-	vrshrn.s32		d16, q0, #6
-	vrshrn.s32		d17, q1, #6
-	vrshrn.s32		d18, q2, #6
-	vrshrn.s32		d19, q3, #6
+    vrshrn.s32      d16, q0, #6
+    vrshrn.s32      d17, q1, #6
+    vrshrn.s32      d18, q2, #6
+    vrshrn.s32      d19, q3, #6
 
-	vmovl.u8		q0,d20
-	vmovl.u8		q1,d22
-	vadd.s16		q0,q8
-	vadd.s16		q1,q9
+    vmovl.u8        q0,d20
+    vmovl.u8        q1,d22
+    vadd.s16        q0,q8
+    vadd.s16        q1,q9
 
-	vqmovun.s16		d20,q0
-	vqmovun.s16		d22,q1
+    vqmovun.s16     d20,q0
+    vqmovun.s16     d22,q1
 
-	vst1.32		{d20[0]},[r2],r1
-	vst1.32		{d20[1]},[r2],r1
-	vst1.32		{d22[0]},[r2],r1
-	vst1.32		{d22[1]},[r2]
+    vst1.32     {d20[0]},[r2],r1
+    vst1.32     {d20[1]},[r2],r1
+    vst1.32     {d22[0]},[r2],r1
+    vst1.32     {d22[1]},[r2]
 WELS_ASM_FUNC_END
 #endif
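
The macros above together with IdctResAddPred_neon implement the H.264 4x4 inverse integer transform plus adding the residual back onto the prediction. A scalar C sketch of that computation, following the e[]/f[] comments (it assumes rs[16] is row-major and uses illustrative names; the NEON code reaches the same result with vrshrn for the rounded >>6 and vqmovun for the clip):

    #include <stdint.h>

    static uint8_t Clip255(int v) { return v < 0 ? 0 : (v > 255 ? 255 : (uint8_t)v); }

    /* Illustrative sketch of the 4x4 inverse transform + residual add. */
    static void IdctResAddPred_c(uint8_t* pred, int32_t stride, const int16_t* rs) {
        int t[16], r[16];
        for (int i = 0; i < 4; i++) {                       /* row transform */
            const int16_t* s = rs + 4 * i;
            int e0 = s[0] + s[2],        e1 = s[0] - s[2];
            int e2 = (s[1] >> 1) - s[3], e3 = s[1] + (s[3] >> 1);
            t[4*i+0] = e0 + e3; t[4*i+1] = e1 + e2;
            t[4*i+2] = e1 - e2; t[4*i+3] = e0 - e3;
        }
        for (int j = 0; j < 4; j++) {                       /* column transform */
            int e0 = t[j] + t[8+j],           e1 = t[j] - t[8+j];
            int e2 = (t[4+j] >> 1) - t[12+j], e3 = t[4+j] + (t[12+j] >> 1);
            r[j]    = e0 + e3; r[4+j]  = e1 + e2;
            r[8+j]  = e1 - e2; r[12+j] = e0 - e3;
        }
        for (int y = 0; y < 4; y++)
            for (int x = 0; x < 4; x++)
                pred[y*stride+x] = Clip255(pred[y*stride+x] + ((r[4*y+x] + 32) >> 6));
    }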
--- a/codec/decoder/core/arm/intra_pred_neon.S
+++ b/codec/decoder/core/arm/intra_pred_neon.S
@@ -38,45 +38,45 @@
 #ifdef __APPLE__
 //Global macro
 .macro GET_8BYTE_DATA
-	vld1.8 {$0[0]}, [$1], $2
-	vld1.8 {$0[1]}, [$1], $2
-	vld1.8 {$0[2]}, [$1], $2
-	vld1.8 {$0[3]}, [$1], $2
-	vld1.8 {$0[4]}, [$1], $2
-	vld1.8 {$0[5]}, [$1], $2
-	vld1.8 {$0[6]}, [$1], $2
-	vld1.8 {$0[7]}, [$1], $2
+    vld1.8 {$0[0]}, [$1], $2
+    vld1.8 {$0[1]}, [$1], $2
+    vld1.8 {$0[2]}, [$1], $2
+    vld1.8 {$0[3]}, [$1], $2
+    vld1.8 {$0[4]}, [$1], $2
+    vld1.8 {$0[5]}, [$1], $2
+    vld1.8 {$0[6]}, [$1], $2
+    vld1.8 {$0[7]}, [$1], $2
 .endmacro
 #else
 //Global macro
 .macro GET_8BYTE_DATA arg0, arg1, arg2
-	vld1.8 {\arg0[0]}, [\arg1], \arg2
-	vld1.8 {\arg0[1]}, [\arg1], \arg2
-	vld1.8 {\arg0[2]}, [\arg1], \arg2
-	vld1.8 {\arg0[3]}, [\arg1], \arg2
-	vld1.8 {\arg0[4]}, [\arg1], \arg2
-	vld1.8 {\arg0[5]}, [\arg1], \arg2
-	vld1.8 {\arg0[6]}, [\arg1], \arg2
-	vld1.8 {\arg0[7]}, [\arg1], \arg2
+    vld1.8 {\arg0[0]}, [\arg1], \arg2
+    vld1.8 {\arg0[1]}, [\arg1], \arg2
+    vld1.8 {\arg0[2]}, [\arg1], \arg2
+    vld1.8 {\arg0[3]}, [\arg1], \arg2
+    vld1.8 {\arg0[4]}, [\arg1], \arg2
+    vld1.8 {\arg0[5]}, [\arg1], \arg2
+    vld1.8 {\arg0[6]}, [\arg1], \arg2
+    vld1.8 {\arg0[7]}, [\arg1], \arg2
 .endm
 #endif
 
 
 WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredV_neon
-	//Get the top line data to 'q0'
-	sub  r2, r0, r1
-	vldm r2, {d0, d1}
+    //Get the top line data to 'q0'
+    sub  r2, r0, r1
+    vldm r2, {d0, d1}
 
-	mov  r2, r0
-	mov  r3, #4
-	//Set the top line to the each line of MB(16*16)
+    mov  r2, r0
+    mov  r3, #4
+    //Set each line of the MB (16x16) from the top line
 loop_0_get_i16x16_luma_pred_v:
-	vst1.8 {d0,d1}, [r2], r1
-	vst1.8 {d0,d1}, [r2], r1
-	vst1.8 {d0,d1}, [r2], r1
-	vst1.8 {d0,d1}, [r2], r1
-	subs  r3, #1
-	bne  loop_0_get_i16x16_luma_pred_v
+    vst1.8 {d0,d1}, [r2], r1
+    vst1.8 {d0,d1}, [r2], r1
+    vst1.8 {d0,d1}, [r2], r1
+    vst1.8 {d0,d1}, [r2], r1
+    subs  r3, #1
+    bne  loop_0_get_i16x16_luma_pred_v
 
 WELS_ASM_FUNC_END
 
@@ -83,59 +83,59 @@
 
 
 WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredH_neon
-	sub  r2, r0, #1
-	mov  r3, #4
+    sub  r2, r0, #1
+    mov  r3, #4
 loop_0_get_i16x16_luma_pred_h:
-	//Get one byte data from left side
-	vld1.8 {d0[],d1[]}, [r2], r1
-	vld1.8 {d2[],d3[]}, [r2], r1
-	vld1.8 {d4[],d5[]}, [r2], r1
-	vld1.8 {d6[],d7[]}, [r2], r1
+    //Get one byte data from left side
+    vld1.8 {d0[],d1[]}, [r2], r1
+    vld1.8 {d2[],d3[]}, [r2], r1
+    vld1.8 {d4[],d5[]}, [r2], r1
+    vld1.8 {d6[],d7[]}, [r2], r1
 
-	//Set the line of MB using the left side byte data
-	vst1.8 {d0,d1}, [r0], r1
-	vst1.8 {d2,d3}, [r0], r1
-	vst1.8 {d4,d5}, [r0], r1
-	vst1.8 {d6,d7}, [r0], r1
+    //Set the line of MB using the left side byte data
+    vst1.8 {d0,d1}, [r0], r1
+    vst1.8 {d2,d3}, [r0], r1
+    vst1.8 {d4,d5}, [r0], r1
+    vst1.8 {d6,d7}, [r0], r1
 
-	subs  r3, #1
-	bne  loop_0_get_i16x16_luma_pred_h
+    subs  r3, #1
+    bne  loop_0_get_i16x16_luma_pred_h
 
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredDc_neon
-	//stmdb sp!, { r2-r5, lr}
-	//Get the left vertical line data
-	sub r2, r0, #1
-	GET_8BYTE_DATA d0, r2, r1
-	GET_8BYTE_DATA d1, r2, r1
+    //stmdb sp!, { r2-r5, lr}
+    //Get the left vertical line data
+    sub r2, r0, #1
+    GET_8BYTE_DATA d0, r2, r1
+    GET_8BYTE_DATA d1, r2, r1
 
-	//Get the top horizontal line data
-	sub  r2, r0, r1
-	vldm r2, {d2, d3}
+    //Get the top horizontal line data
+    sub  r2, r0, r1
+    vldm r2, {d2, d3}
 
-	//Calculate the sum of top horizontal line data and vertical line data
-	vpaddl.u8 q0, q0
-	vpaddl.u8 q1, q1
-	vadd.u16  q0, q0, q1
-	vadd.u16  d0, d0, d1
-	vpaddl.u16 d0, d0
-	vpaddl.u32 d0, d0
+    //Calculate the sum of top horizontal line data and vertical line data
+    vpaddl.u8 q0, q0
+    vpaddl.u8 q1, q1
+    vadd.u16  q0, q0, q1
+    vadd.u16  d0, d0, d1
+    vpaddl.u16 d0, d0
+    vpaddl.u32 d0, d0
 
-	//Calculate the mean value
-	vrshr.u16  d0, d0, #5
-	vdup.8     q0, d0[0]
+    //Calculate the mean value
+    vrshr.u16  d0, d0, #5
+    vdup.8     q0, d0[0]
 
-	//Set the mean value to the all of member of MB
-	mov  r2, #4
+    //Set the mean value to the all of member of MB
+    mov  r2, #4
 loop_0_get_i16x16_luma_pred_dc_both:
-	vst1.8 {d0,d1}, [r0], r1
-	vst1.8 {d0,d1}, [r0], r1
-	vst1.8 {d0,d1}, [r0], r1
-	vst1.8 {d0,d1}, [r0], r1
-	subs  r2, #1
-	bne  loop_0_get_i16x16_luma_pred_dc_both
+    vst1.8 {d0,d1}, [r0], r1
+    vst1.8 {d0,d1}, [r0], r1
+    vst1.8 {d0,d1}, [r0], r1
+    vst1.8 {d0,d1}, [r0], r1
+    subs  r2, #1
+    bne  loop_0_get_i16x16_luma_pred_dc_both
 
 WELS_ASM_FUNC_END
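
A scalar sketch of what the DC path above computes: the 16 top and 16 left neighbours are summed and the rounded mean, (sum + 16) >> 5, fills the whole macroblock. The helper name is illustrative; pPred is assumed to point at the top-left pixel of the MB, as in the NEON code:

    #include <stdint.h>

    /* Illustrative sketch of 16x16 DC prediction with both neighbours available. */
    static void I16x16PredDc_c(uint8_t* pred, int32_t stride) {
        int sum = 0;
        for (int i = 0; i < 16; i++)
            sum += pred[i * stride - 1] + pred[i - stride];  /* left column + top row */
        uint8_t dc = (uint8_t)((sum + 16) >> 5);             /* the vrshr.u16 #5 above */
        for (int y = 0; y < 16; y++)
            for (int x = 0; x < 16; x++)
                pred[y * stride + x] = dc;
    }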
 
@@ -149,106 +149,106 @@
 
 
 WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredPlane_neon
-	//stmdb sp!, { r2-r5, lr}
+    //stmdb sp!, { r2-r5, lr}
 
-	//Load the table {(8,7,6,5,4,3,2,1) * 5}
-	adr r2, CONST0_GET_I16X16_LUMA_PRED_PLANE
-	vldr    d0, [r2]
+    //Load the table {(8,7,6,5,4,3,2,1) * 5}
+    adr r2, CONST0_GET_I16X16_LUMA_PRED_PLANE
+    vldr    d0, [r2]
 
-	//Pack the top[-1] ~ top[6] to d1
-	sub       r2,  r0, r1
-	sub       r3,  r2, #1
-	vld1.8    d1, [r3]
+    //Pack the top[-1] ~ top[6] to d1
+    sub       r2,  r0, r1
+    sub       r3,  r2, #1
+    vld1.8    d1, [r3]
 
-	//Pack the top[8] ~ top[15] to d2
-	add       r3, #9
-	vld1.8    d2, [r3]
+    //Pack the top[8] ~ top[15] to d2
+    add       r3, #9
+    vld1.8    d2, [r3]
 
-	//Save the top[15] to d6 for next step
-	vdup.u8   d6,   d2[7]
+    //Save the top[15] to d6 for next step
+    vdup.u8   d6,   d2[7]
 
-	//Get and pack left[-1] ~ left[6] to d4
-	sub       r3,  r2, #1
-	GET_8BYTE_DATA d4, r3, r1
+    //Get and pack left[-1] ~ left[6] to d4
+    sub       r3,  r2, #1
+    GET_8BYTE_DATA d4, r3, r1
 
-	//Get and pack left[8] ~ left[15] to d3
-	add       r3,  r1
-	GET_8BYTE_DATA d3, r3, r1
+    //Get and pack left[8] ~ left[15] to d3
+    add       r3,  r1
+    GET_8BYTE_DATA d3, r3, r1
 
-	//Save the left[15] to d7 for next step
-	vdup.u8   d7,   d3[7]
+    //Save the left[15] to d7 for next step
+    vdup.u8   d7,   d3[7]
 
-	//revert the sequence of d2,d3
-	vrev64.8   q1, q1
+    //revert the sequence of d2,d3
+    vrev64.8   q1, q1
 
-	vsubl.u8   q2, d3, d4 //q2={left[8]-left[6],left[9]-left[5],left[10]-left[4], ...}
-	vsubl.u8   q1, d2, d1 //q1={top[8]-top[6],top[9]-top[5],top[10]-top[4], ...}
+    vsubl.u8   q2, d3, d4 //q2={left[8]-left[6],left[9]-left[5],left[10]-left[4], ...}
+    vsubl.u8   q1, d2, d1 //q1={top[8]-top[6],top[9]-top[5],top[10]-top[4], ...}
 
 
-	vmovl.u8   q0, d0
-	vmul.s16   q1, q0, q1 //q1 = q1*{(8,7,6,5,4,3,2,1) * 5}
-	vmul.s16   q2, q0, q2 //q2 = q2*{(8,7,6,5,4,3,2,1) * 5}
+    vmovl.u8   q0, d0
+    vmul.s16   q1, q0, q1 //q1 = q1*{(8,7,6,5,4,3,2,1) * 5}
+    vmul.s16   q2, q0, q2 //q2 = q2*{(8,7,6,5,4,3,2,1) * 5}
 
-	//Calculate the sum of items of q1, q2
-	vpadd.s16  d0, d2, d3
-	vpadd.s16  d1, d4, d5
-	vpaddl.s16 q0, q0
-	vpaddl.s32 q0, q0
+    //Calculate the sum of items of q1, q2
+    vpadd.s16  d0, d2, d3
+    vpadd.s16  d1, d4, d5
+    vpaddl.s16 q0, q0
+    vpaddl.s32 q0, q0
 
-	//Get the value of 'b', 'c' and extend to q1, q2.
-	vrshr.s64  q0, #6
-	vdup.s16   q1, d0[0]
-	vdup.s16   q2, d1[0]
+    //Get the value of 'b', 'c' and extend to q1, q2.
+    vrshr.s64  q0, #6
+    vdup.s16   q1, d0[0]
+    vdup.s16   q2, d1[0]
 
-	//Load the table {-7,-6,-5,-4,-3,-2,-1,0} to d0
-	adr r2, CONST1_GET_I16X16_LUMA_PRED_PLANE
-	vld1.32   {d0}, [r2]
+    //Load the table {-7,-6,-5,-4,-3,-2,-1,0} to d0
+    adr r2, CONST1_GET_I16X16_LUMA_PRED_PLANE
+    vld1.32   {d0}, [r2]
 
-	//Get the value of 'a' and save to q3
-	vaddl.u8  q3, d6, d7
-	vshl.u16  q3, #4
+    //Get the value of 'a' and save to q3
+    vaddl.u8  q3, d6, d7
+    vshl.u16  q3, #4
 
-	//calculate a+'b'*{-7,-6,-5,-4,-3,-2,-1,0} + c*{-7}
-	vmovl.s8  q0, d0
-	vmla.s16  q3, q0, q1
-	vmla.s16  q3, q2, d0[0]
+    //calculate a+'b'*{-7,-6,-5,-4,-3,-2,-1,0} + c*{-7}
+    vmovl.s8  q0, d0
+    vmla.s16  q3, q0, q1
+    vmla.s16  q3, q2, d0[0]
 
-	//Calculate a+'b'*{1,2,3,4,5,6,7,8} + c*{-7}
-	vshl.s16  q8, q1, #3
-	vadd.s16  q8, q3
+    //Calculate a+'b'*{1,2,3,4,5,6,7,8} + c*{-7}
+    vshl.s16  q8, q1, #3
+    vadd.s16  q8, q3
 
-	//right shift 5 bits and rounding
-	vqrshrun.s16 d0, q3, #5
-	vqrshrun.s16 d1, q8, #5
+    //right shift 5 bits and rounding
+    vqrshrun.s16 d0, q3, #5
+    vqrshrun.s16 d1, q8, #5
 
-	//Set the line of MB
-	vst1.u32  {d0,d1}, [r0], r1
+    //Set the line of MB
+    vst1.u32  {d0,d1}, [r0], r1
 
 
-	//Do the same processing for setting other lines
-	mov  r2, #15
+    //Do the same processing for setting other lines
+    mov  r2, #15
 loop_0_get_i16x16_luma_pred_plane:
-	vadd.s16  q3, q2
-	vadd.s16  q8, q2
-	vqrshrun.s16 d0, q3, #5
-	vqrshrun.s16 d1, q8, #5
-	vst1.u32  {d0,d1}, [r0], r1
-	subs  r2, #1
-	bne  loop_0_get_i16x16_luma_pred_plane
+    vadd.s16  q3, q2
+    vadd.s16  q8, q2
+    vqrshrun.s16 d0, q3, #5
+    vqrshrun.s16 d1, q8, #5
+    vst1.u32  {d0,d1}, [r0], r1
+    subs  r2, #1
+    bne  loop_0_get_i16x16_luma_pred_plane
 
 WELS_ASM_FUNC_END
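
The routine above is the standard H.264 16x16 plane mode: gradients H and V are weighted differences of the top row and left column, b = (5*H + 32) >> 6, c = (5*V + 32) >> 6, and a = 16 * (top[15] + left[15]); each pixel is then (a + b*(x-7) + c*(y-7) + 16) >> 5, clipped to [0, 255]. A scalar C sketch under those assumptions (names are illustrative):

    #include <stdint.h>

    static uint8_t Clip255(int v) { return v < 0 ? 0 : (v > 255 ? 255 : (uint8_t)v); }

    /* Illustrative sketch of 16x16 plane prediction. */
    static void I16x16PredPlane_c(uint8_t* pred, int32_t stride) {
        const uint8_t* top = pred - stride;        /* top[-1] is the top-left corner */
        int H = 0, V = 0;
        for (int i = 0; i < 8; i++) {
            H += (i + 1) * (top[8 + i] - top[6 - i]);
            V += (i + 1) * (pred[(8 + i) * stride - 1] - pred[(6 - i) * stride - 1]);
        }
        int b = (5 * H + 32) >> 6;
        int c = (5 * V + 32) >> 6;
        int a = 16 * (top[15] + pred[15 * stride - 1]);
        for (int y = 0; y < 16; y++)
            for (int x = 0; x < 16; x++)
                pred[y * stride + x] = Clip255((a + b * (x - 7) + c * (y - 7) + 16) >> 5);
    }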
 
 WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredV_neon
-	//stmdb sp!, { r2-r5, lr}
-	//Load the top row (4 bytes)
-	sub  r2, r0, r1
-	ldr  r2, [r2]
+    //stmdb sp!, { r2-r5, lr}
+    //Load the top row (4 bytes)
+    sub  r2, r0, r1
+    ldr  r2, [r2]
 
-	//Set the luma MB using top line
-	str  r2, [r0], r1
-	str  r2, [r0], r1
-	str  r2, [r0], r1
-	str  r2, [r0]
+    //Set the luma MB using top line
+    str  r2, [r0], r1
+    str  r2, [r0], r1
+    str  r2, [r0], r1
+    str  r2, [r0]
 
 WELS_ASM_FUNC_END
 
@@ -255,97 +255,97 @@
 
 
 WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredH_neon
-	//stmdb sp!, { r2-r5, lr}
-	//Load the left column (4 bytes)
-	sub  r2, r0, #1
-	vld1.8 {d0[]}, [r2], r1
-	vld1.8 {d1[]}, [r2], r1
-	vld1.8 {d2[]}, [r2], r1
-	vld1.8 {d3[]}, [r2]
+    //stmdb sp!, { r2-r5, lr}
+    //Load the left column (4 bytes)
+    sub  r2, r0, #1
+    vld1.8 {d0[]}, [r2], r1
+    vld1.8 {d1[]}, [r2], r1
+    vld1.8 {d2[]}, [r2], r1
+    vld1.8 {d3[]}, [r2]
 
-	//Set the luma MB using the left side byte
-	vst1.32 {d0[0]}, [r0], r1
-	vst1.32 {d1[0]}, [r0], r1
-	vst1.32 {d2[0]}, [r0], r1
-	vst1.32 {d3[0]}, [r0]
+    //Set the luma MB using the left side byte
+    vst1.32 {d0[0]}, [r0], r1
+    vst1.32 {d1[0]}, [r0], r1
+    vst1.32 {d2[0]}, [r0], r1
+    vst1.32 {d3[0]}, [r0]
 
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredDDL_neon
-	//stmdb sp!, { r2-r5, lr}
-	//Load the top row data(8 bytes)
-	sub    r2,  r0, r1
-	vld1.32  {d0}, [r2]
+    //stmdb sp!, { r2-r5, lr}
+    //Load the top row data(8 bytes)
+    sub    r2,  r0, r1
+    vld1.32  {d0}, [r2]
 
-	//For "t7 + (t7<<1)"
-	vdup.8   d1,  d0[7]
+    //For "t7 + (t7<<1)"
+    vdup.8   d1,  d0[7]
 
-	//calculate "t0+t1,t1+t2,t2+t3...t6+t7,t7+t7"
-	vext.8   d1,  d0, d1, #1
-	vaddl.u8 q1,  d1, d0
+    //calculate "t0+t1,t1+t2,t2+t3...t6+t7,t7+t7"
+    vext.8   d1,  d0, d1, #1
+    vaddl.u8 q1,  d1, d0
 
-	//calculate "x,t0+t1+t1+t2,t1+t2+t2+t3,...t5+t6+t6+t7,t6+t7+t7+t7"
-	vext.8   q2,  q1, q1, #14
-	vadd.u16 q0,  q1, q2
+    //calculate "x,t0+t1+t1+t2,t1+t2+t2+t3,...t5+t6+t6+t7,t6+t7+t7+t7"
+    vext.8   q2,  q1, q1, #14
+    vadd.u16 q0,  q1, q2
 
-	//right shift 2 bits and rounding
-	vqrshrn.u16  d0,  q0, #2
+    //right shift 2 bits and rounding
+    vqrshrn.u16  d0,  q0, #2
 
-	//Save "ddl0, ddl1, ddl2, ddl3"
-	vext.8   d1, d0, d0, #1
-	vst1.32  d1[0], [r0], r1
+    //Save "ddl0, ddl1, ddl2, ddl3"
+    vext.8   d1, d0, d0, #1
+    vst1.32  d1[0], [r0], r1
 
-	//Save "ddl1, ddl2, ddl3, ddl4"
-	vext.8   d1, d0, d0, #2
-	vst1.32  d1[0], [r0], r1
+    //Save "ddl1, ddl2, ddl3, ddl4"
+    vext.8   d1, d0, d0, #2
+    vst1.32  d1[0], [r0], r1
 
-	//Save "ddl2, ddl3, ddl4, ddl5"
-	vext.8   d1, d0, d0, #3
-	vst1.32  d1[0], [r0], r1
+    //Save "ddl2, ddl3, ddl4, ddl5"
+    vext.8   d1, d0, d0, #3
+    vst1.32  d1[0], [r0], r1
 
-	//Save "ddl3, ddl4, ddl5, ddl6"
-	vst1.32  d0[1], [r0]
+    //Save "ddl3, ddl4, ddl5, ddl6"
+    vst1.32  d0[1], [r0]
 
 WELS_ASM_FUNC_END
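
Diagonal-down-left, as the comments above trace it, is a (1,2,1)/4 filter over the eight top neighbours, with the rightmost tap clamped to t[7]; the four output rows then reuse overlapping runs of the filtered values. A scalar sketch (illustrative name):

    #include <stdint.h>

    /* Illustrative sketch of 4x4 diagonal-down-left prediction (top row only). */
    static void I4x4PredDDL_c(uint8_t* pred, int32_t stride) {
        const uint8_t* t = pred - stride;          /* t[0..7]: pixels above the block */
        uint8_t ddl[7];
        for (int i = 0; i < 7; i++) {
            int c = (i + 2 < 8) ? t[i + 2] : t[7]; /* t7 repeated, as the vdup above */
            ddl[i] = (uint8_t)((t[i] + 2 * t[i + 1] + c + 2) >> 2);
        }
        for (int y = 0; y < 4; y++)
            for (int x = 0; x < 4; x++)
                pred[y * stride + x] = ddl[x + y];
    }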
 
 
 WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredDDR_neon
-	//stmdb sp!, { r2-r5, lr}
-	//Load the top row (4 bytes)
-	sub    r2,  r0, r1
-	vld1.32  {d0[1]}, [r2]
+    //stmdb sp!, { r2-r5, lr}
+    //Load the top row (4 bytes)
+    sub    r2,  r0, r1
+    vld1.32  {d0[1]}, [r2]
 
-	//Load the left column (5 bytes)
-	sub    r2,  #1
-	vld1.8 {d0[3]}, [r2], r1
-	vld1.8 {d0[2]}, [r2], r1
-	vld1.8 {d0[1]}, [r2], r1
-	vld1.8 {d0[0]}, [r2], r1
-	vld1.8 {d1[7]}, [r2] //For packing the right sequence to do SIMD processing
+    //Load the left column (5 bytes)
+    sub    r2,  #1
+    vld1.8 {d0[3]}, [r2], r1
+    vld1.8 {d0[2]}, [r2], r1
+    vld1.8 {d0[1]}, [r2], r1
+    vld1.8 {d0[0]}, [r2], r1
+    vld1.8 {d1[7]}, [r2] //For packing the right sequence to do SIMD processing
 
 
-	vext.8   d2, d1, d0, #7   //d0:{L2,L1,L0,LT,T0,T1,T2,T3}
-	                          //d2:{L3,L2,L1,L0,LT,T0,T1,T2}
+    vext.8   d2, d1, d0, #7   //d0:{L2,L1,L0,LT,T0,T1,T2,T3}
+                              //d2:{L3,L2,L1,L0,LT,T0,T1,T2}
 
-	//q2:{L2+L3,L1+L2,L0+L1...T1+T2,T2+T3}
-	vaddl.u8 q2, d2, d0
+    //q2:{L2+L3,L1+L2,L0+L1...T1+T2,T2+T3}
+    vaddl.u8 q2, d2, d0
 
-	//q1:{TL0+LT0,LT0+T01,...L12+L23}
-	vext.8   q3, q3, q2, #14
-	vadd.u16 q1, q2, q3
+    //q1:{TL0+LT0,LT0+T01,...L12+L23}
+    vext.8   q3, q3, q2, #14
+    vadd.u16 q1, q2, q3
 
-	//right shift 2 bits and rounding
-	vqrshrn.u16 d0, q1, #2
+    //right shift 2 bits and rounding
+    vqrshrn.u16 d0, q1, #2
 
-	//Adjust the data sequence for setting luma MB of 'pred'
-	vst1.32   d0[1], [r0], r1
-	vext.8    d0, d0, d0, #7
-	vst1.32   d0[1], [r0], r1
-	vext.8    d0, d0, d0, #7
-	vst1.32   d0[1], [r0], r1
-	vext.8    d0, d0, d0, #7
-	vst1.32   d0[1], [r0]
+    //Adjust the data sequence for setting luma MB of 'pred'
+    vst1.32   d0[1], [r0], r1
+    vext.8    d0, d0, d0, #7
+    vst1.32   d0[1], [r0], r1
+    vext.8    d0, d0, d0, #7
+    vst1.32   d0[1], [r0], r1
+    vext.8    d0, d0, d0, #7
+    vst1.32   d0[1], [r0]
 
 WELS_ASM_FUNC_END
 
@@ -352,31 +352,31 @@
 
 
 WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredVL_neon
-	//stmdb sp!, { r2-r5, lr}
-	//Load the top row (8 bytes)
-	sub    r2,  r0, r1
-	vld1.32  {d0}, [r2]
+    //stmdb sp!, { r2-r5, lr}
+    //Load the top row (8 bytes)
+    sub    r2,  r0, r1
+    vld1.32  {d0}, [r2]
 
 
-	vext.8   d1,  d0, d0, #1
-	vaddl.u8 q1,  d1, d0     //q1:{t0+t1,t1+t2,t2+t3...t5+t6,x,x}
+    vext.8   d1,  d0, d0, #1
+    vaddl.u8 q1,  d1, d0     //q1:{t0+t1,t1+t2,t2+t3...t5+t6,x,x}
 
-	vext.8   q2,  q1, q1, #2
-	vadd.u16 q2,  q1, q2     //q2:{t0+t1+t1+t2,t1+t2+t2+t3,...t4+t5+t5+t6,x,x}
+    vext.8   q2,  q1, q1, #2
+    vadd.u16 q2,  q1, q2     //q2:{t0+t1+t1+t2,t1+t2+t2+t3,...t4+t5+t5+t6,x,x}
 
-	//calculate the "vl0,vl1,vl2,vl3,vl4"
-	vqrshrn.u16  d0,  q1, #1
+    //calculate the "vl0,vl1,vl2,vl3,vl4"
+    vqrshrn.u16  d0,  q1, #1
 
-	//calculate the "vl5,vl6,vl7,vl8,vl9"
-	vqrshrn.u16  d1,  q2, #2
+    //calculate the "vl5,vl6,vl7,vl8,vl9"
+    vqrshrn.u16  d1,  q2, #2
 
-	//Adjust the data sequence for setting the luma MB
-	vst1.32  d0[0], [r0], r1
-	vst1.32  d1[0], [r0], r1
-	vext.8   d0,  d0, d0, #1
-	vext.8   d1,  d1, d1, #1
-	vst1.32  d0[0], [r0], r1
-	vst1.32  d1[0], [r0]
+    //Adjust the data sequence for setting the luma MB
+    vst1.32  d0[0], [r0], r1
+    vst1.32  d1[0], [r0], r1
+    vext.8   d0,  d0, d0, #1
+    vext.8   d1,  d1, d1, #1
+    vst1.32  d0[0], [r0], r1
+    vst1.32  d1[0], [r0]
 
 WELS_ASM_FUNC_END
 
@@ -383,152 +383,152 @@
 
 
 WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredVR_neon
-	//stmdb sp!, { r2-r5, lr}
-	//Load the top row (4 bytes)
-	sub       r2,  r0, r1
-	vld1.32   {d0[1]}, [r2]
+    //stmdb sp!, { r2-r5, lr}
+    //Load the top row (4 bytes)
+    sub       r2,  r0, r1
+    vld1.32   {d0[1]}, [r2]
 
-	//Load the left column (4 bytes)
-	sub       r2,  #1
-	vld1.8    {d0[3]}, [r2], r1
-	vld1.8    {d0[2]}, [r2], r1
-	vld1.8    {d0[1]}, [r2], r1
-	vld1.8    {d0[0]}, [r2]
+    //Load the left column (4 bytes)
+    sub       r2,  #1
+    vld1.8    {d0[3]}, [r2], r1
+    vld1.8    {d0[2]}, [r2], r1
+    vld1.8    {d0[1]}, [r2], r1
+    vld1.8    {d0[0]}, [r2]
 
 
-	vext.8    d1, d0, d0, #7
-	vaddl.u8  q1, d0, d1      //q1:{X,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2,T2+T3}
+    vext.8    d1, d0, d0, #7
+    vaddl.u8  q1, d0, d1      //q1:{X,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2,T2+T3}
 
-	vext.u8   q2, q1, q1, #14
-	vadd.u16  q2, q2, q1      //q2:{X,L2+L1+L1+L0,L1+L0+L0+LT,...T1+T2+T2+T3}
+    vext.u8   q2, q1, q1, #14
+    vadd.u16  q2, q2, q1      //q2:{X,L2+L1+L1+L0,L1+L0+L0+LT,...T1+T2+T2+T3}
 
-	//Calculate the vr0 ~ vr9
-	vqrshrn.u16 d1, q2, #2
-	vqrshrn.u16 d0, q1, #1
+    //Calculate the vr0 ~ vr9
+    vqrshrn.u16 d1, q2, #2
+    vqrshrn.u16 d0, q1, #1
 
-	//Adjust the data sequence for setting the luma MB
-	vst1.32  d0[1], [r0], r1
-	vst1.32  d1[1], [r0], r1
-	add    r2, r0, r1
-	vst1.8   d1[3], [r0]!
-	vst1.16  d0[2], [r0]!
-	vst1.8   d0[6], [r0]!
-	vst1.8   d1[2], [r2]!
-	vst1.16  d1[2], [r2]!
-	vst1.8   d1[6], [r2]
+    //Adjust the data sequence for setting the luma MB
+    vst1.32  d0[1], [r0], r1
+    vst1.32  d1[1], [r0], r1
+    add    r2, r0, r1
+    vst1.8   d1[3], [r0]!
+    vst1.16  d0[2], [r0]!
+    vst1.8   d0[6], [r0]!
+    vst1.8   d1[2], [r2]!
+    vst1.16  d1[2], [r2]!
+    vst1.8   d1[6], [r2]
 WELS_ASM_FUNC_END
 
 
 
 WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredHU_neon
-	//stmdb sp!, { r2-r5, lr}
-	//Load the left column data
-	sub       r2,  r0, #1
-	mov       r3,  #3
-	mul       r3,  r1
-	add       r3,  r2
-	vld1.8    {d0[]},  [r3]
-	vld1.8    {d0[4]}, [r2], r1
-	vld1.8    {d0[5]}, [r2], r1
-	vld1.8    {d0[6]}, [r2], r1 //d0:{L3,L3,L3,L3,L0,L1,L2,L3}
+    //stmdb sp!, { r2-r5, lr}
+    //Load the left column data
+    sub       r2,  r0, #1
+    mov       r3,  #3
+    mul       r3,  r1
+    add       r3,  r2
+    vld1.8    {d0[]},  [r3]
+    vld1.8    {d0[4]}, [r2], r1
+    vld1.8    {d0[5]}, [r2], r1
+    vld1.8    {d0[6]}, [r2], r1 //d0:{L3,L3,L3,L3,L0,L1,L2,L3}
 
-	vext.8    d1, d0, d0, #1
-	vaddl.u8  q2, d0, d1        //q2:{L3+L3,L3+L3,L3+L3,L3+L0,L0+L1,L1+L2,L2+L3,L3+L3}
+    vext.8    d1, d0, d0, #1
+    vaddl.u8  q2, d0, d1        //q2:{L3+L3,L3+L3,L3+L3,L3+L0,L0+L1,L1+L2,L2+L3,L3+L3}
 
-	vext.u8   d2, d5, d4, #2
-	vadd.u16  d3, d2, d5        //d3:{L0+L1+L1+L2,L1+L2+L2+L3,L2+L3+L3+L3,L3+L3+L3+L3}
+    vext.u8   d2, d5, d4, #2
+    vadd.u16  d3, d2, d5        //d3:{L0+L1+L1+L2,L1+L2+L2+L3,L2+L3+L3+L3,L3+L3+L3+L3}
 
-	//Calculate the hu0 ~ hu5
-	vqrshrn.u16 d2, q2, #1
-	vqrshrn.u16 d1, q1, #2
+    //Calculate the hu0 ~ hu5
+    vqrshrn.u16 d2, q2, #1
+    vqrshrn.u16 d1, q1, #2
 
-	//Adjust the data sequence for setting the luma MB
-	vzip.8   d2, d1
-	vst1.32  d1[0], [r0], r1
-	vext.8   d2, d1, d1, #2
-	vst1.32  d2[0], [r0], r1
-	vst1.32  d1[1], [r0], r1
-	vst1.32  d0[0], [r0]
+    //Adjust the data sequence for setting the luma MB
+    vzip.8   d2, d1
+    vst1.32  d1[0], [r0], r1
+    vext.8   d2, d1, d1, #2
+    vst1.32  d2[0], [r0], r1
+    vst1.32  d1[1], [r0], r1
+    vst1.32  d0[0], [r0]
 
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredHD_neon
-	//stmdb sp!, { r2-r5, lr}
-	//Load the data
-	sub       r2,  r0, r1
-	sub       r2,  #1
-	vld1.32   {d0[1]}, [r2], r1
-	vld1.8    {d0[3]}, [r2], r1
-	vld1.8    {d0[2]}, [r2], r1
-	vld1.8    {d0[1]}, [r2], r1
-	vld1.8    {d0[0]}, [r2]	    //d0:{L3,L2,L1,L0,LT,T0,T1,T2}
+    //stmdb sp!, { r2-r5, lr}
+    //Load the data
+    sub       r2,  r0, r1
+    sub       r2,  #1
+    vld1.32   {d0[1]}, [r2], r1
+    vld1.8    {d0[3]}, [r2], r1
+    vld1.8    {d0[2]}, [r2], r1
+    vld1.8    {d0[1]}, [r2], r1
+    vld1.8    {d0[0]}, [r2]     //d0:{L3,L2,L1,L0,LT,T0,T1,T2}
 
 
-	vext.8    d1, d0, d0, #7
-	vaddl.u8  q1, d0, d1        //q1:{x,L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2}
+    vext.8    d1, d0, d0, #7
+    vaddl.u8  q1, d0, d1        //q1:{x,L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2}
 
-	vext.u8   q2, q1, q1, #14   //q2:{x,x, L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1}
-	vadd.u16  q3, q2, q1        //q3:{x,x,L3+L2+L2+L1,L2+L1+L1+L0,L1+L0+L0+LT,L0+LT+LT+T0,LT+T0+T0+T1,T0+T1+T1+T2}
+    vext.u8   q2, q1, q1, #14   //q2:{x,x, L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1}
+    vadd.u16  q3, q2, q1        //q3:{x,x,L3+L2+L2+L1,L2+L1+L1+L0,L1+L0+L0+LT,L0+LT+LT+T0,LT+T0+T0+T1,T0+T1+T1+T2}
 
-	//Calculate the hd0~hd9
-	vqrshrn.u16 d1, q3, #2
-	vqrshrn.u16 d0, q2, #1
+    //Calculate the hd0~hd9
+    vqrshrn.u16 d1, q3, #2
+    vqrshrn.u16 d0, q2, #1
 
-	//Adjust the data sequence for setting the luma MB
-	vmov      d3, d1
-	vtrn.8    d0, d1
-	vext.u8   d2, d1, d1, #6
-	vst2.16  {d2[3], d3[3]}, [r0], r1
-	vst2.16  {d0[2], d1[2]}, [r0], r1
-	vmov     d3, d0
-	vst2.16  {d2[2], d3[2]}, [r0], r1
-	vst2.16  {d0[1], d1[1]}, [r0]
+    //Adjust the data sequence for setting the luma MB
+    vmov      d3, d1
+    vtrn.8    d0, d1
+    vext.u8   d2, d1, d1, #6
+    vst2.16  {d2[3], d3[3]}, [r0], r1
+    vst2.16  {d0[2], d1[2]}, [r0], r1
+    vmov     d3, d0
+    vst2.16  {d2[2], d3[2]}, [r0], r1
+    vst2.16  {d0[1], d1[1]}, [r0]
 
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredV_neon
-	//stmdb sp!, { r2-r5, lr}
-	//Get the top row (8 byte)
-	sub  r2, r0, r1
-	vldr d0, [r2]
+    //stmdb sp!, { r2-r5, lr}
+    //Get the top row (8 byte)
+    sub  r2, r0, r1
+    vldr d0, [r2]
 
-	//Set the chroma MB using top row data
-	vst1.8 {d0}, [r0], r1
-	vst1.8 {d0}, [r0], r1
-	vst1.8 {d0}, [r0], r1
-	vst1.8 {d0}, [r0], r1
-	vst1.8 {d0}, [r0], r1
-	vst1.8 {d0}, [r0], r1
-	vst1.8 {d0}, [r0], r1
-	vst1.8 {d0}, [r0]
+    //Set the chroma MB using top row data
+    vst1.8 {d0}, [r0], r1
+    vst1.8 {d0}, [r0], r1
+    vst1.8 {d0}, [r0], r1
+    vst1.8 {d0}, [r0], r1
+    vst1.8 {d0}, [r0], r1
+    vst1.8 {d0}, [r0], r1
+    vst1.8 {d0}, [r0], r1
+    vst1.8 {d0}, [r0]
 
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredH_neon
-	//stmdb sp!, { r2-r5, lr}
-	////Get the left column (8 byte)
-	sub  r2, r0, #1
-	vld1.8 {d0[]}, [r2], r1
-	vld1.8 {d1[]}, [r2], r1
-	vld1.8 {d2[]}, [r2], r1
-	vld1.8 {d3[]}, [r2], r1
-	vld1.8 {d4[]}, [r2], r1
-	vld1.8 {d5[]}, [r2], r1
-	vld1.8 {d6[]}, [r2], r1
-	vld1.8 {d7[]}, [r2]
+    //stmdb sp!, { r2-r5, lr}
+    //Get the left column (8 bytes)
+    sub  r2, r0, #1
+    vld1.8 {d0[]}, [r2], r1
+    vld1.8 {d1[]}, [r2], r1
+    vld1.8 {d2[]}, [r2], r1
+    vld1.8 {d3[]}, [r2], r1
+    vld1.8 {d4[]}, [r2], r1
+    vld1.8 {d5[]}, [r2], r1
+    vld1.8 {d6[]}, [r2], r1
+    vld1.8 {d7[]}, [r2]
 
-	//Set the chroma MB using left column data
-	vst1.8 {d0}, [r0], r1
-	vst1.8 {d1}, [r0], r1
-	vst1.8 {d2}, [r0], r1
-	vst1.8 {d3}, [r0], r1
-	vst1.8 {d4}, [r0], r1
-	vst1.8 {d5}, [r0], r1
-	vst1.8 {d6}, [r0], r1
-	vst1.8 {d7}, [r0]
+    //Set the chroma MB using left column data
+    vst1.8 {d0}, [r0], r1
+    vst1.8 {d1}, [r0], r1
+    vst1.8 {d2}, [r0], r1
+    vst1.8 {d3}, [r0], r1
+    vst1.8 {d4}, [r0], r1
+    vst1.8 {d5}, [r0], r1
+    vst1.8 {d6}, [r0], r1
+    vst1.8 {d7}, [r0]
 
 WELS_ASM_FUNC_END
 
@@ -576,73 +576,73 @@
 CONST1_GET_I_CHROMA_PRED_PLANE: .long 0xfffefffd, 0x0000ffff,0x00020001,0x00040003
 
 WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredPlane_neon
-	//stmdb sp!, { r2-r5, lr}
-	//Load the top row data
-	sub  r2, r0, #1
-	sub  r2, r1
-	vld1.32 {d1[0]}, [r2]
-	add  r2, #5
-	vld1.32 {d0[0]}, [r2]
+    //stmdb sp!, { r2-r5, lr}
+    //Load the top row data
+    sub  r2, r0, #1
+    sub  r2, r1
+    vld1.32 {d1[0]}, [r2]
+    add  r2, #5
+    vld1.32 {d0[0]}, [r2]
 
-	//Load the left column data
-	sub  r2, #5
-	vld1.8 {d1[4]}, [r2], r1
-	vld1.8 {d1[5]}, [r2], r1
-	vld1.8 {d1[6]}, [r2], r1
-	vld1.8 {d1[7]}, [r2], r1 //d1:{LT,T0,T1,T2,LT,L0,L1,L2}
-	add  r2, r1
-	vld1.8 {d0[4]}, [r2], r1
-	vld1.8 {d0[5]}, [r2], r1
-	vld1.8 {d0[6]}, [r2], r1
-	vld1.8 {d0[7]}, [r2]     //d0:{T4,T5,T6,T7,L4,L5,L6.L7}
+    //Load the left column data
+    sub  r2, #5
+    vld1.8 {d1[4]}, [r2], r1
+    vld1.8 {d1[5]}, [r2], r1
+    vld1.8 {d1[6]}, [r2], r1
+    vld1.8 {d1[7]}, [r2], r1 //d1:{LT,T0,T1,T2,LT,L0,L1,L2}
+    add  r2, r1
+    vld1.8 {d0[4]}, [r2], r1
+    vld1.8 {d0[5]}, [r2], r1
+    vld1.8 {d0[6]}, [r2], r1
+    vld1.8 {d0[7]}, [r2]     //d0:{T4,T5,T6,T7,L4,L5,L6,L7}
 
 
-	//Save T7 to d3 for next step
-	vdup.u8   d3,   d0[3]
-	//Save L7 to d4 for next step
-	vdup.u8   d4,   d0[7]
+    //Save T7 to d3 for next step
+    vdup.u8   d3,   d0[3]
+    //Save L7 to d4 for next step
+    vdup.u8   d4,   d0[7]
 
-	//Calculate the value of 'a' and save to q2
-	vaddl.u8  q2, d3, d4
-	vshl.u16  q2, #4
+    //Calculate the value of 'a' and save to q2
+    vaddl.u8  q2, d3, d4
+    vshl.u16  q2, #4
 
-	//Load the table {{1,2,3,4,1,2,3,4}*17}
-	adr r2, CONST0_GET_I_CHROMA_PRED_PLANE
-	vld1.32   {d2}, [r2]
+    //Load the table {{1,2,3,4,1,2,3,4}*17}
+    adr r2, CONST0_GET_I_CHROMA_PRED_PLANE
+    vld1.32   {d2}, [r2]
 
-	//Calculate the 'b','c', and save to q0
-	vrev32.8  d1, d1
-	vsubl.u8  q0, d0, d1
-	vmovl.u8   q1, d2
-	vmul.s16   q0, q1
-	vpaddl.s16 q0, q0
-	vpaddl.s32 q0, q0
-	vrshr.s64  q0, #5
+    //Calculate the 'b','c', and save to q0
+    vrev32.8  d1, d1
+    vsubl.u8  q0, d0, d1
+    vmovl.u8   q1, d2
+    vmul.s16   q0, q1
+    vpaddl.s16 q0, q0
+    vpaddl.s32 q0, q0
+    vrshr.s64  q0, #5
 
-	//Load the table {-3,-2,-1,0,1,2,3,4} to q3
-	adr r2, CONST1_GET_I_CHROMA_PRED_PLANE
-	vld1.32   {d6, d7}, [r2]
+    //Load the table {-3,-2,-1,0,1,2,3,4} to q3
+    adr r2, CONST1_GET_I_CHROMA_PRED_PLANE
+    vld1.32   {d6, d7}, [r2]
 
-	//Duplicate the 'b','c' to q0, q1 for SIMD instruction
-	vdup.s16   q1, d1[0]
-	vdup.s16   q0, d0[0]
+    //Duplicate the 'b','c' to q0, q1 for SIMD instruction
+    vdup.s16   q1, d1[0]
+    vdup.s16   q0, d0[0]
 
-	//Calculate the "(a + b * (j - 3) + c * (- 3) + 16) >> 5;"
-	vmla.s16   q2, q0, q3
-	vmla.s16   q2, q1, d6[0]
-	vqrshrun.s16 d0, q2, #5
+    //Calculate the "(a + b * (j - 3) + c * (- 3) + 16) >> 5;"
+    vmla.s16   q2, q0, q3
+    vmla.s16   q2, q1, d6[0]
+    vqrshrun.s16 d0, q2, #5
 
-	//Set a line of chroma MB
-	vst1.u32  {d0}, [r0], r1
+    //Set a line of chroma MB
+    vst1.u32  {d0}, [r0], r1
 
-	//Do the same processing for each line.
-	mov  r2, #7
+    //Do the same processing for each line.
+    mov  r2, #7
 loop_0_get_i_chroma_pred_plane:
-	vadd.s16   q2, q1
-	vqrshrun.s16 d0, q2, #5
-	vst1.u32  {d0}, [r0], r1
-	subs  r2, #1
-	bne  loop_0_get_i_chroma_pred_plane
+    vadd.s16   q2, q1
+    vqrshrun.s16 d0, q2, #5
+    vst1.u32  {d0}, [r0], r1
+    subs  r2, #1
+    bne  loop_0_get_i_chroma_pred_plane
 
 WELS_ASM_FUNC_END
 
--- a/codec/decoder/core/x86/dct.asm
+++ b/codec/decoder/core/x86/dct.asm
@@ -54,7 +54,7 @@
 %endmacro
 
 %macro MMX_SumSub 3
-	movq    %3, %2
+    movq    %3, %2
     psubw   %2, %1
     paddw   %1, %3
 %endmacro
@@ -62,8 +62,8 @@
 %macro MMX_IDCT 6
     MMX_SumSub      %4, %5, %6
     MMX_SumSubDiv2  %3, %2, %1
-    MMX_SumSub		%1, %4, %6
-	MMX_SumSub		%3, %5, %6
+    MMX_SumSub      %1, %4, %6
+    MMX_SumSub      %3, %5, %6
 %endmacro
 
 
@@ -96,13 +96,13 @@
     movq    mm2, [r2+16]
     movq    mm3, [r2+24]
 
-	MMX_Trans4x4W        mm0, mm1, mm2, mm3, mm4
-	MMX_IDCT			mm1, mm2, mm3, mm4, mm0, mm6
+    MMX_Trans4x4W        mm0, mm1, mm2, mm3, mm4
+    MMX_IDCT            mm1, mm2, mm3, mm4, mm0, mm6
     MMX_Trans4x4W        mm1, mm3, mm0, mm4, mm2
-	MMX_IDCT			mm3, mm0, mm4, mm2, mm1, mm6
+    MMX_IDCT            mm3, mm0, mm4, mm2, mm1, mm6
 
-    WELS_Zero			mm7
-    WELS_DW32			mm6
+    WELS_Zero           mm7
+    WELS_DW32           mm6
 
     MMX_StoreDiff4P    mm3, mm0, mm6, mm7, [r0]
     MMX_StoreDiff4P    mm4, mm0, mm6, mm7, [r0+r1]
@@ -111,5 +111,5 @@
     MMX_StoreDiff4P    mm2, mm0, mm6, mm7, [r0+r1]
 
 
-	emms
+    emms
     ret
--- a/codec/decoder/core/x86/intra_pred.asm
+++ b/codec/decoder/core/x86/intra_pred.asm
@@ -36,10 +36,10 @@
 ;*
 ;*  History
 ;*      18/09/2009 Created
-;*		19/11/2010 Added
-;*					WelsDecoderI16x16LumaPredDcTop_sse2, WelsDecoderI16x16LumaPredDcNA_sse2,
-;*					WelsDecoderIChromaPredDcLeft_mmx, WelsDecoderIChromaPredDcTop_sse2
-;*					and WelsDecoderIChromaPredDcNA_mmx
+;*      19/11/2010 Added
+;*                  WelsDecoderI16x16LumaPredDcTop_sse2, WelsDecoderI16x16LumaPredDcNA_sse2,
+;*                  WelsDecoderIChromaPredDcLeft_mmx, WelsDecoderIChromaPredDcTop_sse2
+;*                  and WelsDecoderIChromaPredDcNA_mmx
 ;*
 ;*
 ;*************************************************************************/
@@ -65,7 +65,7 @@
 sse2_plane_mul_b_c dw -3, -2, -1, 0, 1, 2, 3, 4
 
 align 16
-mmx_01bytes:		times 16	db 1
+mmx_01bytes:        times 16    db 1
 
 align 16
 mmx_0x02: dw 0x02, 0x00, 0x00, 0x00
@@ -81,86 +81,86 @@
 ;xmm0, xmm1, xmm2, eax, ecx
 ;lower 64 bits of xmm0 save the result
 %macro SSE2_PRED_H_4X4_TWO_LINE 5
-    movd		%1,	[%4-1]
-	movdqa		%3,	%1
-	punpcklbw	%1,	%3
-	movdqa		%3,	%1
-	punpcklbw	%1,	%3
+    movd        %1, [%4-1]
+    movdqa      %3, %1
+    punpcklbw   %1, %3
+    movdqa      %3, %1
+    punpcklbw   %1, %3
 
-	;add			%4,	%5
-	movd		%2,	[%4+%5-1]
-	movdqa		%3,	%2
-	punpcklbw	%2,	%3
-	movdqa		%3,	%2
-	punpcklbw	%2,	%3
-	punpckldq	%1,	%2
+    ;add            %4, %5
+    movd        %2, [%4+%5-1]
+    movdqa      %3, %2
+    punpcklbw   %2, %3
+    movdqa      %3, %2
+    punpcklbw   %2, %3
+    punpckldq   %1, %2
 %endmacro
 
 
 %macro LOAD_COLUMN 6
-		movd	%1,	[%5]
-		movd	%2,	[%5+%6]
-		punpcklbw %1,	%2
-		lea		%5,	[%5+2*%6]
-		movd	%3,	[%5]
-		movd	%2,	[%5+%6]
-		punpcklbw %3,	%2
-		punpcklwd %1,	%3
-		lea		%5,	[%5+2*%6]
-		movd	%4,	[%5]
-		movd	%2,	[%5+%6]
-		punpcklbw %4,	%2
-		lea		%5,	[%5+2*%6]
-		movd	%3,	[%5]
-		movd	%2,	[%5+%6]
-		lea		%5,	[%5+2*%6]
-		punpcklbw %3,	%2
-		punpcklwd %4,	%3
-		punpckhdq %1,	%4
+    movd    %1, [%5]
+    movd    %2, [%5+%6]
+    punpcklbw %1,   %2
+    lea     %5, [%5+2*%6]
+    movd    %3, [%5]
+    movd    %2, [%5+%6]
+    punpcklbw %3,   %2
+    punpcklwd %1,   %3
+    lea     %5, [%5+2*%6]
+    movd    %4, [%5]
+    movd    %2, [%5+%6]
+    punpcklbw %4,   %2
+    lea     %5, [%5+2*%6]
+    movd    %3, [%5]
+    movd    %2, [%5+%6]
+    lea     %5, [%5+2*%6]
+    punpcklbw %3,   %2
+    punpcklwd %4,   %3
+    punpckhdq %1,   %4
 %endmacro
 
 %macro SUMW_HORIZON 3
-	movhlps		%2, %1			; x2 = xx xx xx xx d7 d6 d5 d4
-	paddw		%1, %2			; x1 = xx xx xx xx d37 d26 d15 d04
-	punpcklwd	%1, %3			; x1 =  d37  d26 d15 d04
-	movhlps		%2, %1			; x2 = xxxx xxxx d37 d26
-	paddd		%1, %2			; x1 = xxxx xxxx d1357 d0246
-	pshuflw		%2, %1, 0x4e	; x2 = xxxx xxxx d0246 d1357
-	paddd		%1, %2			; x1 = xxxx xxxx xxxx  d01234567
+    movhlps     %2, %1          ; x2 = xx xx xx xx d7 d6 d5 d4
+    paddw       %1, %2          ; x1 = xx xx xx xx d37 d26 d15 d04
+    punpcklwd   %1, %3          ; x1 =  d37  d26 d15 d04
+    movhlps     %2, %1          ; x2 = xxxx xxxx d37 d26
+    paddd       %1, %2          ; x1 = xxxx xxxx d1357 d0246
+    pshuflw     %2, %1, 0x4e    ; x2 = xxxx xxxx d0246 d1357
+    paddd       %1, %2          ; x1 = xxxx xxxx xxxx  d01234567
 %endmacro
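
SUMW_HORIZON is a horizontal add: the movhlps/paddw/punpcklwd/pshuflw shuffles fold the eight signed 16-bit lanes of %1 so that their total lands in the low lanes of %1, which the callers read back with movd/movsx. Roughly, the scalar equivalent is just (illustrative name):

    #include <stdint.h>

    /* Illustrative sketch: sum of eight signed 16-bit lanes. */
    static int32_t SumwHorizon_c(const int16_t v[8]) {
        int32_t sum = 0;
        for (int i = 0; i < 8; i++)
            sum += v[i];
        return sum;
    }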
 
 %macro COPY_16_TIMES 2
-		movdqa		%2,	[%1-16]
-		psrldq		%2,	15
-		pmuludq		%2,	[mmx_01bytes]
-		pshufd		%2,	%2, 0
+    movdqa      %2, [%1-16]
+    psrldq      %2, 15
+    pmuludq     %2, [mmx_01bytes]
+    pshufd      %2, %2, 0
 %endmacro
 
 %macro COPY_16_TIMESS 3
-		movdqa		%2,	[%1+%3-16]
-		psrldq		%2,	15
-		pmuludq		%2,	[mmx_01bytes]
-		pshufd		%2,	%2, 0
+    movdqa      %2, [%1+%3-16]
+    psrldq      %2, 15
+    pmuludq     %2, [mmx_01bytes]
+    pshufd      %2, %2, 0
 %endmacro
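
COPY_16_TIMES and COPY_16_TIMESS use a small broadcast trick: psrldq by 15 isolates the pixel just left of the row (loaded as the top byte of [%1-16]), pmuludq with mmx_01bytes multiplies it by 0x01010101 so it fills a dword, and pshufd 0 splats that dword across all 16 bytes. A scalar sketch of the same effect (illustrative name):

    #include <stdint.h>
    #include <string.h>

    /* Illustrative sketch: replicate the pixel at row[-1] across row[0..15]. */
    static void Copy16Times_c(uint8_t* row) {
        uint32_t splat = (uint32_t)row[-1] * 0x01010101u;   /* byte in every lane */
        for (int i = 0; i < 4; i++)
            memcpy(row + 4 * i, &splat, 4);                 /* 4 dwords = 16 bytes */
    }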
 
 %macro LOAD_COLUMN_C 6
-		movd	%1,	[%5]
-		movd	%2,	[%5+%6]
-		punpcklbw %1,%2
-		lea		%5,	[%5+2*%6]
-		movd	%3,	[%5]
-		movd	%2,	[%5+%6]
-		punpcklbw %3,	%2
-		punpckhwd %1,	%3
-		lea		%5,	[%5+2*%6]
+    movd    %1, [%5]
+    movd    %2, [%5+%6]
+    punpcklbw %1,%2
+    lea     %5, [%5+2*%6]
+    movd    %3, [%5]
+    movd    %2, [%5+%6]
+    punpcklbw %3,   %2
+    punpckhwd %1,   %3
+    lea     %5, [%5+2*%6]
 %endmacro
 
 %macro LOAD_2_LEFT_AND_ADD 0
-        lea         r0, [r0+2*r1]
-        movzx		r3, byte [r0-0x01]
-        add			r2, r3
-        movzx		r3, byte [r0+r1-0x01]
-        add			r2, r3
+    lea         r0, [r0+2*r1]
+    movzx       r3, byte [r0-0x01]
+    add         r2, r3
+    movzx       r3, byte [r0+r1-0x01]
+    add         r2, r3
 %endmacro
 
 ;*******************************************************************************
@@ -173,131 +173,131 @@
 ;*******************************************************************************
 ;   void WelsDecoderI4x4LumaPredH_sse2(uint8_t *pPred, const int32_t kiStride)
 ;
-;	pPred must align to 16
+;   pPred must align to 16
 ;*******************************************************************************
 WELS_EXTERN WelsDecoderI4x4LumaPredH_sse2
-	%assign push_num 0
-	LOAD_2_PARA
-	SIGN_EXTENSION r1, r1d
+    %assign push_num 0
+    LOAD_2_PARA
+    SIGN_EXTENSION r1, r1d
 
-	movzx		r2,	byte [r0-1]
-	movd		xmm0,	r2d
-	pmuludq		xmm0,	[mmx_01bytes]
+    movzx       r2, byte [r0-1]
+    movd        xmm0,   r2d
+    pmuludq     xmm0,   [mmx_01bytes]
 
-	movzx		r2,	byte [r0+r1-1]
-	movd		xmm1,	r2d
-	pmuludq		xmm1,	[mmx_01bytes]
+    movzx       r2, byte [r0+r1-1]
+    movd        xmm1,   r2d
+    pmuludq     xmm1,   [mmx_01bytes]
 
-	lea			r0,	[r0+r1]
-	movzx		r2,	byte [r0+r1-1]
-	movd		xmm2,	r2d
-	pmuludq		xmm2,	[mmx_01bytes]
+    lea         r0, [r0+r1]
+    movzx       r2, byte [r0+r1-1]
+    movd        xmm2,   r2d
+    pmuludq     xmm2,   [mmx_01bytes]
 
-	movzx		r2,	byte [r0+2*r1-1]
-	movd		xmm3,	r2d
-	pmuludq		xmm3,	[mmx_01bytes]
+    movzx       r2, byte [r0+2*r1-1]
+    movd        xmm3,   r2d
+    pmuludq     xmm3,   [mmx_01bytes]
 
-	sub         r0,    r1
-	movd        [r0], xmm0
-	movd        [r0+r1], xmm1
-	lea         r0, [r0+2*r1]
-	movd        [r0], xmm2
-	movd        [r0+r1], xmm3
+    sub         r0,    r1
+    movd        [r0], xmm0
+    movd        [r0+r1], xmm1
+    lea         r0, [r0+2*r1]
+    movd        [r0], xmm2
+    movd        [r0+r1], xmm3
 
-	ret
+    ret
 
 ;*******************************************************************************
 ; void WelsDecoderI16x16LumaPredPlane_sse2(uint8_t *pPred, const int32_t kiStride);
 ;*******************************************************************************
 WELS_EXTERN WelsDecoderI16x16LumaPredPlane_sse2
-		push r3
-		push r4
-		%assign push_num 2
-		LOAD_2_PARA
-		PUSH_XMM 8
-		SIGN_EXTENSION r1, r1d
-		mov r4, r0 ; save r0 in r4
-		sub		r0,	1
-		sub		r0,	r1
+    push r3
+    push r4
+    %assign push_num 2
+    LOAD_2_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r1, r1d
+    mov r4, r0 ; save r0 in r4
+    sub     r0, 1
+    sub     r0, r1
 
-		;for H
-		pxor	xmm7,	xmm7
-		movq	xmm0,	[r0]
-		movdqa	xmm5,	[sse2_plane_dec]
-		punpcklbw xmm0,	xmm7
-		pmullw	xmm0,	xmm5
-		movq	xmm1,	[r0 + 9]
-		movdqa	xmm6,	[sse2_plane_inc]
-		punpcklbw xmm1,	xmm7
-		pmullw	xmm1,	xmm6
-		psubw	xmm1,	xmm0
+    ;for H
+    pxor    xmm7,   xmm7
+    movq    xmm0,   [r0]
+    movdqa  xmm5,   [sse2_plane_dec]
+    punpcklbw xmm0, xmm7
+    pmullw  xmm0,   xmm5
+    movq    xmm1,   [r0 + 9]
+    movdqa  xmm6,   [sse2_plane_inc]
+    punpcklbw xmm1, xmm7
+    pmullw  xmm1,   xmm6
+    psubw   xmm1,   xmm0
 
-		SUMW_HORIZON	xmm1,xmm0,xmm2
-		movd    r2d,	xmm1		; H += (i + 1) * (top[8 + i] - top[6 - i]);
-		movsx	r2,	r2w
-		imul	r2,	5
-		add		r2,	32
-		sar		r2,	6			; b = (5 * H + 32) >> 6;
-		SSE2_Copy8Times	xmm1, r2d	; xmm1 = b,b,b,b,b,b,b,b
+    SUMW_HORIZON    xmm1,xmm0,xmm2
+    movd    r2d,    xmm1        ; H += (i + 1) * (top[8 + i] - top[6 - i]);
+    movsx   r2, r2w
+    imul    r2, 5
+    add     r2, 32
+    sar     r2, 6           ; b = (5 * H + 32) >> 6;
+    SSE2_Copy8Times xmm1, r2d   ; xmm1 = b,b,b,b,b,b,b,b
 
-		movzx	r3,	BYTE [r0+16]
-		sub	r0, 3
-		LOAD_COLUMN		xmm0, xmm2, xmm3, xmm4, r0, r1
+    movzx   r3, BYTE [r0+16]
+    sub r0, 3
+    LOAD_COLUMN     xmm0, xmm2, xmm3, xmm4, r0, r1
 
-		add		r0,	3
-		movzx	r2,	BYTE [r0+8*r1]
-		add		r3,	r2
-		shl		r3,	4			;	a = (left[15*kiStride] + top[15]) << 4;
+    add     r0, 3
+    movzx   r2, BYTE [r0+8*r1]
+    add     r3, r2
+    shl     r3, 4           ;   a = (left[15*kiStride] + top[15]) << 4;
 
-		sub	r0, 3
-		add		r0,	r1
-		LOAD_COLUMN		xmm7, xmm2, xmm3, xmm4, r0, r1
-		pxor	xmm4,	xmm4
-		punpckhbw xmm0,	xmm4
-		pmullw	xmm0,	xmm5
-		punpckhbw xmm7,	xmm4
-		pmullw	xmm7,	xmm6
-		psubw	xmm7,	xmm0
+    sub r0, 3
+    add     r0, r1
+    LOAD_COLUMN     xmm7, xmm2, xmm3, xmm4, r0, r1
+    pxor    xmm4,   xmm4
+    punpckhbw xmm0, xmm4
+    pmullw  xmm0,   xmm5
+    punpckhbw xmm7, xmm4
+    pmullw  xmm7,   xmm6
+    psubw   xmm7,   xmm0
 
-		SUMW_HORIZON   xmm7,xmm0,xmm2
-		movd    r2d,   xmm7			; V
-		movsx	r2,	r2w
+    SUMW_HORIZON   xmm7,xmm0,xmm2
+    movd    r2d,   xmm7         ; V
+    movsx   r2, r2w
 
-		imul	r2,	5
-		add		r2,	32
-		sar		r2,	6				; c = (5 * V + 32) >> 6;
-		SSE2_Copy8Times	xmm4, r2d		; xmm4 = c,c,c,c,c,c,c,c
+    imul    r2, 5
+    add     r2, 32
+    sar     r2, 6               ; c = (5 * V + 32) >> 6;
+    SSE2_Copy8Times xmm4, r2d       ; xmm4 = c,c,c,c,c,c,c,c
 
-		mov r0, r4
-		add		r3,	16
-		imul	r2,	-7
-		add		r3,	r2		; s = a + 16 + (-7)*c
-		SSE2_Copy8Times	xmm0, r3d		; xmm0 = s,s,s,s,s,s,s,s
+    mov r0, r4
+    add     r3, 16
+    imul    r2, -7
+    add     r3, r2      ; s = a + 16 + (-7)*c
+    SSE2_Copy8Times xmm0, r3d       ; xmm0 = s,s,s,s,s,s,s,s
 
-		xor		r2,	r2
-		movdqa	xmm5,	[sse2_plane_inc_minus]
+    xor     r2, r2
+    movdqa  xmm5,   [sse2_plane_inc_minus]
 
 get_i16x16_luma_pred_plane_sse2_1:
-		movdqa	xmm2,	xmm1
-		pmullw	xmm2,	xmm5
-		paddw	xmm2,	xmm0
-		psraw	xmm2,	5
-		movdqa	xmm3,	xmm1
-		pmullw	xmm3,	xmm6
-		paddw	xmm3,	xmm0
-		psraw	xmm3,	5
-		packuswb xmm2,	xmm3
-		movdqa	[r0],	xmm2
-		paddw	xmm0,	xmm4
-		add		r0,	r1
-		inc		r2
-		cmp		r2,	16
-		jnz get_i16x16_luma_pred_plane_sse2_1
+    movdqa  xmm2,   xmm1
+    pmullw  xmm2,   xmm5
+    paddw   xmm2,   xmm0
+    psraw   xmm2,   5
+    movdqa  xmm3,   xmm1
+    pmullw  xmm3,   xmm6
+    paddw   xmm3,   xmm0
+    psraw   xmm3,   5
+    packuswb xmm2,  xmm3
+    movdqa  [r0],   xmm2
+    paddw   xmm0,   xmm4
+    add     r0, r1
+    inc     r2
+    cmp     r2, 16
+    jnz get_i16x16_luma_pred_plane_sse2_1
 
-		POP_XMM
-		pop r4
-		pop r3
-		ret
+    POP_XMM
+    pop r4
+    pop r3
+    ret
 
 
 
@@ -306,31 +306,31 @@
 ;*******************************************************************************
 
 %macro SSE2_PRED_H_16X16_TWO_LINE_DEC 2
-    lea     %1,	[%1+%2*2]
+    lea     %1, [%1+%2*2]
 
-    COPY_16_TIMES %1,	xmm0
-    movdqa  [%1],	xmm0
-    COPY_16_TIMESS %1,	xmm0,	%2
-    movdqa  [%1+%2],	xmm0
+    COPY_16_TIMES %1,   xmm0
+    movdqa  [%1],   xmm0
+    COPY_16_TIMESS %1,  xmm0,   %2
+    movdqa  [%1+%2],    xmm0
 %endmacro
 
 WELS_EXTERN WelsDecoderI16x16LumaPredH_sse2
-	%assign push_num 0
-	LOAD_2_PARA
-	SIGN_EXTENSION r1, r1d
+    %assign push_num 0
+    LOAD_2_PARA
+    SIGN_EXTENSION r1, r1d
 
-    COPY_16_TIMES r0,	xmm0
-    movdqa  [r0],		xmm0
-    COPY_16_TIMESS r0,	xmm0,	r1
-    movdqa  [r0+r1],	xmm0
+    COPY_16_TIMES r0,   xmm0
+    movdqa  [r0],       xmm0
+    COPY_16_TIMESS r0,  xmm0,   r1
+    movdqa  [r0+r1],    xmm0
 
-	SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
-	SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
-	SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
-	SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
-	SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
-	SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
-	SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
+    SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
+    SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
+    SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
+    SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
+    SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
+    SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
+    SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
 
     ret
 
@@ -338,9 +338,9 @@
 ; void WelsDecoderI16x16LumaPredV_sse2(uint8_t *pPred, const int32_t kiStride);
 ;*******************************************************************************
 WELS_EXTERN WelsDecoderI16x16LumaPredV_sse2
-	%assign push_num 0
-	LOAD_2_PARA
-	SIGN_EXTENSION r1, r1d
+    %assign push_num 0
+    LOAD_2_PARA
+    SIGN_EXTENSION r1, r1d
 
     sub     r0, r1
     movdqa  xmm0, [r0]
@@ -376,252 +376,252 @@
 ; void WelsDecoderIChromaPredPlane_sse2(uint8_t *pPred, const int32_t kiStride);
 ;*******************************************************************************
 WELS_EXTERN WelsDecoderIChromaPredPlane_sse2
-		push r3
-		push r4
-		%assign push_num 2
-		LOAD_2_PARA
-		PUSH_XMM 8
-		SIGN_EXTENSION r1, r1d
-		mov r4, r0
-		sub		r0,	1
-		sub		r0,	r1
+    push r3
+    push r4
+    %assign push_num 2
+    LOAD_2_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r1, r1d
+    mov r4, r0
+    sub     r0, 1
+    sub     r0, r1
 
-		pxor	mm7,	mm7
-		movq	mm0,	[r0]
-		movq	mm5,	[sse2_plane_dec_c]
-		punpcklbw mm0,	mm7
-		pmullw	mm0,	mm5
-		movq	mm1,	[r0 + 5]
-		movq	mm6,	[sse2_plane_inc_c]
-		punpcklbw mm1,	mm7
-		pmullw	mm1,	mm6
-		psubw	mm1,	mm0
+    pxor    mm7,    mm7
+    movq    mm0,    [r0]
+    movq    mm5,    [sse2_plane_dec_c]
+    punpcklbw mm0,  mm7
+    pmullw  mm0,    mm5
+    movq    mm1,    [r0 + 5]
+    movq    mm6,    [sse2_plane_inc_c]
+    punpcklbw mm1,  mm7
+    pmullw  mm1,    mm6
+    psubw   mm1,    mm0
 
-		movq2dq xmm1,   mm1
-		pxor    xmm2,   xmm2
-		SUMW_HORIZON	xmm1,xmm0,xmm2
-		movd    r2d,	xmm1
-		movsx	r2,	r2w
-		imul	r2,	17
-		add		r2,	16
-		sar		r2,	5			; b = (17 * H + 16) >> 5;
-		SSE2_Copy8Times	xmm1, r2d	; mm1 = b,b,b,b,b,b,b,b
+    movq2dq xmm1,   mm1
+    pxor    xmm2,   xmm2
+    SUMW_HORIZON    xmm1,xmm0,xmm2
+    movd    r2d,    xmm1
+    movsx   r2, r2w
+    imul    r2, 17
+    add     r2, 16
+    sar     r2, 5           ; b = (17 * H + 16) >> 5;
+    SSE2_Copy8Times xmm1, r2d   ; mm1 = b,b,b,b,b,b,b,b
 
-		movzx	r3,	BYTE [r0+8]
-		sub	r0, 3
-		LOAD_COLUMN_C	mm0, mm2, mm3, mm4, r0, r1
+    movzx   r3, BYTE [r0+8]
+    sub r0, 3
+    LOAD_COLUMN_C   mm0, mm2, mm3, mm4, r0, r1
 
-		add		r0,	3
-		movzx	r2,	BYTE [r0+4*r1]
-		add		r3,	r2
-		shl		r3,	4			; a = (left[7*kiStride] + top[7]) << 4;
+    add     r0, 3
+    movzx   r2, BYTE [r0+4*r1]
+    add     r3, r2
+    shl     r3, 4           ; a = (left[7*kiStride] + top[7]) << 4;
 
-		sub	r0, 3
-		add		r0,	r1
-		LOAD_COLUMN_C	mm7, mm2, mm3, mm4, r0, r1
-		pxor	mm4,	mm4
-		punpckhbw mm0,	mm4
-		pmullw	mm0,	mm5
-		punpckhbw mm7,	mm4
-		pmullw	mm7,	mm6
-		psubw	mm7,	mm0
+    sub r0, 3
+    add     r0, r1
+    LOAD_COLUMN_C   mm7, mm2, mm3, mm4, r0, r1
+    pxor    mm4,    mm4
+    punpckhbw mm0,  mm4
+    pmullw  mm0,    mm5
+    punpckhbw mm7,  mm4
+    pmullw  mm7,    mm6
+    psubw   mm7,    mm0
 
-		movq2dq xmm7,   mm7
-		pxor    xmm2,   xmm2
-		SUMW_HORIZON	xmm7,xmm0,xmm2
-		movd    r2d,    xmm7			; V
-		movsx	r2,	r2w
+    movq2dq xmm7,   mm7
+    pxor    xmm2,   xmm2
+    SUMW_HORIZON    xmm7,xmm0,xmm2
+    movd    r2d,    xmm7            ; V
+    movsx   r2, r2w
 
-		imul	r2,	17
-		add		r2,	16
-		sar		r2,	5				; c = (17 * V + 16) >> 5;
-		SSE2_Copy8Times	xmm4, r2d		; mm4 = c,c,c,c,c,c,c,c
+    imul    r2, 17
+    add     r2, 16
+    sar     r2, 5               ; c = (17 * V + 16) >> 5;
+    SSE2_Copy8Times xmm4, r2d       ; mm4 = c,c,c,c,c,c,c,c
 
-		mov 	r0, r4
-		add		r3,	16
-		imul	r2,	-3
-		add		r3,	r2				; s = a + 16 + (-3)*c
-		SSE2_Copy8Times	xmm0, r3d		; xmm0 = s,s,s,s,s,s,s,s
+    mov     r0, r4
+    add     r3, 16
+    imul    r2, -3
+    add     r3, r2              ; s = a + 16 + (-3)*c
+    SSE2_Copy8Times xmm0, r3d       ; xmm0 = s,s,s,s,s,s,s,s
 
-		xor		r2,	r2
-		movdqa	xmm5,	[sse2_plane_mul_b_c]
+    xor     r2, r2
+    movdqa  xmm5,   [sse2_plane_mul_b_c]
 
 get_i_chroma_pred_plane_sse2_1:
-		movdqa	xmm2,	xmm1
-		pmullw	xmm2,	xmm5
-		paddw	xmm2,	xmm0
-		psraw	xmm2,	5
-		packuswb xmm2,	xmm2
-		movq	[r0],	xmm2
-		paddw	xmm0,	xmm4
-		add		r0,	r1
-		inc		r2
-		cmp		r2,	8
-		jnz get_i_chroma_pred_plane_sse2_1
+    movdqa  xmm2,   xmm1
+    pmullw  xmm2,   xmm5
+    paddw   xmm2,   xmm0
+    psraw   xmm2,   5
+    packuswb xmm2,  xmm2
+    movq    [r0],   xmm2
+    paddw   xmm0,   xmm4
+    add     r0, r1
+    inc     r2
+    cmp     r2, 8
+    jnz get_i_chroma_pred_plane_sse2_1
 
-		POP_XMM
-		pop r4
-		pop r3
-		WELSEMMS
-		ret
+    POP_XMM
+    pop r4
+    pop r3
+    WELSEMMS
+    ret
 
 ;*******************************************************************************
-;	0 |1 |2 |3 |4 |
-;	6 |7 |8 |9 |10|
-;	11|12|13|14|15|
-;	16|17|18|19|20|
-;	21|22|23|24|25|
-;	7 is the start pixel of current 4x4 block
-;	pPred[7] = ([6]+[0]*2+[1]+2)/4
+;   0 |1 |2 |3 |4 |
+;   6 |7 |8 |9 |10|
+;   11|12|13|14|15|
+;   16|17|18|19|20|
+;   21|22|23|24|25|
+;   7 is the start pixel of current 4x4 block
+;   pPred[7] = ([6]+[0]*2+[1]+2)/4
 ;
 ;   void WelsDecoderI4x4LumaPredDDR_mmx(uint8_t *pPred, const int32_t kiStride)
 ;
 ;*******************************************************************************
 WELS_EXTERN WelsDecoderI4x4LumaPredDDR_mmx
-	%assign push_num 0
-	LOAD_2_PARA
-	SIGN_EXTENSION r1, r1d
-	mov r2, r0
+    %assign push_num 0
+    LOAD_2_PARA
+    SIGN_EXTENSION r1, r1d
+    mov r2, r0
 
-	movq        mm1,[r2+r1-8]		;get value of 11,decreasing 8 is trying to improve the performance of movq mm1[8] = 11
-	movq        mm2,[r2-8]			;get value of 6 mm2[8] = 6
-	sub		r2, r1			;mov eax to above line of current block(postion of 1)
-	punpckhbw   mm2,[r2-8]			;mm2[8](high 8th byte of mm2) = [0](value of 0), mm2[7]= [6]
-	movd        mm3,[r2]			;get value 1, mm3[1] = [1],mm3[2]=[2],mm3[3]=[3]
-	punpckhwd   mm1,mm2				;mm1[8]=[0],mm1[7]=[6],mm1[6]=[11]
-	psllq       mm3,18h				;mm3[5]=[1]
-	psrlq       mm1,28h				;mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
-	por         mm3,mm1				;mm3[6]=[3],mm3[5]=[2],mm3[4]=[1],mm3[3]=[0],mm3[2]=[6],mm3[1]=[11]
-	movq        mm1,mm3				;mm1[6]=[3],mm1[5]=[2],mm1[4]=[1],mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
-	lea 		r2,[r2+r1*2-8h]		;set eax point to 12
-	movq        mm4,[r2+r1]		;get value of 16, mm4[8]=[16]
-	psllq       mm3,8				;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=0
-	psrlq       mm4,38h				;mm4[1]=[16]
-	por         mm3,mm4				;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=[16]
-	movq        mm2,mm3				;mm2[7]=[3],mm2[6]=[2],mm2[5]=[1],mm2[4]=[0],mm2[3]=[6],mm2[2]=[11],mm2[1]=[16]
-	movq        mm4,[r2+r1*2]		;mm4[8]=[21]
-	psllq       mm3,8				;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=0
-	psrlq       mm4,38h				;mm4[1]=[21]
-	por         mm3,mm4				;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=[21]
-	movq        mm4,mm3				;mm4[8]=[3],mm4[7]=[2],mm4[6]=[1],mm4[5]=[0],mm4[4]=[6],mm4[3]=[11],mm4[2]=[16],mm4[1]=[21]
-	pavgb       mm3,mm1				;mm3=([11]+[21]+1)/2
-	pxor        mm1,mm4				;find odd value in the lowest bit of each byte
-	pand        mm1,[mmx_01bytes]	;set the odd bit
-	psubusb     mm3,mm1				;decrease 1 from odd bytes
-	pavgb       mm2,mm3				;mm2=(([11]+[21]+1)/2+1+[16])/2
+    movq        mm1,[r2+r1-8]       ;get value of 11,decreasing 8 is trying to improve the performance of movq mm1[8] = 11
+    movq        mm2,[r2-8]          ;get value of 6 mm2[8] = 6
+    sub     r2, r1          ;move r2 to the line above the current block (position of 1)
+    punpckhbw   mm2,[r2-8]          ;mm2[8](high 8th byte of mm2) = [0](value of 0), mm2[7]= [6]
+    movd        mm3,[r2]            ;get value 1, mm3[1] = [1],mm3[2]=[2],mm3[3]=[3]
+    punpckhwd   mm1,mm2             ;mm1[8]=[0],mm1[7]=[6],mm1[6]=[11]
+    psllq       mm3,18h             ;mm3[5]=[1]
+    psrlq       mm1,28h             ;mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
+    por         mm3,mm1             ;mm3[6]=[3],mm3[5]=[2],mm3[4]=[1],mm3[3]=[0],mm3[2]=[6],mm3[1]=[11]
+    movq        mm1,mm3             ;mm1[6]=[3],mm1[5]=[2],mm1[4]=[1],mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
+    lea         r2,[r2+r1*2-8h]     ;set r2 to point to 12
+    movq        mm4,[r2+r1]     ;get value of 16, mm4[8]=[16]
+    psllq       mm3,8               ;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=0
+    psrlq       mm4,38h             ;mm4[1]=[16]
+    por         mm3,mm4             ;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=[16]
+    movq        mm2,mm3             ;mm2[7]=[3],mm2[6]=[2],mm2[5]=[1],mm2[4]=[0],mm2[3]=[6],mm2[2]=[11],mm2[1]=[16]
+    movq        mm4,[r2+r1*2]       ;mm4[8]=[21]
+    psllq       mm3,8               ;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=0
+    psrlq       mm4,38h             ;mm4[1]=[21]
+    por         mm3,mm4             ;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=[21]
+    movq        mm4,mm3             ;mm4[8]=[3],mm4[7]=[2],mm4[6]=[1],mm4[5]=[0],mm4[4]=[6],mm4[3]=[11],mm4[2]=[16],mm4[1]=[21]
+    pavgb       mm3,mm1             ;mm3=([11]+[21]+1)/2
+    pxor        mm1,mm4             ;find odd value in the lowest bit of each byte
+    pand        mm1,[mmx_01bytes]   ;set the odd bit
+    psubusb     mm3,mm1             ;decrease 1 from odd bytes
+    pavgb       mm2,mm3             ;mm2=(([11]+[21]+1)/2+1+[16])/2
 
-	lea         r0,[r0+r1]
-	movd        [r0+2*r1],mm2
-	sub         r0,r1
-	psrlq       mm2,8
-	movd        [r0+2*r1],mm2
-	psrlq       mm2,8
-	movd        [r0+r1],mm2
-	psrlq       mm2,8
-	movd        [r0],mm2
-	WELSEMMS
-	ret
+    lea         r0,[r0+r1]
+    movd        [r0+2*r1],mm2
+    sub         r0,r1
+    psrlq       mm2,8
+    movd        [r0+2*r1],mm2
+    psrlq       mm2,8
+    movd        [r0+r1],mm2
+    psrlq       mm2,8
+    movd        [r0],mm2
+    WELSEMMS
+    ret
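
The pavgb/pxor/pand/psubusb block above is a standard way to get the exact (a + 2*b + c + 2) >> 2 smoothing filter out of byte averages: pavgb rounds up, so averaging avg(a,c) with b would over-round whenever a and c have different parity, and the masked subtract removes exactly that bias. A scalar sketch of the identity (illustrative name; here a and c are the outer samples such as [11] and [21], and b the middle one, [16]):

    #include <stdint.h>

    /* Illustrative sketch: (a + 2*b + c + 2) >> 2 built from two rounding averages. */
    static uint8_t Filter121_c(uint8_t a, uint8_t b, uint8_t c) {
        uint8_t ac = (uint8_t)((a + c + 1) >> 1);   /* pavgb a, c          */
        ac = (uint8_t)(ac - ((a ^ c) & 1));         /* pxor/pand/psubusb   */
        return (uint8_t)((ac + b + 1) >> 1);        /* pavgb with centre b */
    }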
 
 
 ;*******************************************************************************
-;	void WelsDecoderIChromaPredH_mmx(uint8_t *pPred, const int32_t kiStride)
+;   void WelsDecoderIChromaPredH_mmx(uint8_t *pPred, const int32_t kiStride)
 ;   copy 8 pixel of 8 line from left
 ;*******************************************************************************
 %macro MMX_PRED_H_8X8_ONE_LINE 4
-	movq		%1,		[%3-8]
-	psrlq		%1,		38h
+    movq        %1,     [%3-8]
+    psrlq       %1,     38h
 
-	pmullw		%1,		[mmx_01bytes]
-	pshufw		%1,		%1,	0
-	movq		[%4],	%1
+    pmullw      %1,     [mmx_01bytes]
+    pshufw      %1,     %1, 0
+    movq        [%4],   %1
 %endmacro
 
 %macro MMX_PRED_H_8X8_ONE_LINEE 4
-	movq		%1,		[%3+r1-8]
-	psrlq		%1,		38h
+    movq        %1,     [%3+r1-8]
+    psrlq       %1,     38h
 
-	pmullw		%1,		[mmx_01bytes]
-	pshufw		%1,		%1,	0
-	movq		[%4],	%1
+    pmullw      %1,     [mmx_01bytes]
+    pshufw      %1,     %1, 0
+    movq        [%4],   %1
 %endmacro
 
 WELS_EXTERN WelsDecoderIChromaPredH_mmx
-	%assign push_num 0
-	LOAD_2_PARA
-	SIGN_EXTENSION r1, r1d
-	mov r2, r0
+    %assign push_num 0
+    LOAD_2_PARA
+    SIGN_EXTENSION r1, r1d
+    mov r2, r0
 
-	movq		mm0,	[r2-8]
-	psrlq		mm0,	38h
+    movq        mm0,    [r2-8]
+    psrlq       mm0,    38h
 
-	pmullw		mm0,		[mmx_01bytes]
-	pshufw		mm0,	mm0,	0
-	movq		[r0],	mm0
+    pmullw      mm0,        [mmx_01bytes]
+    pshufw      mm0,    mm0,    0
+    movq        [r0],   mm0
 
-	MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1
+    MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1
 
-	lea			r2, [r2+r1*2]
-	MMX_PRED_H_8X8_ONE_LINE	mm0, mm1, r2, r0+2*r1
+    lea         r2, [r2+r1*2]
+    MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r2, r0+2*r1
 
-	lea         r0, [r0+2*r1]
-	MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1
+    lea         r0, [r0+2*r1]
+    MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1
 
-	lea			r2, [r2+r1*2]
-	MMX_PRED_H_8X8_ONE_LINE	mm0, mm1, r2, r0+2*r1
+    lea         r2, [r2+r1*2]
+    MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r2, r0+2*r1
 
-	lea         r0, [r0+2*r1]
-	MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1
+    lea         r0, [r0+2*r1]
+    MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1
 
-	lea			r2, [r2+r1*2]
-	MMX_PRED_H_8X8_ONE_LINE	mm0, mm1, r2, r0+2*r1
+    lea         r2, [r2+r1*2]
+    MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r2, r0+2*r1
 
-    	lea         r0, [r0+2*r1]
-	MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1
+    lea         r0, [r0+2*r1]
+    MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1
 
-	WELSEMMS
-	ret
+    WELSEMMS
+    ret
 
 
 ;*******************************************************************************
-;	void WelsDecoderIChromaPredV_mmx(uint8_t *pPred, const int32_t kiStride)
+;   void WelsDecoderIChromaPredV_mmx(uint8_t *pPred, const int32_t kiStride)
 ;   copy 8 pixels from top 8 pixels
 ;*******************************************************************************
 WELS_EXTERN WelsDecoderIChromaPredV_mmx
-	%assign push_num 0
-	LOAD_2_PARA
-	SIGN_EXTENSION r1, r1d
+    %assign push_num 0
+    LOAD_2_PARA
+    SIGN_EXTENSION r1, r1d
 
-	sub			r0,		r1
-	movq		mm0,		[r0]
+    sub         r0,     r1
+    movq        mm0,        [r0]
 
-	movq		[r0+r1],		mm0
-	movq		[r0+2*r1],	mm0
-	lea         r0, [r0+2*r1]
-	movq		[r0+r1],      mm0
-	movq		[r0+2*r1],    mm0
-	lea         r0, [r0+2*r1]
-	movq		[r0+r1],      mm0
-	movq		[r0+2*r1],    mm0
-	lea         r0, [r0+2*r1]
-	movq		[r0+r1],      mm0
-	movq		[r0+2*r1],    mm0
+    movq        [r0+r1],        mm0
+    movq        [r0+2*r1],  mm0
+    lea         r0, [r0+2*r1]
+    movq        [r0+r1],      mm0
+    movq        [r0+2*r1],    mm0
+    lea         r0, [r0+2*r1]
+    movq        [r0+r1],      mm0
+    movq        [r0+2*r1],    mm0
+    lea         r0, [r0+2*r1]
+    movq        [r0+r1],      mm0
+    movq        [r0+2*r1],    mm0
 
-	WELSEMMS
-	ret
+    WELSEMMS
+    ret
 
 
 ;*******************************************************************************
-;	lt|t0|t1|t2|t3|
-;	l0|
-;	l1|
-;	l2|
-;	l3|
-;	t3 will never been used
+;   lt|t0|t1|t2|t3|
+;   l0|
+;   l1|
+;   l2|
+;   l3|
+;   t3 will never be used
 ;   destination:
-;	|a |b |c |d |
-;	|e |f |a |b |
-;	|g |h |e |f |
-;	|i |j |g |h |
+;   |a |b |c |d |
+;   |e |f |a |b |
+;   |g |h |e |f |
+;   |i |j |g |h |
 
 ;   a = (1 + lt + l0)>>1
 ;   e = (1 + l0 + l1)>>1
@@ -640,73 +640,73 @@
 ;   void WelsDecoderI4x4LumaPredHD_mmx(uint8_t *pPred, const int32_t kiStride)
 ;*******************************************************************************
 WELS_EXTERN WelsDecoderI4x4LumaPredHD_mmx
-	%assign push_num 0
-	LOAD_2_PARA
-	SIGN_EXTENSION r1, r1d
-	mov r2, r0
-	sub         r2, r1
-	movd        mm0, [r2-1]            ; mm0 = [xx xx xx xx t2 t1 t0 lt]
-	psllq       mm0, 20h                ; mm0 = [t2 t1 t0 lt xx xx xx xx]
+    %assign push_num 0
+    LOAD_2_PARA
+    SIGN_EXTENSION r1, r1d
+    mov r2, r0
+    sub         r2, r1
+    movd        mm0, [r2-1]            ; mm0 = [xx xx xx xx t2 t1 t0 lt]
+    psllq       mm0, 20h                ; mm0 = [t2 t1 t0 lt xx xx xx xx]
 
-	movd        mm1, [r2+2*r1-4]
-	punpcklbw   mm1, [r2+r1-4]        ; mm1[7] = l0, mm1[6] = l1
-	lea         r2, [r2+2*r1]
-	movd        mm2, [r2+2*r1-4]
-	punpcklbw   mm2, [r2+r1-4]        ; mm2[7] = l2, mm2[6] = l3
-	punpckhwd   mm2, mm1                ; mm2 = [l0 l1 l2 l3 xx xx xx xx]
-	psrlq       mm2, 20h
-	pxor        mm0, mm2                ; mm0 = [t2 t1 t0 lt l0 l1 l2 l3]
+    movd        mm1, [r2+2*r1-4]
+    punpcklbw   mm1, [r2+r1-4]        ; mm1[7] = l0, mm1[6] = l1
+    lea         r2, [r2+2*r1]
+    movd        mm2, [r2+2*r1-4]
+    punpcklbw   mm2, [r2+r1-4]        ; mm2[7] = l2, mm2[6] = l3
+    punpckhwd   mm2, mm1                ; mm2 = [l0 l1 l2 l3 xx xx xx xx]
+    psrlq       mm2, 20h
+    pxor        mm0, mm2                ; mm0 = [t2 t1 t0 lt l0 l1 l2 l3]
 
-	movq        mm1, mm0
-	psrlq       mm1, 10h                ; mm1 = [xx xx t2 t1 t0 lt l0 l1]
-	movq        mm2, mm0
-	psrlq       mm2, 8h                 ; mm2 = [xx t2 t1 t0 lt l0 l1 l2]
-	movq        mm3, mm2
-	movq        mm4, mm1
-	pavgb       mm1, mm0
+    movq        mm1, mm0
+    psrlq       mm1, 10h                ; mm1 = [xx xx t2 t1 t0 lt l0 l1]
+    movq        mm2, mm0
+    psrlq       mm2, 8h                 ; mm2 = [xx t2 t1 t0 lt l0 l1 l2]
+    movq        mm3, mm2
+    movq        mm4, mm1
+    pavgb       mm1, mm0
 
-	pxor        mm4, mm0				; find odd value in the lowest bit of each byte
-	pand        mm4, [mmx_01bytes]	    ; set the odd bit
-	psubusb     mm1, mm4				; decrease 1 from odd bytes
+    pxor        mm4, mm0                ; find odd value in the lowest bit of each byte
+    pand        mm4, [mmx_01bytes]      ; set the odd bit
+    psubusb     mm1, mm4                ; decrease 1 from odd bytes
 
-	pavgb       mm2, mm1                ; mm2 = [xx xx d  c  b  f  h  j]
+    pavgb       mm2, mm1                ; mm2 = [xx xx d  c  b  f  h  j]
 
-	movq        mm4, mm0
-	pavgb       mm3, mm4                ; mm3 = [xx xx xx xx a  e  g  i]
-	punpcklbw   mm3, mm2                ; mm3 = [b  a  f  e  h  g  j  i]
+    movq        mm4, mm0
+    pavgb       mm3, mm4                ; mm3 = [xx xx xx xx a  e  g  i]
+    punpcklbw   mm3, mm2                ; mm3 = [b  a  f  e  h  g  j  i]
 
-	psrlq       mm2, 20h
-	psllq       mm2, 30h                ; mm2 = [d  c  0  0  0  0  0  0]
-	movq        mm4, mm3
-	psrlq       mm4, 10h                ; mm4 = [0  0  b  a  f  e  h  j]
-	pxor        mm2, mm4                ; mm2 = [d  c  b  a  xx xx xx xx]
-	psrlq       mm2, 20h                ; mm2 = [xx xx xx xx  d  c  b  a]
+    psrlq       mm2, 20h
+    psllq       mm2, 30h                ; mm2 = [d  c  0  0  0  0  0  0]
+    movq        mm4, mm3
+    psrlq       mm4, 10h                ; mm4 = [0  0  b  a  f  e  h  j]
+    pxor        mm2, mm4                ; mm2 = [d  c  b  a  xx xx xx xx]
+    psrlq       mm2, 20h                ; mm2 = [xx xx xx xx  d  c  b  a]
 
-	movd        [r0], mm2
-	lea         r0, [r0+r1]
-	movd        [r0+2*r1], mm3
-	sub         r0, r1
-	psrlq       mm3, 10h
-	movd        [r0+2*r1], mm3
-	psrlq       mm3, 10h
-	movd        [r0+r1], mm3
-	WELSEMMS
-	ret
+    movd        [r0], mm2
+    lea         r0, [r0+r1]
+    movd        [r0+2*r1], mm3
+    sub         r0, r1
+    psrlq       mm3, 10h
+    movd        [r0+2*r1], mm3
+    psrlq       mm3, 10h
+    movd        [r0+r1], mm3
+    WELSEMMS
+    ret
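
The pavgb/pxor/pand/psubusb group above (commented "find odd value in the lowest bit of each byte ... decrease 1 from odd bytes") is the usual MMX way of getting the exact 3-tap value (x + 2*y + z + 2) >> 2 out of two byte averages: pavgb rounds up, so the low bit of x^z is subtracted from the first average to turn it into a floor before the second pavgb. A small self-checking C sketch of that identity (helper names are mine, not taken from the source):

#include <assert.h>
#include <stdint.h>

static uint8_t Avg2 (uint8_t a, uint8_t b) {          /* what pavgb computes */
    return (uint8_t) ((a + b + 1) >> 1);
}

/* 3-tap filter built from two rounded averages; the ((x ^ z) & 1) term is the
 * mmx_01bytes / psubusb correction that removes the first rounding.          */
static uint8_t Filter3 (uint8_t x, uint8_t y, uint8_t z) {
    uint8_t t = (uint8_t) (Avg2 (x, z) - ((x ^ z) & 1));
    return Avg2 (y, t);
}

int main (void) {
    for (int x = 0; x < 256; x++)
        for (int y = 0; y < 256; y++)
            for (int z = 0; z < 256; z++)
                assert (Filter3 ((uint8_t) x, (uint8_t) y, (uint8_t) z)
                        == (uint8_t) ((x + 2 * y + z + 2) >> 2));
    return 0;
}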
 
 
 
 ;*******************************************************************************
-;	lt|t0|t1|t2|t3|
-;	l0|
-;	l1|
-;	l2|
-;	l3|
-;	t3 will never be used
+;   lt|t0|t1|t2|t3|
+;   l0|
+;   l1|
+;   l2|
+;   l3|
+;   t3 will never be used
 ;   destination:
-;	|a |b |c |d |
-;	|c |d |e |f |
-;	|e |f |g |g |
-;	|g |g |g |g |
+;   |a |b |c |d |
+;   |c |d |e |f |
+;   |e |f |g |g |
+;   |g |g |g |g |
 
 ;   a = (1 + l0 + l1)>>1
 ;   c = (1 + l1 + l2)>>1
@@ -722,74 +722,74 @@
 ;   void WelsDecoderI4x4LumaPredHU_mmx(uint8_t *pPred, const int32_t kiStride)
 ;*******************************************************************************
 WELS_EXTERN WelsDecoderI4x4LumaPredHU_mmx
-	%assign push_num 0
-	LOAD_2_PARA
-	SIGN_EXTENSION r1, r1d
-	mov r2, r0
+    %assign push_num 0
+    LOAD_2_PARA
+    SIGN_EXTENSION r1, r1d
+    mov r2, r0
 
-	movd        mm0, [r2-4]            ; mm0[3] = l0
-	punpcklbw   mm0, [r2+r1-4]        ; mm0[7] = l1, mm0[6] = l0
-	lea         r2, [r2+2*r1]
-	movd        mm2, [r2-4]            ; mm2[3] = l2
-	movd        mm4, [r2+r1-4]        ; mm4[3] = l3
-	punpcklbw   mm2, mm4
-	punpckhwd   mm0, mm2                ; mm0 = [l3 l2 l1 l0 xx xx xx xx]
+    movd        mm0, [r2-4]            ; mm0[3] = l0
+    punpcklbw   mm0, [r2+r1-4]        ; mm0[7] = l1, mm0[6] = l0
+    lea         r2, [r2+2*r1]
+    movd        mm2, [r2-4]            ; mm2[3] = l2
+    movd        mm4, [r2+r1-4]        ; mm4[3] = l3
+    punpcklbw   mm2, mm4
+    punpckhwd   mm0, mm2                ; mm0 = [l3 l2 l1 l0 xx xx xx xx]
 
-	psrlq       mm4, 18h
-	psllq       mm4, 38h                ; mm4 = [l3 xx xx xx xx xx xx xx]
-	psrlq       mm0, 8h
-	pxor        mm0, mm4                ; mm0 = [l3 l3 l2 l1 l0 xx xx xx]
+    psrlq       mm4, 18h
+    psllq       mm4, 38h                ; mm4 = [l3 xx xx xx xx xx xx xx]
+    psrlq       mm0, 8h
+    pxor        mm0, mm4                ; mm0 = [l3 l3 l2 l1 l0 xx xx xx]
 
-	movq        mm1, mm0
-	psllq       mm1, 8h                 ; mm1 = [l3 l2 l1 l0 xx xx xx xx]
-	movq        mm3, mm1                ; mm3 = [l3 l2 l1 l0 xx xx xx xx]
-	pavgb       mm1, mm0                ; mm1 = [g  e  c  a  xx xx xx xx]
+    movq        mm1, mm0
+    psllq       mm1, 8h                 ; mm1 = [l3 l2 l1 l0 xx xx xx xx]
+    movq        mm3, mm1                ; mm3 = [l3 l2 l1 l0 xx xx xx xx]
+    pavgb       mm1, mm0                ; mm1 = [g  e  c  a  xx xx xx xx]
 
-	movq        mm2, mm0
-	psllq       mm2, 10h                ; mm2 = [l2 l1 l0 xx xx xx xx xx]
-	movq        mm5, mm2
-	pavgb       mm2, mm0
+    movq        mm2, mm0
+    psllq       mm2, 10h                ; mm2 = [l2 l1 l0 xx xx xx xx xx]
+    movq        mm5, mm2
+    pavgb       mm2, mm0
 
-	pxor        mm5, mm0				; find odd value in the lowest bit of each byte
-	pand        mm5, [mmx_01bytes]	    ; set the odd bit
-	psubusb     mm2, mm5				; decrease 1 from odd bytes
+    pxor        mm5, mm0                ; find odd value in the lowest bit of each byte
+    pand        mm5, [mmx_01bytes]      ; set the odd bit
+    psubusb     mm2, mm5                ; decrease 1 from odd bytes
 
-	pavgb       mm2, mm3                ; mm2 = [f  d  b  xx xx xx xx xx]
+    pavgb       mm2, mm3                ; mm2 = [f  d  b  xx xx xx xx xx]
 
-	psrlq       mm2, 8h
-	pxor        mm2, mm4                ; mm2 = [g  f  d  b  xx xx xx xx]
+    psrlq       mm2, 8h
+    pxor        mm2, mm4                ; mm2 = [g  f  d  b  xx xx xx xx]
 
-	punpckhbw   mm1, mm2                ; mm1 = [g  g  f  e  d  c  b  a]
-	punpckhbw   mm4, mm4                ; mm4 = [g  g  xx xx xx xx xx xx]
-	punpckhbw   mm4, mm4                ; mm4 = [g  g  g  g  xx xx xx xx]
+    punpckhbw   mm1, mm2                ; mm1 = [g  g  f  e  d  c  b  a]
+    punpckhbw   mm4, mm4                ; mm4 = [g  g  xx xx xx xx xx xx]
+    punpckhbw   mm4, mm4                ; mm4 = [g  g  g  g  xx xx xx xx]
 
-	psrlq       mm4, 20h
-	lea         r0, [r0+r1]
-	movd        [r0+2*r1], mm4
+    psrlq       mm4, 20h
+    lea         r0, [r0+r1]
+    movd        [r0+2*r1], mm4
 
-	sub         r0, r1
-	movd        [r0], mm1
-	psrlq       mm1, 10h
-	movd        [r0+r1], mm1
-	psrlq       mm1, 10h
-	movd        [r0+2*r1], mm1
-	WELSEMMS
-	ret
+    sub         r0, r1
+    movd        [r0], mm1
+    psrlq       mm1, 10h
+    movd        [r0+r1], mm1
+    psrlq       mm1, 10h
+    movd        [r0+2*r1], mm1
+    WELSEMMS
+    ret
 
 
 
 ;*******************************************************************************
-;	lt|t0|t1|t2|t3|
-;	l0|
-;	l1|
-;	l2|
-;	l3|
-;	l3 will never be used
+;   lt|t0|t1|t2|t3|
+;   l0|
+;   l1|
+;   l2|
+;   l3|
+;   l3 will never be used
 ;   destination:
-;	|a |b |c |d |
-;	|e |f |g |h |
-;	|i |a |b |c |
-;	|j |e |f |g |
+;   |a |b |c |d |
+;   |e |f |g |h |
+;   |i |a |b |c |
+;   |j |e |f |g |
 
 ;   a = (1 + lt + t0)>>1
 ;   b = (1 + t0 + t1)>>1
@@ -807,77 +807,77 @@
 ;   void WelsDecoderI4x4LumaPredVR_mmx(uint8_t *pPred, const int32_t kiStride)
 ;*******************************************************************************
 WELS_EXTERN WelsDecoderI4x4LumaPredVR_mmx
-	%assign push_num 0
-	LOAD_2_PARA
-	SIGN_EXTENSION r1, r1d
-	mov r2, r0
-	sub         r2, r1
-	movq        mm0, [r2-1]            ; mm0 = [xx xx xx t3 t2 t1 t0 lt]
-	psllq       mm0, 18h                ; mm0 = [t3 t2 t1 t0 lt xx xx xx]
+    %assign push_num 0
+    LOAD_2_PARA
+    SIGN_EXTENSION r1, r1d
+    mov r2, r0
+    sub         r2, r1
+    movq        mm0, [r2-1]            ; mm0 = [xx xx xx t3 t2 t1 t0 lt]
+    psllq       mm0, 18h                ; mm0 = [t3 t2 t1 t0 lt xx xx xx]
 
-	movd        mm1, [r2+2*r1-4]
-	punpcklbw   mm1, [r2+r1-4]        ; mm1[7] = l0, mm1[6] = l1
-	lea         r2, [r2+2*r1]
-	movq        mm2, [r2+r1-8]        ; mm2[7] = l2
-	punpckhwd   mm2, mm1                ; mm2 = [l0 l1 l2 xx xx xx xx xx]
-	psrlq       mm2, 28h
-	pxor        mm0, mm2                ; mm0 = [t3 t2 t1 t0 lt l0 l1 l2]
+    movd        mm1, [r2+2*r1-4]
+    punpcklbw   mm1, [r2+r1-4]        ; mm1[7] = l0, mm1[6] = l1
+    lea         r2, [r2+2*r1]
+    movq        mm2, [r2+r1-8]        ; mm2[7] = l2
+    punpckhwd   mm2, mm1                ; mm2 = [l0 l1 l2 xx xx xx xx xx]
+    psrlq       mm2, 28h
+    pxor        mm0, mm2                ; mm0 = [t3 t2 t1 t0 lt l0 l1 l2]
 
-	movq        mm1, mm0
-	psllq       mm1, 8h                 ; mm1 = [t2 t1 t0 lt l0 l1 l2 xx]
-	pavgb       mm1, mm0                ; mm1 = [d  c  b  a  xx xx xx xx]
+    movq        mm1, mm0
+    psllq       mm1, 8h                 ; mm1 = [t2 t1 t0 lt l0 l1 l2 xx]
+    pavgb       mm1, mm0                ; mm1 = [d  c  b  a  xx xx xx xx]
 
-	movq        mm2, mm0
-	psllq       mm2, 10h                ; mm2 = [t1 t0 lt l0 l1 l2 xx xx]
-	movq        mm3, mm2
-	pavgb       mm2, mm0
+    movq        mm2, mm0
+    psllq       mm2, 10h                ; mm2 = [t1 t0 lt l0 l1 l2 xx xx]
+    movq        mm3, mm2
+    pavgb       mm2, mm0
 
-	pxor        mm3, mm0				; find odd value in the lowest bit of each byte
-	pand        mm3, [mmx_01bytes]	    ; set the odd bit
-	psubusb     mm2, mm3				; decrease 1 from odd bytes
+    pxor        mm3, mm0                ; find odd value in the lowest bit of each byte
+    pand        mm3, [mmx_01bytes]      ; set the odd bit
+    psubusb     mm2, mm3                ; decrease 1 from odd bytes
 
-	movq        mm3, mm0
-	psllq       mm3, 8h                 ; mm3 = [t2 t1 t0 lt l0 l1 l2 xx]
-	pavgb       mm3, mm2                ; mm3 = [h  g  f  e  i  j  xx xx]
-	movq        mm2, mm3
+    movq        mm3, mm0
+    psllq       mm3, 8h                 ; mm3 = [t2 t1 t0 lt l0 l1 l2 xx]
+    pavgb       mm3, mm2                ; mm3 = [h  g  f  e  i  j  xx xx]
+    movq        mm2, mm3
 
-	psrlq       mm1, 20h                ; mm1 = [xx xx xx xx d  c  b  a]
-	movd        [r0], mm1
+    psrlq       mm1, 20h                ; mm1 = [xx xx xx xx d  c  b  a]
+    movd        [r0], mm1
 
-	psrlq       mm2, 20h                ; mm2 = [xx xx xx xx h  g  f  e]
-	movd        [r0+r1], mm2
+    psrlq       mm2, 20h                ; mm2 = [xx xx xx xx h  g  f  e]
+    movd        [r0+r1], mm2
 
-	movq        mm4, mm3
-	psllq       mm4, 20h
-	psrlq       mm4, 38h                ; mm4 = [xx xx xx xx xx xx xx i]
+    movq        mm4, mm3
+    psllq       mm4, 20h
+    psrlq       mm4, 38h                ; mm4 = [xx xx xx xx xx xx xx i]
 
-	movq        mm5, mm3
-	psllq       mm5, 28h
-	psrlq       mm5, 38h                ; mm5 = [xx xx xx xx xx xx xx j]
+    movq        mm5, mm3
+    psllq       mm5, 28h
+    psrlq       mm5, 38h                ; mm5 = [xx xx xx xx xx xx xx j]
 
-	psllq       mm1, 8h
-	pxor        mm4, mm1                ; mm4 = [xx xx xx xx c  b  a  i]
-	movd        [r0+2*r1], mm4
+    psllq       mm1, 8h
+    pxor        mm4, mm1                ; mm4 = [xx xx xx xx c  b  a  i]
+    movd        [r0+2*r1], mm4
 
-	psllq       mm2, 8h
-	pxor        mm5, mm2                ; mm5 = [xx xx xx xx g  f  e  j]
-	lea         r0, [r0+2*r1]
-	movd        [r0+r1], mm5
-	WELSEMMS
-	ret
+    psllq       mm2, 8h
+    pxor        mm5, mm2                ; mm5 = [xx xx xx xx g  f  e  j]
+    lea         r0, [r0+2*r1]
+    movd        [r0+r1], mm5
+    WELSEMMS
+    ret
 
 ;*******************************************************************************
-;	lt|t0|t1|t2|t3|t4|t5|t6|t7
-;	l0|
-;	l1|
-;	l2|
-;	l3|
-;	lt,t0,t1,t2,t3 will never be used
+;   lt|t0|t1|t2|t3|t4|t5|t6|t7
+;   l0|
+;   l1|
+;   l2|
+;   l3|
+;   lt,t0,t1,t2,t3 will never be used
 ;   destination:
-;	|a |b |c |d |
-;	|b |c |d |e |
-;	|c |d |e |f |
-;	|d |e |f |g |
+;   |a |b |c |d |
+;   |b |c |d |e |
+;   |c |d |e |f |
+;   |d |e |f |g |
 
 ;   a = (2 + t0 + t2 + (t1<<1))>>2
 ;   b = (2 + t1 + t3 + (t2<<1))>>2
@@ -893,56 +893,56 @@
 ;   void WelsDecoderI4x4LumaPredDDL_mmx(uint8_t *pPred, const int32_t kiStride)
 ;*******************************************************************************
 WELS_EXTERN WelsDecoderI4x4LumaPredDDL_mmx
-	%assign push_num 0
-	LOAD_2_PARA
-	SIGN_EXTENSION r1, r1d
-	mov r2, r0
-	sub         r2, r1
-	movq        mm0, [r2]              ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
-	movq        mm1, mm0
-	movq        mm2, mm0
+    %assign push_num 0
+    LOAD_2_PARA
+    SIGN_EXTENSION r1, r1d
+    mov r2, r0
+    sub         r2, r1
+    movq        mm0, [r2]              ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
+    movq        mm1, mm0
+    movq        mm2, mm0
 
-	movq        mm3, mm0
-	psrlq       mm3, 38h
-	psllq       mm3, 38h                ; mm3 = [t7 xx xx xx xx xx xx xx]
+    movq        mm3, mm0
+    psrlq       mm3, 38h
+    psllq       mm3, 38h                ; mm3 = [t7 xx xx xx xx xx xx xx]
 
-	psllq       mm1, 8h                 ; mm1 = [t6 t5 t4 t3 t2 t1 t0 xx]
-	psrlq       mm2, 8h
-	pxor        mm2, mm3                ; mm2 = [t7 t7 t6 t5 t4 t3 t2 t1]
+    psllq       mm1, 8h                 ; mm1 = [t6 t5 t4 t3 t2 t1 t0 xx]
+    psrlq       mm2, 8h
+    pxor        mm2, mm3                ; mm2 = [t7 t7 t6 t5 t4 t3 t2 t1]
 
-	movq        mm3, mm1
-	pavgb       mm1, mm2
-	pxor        mm3, mm2				; find odd value in the lowest bit of each byte
-	pand        mm3, [mmx_01bytes]	    ; set the odd bit
-	psubusb     mm1, mm3				; decrease 1 from odd bytes
+    movq        mm3, mm1
+    pavgb       mm1, mm2
+    pxor        mm3, mm2                ; find odd value in the lowest bit of each byte
+    pand        mm3, [mmx_01bytes]      ; set the odd bit
+    psubusb     mm1, mm3                ; decrease 1 from odd bytes
 
-	pavgb       mm0, mm1                ; mm0 = [g f e d c b a xx]
+    pavgb       mm0, mm1                ; mm0 = [g f e d c b a xx]
 
-	psrlq       mm0, 8h
-	movd        [r0], mm0
-	psrlq       mm0, 8h
-	movd        [r0+r1], mm0
-	psrlq       mm0, 8h
-	movd        [r0+2*r1], mm0
-	psrlq       mm0, 8h
-	lea         r0, [r0+2*r1]
-	movd        [r0+r1], mm0
-	WELSEMMS
-	ret
+    psrlq       mm0, 8h
+    movd        [r0], mm0
+    psrlq       mm0, 8h
+    movd        [r0+r1], mm0
+    psrlq       mm0, 8h
+    movd        [r0+2*r1], mm0
+    psrlq       mm0, 8h
+    lea         r0, [r0+2*r1]
+    movd        [r0+r1], mm0
+    WELSEMMS
+    ret
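
Reading the comment block above back into C: the seven output samples a..g are a 3-tap filter run along the top pixels, with t7 reused as the tap past the right edge (which is why the code builds [t7 t7 t6 ... t1]), and each row of the 4x4 block is the previous row shifted left by one sample. A sketch under that reading (names are mine):

#include <stdint.h>

/* Sketch of the arithmetic in WelsDecoderI4x4LumaPredDDL_mmx.
 * pTop points at t0..t7, the eight reconstructed pixels above the block. */
static void LumaPred4x4DDL_sketch (uint8_t pred[4][4], const uint8_t pTop[8]) {
    uint8_t f[7];                                    /* a..g from the comments */
    for (int i = 0; i < 7; i++) {
        uint8_t t0 = pTop[i];
        uint8_t t1 = pTop[i + 1];
        uint8_t t2 = pTop[i + 2 < 8 ? i + 2 : 7];    /* t7 duplicated past the end */
        f[i] = (uint8_t) ((2 + t0 + (t1 << 1) + t2) >> 2);
    }
    for (int y = 0; y < 4; y++)                      /* row y holds f[y] .. f[y+3] */
        for (int x = 0; x < 4; x++)
            pred[y][x] = f[y + x];
}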
 
 
 ;*******************************************************************************
-;	lt|t0|t1|t2|t3|t4|t5|t6|t7
-;	l0|
-;	l1|
-;	l2|
-;	l3|
-;	lt,t0,t1,t2,t3 will never be used
+;   lt|t0|t1|t2|t3|t4|t5|t6|t7
+;   l0|
+;   l1|
+;   l2|
+;   l3|
+;   lt,t0,t1,t2,t3 will never be used
 ;   destination:
-;	|a |b |c |d |
-;	|e |f |g |h |
-;	|b |c |d |i |
-;	|f |g |h |j |
+;   |a |b |c |d |
+;   |e |f |g |h |
+;   |b |c |d |i |
+;   |f |g |h |j |
 
 ;   a = (1 + t0 + t1)>>1
 ;   b = (1 + t1 + t2)>>1
@@ -961,40 +961,40 @@
 ;   void WelsDecoderI4x4LumaPredVL_mmx(uint8_t *pPred, const int32_t kiStride)
 ;*******************************************************************************
 WELS_EXTERN WelsDecoderI4x4LumaPredVL_mmx
-	%assign push_num 0
-	LOAD_2_PARA
-	SIGN_EXTENSION r1, r1d
-	mov r2, r0
+    %assign push_num 0
+    LOAD_2_PARA
+    SIGN_EXTENSION r1, r1d
+    mov r2, r0
 
-	sub         r2, r1
-	movq        mm0, [r2]              ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
-	movq        mm1, mm0
-	movq        mm2, mm0
+    sub         r2, r1
+    movq        mm0, [r2]              ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
+    movq        mm1, mm0
+    movq        mm2, mm0
 
-	psrlq       mm1, 8h                 ; mm1 = [xx t7 t6 t5 t4 t3 t2 t1]
-	psrlq       mm2, 10h                ; mm2 = [xx xx t7 t6 t5 t4 t3 t2]
+    psrlq       mm1, 8h                 ; mm1 = [xx t7 t6 t5 t4 t3 t2 t1]
+    psrlq       mm2, 10h                ; mm2 = [xx xx t7 t6 t5 t4 t3 t2]
 
-	movq        mm3, mm1
-	pavgb       mm3, mm0                ; mm3 = [xx xx xx i  d  c  b  a]
+    movq        mm3, mm1
+    pavgb       mm3, mm0                ; mm3 = [xx xx xx i  d  c  b  a]
 
-	movq        mm4, mm2
-	pavgb       mm2, mm0
-	pxor        mm4, mm0				; find odd value in the lowest bit of each byte
-	pand        mm4, [mmx_01bytes]	    ; set the odd bit
-	psubusb     mm2, mm4				; decrease 1 from odd bytes
+    movq        mm4, mm2
+    pavgb       mm2, mm0
+    pxor        mm4, mm0                ; find odd value in the lowest bit of each byte
+    pand        mm4, [mmx_01bytes]      ; set the odd bit
+    psubusb     mm2, mm4                ; decrease 1 from odd bytes
 
-	pavgb       mm2, mm1                ; mm2 = [xx xx xx j  h  g  f  e]
+    pavgb       mm2, mm1                ; mm2 = [xx xx xx j  h  g  f  e]
 
-	movd        [r0], mm3
-	psrlq       mm3, 8h
-	movd        [r0+2*r1], mm3
+    movd        [r0], mm3
+    psrlq       mm3, 8h
+    movd        [r0+2*r1], mm3
 
-	movd        [r0+r1], mm2
-	psrlq       mm2, 8h
-	lea         r0, [r0+2*r1]
-	movd        [r0+r1], mm2
-	WELSEMMS
-	ret
+    movd        [r0+r1], mm2
+    psrlq       mm2, 8h
+    lea         r0, [r0+2*r1]
+    movd        [r0+r1], mm2
+    WELSEMMS
+    ret
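
Likewise for the VL pattern above: rows 0 and 2 hold the two-tap averages quoted in the comments, rows 1 and 3 the matching three-tap values produced by the pavgb-with-correction sequence, and the lower pair is simply the upper pair advanced by one pixel. A sketch, assuming the standard three-tap weights for the odd rows (only the two-tap formulas are spelled out in the source comments; names are mine):

#include <stdint.h>

static void LumaPred4x4VL_sketch (uint8_t pred[4][4], const uint8_t pTop[8]) {
    uint8_t avg2[5], avg3[5];
    for (int i = 0; i < 5; i++) {
        avg2[i] = (uint8_t) ((1 + pTop[i] + pTop[i + 1]) >> 1);
        avg3[i] = (uint8_t) ((2 + pTop[i] + (pTop[i + 1] << 1) + pTop[i + 2]) >> 2);
    }
    for (int x = 0; x < 4; x++) {
        pred[0][x] = avg2[x];        /* a b c d */
        pred[1][x] = avg3[x];        /* e f g h */
        pred[2][x] = avg2[x + 1];    /* b c d i */
        pred[3][x] = avg3[x + 1];    /* f g h j */
    }
}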
 
 ;*******************************************************************************
 ;
@@ -1001,93 +1001,93 @@
 ;   void WelsDecoderIChromaPredDc_sse2(uint8_t *pPred, const int32_t kiStride)
 ;*******************************************************************************
 WELS_EXTERN WelsDecoderIChromaPredDc_sse2
-	push 	r3
-	push 	r4
-	%assign push_num 2
-	LOAD_2_PARA
-	SIGN_EXTENSION r1, r1d
-	mov r4, r0
+    push    r3
+    push    r4
+    %assign push_num 2
+    LOAD_2_PARA
+    SIGN_EXTENSION r1, r1d
+    mov r4, r0
 
-	sub         r0, r1
-	movq        mm0, [r0]
+    sub         r0, r1
+    movq        mm0, [r0]
 
-	movzx		r2, byte [r0+r1-0x01] ; l1
-	lea         r0, [r0+2*r1]
-	movzx		r3, byte [r0-0x01]     ; l2
-	add			r2, r3
-	movzx		r3, byte [r0+r1-0x01] ; l3
-	add			r2, r3
-	lea         r0, [r0+2*r1]
-	movzx		r3, byte [r0-0x01]     ; l4
-	add			r2, r3
-	movd        mm1, r2d                 ; mm1 = l1+l2+l3+l4
+    movzx       r2, byte [r0+r1-0x01] ; l1
+    lea         r0, [r0+2*r1]
+    movzx       r3, byte [r0-0x01]     ; l2
+    add         r2, r3
+    movzx       r3, byte [r0+r1-0x01] ; l3
+    add         r2, r3
+    lea         r0, [r0+2*r1]
+    movzx       r3, byte [r0-0x01]     ; l4
+    add         r2, r3
+    movd        mm1, r2d                 ; mm1 = l1+l2+l3+l4
 
-	movzx		r2, byte [r0+r1-0x01] ; l5
-	lea         r0, [r0+2*r1]
-	movzx		r3, byte [r0-0x01]     ; l6
-	add			r2, r3
-	movzx		r3, byte [r0+r1-0x01] ; l7
-	add			r2, r3
-	lea         r0, [r0+2*r1]
-	movzx		r3, byte [r0-0x01]     ; l8
-	add			r2, r3
-	movd        mm2, r2d                 ; mm2 = l5+l6+l7+l8
+    movzx       r2, byte [r0+r1-0x01] ; l5
+    lea         r0, [r0+2*r1]
+    movzx       r3, byte [r0-0x01]     ; l6
+    add         r2, r3
+    movzx       r3, byte [r0+r1-0x01] ; l7
+    add         r2, r3
+    lea         r0, [r0+2*r1]
+    movzx       r3, byte [r0-0x01]     ; l8
+    add         r2, r3
+    movd        mm2, r2d                 ; mm2 = l5+l6+l7+l8
 
-	movq        mm3, mm0
-	psrlq       mm0, 0x20
-	psllq       mm3, 0x20
-	psrlq       mm3, 0x20
-	pxor		mm4, mm4
-	psadbw		mm0, mm4
-	psadbw		mm3, mm4                 ; sum1 = mm3+mm1, sum2 = mm0, sum3 = mm2
+    movq        mm3, mm0
+    psrlq       mm0, 0x20
+    psllq       mm3, 0x20
+    psrlq       mm3, 0x20
+    pxor        mm4, mm4
+    psadbw      mm0, mm4
+    psadbw      mm3, mm4                 ; sum1 = mm3+mm1, sum2 = mm0, sum3 = mm2
 
-	paddq       mm3, mm1
-	movq        mm1, mm2
-	paddq       mm1, mm0;                ; sum1 = mm3, sum2 = mm0, sum3 = mm2, sum4 = mm1
+    paddq       mm3, mm1
+    movq        mm1, mm2
+    paddq       mm1, mm0;                ; sum1 = mm3, sum2 = mm0, sum3 = mm2, sum4 = mm1
 
-	movq        mm4, [mmx_0x02]
+    movq        mm4, [mmx_0x02]
 
-	paddq       mm0, mm4
-	psrlq       mm0, 0x02
+    paddq       mm0, mm4
+    psrlq       mm0, 0x02
 
-	paddq       mm2, mm4
-	psrlq       mm2, 0x02
+    paddq       mm2, mm4
+    psrlq       mm2, 0x02
 
-	paddq       mm3, mm4
-	paddq       mm3, mm4
-	psrlq       mm3, 0x03
+    paddq       mm3, mm4
+    paddq       mm3, mm4
+    psrlq       mm3, 0x03
 
-	paddq       mm1, mm4
-	paddq       mm1, mm4
-	psrlq       mm1, 0x03
+    paddq       mm1, mm4
+    paddq       mm1, mm4
+    psrlq       mm1, 0x03
 
-	pmuludq     mm0, [mmx_01bytes]
-	pmuludq     mm3, [mmx_01bytes]
-	psllq       mm0, 0x20
-	pxor        mm0, mm3                 ; mm0 = m_up
+    pmuludq     mm0, [mmx_01bytes]
+    pmuludq     mm3, [mmx_01bytes]
+    psllq       mm0, 0x20
+    pxor        mm0, mm3                 ; mm0 = m_up
 
-	pmuludq     mm2, [mmx_01bytes]
-	pmuludq     mm1, [mmx_01bytes]
-	psllq       mm1, 0x20
-	pxor        mm1, mm2                 ; mm2 = m_down
+    pmuludq     mm2, [mmx_01bytes]
+    pmuludq     mm1, [mmx_01bytes]
+    psllq       mm1, 0x20
+    pxor        mm1, mm2                 ; mm2 = m_down
 
-	movq        [r4],       mm0
-	movq        [r4+r1],   mm0
-	movq        [r4+2*r1], mm0
-	lea         r4, [r4+2*r1]
-	movq        [r4+r1],   mm0
+    movq        [r4],       mm0
+    movq        [r4+r1],   mm0
+    movq        [r4+2*r1], mm0
+    lea         r4, [r4+2*r1]
+    movq        [r4+r1],   mm0
 
-	movq        [r4+2*r1], mm1
-	lea         r4, [r4+2*r1]
-	movq        [r4+r1],   mm1
-	movq        [r4+2*r1], mm1
-	lea         r4, [r4+2*r1]
-	movq        [r4+r1],   mm1
+    movq        [r4+2*r1], mm1
+    lea         r4, [r4+2*r1]
+    movq        [r4+r1],   mm1
+    movq        [r4+2*r1], mm1
+    lea         r4, [r4+2*r1]
+    movq        [r4+r1],   mm1
 
-	pop r4
-	pop r3
-	WELSEMMS
-	ret
+    pop r4
+    pop r3
+    WELSEMMS
+    ret
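
The register bookkeeping above (sum1..sum4, the "+2 >> 2" and "+4 >> 3" roundings, m_up/m_down) reduces to four DC values, one per 4x4 quadrant of the 8x8 chroma block: the top-left and bottom-right quadrants average eight neighbours, the other two only the four on their own edge. A C sketch with made-up names, reading the neighbours the same way the assembly does (top row at pPred - kiStride, left column at pPred - 1):

#include <stdint.h>
#include <string.h>

static void ChromaPredDc_sketch (uint8_t* pPred, int32_t kiStride) {
    const uint8_t* pTop = pPred - kiStride;
    uint32_t topL = 0, topR = 0, leftT = 0, leftB = 0;
    for (int i = 0; i < 4; i++) {
        topL  += pTop[i];                        /* top pixels 0..3   */
        topR  += pTop[4 + i];                    /* top pixels 4..7   */
        leftT += pPred[i * kiStride - 1];        /* l1..l4            */
        leftB += pPred[(4 + i) * kiStride - 1];  /* l5..l8            */
    }
    uint8_t dcTL = (uint8_t) ((topL + leftT + 4) >> 3);   /* sum1 in the asm */
    uint8_t dcTR = (uint8_t) ((topR + 2) >> 2);           /* sum2            */
    uint8_t dcBL = (uint8_t) ((leftB + 2) >> 2);          /* sum3            */
    uint8_t dcBR = (uint8_t) ((topR + leftB + 4) >> 3);   /* sum4            */
    for (int y = 0; y < 8; y++) {                /* m_up rows 0..3, m_down rows 4..7 */
        memset (pPred + y * kiStride,     y < 4 ? dcTL : dcBL, 4);
        memset (pPred + y * kiStride + 4, y < 4 ? dcTR : dcBR, 4);
    }
}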
 
 
 
@@ -1096,75 +1096,75 @@
 ;   void WelsDecoderI16x16LumaPredDc_sse2(uint8_t *pPred, const int32_t kiStride)
 ;*******************************************************************************
 WELS_EXTERN WelsDecoderI16x16LumaPredDc_sse2
-	push 	r3
-	push 	r4
-	%assign push_num 2
-	LOAD_2_PARA
-	SIGN_EXTENSION r1, r1d
-	mov r4, r0
-	sub         r0, r1
-	movdqa      xmm0, [r0]             ; read one row
-	pxor		xmm1, xmm1
-	psadbw		xmm0, xmm1
-	movdqa      xmm1, xmm0
-	psrldq      xmm1, 0x08
-	pslldq      xmm0, 0x08
-	psrldq      xmm0, 0x08
-	paddw       xmm0, xmm1
+    push    r3
+    push    r4
+    %assign push_num 2
+    LOAD_2_PARA
+    SIGN_EXTENSION r1, r1d
+    mov r4, r0
+    sub         r0, r1
+    movdqa      xmm0, [r0]             ; read one row
+    pxor        xmm1, xmm1
+    psadbw      xmm0, xmm1
+    movdqa      xmm1, xmm0
+    psrldq      xmm1, 0x08
+    pslldq      xmm0, 0x08
+    psrldq      xmm0, 0x08
+    paddw       xmm0, xmm1
 
-	movzx		r2, byte [r0+r1-0x01]
-	movzx		r3, byte [r0+2*r1-0x01]
-	add		r2, r3
-	lea    		r0, [r0+r1]
-	LOAD_2_LEFT_AND_ADD
-	LOAD_2_LEFT_AND_ADD
-	LOAD_2_LEFT_AND_ADD
-	LOAD_2_LEFT_AND_ADD
-	LOAD_2_LEFT_AND_ADD
-	LOAD_2_LEFT_AND_ADD
-	LOAD_2_LEFT_AND_ADD
-	add         r2, 0x10
-	movd        xmm1, r2d
-	paddw       xmm0, xmm1
-	psrld       xmm0, 0x05
-	pmuludq     xmm0, [mmx_01bytes]
-	pshufd      xmm0, xmm0, 0
+    movzx       r2, byte [r0+r1-0x01]
+    movzx       r3, byte [r0+2*r1-0x01]
+    add     r2, r3
+    lea         r0, [r0+r1]
+    LOAD_2_LEFT_AND_ADD
+    LOAD_2_LEFT_AND_ADD
+    LOAD_2_LEFT_AND_ADD
+    LOAD_2_LEFT_AND_ADD
+    LOAD_2_LEFT_AND_ADD
+    LOAD_2_LEFT_AND_ADD
+    LOAD_2_LEFT_AND_ADD
+    add         r2, 0x10
+    movd        xmm1, r2d
+    paddw       xmm0, xmm1
+    psrld       xmm0, 0x05
+    pmuludq     xmm0, [mmx_01bytes]
+    pshufd      xmm0, xmm0, 0
 
-	movdqa      [r4],       xmm0
-	movdqa      [r4+r1],   xmm0
-	movdqa      [r4+2*r1], xmm0
-	lea         r4,         [r4+2*r1]
+    movdqa      [r4],       xmm0
+    movdqa      [r4+r1],   xmm0
+    movdqa      [r4+2*r1], xmm0
+    lea         r4,         [r4+2*r1]
 
-	movdqa      [r4+r1],   xmm0
-	movdqa      [r4+2*r1], xmm0
-	lea         r4,         [r4+2*r1]
+    movdqa      [r4+r1],   xmm0
+    movdqa      [r4+2*r1], xmm0
+    lea         r4,         [r4+2*r1]
 
-	movdqa      [r4+r1],   xmm0
-	movdqa      [r4+2*r1], xmm0
-	lea         r4,         [r4+2*r1]
+    movdqa      [r4+r1],   xmm0
+    movdqa      [r4+2*r1], xmm0
+    lea         r4,         [r4+2*r1]
 
-	movdqa      [r4+r1],   xmm0
-	movdqa      [r4+2*r1], xmm0
-	lea         r4,         [r4+2*r1]
+    movdqa      [r4+r1],   xmm0
+    movdqa      [r4+2*r1], xmm0
+    lea         r4,         [r4+2*r1]
 
-	movdqa      [r4+r1],   xmm0
-	movdqa      [r4+2*r1], xmm0
-	lea         r4,         [r4+2*r1]
+    movdqa      [r4+r1],   xmm0
+    movdqa      [r4+2*r1], xmm0
+    lea         r4,         [r4+2*r1]
 
-	movdqa      [r4+r1],   xmm0
-	movdqa      [r4+2*r1], xmm0
-	lea         r4,         [r4+2*r1]
+    movdqa      [r4+r1],   xmm0
+    movdqa      [r4+2*r1], xmm0
+    lea         r4,         [r4+2*r1]
 
-	movdqa      [r4+r1],   xmm0
-	movdqa      [r4+2*r1], xmm0
-	lea         r4,         [r4+2*r1]
+    movdqa      [r4+r1],   xmm0
+    movdqa      [r4+2*r1], xmm0
+    lea         r4,         [r4+2*r1]
 
-	movdqa      [r4+r1],   xmm0
+    movdqa      [r4+r1],   xmm0
 
-	pop r4
-	pop r3
+    pop r4
+    pop r3
 
-	ret
+    ret
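
The 16x16 DC routine above computes one value from the 16 top and 16 left neighbours, rounds, and floods the block with it. A sketch (the function name is mine):

#include <stdint.h>
#include <string.h>

static void LumaPred16x16Dc_sketch (uint8_t* pPred, int32_t kiStride) {
    const uint8_t* pTop = pPred - kiStride;
    uint32_t sum = 16;                           /* rounding term ("add r2, 0x10") */
    for (int i = 0; i < 16; i++)
        sum += pTop[i] + pPred[i * kiStride - 1];
    uint8_t dc = (uint8_t) (sum >> 5);
    for (int y = 0; y < 16; y++)
        memset (pPred + y * kiStride, dc, 16);
}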
 
 ;*******************************************************************************
 ; for intra prediction as follows, 11/19/2010
@@ -1171,239 +1171,239 @@
 ;*******************************************************************************
 
 ;*******************************************************************************
-;	void WelsDecoderI16x16LumaPredDcTop_sse2(uint8_t *pPred, const int32_t kiStride)
+;   void WelsDecoderI16x16LumaPredDcTop_sse2(uint8_t *pPred, const int32_t kiStride)
 ;*******************************************************************************
 WELS_EXTERN WelsDecoderI16x16LumaPredDcTop_sse2
-	%assign push_num 0
-	LOAD_2_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION r1, r1d
-	mov r2, r0
-	sub r2, r1
-	movdqa xmm0, [r2]		; pPred-kiStride, top line
-	pxor xmm7, xmm7
-	psadbw xmm0, xmm7
-	movdqa xmm1, xmm0
-	psrldq xmm1, 8
-	paddw  xmm0, xmm1
-	xor r2, r2
-	movd r2d, xmm0
-	;movdqa xmm1, xmm0
-	;punpcklbw xmm0, xmm7
-	;punpckhbw xmm1, xmm7
+    %assign push_num 0
+    LOAD_2_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r1, r1d
+    mov r2, r0
+    sub r2, r1
+    movdqa xmm0, [r2]       ; pPred-kiStride, top line
+    pxor xmm7, xmm7
+    psadbw xmm0, xmm7
+    movdqa xmm1, xmm0
+    psrldq xmm1, 8
+    paddw  xmm0, xmm1
+    xor r2, r2
+    movd r2d, xmm0
+    ;movdqa xmm1, xmm0
+    ;punpcklbw xmm0, xmm7
+    ;punpckhbw xmm1, xmm7
 
-	;paddw xmm0, xmm1			; (ub.max(ff) << 4) will not excceed of uw, so can perform it in unit of unsigned word scope
-	;pshufd xmm1, xmm0, 04eh		; 01001110, w3w2w1w0,w7w6w5w4
-	;paddw xmm0, xmm1			; w3+7 w2+6 w1+5 w0+4 w3+7 w2+6 w1+5 w0+4
-	;pshufd xmm1, xmm0, 0b1h		; 10110001, w1+5 w0+4 w3+7 w2+6 w1+5 w0+4 w3+7 w2+6
-	;paddw xmm0, xmm1			; w_o w_e w_o w_e w_o w_e w_o w_e (w_o=1+3+5+7, w_e=0+2+4+6)
-	;pshuflw xmm1, xmm0, 0b1h	; 10110001
-	;paddw xmm0, xmm1			; sum in word unit (x8)
-	;xor r3, r3
-	;movd r3d, xmm0
-	;and edx, 0ffffh
+    ;paddw xmm0, xmm1           ; (ub.max(ff) << 4) will not excceed of uw, so can perform it in unit of unsigned word scope
+    ;pshufd xmm1, xmm0, 04eh        ; 01001110, w3w2w1w0,w7w6w5w4
+    ;paddw xmm0, xmm1           ; w3+7 w2+6 w1+5 w0+4 w3+7 w2+6 w1+5 w0+4
+    ;pshufd xmm1, xmm0, 0b1h        ; 10110001, w1+5 w0+4 w3+7 w2+6 w1+5 w0+4 w3+7 w2+6
+    ;paddw xmm0, xmm1           ; w_o w_e w_o w_e w_o w_e w_o w_e (w_o=1+3+5+7, w_e=0+2+4+6)
+    ;pshuflw xmm1, xmm0, 0b1h   ; 10110001
+    ;paddw xmm0, xmm1           ; sum in word unit (x8)
+    ;xor r3, r3
+    ;movd r3d, xmm0
+    ;and edx, 0ffffh
 
-	add r2, 8
-	sar r2, 4
-	SSE2_Copy16Times xmm1, r2d
-	;mov dh, dl
-	;mov r2, edx
-	;shl r2, 010h
-	;or edx, r2
-	;movd xmm1, edx
-	;pshufd xmm0, xmm1, 00h
-	;movdqa xmm1, xmm0
-	movdqa xmm0, xmm1
-	lea r2, [2*r1+r1]		; 3*kiStride
+    add r2, 8
+    sar r2, 4
+    SSE2_Copy16Times xmm1, r2d
+    ;mov dh, dl
+    ;mov r2, edx
+    ;shl r2, 010h
+    ;or edx, r2
+    ;movd xmm1, edx
+    ;pshufd xmm0, xmm1, 00h
+    ;movdqa xmm1, xmm0
+    movdqa xmm0, xmm1
+    lea r2, [2*r1+r1]       ; 3*kiStride
 
-	movdqa [r0], xmm0
-	movdqa [r0+r1], xmm1
-	movdqa [r0+2*r1], xmm0
-	movdqa [r0+r2], xmm1
+    movdqa [r0], xmm0
+    movdqa [r0+r1], xmm1
+    movdqa [r0+2*r1], xmm0
+    movdqa [r0+r2], xmm1
 
-	lea r0, [r0+4*r1]
-	movdqa [r0], xmm0
-	movdqa [r0+r1], xmm1
-	movdqa [r0+2*r1], xmm0
-	movdqa [r0+r2], xmm1
+    lea r0, [r0+4*r1]
+    movdqa [r0], xmm0
+    movdqa [r0+r1], xmm1
+    movdqa [r0+2*r1], xmm0
+    movdqa [r0+r2], xmm1
 
-	lea r0, [r0+4*r1]
-	movdqa [r0], xmm0
-	movdqa [r0+r1], xmm1
-	movdqa [r0+2*r1], xmm0
-	movdqa [r0+r2], xmm1
+    lea r0, [r0+4*r1]
+    movdqa [r0], xmm0
+    movdqa [r0+r1], xmm1
+    movdqa [r0+2*r1], xmm0
+    movdqa [r0+r2], xmm1
 
-	lea r0, [r0+4*r1]
-	movdqa [r0], xmm0
-	movdqa [r0+r1], xmm1
-	movdqa [r0+2*r1], xmm0
-	movdqa [r0+r2], xmm1
+    lea r0, [r0+4*r1]
+    movdqa [r0], xmm0
+    movdqa [r0+r1], xmm1
+    movdqa [r0+2*r1], xmm0
+    movdqa [r0+r2], xmm1
 
-	POP_XMM
-	ret
+    POP_XMM
+    ret
 
 ;*******************************************************************************
-;	void WelsDecoderI16x16LumaPredDcNA_sse2(uint8_t *pPred, const int32_t kiStride)
+;   void WelsDecoderI16x16LumaPredDcNA_sse2(uint8_t *pPred, const int32_t kiStride)
 ;*******************************************************************************
 WELS_EXTERN WelsDecoderI16x16LumaPredDcNA_sse2
-	%assign push_num 0
-	LOAD_2_PARA
-	SIGN_EXTENSION r1, r1d
-	lea r2, [2*r1+r1]		; 3*kiStride
+    %assign push_num 0
+    LOAD_2_PARA
+    SIGN_EXTENSION r1, r1d
+    lea r2, [2*r1+r1]       ; 3*kiStride
 
-	movdqa xmm0, [sse2_dc_0x80]
-	movdqa xmm1, xmm0
-	movdqa [r0], xmm0
-	movdqa [r0+r1], xmm1
-	movdqa [r0+2*r1], xmm0
-	movdqa [r0+r2], xmm1
-	lea r0, [r0+4*r1]
-	movdqa [r0], xmm0
-	movdqa [r0+r1], xmm1
-	movdqa [r0+2*r1], xmm0
-	movdqa [r0+r2], xmm1
-	lea r0, [r0+4*r1]
-	movdqa [r0], xmm0
-	movdqa [r0+r1], xmm1
-	movdqa [r0+2*r1], xmm0
-	movdqa [r0+r2], xmm1
-	lea r0, [r0+4*r1]
-	movdqa [r0], xmm0
-	movdqa [r0+r1], xmm1
-	movdqa [r0+2*r1], xmm0
-	movdqa [r0+r2], xmm1
+    movdqa xmm0, [sse2_dc_0x80]
+    movdqa xmm1, xmm0
+    movdqa [r0], xmm0
+    movdqa [r0+r1], xmm1
+    movdqa [r0+2*r1], xmm0
+    movdqa [r0+r2], xmm1
+    lea r0, [r0+4*r1]
+    movdqa [r0], xmm0
+    movdqa [r0+r1], xmm1
+    movdqa [r0+2*r1], xmm0
+    movdqa [r0+r2], xmm1
+    lea r0, [r0+4*r1]
+    movdqa [r0], xmm0
+    movdqa [r0+r1], xmm1
+    movdqa [r0+2*r1], xmm0
+    movdqa [r0+r2], xmm1
+    lea r0, [r0+4*r1]
+    movdqa [r0], xmm0
+    movdqa [r0+r1], xmm1
+    movdqa [r0+2*r1], xmm0
+    movdqa [r0+r2], xmm1
 
-	ret
+    ret
 
 ;*******************************************************************************
-;	void WelsDecoderIChromaPredDcLeft_mmx(uint8_t *pPred, const int32_t kiStride)
+;   void WelsDecoderIChromaPredDcLeft_mmx(uint8_t *pPred, const int32_t kiStride)
 ;*******************************************************************************
 WELS_EXTERN WelsDecoderIChromaPredDcLeft_mmx
-	push r3
-	push r4
-	%assign push_num 2
-	LOAD_2_PARA
-	SIGN_EXTENSION r1, r1d
-	mov r4, r0
-	; for left
-	dec r0
-	xor r2, r2
-	xor r3, r3
-	movzx r2, byte [r0]
-	movzx r3, byte [r0+r1]
-	add r2, r3
-	lea r0, [r0+2*r1]
-	movzx r3, byte [r0]
-	add r2, r3
-	movzx r3, byte [r0+r1]
-	add r2, r3
-	add r2, 02h
-	sar r2, 02h
-	;SSE2_Copy16Times mm0, r2d
-	mov r3, r2
-	sal r3, 8
-	or r2, r3
-	movd mm1, r2d
-	pshufw mm0, mm1, 00h
-	;mov bh, bl
-	;movd mm1, ebx
-	;pshufw mm0, mm1, 00h	; up64
-	movq mm1, mm0
-	xor r2, r2
-	lea r0, [r0+2*r1]
-	movzx r2, byte [r0]
-	movzx r3, byte [r0+r1]
-	add r2, r3
-	lea r0, [r0+2*r1]
-	movzx r3, byte [r0]
-	add r2, r3
-	movzx r3, byte [r0+r1]
-	add r2, r3
-	add r2, 02h
-	sar r2, 02h
-	mov r3, r2
-	sal r3, 8
-	or r2, r3
-	movd mm3, r2d
-	pshufw mm2, mm3, 00h
-	;mov bh, bl
-	;movd mm3, ebx
-	;pshufw mm2, mm3, 00h	; down64
-	;SSE2_Copy16Times mm2, r2d
-	movq mm3, mm2
-	lea r2, [2*r1+r1]
-	movq [r4], mm0
-	movq [r4+r1], mm1
-	movq [r4+2*r1], mm0
-	movq [r4+r2], mm1
-	lea r4, [r4+4*r1]
-	movq [r4], mm2
-	movq [r4+r1], mm3
-	movq [r4+2*r1], mm2
-	movq [r4+r2], mm3
-	pop r4
-	pop r3
-	emms
-	ret
+    push r3
+    push r4
+    %assign push_num 2
+    LOAD_2_PARA
+    SIGN_EXTENSION r1, r1d
+    mov r4, r0
+    ; for left
+    dec r0
+    xor r2, r2
+    xor r3, r3
+    movzx r2, byte [r0]
+    movzx r3, byte [r0+r1]
+    add r2, r3
+    lea r0, [r0+2*r1]
+    movzx r3, byte [r0]
+    add r2, r3
+    movzx r3, byte [r0+r1]
+    add r2, r3
+    add r2, 02h
+    sar r2, 02h
+    ;SSE2_Copy16Times mm0, r2d
+    mov r3, r2
+    sal r3, 8
+    or r2, r3
+    movd mm1, r2d
+    pshufw mm0, mm1, 00h
+    ;mov bh, bl
+    ;movd mm1, ebx
+    ;pshufw mm0, mm1, 00h   ; up64
+    movq mm1, mm0
+    xor r2, r2
+    lea r0, [r0+2*r1]
+    movzx r2, byte [r0]
+    movzx r3, byte [r0+r1]
+    add r2, r3
+    lea r0, [r0+2*r1]
+    movzx r3, byte [r0]
+    add r2, r3
+    movzx r3, byte [r0+r1]
+    add r2, r3
+    add r2, 02h
+    sar r2, 02h
+    mov r3, r2
+    sal r3, 8
+    or r2, r3
+    movd mm3, r2d
+    pshufw mm2, mm3, 00h
+    ;mov bh, bl
+    ;movd mm3, ebx
+    ;pshufw mm2, mm3, 00h   ; down64
+    ;SSE2_Copy16Times mm2, r2d
+    movq mm3, mm2
+    lea r2, [2*r1+r1]
+    movq [r4], mm0
+    movq [r4+r1], mm1
+    movq [r4+2*r1], mm0
+    movq [r4+r2], mm1
+    lea r4, [r4+4*r1]
+    movq [r4], mm2
+    movq [r4+r1], mm3
+    movq [r4+2*r1], mm2
+    movq [r4+r2], mm3
+    pop r4
+    pop r3
+    emms
+    ret
 
 ;*******************************************************************************
-;	void WelsDecoderIChromaPredDcTop_sse2(uint8_t *pPred, const int32_t kiStride)
+;   void WelsDecoderIChromaPredDcTop_sse2(uint8_t *pPred, const int32_t kiStride)
 ;*******************************************************************************
 WELS_EXTERN WelsDecoderIChromaPredDcTop_sse2
-	%assign push_num 0
-	LOAD_2_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION r1, r1d
-	mov r2, r0
-	sub r2, r1
-	movq xmm0, [r2]		; top: 8x1 pixels
-	pxor xmm7, xmm7
-	punpcklbw xmm0, xmm7		; ext 8x2 words
-	pshufd xmm1, xmm0, 0B1h		; 10110001 B, w5 w4 w7 w6 w1 w0 w3 w2
-	paddw xmm0, xmm1			; w5+7 w4+6 w5+7 w4+6 w1+3 w0+2 w1+3 w0+2
-	movdqa xmm1, xmm0
-	pshuflw xmm2, xmm0, 0B1h	; 10110001 B, .. w0+2 w1+3 w0+2 w1+3
-	pshufhw xmm3, xmm1, 0B1h	; 10110001 B, w4+6 w5+7 w4+6 w5+7 ..
-	paddw xmm0, xmm2			; .. w0+..+3 w0+..+3 w0+..+3 w0+..+3
-	paddw xmm1, xmm3			; w4+..+7 w4+..+7 w4+..+7 w4+..+7 ..
-	punpckhqdq xmm1, xmm7
-	punpcklqdq xmm0, xmm1		; sum1 sum1 sum1 sum1 sum0 sum0 sum0 sum0
-	movdqa xmm6, [sse2_wd_0x02]
-	paddw xmm0, xmm6
-	psraw xmm0, 02h
-	packuswb xmm0, xmm7
-	lea r2, [2*r1+r1]
-	movq [r0], xmm0
-	movq [r0+r1], xmm0
-	movq [r0+2*r1], xmm0
-	movq [r0+r2], xmm0
-	lea r0, [r0+4*r1]
-	movq [r0], xmm0
-	movq [r0+r1], xmm0
-	movq [r0+2*r1], xmm0
-	movq [r0+r2], xmm0
-	POP_XMM
-	ret
+    %assign push_num 0
+    LOAD_2_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r1, r1d
+    mov r2, r0
+    sub r2, r1
+    movq xmm0, [r2]     ; top: 8x1 pixels
+    pxor xmm7, xmm7
+    punpcklbw xmm0, xmm7        ; ext 8x2 words
+    pshufd xmm1, xmm0, 0B1h     ; 10110001 B, w5 w4 w7 w6 w1 w0 w3 w2
+    paddw xmm0, xmm1            ; w5+7 w4+6 w5+7 w4+6 w1+3 w0+2 w1+3 w0+2
+    movdqa xmm1, xmm0
+    pshuflw xmm2, xmm0, 0B1h    ; 10110001 B, .. w0+2 w1+3 w0+2 w1+3
+    pshufhw xmm3, xmm1, 0B1h    ; 10110001 B, w4+6 w5+7 w4+6 w5+7 ..
+    paddw xmm0, xmm2            ; .. w0+..+3 w0+..+3 w0+..+3 w0+..+3
+    paddw xmm1, xmm3            ; w4+..+7 w4+..+7 w4+..+7 w4+..+7 ..
+    punpckhqdq xmm1, xmm7
+    punpcklqdq xmm0, xmm1       ; sum1 sum1 sum1 sum1 sum0 sum0 sum0 sum0
+    movdqa xmm6, [sse2_wd_0x02]
+    paddw xmm0, xmm6
+    psraw xmm0, 02h
+    packuswb xmm0, xmm7
+    lea r2, [2*r1+r1]
+    movq [r0], xmm0
+    movq [r0+r1], xmm0
+    movq [r0+2*r1], xmm0
+    movq [r0+r2], xmm0
+    lea r0, [r0+4*r1]
+    movq [r0], xmm0
+    movq [r0+r1], xmm0
+    movq [r0+2*r1], xmm0
+    movq [r0+r2], xmm0
+    POP_XMM
+    ret
 
 ;*******************************************************************************
-;	void WelsDecoderIChromaPredDcNA_mmx(uint8_t *pPred, const int32_t kiStride)
+;   void WelsDecoderIChromaPredDcNA_mmx(uint8_t *pPred, const int32_t kiStride)
 ;*******************************************************************************
 WELS_EXTERN WelsDecoderIChromaPredDcNA_mmx
-	%assign push_num 0
-	LOAD_2_PARA
-	SIGN_EXTENSION r1, r1d
-	lea r2, [2*r1+r1]
-	movq mm0, [sse2_dc_0x80]
-	movq mm1, mm0
-	movq [r0], mm0
-	movq [r0+r1], mm1
-	movq [r0+2*r1], mm0
-	movq [r0+r2], mm1
-	lea r0, [r0+4*r1]
-	movq [r0], mm0
-	movq [r0+r1], mm1
-	movq [r0+2*r1], mm0
-	movq [r0+r2], mm1
-	emms
-	ret
+    %assign push_num 0
+    LOAD_2_PARA
+    SIGN_EXTENSION r1, r1d
+    lea r2, [2*r1+r1]
+    movq mm0, [sse2_dc_0x80]
+    movq mm1, mm0
+    movq [r0], mm0
+    movq [r0+r1], mm1
+    movq [r0+2*r1], mm0
+    movq [r0+r2], mm1
+    lea r0, [r0+4*r1]
+    movq [r0], mm0
+    movq [r0+r1], mm1
+    movq [r0+2*r1], mm0
+    movq [r0+r2], mm1
+    emms
+    ret
 
--- a/codec/encoder/core/arm/intra_pred_neon.S
+++ b/codec/encoder/core/arm/intra_pred_neon.S
@@ -38,107 +38,107 @@
 #ifdef __APPLE__
 //Global macro
 .macro GET_8BYTE_DATA
-	vld1.8 {$0[0]}, [$1], $2
-	vld1.8 {$0[1]}, [$1], $2
-	vld1.8 {$0[2]}, [$1], $2
-	vld1.8 {$0[3]}, [$1], $2
-	vld1.8 {$0[4]}, [$1], $2
-	vld1.8 {$0[5]}, [$1], $2
-	vld1.8 {$0[6]}, [$1], $2
-	vld1.8 {$0[7]}, [$1], $2
+    vld1.8 {$0[0]}, [$1], $2
+    vld1.8 {$0[1]}, [$1], $2
+    vld1.8 {$0[2]}, [$1], $2
+    vld1.8 {$0[3]}, [$1], $2
+    vld1.8 {$0[4]}, [$1], $2
+    vld1.8 {$0[5]}, [$1], $2
+    vld1.8 {$0[6]}, [$1], $2
+    vld1.8 {$0[7]}, [$1], $2
 .endm
 #else
 //Global macro
 .macro GET_8BYTE_DATA arg0, arg1, arg2
-	vld1.8 {\arg0[0]}, [\arg1], \arg2
-	vld1.8 {\arg0[1]}, [\arg1], \arg2
-	vld1.8 {\arg0[2]}, [\arg1], \arg2
-	vld1.8 {\arg0[3]}, [\arg1], \arg2
-	vld1.8 {\arg0[4]}, [\arg1], \arg2
-	vld1.8 {\arg0[5]}, [\arg1], \arg2
-	vld1.8 {\arg0[6]}, [\arg1], \arg2
-	vld1.8 {\arg0[7]}, [\arg1], \arg2
+    vld1.8 {\arg0[0]}, [\arg1], \arg2
+    vld1.8 {\arg0[1]}, [\arg1], \arg2
+    vld1.8 {\arg0[2]}, [\arg1], \arg2
+    vld1.8 {\arg0[3]}, [\arg1], \arg2
+    vld1.8 {\arg0[4]}, [\arg1], \arg2
+    vld1.8 {\arg0[5]}, [\arg1], \arg2
+    vld1.8 {\arg0[6]}, [\arg1], \arg2
+    vld1.8 {\arg0[7]}, [\arg1], \arg2
 .endm
 #endif
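
The GET_8BYTE_DATA macro above gathers a strided column of eight bytes into one d register, one lane per load; in C terms it is roughly (names mine):

#include <stdint.h>

static void Get8ByteColumn_sketch (uint8_t dst[8], const uint8_t* pSrc, int32_t kiStride) {
    for (int i = 0; i < 8; i++)
        dst[i] = pSrc[i * kiStride];             /* one byte per picture row */
}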
 
 
 WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredV_neon
-	//Get the top line data to 'q0'
-	sub  r3, r1, r2
-	vldm r3, {d0, d1}
+    //Get the top line data to 'q0'
+    sub  r3, r1, r2
+    vldm r3, {d0, d1}
 
-	//mov  r2, #16
-	mov  r3, #4
-	//Set the top line to the each line of MB(16*16)
+    //mov  r2, #16
+    mov  r3, #4
+    //Set the top line to the each line of MB(16*16)
 loop_0_get_i16x16_luma_pred_v:
-	vst1.8 {d0,d1}, [r0]!
-	vst1.8 {d0,d1}, [r0]!
-	vst1.8 {d0,d1}, [r0]!
-	vst1.8 {d0,d1}, [r0]!
-	subs  r3, #1
-	bne  loop_0_get_i16x16_luma_pred_v
+    vst1.8 {d0,d1}, [r0]!
+    vst1.8 {d0,d1}, [r0]!
+    vst1.8 {d0,d1}, [r0]!
+    vst1.8 {d0,d1}, [r0]!
+    subs  r3, #1
+    bne  loop_0_get_i16x16_luma_pred_v
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredH_neon
     //stmdb sp!, {r4, lr}
-	sub  r1, r1, #1
-	mov  r3, #4
+    sub  r1, r1, #1
+    mov  r3, #4
 loop_0_get_i16x16_luma_pred_h:
-	//Get one byte data from left side
-	vld1.8 {d0[],d1[]}, [r1], r2
-	vld1.8 {d2[],d3[]}, [r1], r2
-	vld1.8 {d4[],d5[]}, [r1], r2
-	vld1.8 {d6[],d7[]}, [r1], r2
+    //Get one byte data from left side
+    vld1.8 {d0[],d1[]}, [r1], r2
+    vld1.8 {d2[],d3[]}, [r1], r2
+    vld1.8 {d4[],d5[]}, [r1], r2
+    vld1.8 {d6[],d7[]}, [r1], r2
 
-	//Set the line of MB using the left side byte data
-	vst1.8 {d0,d1}, [r0]!
-	//add r0, #16
-	vst1.8 {d2,d3}, [r0]!
-	//add r0, #16
-	vst1.8 {d4,d5}, [r0]!
-	//add r0, #16
-	vst1.8 {d6,d7}, [r0]!
-	//add r0, #16
+    //Set the line of MB using the left side byte data
+    vst1.8 {d0,d1}, [r0]!
+    //add r0, #16
+    vst1.8 {d2,d3}, [r0]!
+    //add r0, #16
+    vst1.8 {d4,d5}, [r0]!
+    //add r0, #16
+    vst1.8 {d6,d7}, [r0]!
+    //add r0, #16
 
-	subs  r3, #1
-	bne  loop_0_get_i16x16_luma_pred_h
+    subs  r3, #1
+    bne  loop_0_get_i16x16_luma_pred_h
 
 WELS_ASM_FUNC_END
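
Unlike the decoder routines earlier in this patch, the encoder-side NEON functions write into a packed 16-byte-pitch prediction buffer rather than back into the picture. Judging from the register use above (r0 destination, r1 reconstructed reference, r2 stride), the horizontal predictor amounts to the following sketch; the argument roles are inferred, not quoted from a header:

#include <stdint.h>
#include <string.h>

/* Each row of the packed 16x16 prediction buffer is filled with the
 * reconstructed pixel immediately to its left (pRef - 1, stepped by stride). */
static void LumaPred16x16H_sketch (uint8_t* pPred, const uint8_t* pRef, int32_t kiStride) {
    for (int y = 0; y < 16; y++)
        memset (pPred + 16 * y, pRef[y * kiStride - 1], 16);
}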
 
 
 WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredDc_neon
-	//stmdb sp!, { r2-r5, lr}
-	//Get the left vertical line data
-	sub r3, r1, #1
-	GET_8BYTE_DATA d0, r3, r2
-	GET_8BYTE_DATA d1, r3, r2
+    //stmdb sp!, { r2-r5, lr}
+    //Get the left vertical line data
+    sub r3, r1, #1
+    GET_8BYTE_DATA d0, r3, r2
+    GET_8BYTE_DATA d1, r3, r2
 
-	//Get the top horizontal line data
-	sub  r3, r1, r2
-	vldm r3, {d2, d3}
+    //Get the top horizontal line data
+    sub  r3, r1, r2
+    vldm r3, {d2, d3}
 
-	//Calculate the sum of top horizontal line data and vertical line data
-	vpaddl.u8 q0, q0
-	vpaddl.u8 q1, q1
-	vadd.u16  q0, q0, q1
-	vadd.u16  d0, d0, d1
-	vpaddl.u16 d0, d0
-	vpaddl.u32 d0, d0
+    //Calculate the sum of top horizontal line data and vertical line data
+    vpaddl.u8 q0, q0
+    vpaddl.u8 q1, q1
+    vadd.u16  q0, q0, q1
+    vadd.u16  d0, d0, d1
+    vpaddl.u16 d0, d0
+    vpaddl.u32 d0, d0
 
-	//Calculate the mean value
-	vrshr.u16  d0, d0, #5
-	vdup.8     q0, d0[0]
+    //Calculate the mean value
+    vrshr.u16  d0, d0, #5
+    vdup.8     q0, d0[0]
 
-	//Set the mean value to the all of member of MB
-	mov  r3, #4
+    //Set the mean value to the all of member of MB
+    mov  r3, #4
 loop_0_get_i16x16_luma_pred_dc_both:
-	vst1.8 {d0,d1}, [r0]!
-	vst1.8 {d0,d1}, [r0]!
-	vst1.8 {d0,d1}, [r0]!
-	vst1.8 {d0,d1}, [r0]!
-	subs  r3, #1
-	bne  loop_0_get_i16x16_luma_pred_dc_both
+    vst1.8 {d0,d1}, [r0]!
+    vst1.8 {d0,d1}, [r0]!
+    vst1.8 {d0,d1}, [r0]!
+    vst1.8 {d0,d1}, [r0]!
+    subs  r3, #1
+    bne  loop_0_get_i16x16_luma_pred_dc_both
 
 WELS_ASM_FUNC_END
 
@@ -151,383 +151,383 @@
 
 
 WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredPlane_neon
-	//stmdb sp!, { r4, lr}
+    //stmdb sp!, { r4, lr}
 
-	//Load the table {(8,7,6,5,4,3,2,1) * 5}
-	adr r3, CONST0_GET_I16X16_LUMA_PRED_PLANE
-	vldr    d0, [r3]
+    //Load the table {(8,7,6,5,4,3,2,1) * 5}
+    adr r3, CONST0_GET_I16X16_LUMA_PRED_PLANE
+    vldr    d0, [r3]
 
-	//Pack the top[-1] ~ top[6] to d1
-	sub       r3,  r1, r2
-	sub       r1,  r3, #1
-	vld1.8    d1, [r1]
+    //Pack the top[-1] ~ top[6] to d1
+    sub       r3,  r1, r2
+    sub       r1,  r3, #1
+    vld1.8    d1, [r1]
 
-	//Pack the top[8] ~ top[15] to d2
-	add       r1, #9
-	vld1.8    d2, [r1]
+    //Pack the top[8] ~ top[15] to d2
+    add       r1, #9
+    vld1.8    d2, [r1]
 
-	//Save the top[15] to d6 for next step
-	vdup.u8   d6,   d2[7]
+    //Save the top[15] to d6 for next step
+    vdup.u8   d6,   d2[7]
 
-	//Get and pack left[-1] ~ left[6] to d4
-	sub       r1,  r3, #1
-	GET_8BYTE_DATA d4, r1, r2
+    //Get and pack left[-1] ~ left[6] to d4
+    sub       r1,  r3, #1
+    GET_8BYTE_DATA d4, r1, r2
 
-	//Get and pack left[8] ~ left[15] to d3
-	add       r1,  r2
-	GET_8BYTE_DATA d3, r1, r2
+    //Get and pack left[8] ~ left[15] to d3
+    add       r1,  r2
+    GET_8BYTE_DATA d3, r1, r2
 
-	//Save the left[15] to d7 for next step
-	vdup.u8   d7,   d3[7]
+    //Save the left[15] to d7 for next step
+    vdup.u8   d7,   d3[7]
 
-	//revert the sequence of d2,d3
-	vrev64.8   q1, q1
+    //revert the sequence of d2,d3
+    vrev64.8   q1, q1
 
-	vsubl.u8   q2, d3, d4 //q2={left[8]-left[6],left[9]-left[5],left[10]-left[4], ...}
-	vsubl.u8   q1, d2, d1 //q1={top[8]-top[6],top[9]-top[5],top[10]-top[4], ...}
+    vsubl.u8   q2, d3, d4 //q2={left[8]-left[6],left[9]-left[5],left[10]-left[4], ...}
+    vsubl.u8   q1, d2, d1 //q1={top[8]-top[6],top[9]-top[5],top[10]-top[4], ...}
 
 
-	vmovl.u8   q0, d0
-	vmul.s16   q1, q0, q1 //q1 = q1*{(8,7,6,5,4,3,2,1) * 5}
-	vmul.s16   q2, q0, q2 //q2 = q2*{(8,7,6,5,4,3,2,1) * 5}
+    vmovl.u8   q0, d0
+    vmul.s16   q1, q0, q1 //q1 = q1*{(8,7,6,5,4,3,2,1) * 5}
+    vmul.s16   q2, q0, q2 //q2 = q2*{(8,7,6,5,4,3,2,1) * 5}
 
-	//Calculate the sum of items of q1, q2
-	vpadd.s16  d0, d2, d3
-	vpadd.s16  d1, d4, d5
-	vpaddl.s16 q0, q0
-	vpaddl.s32 q0, q0
+    //Calculate the sum of items of q1, q2
+    vpadd.s16  d0, d2, d3
+    vpadd.s16  d1, d4, d5
+    vpaddl.s16 q0, q0
+    vpaddl.s32 q0, q0
 
-	//Get the value of 'b', 'c' and extend to q1, q2.
-	vrshr.s64  q0, #6
-	vdup.s16   q1, d0[0]
-	vdup.s16   q2, d1[0]
+    //Get the value of 'b', 'c' and extend to q1, q2.
+    vrshr.s64  q0, #6
+    vdup.s16   q1, d0[0]
+    vdup.s16   q2, d1[0]
 
-	//Load the table {-7,-6,-5,-4,-3,-2,-1,0} to d0
-	adr r3, CONST1_GET_I16X16_LUMA_PRED_PLANE
-	vld1.32   {d0}, [r3]
+    //Load the table {-7,-6,-5,-4,-3,-2,-1,0} to d0
+    adr r3, CONST1_GET_I16X16_LUMA_PRED_PLANE
+    vld1.32   {d0}, [r3]
 
-	//Get the value of 'a' and save to q3
-	vaddl.u8  q3, d6, d7
-	vshl.u16  q3, #4
+    //Get the value of 'a' and save to q3
+    vaddl.u8  q3, d6, d7
+    vshl.u16  q3, #4
 
-	//calculate a+'b'*{-7,-6,-5,-4,-3,-2,-1,0} + c*{-7}
-	vmovl.s8  q0, d0
-	vmla.s16  q3, q0, q1
-	vmla.s16  q3, q2, d0[0]
+    //calculate a+'b'*{-7,-6,-5,-4,-3,-2,-1,0} + c*{-7}
+    vmovl.s8  q0, d0
+    vmla.s16  q3, q0, q1
+    vmla.s16  q3, q2, d0[0]
 
-	//Calculate a+'b'*{1,2,3,4,5,6,7,8} + c*{-7}
-	vshl.s16  q8, q1, #3
-	vadd.s16  q8, q3
+    //Calculate a+'b'*{1,2,3,4,5,6,7,8} + c*{-7}
+    vshl.s16  q8, q1, #3
+    vadd.s16  q8, q3
 
-	//right shift 5 bits and rounding
-	vqrshrun.s16 d0, q3, #5
-	vqrshrun.s16 d1, q8, #5
+    //right shift 5 bits and rounding
+    vqrshrun.s16 d0, q3, #5
+    vqrshrun.s16 d1, q8, #5
 
-	//Set the line of MB
-	vst1.u32  {d0,d1}, [r0]!
+    //Set the line of MB
+    vst1.u32  {d0,d1}, [r0]!
 
 
-	//Do the same processing for setting other lines
-	mov  r3, #15
+    //Do the same processing for setting other lines
+    mov  r3, #15
 loop_0_get_i16x16_luma_pred_plane:
-	vadd.s16  q3, q2
-	vadd.s16  q8, q2
-	vqrshrun.s16 d0, q3, #5
-	vqrshrun.s16 d1, q8, #5
-	vst1.u32  {d0,d1}, [r0]!
-	subs  r3, #1
-	bne  loop_0_get_i16x16_luma_pred_plane
+    vadd.s16  q3, q2
+    vadd.s16  q8, q2
+    vqrshrun.s16 d0, q3, #5
+    vqrshrun.s16 d1, q8, #5
+    vst1.u32  {d0,d1}, [r0]!
+    subs  r3, #1
+    bne  loop_0_get_i16x16_luma_pred_plane
 
 WELS_ASM_FUNC_END
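
The routine above is the standard 16x16 plane predictor: the {(8,7,...,1) * 5} table and the >>6 fold the usual b = (5*H + 32) >> 6 into one step, 'a' is 16*(top[15] + left[15]), and every output pixel is (a + b*(x-7) + c*(y-7) + 16) >> 5 with saturation, which is what vqrshrun performs. A C sketch; laying the neighbour arrays out with index 0 as the corner pixel (top[-1]/left[-1] in the comments) is my convention:

#include <stdint.h>

static uint8_t Clip255 (int32_t v) {
    return (uint8_t) (v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* top[0] is the corner pixel, top[1..16] the row above the block;
 * left[] is laid out the same way down the left edge.              */
static void LumaPred16x16Plane_sketch (uint8_t pred[16][16],
                                       const uint8_t top[17], const uint8_t left[17]) {
    int32_t iH = 0, iV = 0;
    for (int i = 1; i <= 8; i++) {
        iH += i * (top[8 + i]  - top[8 - i]);
        iV += i * (left[8 + i] - left[8 - i]);
    }
    int32_t iA = 16 * (top[16] + left[16]);      /* 16 * (top[15] + left[15])  */
    int32_t iB = (5 * iH + 32) >> 6;             /* the *5 table plus vrshr #6 */
    int32_t iC = (5 * iV + 32) >> 6;
    for (int y = 0; y < 16; y++)
        for (int x = 0; x < 16; x++)
            pred[y][x] = Clip255 ((iA + iB * (x - 7) + iC * (y - 7) + 16) >> 5);
}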
 
 
 WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredV_neon
-	//stmdb sp!, { r2-r5, lr}
-	//Load the top row (4 bytes)
-	sub  r3, r1, r2
-	ldr  r3, [r3]
+    //stmdb sp!, { r2-r5, lr}
+    //Load the top row (4 bytes)
+    sub  r3, r1, r2
+    ldr  r3, [r3]
 
-	//Set the luma MB using top line
-	str  r3, [r0], #4
-	str  r3, [r0], #4
-	str  r3, [r0], #4
-	str  r3, [r0]
+    //Set the luma MB using top line
+    str  r3, [r0], #4
+    str  r3, [r0], #4
+    str  r3, [r0], #4
+    str  r3, [r0]
 
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredH_neon
-	//stmdb sp!, { r2-r5, lr}
-	//Load the left column (4 bytes)
-	sub  r3, r1, #1
-	vld1.8 {d0[]}, [r3], r2
-	vld1.8 {d1[]}, [r3], r2
-	vld1.8 {d2[]}, [r3], r2
-	vld1.8 {d3[]}, [r3]
+    //stmdb sp!, { r2-r5, lr}
+    //Load the left column (4 bytes)
+    sub  r3, r1, #1
+    vld1.8 {d0[]}, [r3], r2
+    vld1.8 {d1[]}, [r3], r2
+    vld1.8 {d2[]}, [r3], r2
+    vld1.8 {d3[]}, [r3]
 
-	//Set the luma MB using the left side byte
-	vst1.32 {d0[0]}, [r0]!
-	vst1.32 {d1[0]}, [r0]!
-	vst1.32 {d2[0]}, [r0]!
-	vst1.32 {d3[0]}, [r0]
+    //Set the luma MB using the left side byte
+    vst1.32 {d0[0]}, [r0]!
+    vst1.32 {d1[0]}, [r0]!
+    vst1.32 {d2[0]}, [r0]!
+    vst1.32 {d3[0]}, [r0]
 
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredDDL_neon
-	//stmdb sp!, { r2-r5, lr}
-	//Load the top row data(8 bytes)
-	sub    r3,  r1, r2
-	vld1.32  {d0}, [r3]
+    //stmdb sp!, { r2-r5, lr}
+    //Load the top row data(8 bytes)
+    sub    r3,  r1, r2
+    vld1.32  {d0}, [r3]
 
-	//For "t7 + (t7<<1)"
-	vdup.8   d1,  d0[7]
+    //For "t7 + (t7<<1)"
+    vdup.8   d1,  d0[7]
 
-	//calculate "t0+t1,t1+t2,t2+t3...t6+t7,t7+t7"
-	vext.8   d1,  d0, d1, #1
-	vaddl.u8 q1,  d1, d0
+    //calculate "t0+t1,t1+t2,t2+t3...t6+t7,t7+t7"
+    vext.8   d1,  d0, d1, #1
+    vaddl.u8 q1,  d1, d0
 
-	//calculate "x,t0+t1+t1+t2,t1+t2+t2+t3,...t5+t6+t6+t7,t6+t7+t7+t7"
-	vext.8   q2,  q1, q1, #14
-	vadd.u16 q0,  q1, q2
+    //calculate "x,t0+t1+t1+t2,t1+t2+t2+t3,...t5+t6+t6+t7,t6+t7+t7+t7"
+    vext.8   q2,  q1, q1, #14
+    vadd.u16 q0,  q1, q2
 
-	//right shift 2 bits and rounding
-	vqrshrn.u16  d0,  q0, #2
+    //right shift 2 bits and rounding
+    vqrshrn.u16  d0,  q0, #2
 
-	//Save "ddl0, ddl1, ddl2, ddl3"
-	vext.8   d1, d0, d0, #1
-	vst1.32  d1[0], [r0]!
+    //Save "ddl0, ddl1, ddl2, ddl3"
+    vext.8   d1, d0, d0, #1
+    vst1.32  d1[0], [r0]!
 
-	//Save "ddl1, ddl2, ddl3, ddl4"
-	vext.8   d1, d0, d0, #2
-	vst1.32  d1[0], [r0]!
+    //Save "ddl1, ddl2, ddl3, ddl4"
+    vext.8   d1, d0, d0, #2
+    vst1.32  d1[0], [r0]!
 
-	//Save "ddl2, ddl3, ddl4, ddl5"
-	vext.8   d1, d0, d0, #3
-	vst1.32  d1[0], [r0]!
+    //Save "ddl2, ddl3, ddl4, ddl5"
+    vext.8   d1, d0, d0, #3
+    vst1.32  d1[0], [r0]!
 
-	//Save "ddl3, ddl4, ddl5, ddl6"
-	vst1.32  d0[1], [r0]
+    //Save "ddl3, ddl4, ddl5, ddl6"
+    vst1.32  d0[1], [r0]
 
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredDDR_neon
-	//stmdb sp!, { r2-r5, lr}
-	//Load the top row (4 bytes)
-	sub    r3,  r1, r2
-	vld1.32  {d0[1]}, [r3]
+    //stmdb sp!, { r2-r5, lr}
+    //Load the top row (4 bytes)
+    sub    r3,  r1, r2
+    vld1.32  {d0[1]}, [r3]
 
-	//Load the left column (5 bytes)
-	sub    r3,  #1
-	vld1.8 {d0[3]}, [r3], r2
-	vld1.8 {d0[2]}, [r3], r2
-	vld1.8 {d0[1]}, [r3], r2
-	vld1.8 {d0[0]}, [r3], r2
-	vld1.8 {d1[7]}, [r3] //For packing the right sequence to do SIMD processing
+    //Load the left column (5 bytes)
+    sub    r3,  #1
+    vld1.8 {d0[3]}, [r3], r2
+    vld1.8 {d0[2]}, [r3], r2
+    vld1.8 {d0[1]}, [r3], r2
+    vld1.8 {d0[0]}, [r3], r2
+    vld1.8 {d1[7]}, [r3] //For packing the right sequence to do SIMD processing
 
 
-	vext.8   d2, d1, d0, #7   //d0:{L2,L1,L0,LT,T0,T1,T2,T3}
-	                          //d2:{L3,L2,L1,L0,LT,T0,T1,T2}
+    vext.8   d2, d1, d0, #7   //d0:{L2,L1,L0,LT,T0,T1,T2,T3}
+                              //d2:{L3,L2,L1,L0,LT,T0,T1,T2}
 
-	//q2:{L2+L3,L1+L2,L0+L1...T1+T2,T2+T3}
-	vaddl.u8 q2, d2, d0
+    //q2:{L2+L3,L1+L2,L0+L1...T1+T2,T2+T3}
+    vaddl.u8 q2, d2, d0
 
-	//q1:{TL0+LT0,LT0+T01,...L12+L23}
-	vext.8   q3, q3, q2, #14
-	vadd.u16 q1, q2, q3
+    //q1:{TL0+LT0,LT0+T01,...L12+L23}
+    vext.8   q3, q3, q2, #14
+    vadd.u16 q1, q2, q3
 
-	//right shift 2 bits and rounding
-	vqrshrn.u16 d0, q1, #2
+    //right shift 2 bits and rounding
+    vqrshrn.u16 d0, q1, #2
 
-	//Adjust the data sequence for setting luma MB of 'pred'
-	vst1.32   d0[1], [r0]!
-	vext.8    d0, d0, d0, #7
-	vst1.32   d0[1], [r0]!
-	vext.8    d0, d0, d0, #7
-	vst1.32   d0[1], [r0]!
-	vext.8    d0, d0, d0, #7
-	vst1.32   d0[1], [r0]
+    //Adjust the data sequence for setting luma MB of 'pred'
+    vst1.32   d0[1], [r0]!
+    vext.8    d0, d0, d0, #7
+    vst1.32   d0[1], [r0]!
+    vext.8    d0, d0, d0, #7
+    vst1.32   d0[1], [r0]!
+    vext.8    d0, d0, d0, #7
+    vst1.32   d0[1], [r0]
 
 WELS_ASM_FUNC_END
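
The DDR routine above packs the nine neighbours as {L3,L2,L1,L0,LT,T0,T1,T2,T3}, runs the 3-tap filter across them to get seven values, and writes each row as the previous one shifted one step back toward the left column. The equivalent C under that reading (array and function names are mine):

#include <stdint.h>

static void LumaPred4x4DDR_sketch (uint8_t pred[4][4],
                                   const uint8_t top[4], uint8_t lt,
                                   const uint8_t left[4]) {
    /* Neighbours in the order the NEON code combines them in d0 and d2. */
    uint8_t s[9] = { left[3], left[2], left[1], left[0], lt,
                     top[0], top[1], top[2], top[3] };
    uint8_t f[7];
    for (int i = 0; i < 7; i++)
        f[i] = (uint8_t) ((s[i] + 2 * s[i + 1] + s[i + 2] + 2) >> 2);
    for (int y = 0; y < 4; y++)
        for (int x = 0; x < 4; x++)
            pred[y][x] = f[3 - y + x];   /* row 0 = f[3..6], row 3 = f[0..3] */
}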
 
 
 WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredVL_neon
-	//stmdb sp!, { r2-r5, lr}
-	//Load the top row (8 bytes)
-	sub    r3,  r1, r2
-	vld1.32  {d0}, [r3]
+    //stmdb sp!, { r2-r5, lr}
+    //Load the top row (8 bytes)
+    sub    r3,  r1, r2
+    vld1.32  {d0}, [r3]
 
 
-	vext.8   d1,  d0, d0, #1
-	vaddl.u8 q1,  d1, d0     //q1:{t0+t1,t1+t2,t2+t3...t5+t6,x,x}
+    vext.8   d1,  d0, d0, #1
+    vaddl.u8 q1,  d1, d0     //q1:{t0+t1,t1+t2,t2+t3...t5+t6,x,x}
 
-	vext.8   q2,  q1, q1, #2
-	vadd.u16 q2,  q1, q2     //q2:{t0+t1+t1+t2,t1+t2+t2+t3,...t4+t5+t5+t6,x,x}
+    vext.8   q2,  q1, q1, #2
+    vadd.u16 q2,  q1, q2     //q2:{t0+t1+t1+t2,t1+t2+t2+t3,...t4+t5+t5+t6,x,x}
 
-	//calculate the "vl0,vl1,vl2,vl3,vl4"
-	vqrshrn.u16  d0,  q1, #1
+    //calculate the "vl0,vl1,vl2,vl3,vl4"
+    vqrshrn.u16  d0,  q1, #1
 
-	//calculate the "vl5,vl6,vl7,vl8,vl9"
-	vqrshrn.u16  d1,  q2, #2
+    //calculate the "vl5,vl6,vl7,vl8,vl9"
+    vqrshrn.u16  d1,  q2, #2
 
-	//Adjust the data sequence for setting the luma MB
-	vst1.32  d0[0], [r0]!
-	vst1.32  d1[0], [r0]!
-	vext.8   d0,  d0, d0, #1
-	vext.8   d1,  d1, d1, #1
-	vst1.32  d0[0], [r0]!
-	vst1.32  d1[0], [r0]
+    //Adjust the data sequence for setting the luma MB
+    vst1.32  d0[0], [r0]!
+    vst1.32  d1[0], [r0]!
+    vext.8   d0,  d0, d0, #1
+    vext.8   d1,  d1, d1, #1
+    vst1.32  d0[0], [r0]!
+    vst1.32  d1[0], [r0]
 
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredVR_neon
-	//stmdb sp!, { r2-r5, lr}
-	//Load the top row (4 bytes)
-	sub       r3,  r1, r2
-	vld1.32   {d0[1]}, [r3]
+    //stmdb sp!, { r2-r5, lr}
+    //Load the top row (4 bytes)
+    sub       r3,  r1, r2
+    vld1.32   {d0[1]}, [r3]
 
-	//Load the left column (4 bytes)
-	sub       r3,  #1
-	vld1.8    {d0[3]}, [r3], r2
-	vld1.8    {d0[2]}, [r3], r2
-	vld1.8    {d0[1]}, [r3], r2
-	vld1.8    {d0[0]}, [r3]
+    //Load the left column (4 bytes)
+    sub       r3,  #1
+    vld1.8    {d0[3]}, [r3], r2
+    vld1.8    {d0[2]}, [r3], r2
+    vld1.8    {d0[1]}, [r3], r2
+    vld1.8    {d0[0]}, [r3]
 
 
-	vext.8    d1, d0, d0, #7
-	vaddl.u8  q1, d0, d1      //q1:{X,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2,T2+T3}
+    vext.8    d1, d0, d0, #7
+    vaddl.u8  q1, d0, d1      //q1:{X,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2,T2+T3}
 
-	vext.u8   q2, q1, q1, #14
-	vadd.u16  q2, q2, q1      //q2:{X,L2+L1+L1+L0,L1+L0+L0+LT,...T1+T2+T2+T3}
+    vext.u8   q2, q1, q1, #14
+    vadd.u16  q2, q2, q1      //q2:{X,L2+L1+L1+L0,L1+L0+L0+LT,...T1+T2+T2+T3}
 
-	//Calculate the vr0 ~ vr9
-	vqrshrn.u16 d1, q2, #2
-	vqrshrn.u16 d0, q1, #1
+    //Calculate the vr0 ~ vr9
+    vqrshrn.u16 d1, q2, #2
+    vqrshrn.u16 d0, q1, #1
 
-	//Adjust the data sequence for setting the luma MB
-	vst1.32  d0[1], [r0]!
-	vst1.32  d1[1], [r0]!
-	//add    r2, r0, r1
-	vst1.8   d1[3], [r0]!
-	vst1.16  d0[2], [r0]!
-	vst1.8   d0[6], [r0]!
-	vst1.8   d1[2], [r0]!
-	vst1.16  d1[2], [r0]!
-	vst1.8   d1[6], [r0]
+    //Adjust the data sequence for setting the luma MB
+    vst1.32  d0[1], [r0]!
+    vst1.32  d1[1], [r0]!
+    //add    r2, r0, r1
+    vst1.8   d1[3], [r0]!
+    vst1.16  d0[2], [r0]!
+    vst1.8   d0[6], [r0]!
+    vst1.8   d1[2], [r0]!
+    vst1.16  d1[2], [r0]!
+    vst1.8   d1[6], [r0]
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredHU_neon
-	//stmdb sp!, { r4, lr}
-	//Load the left column data
-	sub       r3,  r1, #1
-	mov       r1,  #3
-	mul       r1,  r2
-	add       r1,  r3
-	vld1.8    {d0[]},  [r1]
-	vld1.8    {d0[4]}, [r3], r2
-	vld1.8    {d0[5]}, [r3], r2
-	vld1.8    {d0[6]}, [r3], r2 //d0:{L3,L3,L3,L3,L0,L1,L2,L3}
+    //stmdb sp!, { r4, lr}
+    //Load the left column data
+    sub       r3,  r1, #1
+    mov       r1,  #3
+    mul       r1,  r2
+    add       r1,  r3
+    vld1.8    {d0[]},  [r1]
+    vld1.8    {d0[4]}, [r3], r2
+    vld1.8    {d0[5]}, [r3], r2
+    vld1.8    {d0[6]}, [r3], r2 //d0:{L3,L3,L3,L3,L0,L1,L2,L3}
 
-	vext.8    d1, d0, d0, #1
-	vaddl.u8  q2, d0, d1        //q2:{L3+L3,L3+L3,L3+L3,L3+L0,L0+L1,L1+L2,L2+L3,L3+L3}
+    vext.8    d1, d0, d0, #1
+    vaddl.u8  q2, d0, d1        //q2:{L3+L3,L3+L3,L3+L3,L3+L0,L0+L1,L1+L2,L2+L3,L3+L3}
 
-	vext.u8   d2, d5, d4, #2
-	vadd.u16  d3, d2, d5        //d3:{L0+L1+L1+L2,L1+L2+L2+L3,L2+L3+L3+L3,L3+L3+L3+L3}
+    vext.u8   d2, d5, d4, #2
+    vadd.u16  d3, d2, d5        //d3:{L0+L1+L1+L2,L1+L2+L2+L3,L2+L3+L3+L3,L3+L3+L3+L3}
 
-	//Calculate the hu0 ~ hu5
-	vqrshrn.u16 d2, q2, #1
-	vqrshrn.u16 d1, q1, #2
+    //Calculate the hu0 ~ hu5
+    vqrshrn.u16 d2, q2, #1
+    vqrshrn.u16 d1, q1, #2
 
-	//Adjust the data sequence for setting the luma MB
-	vzip.8   d2, d1
-	vst1.32  d1[0], [r0]!
-	vext.8   d2, d1, d1, #2
-	vst1.32  d2[0], [r0]!
-	vst1.32  d1[1], [r0]!
-	vst1.32  d0[0], [r0]
+    //Adjust the data sequence for setting the luma MB
+    vzip.8   d2, d1
+    vst1.32  d1[0], [r0]!
+    vext.8   d2, d1, d1, #2
+    vst1.32  d2[0], [r0]!
+    vst1.32  d1[1], [r0]!
+    vst1.32  d0[0], [r0]
 
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredHD_neon
-	//stmdb sp!, { r2-r5, lr}
-	//Load the data
-	sub       r3,  r1, r2
-	sub       r3,  #1
-	vld1.32   {d0[1]}, [r3], r2
-	vld1.8    {d0[3]}, [r3], r2
-	vld1.8    {d0[2]}, [r3], r2
-	vld1.8    {d0[1]}, [r3], r2
-	vld1.8    {d0[0]}, [r3]	    //d0:{L3,L2,L1,L0,LT,T0,T1,T2}
+    //stmdb sp!, { r2-r5, lr}
+    //Load the data
+    sub       r3,  r1, r2
+    sub       r3,  #1
+    vld1.32   {d0[1]}, [r3], r2
+    vld1.8    {d0[3]}, [r3], r2
+    vld1.8    {d0[2]}, [r3], r2
+    vld1.8    {d0[1]}, [r3], r2
+    vld1.8    {d0[0]}, [r3]     //d0:{L3,L2,L1,L0,LT,T0,T1,T2}
 
 
-	vext.8    d1, d0, d0, #7
-	vaddl.u8  q1, d0, d1        //q1:{x,L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2}
+    vext.8    d1, d0, d0, #7
+    vaddl.u8  q1, d0, d1        //q1:{x,L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2}
 
-	vext.u8   q2, q1, q1, #14   //q2:{x,x, L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1}
-	vadd.u16  q3, q2, q1        //q3:{x,x,L3+L2+L2+L1,L2+L1+L1+L0,L1+L0+L0+LT,L0+LT+LT+T0,LT+T0+T0+T1,T0+T1+T1+T2}
+    vext.u8   q2, q1, q1, #14   //q2:{x,x, L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1}
+    vadd.u16  q3, q2, q1        //q3:{x,x,L3+L2+L2+L1,L2+L1+L1+L0,L1+L0+L0+LT,L0+LT+LT+T0,LT+T0+T0+T1,T0+T1+T1+T2}
 
-	//Calculate the hd0~hd9
-	vqrshrn.u16 d1, q3, #2
-	vqrshrn.u16 d0, q2, #1
+    //Calculate the hd0~hd9
+    vqrshrn.u16 d1, q3, #2
+    vqrshrn.u16 d0, q2, #1
 
-	//Adjust the data sequence for setting the luma MB
-	vmov      d3, d1
-	vtrn.8    d0, d1
-	vext.u8   d2, d1, d1, #6
-	vst2.16  {d2[3], d3[3]}, [r0]!
-	vst2.16  {d0[2], d1[2]}, [r0]!
-	vmov     d3, d0
-	vst2.16  {d2[2], d3[2]}, [r0]!
-	vst2.16  {d0[1], d1[1]}, [r0]
+    //Adjust the data sequence for setting the luma MB
+    vmov      d3, d1
+    vtrn.8    d0, d1
+    vext.u8   d2, d1, d1, #6
+    vst2.16  {d2[3], d3[3]}, [r0]!
+    vst2.16  {d0[2], d1[2]}, [r0]!
+    vmov     d3, d0
+    vst2.16  {d2[2], d3[2]}, [r0]!
+    vst2.16  {d0[1], d1[1]}, [r0]
 
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN WelsIChromaPredV_neon
-	//stmdb sp!, { r2-r5, lr}
-	//Get the top row (8 byte)
-	sub  r3, r1, r2
-	vldr d0, [r3]
+    //stmdb sp!, { r2-r5, lr}
+    //Get the top row (8 byte)
+    sub  r3, r1, r2
+    vldr d0, [r3]
 
-	//Set the chroma MB using top row data
-	vst1.8 {d0}, [r0]!
-	vst1.8 {d0}, [r0]!
-	vst1.8 {d0}, [r0]!
-	vst1.8 {d0}, [r0]!
-	vst1.8 {d0}, [r0]!
-	vst1.8 {d0}, [r0]!
-	vst1.8 {d0}, [r0]!
-	vst1.8 {d0}, [r0]
+    //Set the chroma MB using top row data
+    vst1.8 {d0}, [r0]!
+    vst1.8 {d0}, [r0]!
+    vst1.8 {d0}, [r0]!
+    vst1.8 {d0}, [r0]!
+    vst1.8 {d0}, [r0]!
+    vst1.8 {d0}, [r0]!
+    vst1.8 {d0}, [r0]!
+    vst1.8 {d0}, [r0]
 
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN WelsIChromaPredH_neon
-	//stmdb sp!, { r2-r5, lr}
-	////Get the left column (8 byte)
-	sub  r3, r1, #1
-	vld1.8 {d0[]}, [r3], r2
-	vld1.8 {d1[]}, [r3], r2
-	vld1.8 {d2[]}, [r3], r2
-	vld1.8 {d3[]}, [r3], r2
-	vld1.8 {d4[]}, [r3], r2
-	vld1.8 {d5[]}, [r3], r2
-	vld1.8 {d6[]}, [r3], r2
-	vld1.8 {d7[]}, [r3]
+    //stmdb sp!, { r2-r5, lr}
+    ////Get the left column (8 byte)
+    sub  r3, r1, #1
+    vld1.8 {d0[]}, [r3], r2
+    vld1.8 {d1[]}, [r3], r2
+    vld1.8 {d2[]}, [r3], r2
+    vld1.8 {d3[]}, [r3], r2
+    vld1.8 {d4[]}, [r3], r2
+    vld1.8 {d5[]}, [r3], r2
+    vld1.8 {d6[]}, [r3], r2
+    vld1.8 {d7[]}, [r3]
 
-	//Set the chroma MB using left column data
-	vst1.8 {d0}, [r0]!
-	vst1.8 {d1}, [r0]!
-	vst1.8 {d2}, [r0]!
-	vst1.8 {d3}, [r0]!
-	vst1.8 {d4}, [r0]!
-	vst1.8 {d5}, [r0]!
-	vst1.8 {d6}, [r0]!
-	vst1.8 {d7}, [r0]
+    //Set the chroma MB using left column data
+    vst1.8 {d0}, [r0]!
+    vst1.8 {d1}, [r0]!
+    vst1.8 {d2}, [r0]!
+    vst1.8 {d3}, [r0]!
+    vst1.8 {d4}, [r0]!
+    vst1.8 {d5}, [r0]!
+    vst1.8 {d6}, [r0]!
+    vst1.8 {d7}, [r0]
 
 WELS_ASM_FUNC_END
 
@@ -575,73 +575,73 @@
 CONST1_GET_I_CHROMA_PRED_PLANE: .long 0xfffefffd, 0x0000ffff,0x00020001,0x00040003
 
 WELS_ASM_FUNC_BEGIN WelsIChromaPredPlane_neon
-	//stmdb sp!, { r2-r5, lr}
-	//Load the top row data
-	sub  r3, r1, #1
-	sub  r3, r2
-	vld1.32 {d1[0]}, [r3]
-	add  r3, #5
-	vld1.32 {d0[0]}, [r3]
+    //stmdb sp!, { r2-r5, lr}
+    //Load the top row data
+    sub  r3, r1, #1
+    sub  r3, r2
+    vld1.32 {d1[0]}, [r3]
+    add  r3, #5
+    vld1.32 {d0[0]}, [r3]
 
-	//Load the left column data
-	sub  r3, #5
-	vld1.8 {d1[4]}, [r3], r2
-	vld1.8 {d1[5]}, [r3], r2
-	vld1.8 {d1[6]}, [r3], r2
-	vld1.8 {d1[7]}, [r3], r2 //d1:{LT,T0,T1,T2,LT,L0,L1,L2}
-	add  r3, r2
-	vld1.8 {d0[4]}, [r3], r2
-	vld1.8 {d0[5]}, [r3], r2
-	vld1.8 {d0[6]}, [r3], r2
-	vld1.8 {d0[7]}, [r3]     //d0:{T4,T5,T6,T7,L4,L5,L6.L7}
+    //Load the left column data
+    sub  r3, #5
+    vld1.8 {d1[4]}, [r3], r2
+    vld1.8 {d1[5]}, [r3], r2
+    vld1.8 {d1[6]}, [r3], r2
+    vld1.8 {d1[7]}, [r3], r2 //d1:{LT,T0,T1,T2,LT,L0,L1,L2}
+    add  r3, r2
+    vld1.8 {d0[4]}, [r3], r2
+    vld1.8 {d0[5]}, [r3], r2
+    vld1.8 {d0[6]}, [r3], r2
+    vld1.8 {d0[7]}, [r3]     //d0:{T4,T5,T6,T7,L4,L5,L6.L7}
 
 
-	//Save T7 to d3 for next step
-	vdup.u8   d3,   d0[3]
-	//Save L7 to d4 for next step
-	vdup.u8   d4,   d0[7]
+    //Save T7 to d3 for next step
+    vdup.u8   d3,   d0[3]
+    //Save L7 to d4 for next step
+    vdup.u8   d4,   d0[7]
 
-	//Calculate the value of 'a' and save to q2
-	vaddl.u8  q2, d3, d4
-	vshl.u16  q2, #4
+    //Calculate the value of 'a' and save to q2
+    vaddl.u8  q2, d3, d4
+    vshl.u16  q2, #4
 
-	//Load the table {{1,2,3,4,1,2,3,4}*17}
-	adr r3, CONST0_GET_I_CHROMA_PRED_PLANE
-	vld1.32   {d2}, [r3]
+    //Load the table {{1,2,3,4,1,2,3,4}*17}
+    adr r3, CONST0_GET_I_CHROMA_PRED_PLANE
+    vld1.32   {d2}, [r3]
 
-	//Calculate the 'b','c', and save to q0
-	vrev32.8  d1, d1
-	vsubl.u8  q0, d0, d1
-	vmovl.u8   q1, d2
-	vmul.s16   q0, q1
-	vpaddl.s16 q0, q0
-	vpaddl.s32 q0, q0
-	vrshr.s64  q0, #5
+    //Calculate the 'b','c', and save to q0
+    vrev32.8  d1, d1
+    vsubl.u8  q0, d0, d1
+    vmovl.u8   q1, d2
+    vmul.s16   q0, q1
+    vpaddl.s16 q0, q0
+    vpaddl.s32 q0, q0
+    vrshr.s64  q0, #5
 
-	//Load the table {-3,-2,-1,0,1,2,3,4} to q3
-	adr r3, CONST1_GET_I_CHROMA_PRED_PLANE
-	vld1.32   {d6, d7}, [r3]
+    //Load the table {-3,-2,-1,0,1,2,3,4} to q3
+    adr r3, CONST1_GET_I_CHROMA_PRED_PLANE
+    vld1.32   {d6, d7}, [r3]
 
-	//Duplicate the 'b','c' to q0, q1 for SIMD instruction
-	vdup.s16   q1, d1[0]
-	vdup.s16   q0, d0[0]
+    //Duplicate the 'b','c' to q0, q1 for SIMD instruction
+    vdup.s16   q1, d1[0]
+    vdup.s16   q0, d0[0]
 
-	//Calculate the "(a + b * (j - 3) + c * (- 3) + 16) >> 5;"
-	vmla.s16   q2, q0, q3
-	vmla.s16   q2, q1, d6[0]
-	vqrshrun.s16 d0, q2, #5
+    //Calculate the "(a + b * (j - 3) + c * (- 3) + 16) >> 5;"
+    vmla.s16   q2, q0, q3
+    vmla.s16   q2, q1, d6[0]
+    vqrshrun.s16 d0, q2, #5
 
-	//Set a line of chroma MB
-	vst1.u32  {d0}, [r0]!
+    //Set a line of chroma MB
+    vst1.u32  {d0}, [r0]!
 
-	//Do the same processing for each line.
-	mov  r3, #7
+    //Do the same processing for each line.
+    mov  r3, #7
 loop_0_get_i_chroma_pred_plane:
-	vadd.s16   q2, q1
-	vqrshrun.s16 d0, q2, #5
-	vst1.u32  {d0}, [r0]!
-	subs  r3, #1
-	bne  loop_0_get_i_chroma_pred_plane
+    vadd.s16   q2, q1
+    vqrshrun.s16 d0, q2, #5
+    vst1.u32  {d0}, [r0]!
+    subs  r3, #1
+    bne  loop_0_get_i_chroma_pred_plane
 
 WELS_ASM_FUNC_END
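
WelsIChromaPredPlane_neon above vectorizes the H.264 8x8 chroma plane prediction that its own comments spell out ("(a + b * (j - 3) + c * (- 3) + 16) >> 5"). For reference, a scalar sketch of that formula follows; it is not part of this patch, and the helper name and argument layout are hypothetical.

#include <stdint.h>

/* Hypothetical scalar sketch (not from this patch) of 8x8 chroma plane prediction.
 * pTop[-1..7] is the row above the block, pLeft[-1..7] the column to its left;
 * index -1 is the shared top-left corner sample. */
static void ChromaPlanePredRef (uint8_t pDst[8][8], const uint8_t* pTop, const uint8_t* pLeft) {
    int iH = 0, iV = 0, x, y;
    for (x = 0; x < 4; x++) {
        iH += (x + 1) * (pTop[4 + x]  - pTop[2 - x]);   /* horizontal gradient */
        iV += (x + 1) * (pLeft[4 + x] - pLeft[2 - x]);  /* vertical gradient   */
    }
    {
        int a = 16 * (pTop[7] + pLeft[7]);
        int b = (17 * iH + 16) >> 5;
        int c = (17 * iV + 16) >> 5;
        for (y = 0; y < 8; y++) {
            for (x = 0; x < 8; x++) {
                int v = (a + b * (x - 3) + c * (y - 3) + 16) >> 5;
                pDst[y][x] = (uint8_t) (v < 0 ? 0 : (v > 255 ? 255 : v)); /* clip to [0,255] */
            }
        }
    }
}

The NEON routine derives b and c with the {1,2,3,4}*17 table loaded from CONST0_GET_I_CHROMA_PRED_PLANE, then produces one row per iteration of loop_0_get_i_chroma_pred_plane by adding c once per row rather than re-evaluating the full expression.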
 
--- a/codec/encoder/core/arm/intra_pred_sad_3_opt_neon.S
+++ b/codec/encoder/core/arm/intra_pred_sad_3_opt_neon.S
@@ -38,59 +38,59 @@
 #ifdef __APPLE__
  //The data sequence will be used
 .macro GET_8BYTE_DATA_L0
-	vld1.8 {$0[0]}, [$1], $2
-	vld1.8 {$0[1]}, [$1], $2
-	vld1.8 {$0[2]}, [$1], $2
-	vld1.8 {$0[3]}, [$1], $2
-	vld1.8 {$0[4]}, [$1], $2
-	vld1.8 {$0[5]}, [$1], $2
-	vld1.8 {$0[6]}, [$1], $2
-	vld1.8 {$0[7]}, [$1], $2
+    vld1.8 {$0[0]}, [$1], $2
+    vld1.8 {$0[1]}, [$1], $2
+    vld1.8 {$0[2]}, [$1], $2
+    vld1.8 {$0[3]}, [$1], $2
+    vld1.8 {$0[4]}, [$1], $2
+    vld1.8 {$0[5]}, [$1], $2
+    vld1.8 {$0[6]}, [$1], $2
+    vld1.8 {$0[7]}, [$1], $2
 .endm
 
 
 .macro HDM_TRANSFORM_4X4_L0
 
-	//Do the vertical transform
-	vaddl.u8 q0, $0, $1 //{0,4,8,12,1,5,9,13}
-	vsubl.u8 q1, $0, $1 //{2,6,10,14,3,7,11,15}
-	vswp  d1, d2
-	vadd.s16 q2, q0, q1 //{0,1,2,3,4,5,6,7}
-	vsub.s16 q1, q0, q1 //{12,13,14,15,8,9,10,11}
+    //Do the vertical transform
+    vaddl.u8 q0, $0, $1 //{0,4,8,12,1,5,9,13}
+    vsubl.u8 q1, $0, $1 //{2,6,10,14,3,7,11,15}
+    vswp  d1, d2
+    vadd.s16 q2, q0, q1 //{0,1,2,3,4,5,6,7}
+    vsub.s16 q1, q0, q1 //{12,13,14,15,8,9,10,11}
 
-	//Do the horizontal transform
-	vtrn.32 q2, q1
-	vadd.s16 q0, q2, q1
-	vsub.s16 q1, q2, q1
+    //Do the horizontal transform
+    vtrn.32 q2, q1
+    vadd.s16 q0, q2, q1
+    vsub.s16 q1, q2, q1
 
-	vtrn.16 q0, q1
-	vadd.s16 q2, q0, q1
-	vsub.s16 q1, q0, q1
+    vtrn.16 q0, q1
+    vadd.s16 q2, q0, q1
+    vsub.s16 q1, q0, q1
 
-	vmov.s16 d0, d4
-	vmov.s16 d1, d2
+    vmov.s16 d0, d4
+    vmov.s16 d1, d2
 
-	vabs.s16 d3, d3
+    vabs.s16 d3, d3
 
-	//16x16_v
-	vtrn.32 d0, d1 //{0,1,3,2}
-	vaba.s16 $5, d0, $2 //16x16_v
-	vaba.s16 $5, d1, $8
-	vaba.s16 $5, d5, $8
-	vadd.u16 $5, d3
+    //16x16_v
+    vtrn.32 d0, d1 //{0,1,3,2}
+    vaba.s16 $5, d0, $2 //16x16_v
+    vaba.s16 $5, d1, $8
+    vaba.s16 $5, d5, $8
+    vadd.u16 $5, d3
 
-	//16x16_h
-	vtrn.16 d4, d5 //{0,4,12,8}
-	vaba.s16 $6, d4, $3 //16x16_h
-	vabs.s16 d2, d2
-	vabs.s16 d5, d5
-	vadd.u16 d2, d3
-	vadd.u16 d2, d5
-	vadd.u16 $6, d2
+    //16x16_h
+    vtrn.16 d4, d5 //{0,4,12,8}
+    vaba.s16 $6, d4, $3 //16x16_h
+    vabs.s16 d2, d2
+    vabs.s16 d5, d5
+    vadd.u16 d2, d3
+    vadd.u16 d2, d5
+    vadd.u16 $6, d2
 
-	//16x16_dc_both
-	vaba.s16 $7, d4, $4 //16x16_dc_both
-	vadd.u16 $7, d2
+    //16x16_dc_both
+    vaba.s16 $7, d4, $4 //16x16_dc_both
+    vadd.u16 $7, d2
 
 .endm
 
@@ -97,58 +97,58 @@
 #else
  //The data sequence will be used
 .macro GET_8BYTE_DATA_L0 arg0, arg1, arg2
-	vld1.8 {\arg0[0]}, [\arg1], \arg2
-	vld1.8 {\arg0[1]}, [\arg1], \arg2
-	vld1.8 {\arg0[2]}, [\arg1], \arg2
-	vld1.8 {\arg0[3]}, [\arg1], \arg2
-	vld1.8 {\arg0[4]}, [\arg1], \arg2
-	vld1.8 {\arg0[5]}, [\arg1], \arg2
-	vld1.8 {\arg0[6]}, [\arg1], \arg2
-	vld1.8 {\arg0[7]}, [\arg1], \arg2
+    vld1.8 {\arg0[0]}, [\arg1], \arg2
+    vld1.8 {\arg0[1]}, [\arg1], \arg2
+    vld1.8 {\arg0[2]}, [\arg1], \arg2
+    vld1.8 {\arg0[3]}, [\arg1], \arg2
+    vld1.8 {\arg0[4]}, [\arg1], \arg2
+    vld1.8 {\arg0[5]}, [\arg1], \arg2
+    vld1.8 {\arg0[6]}, [\arg1], \arg2
+    vld1.8 {\arg0[7]}, [\arg1], \arg2
 .endm
 
 .macro HDM_TRANSFORM_4X4_L0 arg0, arg1, arg2,arg3, arg4, arg5, arg6, arg7, arg8
 
-	//Do the vertical transform
-	vaddl.u8 q0, \arg0, \arg1 //{0,4,8,12,1,5,9,13}
-	vsubl.u8 q1, \arg0, \arg1 //{2,6,10,14,3,7,11,15}
-	vswp  d1, d2
-	vadd.s16 q2, q0, q1 //{0,1,2,3,4,5,6,7}
-	vsub.s16 q1, q0, q1 //{12,13,14,15,8,9,10,11}
+    //Do the vertical transform
+    vaddl.u8 q0, \arg0, \arg1 //{0,4,8,12,1,5,9,13}
+    vsubl.u8 q1, \arg0, \arg1 //{2,6,10,14,3,7,11,15}
+    vswp  d1, d2
+    vadd.s16 q2, q0, q1 //{0,1,2,3,4,5,6,7}
+    vsub.s16 q1, q0, q1 //{12,13,14,15,8,9,10,11}
 
-	//Do the horizontal transform
-	vtrn.32 q2, q1
-	vadd.s16 q0, q2, q1
-	vsub.s16 q1, q2, q1
+    //Do the horizontal transform
+    vtrn.32 q2, q1
+    vadd.s16 q0, q2, q1
+    vsub.s16 q1, q2, q1
 
-	vtrn.16 q0, q1
-	vadd.s16 q2, q0, q1
-	vsub.s16 q1, q0, q1
+    vtrn.16 q0, q1
+    vadd.s16 q2, q0, q1
+    vsub.s16 q1, q0, q1
 
-	vmov.s16 d0, d4
-	vmov.s16 d1, d2
+    vmov.s16 d0, d4
+    vmov.s16 d1, d2
 
-	vabs.s16 d3, d3
+    vabs.s16 d3, d3
 
-	//16x16_v
-	vtrn.32 d0, d1 //{0,1,3,2}
-	vaba.s16 \arg5, d0, \arg2 //16x16_v
-	vaba.s16 \arg5, d1, \arg8
-	vaba.s16 \arg5, d5, \arg8
-	vadd.u16 \arg5, d3
+    //16x16_v
+    vtrn.32 d0, d1 //{0,1,3,2}
+    vaba.s16 \arg5, d0, \arg2 //16x16_v
+    vaba.s16 \arg5, d1, \arg8
+    vaba.s16 \arg5, d5, \arg8
+    vadd.u16 \arg5, d3
 
-	//16x16_h
-	vtrn.16 d4, d5 //{0,4,12,8}
-	vaba.s16 \arg6, d4, \arg3 //16x16_h
-	vabs.s16 d2, d2
-	vabs.s16 d5, d5
-	vadd.u16 d2, d3
-	vadd.u16 d2, d5
-	vadd.u16 \arg6, d2
+    //16x16_h
+    vtrn.16 d4, d5 //{0,4,12,8}
+    vaba.s16 \arg6, d4, \arg3 //16x16_h
+    vabs.s16 d2, d2
+    vabs.s16 d5, d5
+    vadd.u16 d2, d3
+    vadd.u16 d2, d5
+    vadd.u16 \arg6, d2
 
-	//16x16_dc_both
-	vaba.s16 \arg7, d4, \arg4 //16x16_dc_both
-	vadd.u16 \arg7, d2
+    //16x16_dc_both
+    vaba.s16 \arg7, d4, \arg4 //16x16_dc_both
+    vadd.u16 \arg7, d2
 .endm
 #endif
 
@@ -156,63 +156,63 @@
     stmdb sp!, {r4-r7, lr}
     vpush {q4-q7}
 
-	//Get the top line data to 'q15'(16 bytes)
-	sub  r7, r0, r1
+    //Get the top line data to 'q15'(16 bytes)
+    sub  r7, r0, r1
     vld1.8 {q15}, [r7]
 
-	//Get the left colume data to 'q14' (16 bytes)
-	sub  r7, r0, #1
-	GET_8BYTE_DATA_L0 d28, r7, r1
-	GET_8BYTE_DATA_L0 d29, r7, r1
+    //Get the left colume data to 'q14' (16 bytes)
+    sub  r7, r0, #1
+    GET_8BYTE_DATA_L0 d28, r7, r1
+    GET_8BYTE_DATA_L0 d29, r7, r1
 
-	//Calculate the mean value and save to 'q13->d27(reserve the d26)' (2 bytes)
-	//Calculate the 16x16_dc_both mode SATD
-	vaddl.u8 q0, d30, d31
-	vaddl.u8 q1, d28, d29
-	vadd.u16 q0, q1
-	vadd.u16 d0, d1
-	vpaddl.u16 d0, d0
-	vpaddl.u32 d0, d0
+    //Calculate the mean value and save to 'q13->d27(reserve the d26)' (2 bytes)
+    //Calculate the 16x16_dc_both mode SATD
+    vaddl.u8 q0, d30, d31
+    vaddl.u8 q1, d28, d29
+    vadd.u16 q0, q1
+    vadd.u16 d0, d1
+    vpaddl.u16 d0, d0
+    vpaddl.u32 d0, d0
 
-	//Calculate the mean value
-	vrshr.u16  d0, #5
-	vshl.u16   d27, d0, #4
+    //Calculate the mean value
+    vrshr.u16  d0, #5
+    vshl.u16   d27, d0, #4
 
 
-	//Calculate the 16x16_v mode SATD and save to "q11, 12"
-	vshll.u8 q0, d30, #2
-	vshll.u8 q1, d31, #2
-	vtrn.32  q0, q1
-	vadd.s16 q2, q0, q1
-	vsub.s16 q1, q0, q1
-	vtrn.16  q2, q1
-	vadd.s16 q12, q2, q1
-	vsub.s16 q11, q2, q1
-	vtrn.32  q12, q11 //{0,1,3,2, 4,5,7,6} q12
-	                  //{8,9,11,10, 12,13,15,14} q11
+    //Calculate the 16x16_v mode SATD and save to "q11, 12"
+    vshll.u8 q0, d30, #2
+    vshll.u8 q1, d31, #2
+    vtrn.32  q0, q1
+    vadd.s16 q2, q0, q1
+    vsub.s16 q1, q0, q1
+    vtrn.16  q2, q1
+    vadd.s16 q12, q2, q1
+    vsub.s16 q11, q2, q1
+    vtrn.32  q12, q11 //{0,1,3,2, 4,5,7,6} q12
+                      //{8,9,11,10, 12,13,15,14} q11
     //Calculate the 16x16_h mode SATD and save to "q9, q10"
-	vshll.u8 q0, d28, #2
-	vshll.u8 q1, d29, #2
-	vtrn.32  q0, q1
-	vadd.s16 q2, q0, q1
-	vsub.s16 q1, q0, q1
-	vtrn.16  q2, q1
-	vadd.s16 q10, q2, q1
-	vsub.s16 q9,  q2, q1
-	vtrn.32  q10, q9  //{0,1,3,2, 4,5,7,6} q10
-	                  //{8,9,11,10, 12,13,15,14} q9
+    vshll.u8 q0, d28, #2
+    vshll.u8 q1, d29, #2
+    vtrn.32  q0, q1
+    vadd.s16 q2, q0, q1
+    vsub.s16 q1, q0, q1
+    vtrn.16  q2, q1
+    vadd.s16 q10, q2, q1
+    vsub.s16 q9,  q2, q1
+    vtrn.32  q10, q9  //{0,1,3,2, 4,5,7,6} q10
+                      //{8,9,11,10, 12,13,15,14} q9
 
-	vmov.i32 d17, #0//Save the SATD of DC_BOTH
-	vmov.i32 d16, #0//Save the SATD of H
-	vmov.i32 d15, #0//Save the SATD of V
-	vmov.i32 d14, #0//For zero D register
-	//Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
-	vld1.32  {q3}, [r2], r3
-	vld1.32  {q4}, [r2], r3
-	vld1.32  {q5}, [r2], r3
-	vld1.32  {q6}, [r2], r3
-	vtrn.32  q3, q4
-	vtrn.32  q5, q6
+    vmov.i32 d17, #0//Save the SATD of DC_BOTH
+    vmov.i32 d16, #0//Save the SATD of H
+    vmov.i32 d15, #0//Save the SATD of V
+    vmov.i32 d14, #0//For zero D register
+    //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
+    vld1.32  {q3}, [r2], r3
+    vld1.32  {q4}, [r2], r3
+    vld1.32  {q5}, [r2], r3
+    vld1.32  {q6}, [r2], r3
+    vtrn.32  q3, q4
+    vtrn.32  q5, q6
 
     HDM_TRANSFORM_4X4_L0 d6, d10, d24, d20, d27, d15, d16, d17, d14
     HDM_TRANSFORM_4X4_L0 d7, d11, d22, d20, d27, d15, d16, d17, d14
@@ -219,13 +219,13 @@
     HDM_TRANSFORM_4X4_L0 d8, d12, d25, d20, d27, d15, d16, d17, d14
     HDM_TRANSFORM_4X4_L0 d9, d13, d23, d20, d27, d15, d16, d17, d14
 
-	//Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
-	vld1.32  {q3}, [r2], r3
-	vld1.32  {q4}, [r2], r3
-	vld1.32  {q5}, [r2], r3
-	vld1.32  {q6}, [r2], r3
-	vtrn.32  q3, q4
-	vtrn.32  q5, q6
+    //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
+    vld1.32  {q3}, [r2], r3
+    vld1.32  {q4}, [r2], r3
+    vld1.32  {q5}, [r2], r3
+    vld1.32  {q6}, [r2], r3
+    vtrn.32  q3, q4
+    vtrn.32  q5, q6
 
     HDM_TRANSFORM_4X4_L0 d6, d10, d24, d21, d27, d15, d16, d17, d14
     HDM_TRANSFORM_4X4_L0 d7, d11, d22, d21, d27, d15, d16, d17, d14
@@ -232,13 +232,13 @@
     HDM_TRANSFORM_4X4_L0 d8, d12, d25, d21, d27, d15, d16, d17, d14
     HDM_TRANSFORM_4X4_L0 d9, d13, d23, d21, d27, d15, d16, d17, d14
 
-	//Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
-	vld1.32  {q3}, [r2], r3
-	vld1.32  {q4}, [r2], r3
-	vld1.32  {q5}, [r2], r3
-	vld1.32  {q6}, [r2], r3
-	vtrn.32  q3, q4
-	vtrn.32  q5, q6
+    //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
+    vld1.32  {q3}, [r2], r3
+    vld1.32  {q4}, [r2], r3
+    vld1.32  {q5}, [r2], r3
+    vld1.32  {q6}, [r2], r3
+    vtrn.32  q3, q4
+    vtrn.32  q5, q6
 
     HDM_TRANSFORM_4X4_L0 d6, d10, d24, d18, d27, d15, d16, d17, d14
     HDM_TRANSFORM_4X4_L0 d7, d11, d22, d18, d27, d15, d16, d17, d14
@@ -245,13 +245,13 @@
     HDM_TRANSFORM_4X4_L0 d8, d12, d25, d18, d27, d15, d16, d17, d14
     HDM_TRANSFORM_4X4_L0 d9, d13, d23, d18, d27, d15, d16, d17, d14
 
-	//Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
-	vld1.32  {q3}, [r2], r3
-	vld1.32  {q4}, [r2], r3
-	vld1.32  {q5}, [r2], r3
-	vld1.32  {q6}, [r2], r3
-	vtrn.32  q3, q4
-	vtrn.32  q5, q6
+    //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
+    vld1.32  {q3}, [r2], r3
+    vld1.32  {q4}, [r2], r3
+    vld1.32  {q5}, [r2], r3
+    vld1.32  {q6}, [r2], r3
+    vtrn.32  q3, q4
+    vtrn.32  q5, q6
 
     HDM_TRANSFORM_4X4_L0 d6, d10, d24, d19, d27, d15, d16, d17, d14
     HDM_TRANSFORM_4X4_L0 d7, d11, d22, d19, d27, d15, d16, d17, d14
@@ -258,29 +258,29 @@
     HDM_TRANSFORM_4X4_L0 d8, d12, d25, d19, d27, d15, d16, d17, d14
     HDM_TRANSFORM_4X4_L0 d9, d13, d23, d19, d27, d15, d16, d17, d14
 
-	//Get the data from stack
-	ldr r5, [sp, #84] //the addr of Best_mode
-	ldr r6, [sp, #88] //the value of i_lambda
+    //Get the data from stack
+    ldr r5, [sp, #84] //the addr of Best_mode
+    ldr r6, [sp, #88] //the value of i_lambda
 
-	//vadd.u16   d24, d25
-	vrshr.u16  d15, #1
-	vpaddl.u16 d15, d15
-	vpaddl.u32 d15, d15
-	vmov.u32   r0, d15[0]
+    //vadd.u16   d24, d25
+    vrshr.u16  d15, #1
+    vpaddl.u16 d15, d15
+    vpaddl.u32 d15, d15
+    vmov.u32   r0, d15[0]
 
-	//vadd.u16   d22, d23
-	vrshr.u16  d16, #1
-	vpaddl.u16 d16, d16
-	vpaddl.u32 d16, d16
-	vmov.u32   r1, d16[0]
-	add  r1, r1, r6, lsl #1
+    //vadd.u16   d22, d23
+    vrshr.u16  d16, #1
+    vpaddl.u16 d16, d16
+    vpaddl.u32 d16, d16
+    vmov.u32   r1, d16[0]
+    add  r1, r1, r6, lsl #1
 
-	//vadd.u16   d20, d21
-	vrshr.u16  d17, #1
-	vpaddl.u16 d17, d17
-	vpaddl.u32 d17, d17
-	vmov.u32   r2, d17[0]
-	add  r2, r2, r6, lsl #1
+    //vadd.u16   d20, d21
+    vrshr.u16  d17, #1
+    vpaddl.u16 d17, d17
+    vpaddl.u32 d17, d17
+    vmov.u32   r2, d17[0]
+    add  r2, r2, r6, lsl #1
 
     mov r4, #0
     cmp r1, r0
@@ -300,77 +300,77 @@
 WELS_ASM_FUNC_BEGIN WelsIntra16x16Combined3Sad_neon
     stmdb sp!, {r4-r7, lr}
 
-	//Get the top line data to 'q15'(16 bytes)
-	sub  r4, r0, r1
+    //Get the top line data to 'q15'(16 bytes)
+    sub  r4, r0, r1
     vld1.8 {q15}, [r4]
 
-	//Get the left colume data to 'q14' (16 bytes)
-	sub  r4, r0, #1
-	GET_8BYTE_DATA_L0 d28, r4, r1
-	GET_8BYTE_DATA_L0 d29, r4, r1
+    //Get the left colume data to 'q14' (16 bytes)
+    sub  r4, r0, #1
+    GET_8BYTE_DATA_L0 d28, r4, r1
+    GET_8BYTE_DATA_L0 d29, r4, r1
 
-	//Calculate the mean value and save to 'q13' (8 bytes)
-	//Calculate the 16x16_dc_both mode SATD
-	vaddl.u8 q0, d30, d31
-	vaddl.u8 q1, d28, d29
-	vadd.u16 q0, q1
-	vadd.u16 d0, d1
-	vpaddl.u16 d0, d0
-	vpaddl.u32 d0, d0
+    //Calculate the mean value and save to 'q13' (8 bytes)
+    //Calculate the 16x16_dc_both mode SATD
+    vaddl.u8 q0, d30, d31
+    vaddl.u8 q1, d28, d29
+    vadd.u16 q0, q1
+    vadd.u16 d0, d1
+    vpaddl.u16 d0, d0
+    vpaddl.u32 d0, d0
 
-	//Calculate the mean value
-	vrshr.u16  d0, d0, #5
-	vdup.8     q13, d0[0]
+    //Calculate the mean value
+    vrshr.u16  d0, d0, #5
+    vdup.8     q13, d0[0]
 
-	sub  r4, r0, #1
+    sub  r4, r0, #1
 
-	vmov.i32 q12, #0//Save the SATD of DC_BOTH
-	vmov.i32 q11, #0//Save the SATD of H
-	vmov.i32 q10, #0//Save the SATD of V
+    vmov.i32 q12, #0//Save the SATD of DC_BOTH
+    vmov.i32 q11, #0//Save the SATD of H
+    vmov.i32 q10, #0//Save the SATD of V
 
-	mov lr, #16
+    mov lr, #16
 sad_intra_16x16_x3_opt_loop0:
     //Get the left colume data to 'd0' (16 bytes)
-	vld1.8 {d0[]}, [r4], r1
+    vld1.8 {d0[]}, [r4], r1
 
-	//Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes
-	vld1.8  {q1}, [r2], r3
+    //Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes
+    vld1.8  {q1}, [r2], r3
 
-	subs lr, #1
-	//Do the SAD for top colume
-	vabal.u8  q12, d30, d2
-	vabal.u8  q12, d31, d3
+    subs lr, #1
+    //Do the SAD for top colume
+    vabal.u8  q12, d30, d2
+    vabal.u8  q12, d31, d3
 
-	//Do the SAD for left colume
-	vabal.u8  q11, d0, d2
-	vabal.u8  q11, d0, d3
+    //Do the SAD for left colume
+    vabal.u8  q11, d0, d2
+    vabal.u8  q11, d0, d3
 
-	//Do the SAD for mean value
-	vabal.u8  q10, d26, d2
-	vabal.u8  q10, d26, d3
+    //Do the SAD for mean value
+    vabal.u8  q10, d26, d2
+    vabal.u8  q10, d26, d3
 
-	bne sad_intra_16x16_x3_opt_loop0
+    bne sad_intra_16x16_x3_opt_loop0
 
-	//Get the data from stack
-	ldr r5, [sp, #20] //the addr of Best_mode
-	ldr r6, [sp, #24] //the value of i_lambda
+    //Get the data from stack
+    ldr r5, [sp, #20] //the addr of Best_mode
+    ldr r6, [sp, #24] //the value of i_lambda
 
-	vadd.u16   d24, d25
-	vpaddl.u16 d24, d24
-	vpaddl.u32 d24, d24
-	vmov.u32   r0, d24[0]
+    vadd.u16   d24, d25
+    vpaddl.u16 d24, d24
+    vpaddl.u32 d24, d24
+    vmov.u32   r0, d24[0]
 
-	vadd.u16   d22, d23
-	vpaddl.u16 d22, d22
-	vpaddl.u32 d22, d22
-	vmov.u32   r1, d22[0]
-	add  r1, r1, r6, lsl #1
+    vadd.u16   d22, d23
+    vpaddl.u16 d22, d22
+    vpaddl.u32 d22, d22
+    vmov.u32   r1, d22[0]
+    add  r1, r1, r6, lsl #1
 
-	vadd.u16   d20, d21
-	vpaddl.u16 d20, d20
-	vpaddl.u32 d20, d20
-	vmov.u32   r2, d20[0]
-	add  r2, r2, r6, lsl #1
+    vadd.u16   d20, d21
+    vpaddl.u16 d20, d20
+    vpaddl.u32 d20, d20
+    vmov.u32   r2, d20[0]
+    add  r2, r2, r6, lsl #1
 
     mov r4, #0
     cmp r1, r0
@@ -382,7 +382,7 @@
 
     str r4, [r5]
 
-	ldmia sp!, {r4-r7, lr}
+    ldmia sp!, {r4-r7, lr}
 WELS_ASM_FUNC_END
 
 
@@ -389,24 +389,24 @@
 WELS_ASM_FUNC_BEGIN WelsIntra8x8Combined3Sad_neon
     stmdb sp!, {r4-r7, lr}
 
-	//Get the data from stack
-	ldr r4, [sp, #32] //p_dec_cr
-	ldr r5, [sp, #36] //p_enc_cr
+    //Get the data from stack
+    ldr r4, [sp, #32] //p_dec_cr
+    ldr r5, [sp, #36] //p_enc_cr
 
-	//Get the left colume data to 'd28(cb), d30(cr)' (16 bytes)
-	sub  r6, r0, #1
-	GET_8BYTE_DATA_L0 d28, r6, r1
-	sub  r6, r4, #1
-	GET_8BYTE_DATA_L0 d30, r6, r1
+    //Get the left colume data to 'd28(cb), d30(cr)' (16 bytes)
+    sub  r6, r0, #1
+    GET_8BYTE_DATA_L0 d28, r6, r1
+    sub  r6, r4, #1
+    GET_8BYTE_DATA_L0 d30, r6, r1
 
-	//Get the top line data to 'd29(cb), d31(cr)'(16 bytes)
-	sub  r6, r0, r1
+    //Get the top line data to 'd29(cb), d31(cr)'(16 bytes)
+    sub  r6, r0, r1
     vld1.8 {d29}, [r6]
-	sub  r6, r4, r1
+    sub  r6, r4, r1
     vld1.8 {d31}, [r6]
 
-	//Calculate the sum of left column and top row
-	vmov.i32   q0, q14
+    //Calculate the sum of left column and top row
+    vmov.i32   q0, q14
     vpaddl.u8  q0, q0
     vpaddl.u16 q0, q0
     vadd.u32   d2, d0, d1 //'m1' save to d2
@@ -416,13 +416,13 @@
     //duplicate the 'mx' to a vector line
     vdup.8     d27, d2[0]
     vdup.8     d26, d1[4]
-	vtrn.32    d27, d26
+    vtrn.32    d27, d26
 
     vdup.8     d26, d0[4]
     vdup.8     d25, d2[4]
     vtrn.32    d26, d25   //Save to "d27, d26"
 
-	vmov.i32   q0, q15
+    vmov.i32   q0, q15
     vpaddl.u8  q0, q0
     vpaddl.u16 q0, q0
     vadd.u32   d2, d0, d1 //'m1' save to d2
@@ -432,94 +432,94 @@
     //duplicate the 'mx' to a vector line
     vdup.8     d25, d2[0]
     vdup.8     d24, d1[4]
-	vtrn.32    d25, d24
+    vtrn.32    d25, d24
 
     vdup.8     d24, d0[4]
     vdup.8     d23, d2[4]
-	vtrn.32    d24, d23   //Save to "d25, d24"
+    vtrn.32    d24, d23   //Save to "d25, d24"
 
-	vmov.i32 q11, #0//Save the SATD of DC_BOTH
-	vmov.i32 q10, #0//Save the SATD of H
-	vmov.i32 q9 , #0//Save the SATD of V
-	sub  r6, r0, #1
-	sub  r7, r4, #1
-	mov lr, #4
+    vmov.i32 q11, #0//Save the SATD of DC_BOTH
+    vmov.i32 q10, #0//Save the SATD of H
+    vmov.i32 q9 , #0//Save the SATD of V
+    sub  r6, r0, #1
+    sub  r7, r4, #1
+    mov lr, #4
 sad_intra_8x8_x3_opt_loop0:
 
-	//Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes
-	vld1.8  {d0}, [r2], r3
-	vld1.8  {d1}, [r5], r3
+    //Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes
+    vld1.8  {d0}, [r2], r3
+    vld1.8  {d1}, [r5], r3
 
     //Get the left colume data to 'd0' (16 bytes)
-	vld1.8 {d2[]}, [r6], r1
-	vld1.8 {d3[]}, [r7], r1
+    vld1.8 {d2[]}, [r6], r1
+    vld1.8 {d3[]}, [r7], r1
 
-	subs lr, #1
+    subs lr, #1
 
 
-	//Do the SAD for top colume
-	vabal.u8  q11, d29, d0
-	vabal.u8  q11, d31, d1
+    //Do the SAD for top colume
+    vabal.u8  q11, d29, d0
+    vabal.u8  q11, d31, d1
 
-	//Do the SAD for left colume
-	vabal.u8  q10, d2, d0
-	vabal.u8  q10, d3, d1
+    //Do the SAD for left colume
+    vabal.u8  q10, d2, d0
+    vabal.u8  q10, d3, d1
 
-	//Do the SAD for mean value
-	vabal.u8  q9, d27, d0
-	vabal.u8  q9, d25, d1
+    //Do the SAD for mean value
+    vabal.u8  q9, d27, d0
+    vabal.u8  q9, d25, d1
 
 
-	bne sad_intra_8x8_x3_opt_loop0
+    bne sad_intra_8x8_x3_opt_loop0
 
-	mov lr, #4
+    mov lr, #4
 sad_intra_8x8_x3_opt_loop1:
 
-	//Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes
-	vld1.8  {d0}, [r2], r3
-	vld1.8  {d1}, [r5], r3
+    //Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes
+    vld1.8  {d0}, [r2], r3
+    vld1.8  {d1}, [r5], r3
 
     //Get the left colume data to 'd0' (16 bytes)
-	vld1.8 {d2[]}, [r6], r1
-	vld1.8 {d3[]}, [r7], r1
+    vld1.8 {d2[]}, [r6], r1
+    vld1.8 {d3[]}, [r7], r1
 
-	subs lr, #1
+    subs lr, #1
 
 
-	//Do the SAD for top colume
-	vabal.u8  q11, d29, d0
-	vabal.u8  q11, d31, d1
+    //Do the SAD for top colume
+    vabal.u8  q11, d29, d0
+    vabal.u8  q11, d31, d1
 
-	//Do the SAD for left colume
-	vabal.u8  q10, d2, d0
-	vabal.u8  q10, d3, d1
+    //Do the SAD for left colume
+    vabal.u8  q10, d2, d0
+    vabal.u8  q10, d3, d1
 
-	//Do the SAD for mean value
-	vabal.u8  q9, d26, d0
-	vabal.u8  q9, d24, d1
+    //Do the SAD for mean value
+    vabal.u8  q9, d26, d0
+    vabal.u8  q9, d24, d1
 
 
-	bne sad_intra_8x8_x3_opt_loop1
-	//Get the data from stack
-	ldr r5, [sp, #20] //the addr of Best_mode
-	ldr r6, [sp, #24] //the value of i_lambda
+    bne sad_intra_8x8_x3_opt_loop1
+    //Get the data from stack
+    ldr r5, [sp, #20] //the addr of Best_mode
+    ldr r6, [sp, #24] //the value of i_lambda
 
-	vadd.u16   d22, d23
-	vpaddl.u16 d22, d22
-	vpaddl.u32 d22, d22
-	vmov.u32   r0, d22[0]
-	add  r0, r0, r6, lsl #1
+    vadd.u16   d22, d23
+    vpaddl.u16 d22, d22
+    vpaddl.u32 d22, d22
+    vmov.u32   r0, d22[0]
+    add  r0, r0, r6, lsl #1
 
-	vadd.u16   d20, d21
-	vpaddl.u16 d20, d20
-	vpaddl.u32 d20, d20
-	vmov.u32   r1, d20[0]
-	add  r1, r1, r6, lsl #1
+    vadd.u16   d20, d21
+    vpaddl.u16 d20, d20
+    vpaddl.u32 d20, d20
+    vmov.u32   r1, d20[0]
+    add  r1, r1, r6, lsl #1
 
-	vadd.u16   d18, d19
-	vpaddl.u16 d18, d18
-	vpaddl.u32 d18, d18
-	vmov.u32   r2, d18[0]
+    vadd.u16   d18, d19
+    vpaddl.u16 d18, d18
+    vpaddl.u32 d18, d18
+    vmov.u32   r2, d18[0]
 
     mov r4, #2
     cmp r1, r0
@@ -531,7 +531,7 @@
 
     str r4, [r5]
 
-	ldmia sp!, {r4-r7, lr}
+    ldmia sp!, {r4-r7, lr}
 WELS_ASM_FUNC_END
 
 
@@ -539,47 +539,47 @@
     stmdb sp!, {r4-r7, lr}
     vpush {q4-q7}
 
-	//Get the data from stack
-	ldr r4, [sp, #96] //p_dec_cr
-	ldr r5, [sp, #100] //p_enc_cr
+    //Get the data from stack
+    ldr r4, [sp, #96] //p_dec_cr
+    ldr r5, [sp, #100] //p_enc_cr
 
-	//Get the top line data to 'd29(cb), d31(cr)'(16 bytes)
-	sub  r6, r0, r1
+    //Get the top line data to 'd29(cb), d31(cr)'(16 bytes)
+    sub  r6, r0, r1
     vld1.8 {d29}, [r6]
-	sub  r6, r4, r1
+    sub  r6, r4, r1
     vld1.8 {d31}, [r6]
 
-	//Get the left colume data to 'd28(cb), d30(cr)' (16 bytes)
-	sub  r6, r0, #1
-	GET_8BYTE_DATA_L0 d28, r6, r1
-	sub  r6, r4, #1
-	GET_8BYTE_DATA_L0 d30, r6, r1
+    //Get the left colume data to 'd28(cb), d30(cr)' (16 bytes)
+    sub  r6, r0, #1
+    GET_8BYTE_DATA_L0 d28, r6, r1
+    sub  r6, r4, #1
+    GET_8BYTE_DATA_L0 d30, r6, r1
 
-	//Calculate the 16x16_v mode SATD and save to "q12, 13"
-	vshll.u8 q0, d29, #2
-	vshll.u8 q1, d31, #2
-	vtrn.32  q0, q1
-	vadd.s16 q2, q0, q1
-	vsub.s16 q1, q0, q1
-	vtrn.16  q2, q1
-	vadd.s16 q13, q2, q1
-	vsub.s16 q12, q2, q1
-	vtrn.32  q13, q12 //{0,1,3,2, 4,5,7,6} q13
-	                  //{8,9,11,10, 12,13,15,14} q12
+    //Calculate the 16x16_v mode SATD and save to "q12, 13"
+    vshll.u8 q0, d29, #2
+    vshll.u8 q1, d31, #2
+    vtrn.32  q0, q1
+    vadd.s16 q2, q0, q1
+    vsub.s16 q1, q0, q1
+    vtrn.16  q2, q1
+    vadd.s16 q13, q2, q1
+    vsub.s16 q12, q2, q1
+    vtrn.32  q13, q12 //{0,1,3,2, 4,5,7,6} q13
+                      //{8,9,11,10, 12,13,15,14} q12
     //Calculate the 16x16_h mode SATD and save to "q10, q11"
-	vshll.u8 q0, d28, #2
-	vshll.u8 q1, d30, #2
-	vtrn.32  q0, q1
-	vadd.s16 q2, q0, q1
-	vsub.s16 q1, q0, q1
-	vtrn.16  q2, q1
-	vadd.s16 q11, q2, q1
-	vsub.s16 q10,  q2, q1
-	vtrn.32  q11, q10  //{0,1,3,2, 4,5,7,6} q11
-	                   //{8,9,11,10, 12,13,15,14} q10
+    vshll.u8 q0, d28, #2
+    vshll.u8 q1, d30, #2
+    vtrn.32  q0, q1
+    vadd.s16 q2, q0, q1
+    vsub.s16 q1, q0, q1
+    vtrn.16  q2, q1
+    vadd.s16 q11, q2, q1
+    vsub.s16 q10,  q2, q1
+    vtrn.32  q11, q10  //{0,1,3,2, 4,5,7,6} q11
+                       //{8,9,11,10, 12,13,15,14} q10
 
-	//Calculate the sum of left column and top row
-	//vmov.i32   q0, q14
+    //Calculate the sum of left column and top row
+    //vmov.i32   q0, q14
     vpaddl.u8  q0, q14
     vpaddl.u16 q0, q0
     vadd.u32   d2, d0, d1
@@ -588,77 +588,77 @@
     vpaddl.u16 q2, q2
     vadd.u32   d3, d4, d5
 
-	vtrn.32    q0, q2
-	vrshr.u32  q1, #3
-	vrshr.u32  q2, #2
-	vshll.u32  q9, d4, #4 // {2cb, 2cr} q9
-	vshll.u32  q8, d5, #4 // {1cb, 1cr} q8
-	vshll.u32  q7, d2, #4 // {0cb, 3cb} q7
-	vshll.u32  q6, d3, #4 // {0cr, 3cr} q6
+    vtrn.32    q0, q2
+    vrshr.u32  q1, #3
+    vrshr.u32  q2, #2
+    vshll.u32  q9, d4, #4 // {2cb, 2cr} q9
+    vshll.u32  q8, d5, #4 // {1cb, 1cr} q8
+    vshll.u32  q7, d2, #4 // {0cb, 3cb} q7
+    vshll.u32  q6, d3, #4 // {0cr, 3cr} q6
 
 
     vmov.i32 d28, #0//Save the SATD of DC_BOTH
-	vmov.i32 d10, #0//Save the SATD of H
-	vmov.i32 d11, #0//Save the SATD of V
-	vmov.i32 d30, #0//For zero D register
-	//Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
-	vld1.32  {d6}, [r2], r3
-	vld1.32  {d7}, [r2], r3
-	vld1.32  {d8}, [r2], r3
-	vld1.32  {d9}, [r2], r3
-	vtrn.32  d6, d7
-	vtrn.32  d8, d9
+    vmov.i32 d10, #0//Save the SATD of H
+    vmov.i32 d11, #0//Save the SATD of V
+    vmov.i32 d30, #0//For zero D register
+    //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
+    vld1.32  {d6}, [r2], r3
+    vld1.32  {d7}, [r2], r3
+    vld1.32  {d8}, [r2], r3
+    vld1.32  {d9}, [r2], r3
+    vtrn.32  d6, d7
+    vtrn.32  d8, d9
     HDM_TRANSFORM_4X4_L0 d6, d8, d26, d22, d14, d11, d10, d28, d30
     HDM_TRANSFORM_4X4_L0 d7, d9, d27, d22, d16, d11, d10, d28, d30
 
-	vld1.32  {d6}, [r5], r3
-	vld1.32  {d7}, [r5], r3
-	vld1.32  {d8}, [r5], r3
-	vld1.32  {d9}, [r5], r3
-	vtrn.32  d6, d7
-	vtrn.32  d8, d9
+    vld1.32  {d6}, [r5], r3
+    vld1.32  {d7}, [r5], r3
+    vld1.32  {d8}, [r5], r3
+    vld1.32  {d9}, [r5], r3
+    vtrn.32  d6, d7
+    vtrn.32  d8, d9
     HDM_TRANSFORM_4X4_L0 d6, d8, d24, d20, d12, d11, d10, d28, d30
     HDM_TRANSFORM_4X4_L0 d7, d9, d25, d20, d17, d11, d10, d28, d30
 
-	//Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
-	vld1.32  {d6}, [r2], r3
-	vld1.32  {d7}, [r2], r3
-	vld1.32  {d8}, [r2], r3
-	vld1.32  {d9}, [r2], r3
-	vtrn.32  d6, d7
-	vtrn.32  d8, d9
+    //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
+    vld1.32  {d6}, [r2], r3
+    vld1.32  {d7}, [r2], r3
+    vld1.32  {d8}, [r2], r3
+    vld1.32  {d9}, [r2], r3
+    vtrn.32  d6, d7
+    vtrn.32  d8, d9
     HDM_TRANSFORM_4X4_L0 d6, d8, d26, d23, d18, d11, d10, d28, d30
     HDM_TRANSFORM_4X4_L0 d7, d9, d27, d23, d15, d11, d10, d28, d30
 
-	vld1.32  {d6}, [r5], r3
-	vld1.32  {d7}, [r5], r3
-	vld1.32  {d8}, [r5], r3
-	vld1.32  {d9}, [r5], r3
-	vtrn.32  d6, d7
-	vtrn.32  d8, d9
+    vld1.32  {d6}, [r5], r3
+    vld1.32  {d7}, [r5], r3
+    vld1.32  {d8}, [r5], r3
+    vld1.32  {d9}, [r5], r3
+    vtrn.32  d6, d7
+    vtrn.32  d8, d9
     HDM_TRANSFORM_4X4_L0 d6, d8, d24, d21, d19, d11, d10, d28, d30
     HDM_TRANSFORM_4X4_L0 d7, d9, d25, d21, d13, d11, d10, d28, d30
 
-	//Get the data from stack
-	ldr r5, [sp, #84] //the addr of Best_mode
-	ldr r6, [sp, #88] //the value of i_lambda
+    //Get the data from stack
+    ldr r5, [sp, #84] //the addr of Best_mode
+    ldr r6, [sp, #88] //the value of i_lambda
 
-	vrshr.u16  d11, #1
-	vpaddl.u16 d11, d11
-	vpaddl.u32 d11, d11
-	vmov.u32   lr, d11[0]
-	add  lr, lr, r6, lsl #1
+    vrshr.u16  d11, #1
+    vpaddl.u16 d11, d11
+    vpaddl.u32 d11, d11
+    vmov.u32   lr, d11[0]
+    add  lr, lr, r6, lsl #1
 
-	vrshr.u16  d10, #1
-	vpaddl.u16 d10, d10
-	vpaddl.u32 d10, d10
-	vmov.u32   r3, d10[0]
-	add  r3, r3, r6, lsl #1
+    vrshr.u16  d10, #1
+    vpaddl.u16 d10, d10
+    vpaddl.u32 d10, d10
+    vmov.u32   r3, d10[0]
+    add  r3, r3, r6, lsl #1
 
-	vrshr.u16  d28, #1
-	vpaddl.u16 d28, d28
-	vpaddl.u32 d28, d28
-	vmov.u32   r2, d28[0]
+    vrshr.u16  d28, #1
+    vpaddl.u16 d28, d28
+    vpaddl.u32 d28, d28
+    vmov.u32   r2, d28[0]
 
     mov r6, #2
     cmp r3, lr
@@ -671,8 +671,8 @@
     str r6, [r5]
     mov r0, lr
 
-	vpop {q4-q7}
-	ldmia sp!, {r4-r7, lr}
+    vpop {q4-q7}
+    ldmia sp!, {r4-r7, lr}
 WELS_ASM_FUNC_END
 
 
@@ -680,118 +680,118 @@
     stmdb sp!, {r4-r7, lr}
 
     //Get the top line data to 'd31[0~3]'(4 bytes)
-	sub  r7, r0, r1
+    sub  r7, r0, r1
     vld1.32 {d31[0]}, [r7]
 
-	//Get the left colume data to 'd31[4~7]' (4 bytes)
-	sub  r7, r0, #1
+    //Get the left colume data to 'd31[4~7]' (4 bytes)
+    sub  r7, r0, #1
     vld1.8 {d31[4]}, [r7], r1
     vld1.8 {d31[5]}, [r7], r1
     vld1.8 {d31[6]}, [r7], r1
     vld1.8 {d31[7]}, [r7], r1
 
-	//Calculate the mean value and save to 'd30' (2 bytes)
-	vpaddl.u8 d0, d31
-	vpaddl.u16 d0, d0
-	vpaddl.u32 d0, d0
-	//Calculate the mean value
-	vrshr.u16  d0, #3
-	vshl.u16   d30, d0, #4
+    //Calculate the mean value and save to 'd30' (2 bytes)
+    vpaddl.u8 d0, d31
+    vpaddl.u16 d0, d0
+    vpaddl.u32 d0, d0
+    //Calculate the mean value
+    vrshr.u16  d0, #3
+    vshl.u16   d30, d0, #4
 
-	//Calculate the 16x16_v mode SATD and save to "d29"
+    //Calculate the 16x16_v mode SATD and save to "d29"
     //Calculate the 16x16_h mode SATD and save to "d28"
-	vshll.u8 q0, d31, #2
-	vtrn.32  d0, d1
-	vadd.s16 d2, d0, d1
-	vsub.s16 d1, d0, d1
-	vtrn.16  d2, d1
-	vadd.s16 d29, d2, d1
-	vsub.s16 d28, d2, d1
-	vtrn.32  d29, d28 //{0,1,3,2 top} d29
-	                  //{0,1,3,2 left} d28
+    vshll.u8 q0, d31, #2
+    vtrn.32  d0, d1
+    vadd.s16 d2, d0, d1
+    vsub.s16 d1, d0, d1
+    vtrn.16  d2, d1
+    vadd.s16 d29, d2, d1
+    vsub.s16 d28, d2, d1
+    vtrn.32  d29, d28 //{0,1,3,2 top} d29
+                      //{0,1,3,2 left} d28
 
     vmov.i32 d27, #0//Save the SATD of DC_BOTH
-	vmov.i32 d26, #0//Save the SATD of H
-	vmov.i32 d25, #0//Save the SATD of V
-	vmov.i32 d24, #0//For zero D register
+    vmov.i32 d26, #0//Save the SATD of H
+    vmov.i32 d25, #0//Save the SATD of V
+    vmov.i32 d24, #0//For zero D register
 
-	//Load the p_enc data and save to "d22,d23"--- 4X4 bytes
-	vld1.32  {d23[0]}, [r2], r3
-	vld1.32  {d23[1]}, [r2], r3
-	vld1.32  {d22[0]}, [r2], r3
-	vld1.32  {d22[1]}, [r2], r3
+    //Load the p_enc data and save to "d22,d23"--- 4X4 bytes
+    vld1.32  {d23[0]}, [r2], r3
+    vld1.32  {d23[1]}, [r2], r3
+    vld1.32  {d22[0]}, [r2], r3
+    vld1.32  {d22[1]}, [r2], r3
 
     HDM_TRANSFORM_4X4_L0 d23, d22, d29, d28, d30, d25, d26, d27, d24
 
-	//Get the data from stack
-	ldr r5, [sp, #28] //the value of lambda2
-	ldr r6, [sp, #32] //the value of lambda1
-	ldr r7, [sp, #36] //the value of lambda0
+    //Get the data from stack
+    ldr r5, [sp, #28] //the value of lambda2
+    ldr r6, [sp, #32] //the value of lambda1
+    ldr r7, [sp, #36] //the value of lambda0
 
-	vrshr.u16  d25, #1
-	vpaddl.u16 d25, d25
-	vpaddl.u32 d25, d25
-	vmov.u32   r0, d25[0]
-	add  r0, r7
+    vrshr.u16  d25, #1
+    vpaddl.u16 d25, d25
+    vpaddl.u32 d25, d25
+    vmov.u32   r0, d25[0]
+    add  r0, r7
 
-	vrshr.u16  d26, #1
-	vpaddl.u16 d26, d26
-	vpaddl.u32 d26, d26
-	vmov.u32   r1, d26[0]
-	add  r1, r6
+    vrshr.u16  d26, #1
+    vpaddl.u16 d26, d26
+    vpaddl.u32 d26, d26
+    vmov.u32   r1, d26[0]
+    add  r1, r6
 
-	vrshr.u16  d27, #1
-	vpaddl.u16 d27, d27
-	vpaddl.u32 d27, d27
-	vmov.u32   r2, d27[0]
-	add  r2, r5
+    vrshr.u16  d27, #1
+    vpaddl.u16 d27, d27
+    vpaddl.u32 d27, d27
+    vmov.u32   r2, d27[0]
+    add  r2, r5
 
-	ldr r5, [sp, #20] //p_dst
-	ldr r6, [sp, #24] //the addr of Best_mode
+    ldr r5, [sp, #20] //p_dst
+    ldr r6, [sp, #24] //the addr of Best_mode
 
-	mov r4, r0
-	cmp r1, r4
-	movcc r4, r1
-	cmp r2, r4
-	movcc r4, r2
+    mov r4, r0
+    cmp r1, r4
+    movcc r4, r1
+    cmp r2, r4
+    movcc r4, r2
 
-	//The compare sequence affect the resule
-	cmp r4, r2
-	bne satd_intra_4x4_x3_opt_jump0
-	mov r0, #2
-	str r0, [r6]
-	vshr.u32  d0, d30, #4 // {2cb, 2cr} q9
-	vdup.8 q1, d0[0]
-	vst1.8 {q1}, [r5]
-	//...
-	bl satd_intra_4x4_x3_opt_end
+    //The compare sequence affect the resule
+    cmp r4, r2
+    bne satd_intra_4x4_x3_opt_jump0
+    mov r0, #2
+    str r0, [r6]
+    vshr.u32  d0, d30, #4 // {2cb, 2cr} q9
+    vdup.8 q1, d0[0]
+    vst1.8 {q1}, [r5]
+    //...
+    bl satd_intra_4x4_x3_opt_end
 satd_intra_4x4_x3_opt_jump0:
 
-	cmp r4, r1
-	bne satd_intra_4x4_x3_opt_jump1
-	mov r0, #1
-	str r0, [r6]
-	vdup.8 d0, d31[4]
-	vdup.8 d1, d31[5]
-	vdup.8 d2, d31[6]
-	vdup.8 d3, d31[7]
-	vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r5]
+    cmp r4, r1
+    bne satd_intra_4x4_x3_opt_jump1
+    mov r0, #1
+    str r0, [r6]
+    vdup.8 d0, d31[4]
+    vdup.8 d1, d31[5]
+    vdup.8 d2, d31[6]
+    vdup.8 d3, d31[7]
+    vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r5]
 
-	bl satd_intra_4x4_x3_opt_end
+    bl satd_intra_4x4_x3_opt_end
 satd_intra_4x4_x3_opt_jump1:
 
-	mov r0, #0
-	str r0, [r6]
-	vst1.32 {d31[0]}, [r5]!
-	vst1.32 {d31[0]}, [r5]!
-	vst1.32 {d31[0]}, [r5]!
-	vst1.32 {d31[0]}, [r5]!
+    mov r0, #0
+    str r0, [r6]
+    vst1.32 {d31[0]}, [r5]!
+    vst1.32 {d31[0]}, [r5]!
+    vst1.32 {d31[0]}, [r5]!
+    vst1.32 {d31[0]}, [r5]!
 
 
 satd_intra_4x4_x3_opt_end:
-	mov r0, r4
+    mov r0, r4
 
-	ldmia sp!, {r4-r7, lr}
+    ldmia sp!, {r4-r7, lr}
 WELS_ASM_FUNC_END
 
 #endif
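
Throughout intra_pred_sad_3_opt_neon.S the HDM_TRANSFORM_4X4_L0 macro accumulates SATD costs for the vertical, horizontal and DC intra modes in one pass: the prediction for each mode is Hadamard-transformed once up front, and the macro then transforms each 4x4 source block and takes absolute differences in the transform domain. In scalar form the underlying cost is roughly the following; this is a generic sketch, not code from the patch, and the rounding convention may differ slightly from the assembly.

#include <stdint.h>

/* Hypothetical scalar sketch (not from this patch): 4x4 SATD between a source block
 * and a prediction block, i.e. the sum of absolute values of the 4x4 Hadamard
 * transform of their difference, halved as one common normalization. */
static int Satd4x4Ref (const uint8_t* pSrc, int iSrcStride, const uint8_t* pPred, int iPredStride) {
    int d[4][4], t[4], i, j, iSum = 0;
    for (j = 0; j < 4; j++)
        for (i = 0; i < 4; i++)
            d[j][i] = pSrc[j * iSrcStride + i] - pPred[j * iPredStride + i];
    for (j = 0; j < 4; j++) {  /* horizontal 4-point Hadamard butterfly */
        t[0] = d[j][0] + d[j][3]; t[3] = d[j][0] - d[j][3];
        t[1] = d[j][1] + d[j][2]; t[2] = d[j][1] - d[j][2];
        d[j][0] = t[0] + t[1];    d[j][2] = t[0] - t[1];
        d[j][1] = t[3] + t[2];    d[j][3] = t[3] - t[2];
    }
    for (i = 0; i < 4; i++) {  /* vertical 4-point Hadamard butterfly */
        t[0] = d[0][i] + d[3][i]; t[3] = d[0][i] - d[3][i];
        t[1] = d[1][i] + d[2][i]; t[2] = d[1][i] - d[2][i];
        d[0][i] = t[0] + t[1];    d[2][i] = t[0] - t[1];
        d[1][i] = t[3] + t[2];    d[3][i] = t[3] - t[2];
    }
    for (j = 0; j < 4; j++)
        for (i = 0; i < 4; i++)
            iSum += d[j][i] < 0 ? -d[j][i] : d[j][i];
    return (iSum + 1) >> 1;
}

The Combined3 routines in this file then add a lambda-weighted penalty to two of the three mode costs before picking the smallest and storing the winning mode index through the Best_mode pointer fetched from the stack.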
--- a/codec/encoder/core/arm/pixel_neon.S
+++ b/codec/encoder/core/arm/pixel_neon.S
@@ -66,10 +66,10 @@
     vsub.s16    q3, q12, q13
 
     vadd.s16    q8, q10, q11
-    vsub.s16	q9, q10, q11
+    vsub.s16    q9, q10, q11
 
     vadd.s16    q10, q14, q15
-    vsub.s16	q11, q14, q15
+    vsub.s16    q11, q14, q15
 
     vadd.s16    q12, q0, q2
     vsub.s16    q14, q0, q2
@@ -372,28 +372,28 @@
 WELS_ASM_FUNC_BEGIN WelsSampleSad4x4_neon
     stmdb sp!, {r4-r5, lr}
 
-	//Loading a horizontal line data (4 bytes)
-	//line 0
-	ldr r4, [r0], r1
-	ldr r5, [r2], r3
-	usad8  lr, r4, r5
+    //Loading a horizontal line data (4 bytes)
+    //line 0
+    ldr r4, [r0], r1
+    ldr r5, [r2], r3
+    usad8  lr, r4, r5
 
     //line 1
-	ldr r4, [r0], r1
-	ldr r5, [r2], r3
-	usada8  lr, r4, r5, lr
+    ldr r4, [r0], r1
+    ldr r5, [r2], r3
+    usada8  lr, r4, r5, lr
 
     //line 2
-	ldr r4, [r0], r1
-	ldr r5, [r2], r3
-	usada8  lr, r4, r5, lr
+    ldr r4, [r0], r1
+    ldr r5, [r2], r3
+    usada8  lr, r4, r5, lr
 
-	//line 3
-	ldr r4, [r0]
-	ldr r5, [r2]
-	usada8  r0, r4, r5, lr
+    //line 3
+    ldr r4, [r0]
+    ldr r5, [r2]
+    usada8  r0, r4, r5, lr
 
-	ldmia sp!, {r4-r5, lr}
+    ldmia sp!, {r4-r5, lr}
 WELS_ASM_FUNC_END
 
 
@@ -401,76 +401,76 @@
 
     stmdb sp!, {r4-r5, lr}
 
-	//Generate the pix2 start addr
-	sub   r4, r2, #1
-	add   r5, r2, #1
-	sub   r2, r3
+    //Generate the pix2 start addr
+    sub   r4, r2, #1
+    add   r5, r2, #1
+    sub   r2, r3
 
     //Loading a horizontal line data (16 bytes)
-	vld1.8 {q0}, [r0], r1 //save pix1
+    vld1.8 {q0}, [r0], r1 //save pix1
 
-	vld1.8 {q1}, [r2], r3 //save pix2 - stride
-	vld1.8 {q10}, [r2], r3 //save pix2
-	vld1.8 {q2}, [r2], r3 //save pix2 + stride
+    vld1.8 {q1}, [r2], r3 //save pix2 - stride
+    vld1.8 {q10}, [r2], r3 //save pix2
+    vld1.8 {q2}, [r2], r3 //save pix2 + stride
 
-	vld1.8 {q3}, [r4], r3 //save pix2 - 1
-	vld1.8 {q8}, [r5], r3 //save pix2 + 1
+    vld1.8 {q3}, [r4], r3 //save pix2 - 1
+    vld1.8 {q8}, [r5], r3 //save pix2 + 1
 
-	//Do the SAD for 16 bytes
-	vabdl.u8  q15, d0, d2
-	vabal.u8  q15, d1, d3
+    //Do the SAD for 16 bytes
+    vabdl.u8  q15, d0, d2
+    vabal.u8  q15, d1, d3
 
-	vabdl.u8  q13, d0, d4
-	vabal.u8  q13, d1, d5
+    vabdl.u8  q13, d0, d4
+    vabal.u8  q13, d1, d5
 
-	vabdl.u8  q11, d0, d6
-	vabal.u8  q11, d1, d7
+    vabdl.u8  q11, d0, d6
+    vabal.u8  q11, d1, d7
 
-	vabdl.u8  q9, d0, d16
-	vabal.u8  q9, d1, d17
+    vabdl.u8  q9, d0, d16
+    vabal.u8  q9, d1, d17
 
-	mov lr, #15
+    mov lr, #15
 pixel_sad_4_16x16_loop_0:
 
     //Loading a horizontal line data (16 bytes)
-	vld1.8 {q0}, [r0], r1 //save pix1
-	vmov.8 q1,   q10      //save pix2 - stride
-	vmov.8 q10,  q2
-	vabal.u8  q15, d0, d2
-	vld1.8 {q2}, [r2], r3 //save pix2 + stride
-	vabal.u8  q15, d1, d3
-	vld1.8 {q3}, [r4], r3 //save pix2 - 1
-	vabal.u8  q13, d0, d4
-	vld1.8 {q8}, [r5], r3 //save pix2 + 1
+    vld1.8 {q0}, [r0], r1 //save pix1
+    vmov.8 q1,   q10      //save pix2 - stride
+    vmov.8 q10,  q2
+    vabal.u8  q15, d0, d2
+    vld1.8 {q2}, [r2], r3 //save pix2 + stride
+    vabal.u8  q15, d1, d3
+    vld1.8 {q3}, [r4], r3 //save pix2 - 1
+    vabal.u8  q13, d0, d4
+    vld1.8 {q8}, [r5], r3 //save pix2 + 1
     vabal.u8  q13, d1, d5
-	subs lr, #1
+    subs lr, #1
 
-	vabal.u8  q11, d0, d6
-	vabal.u8  q11, d1, d7
+    vabal.u8  q11, d0, d6
+    vabal.u8  q11, d1, d7
 
-	vabal.u8  q9, d0, d16
-	vabal.u8  q9, d1, d17
+    vabal.u8  q9, d0, d16
+    vabal.u8  q9, d1, d17
 
-	bne pixel_sad_4_16x16_loop_0
+    bne pixel_sad_4_16x16_loop_0
 
 
     //Save SAD to 'r0'
-	ldr   r0, [sp, #12]
+    ldr   r0, [sp, #12]
 
-	vadd.u16   d0, d30, d31
-	vadd.u16   d1, d26, d27
-	vadd.u16   d2, d22, d23
-	vadd.u16   d3, d18, d19
+    vadd.u16   d0, d30, d31
+    vadd.u16   d1, d26, d27
+    vadd.u16   d2, d22, d23
+    vadd.u16   d3, d18, d19
 
-	vpaddl.u16 q0, q0
-	vpaddl.u16 q1, q1
+    vpaddl.u16 q0, q0
+    vpaddl.u16 q1, q1
 
-	vpaddl.u32 q0, q0
-	vpaddl.u32 q1, q1
+    vpaddl.u32 q0, q0
+    vpaddl.u32 q1, q1
 
-	vst4.32    {d0[0],d1[0],d2[0],d3[0]}, [r0]
+    vst4.32    {d0[0],d1[0],d2[0],d3[0]}, [r0]
 
-	ldmia sp!, {r4-r5, lr}
+    ldmia sp!, {r4-r5, lr}
 WELS_ASM_FUNC_END
 
 
@@ -477,75 +477,75 @@
 WELS_ASM_FUNC_BEGIN WelsSampleSadFour16x8_neon
     stmdb sp!, {r4-r5, lr}
 
-	//Generate the pix2 start addr
-	sub   r4, r2, #1
-	add   r5, r2, #1
-	sub   r2, r3
+    //Generate the pix2 start addr
+    sub   r4, r2, #1
+    add   r5, r2, #1
+    sub   r2, r3
 
     //Loading a horizontal line data (16 bytes)
-	vld1.8 {q0}, [r0], r1 //save pix1
+    vld1.8 {q0}, [r0], r1 //save pix1
 
-	vld1.8 {q1}, [r2], r3 //save pix2 - stride
-	vld1.8 {q10}, [r2], r3 //save pix2
-	vld1.8 {q2}, [r2], r3 //save pix2 + stride
+    vld1.8 {q1}, [r2], r3 //save pix2 - stride
+    vld1.8 {q10}, [r2], r3 //save pix2
+    vld1.8 {q2}, [r2], r3 //save pix2 + stride
 
-	vld1.8 {q3}, [r4], r3 //save pix2 - 1
-	vld1.8 {q8}, [r5], r3 //save pix2 + 1
+    vld1.8 {q3}, [r4], r3 //save pix2 - 1
+    vld1.8 {q8}, [r5], r3 //save pix2 + 1
 
-	//Do the SAD for 16 bytes
-	vabdl.u8  q15, d0, d2
-	vabal.u8  q15, d1, d3
+    //Do the SAD for 16 bytes
+    vabdl.u8  q15, d0, d2
+    vabal.u8  q15, d1, d3
 
-	vabdl.u8  q13, d0, d4
-	vabal.u8  q13, d1, d5
+    vabdl.u8  q13, d0, d4
+    vabal.u8  q13, d1, d5
 
-	vabdl.u8  q11, d0, d6
-	vabal.u8  q11, d1, d7
+    vabdl.u8  q11, d0, d6
+    vabal.u8  q11, d1, d7
 
-	vabdl.u8  q9, d0, d16
-	vabal.u8  q9, d1, d17
+    vabdl.u8  q9, d0, d16
+    vabal.u8  q9, d1, d17
 
-	mov lr, #7
+    mov lr, #7
 pixel_sad_4_16x8_loop_0:
 
     //Loading a horizontal line data (16 bytes)
-	vld1.8 {q0}, [r0], r1 //save pix1
-	vmov.8 q1,   q10      //save pix2 - stride
-	vmov.8 q10,  q2
-	vabal.u8  q15, d0, d2
-	vld1.8 {q2}, [r2], r3 //save pix2 + stride
-	vabal.u8  q15, d1, d3
-	vld1.8 {q3}, [r4], r3 //save pix2 - 1
-	vabal.u8  q13, d0, d4
-	vld1.8 {q8}, [r5], r3 //save pix2 + 1
+    vld1.8 {q0}, [r0], r1 //save pix1
+    vmov.8 q1,   q10      //save pix2 - stride
+    vmov.8 q10,  q2
+    vabal.u8  q15, d0, d2
+    vld1.8 {q2}, [r2], r3 //save pix2 + stride
+    vabal.u8  q15, d1, d3
+    vld1.8 {q3}, [r4], r3 //save pix2 - 1
+    vabal.u8  q13, d0, d4
+    vld1.8 {q8}, [r5], r3 //save pix2 + 1
     vabal.u8  q13, d1, d5
-	subs lr, #1
+    subs lr, #1
 
-	vabal.u8  q11, d0, d6
-	vabal.u8  q11, d1, d7
+    vabal.u8  q11, d0, d6
+    vabal.u8  q11, d1, d7
 
-	vabal.u8  q9, d0, d16
-	vabal.u8  q9, d1, d17
+    vabal.u8  q9, d0, d16
+    vabal.u8  q9, d1, d17
 
-	bne pixel_sad_4_16x8_loop_0
+    bne pixel_sad_4_16x8_loop_0
 
     //Save SAD to 'r0'
-	ldr   r0, [sp, #12]
+    ldr   r0, [sp, #12]
 
-	vadd.u16   d0, d30, d31
-	vadd.u16   d1, d26, d27
-	vadd.u16   d2, d22, d23
-	vadd.u16   d3, d18, d19
+    vadd.u16   d0, d30, d31
+    vadd.u16   d1, d26, d27
+    vadd.u16   d2, d22, d23
+    vadd.u16   d3, d18, d19
 
-	vpaddl.u16 q0, q0
-	vpaddl.u16 q1, q1
+    vpaddl.u16 q0, q0
+    vpaddl.u16 q1, q1
 
-	vpaddl.u32 q0, q0
-	vpaddl.u32 q1, q1
+    vpaddl.u32 q0, q0
+    vpaddl.u32 q1, q1
 
-	vst4.32    {d0[0],d1[0],d2[0],d3[0]}, [r0]
+    vst4.32    {d0[0],d1[0],d2[0],d3[0]}, [r0]
 
-	ldmia sp!, {r4-r5, lr}
+    ldmia sp!, {r4-r5, lr}
 WELS_ASM_FUNC_END
 
 
@@ -552,189 +552,189 @@
 WELS_ASM_FUNC_BEGIN WelsSampleSadFour8x16_neon
     stmdb sp!, {r4-r5, lr}
 
-	//Generate the pix2 start addr
-	sub   r4, r2, #1
-	add   r5, r2, #1
-	sub   r2, r3
+    //Generate the pix2 start addr
+    sub   r4, r2, #1
+    add   r5, r2, #1
+    sub   r2, r3
 
     //Loading a horizontal line data (8 bytes)
-	vld1.8 {d0}, [r0], r1 //save pix1
+    vld1.8 {d0}, [r0], r1 //save pix1
 
-	vld1.8 {d1}, [r2], r3 //save pix2 - stride
-	vld1.8 {d6}, [r2], r3 //save pix2
-	vld1.8 {d2}, [r2], r3 //save pix2 + stride
+    vld1.8 {d1}, [r2], r3 //save pix2 - stride
+    vld1.8 {d6}, [r2], r3 //save pix2
+    vld1.8 {d2}, [r2], r3 //save pix2 + stride
 
-	vld1.8 {d3}, [r4], r3 //save pix2 - 1
-	vld1.8 {d4}, [r5], r3 //save pix2 + 1
+    vld1.8 {d3}, [r4], r3 //save pix2 - 1
+    vld1.8 {d4}, [r5], r3 //save pix2 + 1
 
-	//Do the SAD for 8 bytes
-	vabdl.u8  q15, d0, d1
-	vabdl.u8  q14, d0, d2
-	vabdl.u8  q13, d0, d3
-	vabdl.u8  q12, d0, d4
+    //Do the SAD for 8 bytes
+    vabdl.u8  q15, d0, d1
+    vabdl.u8  q14, d0, d2
+    vabdl.u8  q13, d0, d3
+    vabdl.u8  q12, d0, d4
 
-	mov lr, #15
+    mov lr, #15
 pixel_sad_4_8x16_loop_0:
 
     //Loading a horizontal line data (8 bytes)
-	vld1.8 {d0}, [r0], r1 //save pix1
-	vmov.8 d1,   d6       //save pix2 - stride
-	vmov.8 d6,   d2
-	vld1.8 {d2}, [r2], r3 //save pix2 + stride
-	vld1.8 {d3}, [r4], r3 //save pix2 - 1
-	vabal.u8  q15, d0, d1
+    vld1.8 {d0}, [r0], r1 //save pix1
+    vmov.8 d1,   d6       //save pix2 - stride
+    vmov.8 d6,   d2
+    vld1.8 {d2}, [r2], r3 //save pix2 + stride
+    vld1.8 {d3}, [r4], r3 //save pix2 - 1
+    vabal.u8  q15, d0, d1
 
-	vld1.8 {d4}, [r5], r3 //save pix2 + 1
-	//Do the SAD for 8 bytes
-	vabal.u8  q14, d0, d2
-	vabal.u8  q13, d0, d3
-	vabal.u8  q12, d0, d4
+    vld1.8 {d4}, [r5], r3 //save pix2 + 1
+    //Do the SAD for 8 bytes
+    vabal.u8  q14, d0, d2
+    vabal.u8  q13, d0, d3
+    vabal.u8  q12, d0, d4
     subs lr, #1
 
-	bne pixel_sad_4_8x16_loop_0
+    bne pixel_sad_4_8x16_loop_0
 
     //Save SAD to 'r0'
-	ldr   r0, [sp, #12]
+    ldr   r0, [sp, #12]
 
-	vadd.u16   d0, d30, d31
-	vadd.u16   d1, d28, d29
-	vadd.u16   d2, d26, d27
-	vadd.u16   d3, d24, d25
+    vadd.u16   d0, d30, d31
+    vadd.u16   d1, d28, d29
+    vadd.u16   d2, d26, d27
+    vadd.u16   d3, d24, d25
 
-	vpaddl.u16 q0, q0
-	vpaddl.u16 q1, q1
+    vpaddl.u16 q0, q0
+    vpaddl.u16 q1, q1
 
-	vpaddl.u32 q0, q0
-	vpaddl.u32 q1, q1
+    vpaddl.u32 q0, q0
+    vpaddl.u32 q1, q1
 
-	vst4.32    {d0[0],d1[0],d2[0],d3[0]}, [r0]
+    vst4.32    {d0[0],d1[0],d2[0],d3[0]}, [r0]
 
-	ldmia sp!, {r4-r5, lr}
+    ldmia sp!, {r4-r5, lr}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN WelsSampleSadFour8x8_neon
-	stmdb sp!, {r4-r5, lr}
+    stmdb sp!, {r4-r5, lr}
 
-	//Generate the pix2 start addr
-	sub   r4, r2, #1
-	add   r5, r2, #1
-	sub   r2, r3
+    //Generate the pix2 start addr
+    sub   r4, r2, #1
+    add   r5, r2, #1
+    sub   r2, r3
 
     //Loading a horizontal line data (8 bytes)
-	vld1.8 {d0}, [r0], r1 //save pix1
+    vld1.8 {d0}, [r0], r1 //save pix1
 
-	vld1.8 {d1}, [r2], r3 //save pix2 - stride
-	vld1.8 {d6}, [r2], r3 //save pix2
-	vld1.8 {d2}, [r2], r3 //save pix2 + stride
+    vld1.8 {d1}, [r2], r3 //save pix2 - stride
+    vld1.8 {d6}, [r2], r3 //save pix2
+    vld1.8 {d2}, [r2], r3 //save pix2 + stride
 
-	vld1.8 {d3}, [r4], r3 //save pix2 - 1
-	vld1.8 {d4}, [r5], r3 //save pix2 + 1
+    vld1.8 {d3}, [r4], r3 //save pix2 - 1
+    vld1.8 {d4}, [r5], r3 //save pix2 + 1
 
-	//Do the SAD for 8 bytes
-	vabdl.u8  q15, d0, d1
-	vabdl.u8  q14, d0, d2
-	vabdl.u8  q13, d0, d3
-	vabdl.u8  q12, d0, d4
+    //Do the SAD for 8 bytes
+    vabdl.u8  q15, d0, d1
+    vabdl.u8  q14, d0, d2
+    vabdl.u8  q13, d0, d3
+    vabdl.u8  q12, d0, d4
 
-	mov lr, #7
+    mov lr, #7
 pixel_sad_4_8x8_loop_0:
 
     //Loading a horizontal line data (8 bytes)
-	vld1.8 {d0}, [r0], r1 //save pix1
-	vmov.8 d1,   d6       //save pix2 - stride
-	vmov.8 d6,   d2
-	vld1.8 {d2}, [r2], r3 //save pix2 + stride
-	vld1.8 {d3}, [r4], r3 //save pix2 - 1
-	vabal.u8  q15, d0, d1
+    vld1.8 {d0}, [r0], r1 //save pix1
+    vmov.8 d1,   d6       //save pix2 - stride
+    vmov.8 d6,   d2
+    vld1.8 {d2}, [r2], r3 //save pix2 + stride
+    vld1.8 {d3}, [r4], r3 //save pix2 - 1
+    vabal.u8  q15, d0, d1
 
-	vld1.8 {d4}, [r5], r3 //save pix2 + 1
-	//Do the SAD for 8 bytes
-	vabal.u8  q14, d0, d2
-	vabal.u8  q13, d0, d3
-	vabal.u8  q12, d0, d4
+    vld1.8 {d4}, [r5], r3 //save pix2 + 1
+    //Do the SAD for 8 bytes
+    vabal.u8  q14, d0, d2
+    vabal.u8  q13, d0, d3
+    vabal.u8  q12, d0, d4
     subs lr, #1
-	bne pixel_sad_4_8x8_loop_0
+    bne pixel_sad_4_8x8_loop_0
 
     //Save SAD to 'r0'
-	ldr   r0, [sp, #12]
+    ldr   r0, [sp, #12]
 
-	vadd.u16   d0, d30, d31
-	vadd.u16   d1, d28, d29
-	vadd.u16   d2, d26, d27
-	vadd.u16   d3, d24, d25
+    vadd.u16   d0, d30, d31
+    vadd.u16   d1, d28, d29
+    vadd.u16   d2, d26, d27
+    vadd.u16   d3, d24, d25
 
-	vpaddl.u16 q0, q0
-	vpaddl.u16 q1, q1
+    vpaddl.u16 q0, q0
+    vpaddl.u16 q1, q1
 
-	vpaddl.u32 q0, q0
-	vpaddl.u32 q1, q1
+    vpaddl.u32 q0, q0
+    vpaddl.u32 q1, q1
 
-	vst4.32    {d0[0],d1[0],d2[0],d3[0]}, [r0]
+    vst4.32    {d0[0],d1[0],d2[0],d3[0]}, [r0]
 
-	ldmia sp!, {r4-r5, lr}
+    ldmia sp!, {r4-r5, lr}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN WelsSampleSadFour4x4_neon
 
-	vld1.32  {d0[0]}, [r0], r1
-	vld1.32  {d0[1]}, [r0], r1
-	vld1.32  {d1[0]}, [r0], r1
-	vld1.32  {d1[1]}, [r0]
+    vld1.32  {d0[0]}, [r0], r1
+    vld1.32  {d0[1]}, [r0], r1
+    vld1.32  {d1[0]}, [r0], r1
+    vld1.32  {d1[1]}, [r0]
 
 
-	sub   r0, r2, r3
-	vld1.32  {d2[0]}, [r0], r3
-	vld1.32  {d2[1]}, [r0], r3
-	vld1.32  {d3[0]}, [r0], r3
-	vld1.32  {d3[1]}, [r0], r3
-	vld1.32  {d4[0]}, [r0], r3
-	vld1.32  {d4[1]}, [r0]
+    sub   r0, r2, r3
+    vld1.32  {d2[0]}, [r0], r3
+    vld1.32  {d2[1]}, [r0], r3
+    vld1.32  {d3[0]}, [r0], r3
+    vld1.32  {d3[1]}, [r0], r3
+    vld1.32  {d4[0]}, [r0], r3
+    vld1.32  {d4[1]}, [r0]
 
-	sub   r0,  r2, #1
-	vld1.32  {d5[0]}, [r0], r3
-	vld1.32  {d5[1]}, [r0], r3
-	vld1.32  {d6[0]}, [r0], r3
-	vld1.32  {d6[1]}, [r0]
+    sub   r0,  r2, #1
+    vld1.32  {d5[0]}, [r0], r3
+    vld1.32  {d5[1]}, [r0], r3
+    vld1.32  {d6[0]}, [r0], r3
+    vld1.32  {d6[1]}, [r0]
 
-	add   r0,  r2, #1
-	vld1.32  {d7[0]}, [r0], r3
-	vld1.32  {d7[1]}, [r0], r3
-	vld1.32  {d8[0]}, [r0], r3
-	vld1.32  {d8[1]}, [r0]
+    add   r0,  r2, #1
+    vld1.32  {d7[0]}, [r0], r3
+    vld1.32  {d7[1]}, [r0], r3
+    vld1.32  {d8[0]}, [r0], r3
+    vld1.32  {d8[1]}, [r0]
 
-	vabdl.u8  q15, d0, d2
-	vabdl.u8  q14, d1, d3
+    vabdl.u8  q15, d0, d2
+    vabdl.u8  q14, d1, d3
 
-	vabdl.u8  q13, d0, d3
-	vabdl.u8  q12, d1, d4
+    vabdl.u8  q13, d0, d3
+    vabdl.u8  q12, d1, d4
 
-	vabdl.u8  q11, d0, d5
-	vabdl.u8  q10, d1, d6
+    vabdl.u8  q11, d0, d5
+    vabdl.u8  q10, d1, d6
 
-	vabdl.u8  q9, d0, d7
-	vabdl.u8  q8, d1, d8
+    vabdl.u8  q9, d0, d7
+    vabdl.u8  q8, d1, d8
 
-	//Save SAD to 'r4'
-	ldr   r0, [sp]
-	vadd.u16   q0, q14, q15
-	vadd.u16   q1, q12, q13
-	vadd.u16   q2, q10, q11
-	vadd.u16   q3, q8 , q9
+    //Save SAD to 'r4'
+    ldr   r0, [sp]
+    vadd.u16   q0, q14, q15
+    vadd.u16   q1, q12, q13
+    vadd.u16   q2, q10, q11
+    vadd.u16   q3, q8 , q9
 
-	vadd.u16   d0, d1
-	vadd.u16   d1, d2, d3
-	vadd.u16   d2, d4, d5
-	vadd.u16   d3, d6, d7
+    vadd.u16   d0, d1
+    vadd.u16   d1, d2, d3
+    vadd.u16   d2, d4, d5
+    vadd.u16   d3, d6, d7
 
-	vpaddl.u16 q0, q0
-	vpaddl.u16 q1, q1
+    vpaddl.u16 q0, q0
+    vpaddl.u16 q1, q1
 
-	vpaddl.u32 q0, q0
-	vpaddl.u32 q1, q1
+    vpaddl.u32 q0, q0
+    vpaddl.u32 q1, q1
 
-	vst4.32    {d0[0],d1[0],d2[0],d3[0]}, [r0]
+    vst4.32    {d0[0],d1[0],d2[0],d3[0]}, [r0]
 
 WELS_ASM_FUNC_END
 
@@ -834,16 +834,16 @@
 WELS_ASM_FUNC_BEGIN WelsSampleSatd4x4_neon
 
     //Load the pix1 data --- 16 bytes
-	vld1.32  {d0[0]}, [r0], r1
-	vld1.32  {d0[1]}, [r0], r1
-	vld1.32  {d1[0]}, [r0], r1
-	vld1.32  {d1[1]}, [r0]
+    vld1.32  {d0[0]}, [r0], r1
+    vld1.32  {d0[1]}, [r0], r1
+    vld1.32  {d1[0]}, [r0], r1
+    vld1.32  {d1[1]}, [r0]
 
     //Load the pix2 data --- 16 bytes
-	vld1.32  {d2[0]}, [r2], r3
-	vld1.32  {d2[1]}, [r2], r3
-	vld1.32  {d3[0]}, [r2], r3
-	vld1.32  {d3[1]}, [r2]
+    vld1.32  {d2[0]}, [r2], r3
+    vld1.32  {d2[1]}, [r2], r3
+    vld1.32  {d3[0]}, [r2], r3
+    vld1.32  {d3[1]}, [r2]
 
     //Get the difference
     vsubl.u8 q15, d0, d2 //{0,1,2,3,4,5,6,7}
@@ -874,7 +874,7 @@
     vpaddl.u16 d0, d0
     vpaddl.u32 d0, d0
 
-	vmov.u32   r0, d0[0]
+    vmov.u32   r0, d0[0]
 
 WELS_ASM_FUNC_END
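
The WelsSampleSadFour* routines above evaluate the SAD of one source block against four neighbouring candidate positions at once: one pixel above, below, left and right of the given reference position (the "pix2 - stride", "pix2 + stride", "pix2 - 1", "pix2 + 1" loads), writing the four sums out with a single vst4.32. A generic scalar sketch of that behaviour, not taken from the patch and with a hypothetical signature, is:

#include <stdint.h>

/* Hypothetical scalar sketch (not from this patch): block SAD against the four
 * one-pixel neighbours of a candidate position, results in pSad[0..3] in the
 * order up, down, left, right. The NEON versions are specialized per block size
 * (16x16, 16x8, 8x16, 8x8, 4x4) instead of taking width/height parameters. */
static void SampleSadFourRef (const uint8_t* pSrc, int iSrcStride,
                              const uint8_t* pRef, int iRefStride,
                              int iWidth, int iHeight, int32_t* pSad) {
    const uint8_t* pNeighbour[4];
    int k, x, y;
    pNeighbour[0] = pRef - iRefStride;  /* up    */
    pNeighbour[1] = pRef + iRefStride;  /* down  */
    pNeighbour[2] = pRef - 1;           /* left  */
    pNeighbour[3] = pRef + 1;           /* right */
    for (k = 0; k < 4; k++) {
        int32_t iSum = 0;
        for (y = 0; y < iHeight; y++) {
            for (x = 0; x < iWidth; x++) {
                int iDiff = pSrc[y * iSrcStride + x] - pNeighbour[k][y * iRefStride + x];
                iSum += iDiff < 0 ? -iDiff : iDiff;
            }
        }
        pSad[k] = iSum;
    }
}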
 
--- a/codec/encoder/core/arm/reconstruct_neon.S
+++ b/codec/encoder/core/arm/reconstruct_neon.S
@@ -36,591 +36,591 @@
 
 #ifdef __APPLE__
 .macro LOAD_4x4_DATA_FOR_DCT
-//	{	//	input: $0~$3, src1*, src1_stride, src2*, src2_stride
-    vld2.16	{$0[0],$1[0]}, [$4], $5
-    vld2.16	{$2[0],$3[0]}, [$6], $7
-    vld2.16	{$0[1],$1[1]}, [$4], $5
-    vld2.16	{$2[1],$3[1]}, [$6], $7
+//  {   //  input: $0~$3, src1*, src1_stride, src2*, src2_stride
+    vld2.16 {$0[0],$1[0]}, [$4], $5
+    vld2.16 {$2[0],$3[0]}, [$6], $7
+    vld2.16 {$0[1],$1[1]}, [$4], $5
+    vld2.16 {$2[1],$3[1]}, [$6], $7
 
-    vld2.16	{$0[2],$1[2]}, [$4], $5
-    vld2.16	{$2[2],$3[2]}, [$6], $7
-    vld2.16	{$0[3],$1[3]}, [$4], $5
-    vld2.16	{$2[3],$3[3]}, [$6], $7
-//	}
+    vld2.16 {$0[2],$1[2]}, [$4], $5
+    vld2.16 {$2[2],$3[2]}, [$6], $7
+    vld2.16 {$0[3],$1[3]}, [$4], $5
+    vld2.16 {$2[3],$3[3]}, [$6], $7
+//  }
 .endm
 
 .macro LOAD_8x8_DATA_FOR_DCT
-//	{	//	input: $0~$3, src1*, src2*; untouched r2:src1_stride &r4:src2_stride
-    vld1.64	{$0}, [$8], r2
-    vld1.64	{$4}, [$9], r4
-    vld1.64	{$1}, [$8], r2
-    vld1.64	{$5}, [$9], r4
+//  {   //  input: $0~$3, src1*, src2*; untouched r2:src1_stride &r4:src2_stride
+    vld1.64 {$0}, [$8], r2
+    vld1.64 {$4}, [$9], r4
+    vld1.64 {$1}, [$8], r2
+    vld1.64 {$5}, [$9], r4
 
-    vld1.64	{$2}, [$8], r2
-    vld1.64	{$6}, [$9], r4
-    vld1.64	{$3}, [$8], r2
-    vld1.64	{$7}, [$9], r4
-//	}
+    vld1.64 {$2}, [$8], r2
+    vld1.64 {$6}, [$9], r4
+    vld1.64 {$3}, [$8], r2
+    vld1.64 {$7}, [$9], r4
+//  }
 .endm
 
 .macro DCT_ROW_TRANSFORM_TOTAL_16BITS
-//	{	//	input: src_d[0]~[3], working: [4]~[7]
-    vadd.s16		$4, $0, $3			//int16 s[0] = data[i] + data[i3];
-    vsub.s16		$7, $0, $3			//int16 s[3] = data[i] - data[i3];
-    vadd.s16		$5, $1, $2			//int16 s[1] = data[i1] + data[i2];
-    vsub.s16		$6, $1, $2			//int16 s[2] = data[i1] - data[i2];
+//  {   //  input: src_d[0]~[3], working: [4]~[7]
+    vadd.s16        $4, $0, $3          //int16 s[0] = data[i] + data[i3];
+    vsub.s16        $7, $0, $3          //int16 s[3] = data[i] - data[i3];
+    vadd.s16        $5, $1, $2          //int16 s[1] = data[i1] + data[i2];
+    vsub.s16        $6, $1, $2          //int16 s[2] = data[i1] - data[i2];
 
-    vadd.s16		$0, $4, $5			//int16 dct[i ] = s[0] + s[1];
-    vsub.s16		$2, $4, $5			//int16 dct[i2] = s[0] - s[1];
-    vshl.s16		$1, $7, #1
-    vshl.s16		$3, $6, #1
-    vadd.s16		$1, $1, $6			//int16 dct[i1] = (s[3] << 1) + s[2];
-    vsub.s16		$3, $7, $3			//int16 dct[i3] = s[3] - (s[2] << 1);
-//	}
+    vadd.s16        $0, $4, $5          //int16 dct[i ] = s[0] + s[1];
+    vsub.s16        $2, $4, $5          //int16 dct[i2] = s[0] - s[1];
+    vshl.s16        $1, $7, #1
+    vshl.s16        $3, $6, #1
+    vadd.s16        $1, $1, $6          //int16 dct[i1] = (s[3] << 1) + s[2];
+    vsub.s16        $3, $7, $3          //int16 dct[i3] = s[3] - (s[2] << 1);
+//  }
 .endm
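
The inline comments in DCT_ROW_TRANSFORM_TOTAL_16BITS spell out the 4-point forward transform butterfly. Transcribed into scalar C (illustrative name, operating on one row of four int16 values) it reads:

    #include <stdint.h>

    /* Sketch of the butterfly documented in the macro's comments. */
    static void DctRow4Sketch (int16_t d[4]) {
        int16_t s0 = d[0] + d[3], s3 = d[0] - d[3];
        int16_t s1 = d[1] + d[2], s2 = d[1] - d[2];
        d[0] = s0 + s1;                    /* dct[i ] */
        d[2] = s0 - s1;                    /* dct[i2] */
        d[1] = (int16_t)((s3 << 1) + s2);  /* dct[i1] */
        d[3] = (int16_t)(s3 - (s2 << 1));  /* dct[i3] */
    }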
 
 .macro MATRIX_TRANSFORM_EACH_16BITS
-//	{	//	input & output: src_d[0]~[3];[0 1 2 3]+[4 5 6 7]+[8 9 10 11]+[12 13 14 15]
-    vtrn.s16		$0, $1				//[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
-    vtrn.s16		$2, $3				//[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
-    vtrn.32		$0, $2				//[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
-    vtrn.32		$1, $3				//[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
-//	}
+//  {   //  input & output: src_d[0]~[3];[0 1 2 3]+[4 5 6 7]+[8 9 10 11]+[12 13 14 15]
+    vtrn.s16        $0, $1              //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
+    vtrn.s16        $2, $3              //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
+    vtrn.32     $0, $2              //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
+    vtrn.32     $1, $3              //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
+//  }
 .endm
 
-.macro NEWQUANT_COEF_EACH_16BITS	// if coef <= 0, - coef; else , coef;
-//	{	//	input:	coef, ff (dst), ff_d0, ff_d1, mf_d0, md_d1
-    veor.s16		$6, $6			// init 0 , and keep 0;
-    vaba.s16		$1, $0, $6		// f + abs(coef - 0)
-    vmull.s16		$7, $2, $4
-    vmull.s16		$8, $3, $5
-    vshr.s32		$7, #16
-    vshr.s32		$8, #16
-    vmovn.s32		$2, $7
-    vmovn.s32		$3, $8
+.macro NEWQUANT_COEF_EACH_16BITS    // if coef <= 0, - coef; else , coef;
+//  {   //  input:  coef, ff (dst), ff_d0, ff_d1, mf_d0, md_d1
+    veor.s16        $6, $6          // init 0 , and keep 0;
+    vaba.s16        $1, $0, $6      // f + abs(coef - 0)
+    vmull.s16       $7, $2, $4
+    vmull.s16       $8, $3, $5
+    vshr.s32        $7, #16
+    vshr.s32        $8, #16
+    vmovn.s32       $2, $7
+    vmovn.s32       $3, $8
 
-    vcgt.s16		$7, $0, #0		// if true, location of coef == 11111111
-    vbif.s16		$6, $1, $7		// if (x<0) reserved part; else keep 0 untouched
-    vshl.s16		$6, #1
-    vsub.s16		$1, $1, $6		// if x > 0, -= 0; else x-= 2x
-//	}
+    vcgt.s16        $7, $0, #0      // if true, location of coef == 11111111
+    vbif.s16        $6, $1, $7      // if (x<0) reserved part; else keep 0 untouched
+    vshl.s16        $6, #1
+    vsub.s16        $1, $1, $6      // if x > 0, -= 0; else x-= 2x
+//  }
 .endm
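
NEWQUANT_COEF_EACH_16BITS implements, per 16-bit coefficient, "add the offset to the magnitude, scale, shift, restore the sign". A scalar sketch of exactly what the macro computes (illustrative name; f and mf are the per-coefficient offset and multiplier the caller loads) is:

    #include <stdint.h>
    #include <stdlib.h>

    /* Sketch mirroring the macro: q = ((|coef| + f) * mf) >> 16, negated when
     * the input coefficient is not positive, as the vcgt/vbif/vsub tail does. */
    static int16_t QuantCoefSketch (int16_t coef, int16_t f, int16_t mf) {
        int32_t q = ((abs (coef) + f) * mf) >> 16;
        return (int16_t)(coef > 0 ? q : -q);
    }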
 
-.macro NEWQUANT_COEF_EACH_16BITS_MAX	// if coef <= 0, - coef; else , coef;
-//	{	//	input:	coef, ff (dst), ff_d0, ff_d1, mf_d0(max), md_d1
-    veor.s16		$6, $6			// init 0 , and keep 0;
-    vaba.s16		$1, $0, $6		// f + abs(coef - 0)
-    vmull.s16		$7, $2, $4
-    vmull.s16		$8, $3, $5
-    vshr.s32		$7, #16
-    vshr.s32		$8, #16
-    vmovn.s32		$2, $7
-    vmovn.s32		$3, $8
+.macro NEWQUANT_COEF_EACH_16BITS_MAX    // if coef <= 0, - coef; else , coef;
+//  {   //  input:  coef, ff (dst), ff_d0, ff_d1, mf_d0(max), md_d1
+    veor.s16        $6, $6          // init 0 , and keep 0;
+    vaba.s16        $1, $0, $6      // f + abs(coef - 0)
+    vmull.s16       $7, $2, $4
+    vmull.s16       $8, $3, $5
+    vshr.s32        $7, #16
+    vshr.s32        $8, #16
+    vmovn.s32       $2, $7
+    vmovn.s32       $3, $8
 
-    vcgt.s16		$7, $0, #0		// if true, location of coef == 11111111
-    vbif.s16		$6, $1, $7		// if (x<0) reserved part; else keep 0 untouched
-    vshl.s16		$6, #1
-    vmax.s16		$9, $2, $3
-    vsub.s16		$1, $1, $6		// if x > 0, -= 0; else x-= 2x
-//	}
+    vcgt.s16        $7, $0, #0      // if true, location of coef == 11111111
+    vbif.s16        $6, $1, $7      // if (x<0) reserved part; else keep 0 untouched
+    vshl.s16        $6, #1
+    vmax.s16        $9, $2, $3
+    vsub.s16        $1, $1, $6      // if x > 0, -= 0; else x-= 2x
+//  }
 .endm
 
-.macro QUANT_DUALWORD_COEF_EACH_16BITS	// if coef <= 0, - coef; else , coef;
-//	{	//	input:	coef, ff (dst), mf , working_d (all 0), working_q
-    vaba.s16		$1, $0, $3		// f + abs(coef - 0)
-    vmull.s16		$4, $1, $2		// *= mf
-    vshr.s32		$4, #16
-    vmovn.s32		$1, $4			// >> 16
+.macro QUANT_DUALWORD_COEF_EACH_16BITS  // if coef <= 0, - coef; else , coef;
+//  {   //  input:  coef, ff (dst), mf , working_d (all 0), working_q
+    vaba.s16        $1, $0, $3      // f + abs(coef - 0)
+    vmull.s16       $4, $1, $2      // *= mf
+    vshr.s32        $4, #16
+    vmovn.s32       $1, $4          // >> 16
 
-    vcgt.s16		$2, $0, #0		// if true, location of coef == 11111111
-    vbif.s16		$3, $1, $2		// if (x<0) reserved part; else keep 0 untouched
-    vshl.s16		$3, #1
-    vsub.s16		$1, $1, $3		// if x > 0, -= 0; else x-= 2x
-//	}
+    vcgt.s16        $2, $0, #0      // if true, location of coef == 11111111
+    vbif.s16        $3, $1, $2      // if (x<0) reserved part; else keep 0 untouched
+    vshl.s16        $3, #1
+    vsub.s16        $1, $1, $3      // if x > 0, -= 0; else x-= 2x
+//  }
 .endm
 
 .macro DC_ZERO_COUNT_IN_DUALWORD
-//	{	//	input:	coef, dst_d, working_d (all 0x01)
-    vceq.s16	$1, $0, #0
-    vand.s16	$1, $2
-    vpadd.s16	$1, $1, $1
-    vpadd.s16	$1, $1, $1
-//	}
+//  {   //  input:  coef, dst_d, working_d (all 0x01)
+    vceq.s16    $1, $0, #0
+    vand.s16    $1, $2
+    vpadd.s16   $1, $1, $1
+    vpadd.s16   $1, $1, $1
+//  }
 .endm
 
 .macro SELECT_MAX_IN_ABS_COEF
-//	{	//	input:	coef_0, coef_1, max_q (identy to follow two)
-    vmax.s16		$2, $0, $1		// max 1st in $3 & max 2nd in $4
-    vpmax.s16		$3, $3, $4		// max 1st in $3[0][1] & max 2nd in $3[2][3]
-    vpmax.s16		$3, $3, $4		// max 1st in $3[0][1]
-//	}
+//  {   //  input:  coef_0, coef_1, max_q (identy to follow two)
+    vmax.s16        $2, $0, $1      // max 1st in $3 & max 2nd in $4
+    vpmax.s16       $3, $3, $4      // max 1st in $3[0][1] & max 2nd in $3[2][3]
+    vpmax.s16       $3, $3, $4      // max 1st in $3[0][1]
+//  }
 .endm
 
 .macro ZERO_COUNT_IN_2_QUARWORD
-//	{	//	input:	coef_0 (identy to $3 $4), coef_1(identy to $5 $6), mask_q
-    vceq.s16	$0, #0
-    vceq.s16	$1, #0
-    vand.s16	$0, $2
-    vand.s16	$1, $2
+//  {   //  input:  coef_0 (identy to $3 $4), coef_1(identy to $5 $6), mask_q
+    vceq.s16    $0, #0
+    vceq.s16    $1, #0
+    vand.s16    $0, $2
+    vand.s16    $1, $2
 
-    vpadd.s16	$3, $3, $5
-    vpadd.s16	$4, $4, $6
-    vpadd.s16	$3, $3, $4		// 8-->4
-    vpadd.s16	$3, $3, $3
-    vpadd.s16	$3, $3, $3
-//	}
+    vpadd.s16   $3, $3, $5
+    vpadd.s16   $4, $4, $6
+    vpadd.s16   $3, $3, $4      // 8-->4
+    vpadd.s16   $3, $3, $3
+    vpadd.s16   $3, $3, $3
+//  }
 .endm
 
 .macro HDM_QUANT_2x2_TOTAL_16BITS
-//	{	//	input: src_d[0]~[3], working_d, dst_d
-    vshr.s64	$1, $0, #32
-    vadd.s16	$2, $0, $1		// [0] = rs[0] + rs[32];[1] = rs[16] + rs[48];
-    vsub.s16	$1, $0, $1		// [0] = rs[0] - rs[32];[1] = rs[16] - rs[48];
-    vtrn.s16	$2, $1
-    vtrn.s32	$2, $1
-//	}
+//  {   //  input: src_d[0]~[3], working_d, dst_d
+    vshr.s64    $1, $0, #32
+    vadd.s16    $2, $0, $1      // [0] = rs[0] + rs[32];[1] = rs[16] + rs[48];
+    vsub.s16    $1, $0, $1      // [0] = rs[0] - rs[32];[1] = rs[16] - rs[48];
+    vtrn.s16    $2, $1
+    vtrn.s32    $2, $1
+//  }
 .endm
 
 .macro IHDM_4x4_TOTAL_16BITS
-//	{	//	input: each src_d[0]~[3](dst), working_q0, working_q1, working_q2
-    vshr.s64	$1, $0, #32
-    vadd.s16	$2, $0, $1		// [0] = rs[0] + rs[2];[1] = rs[1] + rs[3];
-    vsub.s16	$1, $0, $1		// [0] = rs[0] - rs[2];[1] = rs[1] - rs[3];
-    vtrn.s16	$2, $1
-    vrev32.16	$1, $1
-    vtrn.s32	$2, $1			// [0] = rs[0] + rs[2];[1] = rs[0] - rs[2];[2] = rs[1] - rs[3];[3] = rs[1] + rs[3];
+//  {   //  input: each src_d[0]~[3](dst), working_q0, working_q1, working_q2
+    vshr.s64    $1, $0, #32
+    vadd.s16    $2, $0, $1      // [0] = rs[0] + rs[2];[1] = rs[1] + rs[3];
+    vsub.s16    $1, $0, $1      // [0] = rs[0] - rs[2];[1] = rs[1] - rs[3];
+    vtrn.s16    $2, $1
+    vrev32.16   $1, $1
+    vtrn.s32    $2, $1          // [0] = rs[0] + rs[2];[1] = rs[0] - rs[2];[2] = rs[1] - rs[3];[3] = rs[1] + rs[3];
 
-    vrev64.16	$1, $2
-    vadd.s16	$0, $2, $1		// [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];
-    vsub.s16	$1, $2, $1
-    vrev32.16	$1, $1			// [0] = rs[1] - rs[2];[1] = rs[0] - rs[3];
-    vtrn.s32	$0, $1			// [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];[2] = rs[1] - rs[2];[3] = rs[0] - rs[3];
-//	}
+    vrev64.16   $1, $2
+    vadd.s16    $0, $2, $1      // [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];
+    vsub.s16    $1, $2, $1
+    vrev32.16   $1, $1          // [0] = rs[1] - rs[2];[1] = rs[0] - rs[3];
+    vtrn.s32    $0, $1          // [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];[2] = rs[1] - rs[2];[3] = rs[0] - rs[3];
+//  }
 .endm
 
 .macro MB_PRED_8BITS_ADD_DCT_16BITS_CLIP
-//	{	//	input: pred_d[0]/[1](output), dct_q0/1, working_q0/1;
-    vmovl.u8		$4,$0
-    vmovl.u8		$5,$1
-    vadd.s16		$4,$2
-    vadd.s16		$5,$3
-    vqmovun.s16	$0,$4
-    vqmovun.s16	$1,$5
-//	}
+//  {   //  input: pred_d[0]/[1](output), dct_q0/1, working_q0/1;
+    vmovl.u8        $4,$0
+    vmovl.u8        $5,$1
+    vadd.s16        $4,$2
+    vadd.s16        $5,$3
+    vqmovun.s16 $0,$4
+    vqmovun.s16 $1,$5
+//  }
 .endm
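
MB_PRED_8BITS_ADD_DCT_16BITS_CLIP is the reconstruction clip: widen the 8-bit prediction, add the 16-bit residual, and saturate back to a byte. Per sample, a scalar sketch (illustrative name) looks like:

    #include <stdint.h>

    /* Sketch of the widen-add-saturate step above. */
    static uint8_t ReconClipSketch (uint8_t pred, int16_t dct) {
        int32_t v = (int32_t)pred + dct;
        if (v < 0)   v = 0;
        if (v > 255) v = 255;
        return (uint8_t)v;
    }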
 
 .macro ROW_TRANSFORM_1_STEP_TOTAL_16BITS
-//	{	//	input: src_d[0]~[3], output: e_d[0]~[3];
-    vadd.s16		$4, $0, $2			//int16 e[i][0] = src[0] + src[2];
-    vsub.s16		$5, $0, $2			//int16 e[i][1] = src[0] - src[2];
-    vshr.s16		$6, $1, #1
-    vshr.s16		$7, $3, #1
-    vsub.s16		$6, $6, $3			//int16 e[i][2] = (src[1]>>1)-src[3];
-    vadd.s16		$7, $1, $7			//int16 e[i][3] = src[1] + (src[3]>>1);
-//	}
+//  {   //  input: src_d[0]~[3], output: e_d[0]~[3];
+    vadd.s16        $4, $0, $2          //int16 e[i][0] = src[0] + src[2];
+    vsub.s16        $5, $0, $2          //int16 e[i][1] = src[0] - src[2];
+    vshr.s16        $6, $1, #1
+    vshr.s16        $7, $3, #1
+    vsub.s16        $6, $6, $3          //int16 e[i][2] = (src[1]>>1)-src[3];
+    vadd.s16        $7, $1, $7          //int16 e[i][3] = src[1] + (src[3]>>1);
+//  }
 .endm
 
-.macro TRANSFORM_TOTAL_16BITS	// both row & col transform used
-//	{	//	output: f_q[0]~[3], input: e_q[0]~[3];
-    vadd.s16		$0, $4, $7			//int16 f[i][0] = e[i][0] + e[i][3];
-    vadd.s16		$1, $5, $6			//int16 f[i][1] = e[i][1] + e[i][2];
-    vsub.s16		$2, $5, $6			//int16 f[i][2] = e[i][1] - e[i][2];
-    vsub.s16		$3, $4, $7			//int16 f[i][3] = e[i][0] - e[i][3];
-//	}
+.macro TRANSFORM_TOTAL_16BITS   // both row & col transform used
+//  {   //  output: f_q[0]~[3], input: e_q[0]~[3];
+    vadd.s16        $0, $4, $7          //int16 f[i][0] = e[i][0] + e[i][3];
+    vadd.s16        $1, $5, $6          //int16 f[i][1] = e[i][1] + e[i][2];
+    vsub.s16        $2, $5, $6          //int16 f[i][2] = e[i][1] - e[i][2];
+    vsub.s16        $3, $4, $7          //int16 f[i][3] = e[i][0] - e[i][3];
+//  }
 .endm
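
ROW_TRANSFORM_1_STEP_TOTAL_16BITS followed by TRANSFORM_TOTAL_16BITS is one pass of the 4-point inverse transform, as the macro comments document. A scalar sketch of that pair applied to a row of four coefficients (illustrative name):

    #include <stdint.h>

    /* Sketch of the inverse-transform butterfly described by the two macros. */
    static void IdctRow4Sketch (int16_t s[4]) {
        int16_t e0 = s[0] + s[2];
        int16_t e1 = s[0] - s[2];
        int16_t e2 = (int16_t)((s[1] >> 1) - s[3]);
        int16_t e3 = (int16_t)(s[1] + (s[3] >> 1));
        s[0] = e0 + e3;
        s[1] = e1 + e2;
        s[2] = e1 - e2;
        s[3] = e0 - e3;
    }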
 
 
 .macro ROW_TRANSFORM_0_STEP
-//	{	//	input: src_d[0]~[3], output: e_q[0]~[3];
-    vaddl.s16		$4, $0, $2			//int32 e[i][0] = src[0] + src[2];
-    vsubl.s16		$5, $0, $2			//int32 e[i][1] = src[0] - src[2];
-    vsubl.s16		$6, $1, $3			//int32 e[i][2] = src[1] - src[3];
-    vaddl.s16		$7, $1, $3			//int32 e[i][3] = src[1] + src[3];
-//	}
+//  {   //  input: src_d[0]~[3], output: e_q[0]~[3];
+    vaddl.s16       $4, $0, $2          //int32 e[i][0] = src[0] + src[2];
+    vsubl.s16       $5, $0, $2          //int32 e[i][1] = src[0] - src[2];
+    vsubl.s16       $6, $1, $3          //int32 e[i][2] = src[1] - src[3];
+    vaddl.s16       $7, $1, $3          //int32 e[i][3] = src[1] + src[3];
+//  }
 .endm
 
 .macro ROW_TRANSFORM_1_STEP
-//	{	//	input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
-    vaddl.s16		$4, $0, $2			//int32 e[i][0] = src[0] + src[2];
-    vsubl.s16		$5, $0, $2			//int32 e[i][1] = src[0] - src[2];
-    vshr.s16		$8, $1, #1
-    vshr.s16		$9, $3, #1
-    vsubl.s16		$6, $8, $3			//int32 e[i][2] = (src[1]>>1)-src[3];
-    vaddl.s16		$7, $1, $9			//int32 e[i][3] = src[1] + (src[3]>>1);
-//	}
+//  {   //  input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
+    vaddl.s16       $4, $0, $2          //int32 e[i][0] = src[0] + src[2];
+    vsubl.s16       $5, $0, $2          //int32 e[i][1] = src[0] - src[2];
+    vshr.s16        $8, $1, #1
+    vshr.s16        $9, $3, #1
+    vsubl.s16       $6, $8, $3          //int32 e[i][2] = (src[1]>>1)-src[3];
+    vaddl.s16       $7, $1, $9          //int32 e[i][3] = src[1] + (src[3]>>1);
+//  }
 .endm
 
-.macro TRANSFORM_4BYTES	// both row & col transform used
-//	{	//	output: f_q[0]~[3], input: e_q[0]~[3];
-    vadd.s32		$0, $4, $7			//int16 f[i][0] = e[i][0] + e[i][3];
-    vadd.s32		$1, $5, $6			//int16 f[i][1] = e[i][1] + e[i][2];
-    vsub.s32		$2, $5, $6			//int16 f[i][2] = e[i][1] - e[i][2];
-    vsub.s32		$3, $4, $7			//int16 f[i][3] = e[i][0] - e[i][3];
-//	}
+.macro TRANSFORM_4BYTES // both row & col transform used
+//  {   //  output: f_q[0]~[3], input: e_q[0]~[3];
+    vadd.s32        $0, $4, $7          //int16 f[i][0] = e[i][0] + e[i][3];
+    vadd.s32        $1, $5, $6          //int16 f[i][1] = e[i][1] + e[i][2];
+    vsub.s32        $2, $5, $6          //int16 f[i][2] = e[i][1] - e[i][2];
+    vsub.s32        $3, $4, $7          //int16 f[i][3] = e[i][0] - e[i][3];
+//  }
 .endm
 
 .macro COL_TRANSFORM_0_STEP
-//	{	//	input: src_q[0]~[3], output: e_q[0]~[3];
-    vadd.s32		$4, $0, $2			//int32 e[0][j] = f[0][j] + f[2][j];
-    vsub.s32		$5, $0, $2			//int32 e[1][j] = f[0][j] - f[2][j];
-    vsub.s32		$6, $1, $3			//int32 e[2][j] = (f[1][j]>>1) - f[3][j];
-    vadd.s32		$7, $1, $3			//int32 e[3][j] = f[1][j] + (f[3][j]>>1);
-//	}
+//  {   //  input: src_q[0]~[3], output: e_q[0]~[3];
+    vadd.s32        $4, $0, $2          //int32 e[0][j] = f[0][j] + f[2][j];
+    vsub.s32        $5, $0, $2          //int32 e[1][j] = f[0][j] - f[2][j];
+    vsub.s32        $6, $1, $3          //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
+    vadd.s32        $7, $1, $3          //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
+//  }
 .endm
 
 .macro COL_TRANSFORM_1_STEP
-//	{	//	input: src_q[0]~[3], output: e_q[0]~[3];
-    vadd.s32		$4, $0, $2			//int32 e[0][j] = f[0][j] + f[2][j];
-    vsub.s32		$5, $0, $2			//int32 e[1][j] = f[0][j] - f[2][j];
-    vshr.s32		$6, $1, #1
-    vshr.s32		$7, $3, #1
-    vsub.s32		$6, $6, $3			//int32 e[2][j] = (f[1][j]>>1) - f[3][j];
-    vadd.s32		$7, $1, $7			//int32 e[3][j] = f[1][j] + (f[3][j]>>1);
-//	}
+//  {   //  input: src_q[0]~[3], output: e_q[0]~[3];
+    vadd.s32        $4, $0, $2          //int32 e[0][j] = f[0][j] + f[2][j];
+    vsub.s32        $5, $0, $2          //int32 e[1][j] = f[0][j] - f[2][j];
+    vshr.s32        $6, $1, #1
+    vshr.s32        $7, $3, #1
+    vsub.s32        $6, $6, $3          //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
+    vadd.s32        $7, $1, $7          //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
+//  }
 .endm
 #else
 .macro LOAD_4x4_DATA_FOR_DCT arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
-//	{	//	input: \arg0~\arg3, src1*, src1_stride, src2*, src2_stride
-    vld2.16	{\arg0[0],\arg1[0]}, [\arg4], \arg5
-    vld2.16	{\arg2[0],\arg3[0]}, [\arg6], \arg7
-    vld2.16	{\arg0[1],\arg1[1]}, [\arg4], \arg5
-    vld2.16	{\arg2[1],\arg3[1]}, [\arg6], \arg7
+//  {   //  input: \arg0~\arg3, src1*, src1_stride, src2*, src2_stride
+    vld2.16 {\arg0[0],\arg1[0]}, [\arg4], \arg5
+    vld2.16 {\arg2[0],\arg3[0]}, [\arg6], \arg7
+    vld2.16 {\arg0[1],\arg1[1]}, [\arg4], \arg5
+    vld2.16 {\arg2[1],\arg3[1]}, [\arg6], \arg7
 
-    vld2.16	{\arg0[2],\arg1[2]}, [\arg4], \arg5
-    vld2.16	{\arg2[2],\arg3[2]}, [\arg6], \arg7
-    vld2.16	{\arg0[3],\arg1[3]}, [\arg4], \arg5
-    vld2.16	{\arg2[3],\arg3[3]}, [\arg6], \arg7
-//	}
+    vld2.16 {\arg0[2],\arg1[2]}, [\arg4], \arg5
+    vld2.16 {\arg2[2],\arg3[2]}, [\arg6], \arg7
+    vld2.16 {\arg0[3],\arg1[3]}, [\arg4], \arg5
+    vld2.16 {\arg2[3],\arg3[3]}, [\arg6], \arg7
+//  }
 .endm
 
 .macro LOAD_8x8_DATA_FOR_DCT arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
-//	{	//	input: \arg0~\arg3, src1*, src2*; untouched r2:src1_stride &r4:src2_stride
-    vld1.64	{\arg0}, [\arg8], r2
-    vld1.64	{\arg4}, [\arg9], r4
-    vld1.64	{\arg1}, [\arg8], r2
-    vld1.64	{\arg5}, [\arg9], r4
+//  {   //  input: \arg0~\arg3, src1*, src2*; untouched r2:src1_stride &r4:src2_stride
+    vld1.64 {\arg0}, [\arg8], r2
+    vld1.64 {\arg4}, [\arg9], r4
+    vld1.64 {\arg1}, [\arg8], r2
+    vld1.64 {\arg5}, [\arg9], r4
 
-    vld1.64	{\arg2}, [\arg8], r2
-    vld1.64	{\arg6}, [\arg9], r4
-    vld1.64	{\arg3}, [\arg8], r2
-    vld1.64	{\arg7}, [\arg9], r4
-//	}
+    vld1.64 {\arg2}, [\arg8], r2
+    vld1.64 {\arg6}, [\arg9], r4
+    vld1.64 {\arg3}, [\arg8], r2
+    vld1.64 {\arg7}, [\arg9], r4
+//  }
 .endm
 
 .macro DCT_ROW_TRANSFORM_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
-//	{	//	input: src_d[0]~[3], working: [4]~[7]
-    vadd.s16		\arg4, \arg0, \arg3			//int16 s[0] = data[i] + data[i3];
-    vsub.s16		\arg7, \arg0, \arg3			//int16 s[3] = data[i] - data[i3];
-    vadd.s16		\arg5, \arg1, \arg2			//int16 s[1] = data[i1] + data[i2];
-    vsub.s16		\arg6, \arg1, \arg2			//int16 s[2] = data[i1] - data[i2];
+//  {   //  input: src_d[0]~[3], working: [4]~[7]
+    vadd.s16        \arg4, \arg0, \arg3         //int16 s[0] = data[i] + data[i3];
+    vsub.s16        \arg7, \arg0, \arg3         //int16 s[3] = data[i] - data[i3];
+    vadd.s16        \arg5, \arg1, \arg2         //int16 s[1] = data[i1] + data[i2];
+    vsub.s16        \arg6, \arg1, \arg2         //int16 s[2] = data[i1] - data[i2];
 
-    vadd.s16		\arg0, \arg4, \arg5			//int16 dct[i ] = s[0] + s[1];
-    vsub.s16		\arg2, \arg4, \arg5			//int16 dct[i2] = s[0] - s[1];
-    vshl.s16		\arg1, \arg7, #1
-    vshl.s16		\arg3, \arg6, #1
-    vadd.s16		\arg1, \arg1, \arg6			//int16 dct[i1] = (s[3] << 1) + s[2];
-    vsub.s16		\arg3, \arg7, \arg3			//int16 dct[i3] = s[3] - (s[2] << 1);
-//	}
+    vadd.s16        \arg0, \arg4, \arg5         //int16 dct[i ] = s[0] + s[1];
+    vsub.s16        \arg2, \arg4, \arg5         //int16 dct[i2] = s[0] - s[1];
+    vshl.s16        \arg1, \arg7, #1
+    vshl.s16        \arg3, \arg6, #1
+    vadd.s16        \arg1, \arg1, \arg6         //int16 dct[i1] = (s[3] << 1) + s[2];
+    vsub.s16        \arg3, \arg7, \arg3         //int16 dct[i3] = s[3] - (s[2] << 1);
+//  }
 .endm
 
 .macro MATRIX_TRANSFORM_EACH_16BITS arg0, arg1, arg2, arg3
-//	{	//	input & output: src_d[0]~[3];[0 1 2 3]+[4 5 6 7]+[8 9 10 11]+[12 13 14 15]
-    vtrn.s16		\arg0, \arg1				//[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
-    vtrn.s16		\arg2, \arg3				//[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
-    vtrn.32		\arg0, \arg2				//[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
-    vtrn.32		\arg1, \arg3				//[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
-//	}
+//  {   //  input & output: src_d[0]~[3];[0 1 2 3]+[4 5 6 7]+[8 9 10 11]+[12 13 14 15]
+    vtrn.s16        \arg0, \arg1                //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
+    vtrn.s16        \arg2, \arg3                //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
+    vtrn.32     \arg0, \arg2                //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
+    vtrn.32     \arg1, \arg3                //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
+//  }
 .endm
 
 .macro NEWQUANT_COEF_EACH_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-//	{	//	input:	coef, ff (dst), ff_d0, ff_d1, mf_d0, md_d1
-    veor.s16		\arg6, \arg6			// init 0 , and keep 0;
-    vaba.s16		\arg1, \arg0, \arg6		// f + abs(coef - 0)
-    vmull.s16		\arg7, \arg2, \arg4
-    vmull.s16		\arg8, \arg3, \arg5
-    vshr.s32		\arg7, #16
-    vshr.s32		\arg8, #16
-    vmovn.s32		\arg2, \arg7
-    vmovn.s32		\arg3, \arg8
+//  {   //  input:  coef, ff (dst), ff_d0, ff_d1, mf_d0, md_d1
+    veor.s16        \arg6, \arg6            // init 0 , and keep 0;
+    vaba.s16        \arg1, \arg0, \arg6     // f + abs(coef - 0)
+    vmull.s16       \arg7, \arg2, \arg4
+    vmull.s16       \arg8, \arg3, \arg5
+    vshr.s32        \arg7, #16
+    vshr.s32        \arg8, #16
+    vmovn.s32       \arg2, \arg7
+    vmovn.s32       \arg3, \arg8
 
-    vcgt.s16		\arg7, \arg0, #0		// if true, location of coef == 11111111
-    vbif.s16		\arg6, \arg1, \arg7		// if (x<0) reserved part; else keep 0 untouched
-    vshl.s16		\arg6, #1
-    vsub.s16		\arg1, \arg1, \arg6		// if x > 0, -= 0; else x-= 2x
-//	}
+    vcgt.s16        \arg7, \arg0, #0        // if true, location of coef == 11111111
+    vbif.s16        \arg6, \arg1, \arg7     // if (x<0) reserved part; else keep 0 untouched
+    vshl.s16        \arg6, #1
+    vsub.s16        \arg1, \arg1, \arg6     // if x > 0, -= 0; else x-= 2x
+//  }
 .endm
 
 .macro NEWQUANT_COEF_EACH_16BITS_MAX arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
-//	{	//	input:	coef, ff (dst), ff_d0, ff_d1, mf_d0(max), md_d1
-    veor.s16		\arg6, \arg6			// init 0 , and keep 0;
-    vaba.s16		\arg1, \arg0, \arg6		// f + abs(coef - 0)
-    vmull.s16		\arg7, \arg2, \arg4
-    vmull.s16		\arg8, \arg3, \arg5
-    vshr.s32		\arg7, #16
-    vshr.s32		\arg8, #16
-    vmovn.s32		\arg2, \arg7
-    vmovn.s32		\arg3, \arg8
+//  {   //  input:  coef, ff (dst), ff_d0, ff_d1, mf_d0(max), md_d1
+    veor.s16        \arg6, \arg6            // init 0 , and keep 0;
+    vaba.s16        \arg1, \arg0, \arg6     // f + abs(coef - 0)
+    vmull.s16       \arg7, \arg2, \arg4
+    vmull.s16       \arg8, \arg3, \arg5
+    vshr.s32        \arg7, #16
+    vshr.s32        \arg8, #16
+    vmovn.s32       \arg2, \arg7
+    vmovn.s32       \arg3, \arg8
 
-    vcgt.s16		\arg7, \arg0, #0		// if true, location of coef == 11111111
-    vbif.s16		\arg6, \arg1, \arg7		// if (x<0) reserved part; else keep 0 untouched
-    vshl.s16		\arg6, #1
-    vmax.s16		\arg9, \arg2, \arg3
-    vsub.s16		\arg1, \arg1, \arg6		// if x > 0, -= 0; else x-= 2x
-//	}
+    vcgt.s16        \arg7, \arg0, #0        // if true, location of coef == 11111111
+    vbif.s16        \arg6, \arg1, \arg7     // if (x<0) reserved part; else keep 0 untouched
+    vshl.s16        \arg6, #1
+    vmax.s16        \arg9, \arg2, \arg3
+    vsub.s16        \arg1, \arg1, \arg6     // if x > 0, -= 0; else x-= 2x
+//  }
 .endm
 
 .macro QUANT_DUALWORD_COEF_EACH_16BITS arg0, arg1, arg2, arg3, arg4
-//	{	//	input:	coef, ff (dst), mf , working_d (all 0), working_q
-    vaba.s16		\arg1, \arg0, \arg3		// f + abs(coef - 0)
-    vmull.s16		\arg4, \arg1, \arg2		// *= mf
-    vshr.s32		\arg4, #16
-    vmovn.s32		\arg1, \arg4			// >> 16
+//  {   //  input:  coef, ff (dst), mf , working_d (all 0), working_q
+    vaba.s16        \arg1, \arg0, \arg3     // f + abs(coef - 0)
+    vmull.s16       \arg4, \arg1, \arg2     // *= mf
+    vshr.s32        \arg4, #16
+    vmovn.s32       \arg1, \arg4            // >> 16
 
-    vcgt.s16		\arg2, \arg0, #0		// if true, location of coef == 11111111
-    vbif.s16		\arg3, \arg1, \arg2		// if (x<0) reserved part; else keep 0 untouched
-    vshl.s16		\arg3, #1
-    vsub.s16		\arg1, \arg1, \arg3		// if x > 0, -= 0; else x-= 2x
-//	}
+    vcgt.s16        \arg2, \arg0, #0        // if true, location of coef == 11111111
+    vbif.s16        \arg3, \arg1, \arg2     // if (x<0) reserved part; else keep 0 untouched
+    vshl.s16        \arg3, #1
+    vsub.s16        \arg1, \arg1, \arg3     // if x > 0, -= 0; else x-= 2x
+//  }
 .endm
 
 .macro DC_ZERO_COUNT_IN_DUALWORD arg0, arg1, arg2
-//	{	//	input:	coef, dst_d, working_d (all 0x01)
-    vceq.s16	\arg1, \arg0, #0
-    vand.s16	\arg1, \arg2
-    vpadd.s16	\arg1, \arg1, \arg1
-    vpadd.s16	\arg1, \arg1, \arg1
-//	}
+//  {   //  input:  coef, dst_d, working_d (all 0x01)
+    vceq.s16    \arg1, \arg0, #0
+    vand.s16    \arg1, \arg2
+    vpadd.s16   \arg1, \arg1, \arg1
+    vpadd.s16   \arg1, \arg1, \arg1
+//  }
 .endm
 
 .macro SELECT_MAX_IN_ABS_COEF arg0, arg1, arg2, arg3, arg4
-//	{	//	input:	coef_0, coef_1, max_q (identy to follow two), output: max_d0, max_d1
-    vmax.s16		\arg2, \arg0, \arg1		// max 1st in \arg3 & max 2nd in \arg4
-    vpmax.s16		\arg3, \arg3, \arg4		// max 1st in \arg3[0][1] & max 2nd in \arg3[2][3]
-    vpmax.s16		\arg3, \arg3, \arg4		// max 1st in \arg3[0][1]
-//	}
+//  {   //  input:  coef_0, coef_1, max_q (identy to follow two), output: max_d0, max_d1
+    vmax.s16        \arg2, \arg0, \arg1     // max 1st in \arg3 & max 2nd in \arg4
+    vpmax.s16       \arg3, \arg3, \arg4     // max 1st in \arg3[0][1] & max 2nd in \arg3[2][3]
+    vpmax.s16       \arg3, \arg3, \arg4     // max 1st in \arg3[0][1]
+//  }
 .endm
 
 .macro ZERO_COUNT_IN_2_QUARWORD arg0, arg1, arg2, arg3, arg4, arg5, arg6
-//	{	//	input:	coef_0 (identy to \arg3 \arg4), coef_1(identy to \arg5 \arg6), mask_q
-    vceq.s16	\arg0, #0
-    vceq.s16	\arg1, #0
-    vand.s16	\arg0, \arg2
-    vand.s16	\arg1, \arg2
+//  {   //  input:  coef_0 (identy to \arg3 \arg4), coef_1(identy to \arg5 \arg6), mask_q
+    vceq.s16    \arg0, #0
+    vceq.s16    \arg1, #0
+    vand.s16    \arg0, \arg2
+    vand.s16    \arg1, \arg2
 
-    vpadd.s16	\arg3, \arg3, \arg5
-    vpadd.s16	\arg4, \arg4, \arg6
-    vpadd.s16	\arg3, \arg3, \arg4		// 8-->4
-    vpadd.s16	\arg3, \arg3, \arg3
-    vpadd.s16	\arg3, \arg3, \arg3
-//	}
+    vpadd.s16   \arg3, \arg3, \arg5
+    vpadd.s16   \arg4, \arg4, \arg6
+    vpadd.s16   \arg3, \arg3, \arg4     // 8-->4
+    vpadd.s16   \arg3, \arg3, \arg3
+    vpadd.s16   \arg3, \arg3, \arg3
+//  }
 .endm
 
 .macro HDM_QUANT_2x2_TOTAL_16BITS arg0, arg1, arg2
-//	{	//	input: src_d[0]~[3], working_d, dst_d
-    vshr.s64	\arg1, \arg0, #32
-    vadd.s16	\arg2, \arg0, \arg1		// [0] = rs[0] + rs[32];[1] = rs[16] + rs[48];
-    vsub.s16	\arg1, \arg0, \arg1		// [0] = rs[0] - rs[32];[1] = rs[16] - rs[48];
-    vtrn.s16	\arg2, \arg1
-    vtrn.s32	\arg2, \arg1
-//	}
+//  {   //  input: src_d[0]~[3], working_d, dst_d
+    vshr.s64    \arg1, \arg0, #32
+    vadd.s16    \arg2, \arg0, \arg1     // [0] = rs[0] + rs[32];[1] = rs[16] + rs[48];
+    vsub.s16    \arg1, \arg0, \arg1     // [0] = rs[0] - rs[32];[1] = rs[16] - rs[48];
+    vtrn.s16    \arg2, \arg1
+    vtrn.s32    \arg2, \arg1
+//  }
 .endm
 
 .macro IHDM_4x4_TOTAL_16BITS arg0, arg1, arg2
-//	{	//	input: each src_d[0]~[3](dst), working_q0, working_q1, working_q2
-    vshr.s64	\arg1, \arg0, #32
-    vadd.s16	\arg2, \arg0, \arg1		// [0] = rs[0] + rs[2];[1] = rs[1] + rs[3];
-    vsub.s16	\arg1, \arg0, \arg1		// [0] = rs[0] - rs[2];[1] = rs[1] - rs[3];
-    vtrn.s16	\arg2, \arg1
-    vrev32.16	\arg1, \arg1
-    vtrn.s32	\arg2, \arg1			// [0] = rs[0] + rs[2];[1] = rs[0] - rs[2];[2] = rs[1] - rs[3];[3] = rs[1] + rs[3];
+//  {   //  input: each src_d[0]~[3](dst), working_q0, working_q1, working_q2
+    vshr.s64    \arg1, \arg0, #32
+    vadd.s16    \arg2, \arg0, \arg1     // [0] = rs[0] + rs[2];[1] = rs[1] + rs[3];
+    vsub.s16    \arg1, \arg0, \arg1     // [0] = rs[0] - rs[2];[1] = rs[1] - rs[3];
+    vtrn.s16    \arg2, \arg1
+    vrev32.16   \arg1, \arg1
+    vtrn.s32    \arg2, \arg1            // [0] = rs[0] + rs[2];[1] = rs[0] - rs[2];[2] = rs[1] - rs[3];[3] = rs[1] + rs[3];
 
-    vrev64.16	\arg1, \arg2
-    vadd.s16	\arg0, \arg2, \arg1		// [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];
-    vsub.s16	\arg1, \arg2, \arg1
-    vrev32.16	\arg1, \arg1			// [0] = rs[1] - rs[2];[1] = rs[0] - rs[3];
-    vtrn.s32	\arg0, \arg1			// [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];[2] = rs[1] - rs[2];[3] = rs[0] - rs[3];
-//	}
+    vrev64.16   \arg1, \arg2
+    vadd.s16    \arg0, \arg2, \arg1     // [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];
+    vsub.s16    \arg1, \arg2, \arg1
+    vrev32.16   \arg1, \arg1            // [0] = rs[1] - rs[2];[1] = rs[0] - rs[3];
+    vtrn.s32    \arg0, \arg1            // [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];[2] = rs[1] - rs[2];[3] = rs[0] - rs[3];
+//  }
 .endm
 
 .macro MB_PRED_8BITS_ADD_DCT_16BITS_CLIP arg0, arg1, arg2, arg3, arg4, arg5
-//	{	//	input: pred_d[0]/[1](output), dct_q0/1, working_q0/1;
-    vmovl.u8		\arg4,\arg0
-    vmovl.u8		\arg5,\arg1
-    vadd.s16		\arg4,\arg2
-    vadd.s16		\arg5,\arg3
-    vqmovun.s16	\arg0,\arg4
-    vqmovun.s16	\arg1,\arg5
-//	}
+//  {   //  input: pred_d[0]/[1](output), dct_q0/1, working_q0/1;
+    vmovl.u8        \arg4,\arg0
+    vmovl.u8        \arg5,\arg1
+    vadd.s16        \arg4,\arg2
+    vadd.s16        \arg5,\arg3
+    vqmovun.s16 \arg0,\arg4
+    vqmovun.s16 \arg1,\arg5
+//  }
 .endm
 
 .macro ROW_TRANSFORM_1_STEP_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
-//	{	//	input: src_d[0]~[3], output: e_d[0]~[3];
-    vadd.s16		\arg4, \arg0, \arg2			//int16 e[i][0] = src[0] + src[2];
-    vsub.s16		\arg5, \arg0, \arg2			//int16 e[i][1] = src[0] - src[2];
-    vshr.s16		\arg6, \arg1, #1
-    vshr.s16		\arg7, \arg3, #1
-    vsub.s16		\arg6, \arg6, \arg3			//int16 e[i][2] = (src[1]>>1)-src[3];
-    vadd.s16		\arg7, \arg1, \arg7			//int16 e[i][3] = src[1] + (src[3]>>1);
-//	}
+//  {   //  input: src_d[0]~[3], output: e_d[0]~[3];
+    vadd.s16        \arg4, \arg0, \arg2         //int16 e[i][0] = src[0] + src[2];
+    vsub.s16        \arg5, \arg0, \arg2         //int16 e[i][1] = src[0] - src[2];
+    vshr.s16        \arg6, \arg1, #1
+    vshr.s16        \arg7, \arg3, #1
+    vsub.s16        \arg6, \arg6, \arg3         //int16 e[i][2] = (src[1]>>1)-src[3];
+    vadd.s16        \arg7, \arg1, \arg7         //int16 e[i][3] = src[1] + (src[3]>>1);
+//  }
 .endm
 
-.macro TRANSFORM_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7	// both row & col transform used
-//	{	//	output: f_q[0]~[3], input: e_q[0]~[3];
-    vadd.s16		\arg0, \arg4, \arg7			//int16 f[i][0] = e[i][0] + e[i][3];
-    vadd.s16		\arg1, \arg5, \arg6			//int16 f[i][1] = e[i][1] + e[i][2];
-    vsub.s16		\arg2, \arg5, \arg6			//int16 f[i][2] = e[i][1] - e[i][2];
-    vsub.s16		\arg3, \arg4, \arg7			//int16 f[i][3] = e[i][0] - e[i][3];
-//	}
+.macro TRANSFORM_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7    // both row & col transform used
+//  {   //  output: f_q[0]~[3], input: e_q[0]~[3];
+    vadd.s16        \arg0, \arg4, \arg7         //int16 f[i][0] = e[i][0] + e[i][3];
+    vadd.s16        \arg1, \arg5, \arg6         //int16 f[i][1] = e[i][1] + e[i][2];
+    vsub.s16        \arg2, \arg5, \arg6         //int16 f[i][2] = e[i][1] - e[i][2];
+    vsub.s16        \arg3, \arg4, \arg7         //int16 f[i][3] = e[i][0] - e[i][3];
+//  }
 .endm
 
 
 .macro ROW_TRANSFORM_0_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
-//	{	//	input: src_d[0]~[3], output: e_q[0]~[3];
-    vaddl.s16		\arg4, \arg0, \arg2			//int32 e[i][0] = src[0] + src[2];
-    vsubl.s16		\arg5, \arg0, \arg2			//int32 e[i][1] = src[0] - src[2];
-    vsubl.s16		\arg6, \arg1, \arg3			//int32 e[i][2] = src[1] - src[3];
-    vaddl.s16		\arg7, \arg1, \arg3			//int32 e[i][3] = src[1] + src[3];
-//	}
+//  {   //  input: src_d[0]~[3], output: e_q[0]~[3];
+    vaddl.s16       \arg4, \arg0, \arg2         //int32 e[i][0] = src[0] + src[2];
+    vsubl.s16       \arg5, \arg0, \arg2         //int32 e[i][1] = src[0] - src[2];
+    vsubl.s16       \arg6, \arg1, \arg3         //int32 e[i][2] = src[1] - src[3];
+    vaddl.s16       \arg7, \arg1, \arg3         //int32 e[i][3] = src[1] + src[3];
+//  }
 .endm
 
 .macro ROW_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
-//	{	//	input: src_d[0]~[3], output: e_q[0]~[3]; working: \arg8 \arg9
-    vaddl.s16		\arg4, \arg0, \arg2			//int32 e[i][0] = src[0] + src[2];
-    vsubl.s16		\arg5, \arg0, \arg2			//int32 e[i][1] = src[0] - src[2];
-    vshr.s16		\arg8, \arg1, #1
-    vshr.s16		\arg9, \arg3, #1
-    vsubl.s16		\arg6, \arg8, \arg3			//int32 e[i][2] = (src[1]>>1)-src[3];
-    vaddl.s16		\arg7, \arg1, \arg9			//int32 e[i][3] = src[1] + (src[3]>>1);
-//	}
+//  {   //  input: src_d[0]~[3], output: e_q[0]~[3]; working: \arg8 \arg9
+    vaddl.s16       \arg4, \arg0, \arg2         //int32 e[i][0] = src[0] + src[2];
+    vsubl.s16       \arg5, \arg0, \arg2         //int32 e[i][1] = src[0] - src[2];
+    vshr.s16        \arg8, \arg1, #1
+    vshr.s16        \arg9, \arg3, #1
+    vsubl.s16       \arg6, \arg8, \arg3         //int32 e[i][2] = (src[1]>>1)-src[3];
+    vaddl.s16       \arg7, \arg1, \arg9         //int32 e[i][3] = src[1] + (src[3]>>1);
+//  }
 .endm
 
-.macro TRANSFORM_4BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7	// both row & col transform used
-//	{	//	output: f_q[0]~[3], input: e_q[0]~[3];
-    vadd.s32		\arg0, \arg4, \arg7			//int16 f[i][0] = e[i][0] + e[i][3];
-    vadd.s32		\arg1, \arg5, \arg6			//int16 f[i][1] = e[i][1] + e[i][2];
-    vsub.s32		\arg2, \arg5, \arg6			//int16 f[i][2] = e[i][1] - e[i][2];
-    vsub.s32		\arg3, \arg4, \arg7			//int16 f[i][3] = e[i][0] - e[i][3];
-//	}
+.macro TRANSFORM_4BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7  // both row & col transform used
+//  {   //  output: f_q[0]~[3], input: e_q[0]~[3];
+    vadd.s32        \arg0, \arg4, \arg7         //int16 f[i][0] = e[i][0] + e[i][3];
+    vadd.s32        \arg1, \arg5, \arg6         //int16 f[i][1] = e[i][1] + e[i][2];
+    vsub.s32        \arg2, \arg5, \arg6         //int16 f[i][2] = e[i][1] - e[i][2];
+    vsub.s32        \arg3, \arg4, \arg7         //int16 f[i][3] = e[i][0] - e[i][3];
+//  }
 .endm
 
 .macro COL_TRANSFORM_0_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
-//	{	//	input: src_q[0]~[3], output: e_q[0]~[3];
-    vadd.s32		\arg4, \arg0, \arg2			//int32 e[0][j] = f[0][j] + f[2][j];
-    vsub.s32		\arg5, \arg0, \arg2			//int32 e[1][j] = f[0][j] - f[2][j];
-    vsub.s32		\arg6, \arg1, \arg3			//int32 e[2][j] = (f[1][j]>>1) - f[3][j];
-    vadd.s32		\arg7, \arg1, \arg3			//int32 e[3][j] = f[1][j] + (f[3][j]>>1);
-//	}
+//  {   //  input: src_q[0]~[3], output: e_q[0]~[3];
+    vadd.s32        \arg4, \arg0, \arg2         //int32 e[0][j] = f[0][j] + f[2][j];
+    vsub.s32        \arg5, \arg0, \arg2         //int32 e[1][j] = f[0][j] - f[2][j];
+    vsub.s32        \arg6, \arg1, \arg3         //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
+    vadd.s32        \arg7, \arg1, \arg3         //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
+//  }
 .endm
 
 .macro COL_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
-//	{	//	input: src_q[0]~[3], output: e_q[0]~[3];
-    vadd.s32		\arg4, \arg0, \arg2			//int32 e[0][j] = f[0][j] + f[2][j];
-    vsub.s32		\arg5, \arg0, \arg2			//int32 e[1][j] = f[0][j] - f[2][j];
-    vshr.s32		\arg6, \arg1, #1
-    vshr.s32		\arg7, \arg3, #1
-    vsub.s32		\arg6, \arg6, \arg3			//int32 e[2][j] = (f[1][j]>>1) - f[3][j];
-    vadd.s32		\arg7, \arg1, \arg7			//int32 e[3][j] = f[1][j] + (f[3][j]>>1);
-//	}
+//  {   //  input: src_q[0]~[3], output: e_q[0]~[3];
+    vadd.s32        \arg4, \arg0, \arg2         //int32 e[0][j] = f[0][j] + f[2][j];
+    vsub.s32        \arg5, \arg0, \arg2         //int32 e[1][j] = f[0][j] - f[2][j];
+    vshr.s32        \arg6, \arg1, #1
+    vshr.s32        \arg7, \arg3, #1
+    vsub.s32        \arg6, \arg6, \arg3         //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
+    vadd.s32        \arg7, \arg1, \arg7         //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
+//  }
 .endm
 #endif
 
 
 WELS_ASM_FUNC_BEGIN WelsDctT4_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
+    push        {r4}
+    ldr         r4, [sp, #4]
 
-	LOAD_4x4_DATA_FOR_DCT	d4, d5, d6, d7, r1, r2, r3, r4
+    LOAD_4x4_DATA_FOR_DCT   d4, d5, d6, d7, r1, r2, r3, r4
 
-	vsubl.u8	q0, d4, d6
-	vsubl.u8	q1, d5, d7
-	vtrn.s32	q0, q1
-	vswp		d1, d2
+    vsubl.u8    q0, d4, d6
+    vsubl.u8    q1, d5, d7
+    vtrn.s32    q0, q1
+    vswp        d1, d2
 
-	// horizontal transform
-	DCT_ROW_TRANSFORM_TOTAL_16BITS		d0, d1, d2, d3, d4, d5, d6, d7
+    // horizontal transform
+    DCT_ROW_TRANSFORM_TOTAL_16BITS      d0, d1, d2, d3, d4, d5, d6, d7
 
-	// transform element
-	MATRIX_TRANSFORM_EACH_16BITS	d0, d1, d2, d3
+    // transform element
+    MATRIX_TRANSFORM_EACH_16BITS    d0, d1, d2, d3
 
-	//	vertical transform
-	DCT_ROW_TRANSFORM_TOTAL_16BITS		d0, d1, d2, d3, d4, d5, d6, d7
+    //  vertical transform
+    DCT_ROW_TRANSFORM_TOTAL_16BITS      d0, d1, d2, d3, d4, d5, d6, d7
 
-	// transform element
-	MATRIX_TRANSFORM_EACH_16BITS	d0, d1, d2, d3
+    // transform element
+    MATRIX_TRANSFORM_EACH_16BITS    d0, d1, d2, d3
 
-	vst1.s16		{q0, q1}, [r0]!
+    vst1.s16        {q0, q1}, [r0]!
 
-	pop		{r4}
+    pop     {r4}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN WelsDctFourT4_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
+    push        {r4}
+    ldr         r4, [sp, #4]
 
-	LOAD_8x8_DATA_FOR_DCT	d16, d17, d18, d19, d20, d21, d22, d23, r1, r3
+    LOAD_8x8_DATA_FOR_DCT   d16, d17, d18, d19, d20, d21, d22, d23, r1, r3
 
-	vsubl.u8	q0, d16, d20
-	vsubl.u8	q1, d17, d21
-	vsubl.u8	q2, d18, d22
-	vsubl.u8	q3, d19, d23
-	MATRIX_TRANSFORM_EACH_16BITS	q0, q1, q2, q3
+    vsubl.u8    q0, d16, d20
+    vsubl.u8    q1, d17, d21
+    vsubl.u8    q2, d18, d22
+    vsubl.u8    q3, d19, d23
+    MATRIX_TRANSFORM_EACH_16BITS    q0, q1, q2, q3
 
-	// horizontal transform
-	DCT_ROW_TRANSFORM_TOTAL_16BITS		q0, q1, q2, q3, q8, q9, q10, q11
+    // horizontal transform
+    DCT_ROW_TRANSFORM_TOTAL_16BITS      q0, q1, q2, q3, q8, q9, q10, q11
 
-	// transform element
-	MATRIX_TRANSFORM_EACH_16BITS	q0, q1, q2, q3
+    // transform element
+    MATRIX_TRANSFORM_EACH_16BITS    q0, q1, q2, q3
 
-	//	vertical transform
-	DCT_ROW_TRANSFORM_TOTAL_16BITS		q0, q1, q2, q3, q8, q9, q10, q11
+    //  vertical transform
+    DCT_ROW_TRANSFORM_TOTAL_16BITS      q0, q1, q2, q3, q8, q9, q10, q11
 
-	vswp		d1, d2
-	vswp		d5, d6
-	vswp		q1, q2
-	vst1.s16		{q0, q1}, [r0]!
-	vst1.s16		{q2, q3}, [r0]!
+    vswp        d1, d2
+    vswp        d5, d6
+    vswp        q1, q2
+    vst1.s16        {q0, q1}, [r0]!
+    vst1.s16        {q2, q3}, [r0]!
 
-	////////////////
-	LOAD_8x8_DATA_FOR_DCT	d16, d17, d18, d19, d20, d21, d22, d23, r1, r3
+    ////////////////
+    LOAD_8x8_DATA_FOR_DCT   d16, d17, d18, d19, d20, d21, d22, d23, r1, r3
 
-	vsubl.u8	q0, d16, d20
-	vsubl.u8	q1, d17, d21
-	vsubl.u8	q2, d18, d22
-	vsubl.u8	q3, d19, d23
-	MATRIX_TRANSFORM_EACH_16BITS	q0, q1, q2, q3
+    vsubl.u8    q0, d16, d20
+    vsubl.u8    q1, d17, d21
+    vsubl.u8    q2, d18, d22
+    vsubl.u8    q3, d19, d23
+    MATRIX_TRANSFORM_EACH_16BITS    q0, q1, q2, q3
 
-	// horizontal transform
-	DCT_ROW_TRANSFORM_TOTAL_16BITS		q0, q1, q2, q3, q8, q9, q10, q11
+    // horizontal transform
+    DCT_ROW_TRANSFORM_TOTAL_16BITS      q0, q1, q2, q3, q8, q9, q10, q11
 
-	// transform element
-	MATRIX_TRANSFORM_EACH_16BITS	q0, q1, q2, q3
+    // transform element
+    MATRIX_TRANSFORM_EACH_16BITS    q0, q1, q2, q3
 
-	//	vertical transform
-	DCT_ROW_TRANSFORM_TOTAL_16BITS		q0, q1, q2, q3, q8, q9, q10, q11
+    //  vertical transform
+    DCT_ROW_TRANSFORM_TOTAL_16BITS      q0, q1, q2, q3, q8, q9, q10, q11
 
-	vswp		d1, d2
-	vswp		d5, d6
-	vswp		q1, q2
-	vst1.s16		{q0, q1}, [r0]!
-	vst1.s16		{q2, q3}, [r0]!
+    vswp        d1, d2
+    vswp        d5, d6
+    vswp        q1, q2
+    vst1.s16        {q0, q1}, [r0]!
+    vst1.s16        {q2, q3}, [r0]!
 
-	pop		{r4}
+    pop     {r4}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN WelsQuant4x4_neon
-	vld1.s16		{q2}, [r1]
-	vld1.s16		{q0, q1}, [r0]
-	vld1.s16		{q3}, [r2]
+    vld1.s16        {q2}, [r1]
+    vld1.s16        {q0, q1}, [r0]
+    vld1.s16        {q3}, [r2]
 
-	vmov			q8, q2
+    vmov            q8, q2
 
-	NEWQUANT_COEF_EACH_16BITS	q0, q2, d4, d5, d6, d7, q9, q10, q11
-	vst1.s16		{q2}, [r0]!
+    NEWQUANT_COEF_EACH_16BITS   q0, q2, d4, d5, d6, d7, q9, q10, q11
+    vst1.s16        {q2}, [r0]!
 
-	NEWQUANT_COEF_EACH_16BITS	q1, q8, d16, d17, d6, d7, q9, q10, q11
-	vst1.s16		{q8}, [r0]!
+    NEWQUANT_COEF_EACH_16BITS   q1, q8, d16, d17, d6, d7, q9, q10, q11
+    vst1.s16        {q8}, [r0]!
 
 WELS_ASM_FUNC_END
 
@@ -627,266 +627,266 @@
 
 WELS_ASM_FUNC_BEGIN WelsQuant4x4Dc_neon
 
-	vld1.s16		{q0, q1}, [r0]
-	vdup.s16		q2, r1		// even ff range [0, 768]
-	vdup.s16		q3, r2
+    vld1.s16        {q0, q1}, [r0]
+    vdup.s16        q2, r1      // even ff range [0, 768]
+    vdup.s16        q3, r2
 
-	vmov			q8, q2
+    vmov            q8, q2
 
-	NEWQUANT_COEF_EACH_16BITS	q0, q2, d4, d5, d6, d7, q9, q10, q11
-	vst1.s16		{q2}, [r0]!
+    NEWQUANT_COEF_EACH_16BITS   q0, q2, d4, d5, d6, d7, q9, q10, q11
+    vst1.s16        {q2}, [r0]!
 
-	NEWQUANT_COEF_EACH_16BITS	q1, q8, d16, d17, d6, d7, q9, q10, q11
-	vst1.s16		{q8}, [r0]!
+    NEWQUANT_COEF_EACH_16BITS   q1, q8, d16, d17, d6, d7, q9, q10, q11
+    vst1.s16        {q8}, [r0]!
 
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN WelsQuantFour4x4_neon
-	vld1.s16		{q2}, [r1]
-	vld1.s16		{q3}, [r2]
-	mov				r1, r0
+    vld1.s16        {q2}, [r1]
+    vld1.s16        {q3}, [r2]
+    mov             r1, r0
 
-	vld1.s16		{q0, q1}, [r0]!
-	vmov			q8, q2
-	NEWQUANT_COEF_EACH_16BITS	q0, q8, d16, d17, d6, d7, q9, q10, q11
-	vst1.s16		{q8}, [r1]!
-	vmov			q8, q2
-	NEWQUANT_COEF_EACH_16BITS	q1, q8, d16, d17, d6, d7, q9, q10, q11
-	vst1.s16		{q8}, [r1]!
+    vld1.s16        {q0, q1}, [r0]!
+    vmov            q8, q2
+    NEWQUANT_COEF_EACH_16BITS   q0, q8, d16, d17, d6, d7, q9, q10, q11
+    vst1.s16        {q8}, [r1]!
+    vmov            q8, q2
+    NEWQUANT_COEF_EACH_16BITS   q1, q8, d16, d17, d6, d7, q9, q10, q11
+    vst1.s16        {q8}, [r1]!
 
-	vld1.s16		{q0, q1}, [r0]!
-	vmov			q8, q2
-	NEWQUANT_COEF_EACH_16BITS	q0, q8, d16, d17, d6, d7, q9, q10, q11
-	vst1.s16		{q8}, [r1]!
-	vmov			q8, q2
-	NEWQUANT_COEF_EACH_16BITS	q1, q8, d16, d17, d6, d7, q9, q10, q11
-	vst1.s16		{q8}, [r1]!
+    vld1.s16        {q0, q1}, [r0]!
+    vmov            q8, q2
+    NEWQUANT_COEF_EACH_16BITS   q0, q8, d16, d17, d6, d7, q9, q10, q11
+    vst1.s16        {q8}, [r1]!
+    vmov            q8, q2
+    NEWQUANT_COEF_EACH_16BITS   q1, q8, d16, d17, d6, d7, q9, q10, q11
+    vst1.s16        {q8}, [r1]!
 
-	vld1.s16		{q0, q1}, [r0]!
-	vmov			q8, q2
-	NEWQUANT_COEF_EACH_16BITS	q0, q8, d16, d17, d6, d7, q9, q10, q11
-	vst1.s16		{q8}, [r1]!
-	vmov			q8, q2
-	NEWQUANT_COEF_EACH_16BITS	q1, q8, d16, d17, d6, d7, q9, q10, q11
-	vst1.s16		{q8}, [r1]!
+    vld1.s16        {q0, q1}, [r0]!
+    vmov            q8, q2
+    NEWQUANT_COEF_EACH_16BITS   q0, q8, d16, d17, d6, d7, q9, q10, q11
+    vst1.s16        {q8}, [r1]!
+    vmov            q8, q2
+    NEWQUANT_COEF_EACH_16BITS   q1, q8, d16, d17, d6, d7, q9, q10, q11
+    vst1.s16        {q8}, [r1]!
 
-	vld1.s16		{q0, q1}, [r0]!
-	vmov			q8, q2
-	NEWQUANT_COEF_EACH_16BITS	q0, q8, d16, d17, d6, d7, q9, q10, q11
-	vst1.s16		{q8}, [r1]!
-	vmov			q8, q2
-	NEWQUANT_COEF_EACH_16BITS	q1, q8, d16, d17, d6, d7, q9, q10, q11
-	vst1.s16		{q8}, [r1]!
+    vld1.s16        {q0, q1}, [r0]!
+    vmov            q8, q2
+    NEWQUANT_COEF_EACH_16BITS   q0, q8, d16, d17, d6, d7, q9, q10, q11
+    vst1.s16        {q8}, [r1]!
+    vmov            q8, q2
+    NEWQUANT_COEF_EACH_16BITS   q1, q8, d16, d17, d6, d7, q9, q10, q11
+    vst1.s16        {q8}, [r1]!
 
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN WelsQuantFour4x4Max_neon
-	vld1.s16		{q2}, [r1]
-	vld1.s16		{q3}, [r2]
-	mov				r1, r0
+    vld1.s16        {q2}, [r1]
+    vld1.s16        {q3}, [r2]
+    mov             r1, r0
 
-	vld1.s16		{q0, q1}, [r0]!
-	vmov			q8, q2
-	NEWQUANT_COEF_EACH_16BITS_MAX	q0, q8, d16, d17, d6, d7, q9, q10, q11, d26
-	vst1.s16		{q8}, [r1]!
-	vmov			q12, q2
-	NEWQUANT_COEF_EACH_16BITS_MAX	q1, q12, d24, d25, d6, d7, q9, q10, q11, d28
-	vst1.s16		{q12}, [r1]!		// then 1st 16 elem in d26 & d28
+    vld1.s16        {q0, q1}, [r0]!
+    vmov            q8, q2
+    NEWQUANT_COEF_EACH_16BITS_MAX   q0, q8, d16, d17, d6, d7, q9, q10, q11, d26
+    vst1.s16        {q8}, [r1]!
+    vmov            q12, q2
+    NEWQUANT_COEF_EACH_16BITS_MAX   q1, q12, d24, d25, d6, d7, q9, q10, q11, d28
+    vst1.s16        {q12}, [r1]!        // then 1st 16 elem in d26 & d28
 
-	vld1.s16		{q0, q1}, [r0]!
-	vmov			q8, q2
-	NEWQUANT_COEF_EACH_16BITS_MAX	q0, q8, d16, d17, d6, d7, q9, q10, q11, d27
-	vst1.s16		{q8}, [r1]!
-	vmov			q12, q2
-	NEWQUANT_COEF_EACH_16BITS_MAX	q1, q12, d24, d25, d6, d7, q9, q10, q11, d29
-	vst1.s16		{q12}, [r1]!	// then 2nd 16 elem in d27 & d29
+    vld1.s16        {q0, q1}, [r0]!
+    vmov            q8, q2
+    NEWQUANT_COEF_EACH_16BITS_MAX   q0, q8, d16, d17, d6, d7, q9, q10, q11, d27
+    vst1.s16        {q8}, [r1]!
+    vmov            q12, q2
+    NEWQUANT_COEF_EACH_16BITS_MAX   q1, q12, d24, d25, d6, d7, q9, q10, q11, d29
+    vst1.s16        {q12}, [r1]!    // then 2nd 16 elem in d27 & d29
 
-	SELECT_MAX_IN_ABS_COEF	q13, q14, q0, d0, d1
-	vst1.s32		{d0[0]}, [r3]!
+    SELECT_MAX_IN_ABS_COEF  q13, q14, q0, d0, d1
+    vst1.s32        {d0[0]}, [r3]!
 
-	///////////
-	vld1.s16		{q0, q1}, [r0]!
-	vmov			q8, q2
-	NEWQUANT_COEF_EACH_16BITS_MAX	q0, q8, d16, d17, d6, d7, q9, q10, q11, d26
-	vst1.s16		{q8}, [r1]!
-	vmov			q12, q2
-	NEWQUANT_COEF_EACH_16BITS_MAX	q1, q12, d24, d25, d6, d7, q9, q10, q11, d28
-	vst1.s16		{q12}, [r1]!		// then 3rd 16 elem in d26 & d28
+    ///////////
+    vld1.s16        {q0, q1}, [r0]!
+    vmov            q8, q2
+    NEWQUANT_COEF_EACH_16BITS_MAX   q0, q8, d16, d17, d6, d7, q9, q10, q11, d26
+    vst1.s16        {q8}, [r1]!
+    vmov            q12, q2
+    NEWQUANT_COEF_EACH_16BITS_MAX   q1, q12, d24, d25, d6, d7, q9, q10, q11, d28
+    vst1.s16        {q12}, [r1]!        // then 3rd 16 elem in d26 & d28
 
-	vld1.s16		{q0, q1}, [r0]!
-	vmov			q8, q2
-	NEWQUANT_COEF_EACH_16BITS_MAX	q0, q8, d16, d17, d6, d7, q9, q10, q11, d27
-	vst1.s16		{q8}, [r1]!
-	vmov			q12, q2
-	NEWQUANT_COEF_EACH_16BITS_MAX	q1, q12, d24, d25, d6, d7, q9, q10, q11, d29
-	vst1.s16		{q12}, [r1]!	// then 4th 16 elem in d27 & d29
+    vld1.s16        {q0, q1}, [r0]!
+    vmov            q8, q2
+    NEWQUANT_COEF_EACH_16BITS_MAX   q0, q8, d16, d17, d6, d7, q9, q10, q11, d27
+    vst1.s16        {q8}, [r1]!
+    vmov            q12, q2
+    NEWQUANT_COEF_EACH_16BITS_MAX   q1, q12, d24, d25, d6, d7, q9, q10, q11, d29
+    vst1.s16        {q12}, [r1]!    // then 4th 16 elem in d27 & d29
 
-	SELECT_MAX_IN_ABS_COEF	q13, q14, q0, d0, d1
-	vst1.s32		{d0[0]}, [r3]!
+    SELECT_MAX_IN_ABS_COEF  q13, q14, q0, d0, d1
+    vst1.s32        {d0[0]}, [r3]!
 
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN WelsHadamardT4Dc_neon
-	push	{r2,r3}
-	mov		r2, #64	// 2*16*sizeof(int16_t)
-	add		r3, r1, #32
+    push    {r2,r3}
+    mov     r2, #64 // 2*16*sizeof(int16_t)
+    add     r3, r1, #32
 
-	vld1.s16		{d0}, [r1], r2
-	vld1.s16		{d1}, [r3], r2
-	vld1.s16		{d4}, [r1], r2
-	vld1.s16		{d5}, [r3], r2
-	vld1.s16		{d2}, [r1], r2
-	vld1.s16		{d3}, [r3], r2
-	vld1.s16		{d6}, [r1], r2
-	vld1.s16		{d7}, [r3], r2
-	vtrn.16		q0, q2		// d0[0 4], d1[1 5]
-	vtrn.16		q1, q3		// d2[2 6], d3[3 7]
+    vld1.s16        {d0}, [r1], r2
+    vld1.s16        {d1}, [r3], r2
+    vld1.s16        {d4}, [r1], r2
+    vld1.s16        {d5}, [r3], r2
+    vld1.s16        {d2}, [r1], r2
+    vld1.s16        {d3}, [r3], r2
+    vld1.s16        {d6}, [r1], r2
+    vld1.s16        {d7}, [r3], r2
+    vtrn.16     q0, q2      // d0[0 4], d1[1 5]
+    vtrn.16     q1, q3      // d2[2 6], d3[3 7]
 
-	vld1.s16		{d16}, [r1], r2
-	vld1.s16		{d17}, [r3], r2
-	vld1.s16		{d20}, [r1], r2
-	vld1.s16		{d21}, [r3], r2
-	vld1.s16		{d18}, [r1], r2
-	vld1.s16		{d19}, [r3], r2
-	vld1.s16		{d22}, [r1], r2
-	vld1.s16		{d23}, [r3], r2
-	vtrn.16		q8, q10		//d16[08 12],d17[09 13]
-	vtrn.16		q9, q11		//d18[10 14],d19[11 15]
+    vld1.s16        {d16}, [r1], r2
+    vld1.s16        {d17}, [r3], r2
+    vld1.s16        {d20}, [r1], r2
+    vld1.s16        {d21}, [r3], r2
+    vld1.s16        {d18}, [r1], r2
+    vld1.s16        {d19}, [r3], r2
+    vld1.s16        {d22}, [r1], r2
+    vld1.s16        {d23}, [r3], r2
+    vtrn.16     q8, q10     //d16[08 12],d17[09 13]
+    vtrn.16     q9, q11     //d18[10 14],d19[11 15]
 
-	vtrn.32		q0, q8		// d0 [0 4 08 12] = dct[idx],		d1[1 5 09 13] = dct[idx+16]
-	vtrn.32		q1, q9		// d2 [2 6 10 14] = dct[idx+64],	d3[3 7 11 15] = dct[idx+80]
+    vtrn.32     q0, q8      // d0 [0 4 08 12] = dct[idx],       d1[1 5 09 13] = dct[idx+16]
+    vtrn.32     q1, q9      // d2 [2 6 10 14] = dct[idx+64],    d3[3 7 11 15] = dct[idx+80]
 
-	ROW_TRANSFORM_0_STEP	d0, d1, d3, d2, q8, q11, q10, q9
+    ROW_TRANSFORM_0_STEP    d0, d1, d3, d2, q8, q11, q10, q9
 
-	TRANSFORM_4BYTES		q0, q1, q3, q2, q8, q11, q10, q9
+    TRANSFORM_4BYTES        q0, q1, q3, q2, q8, q11, q10, q9
 
-	// transform element 32bits
-	vtrn.s32		q0, q1				//[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
-	vtrn.s32		q2, q3				//[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
-	vswp			d1, d4				//[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
-	vswp			d3, d6				//[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
+    // transform element 32bits
+    vtrn.s32        q0, q1              //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
+    vtrn.s32        q2, q3              //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
+    vswp            d1, d4              //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
+    vswp            d3, d6              //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
 
-	COL_TRANSFORM_0_STEP	q0, q1, q3, q2, q8, q11, q10, q9
+    COL_TRANSFORM_0_STEP    q0, q1, q3, q2, q8, q11, q10, q9
 
-	TRANSFORM_4BYTES		q0, q1, q3, q2, q8, q11, q10, q9
+    TRANSFORM_4BYTES        q0, q1, q3, q2, q8, q11, q10, q9
 
-	vrshrn.s32		d16, q0, #1
-	vrshrn.s32		d17, q1, #1
-	vrshrn.s32		d18, q2, #1
-	vrshrn.s32		d19, q3, #1
-	vst1.16	{q8, q9}, [r0]	//store
+    vrshrn.s32      d16, q0, #1
+    vrshrn.s32      d17, q1, #1
+    vrshrn.s32      d18, q2, #1
+    vrshrn.s32      d19, q3, #1
+    vst1.16 {q8, q9}, [r0]  //store
 
-	pop		{r2,r3}
+    pop     {r2,r3}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN WelsHadamardQuant2x2_neon
 
-	vdup.s16	d1, r1				//ff
-	vdup.s16	d2, r2				//mf
-	veor		d3, d3
+    vdup.s16    d1, r1              //ff
+    vdup.s16    d2, r2              //mf
+    veor        d3, d3
 
-	mov			r1, #32
-	mov			r2, r0
+    mov         r1, #32
+    mov         r2, r0
 
-	vld1.s16	{d0[0]}, [r0], r1		//rs[00]
-	vst1.s16	{d3[0]}, [r2], r1		//rs[00]=0
-	vld1.s16	{d0[1]}, [r0], r1		//rs[16]
-	vst1.s16	{d3[0]}, [r2], r1		//rs[16]=0
-	vld1.s16	{d0[2]}, [r0], r1		//rs[32]
-	vst1.s16	{d3[0]}, [r2], r1		//rs[32]=0
-	vld1.s16	{d0[3]}, [r0], r1		//rs[48]
-	vst1.s16	{d3[0]}, [r2], r1		//rs[48]=0
+    vld1.s16    {d0[0]}, [r0], r1       //rs[00]
+    vst1.s16    {d3[0]}, [r2], r1       //rs[00]=0
+    vld1.s16    {d0[1]}, [r0], r1       //rs[16]
+    vst1.s16    {d3[0]}, [r2], r1       //rs[16]=0
+    vld1.s16    {d0[2]}, [r0], r1       //rs[32]
+    vst1.s16    {d3[0]}, [r2], r1       //rs[32]=0
+    vld1.s16    {d0[3]}, [r0], r1       //rs[48]
+    vst1.s16    {d3[0]}, [r2], r1       //rs[48]=0
 
-	HDM_QUANT_2x2_TOTAL_16BITS	d0, d4, d5		// output d5
+    HDM_QUANT_2x2_TOTAL_16BITS  d0, d4, d5      // output d5
 
-	HDM_QUANT_2x2_TOTAL_16BITS	d5, d4, d0		// output d0
+    HDM_QUANT_2x2_TOTAL_16BITS  d5, d4, d0      // output d0
 
-	QUANT_DUALWORD_COEF_EACH_16BITS	d0, d1, d2, d3, q2
+    QUANT_DUALWORD_COEF_EACH_16BITS d0, d1, d2, d3, q2
 
-	vst1.s16	d1, [r3]		// store to dct
-	ldr			r2, [sp, #0]
-	vst1.s16	d1, [r2]		// store to block
+    vst1.s16    d1, [r3]        // store to dct
+    ldr         r2, [sp, #0]
+    vst1.s16    d1, [r2]        // store to block
 
-	mov			r1, #1
-	vdup.s16	d3, r1
-	DC_ZERO_COUNT_IN_DUALWORD	d1, d0, d3
+    mov         r1, #1
+    vdup.s16    d3, r1
+    DC_ZERO_COUNT_IN_DUALWORD   d1, d0, d3
 
-	vmov	r0, r1, d0
-	and		r0, #0x07		// range [0~4]
-	rsb		r0, #4
+    vmov    r0, r1, d0
+    and     r0, #0x07       // range [0~4]
+    rsb     r0, #4
 WELS_ASM_FUNC_END
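
WelsHadamardQuant2x2_neon gathers the four DC positions (spaced 16 int16 apart, i.e. a 32-byte stride), zeroes them in place, runs the 2x2 Hadamard twice via HDM_QUANT_2x2_TOTAL_16BITS, quantizes, stores to both dct and block, and returns the number of non-zero quantized values (4 minus the zero count). A scalar sketch under those assumptions follows; names are illustrative and the element ordering may differ from the NEON register layout.

    #include <stdint.h>
    #include <stdlib.h>

    /* Sketch: 2x2 DC Hadamard + quantization, returning the non-zero count. */
    static int HadamardQuant2x2Sketch (int16_t* rs, int16_t f, int16_t mf,
                                       int16_t dct[4], int16_t block[4]) {
        int16_t dc[4] = { rs[0], rs[16], rs[32], rs[48] };
        rs[0] = rs[16] = rs[32] = rs[48] = 0;              /* zero the source DCs */
        int16_t s0 = dc[0] + dc[2], s1 = dc[1] + dc[3];
        int16_t s2 = dc[0] - dc[2], s3 = dc[1] - dc[3];
        int16_t h[4] = { (int16_t)(s0 + s1), (int16_t)(s0 - s1),
                         (int16_t)(s2 + s3), (int16_t)(s2 - s3) };
        int nonZero = 0;
        for (int i = 0; i < 4; i++) {
            int32_t q = ((abs (h[i]) + f) * mf) >> 16;
            dct[i] = block[i] = (int16_t)(h[i] > 0 ? q : -q);
            nonZero += (dct[i] != 0);
        }
        return nonZero;
    }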
 
 
 WELS_ASM_FUNC_BEGIN WelsHadamardQuant2x2SkipKernel_neon
 
-	vdup.s16	d3, r1
-	mov			r1, #32
-	vld1.s16	{d0[0]}, [r0], r1		//rs[00]
-	vld1.s16	{d0[1]}, [r0], r1		//rs[16]
-	vld1.s16	{d0[2]}, [r0], r1		//rs[32]
-	vld1.s16	{d0[3]}, [r0], r1		//rs[48]
+    vdup.s16    d3, r1
+    mov         r1, #32
+    vld1.s16    {d0[0]}, [r0], r1       //rs[00]
+    vld1.s16    {d0[1]}, [r0], r1       //rs[16]
+    vld1.s16    {d0[2]}, [r0], r1       //rs[32]
+    vld1.s16    {d0[3]}, [r0], r1       //rs[48]
 
-	HDM_QUANT_2x2_TOTAL_16BITS	d0, d1, d2		// output d2
+    HDM_QUANT_2x2_TOTAL_16BITS  d0, d1, d2      // output d2
 
-	HDM_QUANT_2x2_TOTAL_16BITS	d2, d1, d0		// output d0
+    HDM_QUANT_2x2_TOTAL_16BITS  d2, d1, d0      // output d0
 
-	vabs.s16	d1, d0
-	vcgt.s16	d1, d1, d3		// abs(dct[i])>threshold;
-	vmov	r0, r1, d1
-	orr		r0, r1
+    vabs.s16    d1, d0
+    vcgt.s16    d1, d1, d3      // abs(dct[i])>threshold;
+    vmov    r0, r1, d1
+    orr     r0, r1
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN WelsGetNoneZeroCount_neon
-	push	{r1}
-	vld1.s16	{q0, q1}, [r0]
-	vmov.s16	q8, #1
+    push    {r1}
+    vld1.s16    {q0, q1}, [r0]
+    vmov.s16    q8, #1
 
-	ZERO_COUNT_IN_2_QUARWORD	q0, q1, q8, d0, d1, d2, d3
-	vmov	r0, r1, d0
-	and		r0, #0x1F	// range [0~16]
-	rsb		r0, #16
-	pop		{r1}
+    ZERO_COUNT_IN_2_QUARWORD    q0, q1, q8, d0, d1, d2, d3
+    vmov    r0, r1, d0
+    and     r0, #0x1F   // range [0~16]
+    rsb     r0, #16
+    pop     {r1}
 WELS_ASM_FUNC_END
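
WelsGetNoneZeroCount_neon counts zeros among the 16 coefficients with ZERO_COUNT_IN_2_QUARWORD and returns 16 minus that count. The scalar equivalent is trivial (illustrative name):

    #include <stdint.h>

    /* Sketch: number of non-zero coefficients in a 4x4 block. */
    static int GetNoneZeroCountSketch (const int16_t coef[16]) {
        int n = 0;
        for (int i = 0; i < 16; i++)
            n += (coef[i] != 0);
        return n;
    }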
 
 
 WELS_ASM_FUNC_BEGIN WelsDequant4x4_neon
-	vld1.s16	{q0, q1}, [r0]
-	vld1.u16	{q2}, [r1]
+    vld1.s16    {q0, q1}, [r0]
+    vld1.u16    {q2}, [r1]
 
-	vmul.s16	q8, q0, q2
-	vmul.s16	q9, q1, q2
+    vmul.s16    q8, q0, q2
+    vmul.s16    q9, q1, q2
 
-	vst1.s16	{q8, q9}, [r0]
+    vst1.s16    {q8, q9}, [r0]
 WELS_ASM_FUNC_END
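
WelsDequant4x4_neon multiplies all 16 coefficients by an 8-entry dequant table, reusing the same q2 register for both halves of the block. In scalar form (illustrative name):

    #include <stdint.h>

    /* Sketch: per-coefficient dequantization with an 8-entry table reused for
     * both halves of the 4x4 block, as the NEON code does. */
    static void Dequant4x4Sketch (int16_t coef[16], const uint16_t mf[8]) {
        for (int i = 0; i < 16; i++)
            coef[i] = (int16_t)(coef[i] * mf[i & 7]);
    }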
 
 
 WELS_ASM_FUNC_BEGIN WelsDequantFour4x4_neon
-	vld1.u16	{q12}, [r1]
-	mov		r1, r0
-	vld1.s16	{q0, q1}, [r0]!
-	vld1.s16	{q2, q3}, [r0]!
-	vmul.s16	q0, q0, q12
-	vld1.s16	{q8, q9}, [r0]!
-	vmul.s16	q1, q1, q12
-	vld1.s16	{q10, q11}, [r0]!
+    vld1.u16    {q12}, [r1]
+    mov     r1, r0
+    vld1.s16    {q0, q1}, [r0]!
+    vld1.s16    {q2, q3}, [r0]!
+    vmul.s16    q0, q0, q12
+    vld1.s16    {q8, q9}, [r0]!
+    vmul.s16    q1, q1, q12
+    vld1.s16    {q10, q11}, [r0]!
 
-	vst1.s16	{q0, q1}, [r1]!
+    vst1.s16    {q0, q1}, [r1]!
 
-	vmul.s16	q2, q2, q12
-	vmul.s16	q3, q3, q12
-	vmul.s16	q8, q8, q12
-	vst1.s16	{q2, q3}, [r1]!
+    vmul.s16    q2, q2, q12
+    vmul.s16    q3, q3, q12
+    vmul.s16    q8, q8, q12
+    vst1.s16    {q2, q3}, [r1]!
 
-	vmul.s16	q9, q9, q12
-	vmul.s16	q10, q10, q12
-	vmul.s16	q11, q11, q12
-	vst1.s16	{q8, q9}, [r1]!
-	vst1.s16	{q10, q11}, [r1]!
+    vmul.s16    q9, q9, q12
+    vmul.s16    q10, q10, q12
+    vmul.s16    q11, q11, q12
+    vst1.s16    {q8, q9}, [r1]!
+    vst1.s16    {q10, q11}, [r1]!
 
 WELS_ASM_FUNC_END
 
@@ -893,258 +893,258 @@
 
 WELS_ASM_FUNC_BEGIN WelsDequantIHadamard4x4_neon
 
-	vld1.s16	{q0, q1}, [r0]
-	vdup.s16	q8, r1
+    vld1.s16    {q0, q1}, [r0]
+    vdup.s16    q8, r1
 
-	IHDM_4x4_TOTAL_16BITS	q0, q2, q3
-	IHDM_4x4_TOTAL_16BITS	q1, q2, q3
+    IHDM_4x4_TOTAL_16BITS   q0, q2, q3
+    IHDM_4x4_TOTAL_16BITS   q1, q2, q3
 
-	MATRIX_TRANSFORM_EACH_16BITS	d0, d1, d2, d3
+    MATRIX_TRANSFORM_EACH_16BITS    d0, d1, d2, d3
 
-	IHDM_4x4_TOTAL_16BITS	q0, q2, q3
-	vmul.s16	q0, q8
+    IHDM_4x4_TOTAL_16BITS   q0, q2, q3
+    vmul.s16    q0, q8
 
-	IHDM_4x4_TOTAL_16BITS	q1, q2, q3
-	vmul.s16	q1, q8
+    IHDM_4x4_TOTAL_16BITS   q1, q2, q3
+    vmul.s16    q1, q8
 
-	MATRIX_TRANSFORM_EACH_16BITS	d0, d1, d2, d3
-	vst1.s16	{q0, q1}, [r0]
+    MATRIX_TRANSFORM_EACH_16BITS    d0, d1, d2, d3
+    vst1.s16    {q0, q1}, [r0]
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN WelsIDctT4Rec_neon
-	vld1.u32		{d16[0]}, [r2], r3
-	push			{r4}
-	ldr				r4, [sp, #4]
-	vld1.u32		{d16[1]}, [r2], r3
+    vld1.u32        {d16[0]}, [r2], r3
+    push            {r4}
+    ldr             r4, [sp, #4]
+    vld1.u32        {d16[1]}, [r2], r3
 
-	vld4.s16		{d0, d1, d2, d3}, [r4]		// cost 3 cycles!
-	vld1.u32		{d17[0]}, [r2], r3
-	vld1.u32		{d17[1]}, [r2], r3			// q7 is pred
+    vld4.s16        {d0, d1, d2, d3}, [r4]      // cost 3 cycles!
+    vld1.u32        {d17[0]}, [r2], r3
+    vld1.u32        {d17[1]}, [r2], r3          // q7 is pred
 
-	ROW_TRANSFORM_1_STEP_TOTAL_16BITS		d0, d1, d2, d3, d4, d5, d6, d7
+    ROW_TRANSFORM_1_STEP_TOTAL_16BITS       d0, d1, d2, d3, d4, d5, d6, d7
 
-	TRANSFORM_TOTAL_16BITS		d0, d1, d2, d3, d4, d5, d6, d7
+    TRANSFORM_TOTAL_16BITS      d0, d1, d2, d3, d4, d5, d6, d7
 
-	MATRIX_TRANSFORM_EACH_16BITS	d0, d1, d2, d3
+    MATRIX_TRANSFORM_EACH_16BITS    d0, d1, d2, d3
 
-	ROW_TRANSFORM_1_STEP_TOTAL_16BITS		d0, d1, d2, d3, d4, d5, d6, d7
+    ROW_TRANSFORM_1_STEP_TOTAL_16BITS       d0, d1, d2, d3, d4, d5, d6, d7
 
-	TRANSFORM_TOTAL_16BITS		d0, d1, d2, d3, d4, d5, d6, d7
-	vrshr.s16		d0, d0, #6
-	vrshr.s16		d1, d1, #6
-	vrshr.s16		d2, d2, #6
-	vrshr.s16		d3, d3, #6
+    TRANSFORM_TOTAL_16BITS      d0, d1, d2, d3, d4, d5, d6, d7
+    vrshr.s16       d0, d0, #6
+    vrshr.s16       d1, d1, #6
+    vrshr.s16       d2, d2, #6
+    vrshr.s16       d3, d3, #6
 
-	//after rounding 6, clip into [0, 255]
-	vmovl.u8		q2,d16
-	vadd.s16		q0,q2
-	vqmovun.s16	d16,q0
-	vst1.32		{d16[0]},[r0],r1
-	vst1.32		{d16[1]},[r0],r1
+    //after rounding 6, clip into [0, 255]
+    vmovl.u8        q2,d16
+    vadd.s16        q0,q2
+    vqmovun.s16 d16,q0
+    vst1.32     {d16[0]},[r0],r1
+    vst1.32     {d16[1]},[r0],r1
 
-	vmovl.u8		q2,d17
-	vadd.s16		q1,q2
-	vqmovun.s16	d17,q1
-	vst1.32		{d17[0]},[r0],r1
-	vst1.32		{d17[1]},[r0]
+    vmovl.u8        q2,d17
+    vadd.s16        q1,q2
+    vqmovun.s16 d17,q1
+    vst1.32     {d17[0]},[r0],r1
+    vst1.32     {d17[1]},[r0]
 
-	pop			{r4}
+    pop         {r4}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN WelsIDctFourT4Rec_neon
 
-	vld1.u64		{d24}, [r2], r3
-	push			{r4}
-	ldr				r4, [sp, #4]
-	vld1.u64		{d25}, [r2], r3
+    vld1.u64        {d24}, [r2], r3
+    push            {r4}
+    ldr             r4, [sp, #4]
+    vld1.u64        {d25}, [r2], r3
 
-	vld4.s16		{d0, d1, d2, d3}, [r4]!		// cost 3 cycles!
-	vld1.u64		{d26}, [r2], r3
-	vld1.u64		{d27}, [r2], r3
-	vld4.s16		{d4, d5, d6, d7}, [r4]!		// cost 3 cycles!
-	vswp			d1, d4
-	vswp			d3, d6
-	vswp			q1, q2						// q0~q3
+    vld4.s16        {d0, d1, d2, d3}, [r4]!     // cost 3 cycles!
+    vld1.u64        {d26}, [r2], r3
+    vld1.u64        {d27}, [r2], r3
+    vld4.s16        {d4, d5, d6, d7}, [r4]!     // cost 3 cycles!
+    vswp            d1, d4
+    vswp            d3, d6
+    vswp            q1, q2                      // q0~q3
 
-	ROW_TRANSFORM_1_STEP_TOTAL_16BITS		q0, q1, q2, q3, q8, q9, q10, q11
+    ROW_TRANSFORM_1_STEP_TOTAL_16BITS       q0, q1, q2, q3, q8, q9, q10, q11
 
-	TRANSFORM_TOTAL_16BITS		q0, q1, q2, q3, q8, q9, q10, q11
+    TRANSFORM_TOTAL_16BITS      q0, q1, q2, q3, q8, q9, q10, q11
 
-	MATRIX_TRANSFORM_EACH_16BITS	q0, q1, q2, q3
+    MATRIX_TRANSFORM_EACH_16BITS    q0, q1, q2, q3
 
-	ROW_TRANSFORM_1_STEP_TOTAL_16BITS		q0, q1, q2, q3, q8, q9, q10, q11
+    ROW_TRANSFORM_1_STEP_TOTAL_16BITS       q0, q1, q2, q3, q8, q9, q10, q11
 
-	TRANSFORM_TOTAL_16BITS		q0, q1, q2, q3, q8, q9, q10, q11
-	vrshr.s16		q0, q0, #6
-	vrshr.s16		q1, q1, #6
-	vrshr.s16		q2, q2, #6
-	vrshr.s16		q3, q3, #6
+    TRANSFORM_TOTAL_16BITS      q0, q1, q2, q3, q8, q9, q10, q11
+    vrshr.s16       q0, q0, #6
+    vrshr.s16       q1, q1, #6
+    vrshr.s16       q2, q2, #6
+    vrshr.s16       q3, q3, #6
 
-	//after rounding 6, clip into [0, 255]
-	vmovl.u8		q8,d24
-	vadd.s16		q0,q8
-	vqmovun.s16	d24,q0
-	vst1.u8		{d24},[r0],r1
+    //after rounding 6, clip into [0, 255]
+    vmovl.u8        q8,d24
+    vadd.s16        q0,q8
+    vqmovun.s16 d24,q0
+    vst1.u8     {d24},[r0],r1
 
-	vmovl.u8		q8,d25
-	vadd.s16		q1,q8
-	vqmovun.s16	d25,q1
-	vst1.u8		{d25},[r0],r1
+    vmovl.u8        q8,d25
+    vadd.s16        q1,q8
+    vqmovun.s16 d25,q1
+    vst1.u8     {d25},[r0],r1
 
-	vmovl.u8		q8,d26
-	vadd.s16		q2,q8
-	vqmovun.s16	d26,q2
-	vst1.u8		{d26},[r0],r1
+    vmovl.u8        q8,d26
+    vadd.s16        q2,q8
+    vqmovun.s16 d26,q2
+    vst1.u8     {d26},[r0],r1
 
-	vmovl.u8		q8,d27
-	vadd.s16		q3,q8
-	vqmovun.s16	d27,q3
-	vst1.u8		{d27},[r0],r1
+    vmovl.u8        q8,d27
+    vadd.s16        q3,q8
+    vqmovun.s16 d27,q3
+    vst1.u8     {d27},[r0],r1
 
-	vld1.u64		{d24}, [r2], r3
-	vld1.u64		{d25}, [r2], r3
+    vld1.u64        {d24}, [r2], r3
+    vld1.u64        {d25}, [r2], r3
 
-	vld4.s16		{d0, d1, d2, d3}, [r4]!		// cost 3 cycles!
-	vld1.u64		{d26}, [r2], r3
-	vld1.u64		{d27}, [r2], r3
-	vld4.s16		{d4, d5, d6, d7}, [r4]!		// cost 3 cycles!
-	vswp			d1, d4
-	vswp			d3, d6
-	vswp			q1, q2						// q0~q3
+    vld4.s16        {d0, d1, d2, d3}, [r4]!     // cost 3 cycles!
+    vld1.u64        {d26}, [r2], r3
+    vld1.u64        {d27}, [r2], r3
+    vld4.s16        {d4, d5, d6, d7}, [r4]!     // cost 3 cycles!
+    vswp            d1, d4
+    vswp            d3, d6
+    vswp            q1, q2                      // q0~q3
 
-	ROW_TRANSFORM_1_STEP_TOTAL_16BITS		q0, q1, q2, q3, q8, q9, q10, q11
+    ROW_TRANSFORM_1_STEP_TOTAL_16BITS       q0, q1, q2, q3, q8, q9, q10, q11
 
-	TRANSFORM_TOTAL_16BITS		q0, q1, q2, q3, q8, q9, q10, q11
+    TRANSFORM_TOTAL_16BITS      q0, q1, q2, q3, q8, q9, q10, q11
 
-	MATRIX_TRANSFORM_EACH_16BITS	q0, q1, q2, q3
+    MATRIX_TRANSFORM_EACH_16BITS    q0, q1, q2, q3
 
-	ROW_TRANSFORM_1_STEP_TOTAL_16BITS		q0, q1, q2, q3, q8, q9, q10, q11
+    ROW_TRANSFORM_1_STEP_TOTAL_16BITS       q0, q1, q2, q3, q8, q9, q10, q11
 
-	TRANSFORM_TOTAL_16BITS		q0, q1, q2, q3, q8, q9, q10, q11
-	vrshr.s16		q0, q0, #6
-	vrshr.s16		q1, q1, #6
-	vrshr.s16		q2, q2, #6
-	vrshr.s16		q3, q3, #6
+    TRANSFORM_TOTAL_16BITS      q0, q1, q2, q3, q8, q9, q10, q11
+    vrshr.s16       q0, q0, #6
+    vrshr.s16       q1, q1, #6
+    vrshr.s16       q2, q2, #6
+    vrshr.s16       q3, q3, #6
 
-	//after rounding 6, clip into [0, 255]
-	vmovl.u8		q8,d24
-	vadd.s16		q0,q8
-	vqmovun.s16	d24,q0
-	vst1.u8		{d24},[r0],r1
+    //after rounding 6, clip into [0, 255]
+    vmovl.u8        q8,d24
+    vadd.s16        q0,q8
+    vqmovun.s16 d24,q0
+    vst1.u8     {d24},[r0],r1
 
-	vmovl.u8		q8,d25
-	vadd.s16		q1,q8
-	vqmovun.s16	d25,q1
-	vst1.u8		{d25},[r0],r1
+    vmovl.u8        q8,d25
+    vadd.s16        q1,q8
+    vqmovun.s16 d25,q1
+    vst1.u8     {d25},[r0],r1
 
-	vmovl.u8		q8,d26
-	vadd.s16		q2,q8
-	vqmovun.s16	d26,q2
-	vst1.u8		{d26},[r0],r1
+    vmovl.u8        q8,d26
+    vadd.s16        q2,q8
+    vqmovun.s16 d26,q2
+    vst1.u8     {d26},[r0],r1
 
-	vmovl.u8		q8,d27
-	vadd.s16		q3,q8
-	vqmovun.s16	d27,q3
-	vst1.u8		{d27},[r0],r1
+    vmovl.u8        q8,d27
+    vadd.s16        q3,q8
+    vqmovun.s16 d27,q3
+    vst1.u8     {d27},[r0],r1
 
-	pop			{r4}
+    pop         {r4}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN WelsIDctRecI16x16Dc_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
+    push        {r4}
+    ldr         r4, [sp, #4]
 
-	vld1.s16	{q8,q9}, [r4]
-	vrshr.s16		q8, q8, #6
-	vrshr.s16		q9, q9, #6
+    vld1.s16    {q8,q9}, [r4]
+    vrshr.s16       q8, q8, #6
+    vrshr.s16       q9, q9, #6
 
-	vdup.s16	d20, d16[0]
-	vdup.s16	d21, d16[1]
-	vdup.s16	d22, d16[2]
-	vdup.s16	d23, d16[3]
+    vdup.s16    d20, d16[0]
+    vdup.s16    d21, d16[1]
+    vdup.s16    d22, d16[2]
+    vdup.s16    d23, d16[3]
 
-	vld1.u8	{q0}, [r2], r3
-	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP	d0, d1, q10, q11, q12, q13
-	vst1.u8	{q0}, [r0], r1
+    vld1.u8 {q0}, [r2], r3
+    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP   d0, d1, q10, q11, q12, q13
+    vst1.u8 {q0}, [r0], r1
 
-	vld1.u8	{q0}, [r2], r3
-	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP	d0, d1, q10, q11, q12, q13
-	vst1.u8	{q0}, [r0], r1
+    vld1.u8 {q0}, [r2], r3
+    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP   d0, d1, q10, q11, q12, q13
+    vst1.u8 {q0}, [r0], r1
 
-	vld1.u8	{q0}, [r2], r3
-	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP	d0, d1, q10, q11, q12, q13
-	vst1.u8	{q0}, [r0], r1
+    vld1.u8 {q0}, [r2], r3
+    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP   d0, d1, q10, q11, q12, q13
+    vst1.u8 {q0}, [r0], r1
 
-	vld1.u8	{q0}, [r2], r3
-	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP	d0, d1, q10, q11, q12, q13
-	vst1.u8	{q0}, [r0], r1
+    vld1.u8 {q0}, [r2], r3
+    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP   d0, d1, q10, q11, q12, q13
+    vst1.u8 {q0}, [r0], r1
 
-	vdup.s16	d20, d17[0]
-	vdup.s16	d21, d17[1]
-	vdup.s16	d22, d17[2]
-	vdup.s16	d23, d17[3]
+    vdup.s16    d20, d17[0]
+    vdup.s16    d21, d17[1]
+    vdup.s16    d22, d17[2]
+    vdup.s16    d23, d17[3]
 
-	vld1.u8	{q0}, [r2], r3
-	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP	d0, d1, q10, q11, q12, q13
-	vst1.u8	{q0}, [r0], r1
+    vld1.u8 {q0}, [r2], r3
+    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP   d0, d1, q10, q11, q12, q13
+    vst1.u8 {q0}, [r0], r1
 
-	vld1.u8	{q0}, [r2], r3
-	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP	d0, d1, q10, q11, q12, q13
-	vst1.u8	{q0}, [r0], r1
+    vld1.u8 {q0}, [r2], r3
+    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP   d0, d1, q10, q11, q12, q13
+    vst1.u8 {q0}, [r0], r1
 
-	vld1.u8	{q0}, [r2], r3
-	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP	d0, d1, q10, q11, q12, q13
-	vst1.u8	{q0}, [r0], r1
+    vld1.u8 {q0}, [r2], r3
+    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP   d0, d1, q10, q11, q12, q13
+    vst1.u8 {q0}, [r0], r1
 
-	vld1.u8	{q0}, [r2], r3
-	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP	d0, d1, q10, q11, q12, q13
-	vst1.u8	{q0}, [r0], r1
+    vld1.u8 {q0}, [r2], r3
+    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP   d0, d1, q10, q11, q12, q13
+    vst1.u8 {q0}, [r0], r1
 
-	vdup.s16	d20, d18[0]
-	vdup.s16	d21, d18[1]
-	vdup.s16	d22, d18[2]
-	vdup.s16	d23, d18[3]
+    vdup.s16    d20, d18[0]
+    vdup.s16    d21, d18[1]
+    vdup.s16    d22, d18[2]
+    vdup.s16    d23, d18[3]
 
-	vld1.u8	{q0}, [r2], r3
-	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP	d0, d1, q10, q11, q12, q13
-	vst1.u8	{q0}, [r0], r1
+    vld1.u8 {q0}, [r2], r3
+    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP   d0, d1, q10, q11, q12, q13
+    vst1.u8 {q0}, [r0], r1
 
-	vld1.u8	{q0}, [r2], r3
-	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP	d0, d1, q10, q11, q12, q13
-	vst1.u8	{q0}, [r0], r1
+    vld1.u8 {q0}, [r2], r3
+    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP   d0, d1, q10, q11, q12, q13
+    vst1.u8 {q0}, [r0], r1
 
-	vld1.u8	{q0}, [r2], r3
-	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP	d0, d1, q10, q11, q12, q13
-	vst1.u8	{q0}, [r0], r1
+    vld1.u8 {q0}, [r2], r3
+    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP   d0, d1, q10, q11, q12, q13
+    vst1.u8 {q0}, [r0], r1
 
-	vld1.u8	{q0}, [r2], r3
-	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP	d0, d1, q10, q11, q12, q13
-	vst1.u8	{q0}, [r0], r1
+    vld1.u8 {q0}, [r2], r3
+    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP   d0, d1, q10, q11, q12, q13
+    vst1.u8 {q0}, [r0], r1
 
-	vdup.s16	d20, d19[0]
-	vdup.s16	d21, d19[1]
-	vdup.s16	d22, d19[2]
-	vdup.s16	d23, d19[3]
+    vdup.s16    d20, d19[0]
+    vdup.s16    d21, d19[1]
+    vdup.s16    d22, d19[2]
+    vdup.s16    d23, d19[3]
 
-	vld1.u8	{q0}, [r2], r3
-	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP	d0, d1, q10, q11, q12, q13
-	vst1.u8	{q0}, [r0], r1
+    vld1.u8 {q0}, [r2], r3
+    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP   d0, d1, q10, q11, q12, q13
+    vst1.u8 {q0}, [r0], r1
 
-	vld1.u8	{q0}, [r2], r3
-	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP	d0, d1, q10, q11, q12, q13
-	vst1.u8	{q0}, [r0], r1
+    vld1.u8 {q0}, [r2], r3
+    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP   d0, d1, q10, q11, q12, q13
+    vst1.u8 {q0}, [r0], r1
 
-	vld1.u8	{q0}, [r2], r3
-	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP	d0, d1, q10, q11, q12, q13
-	vst1.u8	{q0}, [r0], r1
+    vld1.u8 {q0}, [r2], r3
+    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP   d0, d1, q10, q11, q12, q13
+    vst1.u8 {q0}, [r0], r1
 
-	vld1.u8	{q0}, [r2], r3
-	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP	d0, d1, q10, q11, q12, q13
-	vst1.u8	{q0}, [r0], r1
+    vld1.u8 {q0}, [r2], r3
+    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP   d0, d1, q10, q11, q12, q13
+    vst1.u8 {q0}, [r0], r1
 
-	pop			{r4}
+    pop         {r4}
 WELS_ASM_FUNC_END
 #endif
--- a/codec/encoder/core/x86/coeff.asm
+++ b/codec/encoder/core/x86/coeff.asm
@@ -55,262 +55,262 @@
 
 align 16
 byte_1pos_table:
-	db 0,0,0,0,0,0,0,0, ;0
-	db 0,0,0,0,0,0,0,1, ;1
-	db 1,0,0,0,0,0,0,1, ;2
-	db 1,0,0,0,0,0,0,2, ;3
-	db 2,0,0,0,0,0,0,1, ;4
-	db 2,0,0,0,0,0,0,2, ;5
-	db 2,1,0,0,0,0,0,2, ;6
-	db 2,1,0,0,0,0,0,3, ;7
-	db 3,0,0,0,0,0,0,1, ;8
-	db 3,0,0,0,0,0,0,2, ;9
-	db 3,1,0,0,0,0,0,2, ;10
-	db 3,1,0,0,0,0,0,3, ;11
-	db 3,2,0,0,0,0,0,2, ;12
-	db 3,2,0,0,0,0,0,3, ;13
-	db 3,2,1,0,0,0,0,3, ;14
-	db 3,2,1,0,0,0,0,4, ;15
-	db 4,0,0,0,0,0,0,1, ;16
-	db 4,0,0,0,0,0,0,2, ;17
-	db 4,1,0,0,0,0,0,2, ;18
-	db 4,1,0,0,0,0,0,3, ;19
-	db 4,2,0,0,0,0,0,2, ;20
-	db 4,2,0,0,0,0,0,3, ;21
-	db 4,2,1,0,0,0,0,3, ;22
-	db 4,2,1,0,0,0,0,4, ;23
-	db 4,3,0,0,0,0,0,2, ;24
-	db 4,3,0,0,0,0,0,3, ;25
-	db 4,3,1,0,0,0,0,3, ;26
-	db 4,3,1,0,0,0,0,4, ;27
-	db 4,3,2,0,0,0,0,3, ;28
-	db 4,3,2,0,0,0,0,4, ;29
-	db 4,3,2,1,0,0,0,4, ;30
-	db 4,3,2,1,0,0,0,5, ;31
-	db 5,0,0,0,0,0,0,1, ;32
-	db 5,0,0,0,0,0,0,2, ;33
-	db 5,1,0,0,0,0,0,2, ;34
-	db 5,1,0,0,0,0,0,3, ;35
-	db 5,2,0,0,0,0,0,2, ;36
-	db 5,2,0,0,0,0,0,3, ;37
-	db 5,2,1,0,0,0,0,3, ;38
-	db 5,2,1,0,0,0,0,4, ;39
-	db 5,3,0,0,0,0,0,2, ;40
-	db 5,3,0,0,0,0,0,3, ;41
-	db 5,3,1,0,0,0,0,3, ;42
-	db 5,3,1,0,0,0,0,4, ;43
-	db 5,3,2,0,0,0,0,3, ;44
-	db 5,3,2,0,0,0,0,4, ;45
-	db 5,3,2,1,0,0,0,4, ;46
-	db 5,3,2,1,0,0,0,5, ;47
-	db 5,4,0,0,0,0,0,2, ;48
-	db 5,4,0,0,0,0,0,3, ;49
-	db 5,4,1,0,0,0,0,3, ;50
-	db 5,4,1,0,0,0,0,4, ;51
-	db 5,4,2,0,0,0,0,3, ;52
-	db 5,4,2,0,0,0,0,4, ;53
-	db 5,4,2,1,0,0,0,4, ;54
-	db 5,4,2,1,0,0,0,5, ;55
-	db 5,4,3,0,0,0,0,3, ;56
-	db 5,4,3,0,0,0,0,4, ;57
-	db 5,4,3,1,0,0,0,4, ;58
-	db 5,4,3,1,0,0,0,5, ;59
-	db 5,4,3,2,0,0,0,4, ;60
-	db 5,4,3,2,0,0,0,5, ;61
-	db 5,4,3,2,1,0,0,5, ;62
-	db 5,4,3,2,1,0,0,6, ;63
-	db 6,0,0,0,0,0,0,1, ;64
-	db 6,0,0,0,0,0,0,2, ;65
-	db 6,1,0,0,0,0,0,2, ;66
-	db 6,1,0,0,0,0,0,3, ;67
-	db 6,2,0,0,0,0,0,2, ;68
-	db 6,2,0,0,0,0,0,3, ;69
-	db 6,2,1,0,0,0,0,3, ;70
-	db 6,2,1,0,0,0,0,4, ;71
-	db 6,3,0,0,0,0,0,2, ;72
-	db 6,3,0,0,0,0,0,3, ;73
-	db 6,3,1,0,0,0,0,3, ;74
-	db 6,3,1,0,0,0,0,4, ;75
-	db 6,3,2,0,0,0,0,3, ;76
-	db 6,3,2,0,0,0,0,4, ;77
-	db 6,3,2,1,0,0,0,4, ;78
-	db 6,3,2,1,0,0,0,5, ;79
-	db 6,4,0,0,0,0,0,2, ;80
-	db 6,4,0,0,0,0,0,3, ;81
-	db 6,4,1,0,0,0,0,3, ;82
-	db 6,4,1,0,0,0,0,4, ;83
-	db 6,4,2,0,0,0,0,3, ;84
-	db 6,4,2,0,0,0,0,4, ;85
-	db 6,4,2,1,0,0,0,4, ;86
-	db 6,4,2,1,0,0,0,5, ;87
-	db 6,4,3,0,0,0,0,3, ;88
-	db 6,4,3,0,0,0,0,4, ;89
-	db 6,4,3,1,0,0,0,4, ;90
-	db 6,4,3,1,0,0,0,5, ;91
-	db 6,4,3,2,0,0,0,4, ;92
-	db 6,4,3,2,0,0,0,5, ;93
-	db 6,4,3,2,1,0,0,5, ;94
-	db 6,4,3,2,1,0,0,6, ;95
-	db 6,5,0,0,0,0,0,2, ;96
-	db 6,5,0,0,0,0,0,3, ;97
-	db 6,5,1,0,0,0,0,3, ;98
-	db 6,5,1,0,0,0,0,4, ;99
-	db 6,5,2,0,0,0,0,3, ;100
-	db 6,5,2,0,0,0,0,4, ;101
-	db 6,5,2,1,0,0,0,4, ;102
-	db 6,5,2,1,0,0,0,5, ;103
-	db 6,5,3,0,0,0,0,3, ;104
-	db 6,5,3,0,0,0,0,4, ;105
-	db 6,5,3,1,0,0,0,4, ;106
-	db 6,5,3,1,0,0,0,5, ;107
-	db 6,5,3,2,0,0,0,4, ;108
-	db 6,5,3,2,0,0,0,5, ;109
-	db 6,5,3,2,1,0,0,5, ;110
-	db 6,5,3,2,1,0,0,6, ;111
-	db 6,5,4,0,0,0,0,3, ;112
-	db 6,5,4,0,0,0,0,4, ;113
-	db 6,5,4,1,0,0,0,4, ;114
-	db 6,5,4,1,0,0,0,5, ;115
-	db 6,5,4,2,0,0,0,4, ;116
-	db 6,5,4,2,0,0,0,5, ;117
-	db 6,5,4,2,1,0,0,5, ;118
-	db 6,5,4,2,1,0,0,6, ;119
-	db 6,5,4,3,0,0,0,4, ;120
-	db 6,5,4,3,0,0,0,5, ;121
-	db 6,5,4,3,1,0,0,5, ;122
-	db 6,5,4,3,1,0,0,6, ;123
-	db 6,5,4,3,2,0,0,5, ;124
-	db 6,5,4,3,2,0,0,6, ;125
-	db 6,5,4,3,2,1,0,6, ;126
-	db 6,5,4,3,2,1,0,7, ;127
-	db 7,0,0,0,0,0,0,1, ;128
-	db 7,0,0,0,0,0,0,2, ;129
-	db 7,1,0,0,0,0,0,2, ;130
-	db 7,1,0,0,0,0,0,3, ;131
-	db 7,2,0,0,0,0,0,2, ;132
-	db 7,2,0,0,0,0,0,3, ;133
-	db 7,2,1,0,0,0,0,3, ;134
-	db 7,2,1,0,0,0,0,4, ;135
-	db 7,3,0,0,0,0,0,2, ;136
-	db 7,3,0,0,0,0,0,3, ;137
-	db 7,3,1,0,0,0,0,3, ;138
-	db 7,3,1,0,0,0,0,4, ;139
-	db 7,3,2,0,0,0,0,3, ;140
-	db 7,3,2,0,0,0,0,4, ;141
-	db 7,3,2,1,0,0,0,4, ;142
-	db 7,3,2,1,0,0,0,5, ;143
-	db 7,4,0,0,0,0,0,2, ;144
-	db 7,4,0,0,0,0,0,3, ;145
-	db 7,4,1,0,0,0,0,3, ;146
-	db 7,4,1,0,0,0,0,4, ;147
-	db 7,4,2,0,0,0,0,3, ;148
-	db 7,4,2,0,0,0,0,4, ;149
-	db 7,4,2,1,0,0,0,4, ;150
-	db 7,4,2,1,0,0,0,5, ;151
-	db 7,4,3,0,0,0,0,3, ;152
-	db 7,4,3,0,0,0,0,4, ;153
-	db 7,4,3,1,0,0,0,4, ;154
-	db 7,4,3,1,0,0,0,5, ;155
-	db 7,4,3,2,0,0,0,4, ;156
-	db 7,4,3,2,0,0,0,5, ;157
-	db 7,4,3,2,1,0,0,5, ;158
-	db 7,4,3,2,1,0,0,6, ;159
-	db 7,5,0,0,0,0,0,2, ;160
-	db 7,5,0,0,0,0,0,3, ;161
-	db 7,5,1,0,0,0,0,3, ;162
-	db 7,5,1,0,0,0,0,4, ;163
-	db 7,5,2,0,0,0,0,3, ;164
-	db 7,5,2,0,0,0,0,4, ;165
-	db 7,5,2,1,0,0,0,4, ;166
-	db 7,5,2,1,0,0,0,5, ;167
-	db 7,5,3,0,0,0,0,3, ;168
-	db 7,5,3,0,0,0,0,4, ;169
-	db 7,5,3,1,0,0,0,4, ;170
-	db 7,5,3,1,0,0,0,5, ;171
-	db 7,5,3,2,0,0,0,4, ;172
-	db 7,5,3,2,0,0,0,5, ;173
-	db 7,5,3,2,1,0,0,5, ;174
-	db 7,5,3,2,1,0,0,6, ;175
-	db 7,5,4,0,0,0,0,3, ;176
-	db 7,5,4,0,0,0,0,4, ;177
-	db 7,5,4,1,0,0,0,4, ;178
-	db 7,5,4,1,0,0,0,5, ;179
-	db 7,5,4,2,0,0,0,4, ;180
-	db 7,5,4,2,0,0,0,5, ;181
-	db 7,5,4,2,1,0,0,5, ;182
-	db 7,5,4,2,1,0,0,6, ;183
-	db 7,5,4,3,0,0,0,4, ;184
-	db 7,5,4,3,0,0,0,5, ;185
-	db 7,5,4,3,1,0,0,5, ;186
-	db 7,5,4,3,1,0,0,6, ;187
-	db 7,5,4,3,2,0,0,5, ;188
-	db 7,5,4,3,2,0,0,6, ;189
-	db 7,5,4,3,2,1,0,6, ;190
-	db 7,5,4,3,2,1,0,7, ;191
-	db 7,6,0,0,0,0,0,2, ;192
-	db 7,6,0,0,0,0,0,3, ;193
-	db 7,6,1,0,0,0,0,3, ;194
-	db 7,6,1,0,0,0,0,4, ;195
-	db 7,6,2,0,0,0,0,3, ;196
-	db 7,6,2,0,0,0,0,4, ;197
-	db 7,6,2,1,0,0,0,4, ;198
-	db 7,6,2,1,0,0,0,5, ;199
-	db 7,6,3,0,0,0,0,3, ;200
-	db 7,6,3,0,0,0,0,4, ;201
-	db 7,6,3,1,0,0,0,4, ;202
-	db 7,6,3,1,0,0,0,5, ;203
-	db 7,6,3,2,0,0,0,4, ;204
-	db 7,6,3,2,0,0,0,5, ;205
-	db 7,6,3,2,1,0,0,5, ;206
-	db 7,6,3,2,1,0,0,6, ;207
-	db 7,6,4,0,0,0,0,3, ;208
-	db 7,6,4,0,0,0,0,4, ;209
-	db 7,6,4,1,0,0,0,4, ;210
-	db 7,6,4,1,0,0,0,5, ;211
-	db 7,6,4,2,0,0,0,4, ;212
-	db 7,6,4,2,0,0,0,5, ;213
-	db 7,6,4,2,1,0,0,5, ;214
-	db 7,6,4,2,1,0,0,6, ;215
-	db 7,6,4,3,0,0,0,4, ;216
-	db 7,6,4,3,0,0,0,5, ;217
-	db 7,6,4,3,1,0,0,5, ;218
-	db 7,6,4,3,1,0,0,6, ;219
-	db 7,6,4,3,2,0,0,5, ;220
-	db 7,6,4,3,2,0,0,6, ;221
-	db 7,6,4,3,2,1,0,6, ;222
-	db 7,6,4,3,2,1,0,7, ;223
-	db 7,6,5,0,0,0,0,3, ;224
-	db 7,6,5,0,0,0,0,4, ;225
-	db 7,6,5,1,0,0,0,4, ;226
-	db 7,6,5,1,0,0,0,5, ;227
-	db 7,6,5,2,0,0,0,4, ;228
-	db 7,6,5,2,0,0,0,5, ;229
-	db 7,6,5,2,1,0,0,5, ;230
-	db 7,6,5,2,1,0,0,6, ;231
-	db 7,6,5,3,0,0,0,4, ;232
-	db 7,6,5,3,0,0,0,5, ;233
-	db 7,6,5,3,1,0,0,5, ;234
-	db 7,6,5,3,1,0,0,6, ;235
-	db 7,6,5,3,2,0,0,5, ;236
-	db 7,6,5,3,2,0,0,6, ;237
-	db 7,6,5,3,2,1,0,6, ;238
-	db 7,6,5,3,2,1,0,7, ;239
-	db 7,6,5,4,0,0,0,4, ;240
-	db 7,6,5,4,0,0,0,5, ;241
-	db 7,6,5,4,1,0,0,5, ;242
-	db 7,6,5,4,1,0,0,6, ;243
-	db 7,6,5,4,2,0,0,5, ;244
-	db 7,6,5,4,2,0,0,6, ;245
-	db 7,6,5,4,2,1,0,6, ;246
-	db 7,6,5,4,2,1,0,7, ;247
-	db 7,6,5,4,3,0,0,5, ;248
-	db 7,6,5,4,3,0,0,6, ;249
-	db 7,6,5,4,3,1,0,6, ;250
-	db 7,6,5,4,3,1,0,7, ;251
-	db 7,6,5,4,3,2,0,6, ;252
-	db 7,6,5,4,3,2,0,7, ;253
-	db 7,6,5,4,3,2,1,7, ;254
-	db 7,6,5,4,3,2,1,8, ;255
+    db 0,0,0,0,0,0,0,0, ;0
+    db 0,0,0,0,0,0,0,1, ;1
+    db 1,0,0,0,0,0,0,1, ;2
+    db 1,0,0,0,0,0,0,2, ;3
+    db 2,0,0,0,0,0,0,1, ;4
+    db 2,0,0,0,0,0,0,2, ;5
+    db 2,1,0,0,0,0,0,2, ;6
+    db 2,1,0,0,0,0,0,3, ;7
+    db 3,0,0,0,0,0,0,1, ;8
+    db 3,0,0,0,0,0,0,2, ;9
+    db 3,1,0,0,0,0,0,2, ;10
+    db 3,1,0,0,0,0,0,3, ;11
+    db 3,2,0,0,0,0,0,2, ;12
+    db 3,2,0,0,0,0,0,3, ;13
+    db 3,2,1,0,0,0,0,3, ;14
+    db 3,2,1,0,0,0,0,4, ;15
+    db 4,0,0,0,0,0,0,1, ;16
+    db 4,0,0,0,0,0,0,2, ;17
+    db 4,1,0,0,0,0,0,2, ;18
+    db 4,1,0,0,0,0,0,3, ;19
+    db 4,2,0,0,0,0,0,2, ;20
+    db 4,2,0,0,0,0,0,3, ;21
+    db 4,2,1,0,0,0,0,3, ;22
+    db 4,2,1,0,0,0,0,4, ;23
+    db 4,3,0,0,0,0,0,2, ;24
+    db 4,3,0,0,0,0,0,3, ;25
+    db 4,3,1,0,0,0,0,3, ;26
+    db 4,3,1,0,0,0,0,4, ;27
+    db 4,3,2,0,0,0,0,3, ;28
+    db 4,3,2,0,0,0,0,4, ;29
+    db 4,3,2,1,0,0,0,4, ;30
+    db 4,3,2,1,0,0,0,5, ;31
+    db 5,0,0,0,0,0,0,1, ;32
+    db 5,0,0,0,0,0,0,2, ;33
+    db 5,1,0,0,0,0,0,2, ;34
+    db 5,1,0,0,0,0,0,3, ;35
+    db 5,2,0,0,0,0,0,2, ;36
+    db 5,2,0,0,0,0,0,3, ;37
+    db 5,2,1,0,0,0,0,3, ;38
+    db 5,2,1,0,0,0,0,4, ;39
+    db 5,3,0,0,0,0,0,2, ;40
+    db 5,3,0,0,0,0,0,3, ;41
+    db 5,3,1,0,0,0,0,3, ;42
+    db 5,3,1,0,0,0,0,4, ;43
+    db 5,3,2,0,0,0,0,3, ;44
+    db 5,3,2,0,0,0,0,4, ;45
+    db 5,3,2,1,0,0,0,4, ;46
+    db 5,3,2,1,0,0,0,5, ;47
+    db 5,4,0,0,0,0,0,2, ;48
+    db 5,4,0,0,0,0,0,3, ;49
+    db 5,4,1,0,0,0,0,3, ;50
+    db 5,4,1,0,0,0,0,4, ;51
+    db 5,4,2,0,0,0,0,3, ;52
+    db 5,4,2,0,0,0,0,4, ;53
+    db 5,4,2,1,0,0,0,4, ;54
+    db 5,4,2,1,0,0,0,5, ;55
+    db 5,4,3,0,0,0,0,3, ;56
+    db 5,4,3,0,0,0,0,4, ;57
+    db 5,4,3,1,0,0,0,4, ;58
+    db 5,4,3,1,0,0,0,5, ;59
+    db 5,4,3,2,0,0,0,4, ;60
+    db 5,4,3,2,0,0,0,5, ;61
+    db 5,4,3,2,1,0,0,5, ;62
+    db 5,4,3,2,1,0,0,6, ;63
+    db 6,0,0,0,0,0,0,1, ;64
+    db 6,0,0,0,0,0,0,2, ;65
+    db 6,1,0,0,0,0,0,2, ;66
+    db 6,1,0,0,0,0,0,3, ;67
+    db 6,2,0,0,0,0,0,2, ;68
+    db 6,2,0,0,0,0,0,3, ;69
+    db 6,2,1,0,0,0,0,3, ;70
+    db 6,2,1,0,0,0,0,4, ;71
+    db 6,3,0,0,0,0,0,2, ;72
+    db 6,3,0,0,0,0,0,3, ;73
+    db 6,3,1,0,0,0,0,3, ;74
+    db 6,3,1,0,0,0,0,4, ;75
+    db 6,3,2,0,0,0,0,3, ;76
+    db 6,3,2,0,0,0,0,4, ;77
+    db 6,3,2,1,0,0,0,4, ;78
+    db 6,3,2,1,0,0,0,5, ;79
+    db 6,4,0,0,0,0,0,2, ;80
+    db 6,4,0,0,0,0,0,3, ;81
+    db 6,4,1,0,0,0,0,3, ;82
+    db 6,4,1,0,0,0,0,4, ;83
+    db 6,4,2,0,0,0,0,3, ;84
+    db 6,4,2,0,0,0,0,4, ;85
+    db 6,4,2,1,0,0,0,4, ;86
+    db 6,4,2,1,0,0,0,5, ;87
+    db 6,4,3,0,0,0,0,3, ;88
+    db 6,4,3,0,0,0,0,4, ;89
+    db 6,4,3,1,0,0,0,4, ;90
+    db 6,4,3,1,0,0,0,5, ;91
+    db 6,4,3,2,0,0,0,4, ;92
+    db 6,4,3,2,0,0,0,5, ;93
+    db 6,4,3,2,1,0,0,5, ;94
+    db 6,4,3,2,1,0,0,6, ;95
+    db 6,5,0,0,0,0,0,2, ;96
+    db 6,5,0,0,0,0,0,3, ;97
+    db 6,5,1,0,0,0,0,3, ;98
+    db 6,5,1,0,0,0,0,4, ;99
+    db 6,5,2,0,0,0,0,3, ;100
+    db 6,5,2,0,0,0,0,4, ;101
+    db 6,5,2,1,0,0,0,4, ;102
+    db 6,5,2,1,0,0,0,5, ;103
+    db 6,5,3,0,0,0,0,3, ;104
+    db 6,5,3,0,0,0,0,4, ;105
+    db 6,5,3,1,0,0,0,4, ;106
+    db 6,5,3,1,0,0,0,5, ;107
+    db 6,5,3,2,0,0,0,4, ;108
+    db 6,5,3,2,0,0,0,5, ;109
+    db 6,5,3,2,1,0,0,5, ;110
+    db 6,5,3,2,1,0,0,6, ;111
+    db 6,5,4,0,0,0,0,3, ;112
+    db 6,5,4,0,0,0,0,4, ;113
+    db 6,5,4,1,0,0,0,4, ;114
+    db 6,5,4,1,0,0,0,5, ;115
+    db 6,5,4,2,0,0,0,4, ;116
+    db 6,5,4,2,0,0,0,5, ;117
+    db 6,5,4,2,1,0,0,5, ;118
+    db 6,5,4,2,1,0,0,6, ;119
+    db 6,5,4,3,0,0,0,4, ;120
+    db 6,5,4,3,0,0,0,5, ;121
+    db 6,5,4,3,1,0,0,5, ;122
+    db 6,5,4,3,1,0,0,6, ;123
+    db 6,5,4,3,2,0,0,5, ;124
+    db 6,5,4,3,2,0,0,6, ;125
+    db 6,5,4,3,2,1,0,6, ;126
+    db 6,5,4,3,2,1,0,7, ;127
+    db 7,0,0,0,0,0,0,1, ;128
+    db 7,0,0,0,0,0,0,2, ;129
+    db 7,1,0,0,0,0,0,2, ;130
+    db 7,1,0,0,0,0,0,3, ;131
+    db 7,2,0,0,0,0,0,2, ;132
+    db 7,2,0,0,0,0,0,3, ;133
+    db 7,2,1,0,0,0,0,3, ;134
+    db 7,2,1,0,0,0,0,4, ;135
+    db 7,3,0,0,0,0,0,2, ;136
+    db 7,3,0,0,0,0,0,3, ;137
+    db 7,3,1,0,0,0,0,3, ;138
+    db 7,3,1,0,0,0,0,4, ;139
+    db 7,3,2,0,0,0,0,3, ;140
+    db 7,3,2,0,0,0,0,4, ;141
+    db 7,3,2,1,0,0,0,4, ;142
+    db 7,3,2,1,0,0,0,5, ;143
+    db 7,4,0,0,0,0,0,2, ;144
+    db 7,4,0,0,0,0,0,3, ;145
+    db 7,4,1,0,0,0,0,3, ;146
+    db 7,4,1,0,0,0,0,4, ;147
+    db 7,4,2,0,0,0,0,3, ;148
+    db 7,4,2,0,0,0,0,4, ;149
+    db 7,4,2,1,0,0,0,4, ;150
+    db 7,4,2,1,0,0,0,5, ;151
+    db 7,4,3,0,0,0,0,3, ;152
+    db 7,4,3,0,0,0,0,4, ;153
+    db 7,4,3,1,0,0,0,4, ;154
+    db 7,4,3,1,0,0,0,5, ;155
+    db 7,4,3,2,0,0,0,4, ;156
+    db 7,4,3,2,0,0,0,5, ;157
+    db 7,4,3,2,1,0,0,5, ;158
+    db 7,4,3,2,1,0,0,6, ;159
+    db 7,5,0,0,0,0,0,2, ;160
+    db 7,5,0,0,0,0,0,3, ;161
+    db 7,5,1,0,0,0,0,3, ;162
+    db 7,5,1,0,0,0,0,4, ;163
+    db 7,5,2,0,0,0,0,3, ;164
+    db 7,5,2,0,0,0,0,4, ;165
+    db 7,5,2,1,0,0,0,4, ;166
+    db 7,5,2,1,0,0,0,5, ;167
+    db 7,5,3,0,0,0,0,3, ;168
+    db 7,5,3,0,0,0,0,4, ;169
+    db 7,5,3,1,0,0,0,4, ;170
+    db 7,5,3,1,0,0,0,5, ;171
+    db 7,5,3,2,0,0,0,4, ;172
+    db 7,5,3,2,0,0,0,5, ;173
+    db 7,5,3,2,1,0,0,5, ;174
+    db 7,5,3,2,1,0,0,6, ;175
+    db 7,5,4,0,0,0,0,3, ;176
+    db 7,5,4,0,0,0,0,4, ;177
+    db 7,5,4,1,0,0,0,4, ;178
+    db 7,5,4,1,0,0,0,5, ;179
+    db 7,5,4,2,0,0,0,4, ;180
+    db 7,5,4,2,0,0,0,5, ;181
+    db 7,5,4,2,1,0,0,5, ;182
+    db 7,5,4,2,1,0,0,6, ;183
+    db 7,5,4,3,0,0,0,4, ;184
+    db 7,5,4,3,0,0,0,5, ;185
+    db 7,5,4,3,1,0,0,5, ;186
+    db 7,5,4,3,1,0,0,6, ;187
+    db 7,5,4,3,2,0,0,5, ;188
+    db 7,5,4,3,2,0,0,6, ;189
+    db 7,5,4,3,2,1,0,6, ;190
+    db 7,5,4,3,2,1,0,7, ;191
+    db 7,6,0,0,0,0,0,2, ;192
+    db 7,6,0,0,0,0,0,3, ;193
+    db 7,6,1,0,0,0,0,3, ;194
+    db 7,6,1,0,0,0,0,4, ;195
+    db 7,6,2,0,0,0,0,3, ;196
+    db 7,6,2,0,0,0,0,4, ;197
+    db 7,6,2,1,0,0,0,4, ;198
+    db 7,6,2,1,0,0,0,5, ;199
+    db 7,6,3,0,0,0,0,3, ;200
+    db 7,6,3,0,0,0,0,4, ;201
+    db 7,6,3,1,0,0,0,4, ;202
+    db 7,6,3,1,0,0,0,5, ;203
+    db 7,6,3,2,0,0,0,4, ;204
+    db 7,6,3,2,0,0,0,5, ;205
+    db 7,6,3,2,1,0,0,5, ;206
+    db 7,6,3,2,1,0,0,6, ;207
+    db 7,6,4,0,0,0,0,3, ;208
+    db 7,6,4,0,0,0,0,4, ;209
+    db 7,6,4,1,0,0,0,4, ;210
+    db 7,6,4,1,0,0,0,5, ;211
+    db 7,6,4,2,0,0,0,4, ;212
+    db 7,6,4,2,0,0,0,5, ;213
+    db 7,6,4,2,1,0,0,5, ;214
+    db 7,6,4,2,1,0,0,6, ;215
+    db 7,6,4,3,0,0,0,4, ;216
+    db 7,6,4,3,0,0,0,5, ;217
+    db 7,6,4,3,1,0,0,5, ;218
+    db 7,6,4,3,1,0,0,6, ;219
+    db 7,6,4,3,2,0,0,5, ;220
+    db 7,6,4,3,2,0,0,6, ;221
+    db 7,6,4,3,2,1,0,6, ;222
+    db 7,6,4,3,2,1,0,7, ;223
+    db 7,6,5,0,0,0,0,3, ;224
+    db 7,6,5,0,0,0,0,4, ;225
+    db 7,6,5,1,0,0,0,4, ;226
+    db 7,6,5,1,0,0,0,5, ;227
+    db 7,6,5,2,0,0,0,4, ;228
+    db 7,6,5,2,0,0,0,5, ;229
+    db 7,6,5,2,1,0,0,5, ;230
+    db 7,6,5,2,1,0,0,6, ;231
+    db 7,6,5,3,0,0,0,4, ;232
+    db 7,6,5,3,0,0,0,5, ;233
+    db 7,6,5,3,1,0,0,5, ;234
+    db 7,6,5,3,1,0,0,6, ;235
+    db 7,6,5,3,2,0,0,5, ;236
+    db 7,6,5,3,2,0,0,6, ;237
+    db 7,6,5,3,2,1,0,6, ;238
+    db 7,6,5,3,2,1,0,7, ;239
+    db 7,6,5,4,0,0,0,4, ;240
+    db 7,6,5,4,0,0,0,5, ;241
+    db 7,6,5,4,1,0,0,5, ;242
+    db 7,6,5,4,1,0,0,6, ;243
+    db 7,6,5,4,2,0,0,5, ;244
+    db 7,6,5,4,2,0,0,6, ;245
+    db 7,6,5,4,2,1,0,6, ;246
+    db 7,6,5,4,2,1,0,7, ;247
+    db 7,6,5,4,3,0,0,5, ;248
+    db 7,6,5,4,3,0,0,6, ;249
+    db 7,6,5,4,3,1,0,6, ;250
+    db 7,6,5,4,3,1,0,7, ;251
+    db 7,6,5,4,3,2,0,6, ;252
+    db 7,6,5,4,3,2,0,7, ;253
+    db 7,6,5,4,3,2,1,7, ;254
+    db 7,6,5,4,3,2,1,8, ;255
 
 ;***********************************************************************
 ; Code
@@ -323,43 +323,43 @@
 ;int32_t CavlcParamCal_sse2(int16_t*coffLevel, uint8_t* run, int16_t *Level, int32_t* total_coeffs , int32_t endIdx);
 ;***********************************************************************
 WELS_EXTERN CavlcParamCal_sse2
-	push ebx
-	push edi
-	push esi
+    push ebx
+    push edi
+    push esi
 
-	mov			eax,	[esp+16]	;coffLevel
-	mov			edi,	[esp+24]	;Level
-	mov			ebx,	[esp+32]	;endIdx
-	cmp			ebx,	3
-	jne			.Level16
-	pxor		xmm1,	xmm1
-	movq		xmm0,	[eax]	; removed QWORD
-	jmp			.Cal_begin
+    mov         eax,    [esp+16]    ;coffLevel
+    mov         edi,    [esp+24]    ;Level
+    mov         ebx,    [esp+32]    ;endIdx
+    cmp         ebx,    3
+    jne         .Level16
+    pxor        xmm1,   xmm1
+    movq        xmm0,   [eax]   ; removed QWORD
+    jmp         .Cal_begin
 .Level16:
-	movdqa		xmm0,	[eax]
-	movdqa		xmm1,	[eax+16]
+    movdqa      xmm0,   [eax]
+    movdqa      xmm1,   [eax+16]
 .Cal_begin:
-    movdqa		xmm2,	xmm0
-	packsswb	xmm0,	xmm1
-	movdqa		xmm4,	xmm0
-	pxor		xmm3,	xmm3
-	pcmpgtb		xmm0,	xmm3
-	pcmpgtb		xmm3,	xmm4
-	por			xmm0,	xmm3
-	pmovmskb	edx,	xmm0
-	cmp			edx,	0
-	je near   .return
-	movdqa		xmm6,	[sse2_b_1]
-	pcmpeqw		xmm7,	xmm7	;generate -1
-    mov			ebx,	0xff
-    ;pinsrw		xmm6,	ebx,	3
+    movdqa      xmm2,   xmm0
+    packsswb    xmm0,   xmm1
+    movdqa      xmm4,   xmm0
+    pxor        xmm3,   xmm3
+    pcmpgtb     xmm0,   xmm3
+    pcmpgtb     xmm3,   xmm4
+    por         xmm0,   xmm3
+    pmovmskb    edx,    xmm0
+    cmp         edx,    0
+    je near   .return
+    movdqa      xmm6,   [sse2_b_1]
+    pcmpeqw     xmm7,   xmm7    ;generate -1
+    mov         ebx,    0xff
+    ;pinsrw     xmm6,   ebx,    3
 
     mov       bl,   dh
 
-	lea       ebx,  [byte_1pos_table+8*ebx]
-	movq      xmm0, [ebx]
-	pextrw    ecx,  xmm0, 3
-	shr       ecx,  8
+    lea       ebx,  [byte_1pos_table+8*ebx]
+    movq      xmm0, [ebx]
+    pextrw    ecx,  xmm0, 3
+    shr       ecx,  8
     mov       dh,   cl
 
 .loopHighFind0:
@@ -367,19 +367,19 @@
     je        .loopHighFind0End
     ;mov       esi, [ebx]
     ;and       esi, 0xff
-    movzx	  esi, byte [ebx]
+    movzx     esi, byte [ebx]
     add       esi, 8
     mov       esi, [eax+2*esi]
     mov       [edi], si
     add       edi,   2
     ;add       ebx,   1
-    inc		  ebx
+    inc       ebx
     dec       ecx
-	jmp       .loopHighFind0
+    jmp       .loopHighFind0
 .loopHighFind0End:
     mov       cl,   dh
     cmp       cl,   8
-	pand      xmm0, xmm6
+    pand      xmm0, xmm6
     jne       .LowByteFind0
     sub       edi,   2
     mov       esi,   [eax+16]
@@ -387,8 +387,8 @@
     add       edi,   2
 .LowByteFind0:
     and       edx,  0xff
-	lea       ebx,  [byte_1pos_table+8*edx]
-	movq      xmm1, [ebx]
+    lea       ebx,  [byte_1pos_table+8*edx]
+    movq      xmm1, [ebx]
     pextrw    esi,  xmm1, 3
     or        esi,  0xff
     or        ecx,  0xff00
@@ -398,16 +398,16 @@
 .loopLowFind0:
     cmp       esi, 0
     je        .loopLowFind0End
-	;mov       edx, [ebx]
-	;and       edx, 0xff
-	movzx	  edx,	byte [ebx]
-	mov       edx, [eax+2*edx]
-	mov       [edi], dx
-	add       edi,   2
-	;add       ebx,   1
-	inc		  ebx
+    ;mov       edx, [ebx]
+    ;and       edx, 0xff
+    movzx     edx,  byte [ebx]
+    mov       edx, [eax+2*edx]
+    mov       [edi], dx
+    add       edi,   2
+    ;add       ebx,   1
+    inc       ebx
     dec       esi
-	jmp       .loopLowFind0
+    jmp       .loopLowFind0
 .loopLowFind0End:
     cmp       ch,  8
     jne       .getLevelEnd
@@ -415,12 +415,12 @@
     mov       edx, [eax]
     mov       [edi], dx
 .getLevelEnd:
-	mov      edx, [esp+28]	;total_coeffs
+    mov      edx, [esp+28]  ;total_coeffs
     ;mov      ebx,   ecx
     ;and      ebx,   0xff
-    movzx	 ebx,	byte cl
+    movzx    ebx,   byte cl
     add      cl,    ch
-	mov      [edx], cl
+    mov      [edx], cl
 ;getRun
     movq     xmm5, [sse2_b8]
     paddb    xmm0, xmm5
@@ -430,7 +430,7 @@
     sub      eax,  ebx
     shl      eax,  3
     shl      ebx,  3
-	pinsrw   xmm2, ebx, 0
+    pinsrw   xmm2, ebx, 0
     pinsrw   xmm3, eax, 0
     psllq    xmm0, xmm3
     psrlq    xmm0, xmm3
@@ -441,19 +441,19 @@
     por      xmm0,  xmm1
 
     pextrw   eax,   xmm0, 0
-    and		 eax,   0xff
+    and      eax,   0xff
     inc      eax
     sub      al,    cl
-	movdqa   xmm1,  xmm0
-	paddb    xmm1,  xmm7
-	psrldq   xmm0,  1
-	psubb    xmm1,  xmm0
+    movdqa   xmm1,  xmm0
+    paddb    xmm1,  xmm7
+    psrldq   xmm0,  1
+    psubb    xmm1,  xmm0
     mov      ecx,   [esp+20] ;run
-	movdqa   [ecx], xmm1
+    movdqa   [ecx], xmm1
 ;getRunEnd
 .return:
-	pop esi
-	pop edi
-	pop ebx
-	ret
+    pop esi
+    pop edi
+    pop ebx
+    ret
 %endif
--- a/codec/encoder/core/x86/dct.asm
+++ b/codec/encoder/core/x86/dct.asm
@@ -50,17 +50,17 @@
 
 align 16
 SSE2_DeQuant8 dw  10, 13, 10, 13, 13, 16, 13, 16,
-			dw	10, 13, 10, 13, 13, 16, 13, 16,
+            dw  10, 13, 10, 13, 13, 16, 13, 16,
             dw  11, 14, 11, 14, 14, 18, 14, 18,
-			dw  11, 14, 11, 14, 14, 18, 14, 18,
-			dw  13, 16, 13, 16, 16, 20, 16, 20,
-			dw  13, 16, 13, 16, 16, 20, 16, 20,
+            dw  11, 14, 11, 14, 14, 18, 14, 18,
+            dw  13, 16, 13, 16, 16, 20, 16, 20,
+            dw  13, 16, 13, 16, 16, 20, 16, 20,
             dw  14, 18, 14, 18, 18, 23, 18, 23,
-			dw  14, 18, 14, 18, 18, 23, 18, 23,
-			dw  16, 20, 16, 20, 20, 25, 20, 25,
-			dw  16, 20, 16, 20, 20, 25, 20, 25,
+            dw  14, 18, 14, 18, 18, 23, 18, 23,
+            dw  16, 20, 16, 20, 20, 25, 20, 25,
+            dw  16, 20, 16, 20, 20, 25, 20, 25,
             dw  18, 23, 18, 23, 23, 29, 23, 29,
-			dw  18, 23, 18, 23, 23, 29, 23, 29
+            dw  18, 23, 18, 23, 23, 29, 23, 29
 
 
 ;***********************************************************************
@@ -68,27 +68,27 @@
 ;***********************************************************************
 
 %macro MMX_LoadDiff4P 5
-	movd        %1, [%3]
-	movd        %2, [%4]
-	punpcklbw   %1, %5
-	punpcklbw   %2, %5
-	psubw       %1, %2
+    movd        %1, [%3]
+    movd        %2, [%4]
+    punpcklbw   %1, %5
+    punpcklbw   %2, %5
+    psubw       %1, %2
 %endmacro
 
 %macro MMX_LoadDiff4x4P 10 ;d0, d1, d2, d3, pix1address, pix1stride, pix2address, pix2stride, tmp(mm), 0(mm)
-	MMX_LoadDiff4P %1, %9, %5,    %7,    %10
-	MMX_LoadDiff4P %2, %9, %5+%6, %7+%8, %10
-	lea  %5, [%5+2*%6]
-	lea  %7, [%7+2*%8]
-	MMX_LoadDiff4P %3, %9, %5,    %7,    %10
-	MMX_LoadDiff4P %4, %9, %5+%6, %7+%8, %10
+    MMX_LoadDiff4P %1, %9, %5,    %7,    %10
+    MMX_LoadDiff4P %2, %9, %5+%6, %7+%8, %10
+    lea  %5, [%5+2*%6]
+    lea  %7, [%7+2*%8]
+    MMX_LoadDiff4P %3, %9, %5,    %7,    %10
+    MMX_LoadDiff4P %4, %9, %5+%6, %7+%8, %10
 %endmacro
 
 %macro MMX_SumSubMul2 3
-	movq    %3, %1
-	psllw   %1, $01
-	paddw   %1, %2
-	psllw   %2, $01
+    movq    %3, %1
+    psllw   %1, $01
+    paddw   %1, %2
+    psllw   %2, $01
     psubw   %3, %2
 %endmacro
 
@@ -101,15 +101,15 @@
 %endmacro
 
 %macro MMX_SumSub 3
-	movq    %3, %2
+    movq    %3, %2
     psubw   %2, %1
     paddw   %1, %3
 %endmacro
 
 %macro MMX_DCT 6
-    MMX_SumSub		%4, %1, %6
-    MMX_SumSub		%3, %2, %6
-    MMX_SumSub		%3, %4, %6
+    MMX_SumSub      %4, %1, %6
+    MMX_SumSub      %3, %2, %6
+    MMX_SumSub      %3, %4, %6
     MMX_SumSubMul2  %1, %2, %5
 %endmacro
 
@@ -116,8 +116,8 @@
 %macro MMX_IDCT 6
     MMX_SumSub      %4, %5, %6
     MMX_SumSubDiv2  %3, %2, %1
-    MMX_SumSub		%1, %4, %6
-	MMX_SumSub		%3, %5, %6
+    MMX_SumSub      %1, %4, %6
+    MMX_SumSub      %3, %5, %6
 %endmacro
 
 %macro MMX_StoreDiff4P 6
@@ -142,11 +142,11 @@
 
     MMX_LoadDiff4x4P mm1, mm2, mm3, mm4, r1, r2, r3, r4, mm0, mm7
 
-    MMX_DCT			mm1, mm2, mm3 ,mm4, mm5, mm6
-    MMX_Trans4x4W	mm3, mm1, mm4, mm5, mm2
+    MMX_DCT         mm1, mm2, mm3 ,mm4, mm5, mm6
+    MMX_Trans4x4W   mm3, mm1, mm4, mm5, mm2
 
-    MMX_DCT			mm3, mm5, mm2 ,mm4, mm1, mm6
-    MMX_Trans4x4W	mm2, mm3, mm4, mm1, mm5
+    MMX_DCT         mm3, mm5, mm2 ,mm4, mm1, mm6
+    MMX_Trans4x4W   mm2, mm3, mm4, mm1, mm5
 
     movq    [r0+ 0],   mm2
     movq    [r0+ 8],   mm1
@@ -170,22 +170,22 @@
     movq    mm2, [r4+16]
     movq    mm3, [r4+24]
 
-	MMX_Trans4x4W		mm0, mm1, mm2, mm3, mm4
-	MMX_IDCT			mm1, mm2, mm3, mm4, mm0, mm6
-    MMX_Trans4x4W		mm1, mm3, mm0, mm4, mm2
-	MMX_IDCT			mm3, mm0, mm4, mm2, mm1, mm6
+    MMX_Trans4x4W       mm0, mm1, mm2, mm3, mm4
+    MMX_IDCT            mm1, mm2, mm3, mm4, mm0, mm6
+    MMX_Trans4x4W       mm1, mm3, mm0, mm4, mm2
+    MMX_IDCT            mm3, mm0, mm4, mm2, mm1, mm6
 
-    WELS_Zero			mm7
-    WELS_DW32			mm6
+    WELS_Zero           mm7
+    WELS_DW32           mm6
 
-    MMX_StoreDiff4P		mm3, mm0, mm6, mm7, [r0], [r2]
-    MMX_StoreDiff4P		mm4, mm0, mm6, mm7, [r0+r1], [r2+r3]
+    MMX_StoreDiff4P     mm3, mm0, mm6, mm7, [r0], [r2]
+    MMX_StoreDiff4P     mm4, mm0, mm6, mm7, [r0+r1], [r2+r3]
     lea     r0, [r0+2*r1]
     lea     r2, [r2+2*r3]
-    MMX_StoreDiff4P		mm1, mm0, mm6, mm7, [r0], [r2]
-    MMX_StoreDiff4P		mm2, mm0, mm6, mm7, [r0+r1], [r2+r3]
+    MMX_StoreDiff4P     mm1, mm0, mm6, mm7, [r0], [r2]
+    MMX_StoreDiff4P     mm2, mm0, mm6, mm7, [r0+r1], [r2+r3]
 
-	WELSEMMS
+    WELSEMMS
     LOAD_5_PARA_POP
     ret
 
@@ -194,21 +194,21 @@
 ; SSE2 functions
 ;***********************************************************************
 %macro SSE2_Store4x8p 6
-	SSE2_XSawp qdq, %2, %3, %6
-	SSE2_XSawp qdq, %4, %5, %3
-	MOVDQ    [%1+0x00], %2
-	MOVDQ    [%1+0x10], %4
-	MOVDQ    [%1+0x20], %6
-	MOVDQ    [%1+0x30], %3
+    SSE2_XSawp qdq, %2, %3, %6
+    SSE2_XSawp qdq, %4, %5, %3
+    MOVDQ    [%1+0x00], %2
+    MOVDQ    [%1+0x10], %4
+    MOVDQ    [%1+0x20], %6
+    MOVDQ    [%1+0x30], %3
 %endmacro
 
 %macro SSE2_Load4x8p 6
-	MOVDQ    %2,	[%1+0x00]
-	MOVDQ    %4,	[%1+0x10]
-	MOVDQ    %6,	[%1+0x20]
-	MOVDQ    %3,	[%1+0x30]
-	SSE2_XSawp qdq, %4, %3, %5
-	SSE2_XSawp qdq, %2, %6, %3
+    MOVDQ    %2,    [%1+0x00]
+    MOVDQ    %4,    [%1+0x10]
+    MOVDQ    %6,    [%1+0x20]
+    MOVDQ    %3,    [%1+0x30]
+    SSE2_XSawp qdq, %4, %3, %5
+    SSE2_XSawp qdq, %2, %6, %3
 %endmacro
 
 %macro SSE2_SumSubMul2 3
@@ -231,57 +231,57 @@
 %macro SSE2_StoreDiff8p 6
     paddw       %1, %3
     psraw       %1, $06
-    movq		%2, %6
+    movq        %2, %6
     punpcklbw   %2, %4
     paddsw      %2, %1
     packuswb    %2, %2
-    movq	    %5, %2
+    movq        %5, %2
 %endmacro
 
 %macro SSE2_StoreDiff8p 5
-    movq		%2, %5
+    movq        %2, %5
     punpcklbw   %2, %3
     paddsw      %2, %1
     packuswb    %2, %2
-    movq	    %4, %2
+    movq        %4, %2
 %endmacro
 
-%macro SSE2_Load8DC	6
-	movdqa		%1,		%6		; %1 = dc0 dc1
-	paddw       %1,		%5
-    psraw       %1,		$06		; (dc + 32) >> 6
+%macro SSE2_Load8DC 6
+    movdqa      %1,     %6      ; %1 = dc0 dc1
+    paddw       %1,     %5
+    psraw       %1,     $06     ; (dc + 32) >> 6
 
-    movdqa		%2,		%1
-    psrldq		%2,		4
- 	punpcklwd	%2,		%2
-	punpckldq	%2,		%2		; %2 = dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3
+    movdqa      %2,     %1
+    psrldq      %2,     4
+    punpcklwd   %2,     %2
+    punpckldq   %2,     %2      ; %2 = dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3
 
-    movdqa		%3,		%1
-    psrldq		%3,		8
- 	punpcklwd	%3,		%3
-	punpckldq	%3,		%3		; %3 = dc4 dc4 dc4 dc4 dc5 dc5 dc5 dc5
+    movdqa      %3,     %1
+    psrldq      %3,     8
+    punpcklwd   %3,     %3
+    punpckldq   %3,     %3      ; %3 = dc4 dc4 dc4 dc4 dc5 dc5 dc5 dc5
 
-	movdqa		%4,		%1
-    psrldq		%4,		12
- 	punpcklwd	%4,		%4
-	punpckldq	%4,		%4		; %4 = dc6 dc6 dc6 dc6 dc7 dc7 dc7 dc7
+    movdqa      %4,     %1
+    psrldq      %4,     12
+    punpcklwd   %4,     %4
+    punpckldq   %4,     %4      ; %4 = dc6 dc6 dc6 dc6 dc7 dc7 dc7 dc7
 
-	punpcklwd	%1,		%1
-	punpckldq	%1,		%1		; %1 = dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1
+    punpcklwd   %1,     %1
+    punpckldq   %1,     %1      ; %1 = dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1
 %endmacro
 
 %macro SSE2_DCT 6
-    SSE2_SumSub		%6, %3,	%5
-	SSE2_SumSub		%1, %2, %5
-	SSE2_SumSub		%3, %2, %5
-	SSE2_SumSubMul2		%6, %1, %4
+    SSE2_SumSub     %6, %3, %5
+    SSE2_SumSub     %1, %2, %5
+    SSE2_SumSub     %3, %2, %5
+    SSE2_SumSubMul2     %6, %1, %4
 %endmacro
 
 %macro SSE2_IDCT 7
     SSE2_SumSub       %7, %2, %6
     SSE2_SumSubDiv2     %1, %3, %5, %4
-    SSE2_SumSub	     %2, %1, %5
-    SSE2_SumSub		 %7, %4, %5
+    SSE2_SumSub      %2, %1, %5
+    SSE2_SumSub      %7, %4, %5
 %endmacro
 
 ;***********************************************************************
@@ -294,42 +294,42 @@
     SIGN_EXTENSION r2, r2d
     SIGN_EXTENSION r4, r4d
     pxor    xmm7, xmm7
-	;Load 4x8
-	SSE2_LoadDiff8P    xmm0, xmm6, xmm7, [r1], [r3]
+    ;Load 4x8
+    SSE2_LoadDiff8P    xmm0, xmm6, xmm7, [r1], [r3]
     SSE2_LoadDiff8P    xmm1, xmm6, xmm7, [r1+r2], [r3+r4]
-	lea		r1, [r1 + 2 * r2]
-	lea		r3, [r3 + 2 * r4]
-	SSE2_LoadDiff8P    xmm2, xmm6, xmm7, [r1], [r3]
+    lea     r1, [r1 + 2 * r2]
+    lea     r3, [r3 + 2 * r4]
+    SSE2_LoadDiff8P    xmm2, xmm6, xmm7, [r1], [r3]
     SSE2_LoadDiff8P    xmm3, xmm6, xmm7, [r1+r2], [r3+r4]
 
-	SSE2_DCT			xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
-	SSE2_TransTwo4x4W	xmm2, xmm0, xmm3, xmm4, xmm1
-	SSE2_DCT			xmm0, xmm4, xmm1, xmm3, xmm5, xmm2
-	SSE2_TransTwo4x4W	xmm4, xmm2, xmm1, xmm3, xmm0
+    SSE2_DCT            xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
+    SSE2_TransTwo4x4W   xmm2, xmm0, xmm3, xmm4, xmm1
+    SSE2_DCT            xmm0, xmm4, xmm1, xmm3, xmm5, xmm2
+    SSE2_TransTwo4x4W   xmm4, xmm2, xmm1, xmm3, xmm0
 
-	SSE2_Store4x8p r0, xmm4, xmm2, xmm3, xmm0, xmm5
+    SSE2_Store4x8p r0, xmm4, xmm2, xmm3, xmm0, xmm5
 
-	lea		r1, [r1 + 2 * r2]
-	lea		r3, [r3 + 2 * r4]
+    lea     r1, [r1 + 2 * r2]
+    lea     r3, [r3 + 2 * r4]
 
-	;Load 4x8
-	SSE2_LoadDiff8P    xmm0, xmm6, xmm7, [r1      ], [r3    ]
+    ;Load 4x8
+    SSE2_LoadDiff8P    xmm0, xmm6, xmm7, [r1      ], [r3    ]
     SSE2_LoadDiff8P    xmm1, xmm6, xmm7, [r1+r2  ], [r3+r4]
-	lea		r1, [r1 + 2 * r2]
-	lea		r3, [r3 + 2 * r4]
+    lea     r1, [r1 + 2 * r2]
+    lea     r3, [r3 + 2 * r4]
     SSE2_LoadDiff8P    xmm2, xmm6, xmm7, [r1], [r3]
     SSE2_LoadDiff8P    xmm3, xmm6, xmm7, [r1+r2], [r3+r4]
 
-	SSE2_DCT			xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
-	SSE2_TransTwo4x4W	xmm2, xmm0, xmm3, xmm4, xmm1
-    SSE2_DCT			xmm0, xmm4, xmm1, xmm3, xmm5, xmm2
-	SSE2_TransTwo4x4W	xmm4, xmm2, xmm1, xmm3, xmm0
+    SSE2_DCT            xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
+    SSE2_TransTwo4x4W   xmm2, xmm0, xmm3, xmm4, xmm1
+    SSE2_DCT            xmm0, xmm4, xmm1, xmm3, xmm5, xmm2
+    SSE2_TransTwo4x4W   xmm4, xmm2, xmm1, xmm3, xmm0
 
-	lea		r0, [r0+64]
-	SSE2_Store4x8p r0, xmm4, xmm2, xmm3, xmm0, xmm5
+    lea     r0, [r0+64]
+    SSE2_Store4x8p r0, xmm4, xmm2, xmm3, xmm0, xmm5
 
-	POP_XMM
-	LOAD_5_PARA_POP
+    POP_XMM
+    LOAD_5_PARA_POP
     ret
 
 
@@ -337,59 +337,59 @@
 ; void WelsIDctFourT4Rec_sse2(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *rs);
 ;***********************************************************************
 WELS_EXTERN WelsIDctFourT4Rec_sse2
-	%assign push_num 0
-	LOAD_5_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
-	;Load 4x8
-	SSE2_Load4x8p  r4, xmm0, xmm1, xmm4, xmm2, xmm5
+    %assign push_num 0
+    LOAD_5_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
+    ;Load 4x8
+    SSE2_Load4x8p  r4, xmm0, xmm1, xmm4, xmm2, xmm5
 
-	SSE2_TransTwo4x4W	xmm0, xmm1, xmm4, xmm2, xmm3
-  	SSE2_IDCT			xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0
-    SSE2_TransTwo4x4W	xmm1, xmm4, xmm0, xmm2, xmm3
-    SSE2_IDCT			xmm4, xmm2, xmm3, xmm0, xmm5, xmm6, xmm1
+    SSE2_TransTwo4x4W   xmm0, xmm1, xmm4, xmm2, xmm3
+    SSE2_IDCT           xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0
+    SSE2_TransTwo4x4W   xmm1, xmm4, xmm0, xmm2, xmm3
+    SSE2_IDCT           xmm4, xmm2, xmm3, xmm0, xmm5, xmm6, xmm1
 
-	WELS_Zero			xmm7
-    WELS_DW32			xmm6
+    WELS_Zero           xmm7
+    WELS_DW32           xmm6
 
-	SSE2_StoreDiff8p   xmm4, xmm5, xmm6, xmm7, [r0		],	[r2]
-	SSE2_StoreDiff8p   xmm0, xmm5, xmm6, xmm7, [r0 + r1	],	[r2 + r3]
-	lea		r0, [r0 + 2 * r1]
-	lea		r2, [r2 + 2 * r3]
-	SSE2_StoreDiff8p   xmm1, xmm5, xmm6, xmm7, [r0],			[r2]
-	SSE2_StoreDiff8p   xmm2, xmm5, xmm6, xmm7, [r0 + r1	],	[r2 + r3]
+    SSE2_StoreDiff8p   xmm4, xmm5, xmm6, xmm7, [r0      ],  [r2]
+    SSE2_StoreDiff8p   xmm0, xmm5, xmm6, xmm7, [r0 + r1 ],  [r2 + r3]
+    lea     r0, [r0 + 2 * r1]
+    lea     r2, [r2 + 2 * r3]
+    SSE2_StoreDiff8p   xmm1, xmm5, xmm6, xmm7, [r0],            [r2]
+    SSE2_StoreDiff8p   xmm2, xmm5, xmm6, xmm7, [r0 + r1 ],  [r2 + r3]
 
-    add		r4, 64
-	lea		r0, [r0 + 2 * r1]
-	lea		r2, [r2 + 2 * r3]
-   	SSE2_Load4x8p  r4, xmm0, xmm1, xmm4, xmm2, xmm5
+    add     r4, 64
+    lea     r0, [r0 + 2 * r1]
+    lea     r2, [r2 + 2 * r3]
+    SSE2_Load4x8p  r4, xmm0, xmm1, xmm4, xmm2, xmm5
 
-	SSE2_TransTwo4x4W   xmm0, xmm1, xmm4, xmm2, xmm3
-	SSE2_IDCT			xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0
+    SSE2_TransTwo4x4W   xmm0, xmm1, xmm4, xmm2, xmm3
+    SSE2_IDCT           xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0
     SSE2_TransTwo4x4W   xmm1, xmm4, xmm0, xmm2, xmm3
-	SSE2_IDCT			xmm4, xmm2, xmm3, xmm0, xmm5, xmm6, xmm1
+    SSE2_IDCT           xmm4, xmm2, xmm3, xmm0, xmm5, xmm6, xmm1
 
-	WELS_Zero			xmm7
-    WELS_DW32			xmm6
+    WELS_Zero           xmm7
+    WELS_DW32           xmm6
 
-	SSE2_StoreDiff8p   xmm4, xmm5, xmm6, xmm7, [r0		],	[r2]
-	SSE2_StoreDiff8p   xmm0, xmm5, xmm6, xmm7, [r0 + r1	],	[r2 + r3]
-	lea		r0, [r0 + 2 * r1]
-	lea		r2, [r2 + 2 * r3]
-	SSE2_StoreDiff8p   xmm1, xmm5, xmm6, xmm7, [r0],			[r2]
-	SSE2_StoreDiff8p   xmm2, xmm5, xmm6, xmm7, [r0 + r1],	[r2 + r3]
-	POP_XMM
-	LOAD_5_PARA_POP
-   ; pop		esi
-   ; pop		ebx
+    SSE2_StoreDiff8p   xmm4, xmm5, xmm6, xmm7, [r0      ],  [r2]
+    SSE2_StoreDiff8p   xmm0, xmm5, xmm6, xmm7, [r0 + r1 ],  [r2 + r3]
+    lea     r0, [r0 + 2 * r1]
+    lea     r2, [r2 + 2 * r3]
+    SSE2_StoreDiff8p   xmm1, xmm5, xmm6, xmm7, [r0],            [r2]
+    SSE2_StoreDiff8p   xmm2, xmm5, xmm6, xmm7, [r0 + r1],   [r2 + r3]
+    POP_XMM
+    LOAD_5_PARA_POP
+    ; pop        esi
+    ; pop        ebx
     ret
 
 %macro SSE2_StoreDiff4x8p 8
-   	SSE2_StoreDiff8p    %1, %3, %4, [%5],			[%6]
-	SSE2_StoreDiff8p    %1, %3, %4, [%5 + %7],		[%6 + %8]
-	SSE2_StoreDiff8p    %2, %3, %4, [%5 + 8],		[%6 + 8]
-	SSE2_StoreDiff8p    %2, %3, %4, [%5 + %7 + 8],	[%6 + %8 + 8]
+    SSE2_StoreDiff8p    %1, %3, %4, [%5],           [%6]
+    SSE2_StoreDiff8p    %1, %3, %4, [%5 + %7],      [%6 + %8]
+    SSE2_StoreDiff8p    %2, %3, %4, [%5 + 8],       [%6 + 8]
+    SSE2_StoreDiff8p    %2, %3, %4, [%5 + %7 + 8],  [%6 + %8 + 8]
 %endmacro
 
  ;***********************************************************************
@@ -396,76 +396,76 @@
 ; void WelsIDctRecI16x16Dc_sse2(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *dct_dc)
 ;***********************************************************************
 WELS_EXTERN WelsIDctRecI16x16Dc_sse2
-	%assign push_num 0
-	LOAD_5_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
-	pxor		xmm7,		xmm7
-    WELS_DW32	xmm6
+    %assign push_num 0
+    LOAD_5_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
+    pxor        xmm7,       xmm7
+    WELS_DW32   xmm6
 
-	SSE2_Load8DC			xmm0, xmm1, xmm2, xmm3, xmm6, [r4]
-	SSE2_StoreDiff4x8p		xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
+    SSE2_Load8DC            xmm0, xmm1, xmm2, xmm3, xmm6, [r4]
+    SSE2_StoreDiff4x8p      xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
 
-	lea			r0,		[r0 + 2 * r1]
-	lea			r2,		[r2 + 2 * r3]
-	SSE2_StoreDiff4x8p		xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
+    lea         r0,     [r0 + 2 * r1]
+    lea         r2,     [r2 + 2 * r3]
+    SSE2_StoreDiff4x8p      xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
 
-	lea			r0,		[r0 + 2 * r1]
-	lea			r2,		[r2 + 2 * r3]
-	SSE2_StoreDiff4x8p		xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
+    lea         r0,     [r0 + 2 * r1]
+    lea         r2,     [r2 + 2 * r3]
+    SSE2_StoreDiff4x8p      xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
 
-	lea			r0,		[r0 + 2 * r1]
-	lea			r2,		[r2 + 2 * r3]
-	SSE2_StoreDiff4x8p		xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
+    lea         r0,     [r0 + 2 * r1]
+    lea         r2,     [r2 + 2 * r3]
+    SSE2_StoreDiff4x8p      xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
 
-	SSE2_Load8DC			xmm0, xmm1, xmm2, xmm3, xmm6, [r4 + 16]
-	lea			r0,		[r0 + 2 * r1]
-	lea			r2,		[r2 + 2 * r3]
-	SSE2_StoreDiff4x8p		xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
+    SSE2_Load8DC            xmm0, xmm1, xmm2, xmm3, xmm6, [r4 + 16]
+    lea         r0,     [r0 + 2 * r1]
+    lea         r2,     [r2 + 2 * r3]
+    SSE2_StoreDiff4x8p      xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
 
-	lea			r0,		[r0 + 2 * r1]
-	lea			r2,		[r2 + 2 * r3]
-	SSE2_StoreDiff4x8p		xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
+    lea         r0,     [r0 + 2 * r1]
+    lea         r2,     [r2 + 2 * r3]
+    SSE2_StoreDiff4x8p      xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
 
-	lea			r0,		[r0 + 2 * r1]
-	lea			r2,		[r2 + 2 * r3]
-	SSE2_StoreDiff4x8p		xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
+    lea         r0,     [r0 + 2 * r1]
+    lea         r2,     [r2 + 2 * r3]
+    SSE2_StoreDiff4x8p      xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
 
-	lea			r0,		[r0 + 2 * r1]
-	lea			r2,		[r2 + 2 * r3]
-	SSE2_StoreDiff4x8p		xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
-	POP_XMM
-	LOAD_5_PARA_POP
+    lea         r0,     [r0 + 2 * r1]
+    lea         r2,     [r2 + 2 * r3]
+    SSE2_StoreDiff4x8p      xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
+    POP_XMM
+    LOAD_5_PARA_POP
     ret
 
 
 
 %macro SSE2_SumSubD 3
-	movdqa  %3, %2
+    movdqa  %3, %2
     paddd   %2, %1
     psubd   %1, %3
 %endmacro
 
 %macro SSE2_SumSubDiv2D 4
-	paddd   %1, %2
-	paddd	%1, %3
-	psrad	%1,	 1
-	movdqa	%4, %1
-	psubd	%4, %2
+    paddd   %1, %2
+    paddd   %1, %3
+    psrad   %1,  1
+    movdqa  %4, %1
+    psubd   %4, %2
 %endmacro
-%macro SSE2_Load4Col	5
-	movsx		r2,		WORD[%5]
- 	movd		%1,			r2d
- 	movsx		r2,		WORD[%5 + 0x20]
- 	movd		%2,			r2d
-	punpckldq	%1,			%2
-	movsx		r2,		WORD[%5 + 0x80]
- 	movd		%3,			r2d
-	movsx		r2,		WORD[%5 + 0xa0]
- 	movd		%4,			r2d
-	punpckldq	%3,			%4
-	punpcklqdq	%1,			%3
+%macro SSE2_Load4Col    5
+    movsx       r2,     WORD[%5]
+    movd        %1,         r2d
+    movsx       r2,     WORD[%5 + 0x20]
+    movd        %2,         r2d
+    punpckldq   %1,         %2
+    movsx       r2,     WORD[%5 + 0x80]
+    movd        %3,         r2d
+    movsx       r2,     WORD[%5 + 0xa0]
+    movd        %4,         r2d
+    punpckldq   %3,         %4
+    punpcklqdq  %1,         %3
 %endmacro
 
 ;***********************************************************************
@@ -472,33 +472,33 @@
 ;void WelsHadamardT4Dc_sse2( int16_t *luma_dc, int16_t *pDct)
 ;***********************************************************************
 WELS_EXTERN WelsHadamardT4Dc_sse2
-		%assign push_num 0
-		LOAD_2_PARA
-		PUSH_XMM 8
-		SSE2_Load4Col	    xmm1, xmm5, xmm6, xmm0, r1
-		SSE2_Load4Col	    xmm2, xmm5, xmm6, xmm0, r1 + 0x40
-		SSE2_Load4Col	    xmm3, xmm5, xmm6, xmm0, r1 + 0x100
-		SSE2_Load4Col	    xmm4, xmm5, xmm6, xmm0, r1 + 0x140
+    %assign push_num 0
+    LOAD_2_PARA
+    PUSH_XMM 8
+    SSE2_Load4Col       xmm1, xmm5, xmm6, xmm0, r1
+    SSE2_Load4Col       xmm2, xmm5, xmm6, xmm0, r1 + 0x40
+    SSE2_Load4Col       xmm3, xmm5, xmm6, xmm0, r1 + 0x100
+    SSE2_Load4Col       xmm4, xmm5, xmm6, xmm0, r1 + 0x140
 
-		SSE2_SumSubD		xmm1, xmm2, xmm7
-		SSE2_SumSubD		xmm3, xmm4, xmm7
-		SSE2_SumSubD		xmm2, xmm4, xmm7
-		SSE2_SumSubD		xmm1, xmm3, xmm7
+    SSE2_SumSubD        xmm1, xmm2, xmm7
+    SSE2_SumSubD        xmm3, xmm4, xmm7
+    SSE2_SumSubD        xmm2, xmm4, xmm7
+    SSE2_SumSubD        xmm1, xmm3, xmm7
 
-		SSE2_Trans4x4D		xmm4, xmm2, xmm1, xmm3, xmm5	; pOut: xmm4,xmm3,xmm5,xmm1
+    SSE2_Trans4x4D      xmm4, xmm2, xmm1, xmm3, xmm5    ; pOut: xmm4,xmm3,xmm5,xmm1
 
-		SSE2_SumSubD		xmm4, xmm3, xmm7
-		SSE2_SumSubD		xmm5, xmm1, xmm7
+    SSE2_SumSubD        xmm4, xmm3, xmm7
+    SSE2_SumSubD        xmm5, xmm1, xmm7
 
-		WELS_DD1 xmm6
-		SSE2_SumSubDiv2D	xmm3, xmm1, xmm6, xmm0			; pOut: xmm3 = (xmm3+xmm1+1)/2, xmm0 = (xmm3-xmm1+1)/2
-		SSE2_SumSubDiv2D	xmm4, xmm5, xmm6, xmm1			; pOut: xmm4 = (xmm4+xmm5+1)/2, xmm1 = (xmm4-xmm5+1)/2
-        SSE2_Trans4x4D		xmm3, xmm0, xmm1, xmm4, xmm2	; pOut: xmm3,xmm4,xmm2,xmm1
+    WELS_DD1 xmm6
+    SSE2_SumSubDiv2D    xmm3, xmm1, xmm6, xmm0          ; pOut: xmm3 = (xmm3+xmm1+1)/2, xmm0 = (xmm3-xmm1+1)/2
+    SSE2_SumSubDiv2D    xmm4, xmm5, xmm6, xmm1          ; pOut: xmm4 = (xmm4+xmm5+1)/2, xmm1 = (xmm4-xmm5+1)/2
+    SSE2_Trans4x4D      xmm3, xmm0, xmm1, xmm4, xmm2    ; pOut: xmm3,xmm4,xmm2,xmm1
 
-		packssdw	xmm3,	xmm4
-		packssdw	xmm2,	xmm1
-		movdqa	[r0+ 0],   xmm3
-		movdqa	[r0+16],   xmm2
+    packssdw    xmm3,   xmm4
+    packssdw    xmm2,   xmm1
+    movdqa  [r0+ 0],   xmm3
+    movdqa  [r0+16],   xmm2
 
-		POP_XMM
-		ret
+    POP_XMM
+    ret
--- a/codec/encoder/core/x86/intra_pred.asm
+++ b/codec/encoder/core/x86/intra_pred.asm
@@ -61,7 +61,7 @@
 sse2_plane_mul_b_c dw -3, -2, -1, 0, 1, 2, 3, 4
 
 align 16
-mmx_01bytes:		times 16	db 1
+mmx_01bytes:        times 16    db 1
 
 align 16
 mmx_0x02: dw 0x02, 0x00, 0x00, 0x00
@@ -73,106 +73,106 @@
 ;dB 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
 ;%1 will keep the last result
 %macro SSE_DB_1_2REG 2
-      pxor %1, %1
-      pcmpeqw %2, %2
-      psubb %1, %2
+    pxor %1, %1
+    pcmpeqw %2, %2
+    psubb %1, %2
 %endmacro
 
 ;xmm0, xmm1, xmm2, eax, ecx
 ;lower 64 bits of xmm0 save the result
 %macro SSE2_PRED_H_4X4_TWO_LINE 5
-    movd		%1,	[%4-1]
-	movdqa		%3,	%1
-	punpcklbw	%1,	%3
-	movdqa		%3,	%1
-	punpcklbw	%1,	%3
+    movd        %1, [%4-1]
+    movdqa      %3, %1
+    punpcklbw   %1, %3
+    movdqa      %3, %1
+    punpcklbw   %1, %3
 
-	;add			%4,	%5
-	movd		%2,	[%4+%5-1]
-	movdqa		%3,	%2
-	punpcklbw	%2,	%3
-	movdqa		%3,	%2
-	punpcklbw	%2,	%3
-	punpckldq	%1,	%2
+    ;add            %4, %5
+    movd        %2, [%4+%5-1]
+    movdqa      %3, %2
+    punpcklbw   %2, %3
+    movdqa      %3, %2
+    punpcklbw   %2, %3
+    punpckldq   %1, %2
 %endmacro
 
 %macro SUMW_HORIZON1 2
-	movdqa      %2, %1
-	psrldq      %2, 8
-	paddusw     %1, %2
-	movdqa      %2, %1
-	psrldq      %2, 4
-	paddusw     %1, %2
-	movdqa      %2, %1
-	psrldq      %2, 2
-	paddusw     %1, %2
+    movdqa      %2, %1
+    psrldq      %2, 8
+    paddusw     %1, %2
+    movdqa      %2, %1
+    psrldq      %2, 4
+    paddusw     %1, %2
+    movdqa      %2, %1
+    psrldq      %2, 2
+    paddusw     %1, %2
 %endmacro
 
 %macro LOAD_COLUMN 6
-		movd	%1,	[%5]
-		movd	%2,	[%5+%6]
-		punpcklbw %1,	%2
-		lea		%5,	[%5+2*%6]
-		movd	%3,	[%5]
-		movd	%2,	[%5+%6]
-		punpcklbw %3,	%2
-		punpcklwd %1,	%3
-		lea		%5,	[%5+2*%6]
-		movd	%4,	[%5]
-		movd	%2,	[%5+%6]
-		punpcklbw %4,	%2
-		lea		%5,	[%5+2*%6]
-		movd	%3,	[%5]
-		movd	%2,	[%5+%6]
-		lea		%5,	[%5+2*%6]
-		punpcklbw %3,	%2
-		punpcklwd %4,	%3
-		punpckhdq %1,	%4
+    movd    %1, [%5]
+    movd    %2, [%5+%6]
+    punpcklbw %1,   %2
+    lea     %5, [%5+2*%6]
+    movd    %3, [%5]
+    movd    %2, [%5+%6]
+    punpcklbw %3,   %2
+    punpcklwd %1,   %3
+    lea     %5, [%5+2*%6]
+    movd    %4, [%5]
+    movd    %2, [%5+%6]
+    punpcklbw %4,   %2
+    lea     %5, [%5+2*%6]
+    movd    %3, [%5]
+    movd    %2, [%5+%6]
+    lea     %5, [%5+2*%6]
+    punpcklbw %3,   %2
+    punpcklwd %4,   %3
+    punpckhdq %1,   %4
 %endmacro
 
 %macro SUMW_HORIZON 3
-	movhlps		%2, %1			; x2 = xx xx xx xx d7 d6 d5 d4
-	paddw		%1, %2			; x1 = xx xx xx xx d37 d26 d15 d04
-	punpcklwd	%1, %3			; x1 =  d37  d26 d15 d04
-	movhlps		%2, %1			; x2 = xxxx xxxx d37 d26
-	paddd		%1, %2			; x1 = xxxx xxxx d1357 d0246
-	pshuflw		%2, %1, 0x4e	; x2 = xxxx xxxx d0246 d1357
-	paddd		%1, %2			; x1 = xxxx xxxx xxxx  d01234567
+    movhlps     %2, %1          ; x2 = xx xx xx xx d7 d6 d5 d4
+    paddw       %1, %2          ; x1 = xx xx xx xx d37 d26 d15 d04
+    punpcklwd   %1, %3          ; x1 =  d37  d26 d15 d04
+    movhlps     %2, %1          ; x2 = xxxx xxxx d37 d26
+    paddd       %1, %2          ; x1 = xxxx xxxx d1357 d0246
+    pshuflw     %2, %1, 0x4e    ; x2 = xxxx xxxx d0246 d1357
+    paddd       %1, %2          ; x1 = xxxx xxxx xxxx  d01234567
 %endmacro
 
 
 %macro COPY_16_TIMES 2
-		movdqa		%2,	[%1-16]
-		psrldq		%2,	15
-		pmuludq		%2,	[mmx_01bytes]
-		pshufd		%2,	%2, 0
+    movdqa      %2, [%1-16]
+    psrldq      %2, 15
+    pmuludq     %2, [mmx_01bytes]
+    pshufd      %2, %2, 0
 %endmacro
 
 %macro COPY_16_TIMESS 3
-		movdqa		%2,	[%1+%3-16]
-		psrldq		%2,	15
-		pmuludq		%2,	[mmx_01bytes]
-		pshufd		%2,	%2, 0
+    movdqa      %2, [%1+%3-16]
+    psrldq      %2, 15
+    pmuludq     %2, [mmx_01bytes]
+    pshufd      %2, %2, 0
 %endmacro
 
 %macro LOAD_COLUMN_C 6
-		movd	%1,	[%5]
-		movd	%2,	[%5+%6]
-		punpcklbw %1,%2
-		lea		%5,	[%5+2*%6]
-		movd	%3,	[%5]
-		movd	%2,	[%5+%6]
-		punpcklbw %3,	%2
-		punpckhwd %1,	%3
-		lea		%5,	[%5+2*%6]
+    movd    %1, [%5]
+    movd    %2, [%5+%6]
+    punpcklbw %1,%2
+    lea     %5, [%5+2*%6]
+    movd    %3, [%5]
+    movd    %2, [%5+%6]
+    punpcklbw %3,   %2
+    punpckhwd %1,   %3
+    lea     %5, [%5+2*%6]
 %endmacro
 
 %macro LOAD_2_LEFT_AND_ADD 0
-        lea         r1, [r1+2*r2]
-        movzx		r4, byte [r1-0x01]
-        add			r3, r4
-        movzx		r4, byte [r1+r2-0x01]
-        add			r3, r4
+    lea         r1, [r1+2*r2]
+    movzx       r4, byte [r1-0x01]
+    add         r3, r4
+    movzx       r4, byte [r1+r2-0x01]
+    add         r3, r4
 %endmacro
 
 ;***********************************************************************
@@ -184,127 +184,127 @@
 ;***********************************************************************
 ;   void WelsI4x4LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
 ;
-;	pred must align to 16
+;   pred must align to 16
 ;***********************************************************************
 WELS_EXTERN WelsI4x4LumaPredH_sse2
-	push r3
-	%assign push_num 1
-	LOAD_3_PARA
-	SIGN_EXTENSION r2, r2d
-	movzx		r3,	byte [r1-1]
-	movd		xmm0,	r3d
-	pmuludq		xmm0,	[mmx_01bytes]
+    push r3
+    %assign push_num 1
+    LOAD_3_PARA
+    SIGN_EXTENSION r2, r2d
+    movzx       r3, byte [r1-1]
+    movd        xmm0,   r3d
+    pmuludq     xmm0,   [mmx_01bytes]
 
-	movzx		r3,	byte [r1+r2-1]
-	movd		xmm1,	r3d
-	pmuludq		xmm1,	[mmx_01bytes]
+    movzx       r3, byte [r1+r2-1]
+    movd        xmm1,   r3d
+    pmuludq     xmm1,   [mmx_01bytes]
 
-	unpcklps	xmm0,	xmm1
+    unpcklps    xmm0,   xmm1
 
-	lea			r1,	[r1+r2*2]
-	movzx		r3,	byte [r1-1]
-	movd		xmm2,	r3d
-	pmuludq		xmm2,	[mmx_01bytes]
+    lea         r1, [r1+r2*2]
+    movzx       r3, byte [r1-1]
+    movd        xmm2,   r3d
+    pmuludq     xmm2,   [mmx_01bytes]
 
-	movzx		r3,	byte [r1+r2-1]
-	movd		xmm3,	r3d
-	pmuludq		xmm3,	[mmx_01bytes]
+    movzx       r3, byte [r1+r2-1]
+    movd        xmm3,   r3d
+    pmuludq     xmm3,   [mmx_01bytes]
 
-	unpcklps	xmm2,	xmm3
-	unpcklpd	xmm0,	xmm2
+    unpcklps    xmm2,   xmm3
+    unpcklpd    xmm0,   xmm2
 
-	movdqa		[r0],	xmm0
-	pop r3
-	ret
+    movdqa      [r0],   xmm0
+    pop r3
+    ret
 
 ;***********************************************************************
 ; void WelsI16x16LumaPredPlane_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
 ;***********************************************************************
 WELS_EXTERN WelsI16x16LumaPredPlane_sse2
-		push r3
-		push r4
-		%assign push_num 2
-		LOAD_3_PARA
-		PUSH_XMM 8
-		SIGN_EXTENSION r2, r2d
-		sub		r1,	1
-		sub		r1,	r2
+    push r3
+    push r4
+    %assign push_num 2
+    LOAD_3_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r2, r2d
+    sub     r1, 1
+    sub     r1, r2
 
-		;for H
-		pxor	xmm7,	xmm7
-		movq	xmm0,	[r1]
-		movdqa	xmm5,	[sse2_plane_dec]
-		punpcklbw xmm0,	xmm7
-		pmullw	xmm0,	xmm5
-		movq	xmm1,	[r1 + 9]
-		movdqa	xmm6,	[sse2_plane_inc]
-		punpcklbw xmm1,	xmm7
-		pmullw	xmm1,	xmm6
-		psubw	xmm1,	xmm0
+    ;for H
+    pxor    xmm7,   xmm7
+    movq    xmm0,   [r1]
+    movdqa  xmm5,   [sse2_plane_dec]
+    punpcklbw xmm0, xmm7
+    pmullw  xmm0,   xmm5
+    movq    xmm1,   [r1 + 9]
+    movdqa  xmm6,   [sse2_plane_inc]
+    punpcklbw xmm1, xmm7
+    pmullw  xmm1,   xmm6
+    psubw   xmm1,   xmm0
 
-		SUMW_HORIZON	xmm1,xmm0,xmm2
-		movd    r3d,	xmm1		; H += (i + 1) * (top[8 + i] - top[6 - i]);
-		movsx	r3,	r3w
-		imul	r3,	5
-		add		r3,	32
-		sar		r3,	6			; b = (5 * H + 32) >> 6;
-		SSE2_Copy8Times	xmm1, r3d	; xmm1 = b,b,b,b,b,b,b,b
+    SUMW_HORIZON    xmm1,xmm0,xmm2
+    movd    r3d,    xmm1        ; H += (i + 1) * (top[8 + i] - top[6 - i]);
+    movsx   r3, r3w
+    imul    r3, 5
+    add     r3, 32
+    sar     r3, 6           ; b = (5 * H + 32) >> 6;
+    SSE2_Copy8Times xmm1, r3d   ; xmm1 = b,b,b,b,b,b,b,b
 
-		movzx	r4,	BYTE [r1+16]
-		sub	r1, 3
-		LOAD_COLUMN		xmm0, xmm2, xmm3, xmm4, r1, r2
+    movzx   r4, BYTE [r1+16]
+    sub r1, 3
+    LOAD_COLUMN     xmm0, xmm2, xmm3, xmm4, r1, r2
 
-		add		r1,	3
-		movzx	r3,	BYTE [r1+8*r2]
-		add		r4,	r3
-		shl		r4,	4			;	a = (left[15*stride] + top[15]) << 4;
+    add     r1, 3
+    movzx   r3, BYTE [r1+8*r2]
+    add     r4, r3
+    shl     r4, 4           ;   a = (left[15*stride] + top[15]) << 4;
 
-		sub	r1, 3
-		add		r1,	r2
-		LOAD_COLUMN		xmm7, xmm2, xmm3, xmm4, r1, r2
-		pxor	xmm4,	xmm4
-		punpckhbw xmm0,	xmm4
-		pmullw	xmm0,	xmm5
-		punpckhbw xmm7,	xmm4
-		pmullw	xmm7,	xmm6
-		psubw	xmm7,	xmm0
+    sub r1, 3
+    add     r1, r2
+    LOAD_COLUMN     xmm7, xmm2, xmm3, xmm4, r1, r2
+    pxor    xmm4,   xmm4
+    punpckhbw xmm0, xmm4
+    pmullw  xmm0,   xmm5
+    punpckhbw xmm7, xmm4
+    pmullw  xmm7,   xmm6
+    psubw   xmm7,   xmm0
 
-		SUMW_HORIZON   xmm7,xmm0,xmm2
-		movd    r3d,   xmm7			; V
-		movsx	r3,	r3w
-		imul	r3,	5
-		add		r3,	32
-		sar		r3,	6				; c = (5 * V + 32) >> 6;
-		SSE2_Copy8Times	xmm4, r3d		; xmm4 = c,c,c,c,c,c,c,c
+    SUMW_HORIZON   xmm7,xmm0,xmm2
+    movd    r3d,   xmm7         ; V
+    movsx   r3, r3w
+    imul    r3, 5
+    add     r3, 32
+    sar     r3, 6               ; c = (5 * V + 32) >> 6;
+    SSE2_Copy8Times xmm4, r3d       ; xmm4 = c,c,c,c,c,c,c,c
 
-		add		r4,	16
-		imul	r3,	-7
-		add		r3,	r4				; s = a + 16 + (-7)*c
-		SSE2_Copy8Times	xmm0, r3d		; xmm0 = s,s,s,s,s,s,s,s
+    add     r4, 16
+    imul    r3, -7
+    add     r3, r4              ; s = a + 16 + (-7)*c
+    SSE2_Copy8Times xmm0, r3d       ; xmm0 = s,s,s,s,s,s,s,s
 
-		xor		r3,	r3
-		movdqa	xmm5,	[sse2_plane_inc_minus]
+    xor     r3, r3
+    movdqa  xmm5,   [sse2_plane_inc_minus]
 
 get_i16x16_luma_pred_plane_sse2_1:
-		movdqa	xmm2,	xmm1
-		pmullw	xmm2,	xmm5
-		paddw	xmm2,	xmm0
-		psraw	xmm2,	5
-		movdqa	xmm3,	xmm1
-		pmullw	xmm3,	xmm6
-		paddw	xmm3,	xmm0
-		psraw	xmm3,	5
-		packuswb xmm2,	xmm3
-		movdqa	[r0],	xmm2
-		paddw	xmm0,	xmm4
-		add		r0,	16
-		inc		r3
-		cmp		r3,	16
-		jnz get_i16x16_luma_pred_plane_sse2_1
-		POP_XMM
-		pop r4
-		pop r3
-		ret
+    movdqa  xmm2,   xmm1
+    pmullw  xmm2,   xmm5
+    paddw   xmm2,   xmm0
+    psraw   xmm2,   5
+    movdqa  xmm3,   xmm1
+    pmullw  xmm3,   xmm6
+    paddw   xmm3,   xmm0
+    psraw   xmm3,   5
+    packuswb xmm2,  xmm3
+    movdqa  [r0],   xmm2
+    paddw   xmm0,   xmm4
+    add     r0, 16
+    inc     r3
+    cmp     r3, 16
+    jnz get_i16x16_luma_pred_plane_sse2_1
+    POP_XMM
+    pop r4
+    pop r3
+    ret
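
; Editor's note: the plane predictor above follows the inline comments,
; b = (5*H + 32) >> 6, c = (5*V + 32) >> 6, s = a + 16 - 7*c, with packuswb providing
; the final clip to [0, 255]. The scalar sketch below is reconstructed from those
; comments for illustration only (it assumes sse2_plane_inc / sse2_plane_dec hold the
; usual 1..8 and 8..1 weights, and that pred is the 16x16 block stored as 16 contiguous
; rows). The 8x8 chroma variant further down differs only in its constants: 17 and a
; >>5 shift for b and c, and offsets of +/-3 instead of +/-7.
;
;     #include <stdint.h>
;
;     static uint8_t Clip255(int v) { return v < 0 ? 0 : (v > 255 ? 255 : (uint8_t)v); }
;
;     /* Scalar reference for the 16x16 luma plane mode (illustrative sketch only).
;      * The row above the block is pRef - stride, the column to its left is pRef - 1. */
;     void I16x16PlaneRef(uint8_t *pred, const uint8_t *pRef, int32_t stride) {
;         const uint8_t *top  = pRef - stride;
;         const uint8_t *left = pRef - 1;
;         int H = 0, V = 0;
;         for (int i = 0; i < 8; i++) {
;             H += (i + 1) * (top[8 + i] - top[6 - i]);                     /* top[-1] is the corner */
;             V += (i + 1) * (left[(8 + i) * stride] - left[(6 - i) * stride]);
;         }
;         int a = (left[15 * stride] + top[15]) << 4;
;         int b = (5 * H + 32) >> 6;
;         int c = (5 * V + 32) >> 6;
;         for (int y = 0; y < 16; y++)
;             for (int x = 0; x < 16; x++)
;                 pred[y * 16 + x] = Clip255((a + 16 + b * (x - 7) + c * (y - 7)) >> 5);
;     }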
 
 ;***********************************************************************
 ; void WelsI16x16LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
@@ -311,38 +311,38 @@
 ;***********************************************************************
 
 %macro SSE2_PRED_H_16X16_ONE_LINE 0
-	add r0, 16
-	add r1, r2
-	movzx r3, byte [r1]
-	SSE2_Copy16Times xmm0, r3d
-	movdqa [r0], xmm0
+    add r0, 16
+    add r1, r2
+    movzx r3, byte [r1]
+    SSE2_Copy16Times xmm0, r3d
+    movdqa [r0], xmm0
 %endmacro
 
 WELS_EXTERN WelsI16x16LumaPredH_sse2
-	push r3
-	%assign push_num 1
-	LOAD_3_PARA
-	SIGN_EXTENSION r2, r2d
-	dec r1
-	movzx r3, byte [r1]
-	SSE2_Copy16Times xmm0, r3d
-	movdqa [r0], xmm0
-	SSE2_PRED_H_16X16_ONE_LINE
-	SSE2_PRED_H_16X16_ONE_LINE
-	SSE2_PRED_H_16X16_ONE_LINE
-	SSE2_PRED_H_16X16_ONE_LINE
-	SSE2_PRED_H_16X16_ONE_LINE
-	SSE2_PRED_H_16X16_ONE_LINE
-	SSE2_PRED_H_16X16_ONE_LINE
-	SSE2_PRED_H_16X16_ONE_LINE
-	SSE2_PRED_H_16X16_ONE_LINE
-	SSE2_PRED_H_16X16_ONE_LINE
-	SSE2_PRED_H_16X16_ONE_LINE
-	SSE2_PRED_H_16X16_ONE_LINE
-	SSE2_PRED_H_16X16_ONE_LINE
-	SSE2_PRED_H_16X16_ONE_LINE
-	SSE2_PRED_H_16X16_ONE_LINE
-	pop r3
+    push r3
+    %assign push_num 1
+    LOAD_3_PARA
+    SIGN_EXTENSION r2, r2d
+    dec r1
+    movzx r3, byte [r1]
+    SSE2_Copy16Times xmm0, r3d
+    movdqa [r0], xmm0
+    SSE2_PRED_H_16X16_ONE_LINE
+    SSE2_PRED_H_16X16_ONE_LINE
+    SSE2_PRED_H_16X16_ONE_LINE
+    SSE2_PRED_H_16X16_ONE_LINE
+    SSE2_PRED_H_16X16_ONE_LINE
+    SSE2_PRED_H_16X16_ONE_LINE
+    SSE2_PRED_H_16X16_ONE_LINE
+    SSE2_PRED_H_16X16_ONE_LINE
+    SSE2_PRED_H_16X16_ONE_LINE
+    SSE2_PRED_H_16X16_ONE_LINE
+    SSE2_PRED_H_16X16_ONE_LINE
+    SSE2_PRED_H_16X16_ONE_LINE
+    SSE2_PRED_H_16X16_ONE_LINE
+    SSE2_PRED_H_16X16_ONE_LINE
+    SSE2_PRED_H_16X16_ONE_LINE
+    pop r3
     ret
 
 ;***********************************************************************
@@ -378,289 +378,289 @@
 ; void WelsIChromaPredPlane_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
 ;***********************************************************************
 WELS_EXTERN WelsIChromaPredPlane_sse2
-		push r3
-		push r4
-		%assign push_num 2
-		LOAD_3_PARA
-		PUSH_XMM 8
-		SIGN_EXTENSION r2, r2d
-		sub		r1,	1
-		sub		r1,	r2
+    push r3
+    push r4
+    %assign push_num 2
+    LOAD_3_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r2, r2d
+    sub     r1, 1
+    sub     r1, r2
 
-		pxor	mm7,	mm7
-		movq	mm0,	[r1]
-		movq	mm5,	[sse2_plane_dec_c]
-		punpcklbw mm0,	mm7
-		pmullw	mm0,	mm5
-		movq	mm1,	[r1 + 5]
-		movq	mm6,	[sse2_plane_inc_c]
-		punpcklbw mm1,	mm7
-		pmullw	mm1,	mm6
-		psubw	mm1,	mm0
+    pxor    mm7,    mm7
+    movq    mm0,    [r1]
+    movq    mm5,    [sse2_plane_dec_c]
+    punpcklbw mm0,  mm7
+    pmullw  mm0,    mm5
+    movq    mm1,    [r1 + 5]
+    movq    mm6,    [sse2_plane_inc_c]
+    punpcklbw mm1,  mm7
+    pmullw  mm1,    mm6
+    psubw   mm1,    mm0
 
-		movq2dq xmm1,   mm1
-		pxor    xmm2,   xmm2
-		SUMW_HORIZON	xmm1,xmm0,xmm2
-		movd    r3d,	xmm1
-		movsx	r3,	r3w
-		imul	r3,	17
-		add		r3,	16
-		sar		r3,	5			; b = (17 * H + 16) >> 5;
-		SSE2_Copy8Times	xmm1, r3d	; mm1 = b,b,b,b,b,b,b,b
+    movq2dq xmm1,   mm1
+    pxor    xmm2,   xmm2
+    SUMW_HORIZON    xmm1,xmm0,xmm2
+    movd    r3d,    xmm1
+    movsx   r3, r3w
+    imul    r3, 17
+    add     r3, 16
+    sar     r3, 5           ; b = (17 * H + 16) >> 5;
+    SSE2_Copy8Times xmm1, r3d   ; mm1 = b,b,b,b,b,b,b,b
 
-		movzx	r3,	BYTE [r1+8]
-		sub	r1, 3
-		LOAD_COLUMN_C	mm0, mm2, mm3, mm4, r1, r2
+    movzx   r3, BYTE [r1+8]
+    sub r1, 3
+    LOAD_COLUMN_C   mm0, mm2, mm3, mm4, r1, r2
 
-		add		r1,	3
-		movzx	r4,	BYTE [r1+4*r2]
-		add		r4,	r3
-		shl		r4,	4			; a = (left[7*stride] + top[7]) << 4;
+    add     r1, 3
+    movzx   r4, BYTE [r1+4*r2]
+    add     r4, r3
+    shl     r4, 4           ; a = (left[7*stride] + top[7]) << 4;
 
-		sub	r1, 3
-		add		r1,	r2
-		LOAD_COLUMN_C	mm7, mm2, mm3, mm4, r1, r2
-		pxor	mm4,	mm4
-		punpckhbw mm0,	mm4
-		pmullw	mm0,	mm5
-		punpckhbw mm7,	mm4
-		pmullw	mm7,	mm6
-		psubw	mm7,	mm0
+    sub r1, 3
+    add     r1, r2
+    LOAD_COLUMN_C   mm7, mm2, mm3, mm4, r1, r2
+    pxor    mm4,    mm4
+    punpckhbw mm0,  mm4
+    pmullw  mm0,    mm5
+    punpckhbw mm7,  mm4
+    pmullw  mm7,    mm6
+    psubw   mm7,    mm0
 
-		movq2dq xmm7,   mm7
-		pxor    xmm2,   xmm2
-		SUMW_HORIZON	xmm7,xmm0,xmm2
-		movd    r3d,    xmm7			; V
-		movsx	r3,	r3w
-		imul	r3,	17
-		add		r3,	16
-		sar		r3,	5				; c = (17 * V + 16) >> 5;
-		SSE2_Copy8Times	xmm4, r3d	; mm4 = c,c,c,c,c,c,c,c
+    movq2dq xmm7,   mm7
+    pxor    xmm2,   xmm2
+    SUMW_HORIZON    xmm7,xmm0,xmm2
+    movd    r3d,    xmm7            ; V
+    movsx   r3, r3w
+    imul    r3, 17
+    add     r3, 16
+    sar     r3, 5               ; c = (17 * V + 16) >> 5;
+    SSE2_Copy8Times xmm4, r3d   ; mm4 = c,c,c,c,c,c,c,c
 
-		add		r4,	16
-		imul	r3,	-3
-		add		r3,	r4		; s = a + 16 + (-3)*c
-		SSE2_Copy8Times	xmm0, r3d	; xmm0 = s,s,s,s,s,s,s,s
+    add     r4, 16
+    imul    r3, -3
+    add     r3, r4      ; s = a + 16 + (-3)*c
+    SSE2_Copy8Times xmm0, r3d   ; xmm0 = s,s,s,s,s,s,s,s
 
-		xor		r3,	r3
-		movdqa	xmm5,	[sse2_plane_mul_b_c]
+    xor     r3, r3
+    movdqa  xmm5,   [sse2_plane_mul_b_c]
 
 get_i_chroma_pred_plane_sse2_1:
-		movdqa	xmm2,	xmm1
-		pmullw	xmm2,	xmm5
-		paddw	xmm2,	xmm0
-		psraw	xmm2,	5
-		packuswb xmm2,	xmm2
-		movq	[r0],	xmm2
-		paddw	xmm0,	xmm4
-		add		r0,	8
-		inc		r3
-		cmp		r3,	8
-		jnz get_i_chroma_pred_plane_sse2_1
-		POP_XMM
-		pop r4
-		pop r3
-		WELSEMMS
-		ret
+    movdqa  xmm2,   xmm1
+    pmullw  xmm2,   xmm5
+    paddw   xmm2,   xmm0
+    psraw   xmm2,   5
+    packuswb xmm2,  xmm2
+    movq    [r0],   xmm2
+    paddw   xmm0,   xmm4
+    add     r0, 8
+    inc     r3
+    cmp     r3, 8
+    jnz get_i_chroma_pred_plane_sse2_1
+    POP_XMM
+    pop r4
+    pop r3
+    WELSEMMS
+    ret
 
 ;***********************************************************************
-;	0 |1 |2 |3 |4 |
-;	6 |7 |8 |9 |10|
-;	11|12|13|14|15|
-;	16|17|18|19|20|
-;	21|22|23|24|25|
-;	7 is the start pixel of current 4x4 block
-;	pred[7] = ([6]+[0]*2+[1]+2)/4
+;   0 |1 |2 |3 |4 |
+;   6 |7 |8 |9 |10|
+;   11|12|13|14|15|
+;   16|17|18|19|20|
+;   21|22|23|24|25|
+;   7 is the start pixel of current 4x4 block
+;   pred[7] = ([6]+[0]*2+[1]+2)/4
 ;
 ;   void WelsI4x4LumaPredDDR_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
 ;
 ;***********************************************************************
 WELS_EXTERN WelsI4x4LumaPredDDR_mmx
-	%assign push_num 0
-	LOAD_3_PARA
-	SIGN_EXTENSION r2, r2d
-	movq        mm1,[r1+r2-8]		;get value of 11,decreasing 8 is trying to improve the performance of movq mm1[8] = 11
-	movq        mm2,[r1-8]			;get value of 6 mm2[8] = 6
-	sub		r1, r2			;mov eax to above line of current block(postion of 1)
-	punpckhbw   mm2,[r1-8]			;mm2[8](high 8th byte of mm2) = [0](value of 0), mm2[7]= [6]
-	movd        mm3,[r1]			;get value 1, mm3[1] = [1],mm3[2]=[2],mm3[3]=[3]
-	punpckhwd   mm1,mm2				;mm1[8]=[0],mm1[7]=[6],mm1[6]=[11]
-	psllq       mm3,18h				;mm3[5]=[1]
-	psrlq       mm1,28h				;mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
-	por         mm3,mm1				;mm3[6]=[3],mm3[5]=[2],mm3[4]=[1],mm3[3]=[0],mm3[2]=[6],mm3[1]=[11]
-	movq        mm1,mm3				;mm1[6]=[3],mm1[5]=[2],mm1[4]=[1],mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
-	lea  	    r1,[r1+r2*2-8h]		;set eax point to 12
-	movq        mm4,[r1+r2]		;get value of 16, mm4[8]=[16]
-	psllq       mm3,8				;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=0
-	psrlq       mm4,38h				;mm4[1]=[16]
-	por         mm3,mm4				;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=[16]
-	movq        mm2,mm3				;mm2[7]=[3],mm2[6]=[2],mm2[5]=[1],mm2[4]=[0],mm2[3]=[6],mm2[2]=[11],mm2[1]=[16]
-	movq        mm4,[r1+r2*2]		;mm4[8]=[21]
-	psllq       mm3,8				;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=0
-	psrlq       mm4,38h				;mm4[1]=[21]
-	por         mm3,mm4				;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=[21]
-	movq        mm4,mm3				;mm4[8]=[3],mm4[7]=[2],mm4[6]=[1],mm4[5]=[0],mm4[4]=[6],mm4[3]=[11],mm4[2]=[16],mm4[1]=[21]
-	pavgb       mm3,mm1				;mm3=([11]+[21]+1)/2
-	pxor        mm1,mm4				;find odd value in the lowest bit of each byte
-	pand        mm1,[mmx_01bytes]	;set the odd bit
-	psubusb     mm3,mm1				;decrease 1 from odd bytes
-	pavgb       mm2,mm3				;mm2=(([11]+[21]+1)/2+1+[16])/2
+    %assign push_num 0
+    LOAD_3_PARA
+    SIGN_EXTENSION r2, r2d
+    movq        mm1,[r1+r2-8]       ;get value of 11,decreasing 8 is trying to improve the performance of movq mm1[8] = 11
+    movq        mm2,[r1-8]          ;get value of 6 mm2[8] = 6
+    sub     r1, r2          ;mov eax to above line of current block(postion of 1)
+    punpckhbw   mm2,[r1-8]          ;mm2[8](high 8th byte of mm2) = [0](value of 0), mm2[7]= [6]
+    movd        mm3,[r1]            ;get value 1, mm3[1] = [1],mm3[2]=[2],mm3[3]=[3]
+    punpckhwd   mm1,mm2             ;mm1[8]=[0],mm1[7]=[6],mm1[6]=[11]
+    psllq       mm3,18h             ;mm3[5]=[1]
+    psrlq       mm1,28h             ;mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
+    por         mm3,mm1             ;mm3[6]=[3],mm3[5]=[2],mm3[4]=[1],mm3[3]=[0],mm3[2]=[6],mm3[1]=[11]
+    movq        mm1,mm3             ;mm1[6]=[3],mm1[5]=[2],mm1[4]=[1],mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
+    lea         r1,[r1+r2*2-8h]     ;set eax point to 12
+    movq        mm4,[r1+r2]     ;get value of 16, mm4[8]=[16]
+    psllq       mm3,8               ;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=0
+    psrlq       mm4,38h             ;mm4[1]=[16]
+    por         mm3,mm4             ;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=[16]
+    movq        mm2,mm3             ;mm2[7]=[3],mm2[6]=[2],mm2[5]=[1],mm2[4]=[0],mm2[3]=[6],mm2[2]=[11],mm2[1]=[16]
+    movq        mm4,[r1+r2*2]       ;mm4[8]=[21]
+    psllq       mm3,8               ;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=0
+    psrlq       mm4,38h             ;mm4[1]=[21]
+    por         mm3,mm4             ;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=[21]
+    movq        mm4,mm3             ;mm4[8]=[3],mm4[7]=[2],mm4[6]=[1],mm4[5]=[0],mm4[4]=[6],mm4[3]=[11],mm4[2]=[16],mm4[1]=[21]
+    pavgb       mm3,mm1             ;mm3=([11]+[21]+1)/2
+    pxor        mm1,mm4             ;find odd value in the lowest bit of each byte
+    pand        mm1,[mmx_01bytes]   ;set the odd bit
+    psubusb     mm3,mm1             ;decrease 1 from odd bytes
+    pavgb       mm2,mm3             ;mm2=(([11]+[21]+1)/2+1+[16])/2
 
-	movd        [r0+12],mm2
-	psrlq       mm2,8
-	movd        [r0+8],mm2
-	psrlq       mm2,8
-	movd        [r0+4],mm2
-	psrlq       mm2,8
-	movd        [r0],mm2
-	WELSEMMS
-	ret
+    movd        [r0+12],mm2
+    psrlq       mm2,8
+    movd        [r0+8],mm2
+    psrlq       mm2,8
+    movd        [r0+4],mm2
+    psrlq       mm2,8
+    movd        [r0],mm2
+    WELSEMMS
+    ret
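
; Editor's note: WelsI4x4LumaPredDDR_mmx, like the HD/HU/VR/VL/DDL routines that follow,
; is built from two scalar filters: a 2-tap rounded average and a 3-tap
; (a + 2*b + c + 2) >> 2 filter. pavgb rounds up, so the pxor / pand [mmx_01bytes] /
; psubusb sequence subtracts the stray low bit before the second pavgb, which makes the
; 3-tap result exact. A scalar sketch of both filters and of that correction, for
; illustration only:
;
;     #include <stdint.h>
;
;     static uint8_t Filter2(uint8_t a, uint8_t b)            { return (uint8_t)((a + b + 1) >> 1); }
;     static uint8_t Filter3(uint8_t a, uint8_t b, uint8_t c) { return (uint8_t)((a + 2 * b + c + 2) >> 2); }
;
;     /* Same value as Filter3, built the way the MMX code does it:
;      * t = pavgb(a, c) corrected down to (a + c) >> 1, then pavgb(t, b). */
;     static uint8_t Filter3ViaAvg(uint8_t a, uint8_t b, uint8_t c) {
;         uint8_t t = (uint8_t)(((a + c + 1) >> 1) - ((a ^ c) & 1));   /* == (a + c) >> 1 */
;         return (uint8_t)((t + b + 1) >> 1);                          /* == Filter3(a, b, c) */
;     }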
 
 ;***********************************************************************
-;	0 |1 |2 |3 |4 |
-;	5 |6 |7 |8 |9 |
-;	10|11|12|13|14|
-;	15|16|17|18|19|
-;	20|21|22|23|24|
-;	6 is the start pixel of current 4x4 block
-;	pred[6] = ([1]+[2]+[3]+[4]+[5]+[10]+[15]+[20]+4)/8
+;   0 |1 |2 |3 |4 |
+;   5 |6 |7 |8 |9 |
+;   10|11|12|13|14|
+;   15|16|17|18|19|
+;   20|21|22|23|24|
+;   6 is the start pixel of current 4x4 block
+;   pred[6] = ([1]+[2]+[3]+[4]+[5]+[10]+[15]+[20]+4)/8
 ;
 ;   void WelsI4x4LumaPredDc_sse2(uint8_t *pred,uint8_t *pRef,int32_t stride)
 ;
 ;***********************************************************************
 WELS_EXTERN WelsI4x4LumaPredDc_sse2
-	push r3
-	push r4
-	%assign push_num 2
-	LOAD_3_PARA
-	SIGN_EXTENSION r2, r2d
-	movzx		r4,	byte [r1-1h]
-	sub			r1,	r2
-	movd		xmm0,	[r1]
-	pxor		xmm1,	xmm1
-	psadbw		xmm0,	xmm1
-	xor r3, r3
-	movd		r3d,	xmm0
-	add			r3,	r4
-	movzx		r4,	byte [r1+r2*2-1h]
-	add			r3,	r4
+    push r3
+    push r4
+    %assign push_num 2
+    LOAD_3_PARA
+    SIGN_EXTENSION r2, r2d
+    movzx       r4, byte [r1-1h]
+    sub         r1, r2
+    movd        xmm0,   [r1]
+    pxor        xmm1,   xmm1
+    psadbw      xmm0,   xmm1
+    xor r3, r3
+    movd        r3d,    xmm0
+    add         r3, r4
+    movzx       r4, byte [r1+r2*2-1h]
+    add         r3, r4
 
-	lea			r1,	[r1+r2*2-1]
-	movzx		r4,	byte [r1+r2]
-	add			r3,	r4
+    lea         r1, [r1+r2*2-1]
+    movzx       r4, byte [r1+r2]
+    add         r3, r4
 
-	movzx		r4,	byte [r1+r2*2]
-	add			r3,	r4
-	add			r3,	4
-	sar			r3,	3
-	imul		r3,	0x01010101
+    movzx       r4, byte [r1+r2*2]
+    add         r3, r4
+    add         r3, 4
+    sar         r3, 3
+    imul        r3, 0x01010101
 
-	movd		xmm0,	r3d
-	pshufd		xmm0,	xmm0,	0
-	movdqa		[r0],	xmm0
-	pop r4
-	pop r3
-	ret
+    movd        xmm0,   r3d
+    pshufd      xmm0,   xmm0,   0
+    movdqa      [r0],   xmm0
+    pop r4
+    pop r3
+    ret
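
; Editor's note: as the comment block above states, the 4x4 DC value is
; (four top + four left neighbours + 4) >> 3; psadbw against zero sums the top bytes,
; and the imul by 0x01010101 replicates the 8-bit result into a dword before pshufd
; broadcasts it. A scalar sketch, for illustration only (pred is the 4x4 block stored
; as 16 contiguous bytes, as the single movdqa store implies):
;
;     #include <stdint.h>
;     #include <string.h>
;
;     void I4x4DcRef(uint8_t *pred, const uint8_t *pRef, int32_t stride) {
;         const uint8_t *top  = pRef - stride;
;         const uint8_t *left = pRef - 1;
;         int sum = 4;                                   /* rounding term */
;         for (int i = 0; i < 4; i++)
;             sum += top[i] + left[i * stride];
;         memset(pred, (uint8_t)(sum >> 3), 16);
;     }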
 
 ;***********************************************************************
-;	void WelsIChromaPredH_mmx(uint8_t *pred, uint8_t *pRef, int32_t stride)
+;   void WelsIChromaPredH_mmx(uint8_t *pred, uint8_t *pRef, int32_t stride)
 ;   copy 8 pixel of 8 line from left
 ;***********************************************************************
 %macro MMX_PRED_H_8X8_ONE_LINE 4
-	movq		%1,		[%3-8]
-	psrlq		%1,		38h
+    movq        %1,     [%3-8]
+    psrlq       %1,     38h
 
-	;pmuludq		%1,		[mmx_01bytes]		;extend to 4 bytes
-	pmullw		%1,		[mmx_01bytes]
-	pshufw		%1,		%1,	0
-	movq		[%4],	%1
+    ;pmuludq        %1,     [mmx_01bytes]       ;extend to 4 bytes
+    pmullw      %1,     [mmx_01bytes]
+    pshufw      %1,     %1, 0
+    movq        [%4],   %1
 %endmacro
 
 %macro MMX_PRED_H_8X8_ONE_LINEE 4
-	movq		%1,		[%3+r2-8]
-	psrlq		%1,		38h
+    movq        %1,     [%3+r2-8]
+    psrlq       %1,     38h
 
-	;pmuludq		%1,		[mmx_01bytes]		;extend to 4 bytes
-	pmullw		%1,		[mmx_01bytes]
-	pshufw		%1,		%1,	0
-	movq		[%4],	%1
+    ;pmuludq        %1,     [mmx_01bytes]       ;extend to 4 bytes
+    pmullw      %1,     [mmx_01bytes]
+    pshufw      %1,     %1, 0
+    movq        [%4],   %1
 %endmacro
 
 WELS_EXTERN WelsIChromaPredH_mmx
-	%assign push_num 0
-	LOAD_3_PARA
-	SIGN_EXTENSION r2, r2d
-	movq		mm0,	[r1-8]
-	psrlq		mm0,	38h
+    %assign push_num 0
+    LOAD_3_PARA
+    SIGN_EXTENSION r2, r2d
+    movq        mm0,    [r1-8]
+    psrlq       mm0,    38h
 
-	;pmuludq		mm0,	[mmx_01bytes]		;extend to 4 bytes
-	pmullw		mm0,		[mmx_01bytes]
-	pshufw		mm0,	mm0,	0
-	movq		[r0],	mm0
+    ;pmuludq        mm0,    [mmx_01bytes]       ;extend to 4 bytes
+    pmullw      mm0,        [mmx_01bytes]
+    pshufw      mm0,    mm0,    0
+    movq        [r0],   mm0
 
-	MMX_PRED_H_8X8_ONE_LINEE	mm0, mm1, r1,r0+8
+    MMX_PRED_H_8X8_ONE_LINEE    mm0, mm1, r1,r0+8
 
-	lea			r1,[r1+r2*2]
-	MMX_PRED_H_8X8_ONE_LINE	mm0, mm1, r1,r0+16
+    lea         r1,[r1+r2*2]
+    MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r1,r0+16
 
-	MMX_PRED_H_8X8_ONE_LINEE	mm0, mm1, r1,r0+24
+    MMX_PRED_H_8X8_ONE_LINEE    mm0, mm1, r1,r0+24
 
-	lea			r1,[r1+r2*2]
-	MMX_PRED_H_8X8_ONE_LINE	mm0, mm1, r1,r0+32
+    lea         r1,[r1+r2*2]
+    MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r1,r0+32
 
-	MMX_PRED_H_8X8_ONE_LINEE	mm0, mm1, r1,r0+40
+    MMX_PRED_H_8X8_ONE_LINEE    mm0, mm1, r1,r0+40
 
-	lea			r1,[r1+r2*2]
-	MMX_PRED_H_8X8_ONE_LINE	mm0, mm1, r1,r0+48
+    lea         r1,[r1+r2*2]
+    MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r1,r0+48
 
-	MMX_PRED_H_8X8_ONE_LINEE	mm0, mm1, r1,r0+56
-	WELSEMMS
-	ret
+    MMX_PRED_H_8X8_ONE_LINEE    mm0, mm1, r1,r0+56
+    WELSEMMS
+    ret
 
 ;***********************************************************************
-;	void WelsI4x4LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
+;   void WelsI4x4LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
 ;   copy pixels from top 4 pixels
 ;***********************************************************************
 WELS_EXTERN WelsI4x4LumaPredV_sse2
-	%assign push_num 0
-	LOAD_3_PARA
-	SIGN_EXTENSION r2, r2d
-	sub			r1,	r2
-	movd		xmm0,	[r1]
-	pshufd		xmm0,	xmm0,	0
-	movdqa		[r0],	xmm0
-	ret
+    %assign push_num 0
+    LOAD_3_PARA
+    SIGN_EXTENSION r2, r2d
+    sub         r1, r2
+    movd        xmm0,   [r1]
+    pshufd      xmm0,   xmm0,   0
+    movdqa      [r0],   xmm0
+    ret
 
 ;***********************************************************************
-;	void WelsIChromaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
+;   void WelsIChromaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
 ;   copy 8 pixels from top 8 pixels
 ;***********************************************************************
 WELS_EXTERN WelsIChromaPredV_sse2
-	%assign push_num 0
-	LOAD_3_PARA
-	SIGN_EXTENSION r2, r2d
-	sub		r1,		r2
-	movq		xmm0,		[r1]
-	movdqa		xmm1,		xmm0
-	punpcklqdq	xmm0,		xmm1
-	movdqa		[r0],		xmm0
-	movdqa		[r0+16],	xmm0
-	movdqa		[r0+32],	xmm0
-	movdqa		[r0+48],	xmm0
-	ret
+    %assign push_num 0
+    LOAD_3_PARA
+    SIGN_EXTENSION r2, r2d
+    sub     r1,     r2
+    movq        xmm0,       [r1]
+    movdqa      xmm1,       xmm0
+    punpcklqdq  xmm0,       xmm1
+    movdqa      [r0],       xmm0
+    movdqa      [r0+16],    xmm0
+    movdqa      [r0+32],    xmm0
+    movdqa      [r0+48],    xmm0
+    ret
 
 ;***********************************************************************
-;	lt|t0|t1|t2|t3|
-;	l0|
-;	l1|
-;	l2|
-;	l3|
-;	t3 will never been used
+;   lt|t0|t1|t2|t3|
+;   l0|
+;   l1|
+;   l2|
+;   l3|
+;   t3 will never been used
 ;   destination:
-;	|a |b |c |d |
-;	|e |f |a |b |
-;	|g |h |e |f |
-;	|i |j |g |h |
+;   |a |b |c |d |
+;   |e |f |a |b |
+;   |g |h |e |f |
+;   |i |j |g |h |
 
 ;   a = (1 + lt + l0)>>1
 ;   e = (1 + l0 + l1)>>1
@@ -679,68 +679,68 @@
 ;   void WelsI4x4LumaPredHD_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
 ;***********************************************************************
 WELS_EXTERN WelsI4x4LumaPredHD_mmx
-	%assign push_num 0
-	LOAD_3_PARA
-	SIGN_EXTENSION r2, r2d
-	sub         r1, r2
-	movd        mm0, [r1-1]            ; mm0 = [xx xx xx xx t2 t1 t0 lt]
-	psllq       mm0, 20h                ; mm0 = [t2 t1 t0 lt xx xx xx xx]
+    %assign push_num 0
+    LOAD_3_PARA
+    SIGN_EXTENSION r2, r2d
+    sub         r1, r2
+    movd        mm0, [r1-1]            ; mm0 = [xx xx xx xx t2 t1 t0 lt]
+    psllq       mm0, 20h                ; mm0 = [t2 t1 t0 lt xx xx xx xx]
 
-	movd        mm1, [r1+2*r2-4]
-	punpcklbw   mm1, [r1+r2-4]        ; mm1[7] = l0, mm1[6] = l1
-	lea         r1, [r1+2*r2]
-	movd        mm2, [r1+2*r2-4]
-	punpcklbw   mm2, [r1+r2-4]        ; mm2[7] = l2, mm2[6] = l3
-	punpckhwd   mm2, mm1                ; mm2 = [l0 l1 l2 l3 xx xx xx xx]
-	psrlq       mm2, 20h
-	pxor        mm0, mm2                ; mm0 = [t2 t1 t0 lt l0 l1 l2 l3]
+    movd        mm1, [r1+2*r2-4]
+    punpcklbw   mm1, [r1+r2-4]        ; mm1[7] = l0, mm1[6] = l1
+    lea         r1, [r1+2*r2]
+    movd        mm2, [r1+2*r2-4]
+    punpcklbw   mm2, [r1+r2-4]        ; mm2[7] = l2, mm2[6] = l3
+    punpckhwd   mm2, mm1                ; mm2 = [l0 l1 l2 l3 xx xx xx xx]
+    psrlq       mm2, 20h
+    pxor        mm0, mm2                ; mm0 = [t2 t1 t0 lt l0 l1 l2 l3]
 
-	movq        mm1, mm0
-	psrlq       mm1, 10h                ; mm1 = [xx xx t2 t1 t0 lt l0 l1]
-	movq        mm2, mm0
-	psrlq       mm2, 8h                 ; mm2 = [xx t2 t1 t0 lt l0 l1 l2]
-	movq        mm3, mm2
-	movq        mm4, mm1
-	pavgb       mm1, mm0
+    movq        mm1, mm0
+    psrlq       mm1, 10h                ; mm1 = [xx xx t2 t1 t0 lt l0 l1]
+    movq        mm2, mm0
+    psrlq       mm2, 8h                 ; mm2 = [xx t2 t1 t0 lt l0 l1 l2]
+    movq        mm3, mm2
+    movq        mm4, mm1
+    pavgb       mm1, mm0
 
-	pxor        mm4, mm0				; find odd value in the lowest bit of each byte
-	pand        mm4, [mmx_01bytes]	    ; set the odd bit
-	psubusb     mm1, mm4				; decrease 1 from odd bytes
+    pxor        mm4, mm0                ; find odd value in the lowest bit of each byte
+    pand        mm4, [mmx_01bytes]      ; set the odd bit
+    psubusb     mm1, mm4                ; decrease 1 from odd bytes
 
-	pavgb       mm2, mm1                ; mm2 = [xx xx d  c  b  f  h  j]
+    pavgb       mm2, mm1                ; mm2 = [xx xx d  c  b  f  h  j]
 
-	movq        mm4, mm0
-	pavgb       mm3, mm4                ; mm3 = [xx xx xx xx a  e  g  i]
-	punpcklbw   mm3, mm2                ; mm3 = [b  a  f  e  h  g  j  i]
+    movq        mm4, mm0
+    pavgb       mm3, mm4                ; mm3 = [xx xx xx xx a  e  g  i]
+    punpcklbw   mm3, mm2                ; mm3 = [b  a  f  e  h  g  j  i]
 
-	psrlq       mm2, 20h
-	psllq       mm2, 30h                ; mm2 = [d  c  0  0  0  0  0  0]
-	movq        mm4, mm3
-	psrlq       mm4, 10h                ; mm4 = [0  0  b  a  f  e  h  j]
-	pxor        mm2, mm4                ; mm2 = [d  c  b  a  xx xx xx xx]
-	psrlq       mm2, 20h                ; mm2 = [xx xx xx xx  d  c  b  a]
+    psrlq       mm2, 20h
+    psllq       mm2, 30h                ; mm2 = [d  c  0  0  0  0  0  0]
+    movq        mm4, mm3
+    psrlq       mm4, 10h                ; mm4 = [0  0  b  a  f  e  h  j]
+    pxor        mm2, mm4                ; mm2 = [d  c  b  a  xx xx xx xx]
+    psrlq       mm2, 20h                ; mm2 = [xx xx xx xx  d  c  b  a]
 
-	movd        [r0], mm2
-	movd        [r0+12], mm3
-	psrlq       mm3, 10h
-	movd        [r0+8], mm3
-	psrlq       mm3, 10h
-	movd        [r0+4], mm3
-	WELSEMMS
-	ret
+    movd        [r0], mm2
+    movd        [r0+12], mm3
+    psrlq       mm3, 10h
+    movd        [r0+8], mm3
+    psrlq       mm3, 10h
+    movd        [r0+4], mm3
+    WELSEMMS
+    ret
 
 ;***********************************************************************
-;	lt|t0|t1|t2|t3|
-;	l0|
-;	l1|
-;	l2|
-;	l3|
-;	t3 will never been used
+;   lt|t0|t1|t2|t3|
+;   l0|
+;   l1|
+;   l2|
+;   l3|
+;   t3 will never been used
 ;   destination:
-;	|a |b |c |d |
-;	|c |d |e |f |
-;	|e |f |g |g |
-;	|g |g |g |g |
+;   |a |b |c |d |
+;   |c |d |e |f |
+;   |e |f |g |g |
+;   |g |g |g |g |
 
 ;   a = (1 + l0 + l1)>>1
 ;   c = (1 + l1 + l2)>>1
@@ -756,70 +756,70 @@
 ;   void WelsI4x4LumaPredHU_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
 ;***********************************************************************
 WELS_EXTERN WelsI4x4LumaPredHU_mmx
-	%assign push_num 0
-	LOAD_3_PARA
-	SIGN_EXTENSION r2, r2d
-	movd        mm0, [r1-4]            ; mm0[3] = l0
-	punpcklbw   mm0, [r1+r2-4]        ; mm0[7] = l1, mm0[6] = l0
-	lea         r1, [r1+2*r2]
-	movd        mm2, [r1-4]            ; mm2[3] = l2
-	movd        mm4, [r1+r2-4]        ; mm4[3] = l3
-	punpcklbw   mm2, mm4
-	punpckhwd   mm0, mm2                ; mm0 = [l3 l2 l1 l0 xx xx xx xx]
+    %assign push_num 0
+    LOAD_3_PARA
+    SIGN_EXTENSION r2, r2d
+    movd        mm0, [r1-4]            ; mm0[3] = l0
+    punpcklbw   mm0, [r1+r2-4]        ; mm0[7] = l1, mm0[6] = l0
+    lea         r1, [r1+2*r2]
+    movd        mm2, [r1-4]            ; mm2[3] = l2
+    movd        mm4, [r1+r2-4]        ; mm4[3] = l3
+    punpcklbw   mm2, mm4
+    punpckhwd   mm0, mm2                ; mm0 = [l3 l2 l1 l0 xx xx xx xx]
 
-	psrlq       mm4, 18h
-	psllq       mm4, 38h                ; mm4 = [l3 xx xx xx xx xx xx xx]
-	psrlq       mm0, 8h
-	pxor        mm0, mm4                ; mm0 = [l3 l3 l2 l1 l0 xx xx xx]
+    psrlq       mm4, 18h
+    psllq       mm4, 38h                ; mm4 = [l3 xx xx xx xx xx xx xx]
+    psrlq       mm0, 8h
+    pxor        mm0, mm4                ; mm0 = [l3 l3 l2 l1 l0 xx xx xx]
 
-	movq        mm1, mm0
-	psllq       mm1, 8h                 ; mm1 = [l3 l2 l1 l0 xx xx xx xx]
-	movq        mm3, mm1                ; mm3 = [l3 l2 l1 l0 xx xx xx xx]
-	pavgb       mm1, mm0                ; mm1 = [g  e  c  a  xx xx xx xx]
+    movq        mm1, mm0
+    psllq       mm1, 8h                 ; mm1 = [l3 l2 l1 l0 xx xx xx xx]
+    movq        mm3, mm1                ; mm3 = [l3 l2 l1 l0 xx xx xx xx]
+    pavgb       mm1, mm0                ; mm1 = [g  e  c  a  xx xx xx xx]
 
-	movq        mm2, mm0
-	psllq       mm2, 10h                ; mm2 = [l2 l1 l0 xx xx xx xx xx]
-	movq        mm5, mm2
-	pavgb       mm2, mm0
+    movq        mm2, mm0
+    psllq       mm2, 10h                ; mm2 = [l2 l1 l0 xx xx xx xx xx]
+    movq        mm5, mm2
+    pavgb       mm2, mm0
 
-	pxor        mm5, mm0				; find odd value in the lowest bit of each byte
-	pand        mm5, [mmx_01bytes]	    ; set the odd bit
-	psubusb     mm2, mm5				; decrease 1 from odd bytes
+    pxor        mm5, mm0                ; find odd value in the lowest bit of each byte
+    pand        mm5, [mmx_01bytes]      ; set the odd bit
+    psubusb     mm2, mm5                ; decrease 1 from odd bytes
 
-	pavgb       mm2, mm3                ; mm2 = [f  d  b  xx xx xx xx xx]
+    pavgb       mm2, mm3                ; mm2 = [f  d  b  xx xx xx xx xx]
 
-	psrlq       mm2, 8h
-	pxor        mm2, mm4                ; mm2 = [g  f  d  b  xx xx xx xx]
+    psrlq       mm2, 8h
+    pxor        mm2, mm4                ; mm2 = [g  f  d  b  xx xx xx xx]
 
-	punpckhbw   mm1, mm2                ; mm1 = [g  g  f  e  d  c  b  a]
-	punpckhbw   mm4, mm4                ; mm4 = [g  g  xx xx xx xx xx xx]
-	punpckhbw   mm4, mm4                ; mm4 = [g  g  g  g  xx xx xx xx]
+    punpckhbw   mm1, mm2                ; mm1 = [g  g  f  e  d  c  b  a]
+    punpckhbw   mm4, mm4                ; mm4 = [g  g  xx xx xx xx xx xx]
+    punpckhbw   mm4, mm4                ; mm4 = [g  g  g  g  xx xx xx xx]
 
-	psrlq       mm4, 20h
-	movd        [r0+12], mm4
+    psrlq       mm4, 20h
+    movd        [r0+12], mm4
 
-	movd        [r0], mm1
-	psrlq       mm1, 10h
-	movd        [r0+4], mm1
-	psrlq       mm1, 10h
-	movd        [r0+8], mm1
-	WELSEMMS
-	ret
+    movd        [r0], mm1
+    psrlq       mm1, 10h
+    movd        [r0+4], mm1
+    psrlq       mm1, 10h
+    movd        [r0+8], mm1
+    WELSEMMS
+    ret
 
 
 
 ;***********************************************************************
-;	lt|t0|t1|t2|t3|
-;	l0|
-;	l1|
-;	l2|
-;	l3|
-;	l3 will never been used
+;   lt|t0|t1|t2|t3|
+;   l0|
+;   l1|
+;   l2|
+;   l3|
+;   l3 will never been used
 ;   destination:
-;	|a |b |c |d |
-;	|e |f |g |h |
-;	|i |a |b |c |
-;	|j |e |f |g |
+;   |a |b |c |d |
+;   |e |f |g |h |
+;   |i |a |b |c |
+;   |j |e |f |g |
 
 ;   a = (1 + lt + t0)>>1
 ;   b = (1 + t0 + t1)>>1
@@ -837,75 +837,75 @@
 ;   void WelsI4x4LumaPredVR_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
 ;***********************************************************************
 WELS_EXTERN WelsI4x4LumaPredVR_mmx
-	%assign push_num 0
-	LOAD_3_PARA
-	SIGN_EXTENSION r2, r2d
-	sub         r1, r2
-	movq        mm0, [r1-1]            ; mm0 = [xx xx xx t3 t2 t1 t0 lt]
-	psllq       mm0, 18h                ; mm0 = [t3 t2 t1 t0 lt xx xx xx]
+    %assign push_num 0
+    LOAD_3_PARA
+    SIGN_EXTENSION r2, r2d
+    sub         r1, r2
+    movq        mm0, [r1-1]            ; mm0 = [xx xx xx t3 t2 t1 t0 lt]
+    psllq       mm0, 18h                ; mm0 = [t3 t2 t1 t0 lt xx xx xx]
 
-	movd        mm1, [r1+2*r2-4]
-	punpcklbw   mm1, [r1+r2-4]        ; mm1[7] = l0, mm1[6] = l1
-	lea         r1, [r1+2*r2]
-	movq        mm2, [r1+r2-8]        ; mm2[7] = l2
-	punpckhwd   mm2, mm1                ; mm2 = [l0 l1 l2 xx xx xx xx xx]
-	psrlq       mm2, 28h
-	pxor        mm0, mm2                ; mm0 = [t3 t2 t1 t0 lt l0 l1 l2]
+    movd        mm1, [r1+2*r2-4]
+    punpcklbw   mm1, [r1+r2-4]        ; mm1[7] = l0, mm1[6] = l1
+    lea         r1, [r1+2*r2]
+    movq        mm2, [r1+r2-8]        ; mm2[7] = l2
+    punpckhwd   mm2, mm1                ; mm2 = [l0 l1 l2 xx xx xx xx xx]
+    psrlq       mm2, 28h
+    pxor        mm0, mm2                ; mm0 = [t3 t2 t1 t0 lt l0 l1 l2]
 
-	movq        mm1, mm0
-	psllq       mm1, 8h                 ; mm1 = [t2 t1 t0 lt l0 l1 l2 xx]
-	pavgb       mm1, mm0                ; mm1 = [d  c  b  a  xx xx xx xx]
+    movq        mm1, mm0
+    psllq       mm1, 8h                 ; mm1 = [t2 t1 t0 lt l0 l1 l2 xx]
+    pavgb       mm1, mm0                ; mm1 = [d  c  b  a  xx xx xx xx]
 
-	movq        mm2, mm0
-	psllq       mm2, 10h                ; mm2 = [t1 t0 lt l0 l1 l2 xx xx]
-	movq        mm3, mm2
-	pavgb       mm2, mm0
+    movq        mm2, mm0
+    psllq       mm2, 10h                ; mm2 = [t1 t0 lt l0 l1 l2 xx xx]
+    movq        mm3, mm2
+    pavgb       mm2, mm0
 
-	pxor        mm3, mm0				; find odd value in the lowest bit of each byte
-	pand        mm3, [mmx_01bytes]	    ; set the odd bit
-	psubusb     mm2, mm3				; decrease 1 from odd bytes
+    pxor        mm3, mm0                ; find odd value in the lowest bit of each byte
+    pand        mm3, [mmx_01bytes]      ; set the odd bit
+    psubusb     mm2, mm3                ; decrease 1 from odd bytes
 
-	movq        mm3, mm0
-	psllq       mm3, 8h                 ; mm3 = [t2 t1 t0 lt l0 l1 l2 xx]
-	pavgb       mm3, mm2                ; mm3 = [h  g  f  e  i  j  xx xx]
-	movq        mm2, mm3
+    movq        mm3, mm0
+    psllq       mm3, 8h                 ; mm3 = [t2 t1 t0 lt l0 l1 l2 xx]
+    pavgb       mm3, mm2                ; mm3 = [h  g  f  e  i  j  xx xx]
+    movq        mm2, mm3
 
-	psrlq       mm1, 20h                ; mm1 = [xx xx xx xx d  c  b  a]
-	movd        [r0], mm1
+    psrlq       mm1, 20h                ; mm1 = [xx xx xx xx d  c  b  a]
+    movd        [r0], mm1
 
-	psrlq       mm2, 20h                ; mm2 = [xx xx xx xx h  g  f  e]
-	movd        [r0+4], mm2
+    psrlq       mm2, 20h                ; mm2 = [xx xx xx xx h  g  f  e]
+    movd        [r0+4], mm2
 
-	movq        mm4, mm3
-	psllq       mm4, 20h
-	psrlq       mm4, 38h                ; mm4 = [xx xx xx xx xx xx xx i]
+    movq        mm4, mm3
+    psllq       mm4, 20h
+    psrlq       mm4, 38h                ; mm4 = [xx xx xx xx xx xx xx i]
 
-	movq        mm5, mm3
-	psllq       mm5, 28h
-	psrlq       mm5, 38h                ; mm5 = [xx xx xx xx xx xx xx j]
+    movq        mm5, mm3
+    psllq       mm5, 28h
+    psrlq       mm5, 38h                ; mm5 = [xx xx xx xx xx xx xx j]
 
-	psllq       mm1, 8h
-	pxor        mm4, mm1                ; mm4 = [xx xx xx xx c  b  a  i]
-	movd        [r0+8], mm4
+    psllq       mm1, 8h
+    pxor        mm4, mm1                ; mm4 = [xx xx xx xx c  b  a  i]
+    movd        [r0+8], mm4
 
-	psllq       mm2, 8h
-	pxor        mm5, mm2                ; mm5 = [xx xx xx xx g  f  e  j]
-	movd        [r0+12], mm5
-	WELSEMMS
-	ret
+    psllq       mm2, 8h
+    pxor        mm5, mm2                ; mm5 = [xx xx xx xx g  f  e  j]
+    movd        [r0+12], mm5
+    WELSEMMS
+    ret
 
 ;***********************************************************************
-;	lt|t0|t1|t2|t3|t4|t5|t6|t7
-;	l0|
-;	l1|
-;	l2|
-;	l3|
-;	lt,t0,t1,t2,t3 will never been used
+;   lt|t0|t1|t2|t3|t4|t5|t6|t7
+;   l0|
+;   l1|
+;   l2|
+;   l3|
+;   lt,t0,t1,t2,t3 will never been used
 ;   destination:
-;	|a |b |c |d |
-;	|b |c |d |e |
-;	|c |d |e |f |
-;	|d |e |f |g |
+;   |a |b |c |d |
+;   |b |c |d |e |
+;   |c |d |e |f |
+;   |d |e |f |g |
 
 ;   a = (2 + t0 + t2 + (t1<<1))>>2
 ;   b = (2 + t1 + t3 + (t2<<1))>>2
@@ -921,54 +921,54 @@
 ;   void WelsI4x4LumaPredDDL_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
 ;***********************************************************************
 WELS_EXTERN WelsI4x4LumaPredDDL_mmx
-	%assign push_num 0
-	LOAD_3_PARA
-	SIGN_EXTENSION r2, r2d
-	sub         r1, r2
-	movq        mm0, [r1]              ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
-	movq        mm1, mm0
-	movq        mm2, mm0
+    %assign push_num 0
+    LOAD_3_PARA
+    SIGN_EXTENSION r2, r2d
+    sub         r1, r2
+    movq        mm0, [r1]              ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
+    movq        mm1, mm0
+    movq        mm2, mm0
 
-	movq        mm3, mm0
-	psrlq       mm3, 38h
-	psllq       mm3, 38h                ; mm3 = [t7 xx xx xx xx xx xx xx]
+    movq        mm3, mm0
+    psrlq       mm3, 38h
+    psllq       mm3, 38h                ; mm3 = [t7 xx xx xx xx xx xx xx]
 
-	psllq       mm1, 8h                 ; mm1 = [t6 t5 t4 t3 t2 t1 t0 xx]
-	psrlq       mm2, 8h
-	pxor        mm2, mm3                ; mm2 = [t7 t7 t6 t5 t4 t3 t2 t1]
+    psllq       mm1, 8h                 ; mm1 = [t6 t5 t4 t3 t2 t1 t0 xx]
+    psrlq       mm2, 8h
+    pxor        mm2, mm3                ; mm2 = [t7 t7 t6 t5 t4 t3 t2 t1]
 
-	movq        mm3, mm1
-	pavgb       mm1, mm2
-	pxor        mm3, mm2				; find odd value in the lowest bit of each byte
-	pand        mm3, [mmx_01bytes]	    ; set the odd bit
-	psubusb     mm1, mm3				; decrease 1 from odd bytes
+    movq        mm3, mm1
+    pavgb       mm1, mm2
+    pxor        mm3, mm2                ; find odd value in the lowest bit of each byte
+    pand        mm3, [mmx_01bytes]      ; set the odd bit
+    psubusb     mm1, mm3                ; decrease 1 from odd bytes
 
-	pavgb       mm0, mm1                ; mm0 = [g f e d c b a xx]
+    pavgb       mm0, mm1                ; mm0 = [g f e d c b a xx]
 
-	psrlq       mm0, 8h
-	movd        [r0], mm0
-	psrlq       mm0, 8h
-	movd        [r0+4], mm0
-	psrlq       mm0, 8h
-	movd        [r0+8], mm0
-	psrlq       mm0, 8h
-	movd        [r0+12], mm0
-	WELSEMMS
-	ret
+    psrlq       mm0, 8h
+    movd        [r0], mm0
+    psrlq       mm0, 8h
+    movd        [r0+4], mm0
+    psrlq       mm0, 8h
+    movd        [r0+8], mm0
+    psrlq       mm0, 8h
+    movd        [r0+12], mm0
+    WELSEMMS
+    ret
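
; Editor's note: the down-left predictor uses only t0..t7 and the 3-tap filter from the
; comments above (a = (2 + t0 + t2 + 2*t1) >> 2, and so on), with row y of the output
; starting at filtered sample y and t7 repeated past the end. A scalar sketch, for
; illustration only:
;
;     #include <stdint.h>
;
;     void I4x4DdlRef(uint8_t *pred, const uint8_t *pRef, int32_t stride) {
;         const uint8_t *t = pRef - stride;              /* t[0..7] */
;         uint8_t f[7];
;         for (int i = 0; i < 7; i++) {
;             int right = (i == 6) ? t[7] : t[i + 2];    /* clamp to t7 at the edge */
;             f[i] = (uint8_t)((t[i] + 2 * t[i + 1] + right + 2) >> 2);
;         }
;         for (int y = 0; y < 4; y++)
;             for (int x = 0; x < 4; x++)
;                 pred[y * 4 + x] = f[y + x];
;     }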
 
 
 ;***********************************************************************
-;	lt|t0|t1|t2|t3|t4|t5|t6|t7
-;	l0|
-;	l1|
-;	l2|
-;	l3|
-;	lt,t0,t1,t2,t3 will never been used
+;   lt|t0|t1|t2|t3|t4|t5|t6|t7
+;   l0|
+;   l1|
+;   l2|
+;   l3|
+;   lt,t0,t1,t2,t3 will never been used
 ;   destination:
-;	|a |b |c |d |
-;	|e |f |g |h |
-;	|b |c |d |i |
-;	|f |g |h |j |
+;   |a |b |c |d |
+;   |e |f |g |h |
+;   |b |c |d |i |
+;   |f |g |h |j |
 
 ;   a = (1 + t0 + t1)>>1
 ;   b = (1 + t1 + t2)>>1
@@ -987,37 +987,37 @@
 ;   void WelsI4x4LumaPredVL_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
 ;***********************************************************************
 WELS_EXTERN WelsI4x4LumaPredVL_mmx
-	%assign push_num 0
-	LOAD_3_PARA
-	SIGN_EXTENSION r2, r2d
-	sub         r1, r2
-	movq        mm0, [r1]              ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
-	movq        mm1, mm0
-	movq        mm2, mm0
+    %assign push_num 0
+    LOAD_3_PARA
+    SIGN_EXTENSION r2, r2d
+    sub         r1, r2
+    movq        mm0, [r1]              ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
+    movq        mm1, mm0
+    movq        mm2, mm0
 
-	psrlq       mm1, 8h                 ; mm1 = [xx t7 t6 t5 t4 t3 t2 t1]
-	psrlq       mm2, 10h                ; mm2 = [xx xx t7 t6 t5 t4 t3 t2]
+    psrlq       mm1, 8h                 ; mm1 = [xx t7 t6 t5 t4 t3 t2 t1]
+    psrlq       mm2, 10h                ; mm2 = [xx xx t7 t6 t5 t4 t3 t2]
 
-	movq        mm3, mm1
-	pavgb       mm3, mm0                ; mm3 = [xx xx xx i  d  c  b  a]
+    movq        mm3, mm1
+    pavgb       mm3, mm0                ; mm3 = [xx xx xx i  d  c  b  a]
 
-	movq        mm4, mm2
-	pavgb       mm2, mm0
-	pxor        mm4, mm0				; find odd value in the lowest bit of each byte
-	pand        mm4, [mmx_01bytes]	    ; set the odd bit
-	psubusb     mm2, mm4				; decrease 1 from odd bytes
+    movq        mm4, mm2
+    pavgb       mm2, mm0
+    pxor        mm4, mm0                ; find odd value in the lowest bit of each byte
+    pand        mm4, [mmx_01bytes]      ; set the odd bit
+    psubusb     mm2, mm4                ; decrease 1 from odd bytes
 
-	pavgb       mm2, mm1                ; mm2 = [xx xx xx j  h  g  f  e]
+    pavgb       mm2, mm1                ; mm2 = [xx xx xx j  h  g  f  e]
 
-	movd        [r0], mm3
-	psrlq       mm3, 8h
-	movd        [r0+8], mm3
+    movd        [r0], mm3
+    psrlq       mm3, 8h
+    movd        [r0+8], mm3
 
-	movd        [r0+4], mm2
-	psrlq       mm2, 8h
-	movd        [r0+12], mm2
-	WELSEMMS
-	ret
+    movd        [r0+4], mm2
+    psrlq       mm2, 8h
+    movd        [r0+12], mm2
+    WELSEMMS
+    ret
 
 ;***********************************************************************
 ;
@@ -1024,88 +1024,88 @@
 ;   void WelsIChromaPredDc_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
 ;***********************************************************************
 WELS_EXTERN WelsIChromaPredDc_sse2
-	push r3
-	push r4
-	%assign push_num 2
-	LOAD_3_PARA
-	SIGN_EXTENSION r2, r2d
-	sub         r1, r2
-	movq        mm0, [r1]
+    push r3
+    push r4
+    %assign push_num 2
+    LOAD_3_PARA
+    SIGN_EXTENSION r2, r2d
+    sub         r1, r2
+    movq        mm0, [r1]
 
-	movzx		r3, byte [r1+r2-0x01] ; l1
-	lea         	r1, [r1+2*r2]
-	movzx		r4, byte [r1-0x01]     ; l2
-	add		r3, r4
-	movzx		r4, byte [r1+r2-0x01] ; l3
-	add		r3, r4
-	lea         	r1, [r1+2*r2]
-	movzx		r4, byte [r1-0x01]     ; l4
-	add		r3, r4
-	movd        	mm1, r3d                 ; mm1 = l1+l2+l3+l4
+    movzx       r3, byte [r1+r2-0x01] ; l1
+    lea             r1, [r1+2*r2]
+    movzx       r4, byte [r1-0x01]     ; l2
+    add     r3, r4
+    movzx       r4, byte [r1+r2-0x01] ; l3
+    add     r3, r4
+    lea             r1, [r1+2*r2]
+    movzx       r4, byte [r1-0x01]     ; l4
+    add     r3, r4
+    movd            mm1, r3d                 ; mm1 = l1+l2+l3+l4
 
-	movzx		r3, byte [r1+r2-0x01] ; l5
-	lea         	r1, [r1+2*r2]
-	movzx		r4, byte [r1-0x01]     ; l6
-	add		r3, r4
-	movzx		r4, byte [r1+r2-0x01] ; l7
-	add		r3, r4
-	lea         	r1, [r1+2*r2]
-	movzx		r4, byte [r1-0x01]     ; l8
-	add		r3, r4
-	movd        	mm2, r3d                 ; mm2 = l5+l6+l7+l8
+    movzx       r3, byte [r1+r2-0x01] ; l5
+    lea             r1, [r1+2*r2]
+    movzx       r4, byte [r1-0x01]     ; l6
+    add     r3, r4
+    movzx       r4, byte [r1+r2-0x01] ; l7
+    add     r3, r4
+    lea             r1, [r1+2*r2]
+    movzx       r4, byte [r1-0x01]     ; l8
+    add     r3, r4
+    movd            mm2, r3d                 ; mm2 = l5+l6+l7+l8
 
-	movq        mm3, mm0
-	psrlq       mm0, 0x20
-	psllq       mm3, 0x20
-	psrlq       mm3, 0x20
-	pxor		mm4, mm4
-	psadbw		mm0, mm4
-	psadbw		mm3, mm4                 ; sum1 = mm3+mm1, sum2 = mm0, sum3 = mm2
+    movq        mm3, mm0
+    psrlq       mm0, 0x20
+    psllq       mm3, 0x20
+    psrlq       mm3, 0x20
+    pxor        mm4, mm4
+    psadbw      mm0, mm4
+    psadbw      mm3, mm4                 ; sum1 = mm3+mm1, sum2 = mm0, sum3 = mm2
 
-	paddq       mm3, mm1
-	movq        mm1, mm2
-	paddq       mm1, mm0;                ; sum1 = mm3, sum2 = mm0, sum3 = mm2, sum4 = mm1
+    paddq       mm3, mm1
+    movq        mm1, mm2
+    paddq       mm1, mm0;                ; sum1 = mm3, sum2 = mm0, sum3 = mm2, sum4 = mm1
 
-	movq        mm4, [mmx_0x02]
+    movq        mm4, [mmx_0x02]
 
-	paddq       mm0, mm4
-	psrlq       mm0, 0x02
+    paddq       mm0, mm4
+    psrlq       mm0, 0x02
 
-	paddq       mm2, mm4
-	psrlq       mm2, 0x02
+    paddq       mm2, mm4
+    psrlq       mm2, 0x02
 
-	paddq       mm3, mm4
-	paddq       mm3, mm4
-	psrlq       mm3, 0x03
+    paddq       mm3, mm4
+    paddq       mm3, mm4
+    psrlq       mm3, 0x03
 
-	paddq       mm1, mm4
-	paddq       mm1, mm4
-	psrlq       mm1, 0x03
+    paddq       mm1, mm4
+    paddq       mm1, mm4
+    psrlq       mm1, 0x03
 
-	pmuludq     mm0, [mmx_01bytes]
-	pmuludq     mm3, [mmx_01bytes]
-	psllq       mm0, 0x20
-	pxor        mm0, mm3                 ; mm0 = m_up
+    pmuludq     mm0, [mmx_01bytes]
+    pmuludq     mm3, [mmx_01bytes]
+    psllq       mm0, 0x20
+    pxor        mm0, mm3                 ; mm0 = m_up
 
-	pmuludq     mm2, [mmx_01bytes]
-	pmuludq     mm1, [mmx_01bytes]
-	psllq       mm1, 0x20
-	pxor        mm1, mm2                 ; mm2 = m_down
+    pmuludq     mm2, [mmx_01bytes]
+    pmuludq     mm1, [mmx_01bytes]
+    psllq       mm1, 0x20
+    pxor        mm1, mm2                 ; mm2 = m_down
 
-	movq        [r0], mm0
-	movq        [r0+0x08], mm0
-	movq        [r0+0x10], mm0
-	movq        [r0+0x18], mm0
+    movq        [r0], mm0
+    movq        [r0+0x08], mm0
+    movq        [r0+0x10], mm0
+    movq        [r0+0x18], mm0
 
-	movq        [r0+0x20], mm1
-	movq        [r0+0x28], mm1
-	movq        [r0+0x30], mm1
-	movq        [r0+0x38], mm1
+    movq        [r0+0x20], mm1
+    movq        [r0+0x28], mm1
+    movq        [r0+0x30], mm1
+    movq        [r0+0x38], mm1
 
-	pop r4
-	pop r3
-	WELSEMMS
-	ret
+    pop r4
+    pop r3
+    WELSEMMS
+    ret
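
; Editor's note: the chroma DC routine produces one DC value per 4x4 quadrant. The
; top-left and bottom-right quadrants average eight neighbours with (sum + 4) >> 3,
; while the top-right and bottom-left use only their four top or left neighbours with
; (sum + 2) >> 2, matching the m_up / m_down halves stored above. A scalar sketch, for
; illustration only (pred stored as 8 contiguous rows of 8 bytes):
;
;     #include <stdint.h>
;
;     void IChromaDcRef(uint8_t *pred, const uint8_t *pRef, int32_t stride) {
;         const uint8_t *top  = pRef - stride;
;         const uint8_t *left = pRef - 1;
;         int tl = 0, tr = 0, bl = 0, br = 0;
;         for (int i = 0; i < 4; i++) {
;             tl += top[i]     + left[i * stride];
;             tr += top[4 + i];
;             bl += left[(4 + i) * stride];
;             br += top[4 + i] + left[(4 + i) * stride];
;         }
;         uint8_t dc[4] = {
;             (uint8_t)((tl + 4) >> 3), (uint8_t)((tr + 2) >> 2),
;             (uint8_t)((bl + 2) >> 2), (uint8_t)((br + 4) >> 3),
;         };
;         for (int y = 0; y < 8; y++)
;             for (int x = 0; x < 8; x++)
;                 pred[y * 8 + x] = dc[(y >> 2) * 2 + (x >> 2)];
;     }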
 
 
 
@@ -1114,56 +1114,56 @@
 ;   void WelsI16x16LumaPredDc_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
 ;***********************************************************************
 WELS_EXTERN WelsI16x16LumaPredDc_sse2
-	push r3
-	push r4
-	%assign push_num 2
-	LOAD_3_PARA
-	SIGN_EXTENSION r2, r2d
-	sub         r1, r2
-	movdqa      xmm0, [r1]             ; read one row
-	pxor		xmm1, xmm1
-	psadbw		xmm0, xmm1
-	movdqa      xmm1, xmm0
-	psrldq      xmm1, 0x08
-	pslldq      xmm0, 0x08
-	psrldq      xmm0, 0x08
-	paddw       xmm0, xmm1
+    push r3
+    push r4
+    %assign push_num 2
+    LOAD_3_PARA
+    SIGN_EXTENSION r2, r2d
+    sub         r1, r2
+    movdqa      xmm0, [r1]             ; read one row
+    pxor        xmm1, xmm1
+    psadbw      xmm0, xmm1
+    movdqa      xmm1, xmm0
+    psrldq      xmm1, 0x08
+    pslldq      xmm0, 0x08
+    psrldq      xmm0, 0x08
+    paddw       xmm0, xmm1
 
-	movzx		r3, byte [r1+r2-0x01]
-	movzx		r4, byte [r1+2*r2-0x01]
-	add		r3, r4
-	lea         r1, [r1+r2]
-	LOAD_2_LEFT_AND_ADD
-	LOAD_2_LEFT_AND_ADD
-	LOAD_2_LEFT_AND_ADD
-	LOAD_2_LEFT_AND_ADD
-	LOAD_2_LEFT_AND_ADD
-	LOAD_2_LEFT_AND_ADD
-	LOAD_2_LEFT_AND_ADD
-	add         r3, 0x10
-	movd        xmm1, r3d
-	paddw       xmm0, xmm1
-	psrld       xmm0, 0x05
-	pmuludq     xmm0, [mmx_01bytes]
-	pshufd      xmm0, xmm0, 0
+    movzx       r3, byte [r1+r2-0x01]
+    movzx       r4, byte [r1+2*r2-0x01]
+    add     r3, r4
+    lea         r1, [r1+r2]
+    LOAD_2_LEFT_AND_ADD
+    LOAD_2_LEFT_AND_ADD
+    LOAD_2_LEFT_AND_ADD
+    LOAD_2_LEFT_AND_ADD
+    LOAD_2_LEFT_AND_ADD
+    LOAD_2_LEFT_AND_ADD
+    LOAD_2_LEFT_AND_ADD
+    add         r3, 0x10
+    movd        xmm1, r3d
+    paddw       xmm0, xmm1
+    psrld       xmm0, 0x05
+    pmuludq     xmm0, [mmx_01bytes]
+    pshufd      xmm0, xmm0, 0
 
-	movdqa      [r0], xmm0
-	movdqa      [r0+0x10], xmm0
-	movdqa      [r0+0x20], xmm0
-	movdqa      [r0+0x30], xmm0
-	movdqa      [r0+0x40], xmm0
-	movdqa      [r0+0x50], xmm0
-	movdqa      [r0+0x60], xmm0
-	movdqa      [r0+0x70], xmm0
-	movdqa      [r0+0x80], xmm0
-	movdqa      [r0+0x90], xmm0
-	movdqa      [r0+0xa0], xmm0
-	movdqa      [r0+0xb0], xmm0
-	movdqa      [r0+0xc0], xmm0
-	movdqa      [r0+0xd0], xmm0
-	movdqa      [r0+0xe0], xmm0
-	movdqa      [r0+0xf0], xmm0
+    movdqa      [r0], xmm0
+    movdqa      [r0+0x10], xmm0
+    movdqa      [r0+0x20], xmm0
+    movdqa      [r0+0x30], xmm0
+    movdqa      [r0+0x40], xmm0
+    movdqa      [r0+0x50], xmm0
+    movdqa      [r0+0x60], xmm0
+    movdqa      [r0+0x70], xmm0
+    movdqa      [r0+0x80], xmm0
+    movdqa      [r0+0x90], xmm0
+    movdqa      [r0+0xa0], xmm0
+    movdqa      [r0+0xb0], xmm0
+    movdqa      [r0+0xc0], xmm0
+    movdqa      [r0+0xd0], xmm0
+    movdqa      [r0+0xe0], xmm0
+    movdqa      [r0+0xf0], xmm0
 
-	pop r4
-	pop r3
-	ret
\ No newline at end of file
+    pop r4
+    pop r3
+    ret
\ No newline at end of file
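
; Editor's note: the 16x16 luma DC above sums the 16 top neighbours with psadbw,
; accumulates the 16 left neighbours two at a time through LOAD_2_LEFT_AND_ADD, then
; takes (sum + 16) >> 5. A scalar sketch, for illustration only (pred stored as 16
; contiguous rows of 16 bytes):
;
;     #include <stdint.h>
;     #include <string.h>
;
;     void I16x16DcRef(uint8_t *pred, const uint8_t *pRef, int32_t stride) {
;         const uint8_t *top  = pRef - stride;
;         const uint8_t *left = pRef - 1;
;         int sum = 16;                                  /* rounding term */
;         for (int i = 0; i < 16; i++)
;             sum += top[i] + left[i * stride];
;         memset(pred, (uint8_t)(sum >> 5), 256);
;     }
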
--- a/codec/encoder/core/x86/matrix_transpose.asm
+++ b/codec/encoder/core/x86/matrix_transpose.asm
@@ -34,153 +34,153 @@
 ;in:  m0, m1, m2, m3, m4, m5, m6, m7
 ;out: m0, m3, m5, m2, m7, m1, m6, m4
 %macro TRANSPOSE_8x8B_MMX 10
-	MMX_XSwap bw,  %1, %2, %8
-	MMX_XSwap bw,  %3, %4, %2
-	MMX_XSwap bw,  %5, %6, %4
-	movq	%6, %9
-	movq	%10, %4
-	MMX_XSwap bw,  %7, %6, %4
+    MMX_XSwap bw,  %1, %2, %8
+    MMX_XSwap bw,  %3, %4, %2
+    MMX_XSwap bw,  %5, %6, %4
+    movq    %6, %9
+    movq    %10, %4
+    MMX_XSwap bw,  %7, %6, %4
 
-	MMX_XSwap wd,  %1, %3, %6
-	MMX_XSwap wd,  %8, %2, %3
-	MMX_XSwap wd,  %5, %7, %2
-	movq	%7, %10
-	movq	%10, %3
-	MMX_XSwap wd,  %7, %4, %3
+    MMX_XSwap wd,  %1, %3, %6
+    MMX_XSwap wd,  %8, %2, %3
+    MMX_XSwap wd,  %5, %7, %2
+    movq    %7, %10
+    movq    %10, %3
+    MMX_XSwap wd,  %7, %4, %3
 
-	MMX_XSwap dq,  %1, %5, %4
-	MMX_XSwap dq,  %6, %2, %5
-	MMX_XSwap dq,  %8, %7, %2
-	movq	%7, %10
-	movq	%10, %5
-	MMX_XSwap dq,  %7, %3, %5
+    MMX_XSwap dq,  %1, %5, %4
+    MMX_XSwap dq,  %6, %2, %5
+    MMX_XSwap dq,  %8, %7, %2
+    movq    %7, %10
+    movq    %10, %5
+    MMX_XSwap dq,  %7, %3, %5
 
-	movq	%3, %10
+    movq    %3, %10
 %endmacro
 
 ;in: m0, m3, m5, m2, m7, m1, m6, m4
-%macro TRANSPOSE8x8_WRITE_MMX 2	; dst, dst_stride
-	movq [%1], mm0			; result of line 1, x8 bytes
-	movq [%1+%2], mm3		; result of line 2
-	lea %1, [%1+2*%2]
-	movq [%1], mm5			; result of line 3
-	movq [%1+%2], mm2		; result of line 4
-	lea %1, [%1+2*%2]
-	movq [%1], mm7			; result of line 5
-	movq [%1+%2], mm1		; result of line 6
-	lea %1, [%1+2*%2]
-	movq [%1], mm6			; result of line 7
-	movq [%1+%2], mm4		; result of line 8
+%macro TRANSPOSE8x8_WRITE_MMX 2 ; dst, dst_stride
+    movq [%1], mm0          ; result of line 1, x8 bytes
+    movq [%1+%2], mm3       ; result of line 2
+    lea %1, [%1+2*%2]
+    movq [%1], mm5          ; result of line 3
+    movq [%1+%2], mm2       ; result of line 4
+    lea %1, [%1+2*%2]
+    movq [%1], mm7          ; result of line 5
+    movq [%1+%2], mm1       ; result of line 6
+    lea %1, [%1+2*%2]
+    movq [%1], mm6          ; result of line 7
+    movq [%1+%2], mm4       ; result of line 8
 %endmacro
 
 ;in: m0, m3, m5, m2, m7, m1, m6, m4
-%macro TRANSPOSE8x8_WRITE_ALT_MMX 3	; dst, dst_stride, reg32
-	movq [%1], mm0			; result of line 1, x8 bytes
-	movq [%1+%2], mm3		; result of line 2
-	lea %3, [%1+2*%2]
-	movq [%3], mm5			; result of line 3
-	movq [%3+%2], mm2		; result of line 4
-	lea %3, [%3+2*%2]
-	movq [%3], mm7			; result of line 5
-	movq [%3+%2], mm1		; result of line 6
-	lea %3, [%3+2*%2]
-	movq [%3], mm6			; result of line 7
-	movq [%3+%2], mm4		; result of line 8
-%endmacro	; end of TRANSPOSE8x8_WRITE_ALT_MMX
+%macro TRANSPOSE8x8_WRITE_ALT_MMX 3 ; dst, dst_stride, reg32
+    movq [%1], mm0          ; result of line 1, x8 bytes
+    movq [%1+%2], mm3       ; result of line 2
+    lea %3, [%1+2*%2]
+    movq [%3], mm5          ; result of line 3
+    movq [%3+%2], mm2       ; result of line 4
+    lea %3, [%3+2*%2]
+    movq [%3], mm7          ; result of line 5
+    movq [%3+%2], mm1       ; result of line 6
+    lea %3, [%3+2*%2]
+    movq [%3], mm6          ; result of line 7
+    movq [%3+%2], mm4       ; result of line 8
+%endmacro   ; end of TRANSPOSE8x8_WRITE_ALT_MMX
 
 ; for transpose 16x8
 
 ;in:  m0, m1, m2, m3, m4, m5, m6, m7
 ;out: m4, m2, m3, m7, m5, m1, m6, m0
-%macro TRANSPOSE_8x16B_SSE2		10
-	SSE2_XSawp bw,  %1, %2, %8
-	SSE2_XSawp bw,  %3, %4, %2
-	SSE2_XSawp bw,  %5, %6, %4
-	movdqa	%6, %9
-	movdqa	%10, %4
-	SSE2_XSawp bw,  %7, %6, %4
+%macro TRANSPOSE_8x16B_SSE2     10
+    SSE2_XSawp bw,  %1, %2, %8
+    SSE2_XSawp bw,  %3, %4, %2
+    SSE2_XSawp bw,  %5, %6, %4
+    movdqa  %6, %9
+    movdqa  %10, %4
+    SSE2_XSawp bw,  %7, %6, %4
 
-	SSE2_XSawp wd,  %1, %3, %6
-	SSE2_XSawp wd,  %8, %2, %3
-	SSE2_XSawp wd,  %5, %7, %2
-	movdqa	%7, %10
-	movdqa	%10, %3
-	SSE2_XSawp wd,  %7, %4, %3
+    SSE2_XSawp wd,  %1, %3, %6
+    SSE2_XSawp wd,  %8, %2, %3
+    SSE2_XSawp wd,  %5, %7, %2
+    movdqa  %7, %10
+    movdqa  %10, %3
+    SSE2_XSawp wd,  %7, %4, %3
 
-	SSE2_XSawp dq,  %1, %5, %4
-	SSE2_XSawp dq,  %6, %2, %5
-	SSE2_XSawp dq,  %8, %7, %2
-	movdqa	%7, %10
-	movdqa	%10, %5
-	SSE2_XSawp dq,  %7, %3, %5
+    SSE2_XSawp dq,  %1, %5, %4
+    SSE2_XSawp dq,  %6, %2, %5
+    SSE2_XSawp dq,  %8, %7, %2
+    movdqa  %7, %10
+    movdqa  %10, %5
+    SSE2_XSawp dq,  %7, %3, %5
 
-	SSE2_XSawp qdq,  %1, %8, %3
-	SSE2_XSawp qdq,  %4, %2, %8
-	SSE2_XSawp qdq,  %6, %7, %2
-	movdqa	%7, %10
-	movdqa	%10, %1
-	SSE2_XSawp qdq,  %7, %5, %1
-	movdqa	%5, %10
-%endmacro	; end of TRANSPOSE_8x16B_SSE2
+    SSE2_XSawp qdq,  %1, %8, %3
+    SSE2_XSawp qdq,  %4, %2, %8
+    SSE2_XSawp qdq,  %6, %7, %2
+    movdqa  %7, %10
+    movdqa  %10, %1
+    SSE2_XSawp qdq,  %7, %5, %1
+    movdqa  %5, %10
+%endmacro   ; end of TRANSPOSE_8x16B_SSE2
 
 
-%macro TRANSPOSE8x16_WRITE_SSE2	2	; dst, dst_stride
-	movq [%1], xmm4			; result of line 1, x8 bytes
-	movq [%1+%2], xmm2		; result of line 2
-	lea %1, [%1+2*%2]
-	movq [%1], xmm3			; result of line 3
-	movq [%1+%2], xmm7		; result of line 4
+%macro TRANSPOSE8x16_WRITE_SSE2 2   ; dst, dst_stride
+    movq [%1], xmm4         ; result of line 1, x8 bytes
+    movq [%1+%2], xmm2      ; result of line 2
+    lea %1, [%1+2*%2]
+    movq [%1], xmm3         ; result of line 3
+    movq [%1+%2], xmm7      ; result of line 4
 
-	lea %1, [%1+2*%2]
-	movq [%1], xmm5			; result of line 5
-	movq [%1+%2], xmm1		; result of line 6
-	lea %1, [%1+2*%2]
-	movq [%1], xmm6			; result of line 7
-	movq [%1+%2], xmm0		; result of line 8
+    lea %1, [%1+2*%2]
+    movq [%1], xmm5         ; result of line 5
+    movq [%1+%2], xmm1      ; result of line 6
+    lea %1, [%1+2*%2]
+    movq [%1], xmm6         ; result of line 7
+    movq [%1+%2], xmm0      ; result of line 8
 
-	lea %1, [%1+2*%2]
-	movhpd [%1], xmm4		; result of line 9
-	movhpd [%1+%2], xmm2	; result of line 10
-	lea %1, [%1+2*%2]
-	movhpd [%1], xmm3		; result of line 11
-	movhpd [%1+%2], xmm7	; result of line 12
+    lea %1, [%1+2*%2]
+    movhpd [%1], xmm4       ; result of line 9
+    movhpd [%1+%2], xmm2    ; result of line 10
+    lea %1, [%1+2*%2]
+    movhpd [%1], xmm3       ; result of line 11
+    movhpd [%1+%2], xmm7    ; result of line 12
 
-	lea %1, [%1+2*%2]
-	movhpd [%1], xmm5		; result of line 13
-	movhpd [%1+%2], xmm1	; result of line 14
-	lea %1, [%1+2*%2]
-	movhpd [%1], xmm6		; result of line 15
-	movhpd [%1+%2], xmm0	; result of line 16
-%endmacro	; end of TRANSPOSE_WRITE_RESULT_SSE2
+    lea %1, [%1+2*%2]
+    movhpd [%1], xmm5       ; result of line 13
+    movhpd [%1+%2], xmm1    ; result of line 14
+    lea %1, [%1+2*%2]
+    movhpd [%1], xmm6       ; result of line 15
+    movhpd [%1+%2], xmm0    ; result of line 16
+%endmacro   ; end of TRANSPOSE_WRITE_RESULT_SSE2
 
-%macro TRANSPOSE8x16_WRITE_ALT_SSE2	3	; dst, dst_stride, reg32
-	movq [%1], xmm4			; result of line 1, x8 bytes
-	movq [%1+%2], xmm2		; result of line 2
-	lea %3, [%1+2*%2]
-	movq [%3], xmm3			; result of line 3
-	movq [%3+%2], xmm7		; result of line 4
+%macro TRANSPOSE8x16_WRITE_ALT_SSE2 3   ; dst, dst_stride, reg32
+    movq [%1], xmm4         ; result of line 1, x8 bytes
+    movq [%1+%2], xmm2      ; result of line 2
+    lea %3, [%1+2*%2]
+    movq [%3], xmm3         ; result of line 3
+    movq [%3+%2], xmm7      ; result of line 4
 
-	lea %3, [%3+2*%2]
-	movq [%3], xmm5			; result of line 5
-	movq [%3+%2], xmm1		; result of line 6
-	lea %3, [%3+2*%2]
-	movq [%3], xmm6			; result of line 7
-	movq [%3+%2], xmm0		; result of line 8
+    lea %3, [%3+2*%2]
+    movq [%3], xmm5         ; result of line 5
+    movq [%3+%2], xmm1      ; result of line 6
+    lea %3, [%3+2*%2]
+    movq [%3], xmm6         ; result of line 7
+    movq [%3+%2], xmm0      ; result of line 8
 
-	lea %3, [%3+2*%2]
-	movhpd [%3], xmm4		; result of line 9
-	movhpd [%3+%2], xmm2	; result of line 10
-	lea %3, [%3+2*%2]
-	movhpd [%3], xmm3		; result of line 11
-	movhpd [%3+%2], xmm7	; result of line 12
+    lea %3, [%3+2*%2]
+    movhpd [%3], xmm4       ; result of line 9
+    movhpd [%3+%2], xmm2    ; result of line 10
+    lea %3, [%3+2*%2]
+    movhpd [%3], xmm3       ; result of line 11
+    movhpd [%3+%2], xmm7    ; result of line 12
 
-	lea %3, [%3+2*%2]
-	movhpd [%3], xmm5		; result of line 13
-	movhpd [%3+%2], xmm1	; result of line 14
-	lea %3, [%3+2*%2]
-	movhpd [%3], xmm6		; result of line 15
-	movhpd [%3+%2], xmm0	; result of line 16
-%endmacro	; end of TRANSPOSE8x16_WRITE_ALT_SSE2
+    lea %3, [%3+2*%2]
+    movhpd [%3], xmm5       ; result of line 13
+    movhpd [%3+%2], xmm1    ; result of line 14
+    lea %3, [%3+2*%2]
+    movhpd [%3], xmm6       ; result of line 15
+    movhpd [%3+%2], xmm0    ; result of line 16
+%endmacro   ; end of TRANSPOSE8x16_WRITE_ALT_SSE2
 
 
 SECTION .text
@@ -187,209 +187,209 @@
 
 WELS_EXTERN TransposeMatrixBlock16x16_sse2
 ; void TransposeMatrixBlock16x16_sse2( void *dst/*16x16*/, const int32_t dst_stride, void *src/*16x16*/, const int32_t src_stride );
-	push r4
-	push r5
-	%assign push_num 2
-	LOAD_4_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION	r1, r1d
-	SIGN_EXTENSION	r3, r3d
+    push r4
+    push r5
+    %assign push_num 2
+    LOAD_4_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
 
-	mov r4, r7
-	and r4, 0Fh
-	sub r7, 10h
-	sub r7, r4
-	lea r5, [r3+r3*2]
-	; top 8x16 block
-	movdqa xmm0, [r2]
-	movdqa xmm1, [r2+r3]
-	movdqa xmm2, [r2+r3*2]
-	movdqa xmm3, [r2+r5]
-	lea r2, [r2+r3*4]
-	movdqa xmm4, [r2]
-	movdqa xmm5, [r2+r3]
-	movdqa xmm6, [r2+r3*2]
+    mov r4, r7
+    and r4, 0Fh
+    sub r7, 10h
+    sub r7, r4
+    lea r5, [r3+r3*2]
+    ; top 8x16 block
+    movdqa xmm0, [r2]
+    movdqa xmm1, [r2+r3]
+    movdqa xmm2, [r2+r3*2]
+    movdqa xmm3, [r2+r5]
+    lea r2, [r2+r3*4]
+    movdqa xmm4, [r2]
+    movdqa xmm5, [r2+r3]
+    movdqa xmm6, [r2+r3*2]
 
-	;in:  m0, m1, m2, m3, m4, m5, m6, m7
-	;out: m4, m2, m3, m7, m5, m1, m6, m0
-	TRANSPOSE_8x16B_SSE2	xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r5], [r7]
+    ;in:  m0, m1, m2, m3, m4, m5, m6, m7
+    ;out: m4, m2, m3, m7, m5, m1, m6, m0
+    TRANSPOSE_8x16B_SSE2    xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r5], [r7]
 
-	TRANSPOSE8x16_WRITE_SSE2		r0, r1
+    TRANSPOSE8x16_WRITE_SSE2        r0, r1
 
-	; bottom 8x16 block
-	lea	r2, [r2+r3*4]
-	movdqa xmm0, [r2]
-	movdqa xmm1, [r2+r3]
-	movdqa xmm2, [r2+r3*2]
-	movdqa xmm3, [r2+r5]
-	lea r2, [r2+r3*4]
-	movdqa xmm4, [r2]
-	movdqa xmm5, [r2+r3]
-	movdqa xmm6, [r2+r3*2]
+    ; bottom 8x16 block
+    lea r2, [r2+r3*4]
+    movdqa xmm0, [r2]
+    movdqa xmm1, [r2+r3]
+    movdqa xmm2, [r2+r3*2]
+    movdqa xmm3, [r2+r5]
+    lea r2, [r2+r3*4]
+    movdqa xmm4, [r2]
+    movdqa xmm5, [r2+r3]
+    movdqa xmm6, [r2+r3*2]
 
-	;in:  m0, m1, m2, m3, m4, m5, m6, m7
-	;out: m4, m2, m3, m7, m5, m1, m6, m0
-	TRANSPOSE_8x16B_SSE2	xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r5], [r7]
+    ;in:  m0, m1, m2, m3, m4, m5, m6, m7
+    ;out: m4, m2, m3, m7, m5, m1, m6, m0
+    TRANSPOSE_8x16B_SSE2    xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r5], [r7]
 
-	mov r5, r1
-	sal r5, 4
-	sub r0, r5
-	lea r0, [r0+r1*2+8]
-	TRANSPOSE8x16_WRITE_SSE2		r0, r1
+    mov r5, r1
+    sal r5, 4
+    sub r0, r5
+    lea r0, [r0+r1*2+8]
+    TRANSPOSE8x16_WRITE_SSE2        r0, r1
 
-	add r7, r4
-	add r7, 10h
-	POP_XMM
-	LOAD_4_PARA_POP
-	pop r5
-	pop r4
-	ret
+    add r7, r4
+    add r7, 10h
+    POP_XMM
+    LOAD_4_PARA_POP
+    pop r5
+    pop r4
+    ret
 
 WELS_EXTERN TransposeMatrixBlocksx16_sse2
 ; void TransposeMatrixBlocksx16_sse2( void *dst/*W16x16*/, const int32_t dst_stride, void *src/*16xW16*/, const int32_t src_stride, const int32_t num_blocks );
-	push r5
-	push r6
-	%assign push_num 2
-	LOAD_5_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION  r1, r1d
-	SIGN_EXTENSION  r3, r3d
-	SIGN_EXTENSION  r4, r4d
-	mov r5, r7
-	and r5, 0Fh
-	sub r7, 10h
-	sub r7, r5
+    push r5
+    push r6
+    %assign push_num 2
+    LOAD_5_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
+    SIGN_EXTENSION  r4, r4d
+    mov r5, r7
+    and r5, 0Fh
+    sub r7, 10h
+    sub r7, r5
 TRANSPOSE_LOOP_SSE2:
-	; explictly loading next loop data
-	lea	r6, [r2+r3*8]
-	push r4
+    ; explictly loading next loop data
+    lea r6, [r2+r3*8]
+    push r4
 %rep 8
-	mov	r4, [r6]
-	mov	r4, [r6+r3]
-	lea	r6, [r6+r3*2]
+    mov r4, [r6]
+    mov r4, [r6+r3]
+    lea r6, [r6+r3*2]
 %endrep
-	pop r4
-	; top 8x16 block
-	movdqa xmm0, [r2]
-	movdqa xmm1, [r2+r3]
-	lea r2, [r2+r3*2]
-	movdqa xmm2, [r2]
-	movdqa xmm3, [r2+r3]
-	lea r2, [r2+r3*2]
-	movdqa xmm4, [r2]
-	movdqa xmm5, [r2+r3]
-	lea r2, [r2+r3*2]
-	movdqa xmm6, [r2]
+    pop r4
+    ; top 8x16 block
+    movdqa xmm0, [r2]
+    movdqa xmm1, [r2+r3]
+    lea r2, [r2+r3*2]
+    movdqa xmm2, [r2]
+    movdqa xmm3, [r2+r3]
+    lea r2, [r2+r3*2]
+    movdqa xmm4, [r2]
+    movdqa xmm5, [r2+r3]
+    lea r2, [r2+r3*2]
+    movdqa xmm6, [r2]
 
-	;in:  m0, m1, m2, m3, m4, m5, m6, m7
-	;out: m4, m2, m3, m7, m5, m1, m6, m0
-	TRANSPOSE_8x16B_SSE2	xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r3], [r7]
-	TRANSPOSE8x16_WRITE_ALT_SSE2		r0, r1, r6
-	lea	r2, [r2+r3*2]
+    ;in:  m0, m1, m2, m3, m4, m5, m6, m7
+    ;out: m4, m2, m3, m7, m5, m1, m6, m0
+    TRANSPOSE_8x16B_SSE2    xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r3], [r7]
+    TRANSPOSE8x16_WRITE_ALT_SSE2        r0, r1, r6
+    lea r2, [r2+r3*2]
 
-	; bottom 8x16 block
-	movdqa xmm0, [r2]
-	movdqa xmm1, [r2+r3]
-	lea	r2, [r2+r3*2]
-	movdqa xmm2, [r2]
-	movdqa xmm3, [r2+r3]
-	lea r2, [r2+r3*2]
-	movdqa xmm4, [r2]
-	movdqa xmm5, [r2+r3]
-	lea	r2, [r2+r3*2]
-	movdqa xmm6, [r2]
+    ; bottom 8x16 block
+    movdqa xmm0, [r2]
+    movdqa xmm1, [r2+r3]
+    lea r2, [r2+r3*2]
+    movdqa xmm2, [r2]
+    movdqa xmm3, [r2+r3]
+    lea r2, [r2+r3*2]
+    movdqa xmm4, [r2]
+    movdqa xmm5, [r2+r3]
+    lea r2, [r2+r3*2]
+    movdqa xmm6, [r2]
 
-	;in:  m0, m1, m2, m3, m4, m5, m6, m7
-	;out: m4, m2, m3, m7, m5, m1, m6, m0
-	TRANSPOSE_8x16B_SSE2	xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r3], [r7]
-	TRANSPOSE8x16_WRITE_ALT_SSE2		r0+8, r1, r6
-	lea	r2, [r2+r3*2]
-	lea r0, [r0+16]
-	dec r4
-	jg near TRANSPOSE_LOOP_SSE2
+    ;in:  m0, m1, m2, m3, m4, m5, m6, m7
+    ;out: m4, m2, m3, m7, m5, m1, m6, m0
+    TRANSPOSE_8x16B_SSE2    xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r3], [r7]
+    TRANSPOSE8x16_WRITE_ALT_SSE2        r0+8, r1, r6
+    lea r2, [r2+r3*2]
+    lea r0, [r0+16]
+    dec r4
+    jg near TRANSPOSE_LOOP_SSE2
 
-	add r7, r5
-	add r7, 10h
-	POP_XMM
-	LOAD_5_PARA_POP
-	pop r6
-	pop r5
-	ret
+    add r7, r5
+    add r7, 10h
+    POP_XMM
+    LOAD_5_PARA_POP
+    pop r6
+    pop r5
+    ret
 
 WELS_EXTERN TransposeMatrixBlock8x8_mmx
 ; void TransposeMatrixBlock8x8_mmx( void *dst/*8x8*/, const int32_t dst_stride, void *src/*8x8*/, const int32_t src_stride );
-	%assign push_num 0
-	LOAD_4_PARA
-	SIGN_EXTENSION  r1, r1d
-	SIGN_EXTENSION  r3, r3d
-	sub	r7, 8
+    %assign push_num 0
+    LOAD_4_PARA
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
+    sub r7, 8
 
-	movq mm0, [r2]
-	movq mm1, [r2+r3]
-	lea r2, [r2+2*r3]
-	movq mm2, [r2]
-	movq mm3, [r2+r3]
-	lea r2, [r2+2*r3]
-	movq mm4, [r2]
-	movq mm5, [r2+r3]
-	lea r2, [r2+2*r3]
-	movq mm6, [r2]
+    movq mm0, [r2]
+    movq mm1, [r2+r3]
+    lea r2, [r2+2*r3]
+    movq mm2, [r2]
+    movq mm3, [r2+r3]
+    lea r2, [r2+2*r3]
+    movq mm4, [r2]
+    movq mm5, [r2+r3]
+    lea r2, [r2+2*r3]
+    movq mm6, [r2]
 
-	;in:  m0, m1, m2, m3, m4, m5, m6, m7
-	;out: m0, m3, m5, m2, m7, m1, m6, m4
-	TRANSPOSE_8x8B_MMX mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, [r2+r3], [r7]
+    ;in:  m0, m1, m2, m3, m4, m5, m6, m7
+    ;out: m0, m3, m5, m2, m7, m1, m6, m4
+    TRANSPOSE_8x8B_MMX mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, [r2+r3], [r7]
 
-	TRANSPOSE8x8_WRITE_MMX r0, r1
+    TRANSPOSE8x8_WRITE_MMX r0, r1
 
-	emms
-	add r7, 8
-	LOAD_4_PARA_POP
-	ret
+    emms
+    add r7, 8
+    LOAD_4_PARA_POP
+    ret
 
 WELS_EXTERN TransposeMatrixBlocksx8_mmx
 ; void TransposeMatrixBlocksx8_mmx( void *dst/*8xW8*/, const int32_t dst_stride, void *src/*W8x8*/, const int32_t src_stride, const int32_t num_blocks );
-	push r5
-	push r6
-	%assign push_num 2
-	LOAD_5_PARA
-	SIGN_EXTENSION  r1, r1d
-	SIGN_EXTENSION  r3, r3d
-	SIGN_EXTENSION  r4, r4d
-	sub	r7, 8
+    push r5
+    push r6
+    %assign push_num 2
+    LOAD_5_PARA
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
+    SIGN_EXTENSION  r4, r4d
+    sub r7, 8
 
-	lea	r5, [r2+r3*8]
+    lea r5, [r2+r3*8]
 
 TRANSPOSE_BLOCKS_X8_LOOP_MMX:
-	; explictly loading next loop data
+    ; explictly loading next loop data
 %rep 4
-	mov r6, [r5]
-	mov r6, [r5+r3]
-	lea	r5, [r5+r3*2]
+    mov r6, [r5]
+    mov r6, [r5+r3]
+    lea r5, [r5+r3*2]
 %endrep
-	movq mm0, [r2]
-	movq mm1, [r2+r3]
-	lea r2, [r2+2*r3]
-	movq mm2, [r2]
-	movq mm3, [r2+r3]
-	lea r2, [r2+2*r3]
-	movq mm4, [r2]
-	movq mm5, [r2+r3]
-	lea r2, [r2+2*r3]
-	movq mm6, [r2]
+    movq mm0, [r2]
+    movq mm1, [r2+r3]
+    lea r2, [r2+2*r3]
+    movq mm2, [r2]
+    movq mm3, [r2+r3]
+    lea r2, [r2+2*r3]
+    movq mm4, [r2]
+    movq mm5, [r2+r3]
+    lea r2, [r2+2*r3]
+    movq mm6, [r2]
 
-	;in:  m0, m1, m2, m3, m4, m5, m6, m7
-	;out: m0, m3, m5, m2, m7, m1, m6, m4
-	TRANSPOSE_8x8B_MMX mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, [r2+r3], [r7]
+    ;in:  m0, m1, m2, m3, m4, m5, m6, m7
+    ;out: m0, m3, m5, m2, m7, m1, m6, m4
+    TRANSPOSE_8x8B_MMX mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, [r2+r3], [r7]
 
-	TRANSPOSE8x8_WRITE_ALT_MMX r0, r1, r6
-	lea r0, [r0+8]
-	lea r2, [r2+2*r3]
-	dec r4
-	jg near TRANSPOSE_BLOCKS_X8_LOOP_MMX
+    TRANSPOSE8x8_WRITE_ALT_MMX r0, r1, r6
+    lea r0, [r0+8]
+    lea r2, [r2+2*r3]
+    dec r4
+    jg near TRANSPOSE_BLOCKS_X8_LOOP_MMX
 
-	emms
-	add r7, 8
-	LOAD_5_PARA_POP
-	pop r6
-	pop r5
-	ret
+    emms
+    add r7, 8
+    LOAD_5_PARA_POP
+    pop r6
+    pop r5
+    ret
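
For reference, the transpose routines in this file all implement the contract their prototypes describe: dst[x][y] = src[y][x] over a block of bytes, with each matrix addressed through its own stride. A minimal scalar sketch of the 16x16 case, assuming row-major byte layout (the _ref helper name is illustrative only, not part of the codec):

    #include <stdint.h>

    /* Scalar model of TransposeMatrixBlock16x16_sse2: transpose a 16x16
       byte block from src (src_stride) into dst (dst_stride). */
    static void TransposeBlock16x16_ref(uint8_t *dst, int32_t dst_stride,
                                        const uint8_t *src, int32_t src_stride) {
        for (int y = 0; y < 16; y++)
            for (int x = 0; x < 16; x++)
                dst[x * dst_stride + y] = src[y * src_stride + x];
    }

The SIMD versions reach the same result through the SSE2_XSawp / TRANSPOSE_8x8B_MMX interleave steps and write the transposed rows out with movq/movhpd, which is why the register order in the "out:" comments looks permuted.
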
--- a/codec/encoder/core/x86/memzero.asm
+++ b/codec/encoder/core/x86/memzero.asm
@@ -51,10 +51,10 @@
 ;void WelsPrefetchZero_mmx(int8_t const*_A);
 ;***********************************************************************
 WELS_EXTERN WelsPrefetchZero_mmx
-	%assign  push_num 0
-	LOAD_1_PARA
-	prefetchnta [r0]
-	ret
+    %assign  push_num 0
+    LOAD_1_PARA
+    prefetchnta [r0]
+    ret
 
 
 ;***********************************************************************
@@ -62,23 +62,23 @@
 ;***********************************************************************
 WELS_EXTERN WelsSetMemZeroAligned64_sse2
 
-		%assign  push_num 0
-		LOAD_2_PARA
-		SIGN_EXTENSION r1, r1d
-		neg		r1
+    %assign  push_num 0
+    LOAD_2_PARA
+    SIGN_EXTENSION r1, r1d
+    neg     r1
 
-		pxor	xmm0,		xmm0
+    pxor    xmm0,       xmm0
 .memzeroa64_sse2_loops:
-		movdqa	[r0],		xmm0
-		movdqa	[r0+16],	xmm0
-		movdqa	[r0+32],	xmm0
-		movdqa	[r0+48],	xmm0
-		add		r0, 0x40
+    movdqa  [r0],       xmm0
+    movdqa  [r0+16],    xmm0
+    movdqa  [r0+32],    xmm0
+    movdqa  [r0+48],    xmm0
+    add     r0, 0x40
 
-		add r1, 0x40
-		jnz near .memzeroa64_sse2_loops
+    add r1, 0x40
+    jnz near .memzeroa64_sse2_loops
 
-		ret
+    ret
 
 ;***********************************************************************
 ;   void WelsSetMemZeroSize64_mmx(void *dst, int32_t size)
@@ -85,28 +85,28 @@
 ;***********************************************************************
 WELS_EXTERN WelsSetMemZeroSize64_mmx
 
-		%assign  push_num 0
-		LOAD_2_PARA
-		SIGN_EXTENSION r1, r1d
-		neg		r1
+    %assign  push_num 0
+    LOAD_2_PARA
+    SIGN_EXTENSION r1, r1d
+    neg     r1
 
-		pxor	mm0,		mm0
+    pxor    mm0,        mm0
 .memzero64_mmx_loops:
-		movq	[r0],		mm0
-		movq	[r0+8],	mm0
-		movq	[r0+16],	mm0
-		movq	[r0+24],	mm0
-		movq	[r0+32],	mm0
-		movq	[r0+40],	mm0
-		movq	[r0+48],	mm0
-		movq	[r0+56],	mm0
-		add		r0,		0x40
+    movq    [r0],       mm0
+    movq    [r0+8], mm0
+    movq    [r0+16],    mm0
+    movq    [r0+24],    mm0
+    movq    [r0+32],    mm0
+    movq    [r0+40],    mm0
+    movq    [r0+48],    mm0
+    movq    [r0+56],    mm0
+    add     r0,     0x40
 
-		add r1, 0x40
-		jnz near .memzero64_mmx_loops
+    add r1, 0x40
+    jnz near .memzero64_mmx_loops
 
-		WELSEMMS
-		ret
+    WELSEMMS
+    ret
 
 ;***********************************************************************
 ;   void WelsSetMemZeroSize8_mmx(void *dst, int32_t size)
@@ -113,20 +113,20 @@
 ;***********************************************************************
 WELS_EXTERN WelsSetMemZeroSize8_mmx
 
-		%assign  push_num 0
-		LOAD_2_PARA
-		SIGN_EXTENSION r1, r1d
-		neg		r1
-		pxor	mm0,		mm0
+    %assign  push_num 0
+    LOAD_2_PARA
+    SIGN_EXTENSION r1, r1d
+    neg     r1
+    pxor    mm0,        mm0
 
 .memzero8_mmx_loops:
-		movq	[r0],		mm0
-		add		r0,		0x08
+    movq    [r0],       mm0
+    add     r0,     0x08
 
-		add		r1,		0x08
-		jnz near .memzero8_mmx_loops
+    add     r1,     0x08
+    jnz near .memzero8_mmx_loops
 
-		WELSEMMS
-		ret
+    WELSEMMS
+    ret
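
Functionally these zeroing routines are block memsets with size and alignment preconditions (the loop strides show the granularity: 64 bytes for the two "64" variants, 8 bytes for WelsSetMemZeroSize8_mmx). A hedged scalar equivalent, assuming size is a positive multiple of the block size:

    #include <stdint.h>
    #include <string.h>

    /* Scalar stand-in for the WelsSetMemZero* routines: clear size bytes.
       The SIMD versions additionally assume the matching 8/64-byte multiple
       (and 16-byte alignment for the Aligned64 variant). */
    static void SetMemZero_ref(void *dst, int32_t size) {
        memset(dst, 0, (size_t)size);
    }
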
 
 
--- a/codec/encoder/core/x86/quant.asm
+++ b/codec/encoder/core/x86/quant.asm
@@ -49,140 +49,140 @@
 ;************************************************
 
 %macro SSE2_Quant8  5
-		MOVDQ	%1, %5
-		pxor	%2, %2
-		pcmpgtw	%2, %1
-		pxor	%1, %2
-		psubw	%1, %2
-		paddusw	%1, %3
-		pmulhuw	%1, %4
-		pxor	%1, %2
-		psubw	%1, %2
-		MOVDQ	%5, %1
+    MOVDQ   %1, %5
+    pxor    %2, %2
+    pcmpgtw %2, %1
+    pxor    %1, %2
+    psubw   %1, %2
+    paddusw %1, %3
+    pmulhuw %1, %4
+    pxor    %1, %2
+    psubw   %1, %2
+    MOVDQ   %5, %1
 %endmacro
 
 %macro SSE2_QuantMax8  6
-		MOVDQ	%1, %5
-		pxor	%2, %2
-		pcmpgtw	%2, %1
-		pxor	%1, %2
-		psubw	%1, %2
-		paddusw	%1, %3
-		pmulhuw	%1, %4
-		pmaxsw	%6, %1
-		pxor	%1, %2
-		psubw	%1, %2
-		MOVDQ	%5, %1
+    MOVDQ   %1, %5
+    pxor    %2, %2
+    pcmpgtw %2, %1
+    pxor    %1, %2
+    psubw   %1, %2
+    paddusw %1, %3
+    pmulhuw %1, %4
+    pmaxsw  %6, %1
+    pxor    %1, %2
+    psubw   %1, %2
+    MOVDQ   %5, %1
 %endmacro
 
-%define pDct				esp + 4
-%define ff					esp + 8
-%define mf					esp + 12
-%define max					esp + 16
+%define pDct                esp + 4
+%define ff                  esp + 8
+%define mf                  esp + 12
+%define max                 esp + 16
 ;***********************************************************************
-;	void WelsQuant4x4_sse2(int16_t *pDct, int16_t* ff,  int16_t *mf);
+;   void WelsQuant4x4_sse2(int16_t *pDct, int16_t* ff,  int16_t *mf);
 ;***********************************************************************
 WELS_EXTERN WelsQuant4x4_sse2
-		%assign push_num 0
-                LOAD_3_PARA
-		movdqa	xmm2, [r1]
-		movdqa	xmm3, [r2]
+    %assign push_num 0
+    LOAD_3_PARA
+    movdqa  xmm2, [r1]
+    movdqa  xmm3, [r2]
 
-		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [r0]
-		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x10]
+    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0]
+    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10]
 
-		ret
+    ret
 
 ;***********************************************************************
 ;void WelsQuant4x4Dc_sse2(int16_t *pDct, const int16_t ff, int16_t mf);
 ;***********************************************************************
 WELS_EXTERN WelsQuant4x4Dc_sse2
- 		%assign push_num 0
-		LOAD_3_PARA
-		SIGN_EXTENSIONW r1, r1w
-		SIGN_EXTENSIONW r2, r2w
-		SSE2_Copy8Times xmm3, r2d
+    %assign push_num 0
+    LOAD_3_PARA
+    SIGN_EXTENSIONW r1, r1w
+    SIGN_EXTENSIONW r2, r2w
+    SSE2_Copy8Times xmm3, r2d
 
-		SSE2_Copy8Times xmm2, r1d
+    SSE2_Copy8Times xmm2, r1d
 
-		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [r0]
-		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x10]
+    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0]
+    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10]
 
-		ret
+    ret
 
 ;***********************************************************************
-;	void WelsQuantFour4x4_sse2(int16_t *pDct, int16_t* ff,  int16_t *mf);
+;   void WelsQuantFour4x4_sse2(int16_t *pDct, int16_t* ff,  int16_t *mf);
 ;***********************************************************************
 WELS_EXTERN WelsQuantFour4x4_sse2
-		%assign push_num 0
-		LOAD_3_PARA
-		MOVDQ	xmm2, [r1]
-		MOVDQ	xmm3, [r2]
+    %assign push_num 0
+    LOAD_3_PARA
+    MOVDQ   xmm2, [r1]
+    MOVDQ   xmm3, [r2]
 
-		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [r0]
-		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x10]
-		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x20]
-		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x30]
-		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x40]
-		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x50]
-		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x60]
-		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x70]
+    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0]
+    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10]
+    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x20]
+    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x30]
+    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x40]
+    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x50]
+    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x60]
+    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x70]
 
-		ret
+    ret
 
 ;***********************************************************************
-;	void WelsQuantFour4x4Max_sse2(int16_t *pDct, int32_t* f,  int16_t *mf, int16_t *max);
+;   void WelsQuantFour4x4Max_sse2(int16_t *pDct, int32_t* f,  int16_t *mf, int16_t *max);
 ;***********************************************************************
 WELS_EXTERN WelsQuantFour4x4Max_sse2
-		%assign push_num 0
-		LOAD_4_PARA
-		PUSH_XMM 8
-		MOVDQ	xmm2, [r1]
-		MOVDQ	xmm3, [r2]
+    %assign push_num 0
+    LOAD_4_PARA
+    PUSH_XMM 8
+    MOVDQ   xmm2, [r1]
+    MOVDQ   xmm3, [r2]
 
-		pxor	xmm4, xmm4
-		pxor	xmm5, xmm5
-		pxor	xmm6, xmm6
-		pxor	xmm7, xmm7
-		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [r0	  ], xmm4
-		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x10], xmm4
-		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x20], xmm5
-		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x30], xmm5
-		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x40], xmm6
-		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x50], xmm6
-		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x60], xmm7
-		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x70], xmm7
+    pxor    xmm4, xmm4
+    pxor    xmm5, xmm5
+    pxor    xmm6, xmm6
+    pxor    xmm7, xmm7
+    SSE2_QuantMax8  xmm0, xmm1, xmm2, xmm3, [r0   ], xmm4
+    SSE2_QuantMax8  xmm0, xmm1, xmm2, xmm3, [r0 + 0x10], xmm4
+    SSE2_QuantMax8  xmm0, xmm1, xmm2, xmm3, [r0 + 0x20], xmm5
+    SSE2_QuantMax8  xmm0, xmm1, xmm2, xmm3, [r0 + 0x30], xmm5
+    SSE2_QuantMax8  xmm0, xmm1, xmm2, xmm3, [r0 + 0x40], xmm6
+    SSE2_QuantMax8  xmm0, xmm1, xmm2, xmm3, [r0 + 0x50], xmm6
+    SSE2_QuantMax8  xmm0, xmm1, xmm2, xmm3, [r0 + 0x60], xmm7
+    SSE2_QuantMax8  xmm0, xmm1, xmm2, xmm3, [r0 + 0x70], xmm7
 
-		SSE2_TransTwo4x4W xmm4, xmm5, xmm6, xmm7, xmm0
-		pmaxsw  xmm0,  xmm4
-		pmaxsw  xmm0,  xmm5
-		pmaxsw  xmm0,  xmm7
-		movdqa	xmm1,  xmm0
-		punpckhqdq	xmm0, xmm1
-		pmaxsw	xmm0, xmm1
+    SSE2_TransTwo4x4W xmm4, xmm5, xmm6, xmm7, xmm0
+    pmaxsw  xmm0,  xmm4
+    pmaxsw  xmm0,  xmm5
+    pmaxsw  xmm0,  xmm7
+    movdqa  xmm1,  xmm0
+    punpckhqdq  xmm0, xmm1
+    pmaxsw  xmm0, xmm1
 
-		movq	[r3], xmm0
-		POP_XMM
-		LOAD_4_PARA_POP
-		ret
+    movq    [r3], xmm0
+    POP_XMM
+    LOAD_4_PARA_POP
+    ret
 
 %macro MMX_Copy4Times 2
-		movd		%1, %2
-		punpcklwd	%1, %1
-		punpckldq	%1,	%1
+    movd        %1, %2
+    punpcklwd   %1, %1
+    punpckldq   %1, %1
 %endmacro
 
 SECTION .text
 
 %macro MMX_Quant4  4
-		pxor	%2, %2
-		pcmpgtw	%2, %1
-		pxor	%1, %2
-		psubw	%1, %2
-		paddusw	%1, %3
-		pmulhuw	%1, %4
-		pxor	%1, %2
-		psubw	%1, %2
+    pxor    %2, %2
+    pcmpgtw %2, %1
+    pxor    %1, %2
+    psubw   %1, %2
+    paddusw %1, %3
+    pmulhuw %1, %4
+    pxor    %1, %2
+    psubw   %1, %2
 %endmacro
 
 ;***********************************************************************
@@ -189,101 +189,101 @@
 ;int32_t WelsHadamardQuant2x2_mmx(int16_t *rs, const int16_t ff, int16_t mf, int16_t * pDct, int16_t * block);
 ;***********************************************************************
 WELS_EXTERN WelsHadamardQuant2x2_mmx
-		%assign push_num 0
-		LOAD_5_PARA
-		SIGN_EXTENSIONW r1, r1w
-		SIGN_EXTENSIONW r2, r2w
-		movd		mm0,			[r0]
-		movd		mm1,			[r0 + 0x20]
-		punpcklwd	mm0,			mm1
-		movd		mm3,			[r0 + 0x40]
-		movd		mm1,			[r0 + 0x60]
-		punpcklwd	mm3,			mm1
+    %assign push_num 0
+    LOAD_5_PARA
+    SIGN_EXTENSIONW r1, r1w
+    SIGN_EXTENSIONW r2, r2w
+    movd        mm0,            [r0]
+    movd        mm1,            [r0 + 0x20]
+    punpcklwd   mm0,            mm1
+    movd        mm3,            [r0 + 0x40]
+    movd        mm1,            [r0 + 0x60]
+    punpcklwd   mm3,            mm1
 
-		;hdm_2x2,	mm0 = dct0 dct1, mm3 = dct2 dct3
-		movq		mm5,			mm3
-		paddw		mm3,			mm0
-		psubw		mm0,			mm5
-		punpcklwd	mm3,			mm0
-		movq		mm1,			mm3
-		psrlq		mm1,			32
-		movq		mm5,			mm1
-		paddw		mm1,			mm3
-		psubw		mm3,			mm5
-		punpcklwd	mm1,			mm3
+    ;hdm_2x2,   mm0 = dct0 dct1, mm3 = dct2 dct3
+    movq        mm5,            mm3
+    paddw       mm3,            mm0
+    psubw       mm0,            mm5
+    punpcklwd   mm3,            mm0
+    movq        mm1,            mm3
+    psrlq       mm1,            32
+    movq        mm5,            mm1
+    paddw       mm1,            mm3
+    psubw       mm3,            mm5
+    punpcklwd   mm1,            mm3
 
-		;quant_2x2_dc
-		MMX_Copy4Times	mm3,		r2d
-		MMX_Copy4Times	mm2,		r1d
-		MMX_Quant4		mm1,	mm0,	mm2,	mm3
+    ;quant_2x2_dc
+    MMX_Copy4Times  mm3,        r2d
+    MMX_Copy4Times  mm2,        r1d
+    MMX_Quant4      mm1,    mm0,    mm2,    mm3
 
-		; store dct_2x2
-		movq		[r3],			mm1
-		movq		[r4],			mm1
+    ; store dct_2x2
+    movq        [r3],           mm1
+    movq        [r4],           mm1
 
-		; pNonZeroCount of dct_2x2
-		pcmpeqb		mm2,			mm2		; mm2 = FF
-		pxor		mm3,			mm3
-		packsswb	mm1,			mm3
-		pcmpeqb		mm1,			mm3		; set FF if equal, 0 if not equal
-		psubsb		mm1,			mm2		; set 0 if equal, 1 if not equal
-		psadbw		mm1,			mm3		;
-		mov			r1w,				0
-		mov			[r0],			r1w
-		mov			[r0 + 0x20],	r1w
-		mov			[r0 + 0x40],	r1w
-		mov			[r0 + 0x60],	r1w
+    ; pNonZeroCount of dct_2x2
+    pcmpeqb     mm2,            mm2     ; mm2 = FF
+    pxor        mm3,            mm3
+    packsswb    mm1,            mm3
+    pcmpeqb     mm1,            mm3     ; set FF if equal, 0 if not equal
+    psubsb      mm1,            mm2     ; set 0 if equal, 1 if not equal
+    psadbw      mm1,            mm3     ;
+    mov         r1w,                0
+    mov         [r0],           r1w
+    mov         [r0 + 0x20],    r1w
+    mov         [r0 + 0x40],    r1w
+    mov         [r0 + 0x60],    r1w
 
 
-		movd		retrd,		mm1
+    movd        retrd,      mm1
 
-		WELSEMMS
-		LOAD_5_PARA_POP
-		ret
+    WELSEMMS
+    LOAD_5_PARA_POP
+    ret
 
 ;***********************************************************************
 ;int32_t WelsHadamardQuant2x2Skip_mmx(int16_t *pDct, int16_t ff,  int16_t mf);
 ;***********************************************************************
 WELS_EXTERN WelsHadamardQuant2x2Skip_mmx
-		%assign push_num 0
-		LOAD_3_PARA
-		SIGN_EXTENSIONW r1, r1w
-		SIGN_EXTENSIONW r2, r2w
-		movd		mm0,			[r0]
-		movd		mm1,			[r0 + 0x20]
-		punpcklwd	mm0,			mm1
-		movd		mm3,			[r0 + 0x40]
-		movd		mm1,			[r0 + 0x60]
-		punpcklwd	mm3,			mm1
+    %assign push_num 0
+    LOAD_3_PARA
+    SIGN_EXTENSIONW r1, r1w
+    SIGN_EXTENSIONW r2, r2w
+    movd        mm0,            [r0]
+    movd        mm1,            [r0 + 0x20]
+    punpcklwd   mm0,            mm1
+    movd        mm3,            [r0 + 0x40]
+    movd        mm1,            [r0 + 0x60]
+    punpcklwd   mm3,            mm1
 
-		;hdm_2x2,	mm0 = dct0 dct1, mm3 = dct2 dct3
-		movq		mm5,			mm3
-		paddw		mm3,			mm0
-		psubw		mm0,			mm5
-		punpcklwd	mm3,			mm0
-		movq		mm1,			mm3
-		psrlq		mm1,			32
-		movq		mm5,			mm1
-		paddw		mm1,			mm3
-		psubw		mm3,			mm5
-		punpcklwd	mm1,			mm3
+    ;hdm_2x2,   mm0 = dct0 dct1, mm3 = dct2 dct3
+    movq        mm5,            mm3
+    paddw       mm3,            mm0
+    psubw       mm0,            mm5
+    punpcklwd   mm3,            mm0
+    movq        mm1,            mm3
+    psrlq       mm1,            32
+    movq        mm5,            mm1
+    paddw       mm1,            mm3
+    psubw       mm3,            mm5
+    punpcklwd   mm1,            mm3
 
-		;quant_2x2_dc
-		MMX_Copy4Times	mm3,		r2d
-		MMX_Copy4Times	mm2,		r1d
-		MMX_Quant4		mm1,	mm0,	mm2,	mm3
+    ;quant_2x2_dc
+    MMX_Copy4Times  mm3,        r2d
+    MMX_Copy4Times  mm2,        r1d
+    MMX_Quant4      mm1,    mm0,    mm2,    mm3
 
-		; pNonZeroCount of dct_2x2
-		pcmpeqb		mm2,			mm2		; mm2 = FF
-		pxor		mm3,			mm3
-		packsswb	mm1,			mm3
-		pcmpeqb		mm1,			mm3		; set FF if equal, 0 if not equal
-		psubsb		mm1,			mm2		; set 0 if equal, 1 if not equal
-		psadbw		mm1,			mm3		;
-		movd		retrd,			mm1
+    ; pNonZeroCount of dct_2x2
+    pcmpeqb     mm2,            mm2     ; mm2 = FF
+    pxor        mm3,            mm3
+    packsswb    mm1,            mm3
+    pcmpeqb     mm1,            mm3     ; set FF if equal, 0 if not equal
+    psubsb      mm1,            mm2     ; set 0 if equal, 1 if not equal
+    psadbw      mm1,            mm3     ;
+    movd        retrd,          mm1
 
-		WELSEMMS
-		ret
+    WELSEMMS
+    ret
 
 
 %macro SSE2_DeQuant8 3
@@ -297,12 +297,12 @@
 ; void WelsDequant4x4_sse2(int16_t *pDct, const uint16_t* mf);
 ;***********************************************************************
 WELS_EXTERN WelsDequant4x4_sse2
-	%assign push_num 0
-	LOAD_2_PARA
+    %assign push_num 0
+    LOAD_2_PARA
 
-	movdqa  xmm1, [r1]
-	SSE2_DeQuant8 [r0	],  xmm0, xmm1
-	SSE2_DeQuant8 [r0 + 0x10],  xmm0, xmm1
+    movdqa  xmm1, [r1]
+    SSE2_DeQuant8 [r0   ],  xmm0, xmm1
+    SSE2_DeQuant8 [r0 + 0x10],  xmm0, xmm1
 
     ret
 
@@ -311,18 +311,18 @@
 ;***********************************************************************====
 
 WELS_EXTERN WelsDequantFour4x4_sse2
-	%assign push_num 0
-	LOAD_2_PARA
+    %assign push_num 0
+    LOAD_2_PARA
 
-	movdqa  xmm1, [r1]
-	SSE2_DeQuant8 [r0	],  xmm0, xmm1
-	SSE2_DeQuant8 [r0+0x10	],  xmm0, xmm1
-	SSE2_DeQuant8 [r0+0x20	],  xmm0, xmm1
-	SSE2_DeQuant8 [r0+0x30	],  xmm0, xmm1
-	SSE2_DeQuant8 [r0+0x40	],  xmm0, xmm1
-	SSE2_DeQuant8 [r0+0x50	],  xmm0, xmm1
-	SSE2_DeQuant8 [r0+0x60	],  xmm0, xmm1
-	SSE2_DeQuant8 [r0+0x70	],  xmm0, xmm1
+    movdqa  xmm1, [r1]
+    SSE2_DeQuant8 [r0   ],  xmm0, xmm1
+    SSE2_DeQuant8 [r0+0x10  ],  xmm0, xmm1
+    SSE2_DeQuant8 [r0+0x20  ],  xmm0, xmm1
+    SSE2_DeQuant8 [r0+0x30  ],  xmm0, xmm1
+    SSE2_DeQuant8 [r0+0x40  ],  xmm0, xmm1
+    SSE2_DeQuant8 [r0+0x50  ],  xmm0, xmm1
+    SSE2_DeQuant8 [r0+0x60  ],  xmm0, xmm1
+    SSE2_DeQuant8 [r0+0x70  ],  xmm0, xmm1
 
     ret
 
@@ -330,41 +330,41 @@
 ;void WelsDequantIHadamard4x4_sse2(int16_t *rs, const uint16_t mf);
 ;***********************************************************************
 WELS_EXTERN WelsDequantIHadamard4x4_sse2
-		%assign push_num 0
-		LOAD_2_PARA
-		%ifndef X86_32
-		movzx r1, r1w
-		%endif
+    %assign push_num 0
+    LOAD_2_PARA
+    %ifndef X86_32
+    movzx r1, r1w
+    %endif
 
-		; WelsDequantLumaDc4x4
-		SSE2_Copy8Times	xmm1,		r1d
-		;psrlw		xmm1,		2		; for the (>>2) in ihdm
-		MOVDQ		xmm0,		[r0]
-		MOVDQ		xmm2,		[r0+0x10]
-		pmullw		xmm0,		xmm1
-		pmullw		xmm2,		xmm1
+    ; WelsDequantLumaDc4x4
+    SSE2_Copy8Times xmm1,       r1d
+    ;psrlw      xmm1,       2       ; for the (>>2) in ihdm
+    MOVDQ       xmm0,       [r0]
+    MOVDQ       xmm2,       [r0+0x10]
+    pmullw      xmm0,       xmm1
+    pmullw      xmm2,       xmm1
 
-		; ihdm_4x4
-		movdqa		xmm1,		xmm0
-		psrldq		xmm1,		8
-		movdqa		xmm3,		xmm2
-		psrldq		xmm3,		8
+    ; ihdm_4x4
+    movdqa      xmm1,       xmm0
+    psrldq      xmm1,       8
+    movdqa      xmm3,       xmm2
+    psrldq      xmm3,       8
 
-		SSE2_SumSub		xmm0, xmm3,	xmm5					; xmm0 = xmm0 - xmm3, xmm3 = xmm0 + xmm3
-		SSE2_SumSub		xmm1, xmm2, xmm5					; xmm1 = xmm1 - xmm2, xmm2 = xmm1 + xmm2
-		SSE2_SumSub		xmm3, xmm2, xmm5					; xmm3 = xmm3 - xmm2, xmm2 = xmm3 + xmm2
-		SSE2_SumSub		xmm0, xmm1, xmm5               		; xmm0 = xmm0 - xmm1, xmm1 = xmm0 + xmm1
+    SSE2_SumSub     xmm0, xmm3, xmm5                    ; xmm0 = xmm0 - xmm3, xmm3 = xmm0 + xmm3
+    SSE2_SumSub     xmm1, xmm2, xmm5                    ; xmm1 = xmm1 - xmm2, xmm2 = xmm1 + xmm2
+    SSE2_SumSub     xmm3, xmm2, xmm5                    ; xmm3 = xmm3 - xmm2, xmm2 = xmm3 + xmm2
+    SSE2_SumSub     xmm0, xmm1, xmm5                    ; xmm0 = xmm0 - xmm1, xmm1 = xmm0 + xmm1
 
-		SSE2_TransTwo4x4W	xmm2, xmm1, xmm3, xmm0, xmm4
-		SSE2_SumSub		xmm2, xmm4,	xmm5
-		SSE2_SumSub		xmm1, xmm0, xmm5
-		SSE2_SumSub		xmm4, xmm0, xmm5
-		SSE2_SumSub		xmm2, xmm1, xmm5
-		SSE2_TransTwo4x4W	xmm0, xmm1, xmm4, xmm2, xmm3
+    SSE2_TransTwo4x4W   xmm2, xmm1, xmm3, xmm0, xmm4
+    SSE2_SumSub     xmm2, xmm4, xmm5
+    SSE2_SumSub     xmm1, xmm0, xmm5
+    SSE2_SumSub     xmm4, xmm0, xmm5
+    SSE2_SumSub     xmm2, xmm1, xmm5
+    SSE2_TransTwo4x4W   xmm0, xmm1, xmm4, xmm2, xmm3
 
-		punpcklqdq	xmm0,		xmm1
-		MOVDQ		[r0],		xmm0
+    punpcklqdq  xmm0,       xmm1
+    MOVDQ       [r0],       xmm0
 
-		punpcklqdq	xmm2,		xmm3
-		MOVDQ		[r0+16],	xmm2
-		ret
+    punpcklqdq  xmm2,       xmm3
+    MOVDQ       [r0+16],    xmm2
+    ret
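
The quantization macros above share one per-coefficient recipe: take the absolute value, add the rounding offset ff, multiply by the factor mf keeping only the high 16 bits (pmulhuw), then restore the sign. A scalar sketch of WelsQuant4x4 under that reading, ignoring the unsigned saturation of paddusw and using an illustrative _ref name:

    #include <stdint.h>
    #include <stdlib.h>

    /* Scalar model of WelsQuant4x4_sse2: ff and mf each hold 8 words that
       are reused for both halves of the 16-coefficient 4x4 block. */
    static void Quant4x4_ref(int16_t *pDct, const int16_t *ff, const int16_t *mf) {
        for (int i = 0; i < 16; i++) {
            int32_t sign = pDct[i] < 0 ? -1 : 1;
            uint32_t level = ((uint32_t)abs(pDct[i]) + (uint16_t)ff[i & 7])
                             * (uint16_t)mf[i & 7] >> 16;
            pDct[i] = (int16_t)(sign * (int32_t)level);
        }
    }

The Max variants additionally track the largest quantized magnitude per 4x4 block in xmm4..xmm7 before the sign is restored, which is what the final pmaxsw/transpose sequence reduces into [r3].
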
--- a/codec/encoder/core/x86/sample_sc.asm
+++ b/codec/encoder/core/x86/sample_sc.asm
@@ -35,123 +35,123 @@
 
 ;**********************************************************************************************************************************
 ;
-;	uint32_t SampleSad16x16Hor8_sse41( uint8_t *src, int32_t stride_src, uint8_t *ref, int32_t stride_ref, uint16 base_cost[8], int32_t *index_min_cost )
+;   uint32_t SampleSad16x16Hor8_sse41( uint8_t *src, int32_t stride_src, uint8_t *ref, int32_t stride_ref, uint16 base_cost[8], int32_t *index_min_cost )
 ;
-;	\note:
-;		src need align with 16 bytes, ref is optional
-;	\return value:
-;		return minimal SAD cost, according index carried by index_min_cost
+;   \note:
+;       src need align with 16 bytes, ref is optional
+;   \return value:
+;       return minimal SAD cost, according index carried by index_min_cost
 ;**********************************************************************************************************************************
 ; try 8 mv via offset
 ; xmm7 store sad costs
-%macro SAD_16x16_LINE_SSE41  4	; src, ref, stride_src, stride_ref
-    movdqa		xmm0, [%1]
-    movdqu		xmm1, [%2]
-    movdqu		xmm2, [%2+8h]
-    movdqa		xmm3, xmm1
-    movdqa		xmm4, xmm2
+%macro SAD_16x16_LINE_SSE41  4  ; src, ref, stride_src, stride_ref
+    movdqa      xmm0, [%1]
+    movdqu      xmm1, [%2]
+    movdqu      xmm2, [%2+8h]
+    movdqa      xmm3, xmm1
+    movdqa      xmm4, xmm2
 
-    mpsadbw		xmm1, xmm0, 0	; 000 B
-    paddw		xmm7, xmm1		; accumulate cost
+    mpsadbw     xmm1, xmm0, 0   ; 000 B
+    paddw       xmm7, xmm1      ; accumulate cost
 
-    mpsadbw		xmm3, xmm0, 5	; 101 B
-    paddw		xmm7, xmm3		; accumulate cost
+    mpsadbw     xmm3, xmm0, 5   ; 101 B
+    paddw       xmm7, xmm3      ; accumulate cost
 
-    mpsadbw		xmm2, xmm0, 2	; 010 B
-    paddw		xmm7, xmm2		; accumulate cost
+    mpsadbw     xmm2, xmm0, 2   ; 010 B
+    paddw       xmm7, xmm2      ; accumulate cost
 
-    mpsadbw		xmm4, xmm0, 7	; 111 B
-    paddw		xmm7, xmm4		; accumulate cost
+    mpsadbw     xmm4, xmm0, 7   ; 111 B
+    paddw       xmm7, xmm4      ; accumulate cost
 
-    add			%1, %3
-    add			%2, %4
-%endmacro	; end of SAD_16x16_LINE_SSE41
-%macro SAD_16x16_LINE_SSE41E  4	; src, ref, stride_src, stride_ref
-    movdqa		xmm0, [%1]
-    movdqu		xmm1, [%2]
-    movdqu		xmm2, [%2+8h]
-    movdqa		xmm3, xmm1
-    movdqa		xmm4, xmm2
+    add         %1, %3
+    add         %2, %4
+%endmacro   ; end of SAD_16x16_LINE_SSE41
+%macro SAD_16x16_LINE_SSE41E  4 ; src, ref, stride_src, stride_ref
+    movdqa      xmm0, [%1]
+    movdqu      xmm1, [%2]
+    movdqu      xmm2, [%2+8h]
+    movdqa      xmm3, xmm1
+    movdqa      xmm4, xmm2
 
-    mpsadbw		xmm1, xmm0, 0	; 000 B
-    paddw		xmm7, xmm1		; accumulate cost
+    mpsadbw     xmm1, xmm0, 0   ; 000 B
+    paddw       xmm7, xmm1      ; accumulate cost
 
-    mpsadbw		xmm3, xmm0, 5	; 101 B
-    paddw		xmm7, xmm3		; accumulate cost
+    mpsadbw     xmm3, xmm0, 5   ; 101 B
+    paddw       xmm7, xmm3      ; accumulate cost
 
-    mpsadbw		xmm2, xmm0, 2	; 010 B
-    paddw		xmm7, xmm2		; accumulate cost
+    mpsadbw     xmm2, xmm0, 2   ; 010 B
+    paddw       xmm7, xmm2      ; accumulate cost
 
-    mpsadbw		xmm4, xmm0, 7	; 111 B
-    paddw		xmm7, xmm4		; accumulate cost
-%endmacro	; end of SAD_16x16_LINE_SSE41E
+    mpsadbw     xmm4, xmm0, 7   ; 111 B
+    paddw       xmm7, xmm4      ; accumulate cost
+%endmacro   ; end of SAD_16x16_LINE_SSE41E
 
 WELS_EXTERN SampleSad16x16Hor8_sse41
     ;push ebx
     ;push esi
-    ;mov eax, [esp+12]	;   src
-    ;mov ecx, [esp+16]	;   stride_src
-    ;mov ebx, [esp+20]	;   ref
-    ;mov edx, [esp+24]	;   stride_ref
-    ;mov esi, [esp+28]	;   base_cost
+    ;mov eax, [esp+12]  ;   src
+    ;mov ecx, [esp+16]  ;   stride_src
+    ;mov ebx, [esp+20]  ;   ref
+    ;mov edx, [esp+24]  ;   stride_ref
+    ;mov esi, [esp+28]  ;   base_cost
     %assign  push_num 0
     LOAD_6_PARA
     PUSH_XMM 8
-    SIGN_EXTENSION	r1, r1d
-    SIGN_EXTENSION	r3, r3d
-    pxor	xmm7,	xmm7
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
+    pxor    xmm7,   xmm7
 
-    SAD_16x16_LINE_SSE41	r0, r2, r1, r3
-    SAD_16x16_LINE_SSE41	r0, r2, r1, r3
-    SAD_16x16_LINE_SSE41	r0, r2, r1, r3
-    SAD_16x16_LINE_SSE41	r0, r2, r1, r3
+    SAD_16x16_LINE_SSE41    r0, r2, r1, r3
+    SAD_16x16_LINE_SSE41    r0, r2, r1, r3
+    SAD_16x16_LINE_SSE41    r0, r2, r1, r3
+    SAD_16x16_LINE_SSE41    r0, r2, r1, r3
 
-    SAD_16x16_LINE_SSE41	r0, r2, r1, r3
-    SAD_16x16_LINE_SSE41	r0, r2, r1, r3
-    SAD_16x16_LINE_SSE41	r0, r2, r1, r3
-    SAD_16x16_LINE_SSE41	r0, r2, r1, r3
+    SAD_16x16_LINE_SSE41    r0, r2, r1, r3
+    SAD_16x16_LINE_SSE41    r0, r2, r1, r3
+    SAD_16x16_LINE_SSE41    r0, r2, r1, r3
+    SAD_16x16_LINE_SSE41    r0, r2, r1, r3
 
-    SAD_16x16_LINE_SSE41	r0, r2, r1, r3
-    SAD_16x16_LINE_SSE41	r0, r2, r1, r3
-    SAD_16x16_LINE_SSE41	r0, r2, r1, r3
-    SAD_16x16_LINE_SSE41	r0, r2, r1, r3
+    SAD_16x16_LINE_SSE41    r0, r2, r1, r3
+    SAD_16x16_LINE_SSE41    r0, r2, r1, r3
+    SAD_16x16_LINE_SSE41    r0, r2, r1, r3
+    SAD_16x16_LINE_SSE41    r0, r2, r1, r3
 
-    SAD_16x16_LINE_SSE41	r0, r2, r1, r3
-    SAD_16x16_LINE_SSE41	r0, r2, r1, r3
-    SAD_16x16_LINE_SSE41	r0, r2, r1, r3
-    SAD_16x16_LINE_SSE41E	r0, r2, r1, r3
+    SAD_16x16_LINE_SSE41    r0, r2, r1, r3
+    SAD_16x16_LINE_SSE41    r0, r2, r1, r3
+    SAD_16x16_LINE_SSE41    r0, r2, r1, r3
+    SAD_16x16_LINE_SSE41E   r0, r2, r1, r3
 
-    pxor	xmm0,	xmm0
-    movdqa	xmm6,	xmm7
-    punpcklwd	xmm6,	xmm0
-    punpckhwd	xmm7,	xmm0
+    pxor    xmm0,   xmm0
+    movdqa  xmm6,   xmm7
+    punpcklwd   xmm6,   xmm0
+    punpckhwd   xmm7,   xmm0
 
-    movdqa	xmm5,	[r4]
-    movdqa	xmm4,	xmm5
-    punpcklwd	xmm4,	xmm0
-    punpckhwd	xmm5,	xmm0
+    movdqa  xmm5,   [r4]
+    movdqa  xmm4,   xmm5
+    punpcklwd   xmm4,   xmm0
+    punpckhwd   xmm5,   xmm0
 
-    paddd	xmm4,	xmm6
-    paddd	xmm5,	xmm7
-    movdqa	xmm3,	xmm4
-    pminud	xmm3,	xmm5
-    pshufd	xmm2,	xmm3,	01001110B
-    pminud	xmm2,	xmm3
-    pshufd	xmm3,	xmm2,	10110001B
-    pminud	xmm2,	xmm3
-    movd	retrd,	xmm2
-    pcmpeqd	xmm4,	xmm2
-    movmskps	r2d, xmm4
-    bsf		r1d,	r2d
-    jnz	near WRITE_INDEX
+    paddd   xmm4,   xmm6
+    paddd   xmm5,   xmm7
+    movdqa  xmm3,   xmm4
+    pminud  xmm3,   xmm5
+    pshufd  xmm2,   xmm3,   01001110B
+    pminud  xmm2,   xmm3
+    pshufd  xmm3,   xmm2,   10110001B
+    pminud  xmm2,   xmm3
+    movd    retrd,  xmm2
+    pcmpeqd xmm4,   xmm2
+    movmskps    r2d, xmm4
+    bsf     r1d,    r2d
+    jnz near WRITE_INDEX
 
-    pcmpeqd	xmm5,	xmm2
-    movmskps	r2d, xmm5
-    bsf		r1d,	r2d
-    add		r1d,	4
+    pcmpeqd xmm5,   xmm2
+    movmskps    r2d, xmm5
+    bsf     r1d,    r2d
+    add     r1d,    4
 
 WRITE_INDEX:
-    mov		[r5],	r1d
+    mov     [r5],   r1d
     POP_XMM
     LOAD_6_PARA_POP
     ret
@@ -158,66 +158,66 @@
 
 ;**********************************************************************************************************************************
 ;
-;	uint32_t SampleSad8x8Hor8_sse41( uint8_t *src, int32_t stride_src, uint8_t *ref, int32_t stride_ref, uint16_t base_cost[8], int32_t *index_min_cost )
+;   uint32_t SampleSad8x8Hor8_sse41( uint8_t *src, int32_t stride_src, uint8_t *ref, int32_t stride_ref, uint16_t base_cost[8], int32_t *index_min_cost )
 ;
-;	\note:
-;		src and ref is optional to align with 16 due inter 8x8
-;	\return value:
-;		return minimal SAD cost, according index carried by index_min_cost
+;   \note:
+;       src and ref is optional to align with 16 due inter 8x8
+;   \return value:
+;       return minimal SAD cost, according index carried by index_min_cost
 ;
 ;**********************************************************************************************************************************
 ; try 8 mv via offset
 ; xmm7 store sad costs
-%macro SAD_8x8_LINE_SSE41  4	; src, ref, stride_src, stride_ref
-    movdqu		xmm0, [%1]
-    movdqu		xmm1, [%2]
-    movdqa		xmm2, xmm1
+%macro SAD_8x8_LINE_SSE41  4    ; src, ref, stride_src, stride_ref
+    movdqu      xmm0, [%1]
+    movdqu      xmm1, [%2]
+    movdqa      xmm2, xmm1
 
-    mpsadbw		xmm1, xmm0, 0	; 000 B
-    paddw		xmm7, xmm1		; accumulate cost
+    mpsadbw     xmm1, xmm0, 0   ; 000 B
+    paddw       xmm7, xmm1      ; accumulate cost
 
-    mpsadbw		xmm2, xmm0, 5	; 101 B
-    paddw		xmm7, xmm2		; accumulate cost
+    mpsadbw     xmm2, xmm0, 5   ; 101 B
+    paddw       xmm7, xmm2      ; accumulate cost
 
-    add			%1, %3
-    add			%2, %4
-%endmacro	; end of SAD_8x8_LINE_SSE41
-%macro SAD_8x8_LINE_SSE41E  4	; src, ref, stride_src, stride_ref
-    movdqu		xmm0, [%1]
-    movdqu		xmm1, [%2]
-    movdqa		xmm2, xmm1
+    add         %1, %3
+    add         %2, %4
+%endmacro   ; end of SAD_8x8_LINE_SSE41
+%macro SAD_8x8_LINE_SSE41E  4   ; src, ref, stride_src, stride_ref
+    movdqu      xmm0, [%1]
+    movdqu      xmm1, [%2]
+    movdqa      xmm2, xmm1
 
-    mpsadbw		xmm1, xmm0, 0	; 000 B
-    paddw		xmm7, xmm1		; accumulate cost
+    mpsadbw     xmm1, xmm0, 0   ; 000 B
+    paddw       xmm7, xmm1      ; accumulate cost
 
-    mpsadbw		xmm2, xmm0, 5	; 101 B
-    paddw		xmm7, xmm2		; accumulate cost
-%endmacro	; end of SAD_8x8_LINE_SSE41E
+    mpsadbw     xmm2, xmm0, 5   ; 101 B
+    paddw       xmm7, xmm2      ; accumulate cost
+%endmacro   ; end of SAD_8x8_LINE_SSE41E
 
 WELS_EXTERN SampleSad8x8Hor8_sse41
     %assign  push_num 0
     LOAD_6_PARA
     PUSH_XMM 8
-    SIGN_EXTENSION	r1, r1d
-    SIGN_EXTENSION	r3, r3d
-    movdqa xmm7, [r4]	;	load base cost list
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
+    movdqa xmm7, [r4]   ;   load base cost list
 
-    SAD_8x8_LINE_SSE41	r0, r2, r1, r3
-    SAD_8x8_LINE_SSE41	r0, r2, r1, r3
-    SAD_8x8_LINE_SSE41	r0, r2, r1, r3
-    SAD_8x8_LINE_SSE41	r0, r2, r1, r3
+    SAD_8x8_LINE_SSE41  r0, r2, r1, r3
+    SAD_8x8_LINE_SSE41  r0, r2, r1, r3
+    SAD_8x8_LINE_SSE41  r0, r2, r1, r3
+    SAD_8x8_LINE_SSE41  r0, r2, r1, r3
 
-    SAD_8x8_LINE_SSE41	r0, r2, r1, r3
-    SAD_8x8_LINE_SSE41	r0, r2, r1, r3
-    SAD_8x8_LINE_SSE41	r0, r2, r1, r3
-    SAD_8x8_LINE_SSE41E	r0, r2, r1, r3
+    SAD_8x8_LINE_SSE41  r0, r2, r1, r3
+    SAD_8x8_LINE_SSE41  r0, r2, r1, r3
+    SAD_8x8_LINE_SSE41  r0, r2, r1, r3
+    SAD_8x8_LINE_SSE41E r0, r2, r1, r3
 
-    phminposuw	xmm0, xmm7	; horizon search the minimal sad cost and its index
-    movd	retrd, xmm0	; for return: DEST[15:0] <- MIN, DEST[31:16] <- INDEX
-    mov		r1d, retrd
-    and		retrd, 0xFFFF
-    sar		r1d, 16
-    mov		[r5], r1d
+    phminposuw  xmm0, xmm7  ; horizon search the minimal sad cost and its index
+    movd    retrd, xmm0 ; for return: DEST[15:0] <- MIN, DEST[31:16] <- INDEX
+    mov     r1d, retrd
+    and     retrd, 0xFFFF
+    sar     r1d, 16
+    mov     [r5], r1d
 
     POP_XMM
     LOAD_6_PARA_POP
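
Both SampleSad*Hor8 routines evaluate the same candidate set: the SAD of the source block against the reference block shifted horizontally by 0..7 bytes, each offset biased by base_cost[offset], with the winning offset reported through index_min_cost. A scalar sketch of the 8x8 variant, assuming that offset convention (which is what the mpsadbw immediates select):

    #include <stdint.h>
    #include <stdlib.h>

    /* Scalar model of SampleSad8x8Hor8: try 8 horizontal offsets of ref,
       add the per-offset base cost, return the minimum total cost. */
    static uint32_t SampleSad8x8Hor8_ref(const uint8_t *src, int32_t stride_src,
                                         const uint8_t *ref, int32_t stride_ref,
                                         const uint16_t base_cost[8],
                                         int32_t *index_min_cost) {
        uint32_t best = UINT32_MAX;
        for (int32_t k = 0; k < 8; k++) {
            uint32_t cost = base_cost[k];
            for (int y = 0; y < 8; y++)
                for (int x = 0; x < 8; x++)
                    cost += abs(src[y * stride_src + x] - ref[y * stride_ref + x + k]);
            if (cost < best) {
                best = cost;
                *index_min_cost = k;
            }
        }
        return best;
    }

The 16x16 version accumulates the eight costs in xmm7 across all rows first and only then adds base_cost and picks the minimum, but it performs the same search.
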
--- a/codec/encoder/core/x86/score.asm
+++ b/codec/encoder/core/x86/score.asm
@@ -104,32 +104,32 @@
 
 align 16
 high_mask_table:
-	db  0, 0, 0, 3, 0, 2, 3, 6, 0, 2
-	db  2, 5, 3, 5, 6, 9, 0, 1, 2, 5
-	db  2, 4, 5, 8, 3, 5, 5, 8, 6, 8
-	db  9,12, 0, 1, 1, 4, 2, 4, 5, 8
-	db  2, 4, 4, 7, 5, 7, 8,11, 3, 4
-	db  5, 8, 5, 7, 8,11, 6, 8, 8,11
-	db  9,11,12,15, 0, 1, 1, 4, 1, 3
-	db  4, 7, 2, 4, 4, 7, 5, 7, 8,11
-	db  2, 3, 4, 7, 4, 6, 7,10, 5, 7
-	db  7,10, 8,10,11,14, 3, 4, 4, 7
-	db  5, 7, 8,11, 5, 7, 7,10, 8,10
-	db 11,14, 6, 7, 8,11, 8,10,11,14
-	db  9,11,11,14,12,14,15,18, 0, 0
-	db  1, 4, 1, 3, 4, 7, 1, 3, 3, 6
-	db  4, 6, 7,10, 2, 3, 4, 7, 4, 6
-	db  7,10, 5, 7, 7,10, 8,10,11,14
-	db  2, 3, 3, 6, 4, 6, 7,10, 4, 6
-	db  6, 9, 7, 9,10,13, 5, 6, 7,10
-	db  7, 9,10,13, 8,10,10,13,11,13
-	db 14,17, 3, 4, 4, 7, 4, 6, 7,10
-	db  5, 7, 7,10, 8,10,11,14, 5, 6
-	db  7,10, 7, 9,10,13, 8,10,10,13
-	db 11,13,14,17, 6, 7, 7,10, 8,10
-	db 11,14, 8,10,10,13,11,13,14,17
-	db  9,10,11,14,11,13,14,17,12,14
-	db 14,17,15,17,18,21
+    db  0, 0, 0, 3, 0, 2, 3, 6, 0, 2
+    db  2, 5, 3, 5, 6, 9, 0, 1, 2, 5
+    db  2, 4, 5, 8, 3, 5, 5, 8, 6, 8
+    db  9,12, 0, 1, 1, 4, 2, 4, 5, 8
+    db  2, 4, 4, 7, 5, 7, 8,11, 3, 4
+    db  5, 8, 5, 7, 8,11, 6, 8, 8,11
+    db  9,11,12,15, 0, 1, 1, 4, 1, 3
+    db  4, 7, 2, 4, 4, 7, 5, 7, 8,11
+    db  2, 3, 4, 7, 4, 6, 7,10, 5, 7
+    db  7,10, 8,10,11,14, 3, 4, 4, 7
+    db  5, 7, 8,11, 5, 7, 7,10, 8,10
+    db 11,14, 6, 7, 8,11, 8,10,11,14
+    db  9,11,11,14,12,14,15,18, 0, 0
+    db  1, 4, 1, 3, 4, 7, 1, 3, 3, 6
+    db  4, 6, 7,10, 2, 3, 4, 7, 4, 6
+    db  7,10, 5, 7, 7,10, 8,10,11,14
+    db  2, 3, 3, 6, 4, 6, 7,10, 4, 6
+    db  6, 9, 7, 9,10,13, 5, 6, 7,10
+    db  7, 9,10,13, 8,10,10,13,11,13
+    db 14,17, 3, 4, 4, 7, 4, 6, 7,10
+    db  5, 7, 7,10, 8,10,11,14, 5, 6
+    db  7,10, 7, 9,10,13, 8,10,10,13
+    db 11,13,14,17, 6, 7, 7,10, 8,10
+    db 11,14, 8,10,10,13,11,13,14,17
+    db  9,10,11,14,11,13,14,17,12,14
+    db 14,17,15,17,18,21
 
 align 16
 low_mask_table:
@@ -167,78 +167,78 @@
 ;void WelsScan4x4DcAc_sse2( int16_t level[16], int16_t *pDct )
 ;***********************************************************************
 WELS_EXTERN WelsScan4x4DcAc_sse2
-	%ifdef X86_32
-	push r3
-	%assign push_num 1
-	%else
-	%assign push_num 0
-	%endif
-	LOAD_2_PARA
-	movdqa     xmm0, [r1]			; 7 6 5 4 3 2 1 0
-	movdqa     xmm1, [r1+16]		; f e d c b a 9 8
-	pextrw     r2d, xmm0, 7			; ecx = 7
-	pextrw     r3d, xmm1, 2			; edx = a
-	pextrw     r1d, xmm0, 5			; eax = 5
-	pinsrw     xmm1, r2d, 2			; f e d c b 7 9 8
-	pinsrw     xmm0, r1d, 7			; 5 6 5 4 3 2 1 0
-	pextrw     r2d, xmm1, 0			; ecx = 8
-	pinsrw     xmm0, r2d, 5			; 5 6 8 4 3 2 1 0
-	pinsrw     xmm1, r3d, 0			; f e d c b 7 9 a
-	pshufd     xmm2, xmm0, 0xd8		; 5 6 3 2 8 4 1 0
-	pshufd     xmm3, xmm1, 0xd8		; f e b 7 d c 9 a
-	pshufhw    xmm0, xmm2, 0x93		; 6 3 2 5 8 4 1 0
-	pshuflw    xmm1, xmm3, 0x39		; f e b 7 a d c 9
-	movdqa     [r0],xmm0
-	movdqa     [r0+16], xmm1
-	%ifdef X86_32
-	pop r3
-	%endif
-	ret
+    %ifdef X86_32
+    push r3
+    %assign push_num 1
+    %else
+    %assign push_num 0
+    %endif
+    LOAD_2_PARA
+    movdqa     xmm0, [r1]           ; 7 6 5 4 3 2 1 0
+    movdqa     xmm1, [r1+16]        ; f e d c b a 9 8
+    pextrw     r2d, xmm0, 7         ; ecx = 7
+    pextrw     r3d, xmm1, 2         ; edx = a
+    pextrw     r1d, xmm0, 5         ; eax = 5
+    pinsrw     xmm1, r2d, 2         ; f e d c b 7 9 8
+    pinsrw     xmm0, r1d, 7         ; 5 6 5 4 3 2 1 0
+    pextrw     r2d, xmm1, 0         ; ecx = 8
+    pinsrw     xmm0, r2d, 5         ; 5 6 8 4 3 2 1 0
+    pinsrw     xmm1, r3d, 0         ; f e d c b 7 9 a
+    pshufd     xmm2, xmm0, 0xd8     ; 5 6 3 2 8 4 1 0
+    pshufd     xmm3, xmm1, 0xd8     ; f e b 7 d c 9 a
+    pshufhw    xmm0, xmm2, 0x93     ; 6 3 2 5 8 4 1 0
+    pshuflw    xmm1, xmm3, 0x39     ; f e b 7 a d c 9
+    movdqa     [r0],xmm0
+    movdqa     [r0+16], xmm1
+    %ifdef X86_32
+    pop r3
+    %endif
+    ret
 
 ;***********************************************************************
 ;void WelsScan4x4DcAc_ssse3( int16_t level[16], int16_t *pDct )
 ;***********************************************************************
 WELS_EXTERN WelsScan4x4DcAc_ssse3
-	%assign push_num 0
-	LOAD_2_PARA
-	movdqa     xmm0, [r1]
-	movdqa     xmm1, [r1+16]
-	pextrw		r2d,  xmm0, 7			; ecx = [7]
-	pextrw		r1d,  xmm1, 0			; eax = [8]
-	pinsrw		xmm0, r1d, 7			; xmm0[7]	=	[8]
-	pinsrw		xmm1, r2d, 0			; xmm1[0]	=	[7]
-	pshufb		xmm1, [pb_scanacdc_maskb]
-	pshufb		xmm0, [pb_scanacdc_maska]
+    %assign push_num 0
+    LOAD_2_PARA
+    movdqa     xmm0, [r1]
+    movdqa     xmm1, [r1+16]
+    pextrw      r2d,  xmm0, 7           ; ecx = [7]
+    pextrw      r1d,  xmm1, 0           ; eax = [8]
+    pinsrw      xmm0, r1d, 7            ; xmm0[7]   =   [8]
+    pinsrw      xmm1, r2d, 0            ; xmm1[0]   =   [7]
+    pshufb      xmm1, [pb_scanacdc_maskb]
+    pshufb      xmm0, [pb_scanacdc_maska]
 
-	movdqa     [r0],xmm0
-	movdqa     [r0+16], xmm1
-	ret
+    movdqa     [r0],xmm0
+    movdqa     [r0+16], xmm1
+    ret
 ;***********************************************************************
 ;void WelsScan4x4Ac_sse2( int16_t* zig_value, int16_t* pDct )
 ;***********************************************************************
 WELS_EXTERN WelsScan4x4Ac_sse2
-	%assign push_num 0
-	LOAD_2_PARA
-	movdqa     xmm0, [r1]
-	movdqa     xmm1, [r1+16]
-	movdqa     xmm2, xmm0
-	punpcklqdq xmm0, xmm1
-	punpckhqdq xmm2, xmm1
+    %assign push_num 0
+    LOAD_2_PARA
+    movdqa     xmm0, [r1]
+    movdqa     xmm1, [r1+16]
+    movdqa     xmm2, xmm0
+    punpcklqdq xmm0, xmm1
+    punpckhqdq xmm2, xmm1
 
-	movdqa     xmm3, xmm0
-	punpckldq  xmm0, xmm2
-	punpckhdq  xmm3, xmm2
-	pextrw     r1d , xmm0, 3
-	pextrw     r2d , xmm0, 7
-	pinsrw     xmm0, r1d,  7
-	pextrw     r1d,  xmm3, 4
-	pinsrw     xmm3, r2d,  4
-	pextrw     r2d,  xmm3, 0
-	pinsrw     xmm3, r1d,  0
-	pinsrw     xmm0, r2d,  3
+    movdqa     xmm3, xmm0
+    punpckldq  xmm0, xmm2
+    punpckhdq  xmm3, xmm2
+    pextrw     r1d , xmm0, 3
+    pextrw     r2d , xmm0, 7
+    pinsrw     xmm0, r1d,  7
+    pextrw     r1d,  xmm3, 4
+    pinsrw     xmm3, r2d,  4
+    pextrw     r2d,  xmm3, 0
+    pinsrw     xmm3, r1d,  0
+    pinsrw     xmm0, r2d,  3
 
-	pshufhw    xmm1, xmm0, 0x93
-	pshuflw    xmm2, xmm3, 0x39
+    pshufhw    xmm1, xmm0, 0x93
+    pshuflw    xmm2, xmm3, 0x39
 
     movdqa     xmm3, xmm2
     psrldq     xmm1, 2
@@ -245,9 +245,9 @@
     pslldq     xmm3, 14
     por        xmm1, xmm3
     psrldq     xmm2, 2
-	movdqa     [r0],xmm1
-	movdqa     [r0+16], xmm2
-	ret
+    movdqa     [r0],xmm1
+    movdqa     [r0+16], xmm2
+    ret
 
 
 ;***********************************************************************
@@ -254,19 +254,19 @@
 ;void int32_t WelsCalculateSingleCtr4x4_sse2( int16_t *pDct );
 ;***********************************************************************
 WELS_EXTERN WelsCalculateSingleCtr4x4_sse2
-	%ifdef X86_32
-	push r3
-	%assign push_num 1
-	%else
-	%assign push_num 0
-	%endif
-	LOAD_1_PARA
-	movdqa    xmm0, [r0]
-	movdqa    xmm1, [r0+16]
+    %ifdef X86_32
+    push r3
+    %assign push_num 1
+    %else
+    %assign push_num 0
+    %endif
+    LOAD_1_PARA
+    movdqa    xmm0, [r0]
+    movdqa    xmm1, [r0+16]
 
-	packsswb  xmm0, xmm1
-	; below is the register map: r0 - eax, r1 - ebx, r2 - ecx, r3 - edx
-	xor r3, r3
+    packsswb  xmm0, xmm1
+    ; below is the register map: r0 - eax, r1 - ebx, r2 - ecx, r3 - edx
+    xor r3, r3
     pxor      xmm3, xmm3
     pcmpeqb   xmm0, xmm3
     pmovmskb  r3d,  xmm0
@@ -273,39 +273,39 @@
 
     xor       r3,  0xffff
 
-	xor       r0,  r0
-	mov       r2,  7
-	mov       r1,  8
+    xor       r0,  r0
+    mov       r2,  7
+    mov       r1,  8
 .loop_low8_find1:
-	bt        r3,  r2
-	jc        .loop_high8_find1
-	dec		  r2
-	jnz      .loop_low8_find1
+    bt        r3,  r2
+    jc        .loop_high8_find1
+    dec       r2
+    jnz      .loop_low8_find1
 .loop_high8_find1:
-	bt        r3, r1
-	jc        .find1end
-	inc       r1
-	cmp       r1,16
-	jb        .loop_high8_find1
+    bt        r3, r1
+    jc        .find1end
+    inc       r1
+    cmp       r1,16
+    jb        .loop_high8_find1
 .find1end:
-	sub       r1, r2
-	sub       r1, 1
-	lea	  r2,  [i_ds_table]
-	add       r0b,  [r2+r1]
-	mov       r1, r3
-	and       r3, 0xff
-	shr       r1, 8
-	and       r1, 0xff
-	lea	  r2 , [low_mask_table]
-	add       r0b,  [r2 +r3]
-	lea	  r2, [high_mask_table]
-	add       r0b,  [r2+r1]
-	%ifdef X86_32
-	pop r3
-	%else
-	mov retrd, r0d
-	%endif
-	ret
+    sub       r1, r2
+    sub       r1, 1
+    lea   r2,  [i_ds_table]
+    add       r0b,  [r2+r1]
+    mov       r1, r3
+    and       r3, 0xff
+    shr       r1, 8
+    and       r1, 0xff
+    lea   r2 , [low_mask_table]
+    add       r0b,  [r2 +r3]
+    lea   r2, [high_mask_table]
+    add       r0b,  [r2+r1]
+    %ifdef X86_32
+    pop r3
+    %else
+    mov retrd, r0d
+    %endif
+    ret
 
 
 ;***********************************************************************
@@ -312,28 +312,28 @@
 ; int32_t WelsGetNoneZeroCount_sse2(int16_t* level);
 ;***********************************************************************
 WELS_EXTERN WelsGetNoneZeroCount_sse2
-	%assign push_num 0
-	LOAD_1_PARA
-	movdqa    xmm0, [r0]
-	movdqa    xmm1, [r0+16]
-	pxor      xmm2, xmm2
-	pcmpeqw   xmm0, xmm2
-	pcmpeqw   xmm1, xmm2
-	packsswb  xmm1, xmm0
-	xor r1, r1
-	pmovmskb  r1d,  xmm1
-	xor       r1d,  0xffff
-	mov       r2,  r1
-	and       r1,  0xff
-	shr       r2,  8
-;	and       ecx,  0xff	; we do not need this due to high 16bits equal to 0 yet
-;	xor       retr,  retr
-	;add       al,  [nozero_count_table+r2]
-	lea 	  r0 , [nozero_count_table]
-	movzx	  r2, byte [r0+r2]
-	movzx	  r1,   byte [r0+r1]
-	mov	  retrq, r2
-	add	  retrq, r1
-	;add       al,  [nozero_count_table+r1]
-	ret
+    %assign push_num 0
+    LOAD_1_PARA
+    movdqa    xmm0, [r0]
+    movdqa    xmm1, [r0+16]
+    pxor      xmm2, xmm2
+    pcmpeqw   xmm0, xmm2
+    pcmpeqw   xmm1, xmm2
+    packsswb  xmm1, xmm0
+    xor r1, r1
+    pmovmskb  r1d,  xmm1
+    xor       r1d,  0xffff
+    mov       r2,  r1
+    and       r1,  0xff
+    shr       r2,  8
+;   and       ecx,  0xff    ; we do not need this due to high 16bits equal to 0 yet
+;   xor       retr,  retr
+    ;add       al,  [nozero_count_table+r2]
+    lea       r0 , [nozero_count_table]
+    movzx     r2, byte [r0+r2]
+    movzx     r1,   byte [r0+r1]
+    mov   retrq, r2
+    add   retrq, r1
+    ;add       al,  [nozero_count_table+r1]
+    ret
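
WelsGetNoneZeroCount_sse2 reduces to a popcount of the non-zero coefficients: the pcmpeqw/packsswb/pmovmskb sequence builds a 16-bit mask with one bit per int16_t level, and nozero_count_table supplies the per-byte bit counts. A scalar sketch of the same contract:

    #include <stdint.h>

    /* Scalar model of WelsGetNoneZeroCount_sse2: count non-zero values
       among the 16 int16_t coefficients of a 4x4 block. */
    static int32_t GetNoneZeroCount_ref(const int16_t *level) {
        int32_t count = 0;
        for (int i = 0; i < 16; i++)
            count += (level[i] != 0);
        return count;
    }
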
 
--- a/codec/processing/src/arm/adaptive_quantization.S
+++ b/codec/processing/src/arm/adaptive_quantization.S
@@ -36,17 +36,17 @@
 
 #ifdef __APPLE__
 .macro SQR_ADD_16BYTES
-	vmull.u8 q3, $0, $0
-	vmull.u8 q8, $1, $1
-	vpadal.u16 $2, q3
-	vpadal.u16 $2, q8
+    vmull.u8 q3, $0, $0
+    vmull.u8 q8, $1, $1
+    vpadal.u16 $2, q3
+    vpadal.u16 $2, q8
 .endm
 #else
 .macro SQR_ADD_16BYTES arg0, arg1, arg2
-	vmull.u8 q3, \arg0, \arg0
-	vmull.u8 q8, \arg1, \arg1
-	vpadal.u16 \arg2, q3
-	vpadal.u16 \arg2, q8
+    vmull.u8 q3, \arg0, \arg0
+    vmull.u8 q8, \arg1, \arg1
+    vpadal.u16 \arg2, q3
+    vpadal.u16 \arg2, q8
 .endm
 #endif
 
@@ -54,66 +54,66 @@
 WELS_ASM_FUNC_BEGIN SampleVariance16x16_neon
     stmdb sp!, {r4}
 
-	vld1.8   {q15}, [r0], r1 //save the ref data (16bytes)
-	vld1.8   {q14}, [r2], r3 //save the src data (16bytes)
+    vld1.8   {q15}, [r0], r1 //save the ref data (16bytes)
+    vld1.8   {q14}, [r2], r3 //save the src data (16bytes)
 
 
-	vabd.u8  q13, q14, q15
-	vmull.u8 q12, d27, d27
-	vmull.u8 q11, d26, d26
-	vaddl.u16 q12, d24, d25
-	vpadal.u16 q12, q11     //sqr
+    vabd.u8  q13, q14, q15
+    vmull.u8 q12, d27, d27
+    vmull.u8 q11, d26, d26
+    vaddl.u16 q12, d24, d25
+    vpadal.u16 q12, q11     //sqr
 
     vaddl.u8 q13, d26, d27 //sum
 
-	vaddl.u8 q10, d28, d29 //sum_cur
+    vaddl.u8 q10, d28, d29 //sum_cur
 
-	vmull.u8 q9,  d29, d29
-	vmull.u8 q8,  d28, d28
-	vaddl.u16 q9, d18, d19       //sqr_cur
-	vpadal.u16 q9, q8
+    vmull.u8 q9,  d29, d29
+    vmull.u8 q8,  d28, d28
+    vaddl.u16 q9, d18, d19       //sqr_cur
+    vpadal.u16 q9, q8
 
-	mov r4, #15
+    mov r4, #15
 pixel_var_16x16_loop0:
 
-	vld1.8 {q0}, [r0], r1 //save the ref data (16bytes)
-	vld1.8 {q1}, [r2], r3 //save the src data (16bytes)
+    vld1.8 {q0}, [r0], r1 //save the ref data (16bytes)
+    vld1.8 {q1}, [r2], r3 //save the src data (16bytes)
 
-	vabd.u8 q2, q0, q1
+    vabd.u8 q2, q0, q1
 
-	//q10 save sum_cur
-	vpadal.u8 q10, q1
+    //q10 save sum_cur
+    vpadal.u8 q10, q1
 
-	//q12 save sqr
-	SQR_ADD_16BYTES d4, d5, q12
+    //q12 save sqr
+    SQR_ADD_16BYTES d4, d5, q12
 
     //q13 save sum
-	vpadal.u8 q13, q2
+    vpadal.u8 q13, q2
 
-	subs r4, #1
+    subs r4, #1
 
-	//q9 save sqr_cur
-	SQR_ADD_16BYTES d2, d3, q9
+    //q9 save sqr_cur
+    SQR_ADD_16BYTES d2, d3, q9
 
-	bne pixel_var_16x16_loop0
+    bne pixel_var_16x16_loop0
 
-	vadd.u16 d0, d26, d27 //sum
-	vadd.u16 d1, d20, d21 //sum_cur
-	vpaddl.u16 q0, q0
-	vadd.u32 d2, d24, d25 //sqr
-	vadd.u32 d3, d18, d19 //sqr_cur
-	vpadd.u32 d0, d0, d1
-	vpadd.u32 d1, d2, d3
+    vadd.u16 d0, d26, d27 //sum
+    vadd.u16 d1, d20, d21 //sum_cur
+    vpaddl.u16 q0, q0
+    vadd.u32 d2, d24, d25 //sqr
+    vadd.u32 d3, d18, d19 //sqr_cur
+    vpadd.u32 d0, d0, d1
+    vpadd.u32 d1, d2, d3
 
-	ldr       r4, [sp, #4]
+    ldr       r4, [sp, #4]
 
-	vshr.u32  q0, q0, #8
-	vmul.u32  d0, d0
-	vsub.u32  d0, d1, d0
+    vshr.u32  q0, q0, #8
+    vmul.u32  d0, d0
+    vsub.u32  d0, d1, d0
     vmovl.u32 q0, d0
-	vst2.16  {d0[0], d1[0]}, [r4]
+    vst2.16  {d0[0], d1[0]}, [r4]
 
-	ldmia sp!, {r4}
+    ldmia sp!, {r4}
 
 WELS_ASM_FUNC_END
 
--- a/codec/processing/src/arm/down_sample_neon.S
+++ b/codec/processing/src/arm/down_sample_neon.S
@@ -30,196 +30,196 @@
  *
  */
 
-#ifdef	HAVE_NEON
+#ifdef  HAVE_NEON
 .text
 #include "arm_arch_common_macro.S"
 
 
-WELS_ASM_FUNC_BEGIN	DyadicBilinearDownsampler_neon
-	stmdb	sp!, {r4-r8, lr}
+WELS_ASM_FUNC_BEGIN DyadicBilinearDownsampler_neon
+    stmdb   sp!, {r4-r8, lr}
 
-	//Get	the	width	and	height
-	ldr	 r4, [sp,	#24]	//src_width
-	ldr	 r5, [sp,	#28]	//src_height
+    //Get   the width   and height
+    ldr  r4, [sp,   #24]    //src_width
+    ldr  r5, [sp,   #28]    //src_height
 
-	//Initialize the register
-	mov	r6,	r2
-	mov	r8,	r0
-	mov	lr,	#0
-	lsr	r5,	#1
+    //Initialize the register
+    mov r6, r2
+    mov r8, r0
+    mov lr, #0
+    lsr r5, #1
 
-	//Save the tailer	for	the	unasigned	size
-	mla	 r7, r1, r5, r0
-	vld1.32	{q15}, [r7]
+    //Save the tailer   for the unasigned   size
+    mla  r7, r1, r5, r0
+    vld1.32 {q15}, [r7]
 
-	add	r7,	r2,	r3
-	//processing a colume	data
+    add r7, r2, r3
+    //processing a colume   data
 comp_ds_bilinear_loop0:
 
-	vld1.8 {q0,q1},	[r2]!
-	vld1.8 {q2,q3},	[r7]!
-	vpaddl.u8	q0,	q0
-	vpaddl.u8	q1,	q1
-	vpaddl.u8	q2,	q2
-	vpaddl.u8	q3,	q3
-	vrshr.u16	q0,	#1
-	vrshr.u16	q1,	#1
-	vrshr.u16	q2,	#1
-	vrshr.u16	q3,	#1
-	vrhadd.u16 q0, q2
-	vrhadd.u16 q1, q3
-	vmovn.u16	d0,	q0
-	vmovn.u16	d1,	q1
-	vst1.32	{q0},	[r0]!
-	add	lr,	#32
+    vld1.8 {q0,q1}, [r2]!
+    vld1.8 {q2,q3}, [r7]!
+    vpaddl.u8   q0, q0
+    vpaddl.u8   q1, q1
+    vpaddl.u8   q2, q2
+    vpaddl.u8   q3, q3
+    vrshr.u16   q0, #1
+    vrshr.u16   q1, #1
+    vrshr.u16   q2, #1
+    vrshr.u16   q3, #1
+    vrhadd.u16 q0, q2
+    vrhadd.u16 q1, q3
+    vmovn.u16   d0, q0
+    vmovn.u16   d1, q1
+    vst1.32 {q0},   [r0]!
+    add lr, #32
 
-	cmp	lr,	r4
-	movcs	lr,	#0
-	addcs	r6,	r6,	r3,	lsl	#1
-	movcs	r2,	r6
-	addcs	r7,	r2,	r3
-	addcs	r8,	r1
-	movcs	r0,	r8
-	subscs r5, #1
-	bne	comp_ds_bilinear_loop0
+    cmp lr, r4
+    movcs   lr, #0
+    addcs   r6, r6, r3, lsl #1
+    movcs   r2, r6
+    addcs   r7, r2, r3
+    addcs   r8, r1
+    movcs   r0, r8
+    subscs r5, #1
+    bne comp_ds_bilinear_loop0
 
-	//restore	the	tailer for the unasigned size
-	vst1.32	{q15}, [r0]
+    //restore   the tailer for the unasigned size
+    vst1.32 {q15}, [r0]
 
-	ldmia	sp!, {r4-r8,lr}
+    ldmia   sp!, {r4-r8,lr}
 WELS_ASM_FUNC_END
 
 
-WELS_ASM_FUNC_BEGIN	comp_ds_bilinear_w_x8_neon
-    stmdb	sp!, {r4-r7, lr}
+WELS_ASM_FUNC_BEGIN comp_ds_bilinear_w_x8_neon
+    stmdb   sp!, {r4-r7, lr}
 
-    //Get	the	width	and	height
-	ldr	 r4, [sp,	#20]	//src_width
-	ldr	 r5, [sp,	#24]	//src_height
+    //Get   the width   and height
+    ldr  r4, [sp,   #20]    //src_width
+    ldr  r5, [sp,   #24]    //src_height
 
-	//Get	the	difference
-	sub	lr,	r3,	r4
-	sub	r1,	r1,	r4,	lsr	#1
+    //Get   the difference
+    sub lr, r3, r4
+    sub r1, r1, r4, lsr #1
 
-	lsr	r5,	#1
+    lsr r5, #1
 
-	//processing a colume	data
+    //processing a colume   data
 comp_ds_bilinear_w_x8_loop0:
 
-	lsr	r6,	r4,	#3
-	add	r7,	r2,	r3
-	//processing a line	data
+    lsr r6, r4, #3
+    add r7, r2, r3
+    //processing a line data
 comp_ds_bilinear_w_x8_loop1:
 
-	vld1.8 {d0}, [r2]!
-	vld1.8 {d1}, [r7]!
-	vpaddl.u8	q0,	q0
-	vrshr.u16	q0,	#1
-	vrhadd.u16 d0, d1
+    vld1.8 {d0}, [r2]!
+    vld1.8 {d1}, [r7]!
+    vpaddl.u8   q0, q0
+    vrshr.u16   q0, #1
+    vrhadd.u16 d0, d1
 
-	vmovn.u16	d0,	q0
-	vst1.32	{d0[0]}, [r0]!
-	subs r6, #1
-	bne	comp_ds_bilinear_w_x8_loop1
+    vmovn.u16   d0, q0
+    vst1.32 {d0[0]}, [r0]!
+    subs r6, #1
+    bne comp_ds_bilinear_w_x8_loop1
 
-	add	r2,	r7,	lr
-	add	r0,	r1
-	subs r5, #1
-	bne	comp_ds_bilinear_w_x8_loop0
+    add r2, r7, lr
+    add r0, r1
+    subs r5, #1
+    bne comp_ds_bilinear_w_x8_loop0
 
-    ldmia	sp!, {r4-r7,lr}
+    ldmia   sp!, {r4-r7,lr}
 WELS_ASM_FUNC_END
 
 
-WELS_ASM_FUNC_BEGIN	comp_ds_bilinear_w_x16_neon
-    stmdb	sp!, {r4-r7, lr}
+WELS_ASM_FUNC_BEGIN comp_ds_bilinear_w_x16_neon
+    stmdb   sp!, {r4-r7, lr}
 
-    //Get	the	width	and	height
-	ldr	 r4, [sp,	#20]	//src_width
-	ldr	 r5, [sp,	#24]	//src_height
+    //Get   the width   and height
+    ldr  r4, [sp,   #20]    //src_width
+    ldr  r5, [sp,   #24]    //src_height
 
-	//Get	the	difference
-	sub	lr,	r3,	r4
-	sub	r1,	r1,	r4,	lsr	#1
+    //Get   the difference
+    sub lr, r3, r4
+    sub r1, r1, r4, lsr #1
 
-	lsr	r5,	#1
+    lsr r5, #1
 
-	//processing a colume	data
+    //processing a colume   data
 comp_ds_bilinear_w_x16_loop0:
 
-	lsr	r6,	r4,	#4
-	add	r7,	r2,	r3
-	//processing a line	data
+    lsr r6, r4, #4
+    add r7, r2, r3
+    //processing a line data
 comp_ds_bilinear_w_x16_loop1:
 
-	vld1.8 {q0}, [r2]!
-	vld1.8 {q1}, [r7]!
-	vpaddl.u8	q0,	q0
-	vpaddl.u8	q1,	q1
-	vrshr.u16	q0,	#1
-	vrshr.u16	q1,	#1
-	vrhadd.u16 q0, q1
+    vld1.8 {q0}, [r2]!
+    vld1.8 {q1}, [r7]!
+    vpaddl.u8   q0, q0
+    vpaddl.u8   q1, q1
+    vrshr.u16   q0, #1
+    vrshr.u16   q1, #1
+    vrhadd.u16 q0, q1
 
-	vmovn.u16	d0,	q0
-	vst1.32	{d0},	[r0]!
-	subs r6, #1
-	bne	comp_ds_bilinear_w_x16_loop1
+    vmovn.u16   d0, q0
+    vst1.32 {d0},   [r0]!
+    subs r6, #1
+    bne comp_ds_bilinear_w_x16_loop1
 
-	add	r2,	r7,	lr
-	add	r0,	r1
-	subs r5, #1
-	bne	comp_ds_bilinear_w_x16_loop0
+    add r2, r7, lr
+    add r0, r1
+    subs r5, #1
+    bne comp_ds_bilinear_w_x16_loop0
 
-	ldmia	sp!, {r4-r7,lr}
+    ldmia   sp!, {r4-r7,lr}
 WELS_ASM_FUNC_END
 
 
-WELS_ASM_FUNC_BEGIN	DyadicBilinearDownsamplerWidthx32_neon
-	stmdb	sp!, {r4-r7, lr}
+WELS_ASM_FUNC_BEGIN DyadicBilinearDownsamplerWidthx32_neon
+    stmdb   sp!, {r4-r7, lr}
 
-	//Get	the	width	and	height
-	ldr	 r4, [sp,	#20]	//src_width
-	ldr	 r5, [sp,	#24]	//src_height
+    //Get   the width   and height
+    ldr  r4, [sp,   #20]    //src_width
+    ldr  r5, [sp,   #24]    //src_height
 
-	//Get	the	difference
-	sub	lr,	r3,	r4
-	sub	r1,	r1,	r4,	lsr	#1
+    //Get   the difference
+    sub lr, r3, r4
+    sub r1, r1, r4, lsr #1
 
-	lsr	r5,	#1
+    lsr r5, #1
 
-	//processing a colume	data
+    //processing a colume   data
 comp_ds_bilinear_w_x32_loop0:
 
-	lsr	r6,	r4,	#5
-	add	r7,	r2,	r3
-	//processing a line	data
+    lsr r6, r4, #5
+    add r7, r2, r3
+    //processing a line data
 comp_ds_bilinear_w_x32_loop1:
 
-	vld1.8 {q0,q1},	[r2]!
-	vld1.8 {q2,q3},	[r7]!
-	vpaddl.u8	q0,	q0
-	vpaddl.u8	q1,	q1
-	vpaddl.u8	q2,	q2
-	vpaddl.u8	q3,	q3
-	vrshr.u16	q0,	#1
-	vrshr.u16	q1,	#1
-	vrshr.u16	q2,	#1
-	vrshr.u16	q3,	#1
-	vrhadd.u16 q0, q2
-	vrhadd.u16 q1, q3
+    vld1.8 {q0,q1}, [r2]!
+    vld1.8 {q2,q3}, [r7]!
+    vpaddl.u8   q0, q0
+    vpaddl.u8   q1, q1
+    vpaddl.u8   q2, q2
+    vpaddl.u8   q3, q3
+    vrshr.u16   q0, #1
+    vrshr.u16   q1, #1
+    vrshr.u16   q2, #1
+    vrshr.u16   q3, #1
+    vrhadd.u16 q0, q2
+    vrhadd.u16 q1, q3
 
-	vmovn.u16	d0,	q0
-	vmovn.u16	d1,	q1
-	vst1.32	{q0},	[r0]!
-	subs r6, #1
-	bne	comp_ds_bilinear_w_x32_loop1
+    vmovn.u16   d0, q0
+    vmovn.u16   d1, q1
+    vst1.32 {q0},   [r0]!
+    subs r6, #1
+    bne comp_ds_bilinear_w_x32_loop1
 
-	add	r2,	r7,	lr
-	add	r0,	r1
-	subs r5, #1
-	bne	comp_ds_bilinear_w_x32_loop0
+    add r2, r7, lr
+    add r0, r1
+    subs r5, #1
+    bne comp_ds_bilinear_w_x32_loop0
 
-	ldmia	sp!, {r4-r7,lr}
+    ldmia   sp!, {r4-r7,lr}
 WELS_ASM_FUNC_END
 
 
@@ -226,117 +226,117 @@
 WELS_ASM_FUNC_BEGIN GeneralBilinearAccurateDownsampler_neon
     stmdb sp!, {r4-r12, lr}
 
-	//Get the data from stack
-	ldr r4, [sp, #40] //the addr of src
-	ldr r5, [sp, #44] //the value of src_stride
+    //Get the data from stack
+    ldr r4, [sp, #40] //the addr of src
+    ldr r5, [sp, #44] //the value of src_stride
     ldr r6, [sp, #48] //the value of scaleX
     ldr r7, [sp, #52] //the value of scaleY
 
     mov     r10, #32768
     sub     r10, #1
-    and		r8, r6, r10			// r8 uinc(scaleX mod 32767)
+    and     r8, r6, r10         // r8 uinc(scaleX mod 32767)
     mov     r11, #-1
-	mul		r11, r8			// r11 -uinc
+    mul     r11, r8         // r11 -uinc
 
     vdup.s16 d2, r8
     vdup.s16 d0, r11
     vzip.s16 d0, d2         // uinc -uinc uinc -uinc
 
-	and		r9, r7, r10			// r9 vinc(scaleY mod 32767)
+    and     r9, r7, r10         // r9 vinc(scaleY mod 32767)
     mov     r11, #-1
-	mul		r11, r9			// r11 -vinc
+    mul     r11, r9         // r11 -vinc
 
-	vdup.s16 d2, r9
-	vdup.s16 d3, r11
-	vext.8   d5, d3, d2, #4		// vinc vinc -vinc -vinc
+    vdup.s16 d2, r9
+    vdup.s16 d3, r11
+    vext.8   d5, d3, d2, #4     // vinc vinc -vinc -vinc
 
-    mov		 r11, #0x40000000
+    mov      r11, #0x40000000
     mov      r12, #0x4000
     sub      r12, #1
     add      r11, r12
-	vdup.s32 d1, r11;			//init u  16384 16383 16384 16383
+    vdup.s32 d1, r11;           //init u  16384 16383 16384 16383
 
-	mov		 r11, #16384
+    mov      r11, #16384
     vdup.s16 d16, r11
     sub      r11, #1
-	vdup.s16 d17, r11
-	vext.8	 d7, d17, d16, #4		//init v  16384 16384 16383 16383
+    vdup.s16 d17, r11
+    vext.8   d7, d17, d16, #4       //init v  16384 16384 16383 16383
 
-	veor    q14,     q14
-	sub		r1,		r2			// stride - width
-	mov		r8,		#16384		// yInverse
-	sub		r3,		#1
+    veor    q14,     q14
+    sub     r1,     r2          // stride - width
+    mov     r8,     #16384      // yInverse
+    sub     r3,     #1
 
 _HEIGHT:
     ldr     r4, [sp, #40]           //the addr of src
-    mov		r11,	r8
-    lsr		r11,	#15
-	mul		r11,	r5
-	add		r11,	r4					// get current row address
-	mov		r12,	r11
-	add		r12,	r5
+    mov     r11,    r8
+    lsr     r11,    #15
+    mul     r11,    r5
+    add     r11,    r4                  // get current row address
+    mov     r12,    r11
+    add     r12,    r5
 
-	mov		r9,		#16384				// xInverse
-	sub		r10, r2, #1
+    mov     r9,     #16384              // xInverse
+    sub     r10, r2, #1
     vmov.s16 d6, d1
 
 _WIDTH:
-	mov		lr,		r9
-    lsr		lr,		#15
+    mov     lr,     r9
+    lsr     lr,     #15
     add     r4,     r11,lr
-	vld2.8	{d28[0],d29[0]},	[r4]		//q14: 0000000b0000000a;
+    vld2.8  {d28[0],d29[0]},    [r4]        //q14: 0000000b0000000a;
     add     r4,     r12,lr
-	vld2.8	{d28[4],d29[4]},	[r4]		//q14: 000d000b000c000a;
-	vzip.32		d28, d29					//q14: 000d000c000b000a;
+    vld2.8  {d28[4],d29[4]},    [r4]        //q14: 000d000b000c000a;
+    vzip.32     d28, d29                    //q14: 000d000c000b000a;
 
-	vmull.u16	q13, d6, d7			//q13: init u  *  init  v
-	vmull.u32	q12, d26,d28
-	vmlal.u32	q12, d27,d29
-	vqadd.u64	d24, d24,d25
-	vrshr.u64	d24, #30
+    vmull.u16   q13, d6, d7         //q13: init u  *  init  v
+    vmull.u32   q12, d26,d28
+    vmlal.u32   q12, d27,d29
+    vqadd.u64   d24, d24,d25
+    vrshr.u64   d24, #30
 
-	vst1.8	{d24[0]},	[r0]!
-	add		r9,	r6
-	vadd.u16	d6, d0				// inc u
-	vshl.u16	d6, #1
-	vshr.u16	d6, #1
-	subs	r10, #1
-	bne		_WIDTH
+    vst1.8  {d24[0]},   [r0]!
+    add     r9, r6
+    vadd.u16    d6, d0              // inc u
+    vshl.u16    d6, #1
+    vshr.u16    d6, #1
+    subs    r10, #1
+    bne     _WIDTH
 
 WIDTH_END:
-    lsr		r9,		#15
+    lsr     r9,     #15
     add     r4,r11,r9
-	vld1.8	{d24[0]},	[r4]
-	vst1.8	{d24[0]},   [r0]
-	add		r0,		#1
-	add		r8,		r7
-	add		r0,		r1
-	vadd.s16	d7,	d5				// inc v
-	vshl.u16	d7, #1
-	vshr.u16	d7, #1
-	subs	r3,		#1
-	bne		_HEIGHT
+    vld1.8  {d24[0]},   [r4]
+    vst1.8  {d24[0]},   [r0]
+    add     r0,     #1
+    add     r8,     r7
+    add     r0,     r1
+    vadd.s16    d7, d5              // inc v
+    vshl.u16    d7, #1
+    vshr.u16    d7, #1
+    subs    r3,     #1
+    bne     _HEIGHT
 
 LAST_ROW:
     ldr     r4, [sp, #40]           //the addr of src
-    lsr		r8,	#15
-	mul		r8, r5
-	add		r4,	r8					// get current row address
-	mov		r9,		#16384
+    lsr     r8, #15
+    mul     r8, r5
+    add     r4, r8                  // get current row address
+    mov     r9,     #16384
 
 _LAST_ROW_WIDTH:
-	mov		r11,	r9
-    lsr		r11,	#15
+    mov     r11,    r9
+    lsr     r11,    #15
 
-	add     r3,     r4,r11
-	vld1.8	{d0[0]},	[r3]
-	vst1.8	{d0[0]},	[r0]
-	add		r0,		#1
-	add		r9,		r6
-	subs	r2,		#1
-	bne		_LAST_ROW_WIDTH
+    add     r3,     r4,r11
+    vld1.8  {d0[0]},    [r3]
+    vst1.8  {d0[0]},    [r0]
+    add     r0,     #1
+    add     r9,     r6
+    subs    r2,     #1
+    bne     _LAST_ROW_WIDTH
 
-	ldmia sp!, {r4-r12, lr}
+    ldmia sp!, {r4-r12, lr}
 WELS_ASM_FUNC_END
 
 #endif
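
The DyadicBilinearDownsampler_neon variants above halve a plane in both dimensions: each output pixel comes from a 2x2 input block via pairwise adds (vpaddl.u8), rounded halving shifts (vrshr #1) and a rounded halving add across the two rows (vrhadd), while GeneralBilinearAccurateDownsampler_neon later in the same file interpolates with 15-bit fixed-point u/v increments. A plain C sketch of the dyadic per-pixel arithmetic, with hypothetical names and assuming even source dimensions:

    #include <stdint.h>

    /* 2:1 dyadic bilinear downsample of one plane, using the same
     * two-stage rounding as the NEON code (vrshr #1, then vrhadd). */
    static void DyadicBilinearDownsampleC (uint8_t* pDst, int iDstStride,
                                           const uint8_t* pSrc, int iSrcStride,
                                           int iSrcWidth, int iSrcHeight) {
        for (int y = 0; y < iSrcHeight / 2; y++) {
            const uint8_t* pRow0 = pSrc + 2 * y * iSrcStride;
            const uint8_t* pRow1 = pRow0 + iSrcStride;
            for (int x = 0; x < iSrcWidth / 2; x++) {
                int iTop = (pRow0[2 * x] + pRow0[2 * x + 1] + 1) >> 1;
                int iBot = (pRow1[2 * x] + pRow1[2 * x + 1] + 1) >> 1;
                pDst[y * iDstStride + x] = (uint8_t) ((iTop + iBot + 1) >> 1);
            }
        }
    }
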
--- a/codec/processing/src/arm/pixel_sad_neon.S
+++ b/codec/processing/src/arm/pixel_sad_neon.S
@@ -37,32 +37,32 @@
 
 WELS_ASM_FUNC_BEGIN WelsProcessingSampleSad8x8_neon
     stmdb sp!, {lr}
-	//Loading a horizontal line data (8 bytes)
-	vld1.8 {d0}, [r0], r1
-	vld1.8 {d1}, [r2], r3
+    //Loading a horizontal line data (8 bytes)
+    vld1.8 {d0}, [r0], r1
+    vld1.8 {d1}, [r2], r3
 
-	//Do the SAD for 8 bytes
-	vabdl.u8  q1, d0, d1
+    //Do the SAD for 8 bytes
+    vabdl.u8  q1, d0, d1
 
-	mov lr, #7
+    mov lr, #7
 pixel_sad_8x8_loop0:
 
     //Loading a horizontal line data (8 bytes)
-	vld1.8 {d0}, [r0], r1
-	vld1.8 {d1}, [r2], r3
+    vld1.8 {d0}, [r0], r1
+    vld1.8 {d1}, [r2], r3
 
-	subs lr, #1
+    subs lr, #1
 
-	//Do the SAD for 8 bytes
-	vabal.u8  q1, d0, d1
-	bne pixel_sad_8x8_loop0
+    //Do the SAD for 8 bytes
+    vabal.u8  q1, d0, d1
+    bne pixel_sad_8x8_loop0
 
-	vadd.u16   d2, d3
-	vpaddl.u16 d2, d2
-	vpaddl.u32 d2, d2
-	vmov.u32   r0, d2[0]//TBO...
+    vadd.u16   d2, d3
+    vpaddl.u16 d2, d2
+    vpaddl.u32 d2, d2
+    vmov.u32   r0, d2[0]//TBO...
 
-	ldmia sp!, {lr}
+    ldmia sp!, {lr}
 WELS_ASM_FUNC_END
 
 #endif
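
WelsProcessingSampleSad8x8_neon above is a plain 8x8 sum of absolute differences: vabdl starts the accumulator on the first row, vabal adds the remaining seven, and pairwise adds fold the result to a scalar returned in r0. An equivalent C sketch, assuming byte pixels and independent strides for the two pictures (hypothetical name):

    #include <stdint.h>
    #include <stdlib.h>

    /* SAD over an 8x8 block, as accumulated row by row in the NEON loop. */
    static uint32_t Sad8x8C (const uint8_t* pSample1, int iStride1,
                             const uint8_t* pSample2, int iStride2) {
        uint32_t uiSad = 0;
        for (int y = 0; y < 8; y++) {
            for (int x = 0; x < 8; x++)
                uiSad += (uint32_t) abs (pSample1[x] - pSample2[x]);
            pSample1 += iStride1;
            pSample2 += iStride2;
        }
        return uiSad;
    }
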
--- a/codec/processing/src/arm/vaa_calc_neon.S
+++ b/codec/processing/src/arm/vaa_calc_neon.S
@@ -37,61 +37,61 @@
 #ifdef __APPLE__
 
 .macro ABS_SUB_SUM_16BYTES
-	vld1.32 {q15}, [$0], $2
-	vld1.32 {q14}, [$1], $2
-	vabal.u8 $3, d30, d28
-	vabal.u8 $4, d31, d29
+    vld1.32 {q15}, [$0], $2
+    vld1.32 {q14}, [$1], $2
+    vabal.u8 $3, d30, d28
+    vabal.u8 $4, d31, d29
 .endm
 
 .macro ABS_SUB_SUM_8x16BYTES
-	vld1.32 {q15}, [$0], $2
-	vld1.32 {q14}, [$1], $2
-	vabdl.u8 $3, d30, d28
-	vabdl.u8 $4, d31, d29
+    vld1.32 {q15}, [$0], $2
+    vld1.32 {q14}, [$1], $2
+    vabdl.u8 $3, d30, d28
+    vabdl.u8 $4, d31, d29
 
-	ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
-	ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
-	ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
-	ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
-	ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
-	ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
-	ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
+    ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
+    ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
+    ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
+    ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
+    ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
+    ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
+    ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
 .endm
 
 .macro SAD_8X16BITS
-	vadd.u16 d31, $0, $1
-	vpaddl.u16 d31, d31
-	vpaddl.u32 $2, d31
+    vadd.u16 d31, $0, $1
+    vpaddl.u16 d31, d31
+    vpaddl.u32 $2, d31
 .endm
 
 #else
 
 .macro ABS_SUB_SUM_16BYTES arg0, arg1, arg2, arg3, arg4
-	vld1.32 {q15}, [\arg0], \arg2
-	vld1.32 {q14}, [\arg1], \arg2
-	vabal.u8 \arg3, d30, d28
-	vabal.u8 \arg4, d31, d29
+    vld1.32 {q15}, [\arg0], \arg2
+    vld1.32 {q14}, [\arg1], \arg2
+    vabal.u8 \arg3, d30, d28
+    vabal.u8 \arg4, d31, d29
 .endm
 
 .macro ABS_SUB_SUM_8x16BYTES arg0, arg1, arg2, arg3, arg4
-	vld1.32 {q15}, [\arg0], \arg2
-	vld1.32 {q14}, [\arg1], \arg2
-	vabdl.u8 \arg3, d30, d28
-	vabdl.u8 \arg4, d31, d29
+    vld1.32 {q15}, [\arg0], \arg2
+    vld1.32 {q14}, [\arg1], \arg2
+    vabdl.u8 \arg3, d30, d28
+    vabdl.u8 \arg4, d31, d29
 
-	ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
-	ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
-	ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
-	ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
-	ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
-	ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
-	ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
+    ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
+    ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
+    ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
+    ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
+    ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
+    ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
+    ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
 .endm
 
 .macro SAD_8X16BITS arg0, arg1, arg2
-	vadd.u16 d31, \arg0, \arg1
-	vpaddl.u16 d31, d31
-	vpaddl.u32 \arg2, d31
+    vadd.u16 d31, \arg0, \arg1
+    vpaddl.u16 d31, d31
+    vpaddl.u32 \arg2, d31
 .endm
 #endif
 
@@ -100,16 +100,16 @@
 
     stmdb sp!, {r4-r8}
 
-	ldr r4, [sp, #20] //load pic_stride
-	ldr r5, [sp, #28] //load psad8x8
+    ldr r4, [sp, #20] //load pic_stride
+    ldr r5, [sp, #28] //load psad8x8
 
-	//Initial the Q8 register for save the "psadframe"
-	vmov.s64 q8, #0
+    //Initial the Q8 register for save the "psadframe"
+    vmov.s64 q8, #0
 
-	//Get the jump distance to use on loop codes
-	lsl r8, r4, #4
-	sub r7, r8, #16 //R7 keep the 16*pic_stride-16
-	sub r8, r2      //R8 keep the 16*pic_stride-pic_width
+    //Get the jump distance to use on loop codes
+    lsl r8, r4, #4
+    sub r7, r8, #16 //R7 keep the 16*pic_stride-16
+    sub r8, r2      //R8 keep the 16*pic_stride-pic_width
 
 vaa_calc_sad_loop0:
 
@@ -118,44 +118,44 @@
 
 vaa_calc_sad_loop1:
 
-	//Process the 16x16 bytes
-	ABS_SUB_SUM_8x16BYTES r0, r1, r4, q0, q1
-	ABS_SUB_SUM_8x16BYTES r0, r1, r4, q2, q3
+    //Process the 16x16 bytes
+    ABS_SUB_SUM_8x16BYTES r0, r1, r4, q0, q1
+    ABS_SUB_SUM_8x16BYTES r0, r1, r4, q2, q3
 
-	//Do the SAD
-	SAD_8X16BITS d0, d1, d0
-	SAD_8X16BITS d2, d3, d1
-	SAD_8X16BITS d4, d5, d2
-	SAD_8X16BITS d6, d7, d3
+    //Do the SAD
+    SAD_8X16BITS d0, d1, d0
+    SAD_8X16BITS d2, d3, d1
+    SAD_8X16BITS d4, d5, d2
+    SAD_8X16BITS d6, d7, d3
 
-	//Write to "psad8x8" buffer
-	vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r5]!
+    //Write to "psad8x8" buffer
+    vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r5]!
 
 
-	//Adjust the input address
-	sub r0, r7
-	sub r1, r7
+    //Adjust the input address
+    sub r0, r7
+    sub r1, r7
 
-	subs r6, #16
+    subs r6, #16
 
-	//Save to calculate "psadframe"
-	vadd.u32 q0, q1
-	vadd.u32 q8, q0
+    //Save to calculate "psadframe"
+    vadd.u32 q0, q1
+    vadd.u32 q8, q0
 
-	bne vaa_calc_sad_loop1
+    bne vaa_calc_sad_loop1
 
-	//Adjust the input address
-	add r0, r8
-	add r1, r8
+    //Adjust the input address
+    add r0, r8
+    add r1, r8
 
     subs r3, #16
-	bne vaa_calc_sad_loop0
+    bne vaa_calc_sad_loop0
 
-	ldr r6, [sp, #24] //load psadframe
-	vadd.u32 d16, d17
-	vst1.32 {d16[0]}, [r6]
+    ldr r6, [sp, #24] //load psadframe
+    vadd.u32 d16, d17
+    vst1.32 {d16[0]}, [r6]
 
-	ldmia sp!, {r4-r8}
+    ldmia sp!, {r4-r8}
 
 WELS_ASM_FUNC_END
 
@@ -162,26 +162,26 @@
 
 #ifdef __APPLE__
 .macro SAD_SD_MAD_16BYTES
-	vld1.32 {q0}, [$0], $2
-	vld1.32 {q1}, [$1], $2
+    vld1.32 {q0}, [$0], $2
+    vld1.32 {q1}, [$1], $2
 
-	vpadal.u8 $3, q0
-	vpadal.u8 $4, q1
+    vpadal.u8 $3, q0
+    vpadal.u8 $4, q1
 
-	vabd.u8 q0, q0, q1
-	vmax.u8 $5, q0
-	vpadal.u8 $6, q0
+    vabd.u8 q0, q0, q1
+    vmax.u8 $5, q0
+    vpadal.u8 $6, q0
 .endm
 
 .macro SAD_SD_MAD_8x16BYTES
-	vld1.32 {q0}, [$0], $2
-	vld1.32 {q1}, [$1], $2
+    vld1.32 {q0}, [$0], $2
+    vld1.32 {q1}, [$1], $2
 
-	vpaddl.u8 q2, q0
-	vpaddl.u8 q3, q1
+    vpaddl.u8 q2, q0
+    vpaddl.u8 q3, q1
 
-	vabd.u8 $3, q0, q1
-	vpaddl.u8 $4, $3       //abs_diff
+    vabd.u8 $3, q0, q1
+    vpaddl.u8 $4, $3       //abs_diff
 
 
     SAD_SD_MAD_16BYTES $0,$1,$2,q2,q3,$3,$4
@@ -192,41 +192,41 @@
     SAD_SD_MAD_16BYTES $0,$1,$2,q2,q3,$3,$4
     SAD_SD_MAD_16BYTES $0,$1,$2,q2,q3,$3,$4
 
-	vsub.u16 $5, q2, q3
+    vsub.u16 $5, q2, q3
 .endm
 
 .macro SAD_SD_MAD_CALC
-	vpmax.u8 d0, $0, $1 //8bytes
-	vpmax.u8 d0, d0, d0 //4bytes
-	vpmax.u8 $2, d0, d0 //2bytes
+    vpmax.u8 d0, $0, $1 //8bytes
+    vpmax.u8 d0, d0, d0 //4bytes
+    vpmax.u8 $2, d0, d0 //2bytes
 
-	vpaddl.u16 $3, $3
-	vpaddl.u32 $3, $3
-	vpaddl.s16 $4, $4
-	vpaddl.s32 $4, $4
+    vpaddl.u16 $3, $3
+    vpaddl.u32 $3, $3
+    vpaddl.s16 $4, $4
+    vpaddl.s32 $4, $4
 .endm
 #else
 .macro SAD_SD_MAD_16BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6
-	vld1.32 {q0}, [\arg0], \arg2
-	vld1.32 {q1}, [\arg1], \arg2
+    vld1.32 {q0}, [\arg0], \arg2
+    vld1.32 {q1}, [\arg1], \arg2
 
-	vpadal.u8 \arg3, q0
-	vpadal.u8 \arg4, q1
+    vpadal.u8 \arg3, q0
+    vpadal.u8 \arg4, q1
 
-	vabd.u8 q0, q0, q1
-	vmax.u8 \arg5, q0
-	vpadal.u8 \arg6, q0
+    vabd.u8 q0, q0, q1
+    vmax.u8 \arg5, q0
+    vpadal.u8 \arg6, q0
 .endm
 
 .macro SAD_SD_MAD_8x16BYTES arg0, arg1, arg2, arg3, arg4, arg5
-	vld1.32 {q0}, [\arg0], \arg2
-	vld1.32 {q1}, [\arg1], \arg2
+    vld1.32 {q0}, [\arg0], \arg2
+    vld1.32 {q1}, [\arg1], \arg2
 
-	vpaddl.u8 q2, q0
-	vpaddl.u8 q3, q1
+    vpaddl.u8 q2, q0
+    vpaddl.u8 q3, q1
 
-	vabd.u8 \arg3, q0, q1
-	vpaddl.u8 \arg4, \arg3       //abs_diff
+    vabd.u8 \arg3, q0, q1
+    vpaddl.u8 \arg4, \arg3       //abs_diff
 
 
     SAD_SD_MAD_16BYTES \arg0,\arg1,\arg2,q2,q3,\arg3,\arg4
@@ -237,18 +237,18 @@
     SAD_SD_MAD_16BYTES \arg0,\arg1,\arg2,q2,q3,\arg3,\arg4
     SAD_SD_MAD_16BYTES \arg0,\arg1,\arg2,q2,q3,\arg3,\arg4
 
-	vsub.u16 \arg5, q2, q3
+    vsub.u16 \arg5, q2, q3
 .endm
 
 .macro SAD_SD_MAD_CALC arg0, arg1, arg2, arg3, arg4
-	vpmax.u8 d0, \arg0, \arg1 //8bytes
-	vpmax.u8 d0, d0, d0 //4bytes
-	vpmax.u8 \arg2, d0, d0 //2bytes
+    vpmax.u8 d0, \arg0, \arg1 //8bytes
+    vpmax.u8 d0, d0, d0 //4bytes
+    vpmax.u8 \arg2, d0, d0 //2bytes
 
-	vpaddl.u16 \arg3, \arg3
-	vpaddl.u32 \arg3, \arg3
-	vpaddl.s16 \arg4, \arg4
-	vpaddl.s32 \arg4, \arg4
+    vpaddl.u16 \arg3, \arg3
+    vpaddl.u32 \arg3, \arg3
+    vpaddl.s16 \arg4, \arg4
+    vpaddl.s32 \arg4, \arg4
 .endm
 #endif
 
@@ -256,18 +256,18 @@
 
     stmdb sp!, {r4-r10}
 
-	ldr r4, [sp, #28] //load pic_stride
-	ldr r5, [sp, #36] //load psad8x8
+    ldr r4, [sp, #28] //load pic_stride
+    ldr r5, [sp, #36] //load psad8x8
     ldr r6, [sp, #40] //load psd8x8
     ldr r7, [sp, #44] //load pmad8x8
 
-	//Initial the Q4 register for save the "psadframe"
-	vmov.s64 q15, #0
+    //Initial the Q4 register for save the "psadframe"
+    vmov.s64 q15, #0
 
-	//Get the jump distance to use on loop codes
-	lsl r10, r4, #4
-	sub r9, r10, #16 //R9 keep the 16*pic_stride-16
-	sub r10, r2      //R10 keep the 16*pic_stride-pic_width
+    //Get the jump distance to use on loop codes
+    lsl r10, r4, #4
+    sub r9, r10, #16 //R9 keep the 16*pic_stride-16
+    sub r10, r2      //R10 keep the 16*pic_stride-pic_width
 
 vaa_calc_sad_bgd_loop0:
 
@@ -276,40 +276,40 @@
 
 vaa_calc_sad_bgd_loop1:
 
-	//Process the 16x16 bytes        pmad psad psd
-	SAD_SD_MAD_8x16BYTES r0, r1, r4, q13, q11, q9
-	SAD_SD_MAD_8x16BYTES r0, r1, r4, q14, q12, q10
+    //Process the 16x16 bytes        pmad psad psd
+    SAD_SD_MAD_8x16BYTES r0, r1, r4, q13, q11, q9
+    SAD_SD_MAD_8x16BYTES r0, r1, r4, q14, q12, q10
 
     SAD_SD_MAD_CALC d26, d27, d16, q11, q9
     SAD_SD_MAD_CALC d28, d29, d17, q12, q10
 
-	//Write to "psad8x8" buffer
-	vst4.32 {d22[0],d23[0],d24[0],d25[0]}, [r5]!
-	//Adjust the input address
-	sub r0, r9
-	sub r1, r9
-	//Write to "psd8x8" buffer
-	vst4.32 {d18[0],d19[0],d20[0],d21[0]}, [r6]!
-	subs r8, #16
-	//Write to "pmad8x8" buffer
-	vst2.16 {d16[0],d17[0]}, [r7]!
-	//Save to calculate "psadframe"
-	vadd.u32 q11, q12
-	vadd.u32 q15, q11
+    //Write to "psad8x8" buffer
+    vst4.32 {d22[0],d23[0],d24[0],d25[0]}, [r5]!
+    //Adjust the input address
+    sub r0, r9
+    sub r1, r9
+    //Write to "psd8x8" buffer
+    vst4.32 {d18[0],d19[0],d20[0],d21[0]}, [r6]!
+    subs r8, #16
+    //Write to "pmad8x8" buffer
+    vst2.16 {d16[0],d17[0]}, [r7]!
+    //Save to calculate "psadframe"
+    vadd.u32 q11, q12
+    vadd.u32 q15, q11
 
-	bne vaa_calc_sad_bgd_loop1
+    bne vaa_calc_sad_bgd_loop1
 
-	//Adjust the input address
-	add r0, r10
-	add r1, r10
+    //Adjust the input address
+    add r0, r10
+    add r1, r10
 
     subs r3, #16
-	bne vaa_calc_sad_bgd_loop0
+    bne vaa_calc_sad_bgd_loop0
 
-	ldr r8, [sp, #32] //load psadframe
-	vadd.u32 d30, d31
-	vst1.32 {d30[0]}, [r8]
-	ldmia sp!, {r4-r10}
+    ldr r8, [sp, #32] //load psadframe
+    vadd.u32 d30, d31
+    vst1.32 {d30[0]}, [r8]
+    ldmia sp!, {r4-r10}
 
 WELS_ASM_FUNC_END
 
@@ -316,344 +316,344 @@
 
 #ifdef __APPLE__
 .macro SSD_MUL_SUM_16BYTES_RESET
-	vmull.u8 $3, $0, $0
-	vpaddl.u16 $2, $3
+    vmull.u8 $3, $0, $0
+    vpaddl.u16 $2, $3
 
-	vmull.u8 $3, $1, $1
-	vpadal.u16 $2, $3
+    vmull.u8 $3, $1, $1
+    vpadal.u16 $2, $3
 .endm
 
 .macro SSD_MUL_SUM_16BYTES
-	vmull.u8 $3, $0, $0
-	vpadal.u16 $2, $3
+    vmull.u8 $3, $0, $0
+    vpadal.u16 $2, $3
 
-	vmull.u8 $3, $1, $1
-	vpadal.u16 $2, $3
+    vmull.u8 $3, $1, $1
+    vpadal.u16 $2, $3
 .endm
 
 .macro SAD_SSD_BGD_16
-	vld1.8 {q0}, [$0], $2 //load cur_row
+    vld1.8 {q0}, [$0], $2 //load cur_row
 
-	vpadal.u8 q3, q0	//add cur_row together
-	vpadal.u8 q4, q1	//add ref_row together
+    vpadal.u8 q3, q0    //add cur_row together
+    vpadal.u8 q4, q1    //add ref_row together
 
-	vabd.u8 q2, q0, q1	//abs_diff
+    vabd.u8 q2, q0, q1  //abs_diff
 
-	vmax.u8 q5, q2								//l_mad for 16 bytes reset for every 8x16
+    vmax.u8 q5, q2                              //l_mad for 16 bytes reset for every 8x16
 
-	vpadal.u8 $3, q2							//l_sad for 16 bytes reset for every 8x16
+    vpadal.u8 $3, q2                            //l_sad for 16 bytes reset for every 8x16
 
-	SSD_MUL_SUM_16BYTES d4,d5, q8, q11			//q8 for l_sqiff	reset for every 16x16
+    SSD_MUL_SUM_16BYTES d4,d5, q8, q11          //q8 for l_sqiff    reset for every 16x16
 
-	vld1.8 {q1}, [$1], $2 //load ref_row
-	vpadal.u8 q9, q0							//q9 for l_sum		reset for every 16x16
+    vld1.8 {q1}, [$1], $2 //load ref_row
+    vpadal.u8 q9, q0                            //q9 for l_sum      reset for every 16x16
 
-	SSD_MUL_SUM_16BYTES d0,d1, q10, q11			//q10 for lsqsum	reset for every 16x16
+    SSD_MUL_SUM_16BYTES d0,d1, q10, q11         //q10 for lsqsum    reset for every 16x16
 .endm
 
 //the last row of a 16x16 block
 .macro SAD_SSD_BGD_16_end
-	vld1.8 {q0}, [$0], $1 //load cur_row
+    vld1.8 {q0}, [$0], $1 //load cur_row
 
-	vpadal.u8 q3, q0	//add cur_row together
-	vpadal.u8 q4, q1	//add ref_row together
+    vpadal.u8 q3, q0    //add cur_row together
+    vpadal.u8 q4, q1    //add ref_row together
 
-	vabd.u8 q2, q0, q1	//abs_diff
+    vabd.u8 q2, q0, q1  //abs_diff
 
-	vmax.u8 q5, q2								//l_mad for 16 bytes reset for every 8x16
+    vmax.u8 q5, q2                              //l_mad for 16 bytes reset for every 8x16
 
-	vpadal.u8 $2, q2							//l_sad for 16 bytes reset for every 8x16
+    vpadal.u8 $2, q2                            //l_sad for 16 bytes reset for every 8x16
 
-	SSD_MUL_SUM_16BYTES d4,d5, q8, q11			//q8 for l_sqiff	reset for every 16x16
+    SSD_MUL_SUM_16BYTES d4,d5, q8, q11          //q8 for l_sqiff    reset for every 16x16
 
-	vpadal.u8 q9, q0							//q9 for l_sum		reset for every 16x16
+    vpadal.u8 q9, q0                            //q9 for l_sum      reset for every 16x16
 
-	SSD_MUL_SUM_16BYTES d0,d1, q10, q11			//q10 for lsqsum	reset for every 16x16
+    SSD_MUL_SUM_16BYTES d0,d1, q10, q11         //q10 for lsqsum    reset for every 16x16
 .endm
 
 //for the begin of a 8x16 block, use some instructions to reset the register
 .macro SAD_SSD_BGD_16_RESET_8x8
-	vld1.8 {q0}, [$0], $2 //load cur_row
+    vld1.8 {q0}, [$0], $2 //load cur_row
 
-	vpaddl.u8 q3, q0	//add cur_row together
-	vpaddl.u8 q4, q1	//add ref_row together
+    vpaddl.u8 q3, q0    //add cur_row together
+    vpaddl.u8 q4, q1    //add ref_row together
 
-	vabd.u8 q2, q0, q1	//abs_diff
+    vabd.u8 q2, q0, q1  //abs_diff
 
-	vmov q5,q2         //calculate max and avoid reset to zero, l_mad for 16 bytes reset for every 8x16
+    vmov q5,q2         //calculate max and avoid reset to zero, l_mad for 16 bytes reset for every 8x16
 
-	vpaddl.u8 $3, q2							//l_sad for 16 bytes reset for every 8x16
+    vpaddl.u8 $3, q2                            //l_sad for 16 bytes reset for every 8x16
 
 
-	SSD_MUL_SUM_16BYTES d4,d5, q8, q11			//q8 for l_sqiff	reset for every 16x16
+    SSD_MUL_SUM_16BYTES d4,d5, q8, q11          //q8 for l_sqiff    reset for every 16x16
 
-	vld1.8 {q1}, [$1], $2 //load ref_row
+    vld1.8 {q1}, [$1], $2 //load ref_row
 
-	vpadal.u8 q9, q0							//q9 for l_sum		reset for every 16x16
+    vpadal.u8 q9, q0                            //q9 for l_sum      reset for every 16x16
 
-	SSD_MUL_SUM_16BYTES d0,d1, q10, q11			//q10 for lsqsum	reset for every 16x16
+    SSD_MUL_SUM_16BYTES d0,d1, q10, q11         //q10 for lsqsum    reset for every 16x16
 .endm
 
 //for the begin of a 16x16 block, use some instructions to reset the register
 .macro SAD_SSD_BGD_16_RESET_16x16
-	vld1.8 {q0}, [$0], $2 //load cur_row
-	vld1.8 {q1}, [$1], $2 //load ref_row
+    vld1.8 {q0}, [$0], $2 //load cur_row
+    vld1.8 {q1}, [$1], $2 //load ref_row
 
-	vpaddl.u8 q3, q0	//add cur_row together
-	vpaddl.u8 q4, q1	//add ref_row together
+    vpaddl.u8 q3, q0    //add cur_row together
+    vpaddl.u8 q4, q1    //add ref_row together
 
-	vabd.u8 q2, q0, q1	//abs_diff
+    vabd.u8 q2, q0, q1  //abs_diff
 
-	vmov q5,q2         //calculate max and avoid reset to zero, l_mad for 16 bytes reset for every 8x16
+    vmov q5,q2         //calculate max and avoid reset to zero, l_mad for 16 bytes reset for every 8x16
 
-	vpaddl.u8 $3, q2							//l_sad for 16 bytes reset for every 8x16
+    vpaddl.u8 $3, q2                            //l_sad for 16 bytes reset for every 8x16
 
-	SSD_MUL_SUM_16BYTES_RESET d4,d5,q8, q11			//q8 for l_sqiff	reset for every 16x16
+    SSD_MUL_SUM_16BYTES_RESET d4,d5,q8, q11         //q8 for l_sqiff    reset for every 16x16
 
-	vld1.8 {q1}, [$1], $2 //load ref_row
+    vld1.8 {q1}, [$1], $2 //load ref_row
 
-	vpaddl.u8 q9, q0								//q9 for l_sum		reset for every 16x16
+    vpaddl.u8 q9, q0                                //q9 for l_sum      reset for every 16x16
 
-	SSD_MUL_SUM_16BYTES_RESET d0,d1,q10,q11			//q10 for lsqsum	reset for every 16x16
+    SSD_MUL_SUM_16BYTES_RESET d0,d1,q10,q11         //q10 for lsqsum    reset for every 16x16
 .endm
 
 //for each 8x16 block
 .macro SAD_SSD_BGD_CALC_8x16
 
-	vpmax.u8 d10, d10, d11 //4 numbers
-	vpmax.u8 d10, d10, d10 //2 numbers
-	vpmax.u8 d10, d10, d10 //1 number1
+    vpmax.u8 d10, d10, d11 //4 numbers
+    vpmax.u8 d10, d10, d10 //2 numbers
+    vpmax.u8 d10, d10, d10 //1 number1
 
-	vmov $0, d10			//d26 d27 keeps the l_mad
+    vmov $0, d10            //d26 d27 keeps the l_mad
 
-	//p_sd8x8			fix me
-	vpaddl.u16 q3, q3
-	vpaddl.u16 q4, q4
+    //p_sd8x8           fix me
+    vpaddl.u16 q3, q3
+    vpaddl.u16 q4, q4
 
-	vsub.i32 $1, q3, q4
-	vpaddl.u32 $1, $1
+    vsub.i32 $1, q3, q4
+    vpaddl.u32 $1, $1
 
-	//psad8x8
-	vpaddl.u16 $2, $2
-	vpaddl.u32 $2, $2
+    //psad8x8
+    vpaddl.u16 $2, $2
+    vpaddl.u32 $2, $2
 
-	//psadframe
-	vadd.i32 q12, $2
+    //psadframe
+    vadd.i32 q12, $2
 .endm
 
 .macro SAD_SSD_BGD_16x16
-	//for one 8x16
-	SAD_SSD_BGD_16_RESET_16x16 $0, $1, $2, q6
-	SAD_SSD_BGD_16 $0, $1, $2, q6
-	SAD_SSD_BGD_16 $0, $1, $2, q6
-	SAD_SSD_BGD_16 $0, $1, $2, q6
-	SAD_SSD_BGD_16 $0, $1, $2, q6
-	SAD_SSD_BGD_16 $0, $1, $2, q6
-	SAD_SSD_BGD_16 $0, $1, $2, q6
-	SAD_SSD_BGD_16 $0, $1, $2, q6
+    //for one 8x16
+    SAD_SSD_BGD_16_RESET_16x16 $0, $1, $2, q6
+    SAD_SSD_BGD_16 $0, $1, $2, q6
+    SAD_SSD_BGD_16 $0, $1, $2, q6
+    SAD_SSD_BGD_16 $0, $1, $2, q6
+    SAD_SSD_BGD_16 $0, $1, $2, q6
+    SAD_SSD_BGD_16 $0, $1, $2, q6
+    SAD_SSD_BGD_16 $0, $1, $2, q6
+    SAD_SSD_BGD_16 $0, $1, $2, q6
 
-	SAD_SSD_BGD_CALC_8x16 d26, q14, q6
+    SAD_SSD_BGD_CALC_8x16 d26, q14, q6
 
-	//for another 8x16
-	SAD_SSD_BGD_16_RESET_8x8 $0, $1, $2, q7
-	SAD_SSD_BGD_16 $0, $1, $2, q7
-	SAD_SSD_BGD_16 $0, $1, $2, q7
-	SAD_SSD_BGD_16 $0, $1, $2, q7
-	SAD_SSD_BGD_16 $0, $1, $2, q7
-	SAD_SSD_BGD_16 $0, $1, $2, q7
-	SAD_SSD_BGD_16 $0, $1, $2, q7
-	SAD_SSD_BGD_16_end $0, $2, q7
+    //for another 8x16
+    SAD_SSD_BGD_16_RESET_8x8 $0, $1, $2, q7
+    SAD_SSD_BGD_16 $0, $1, $2, q7
+    SAD_SSD_BGD_16 $0, $1, $2, q7
+    SAD_SSD_BGD_16 $0, $1, $2, q7
+    SAD_SSD_BGD_16 $0, $1, $2, q7
+    SAD_SSD_BGD_16 $0, $1, $2, q7
+    SAD_SSD_BGD_16 $0, $1, $2, q7
+    SAD_SSD_BGD_16_end $0, $2, q7
 
-	SAD_SSD_BGD_CALC_8x16 d27, q15, q7
+    SAD_SSD_BGD_CALC_8x16 d27, q15, q7
 .endm
 
 .macro SSD_SAD_SD_MAD_PADDL
-	vpaddl.s16 $0, $0
-	vpaddl.s32 $0, $0
-	vadd.i32 $1, $1, $2
+    vpaddl.s16 $0, $0
+    vpaddl.s32 $0, $0
+    vadd.i32 $1, $1, $2
 .endm
 #else
 .macro SSD_MUL_SUM_16BYTES_RESET arg0, arg1, arg2, arg3
-	vmull.u8   \arg3, \arg0, \arg0
-	vpaddl.u16 \arg2, \arg3
+    vmull.u8   \arg3, \arg0, \arg0
+    vpaddl.u16 \arg2, \arg3
 
-	vmull.u8   \arg3, \arg1, \arg1
-	vpadal.u16 \arg2, \arg3
+    vmull.u8   \arg3, \arg1, \arg1
+    vpadal.u16 \arg2, \arg3
 .endm
 
 .macro SSD_MUL_SUM_16BYTES arg0, arg1, arg2, arg3
-	vmull.u8   \arg3, \arg0, \arg0
-	vpadal.u16 \arg2, \arg3
+    vmull.u8   \arg3, \arg0, \arg0
+    vpadal.u16 \arg2, \arg3
 
-	vmull.u8   \arg3, \arg1, \arg1
-	vpadal.u16 \arg2, \arg3
+    vmull.u8   \arg3, \arg1, \arg1
+    vpadal.u16 \arg2, \arg3
 .endm
 
 .macro SAD_SSD_BGD_16 arg0, arg1, arg2, arg3
-	vld1.8 {q0}, [\arg0], \arg2 //load cur_row
+    vld1.8 {q0}, [\arg0], \arg2 //load cur_row
 
-	vpadal.u8 q3, q0	//add cur_row together
-	vpadal.u8 q4, q1	//add ref_row together
+    vpadal.u8 q3, q0    //add cur_row together
+    vpadal.u8 q4, q1    //add ref_row together
 
-	vabd.u8 q2, q0, q1	//abs_diff
+    vabd.u8 q2, q0, q1  //abs_diff
 
-	vmax.u8 q5, q2								//l_mad for 16 bytes reset for every 8x16
+    vmax.u8 q5, q2                              //l_mad for 16 bytes reset for every 8x16
 
-	vpadal.u8 \arg3, q2							//l_sad for 16 bytes reset for every 8x16
+    vpadal.u8 \arg3, q2                         //l_sad for 16 bytes reset for every 8x16
 
-	SSD_MUL_SUM_16BYTES d4,d5, q8, q11			//q8 for l_sqiff	reset for every 16x16
+    SSD_MUL_SUM_16BYTES d4,d5, q8, q11          //q8 for l_sqiff    reset for every 16x16
 
-	vld1.8 {q1}, [\arg1], \arg2 //load ref_row
-	vpadal.u8 q9, q0							//q9 for l_sum		reset for every 16x16
+    vld1.8 {q1}, [\arg1], \arg2 //load ref_row
+    vpadal.u8 q9, q0                            //q9 for l_sum      reset for every 16x16
 
-	SSD_MUL_SUM_16BYTES d0,d1, q10, q11			//q10 for lsqsum	reset for every 16x16
+    SSD_MUL_SUM_16BYTES d0,d1, q10, q11         //q10 for lsqsum    reset for every 16x16
 .endm
 
 //the last row of a 16x16 block
 .macro SAD_SSD_BGD_16_end arg0, arg1, arg2
-	vld1.8 {q0}, [\arg0], \arg1 //load cur_row
+    vld1.8 {q0}, [\arg0], \arg1 //load cur_row
 
-	vpadal.u8 q3, q0	//add cur_row together
-	vpadal.u8 q4, q1	//add ref_row together
+    vpadal.u8 q3, q0    //add cur_row together
+    vpadal.u8 q4, q1    //add ref_row together
 
-	vabd.u8 q2, q0, q1	//abs_diff
+    vabd.u8 q2, q0, q1  //abs_diff
 
-	vmax.u8 q5, q2								//l_mad for 16 bytes reset for every 8x16
+    vmax.u8 q5, q2                              //l_mad for 16 bytes reset for every 8x16
 
-	vpadal.u8 \arg2, q2							//l_sad for 16 bytes reset for every 8x16
+    vpadal.u8 \arg2, q2                         //l_sad for 16 bytes reset for every 8x16
 
-	SSD_MUL_SUM_16BYTES d4,d5, q8, q11			//q8 for l_sqiff	reset for every 16x16
+    SSD_MUL_SUM_16BYTES d4,d5, q8, q11          //q8 for l_sqiff    reset for every 16x16
 
-	vpadal.u8 q9, q0							//q9 for l_sum		reset for every 16x16
+    vpadal.u8 q9, q0                            //q9 for l_sum      reset for every 16x16
 
-	SSD_MUL_SUM_16BYTES d0,d1, q10, q11			//q10 for lsqsum	reset for every 16x16
+    SSD_MUL_SUM_16BYTES d0,d1, q10, q11         //q10 for lsqsum    reset for every 16x16
 .endm
 
 //for the begin of a 8x16 block, use some instructions to reset the register
 .macro SAD_SSD_BGD_16_RESET_8x8 arg0, arg1, arg2, arg3
-	vld1.8 {q0}, [\arg0], \arg2 //load cur_row
+    vld1.8 {q0}, [\arg0], \arg2 //load cur_row
 
-	vpaddl.u8 q3, q0	//add cur_row together
-	vpaddl.u8 q4, q1	//add ref_row together
+    vpaddl.u8 q3, q0    //add cur_row together
+    vpaddl.u8 q4, q1    //add ref_row together
 
-	vabd.u8 q2, q0, q1	//abs_diff
+    vabd.u8 q2, q0, q1  //abs_diff
 
-	vmov q5,q2         //calculate max and avoid reset to zero, l_mad for 16 bytes reset for every 8x16
+    vmov q5,q2         //calculate max and avoid reset to zero, l_mad for 16 bytes reset for every 8x16
 
-	vpaddl.u8 \arg3, q2							//l_sad for 16 bytes reset for every 8x16
+    vpaddl.u8 \arg3, q2                         //l_sad for 16 bytes reset for every 8x16
 
 
-	SSD_MUL_SUM_16BYTES d4,d5, q8, q11			//q8 for l_sqiff	reset for every 16x16
+    SSD_MUL_SUM_16BYTES d4,d5, q8, q11          //q8 for l_sqiff    reset for every 16x16
 
-	vld1.8 {q1}, [\arg1], \arg2 //load ref_row
+    vld1.8 {q1}, [\arg1], \arg2 //load ref_row
 
-	vpadal.u8 q9, q0							//q9 for l_sum		reset for every 16x16
+    vpadal.u8 q9, q0                            //q9 for l_sum      reset for every 16x16
 
-	SSD_MUL_SUM_16BYTES d0,d1, q10, q11			//q10 for lsqsum	reset for every 16x16
+    SSD_MUL_SUM_16BYTES d0,d1, q10, q11         //q10 for lsqsum    reset for every 16x16
 .endm
 
 //for the begin of a 16x16 block, use some instructions to reset the register
 .macro SAD_SSD_BGD_16_RESET_16x16 arg0, arg1, arg2, arg3
-	vld1.8 {q0}, [\arg0], \arg2 //load cur_row
-	vld1.8 {q1}, [\arg1], \arg2 //load ref_row
+    vld1.8 {q0}, [\arg0], \arg2 //load cur_row
+    vld1.8 {q1}, [\arg1], \arg2 //load ref_row
 
-	vpaddl.u8 q3, q0	//add cur_row together
-	vpaddl.u8 q4, q1	//add ref_row together
+    vpaddl.u8 q3, q0    //add cur_row together
+    vpaddl.u8 q4, q1    //add ref_row together
 
-	vabd.u8 q2, q0, q1	//abs_diff
+    vabd.u8 q2, q0, q1  //abs_diff
 
-	vmov q5,q2         //calculate max and avoid reset to zero, l_mad for 16 bytes reset for every 8x16
+    vmov q5,q2         //calculate max and avoid reset to zero, l_mad for 16 bytes reset for every 8x16
 
-	vpaddl.u8 \arg3, q2							//l_sad for 16 bytes reset for every 8x16
+    vpaddl.u8 \arg3, q2                         //l_sad for 16 bytes reset for every 8x16
 
-	SSD_MUL_SUM_16BYTES_RESET d4,d5,q8, q11			//q8 for l_sqiff	reset for every 16x16
+    SSD_MUL_SUM_16BYTES_RESET d4,d5,q8, q11         //q8 for l_sqiff    reset for every 16x16
 
-	vld1.8 {q1}, [\arg1], \arg2 //load ref_row
+    vld1.8 {q1}, [\arg1], \arg2 //load ref_row
 
-	vpaddl.u8 q9, q0								//q9 for l_sum		reset for every 16x16
+    vpaddl.u8 q9, q0                                //q9 for l_sum      reset for every 16x16
 
-	SSD_MUL_SUM_16BYTES_RESET d0,d1,q10,q11			//q10 for lsqsum	reset for every 16x16
+    SSD_MUL_SUM_16BYTES_RESET d0,d1,q10,q11         //q10 for lsqsum    reset for every 16x16
 .endm
 
 //for each 8x16 block
 .macro SAD_SSD_BGD_CALC_8x16 arg0, arg1, arg2
 
-	vpmax.u8 d10, d10, d11 //4 numbers
-	vpmax.u8 d10, d10, d10 //2 numbers
-	vpmax.u8 d10, d10, d10 //1 number1
+    vpmax.u8 d10, d10, d11 //4 numbers
+    vpmax.u8 d10, d10, d10 //2 numbers
+    vpmax.u8 d10, d10, d10 //1 number1
 
-	vmov \arg0, d10			//d26 d27 keeps the l_mad
+    vmov \arg0, d10         //d26 d27 keeps the l_mad
 
-	//p_sd8x8
-	vpaddl.u16 q3, q3
-	vpaddl.u16 q4, q4
+    //p_sd8x8
+    vpaddl.u16 q3, q3
+    vpaddl.u16 q4, q4
 
-	vsub.i32 \arg1, q3, q4
-	vpaddl.u32 \arg1, \arg1
+    vsub.i32 \arg1, q3, q4
+    vpaddl.u32 \arg1, \arg1
 
-	//psad8x8
-	vpaddl.u16 \arg2, \arg2
-	vpaddl.u32 \arg2, \arg2
+    //psad8x8
+    vpaddl.u16 \arg2, \arg2
+    vpaddl.u32 \arg2, \arg2
 
-	//psadframe
-	vadd.i32 q12, \arg2
+    //psadframe
+    vadd.i32 q12, \arg2
 .endm
 
 .macro SAD_SSD_BGD_16x16 arg0, arg1, arg2
-	//for one 8x16
-	SAD_SSD_BGD_16_RESET_16x16 \arg0, \arg1, \arg2, q6
-	SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
-	SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
-	SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
-	SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
-	SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
-	SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
-	SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
+    //for one 8x16
+    SAD_SSD_BGD_16_RESET_16x16 \arg0, \arg1, \arg2, q6
+    SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
+    SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
+    SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
+    SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
+    SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
+    SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
+    SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
 
-	SAD_SSD_BGD_CALC_8x16 d26, q14, q6
+    SAD_SSD_BGD_CALC_8x16 d26, q14, q6
 
-	//for another 8x16
-	SAD_SSD_BGD_16_RESET_8x8 \arg0, \arg1, \arg2, q7
-	SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
-	SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
-	SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
-	SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
-	SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
-	SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
-	SAD_SSD_BGD_16_end \arg0, \arg2, q7
+    //for another 8x16
+    SAD_SSD_BGD_16_RESET_8x8 \arg0, \arg1, \arg2, q7
+    SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
+    SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
+    SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
+    SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
+    SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
+    SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
+    SAD_SSD_BGD_16_end \arg0, \arg2, q7
 
-	SAD_SSD_BGD_CALC_8x16 d27, q15, q7
+    SAD_SSD_BGD_CALC_8x16 d27, q15, q7
 .endm
 
 .macro SSD_SAD_SD_MAD_PADDL arg0, arg1, arg2
-	vpaddl.s16 \arg0, \arg0
-	vpaddl.s32 \arg0, \arg0
-	vadd.i32 \arg1, \arg1, \arg2
+    vpaddl.s16 \arg0, \arg0
+    vpaddl.s32 \arg0, \arg0
+    vadd.i32 \arg1, \arg1, \arg2
 .endm
 #endif
 
 
 WELS_ASM_FUNC_BEGIN VAACalcSadSsdBgd_neon
-	stmdb sp!, {r0-r12, r14}
-	vpush {q4-q7}
+    stmdb sp!, {r0-r12, r14}
+    vpush {q4-q7}
 
-	ldr r4, [sp, #120] //r4 keeps the pic_stride
+    ldr r4, [sp, #120] //r4 keeps the pic_stride
 
-	sub r5, r4, #1
-	lsl r5, r5, #4 //r5 keeps the little step
+    sub r5, r4, #1
+    lsl r5, r5, #4 //r5 keeps the little step
 
-	lsl r6, r4, #4
-	sub r6, r2, r6	//r6 keeps the big step
+    lsl r6, r4, #4
+    sub r6, r2, r6  //r6 keeps the big step
 
 
-	ldr r8, [sp, #128]//psad8x8
-	ldr r9, [sp, #132]//psum16x16
-	ldr r10, [sp, #136]//psqsum16x16
-	ldr r11, [sp, #140]//psqdiff16x16
-	ldr r12, [sp, #144]//p_sd8x8
-	ldr r14, [sp, #148]//p_mad8x8
+    ldr r8, [sp, #128]//psad8x8
+    ldr r9, [sp, #132]//psum16x16
+    ldr r10, [sp, #136]//psqsum16x16
+    ldr r11, [sp, #140]//psqdiff16x16
+    ldr r12, [sp, #144]//p_sd8x8
+    ldr r14, [sp, #148]//p_mad8x8
 
-	vmov.i8 q12, #0
+    vmov.i8 q12, #0
 
 vaa_calc_sad_ssd_bgd_height_loop:
 
@@ -660,7 +660,7 @@
     mov r7, r2
 vaa_calc_sad_ssd_bgd_width_loop:
 
-    //l_sd q14&q15, l_mad q13, l_sad q6 & q7, l_sqdiff	q8, l_sum q9, l_sqsum q10
+    //l_sd q14&q15, l_mad q13, l_sad q6 & q7, l_sqdiff  q8, l_sum q9, l_sqsum q10
     SAD_SSD_BGD_16x16 r0,r1,r4
 
     //psad8x8
@@ -694,20 +694,20 @@
 
     bne vaa_calc_sad_ssd_bgd_width_loop
 
-    sub r0, r0, r6		//jump to next 16 x width
-    sub r1, r1, r6		//jump to next 16 x width
+    sub r0, r0, r6      //jump to next 16 x width
+    sub r1, r1, r6      //jump to next 16 x width
 
     subs r3, #16
 bne vaa_calc_sad_ssd_bgd_height_loop
 
-	//psadframe
-	ldr r7, [sp, #124]//psadframe
+    //psadframe
+    ldr r7, [sp, #124]//psadframe
 
-	vadd.i32 d24, d24, d25
-	vst1.32 {d24[0]}, [r7]
+    vadd.i32 d24, d24, d25
+    vst1.32 {d24[0]}, [r7]
 
-	vpop {q4-q7}
-	ldmia sp!, {r0-r12, r14}
+    vpop {q4-q7}
+    ldmia sp!, {r0-r12, r14}
 
 WELS_ASM_FUNC_END
 
@@ -714,223 +714,223 @@
 
 #ifdef __APPLE__
 .macro SAD_VAR_16
-	vld1.8 {q0}, [$0], $2 //load cur_row
+    vld1.8 {q0}, [$0], $2 //load cur_row
 
-	vpadal.u8 q3, q0	//add cur_row together
-	vpadal.u8 q4, q1	//add ref_row together
+    vpadal.u8 q3, q0    //add cur_row together
+    vpadal.u8 q4, q1    //add ref_row together
 
-	vabd.u8 q2, q0, q1	//abs_diff
+    vabd.u8 q2, q0, q1  //abs_diff
 
-	vpadal.u8 $3, q2							//l_sad for 16 bytes reset for every 8x16
+    vpadal.u8 $3, q2                            //l_sad for 16 bytes reset for every 8x16
 
-	vld1.8 {q1}, [$1], $2
+    vld1.8 {q1}, [$1], $2
 
-	vpadal.u8 q9, q0							//q9 for l_sum		reset for every 16x16
+    vpadal.u8 q9, q0                            //q9 for l_sum      reset for every 16x16
 
-	SSD_MUL_SUM_16BYTES d0,d1, q10, q11			//q10 for lsqsum	reset for every 16x16
+    SSD_MUL_SUM_16BYTES d0,d1, q10, q11         //q10 for lsqsum    reset for every 16x16
 .endm
 
 .macro SAD_VAR_16_END
-	vld1.8 {q0}, [$0], $1 //load cur_row
+    vld1.8 {q0}, [$0], $1 //load cur_row
 
-	vpadal.u8 q3, q0	//add cur_row together
-	vpadal.u8 q4, q1	//add ref_row together
+    vpadal.u8 q3, q0    //add cur_row together
+    vpadal.u8 q4, q1    //add ref_row together
 
-	vabd.u8 q2, q0, q1	//abs_diff
+    vabd.u8 q2, q0, q1  //abs_diff
 
-	vpadal.u8 $2, q2							//l_sad for 16 bytes reset for every 8x16
+    vpadal.u8 $2, q2                            //l_sad for 16 bytes reset for every 8x16
 
-	vpadal.u8 q9, q0							//q9 for l_sum		reset for every 16x16
+    vpadal.u8 q9, q0                            //q9 for l_sum      reset for every 16x16
 
-	SSD_MUL_SUM_16BYTES d0,d1, q10, q11			//q10 for lsqsum	reset for every 16x16
+    SSD_MUL_SUM_16BYTES d0,d1, q10, q11         //q10 for lsqsum    reset for every 16x16
 .endm
 
 .macro SAD_VAR_16_RESET_16x16
-	vld1.8 {q0}, [$0], $2 //load cur_row
-	vld1.8 {q1}, [$1], $2
+    vld1.8 {q0}, [$0], $2 //load cur_row
+    vld1.8 {q1}, [$1], $2
 
-	vpaddl.u8 q3, q0	//add cur_row together
-	vpaddl.u8 q4, q1	//add ref_row together
+    vpaddl.u8 q3, q0    //add cur_row together
+    vpaddl.u8 q4, q1    //add ref_row together
 
-	vabd.u8 q2, q0, q1	//abs_diff
+    vabd.u8 q2, q0, q1  //abs_diff
 
-	vpaddl.u8 $3, q2							//l_sad for 16 bytes reset for every 8x16
+    vpaddl.u8 $3, q2                            //l_sad for 16 bytes reset for every 8x16
 
-	vld1.8 {q1}, [$1], $2
+    vld1.8 {q1}, [$1], $2
 
-	vpaddl.u8 q9, q0							//q9 for l_sum		reset for every 16x16
+    vpaddl.u8 q9, q0                            //q9 for l_sum      reset for every 16x16
 
-	SSD_MUL_SUM_16BYTES_RESET d0,d1, q10, q11
+    SSD_MUL_SUM_16BYTES_RESET d0,d1, q10, q11
 .endm
 
 .macro SAD_VAR_16_RESET_8x8
-	vld1.8 {q0}, [$0], $2 //load cur_row
+    vld1.8 {q0}, [$0], $2 //load cur_row
 
-	vpaddl.u8 q3, q0	//add cur_row together
-	vpaddl.u8 q4, q1	//add ref_row together
+    vpaddl.u8 q3, q0    //add cur_row together
+    vpaddl.u8 q4, q1    //add ref_row together
 
-	vabd.u8 q2, q0, q1	//abs_diff
+    vabd.u8 q2, q0, q1  //abs_diff
 
-	vpaddl.u8 $3, q2							//l_sad for 16 bytes reset for every 8x16
+    vpaddl.u8 $3, q2                            //l_sad for 16 bytes reset for every 8x16
 
-	vld1.8 {q1}, [$1], $2
+    vld1.8 {q1}, [$1], $2
 
-	vpadal.u8 q9, q0							//q9 for l_sum		reset for every 16x16
+    vpadal.u8 q9, q0                            //q9 for l_sum      reset for every 16x16
 
-	SSD_MUL_SUM_16BYTES d0,d1, q10, q11			//q10 for lsqsum	reset for every 16x16
+    SSD_MUL_SUM_16BYTES d0,d1, q10, q11         //q10 for lsqsum    reset for every 16x16
 .endm
 
 .macro SAD_VAR_16x16
-	//for one 8x16
-	SAD_VAR_16_RESET_16x16 $0, $1, $2, q6
-	SAD_VAR_16 $0, $1, $2, q6
-	SAD_VAR_16 $0, $1, $2, q6
-	SAD_VAR_16 $0, $1, $2, q6
-	SAD_VAR_16 $0, $1, $2, q6
-	SAD_VAR_16 $0, $1, $2, q6
-	SAD_VAR_16 $0, $1, $2, q6
-	SAD_VAR_16 $0, $1, $2, q6
+    //for one 8x16
+    SAD_VAR_16_RESET_16x16 $0, $1, $2, q6
+    SAD_VAR_16 $0, $1, $2, q6
+    SAD_VAR_16 $0, $1, $2, q6
+    SAD_VAR_16 $0, $1, $2, q6
+    SAD_VAR_16 $0, $1, $2, q6
+    SAD_VAR_16 $0, $1, $2, q6
+    SAD_VAR_16 $0, $1, $2, q6
+    SAD_VAR_16 $0, $1, $2, q6
 
-	vpaddl.u16 q6, q6
-	vpaddl.u32 q6, q6
-	vadd.i32 q12, q6
+    vpaddl.u16 q6, q6
+    vpaddl.u32 q6, q6
+    vadd.i32 q12, q6
 
-	//for another 8x16
-	SAD_VAR_16_RESET_8x8 $0, $1, $2, q7
-	SAD_VAR_16 $0, $1, $2, q7
-	SAD_VAR_16 $0, $1, $2, q7
-	SAD_VAR_16 $0, $1, $2, q7
-	SAD_VAR_16 $0, $1, $2, q7
-	SAD_VAR_16 $0, $1, $2, q7
-	SAD_VAR_16 $0, $1, $2, q7
-	SAD_VAR_16_END $0, $2, q7
+    //for another 8x16
+    SAD_VAR_16_RESET_8x8 $0, $1, $2, q7
+    SAD_VAR_16 $0, $1, $2, q7
+    SAD_VAR_16 $0, $1, $2, q7
+    SAD_VAR_16 $0, $1, $2, q7
+    SAD_VAR_16 $0, $1, $2, q7
+    SAD_VAR_16 $0, $1, $2, q7
+    SAD_VAR_16 $0, $1, $2, q7
+    SAD_VAR_16_END $0, $2, q7
 
-	vpaddl.u16 q7, q7
-	vpaddl.u32 q7, q7
+    vpaddl.u16 q7, q7
+    vpaddl.u32 q7, q7
 
-	vadd.i32 q12, q7
+    vadd.i32 q12, q7
 .endm
 #else
 .macro SAD_VAR_16 arg0, arg1, arg2, arg3
-	vld1.8 {q0}, [\arg0], \arg2 //load cur_row
+    vld1.8 {q0}, [\arg0], \arg2 //load cur_row
 
-	vpadal.u8 q3, q0	//add cur_row together
-	vpadal.u8 q4, q1	//add ref_row together
+    vpadal.u8 q3, q0    //add cur_row together
+    vpadal.u8 q4, q1    //add ref_row together
 
-	vabd.u8 q2, q0, q1	//abs_diff
+    vabd.u8 q2, q0, q1  //abs_diff
 
-	vpadal.u8 \arg3, q2							//l_sad for 16 bytes reset for every 8x16
+    vpadal.u8 \arg3, q2                         //l_sad for 16 bytes reset for every 8x16
 
-	vld1.8 {q1}, [\arg1], \arg2
+    vld1.8 {q1}, [\arg1], \arg2
 
-	vpadal.u8 q9, q0							//q9 for l_sum		reset for every 16x16
+    vpadal.u8 q9, q0                            //q9 for l_sum      reset for every 16x16
 
-	SSD_MUL_SUM_16BYTES d0,d1, q10, q11			//q10 for lsqsum	reset for every 16x16
+    SSD_MUL_SUM_16BYTES d0,d1, q10, q11         //q10 for lsqsum    reset for every 16x16
 .endm
 
 .macro SAD_VAR_16_END arg0, arg1, arg2
-	vld1.8 {q0}, [\arg0], \arg1 //load cur_row
+    vld1.8 {q0}, [\arg0], \arg1 //load cur_row
 
-	vpadal.u8 q3, q0	//add cur_row together
-	vpadal.u8 q4, q1	//add ref_row together
+    vpadal.u8 q3, q0    //add cur_row together
+    vpadal.u8 q4, q1    //add ref_row together
 
-	vabd.u8 q2, q0, q1	//abs_diff
+    vabd.u8 q2, q0, q1  //abs_diff
 
-	vpadal.u8 \arg2, q2							//l_sad for 16 bytes reset for every 8x16
+    vpadal.u8 \arg2, q2                         //l_sad for 16 bytes reset for every 8x16
 
-	vpadal.u8 q9, q0							//q9 for l_sum		reset for every 16x16
+    vpadal.u8 q9, q0                            //q9 for l_sum      reset for every 16x16
 
-	SSD_MUL_SUM_16BYTES d0,d1, q10, q11			//q10 for lsqsum	reset for every 16x16
+    SSD_MUL_SUM_16BYTES d0,d1, q10, q11         //q10 for lsqsum    reset for every 16x16
 .endm
 
 
 .macro SAD_VAR_16_RESET_16x16 arg0, arg1, arg2, arg3
-	vld1.8 {q0}, [\arg0], \arg2 //load cur_row
-	vld1.8 {q1}, [\arg1], \arg2
+    vld1.8 {q0}, [\arg0], \arg2 //load cur_row
+    vld1.8 {q1}, [\arg1], \arg2
 
-	vpaddl.u8 q3, q0	//add cur_row together
-	vpaddl.u8 q4, q1	//add ref_row together
+    vpaddl.u8 q3, q0    //add cur_row together
+    vpaddl.u8 q4, q1    //add ref_row together
 
-	vabd.u8 q2, q0, q1	//abs_diff
+    vabd.u8 q2, q0, q1  //abs_diff
 
-	vpaddl.u8 \arg3, q2							//l_sad for 16 bytes reset for every 8x16
+    vpaddl.u8 \arg3, q2                         //l_sad for 16 bytes reset for every 8x16
 
-	vld1.8 {q1}, [\arg1], \arg2
+    vld1.8 {q1}, [\arg1], \arg2
 
-	vpaddl.u8 q9, q0							//q9 for l_sum		reset for every 16x16
+    vpaddl.u8 q9, q0                            //q9 for l_sum      reset for every 16x16
 
-	SSD_MUL_SUM_16BYTES_RESET d0,d1, q10, q11
+    SSD_MUL_SUM_16BYTES_RESET d0,d1, q10, q11
 .endm
 
 .macro SAD_VAR_16_RESET_8x8 arg0, arg1, arg2, arg3
-	vld1.8 {q0}, [\arg0], \arg2 //load cur_row
+    vld1.8 {q0}, [\arg0], \arg2 //load cur_row
 
-	vpaddl.u8 q3, q0	//add cur_row together
-	vpaddl.u8 q4, q1	//add ref_row together
+    vpaddl.u8 q3, q0    //add cur_row together
+    vpaddl.u8 q4, q1    //add ref_row together
 
-	vabd.u8 q2, q0, q1	//abs_diff
+    vabd.u8 q2, q0, q1  //abs_diff
 
-	vpaddl.u8 \arg3, q2							//l_sad for 16 bytes reset for every 8x16
+    vpaddl.u8 \arg3, q2                         //l_sad for 16 bytes reset for every 8x16
 
-	vld1.8 {q1}, [\arg1], \arg2
+    vld1.8 {q1}, [\arg1], \arg2
 
-	vpadal.u8 q9, q0							//q9 for l_sum		reset for every 16x16
+    vpadal.u8 q9, q0                            //q9 for l_sum      reset for every 16x16
 
-	SSD_MUL_SUM_16BYTES d0,d1, q10, q11			//q10 for lsqsum	reset for every 16x16
+    SSD_MUL_SUM_16BYTES d0,d1, q10, q11         //q10 for lsqsum    reset for every 16x16
 .endm
 
 .macro SAD_VAR_16x16 arg0, arg1, arg2
-	//for one 8x16
-	SAD_VAR_16_RESET_16x16 \arg0, \arg1, \arg2, q6
-	SAD_VAR_16 \arg0, \arg1, \arg2, q6
-	SAD_VAR_16 \arg0, \arg1, \arg2, q6
-	SAD_VAR_16 \arg0, \arg1, \arg2, q6
-	SAD_VAR_16 \arg0, \arg1, \arg2, q6
-	SAD_VAR_16 \arg0, \arg1, \arg2, q6
-	SAD_VAR_16 \arg0, \arg1, \arg2, q6
-	SAD_VAR_16 \arg0, \arg1, \arg2, q6
+    //for one 8x16
+    SAD_VAR_16_RESET_16x16 \arg0, \arg1, \arg2, q6
+    SAD_VAR_16 \arg0, \arg1, \arg2, q6
+    SAD_VAR_16 \arg0, \arg1, \arg2, q6
+    SAD_VAR_16 \arg0, \arg1, \arg2, q6
+    SAD_VAR_16 \arg0, \arg1, \arg2, q6
+    SAD_VAR_16 \arg0, \arg1, \arg2, q6
+    SAD_VAR_16 \arg0, \arg1, \arg2, q6
+    SAD_VAR_16 \arg0, \arg1, \arg2, q6
 
-	vpaddl.u16 q6, q6
-	vpaddl.u32 q6, q6
-	vadd.i32 q12, q6
+    vpaddl.u16 q6, q6
+    vpaddl.u32 q6, q6
+    vadd.i32 q12, q6
 
-	//for another 8x16
-	SAD_VAR_16_RESET_8x8 \arg0, \arg1, \arg2, q7
-	SAD_VAR_16 \arg0, \arg1, \arg2, q7
-	SAD_VAR_16 \arg0, \arg1, \arg2, q7
-	SAD_VAR_16 \arg0, \arg1, \arg2, q7
-	SAD_VAR_16 \arg0, \arg1, \arg2, q7
-	SAD_VAR_16 \arg0, \arg1, \arg2, q7
-	SAD_VAR_16 \arg0, \arg1, \arg2, q7
-	SAD_VAR_16_END \arg0, \arg2, q7
+    //for another 8x16
+    SAD_VAR_16_RESET_8x8 \arg0, \arg1, \arg2, q7
+    SAD_VAR_16 \arg0, \arg1, \arg2, q7
+    SAD_VAR_16 \arg0, \arg1, \arg2, q7
+    SAD_VAR_16 \arg0, \arg1, \arg2, q7
+    SAD_VAR_16 \arg0, \arg1, \arg2, q7
+    SAD_VAR_16 \arg0, \arg1, \arg2, q7
+    SAD_VAR_16 \arg0, \arg1, \arg2, q7
+    SAD_VAR_16_END \arg0, \arg2, q7
 
-	vpaddl.u16 q7, q7
-	vpaddl.u32 q7, q7
+    vpaddl.u16 q7, q7
+    vpaddl.u32 q7, q7
 
-	vadd.i32 q12, q7
+    vadd.i32 q12, q7
 .endm
 #endif
 
 
 WELS_ASM_FUNC_BEGIN VAACalcSadVar_neon
-	stmdb sp!, {r4-r11}
-	vpush {q4}
-	vpush {q6-q7}
+    stmdb sp!, {r4-r11}
+    vpush {q4}
+    vpush {q6-q7}
 
-	ldr r4, [sp, #80] //r4 keeps the pic_stride
+    ldr r4, [sp, #80] //r4 keeps the pic_stride
 
-	sub r5, r4, #1
-	lsl r5, r5, #4 //r5 keeps the little step
+    sub r5, r4, #1
+    lsl r5, r5, #4 //r5 keeps the little step
 
-	lsl r6, r4, #4
-	sub r6, r2, r6	//r6 keeps the big step
+    lsl r6, r4, #4
+    sub r6, r2, r6  //r6 keeps the big step
 
-	ldr r7,		[sp, #84]	//psadframe
-	ldr r8,		[sp, #88]	//psad8x8
-	ldr r9,		[sp, #92]	//psum16x16
-	ldr r10,	[sp, #96]	//psqsum16x16
+    ldr r7,     [sp, #84]   //psadframe
+    ldr r8,     [sp, #88]   //psad8x8
+    ldr r9,     [sp, #92]   //psum16x16
+    ldr r10,    [sp, #96]   //psqsum16x16
 
-	vmov.i8 q12, #0
+    vmov.i8 q12, #0
 vaa_calc_sad_var_height_loop:
 
     mov r11, r2
@@ -956,154 +956,154 @@
 
     bne vaa_calc_sad_var_width_loop
 
-    sub r0, r0, r6		//jump to next 16 x width
-    sub r1, r1, r6		//jump to next 16 x width
+    sub r0, r0, r6      //jump to next 16 x width
+    sub r1, r1, r6      //jump to next 16 x width
 
     subs r3, #16
 bne vaa_calc_sad_var_height_loop
 
-	vadd.i32 d24, d24, d25
-	vst1.32 {d24[0]}, [r7]
+    vadd.i32 d24, d24, d25
+    vst1.32 {d24[0]}, [r7]
 
-	vpop {q6-q7}
-	vpop {q4}
-	ldmia sp!, {r4-r11}
+    vpop {q6-q7}
+    vpop {q4}
+    ldmia sp!, {r4-r11}
 WELS_ASM_FUNC_END
 
 
 #ifdef __APPLE__
 .macro SAD_SSD_16
-	SAD_VAR_16 $0, $1, $2, $3
+    SAD_VAR_16 $0, $1, $2, $3
 
-	SSD_MUL_SUM_16BYTES d4,d5,q8, q11
+    SSD_MUL_SUM_16BYTES d4,d5,q8, q11
 .endm
 
 .macro SAD_SSD_16_END
-	SAD_VAR_16_END $0, $1, $2
+    SAD_VAR_16_END $0, $1, $2
 
-	SSD_MUL_SUM_16BYTES d4,d5,q8, q11			//q8 for l_sqiff	reset for every 16x16
+    SSD_MUL_SUM_16BYTES d4,d5,q8, q11           //q8 for l_sqiff    reset for every 16x16
 .endm
 
 .macro SAD_SSD_16_RESET_16x16
-	SAD_VAR_16_RESET_16x16 $0, $1, $2, $3
+    SAD_VAR_16_RESET_16x16 $0, $1, $2, $3
 
-	SSD_MUL_SUM_16BYTES_RESET d4,d5,q8, q11			//q8 for l_sqiff	reset for every 16x16
+    SSD_MUL_SUM_16BYTES_RESET d4,d5,q8, q11         //q8 for l_sqiff    reset for every 16x16
 .endm
 
 .macro SAD_SSD_16_RESET_8x8
-	SAD_VAR_16_RESET_8x8 $0, $1, $2, $3
+    SAD_VAR_16_RESET_8x8 $0, $1, $2, $3
 
-	SSD_MUL_SUM_16BYTES d4,d5,q8, q11			//q8 for l_sqiff	reset for every 16x16
+    SSD_MUL_SUM_16BYTES d4,d5,q8, q11           //q8 for l_sqiff    reset for every 16x16
 .endm
 
 .macro SAD_SSD_16x16
-	//for one 8x16
-	SAD_SSD_16_RESET_16x16 $0, $1, $2, q6
-	SAD_SSD_16 $0, $1, $2, q6
-	SAD_SSD_16 $0, $1, $2, q6
-	SAD_SSD_16 $0, $1, $2, q6
-	SAD_SSD_16 $0, $1, $2, q6
-	SAD_SSD_16 $0, $1, $2, q6
-	SAD_SSD_16 $0, $1, $2, q6
-	SAD_SSD_16 $0, $1, $2, q6
+    //for one 8x16
+    SAD_SSD_16_RESET_16x16 $0, $1, $2, q6
+    SAD_SSD_16 $0, $1, $2, q6
+    SAD_SSD_16 $0, $1, $2, q6
+    SAD_SSD_16 $0, $1, $2, q6
+    SAD_SSD_16 $0, $1, $2, q6
+    SAD_SSD_16 $0, $1, $2, q6
+    SAD_SSD_16 $0, $1, $2, q6
+    SAD_SSD_16 $0, $1, $2, q6
 
-	vpaddl.u16 q6, q6
-	vpaddl.u32 q6, q6
-	vadd.i32 q12, q6
+    vpaddl.u16 q6, q6
+    vpaddl.u32 q6, q6
+    vadd.i32 q12, q6
 
-	//for another 8x16
-	SAD_SSD_16_RESET_8x8 $0, $1, $2, q7
-	SAD_SSD_16 $0, $1, $2, q7
-	SAD_SSD_16 $0, $1, $2, q7
-	SAD_SSD_16 $0, $1, $2, q7
-	SAD_SSD_16 $0, $1, $2, q7
-	SAD_SSD_16 $0, $1, $2, q7
-	SAD_SSD_16 $0, $1, $2, q7
-	SAD_SSD_16_END $0, $2, q7
+    //for another 8x16
+    SAD_SSD_16_RESET_8x8 $0, $1, $2, q7
+    SAD_SSD_16 $0, $1, $2, q7
+    SAD_SSD_16 $0, $1, $2, q7
+    SAD_SSD_16 $0, $1, $2, q7
+    SAD_SSD_16 $0, $1, $2, q7
+    SAD_SSD_16 $0, $1, $2, q7
+    SAD_SSD_16 $0, $1, $2, q7
+    SAD_SSD_16_END $0, $2, q7
 
-	vpaddl.u16 q7, q7
-	vpaddl.u32 q7, q7
+    vpaddl.u16 q7, q7
+    vpaddl.u32 q7, q7
 
-	vadd.i32 q12, q7
+    vadd.i32 q12, q7
 .endm
 #else
 .macro SAD_SSD_16 arg0, arg1, arg2, arg3
-	SAD_VAR_16 \arg0, \arg1, \arg2, \arg3
+    SAD_VAR_16 \arg0, \arg1, \arg2, \arg3
 
-	SSD_MUL_SUM_16BYTES d4,d5,q8, q11
+    SSD_MUL_SUM_16BYTES d4,d5,q8, q11
 .endm
 
 .macro SAD_SSD_16_END arg0, arg1, arg2
-	SAD_VAR_16_END \arg0, \arg1, \arg2
+    SAD_VAR_16_END \arg0, \arg1, \arg2
 
-	SSD_MUL_SUM_16BYTES d4,d5,q8, q11			//q8 for l_sqiff	reset for every 16x16
+    SSD_MUL_SUM_16BYTES d4,d5,q8, q11           //q8 for l_sqiff    reset for every 16x16
 .endm
 
 .macro SAD_SSD_16_RESET_16x16 arg0, arg1, arg2, arg3
-	SAD_VAR_16_RESET_16x16 \arg0, \arg1, \arg2, \arg3
+    SAD_VAR_16_RESET_16x16 \arg0, \arg1, \arg2, \arg3
 
-	SSD_MUL_SUM_16BYTES_RESET d4,d5,q8, q11			//q8 for l_sqiff	reset for every 16x16
+    SSD_MUL_SUM_16BYTES_RESET d4,d5,q8, q11         //q8 for l_sqiff    reset for every 16x16
 .endm
 
 .macro SAD_SSD_16_RESET_8x8 arg0, arg1, arg2, arg3
-	SAD_VAR_16_RESET_8x8 \arg0, \arg1, \arg2, \arg3
+    SAD_VAR_16_RESET_8x8 \arg0, \arg1, \arg2, \arg3
 
-	SSD_MUL_SUM_16BYTES d4,d5,q8, q11			//q8 for l_sqiff	reset for every 16x16
+    SSD_MUL_SUM_16BYTES d4,d5,q8, q11           //q8 for l_sqiff    reset for every 16x16
 .endm
 
 .macro SAD_SSD_16x16 arg0, arg1, arg2
-	//for one 8x16
-	SAD_SSD_16_RESET_16x16 \arg0, \arg1, \arg2, q6
-	SAD_SSD_16 \arg0, \arg1, \arg2, q6
-	SAD_SSD_16 \arg0, \arg1, \arg2, q6
-	SAD_SSD_16 \arg0, \arg1, \arg2, q6
-	SAD_SSD_16 \arg0, \arg1, \arg2, q6
-	SAD_SSD_16 \arg0, \arg1, \arg2, q6
-	SAD_SSD_16 \arg0, \arg1, \arg2, q6
-	SAD_SSD_16 \arg0, \arg1, \arg2, q6
+    //for one 8x16
+    SAD_SSD_16_RESET_16x16 \arg0, \arg1, \arg2, q6
+    SAD_SSD_16 \arg0, \arg1, \arg2, q6
+    SAD_SSD_16 \arg0, \arg1, \arg2, q6
+    SAD_SSD_16 \arg0, \arg1, \arg2, q6
+    SAD_SSD_16 \arg0, \arg1, \arg2, q6
+    SAD_SSD_16 \arg0, \arg1, \arg2, q6
+    SAD_SSD_16 \arg0, \arg1, \arg2, q6
+    SAD_SSD_16 \arg0, \arg1, \arg2, q6
 
-	vpaddl.u16 q6, q6
-	vpaddl.u32 q6, q6
-	vadd.i32 q12, q6
+    vpaddl.u16 q6, q6
+    vpaddl.u32 q6, q6
+    vadd.i32 q12, q6
 
-	//for another 8x16
-	SAD_SSD_16_RESET_8x8 \arg0, \arg1, \arg2, q7
-	SAD_SSD_16 \arg0, \arg1, \arg2, q7
-	SAD_SSD_16 \arg0, \arg1, \arg2, q7
-	SAD_SSD_16 \arg0, \arg1, \arg2, q7
-	SAD_SSD_16 \arg0, \arg1, \arg2, q7
-	SAD_SSD_16 \arg0, \arg1, \arg2, q7
-	SAD_SSD_16 \arg0, \arg1, \arg2, q7
-	SAD_SSD_16_END \arg0, \arg2, q7
+    //for another 8x16
+    SAD_SSD_16_RESET_8x8 \arg0, \arg1, \arg2, q7
+    SAD_SSD_16 \arg0, \arg1, \arg2, q7
+    SAD_SSD_16 \arg0, \arg1, \arg2, q7
+    SAD_SSD_16 \arg0, \arg1, \arg2, q7
+    SAD_SSD_16 \arg0, \arg1, \arg2, q7
+    SAD_SSD_16 \arg0, \arg1, \arg2, q7
+    SAD_SSD_16 \arg0, \arg1, \arg2, q7
+    SAD_SSD_16_END \arg0, \arg2, q7
 
-	vpaddl.u16 q7, q7
-	vpaddl.u32 q7, q7
+    vpaddl.u16 q7, q7
+    vpaddl.u32 q7, q7
 
-	vadd.i32 q12, q7
+    vadd.i32 q12, q7
 .endm
 #endif
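
Note (not part of the patch): the vpaddl.u16 / vpaddl.u32 pairs in the macros above are a widening pairwise reduction. Eight 16-bit partial sums collapse into four 32-bit lanes and then into two 64-bit lanes whose values still fit in 32 bits, so they can be folded into the q12 accumulator with a plain vadd.i32. A scalar sketch of the total that reduction produces for one q register, for reference only:

    /* Same total the two widening pairwise adds compute for one q register. */
    static unsigned int reduce_u16x8(const unsigned short v[8]) {
        unsigned int sum = 0;
        for (int i = 0; i < 8; i++)
            sum += v[i];
        return sum;
    }
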
 
 
 WELS_ASM_FUNC_BEGIN VAACalcSadSsd_neon
-	stmdb sp!, {r4-r12}
-	vpush {q4}
-	vpush {q6-q7}
+    stmdb sp!, {r4-r12}
+    vpush {q4}
+    vpush {q6-q7}
 
-	ldr r4, [sp, #84] //r4 keeps the pic_stride
+    ldr r4, [sp, #84] //r4 keeps the pic_stride
 
-	sub r5, r4, #1
-	lsl r5, r5, #4 //r5 keeps the little step
+    sub r5, r4, #1
+    lsl r5, r5, #4 //r5 keeps the little step
 
-	lsl r6, r4, #4
-	sub r6, r2, r6	//r6 keeps the big step
+    lsl r6, r4, #4
+    sub r6, r2, r6  //r6 keeps the big step
 
-	ldr r7,		[sp, #88]	//psadframe
-	ldr r8,		[sp, #92]	//psad8x8
-	ldr r9,		[sp, #96]	//psum16x16
-	ldr r10,	[sp, #100]	//psqsum16x16
-	ldr r11,	[sp, #104]	//psqdiff16x16
+    ldr r7,     [sp, #88]   //psadframe
+    ldr r8,     [sp, #92]   //psad8x8
+    ldr r9,     [sp, #96]   //psum16x16
+    ldr r10,    [sp, #100]  //psqsum16x16
+    ldr r11,    [sp, #104]  //psqdiff16x16
 
-	vmov.i8 q12, #0
+    vmov.i8 q12, #0
 vaa_calc_sad_ssd_height_loop:
 
     mov r12, r2
@@ -1136,18 +1136,18 @@
 
     bne vaa_calc_sad_ssd_width_loop
 
-    sub r0, r0, r6		//jump to next 16 x width
-    sub r1, r1, r6		//jump to next 16 x width
+    sub r0, r0, r6      //jump to next 16 x width
+    sub r1, r1, r6      //jump to next 16 x width
 
     subs r3, #16
-	bne vaa_calc_sad_ssd_height_loop
+    bne vaa_calc_sad_ssd_height_loop
 
-	vadd.i32 d24, d24, d25
-	vst1.32 {d24[0]}, [r7]
+    vadd.i32 d24, d24, d25
+    vst1.32 {d24[0]}, [r7]
 
-	vpop {q6-q7}
-	vpop {q4}
-	ldmia sp!, {r4-r12}
+    vpop {q6-q7}
+    vpop {q4}
+    ldmia sp!, {r4-r12}
 WELS_ASM_FUNC_END
 
 #endif
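
For orientation, a rough scalar model of what VAACalcSadSsd_neon accumulates per 16x16 macroblock: per-8x8 SADs against the reference, a frame-total SAD, and the sum, sum of squares and squared-difference totals of the current block. The parameter order follows the register comments above (cur, ref, width, height, stride, then the five output pointers); the exact psad8x8 block ordering is an assumption, and the NEON code walks each macroblock as two 8x16 halves rather than four 8x8 blocks:

    /* Hedged scalar sketch of the statistics VAACalcSadSsd_neon produces. */
    static void vaa_calc_sad_ssd_ref(const unsigned char* cur, const unsigned char* ref,
                                     int width, int height, int stride,
                                     int* frame_sad, int* sad8x8, int* sum16x16,
                                     int* sqsum16x16, int* sqdiff16x16) {
        int mb = 0;
        *frame_sad = 0;
        for (int y = 0; y < height; y += 16) {
            for (int x = 0; x < width; x += 16, mb++) {
                int sum = 0, sqsum = 0, sqdiff = 0;
                for (int b = 0; b < 4; b++) {              /* four 8x8 blocks per MB */
                    int bx = x + (b & 1) * 8, by = y + (b >> 1) * 8, sad = 0;
                    for (int j = 0; j < 8; j++) {
                        for (int i = 0; i < 8; i++) {
                            int c = cur[(by + j) * stride + bx + i];
                            int r = ref[(by + j) * stride + bx + i];
                            int d = c - r;
                            sad    += d < 0 ? -d : d;
                            sum    += c;
                            sqsum  += c * c;
                            sqdiff += d * d;
                        }
                    }
                    sad8x8[mb * 4 + b] = sad;              /* assumed block order */
                    *frame_sad += sad;
                }
                sum16x16[mb]    = sum;
                sqsum16x16[mb]  = sqsum;
                sqdiff16x16[mb] = sqdiff;
            }
        }
    }
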
--- a/codec/processing/src/x86/denoisefilter.asm
+++ b/codec/processing/src/x86/denoisefilter.asm
@@ -56,217 +56,217 @@
 ;***********************************************************************
 SECTION .text
 
-%macro WEIGHT_LINE	9
-		movq		%2,	%9
-		punpcklbw	%2,	%7
-		movdqa		%8,	%2
+%macro WEIGHT_LINE  9
+    movq        %2, %9
+    punpcklbw   %2, %7
+    movdqa      %8, %2
 
-		movdqa		%1,	%6
-		psubusb		%1,	%8
-		psubusb		%8,	%6
-		por			%8,	%1		; ABS(curPixel - centerPixel);
+    movdqa      %1, %6
+    psubusb     %1, %8
+    psubusb     %8, %6
+    por         %8, %1      ; ABS(curPixel - centerPixel);
 
-		movdqa		%1,	%3
-		psubusb		%1,	%8
+    movdqa      %1, %3
+    psubusb     %1, %8
 
-		pmullw		%1,	%1
-		psrlw		%1,	5
-		pmullw		%2,	%1
-		paddusw		%4,	%1
-		paddusw		%5,	%2
+    pmullw      %1, %1
+    psrlw       %1, 5
+    pmullw      %2, %1
+    paddusw     %4, %1
+    paddusw     %5, %2
 %endmacro
 
-%macro WEIGHT_LINE1_UV	4
-		movdqa		%2,	%1
-		punpcklbw	%2,	%4
-		paddw		%3,	%2
+%macro WEIGHT_LINE1_UV  4
+    movdqa      %2, %1
+    punpcklbw   %2, %4
+    paddw       %3, %2
 
-		movdqa		%2,	%1
-		psrldq		%2,	1
-		punpcklbw	%2,	%4
-		paddw		%3,	%2
+    movdqa      %2, %1
+    psrldq      %2, 1
+    punpcklbw   %2, %4
+    paddw       %3, %2
 
-		movdqa		%2,	%1
-		psrldq		%2,	2
-		punpcklbw	%2,	%4
-		psllw		%2,	1
-		paddw		%3,	%2
+    movdqa      %2, %1
+    psrldq      %2, 2
+    punpcklbw   %2, %4
+    psllw       %2, 1
+    paddw       %3, %2
 
-		movdqa		%2,	%1
-		psrldq		%2,	3
-		punpcklbw	%2,	%4
-		paddw		%3,	%2
+    movdqa      %2, %1
+    psrldq      %2, 3
+    punpcklbw   %2, %4
+    paddw       %3, %2
 
-		movdqa		%2,	%1
-		psrldq		%2,	4
-		punpcklbw	%2,	%4
-		paddw		%3,	%2
+    movdqa      %2, %1
+    psrldq      %2, 4
+    punpcklbw   %2, %4
+    paddw       %3, %2
 %endmacro
 
-%macro WEIGHT_LINE2_UV	4
-		movdqa		%2,	%1
-		punpcklbw	%2,	%4
-		paddw		%3,	%2
+%macro WEIGHT_LINE2_UV  4
+    movdqa      %2, %1
+    punpcklbw   %2, %4
+    paddw       %3, %2
 
-		movdqa		%2,	%1
-		psrldq		%2,	1
-		punpcklbw	%2,	%4
-		psllw		%2,	1
-		paddw		%3,	%2
+    movdqa      %2, %1
+    psrldq      %2, 1
+    punpcklbw   %2, %4
+    psllw       %2, 1
+    paddw       %3, %2
 
-		movdqa		%2,	%1
-		psrldq		%2,	2
-		punpcklbw	%2,	%4
-		psllw		%2,	2
-		paddw		%3,	%2
+    movdqa      %2, %1
+    psrldq      %2, 2
+    punpcklbw   %2, %4
+    psllw       %2, 2
+    paddw       %3, %2
 
-		movdqa		%2,	%1
-		psrldq		%2,	3
-		punpcklbw	%2,	%4
-		psllw		%2,	1
-		paddw		%3,	%2
+    movdqa      %2, %1
+    psrldq      %2, 3
+    punpcklbw   %2, %4
+    psllw       %2, 1
+    paddw       %3, %2
 
-		movdqa		%2,	%1
-		psrldq		%2,	4
-		punpcklbw	%2,	%4
-		paddw		%3,	%2
+    movdqa      %2, %1
+    psrldq      %2, 4
+    punpcklbw   %2, %4
+    paddw       %3, %2
 %endmacro
 
-%macro WEIGHT_LINE3_UV	4
-		movdqa		%2,	%1
-		punpcklbw	%2,	%4
-		psllw		%2,	1
-		paddw		%3,	%2
+%macro WEIGHT_LINE3_UV  4
+    movdqa      %2, %1
+    punpcklbw   %2, %4
+    psllw       %2, 1
+    paddw       %3, %2
 
-		movdqa		%2,	%1
-		psrldq		%2,	1
-		punpcklbw	%2,	%4
-		psllw		%2,	2
-		paddw		%3,	%2
+    movdqa      %2, %1
+    psrldq      %2, 1
+    punpcklbw   %2, %4
+    psllw       %2, 2
+    paddw       %3, %2
 
-		movdqa		%2,	%1
-		psrldq		%2,	2
-		punpcklbw	%2,	%4
-		pmullw		%2,	[sse2_20]
-		paddw		%3,	%2
+    movdqa      %2, %1
+    psrldq      %2, 2
+    punpcklbw   %2, %4
+    pmullw      %2, [sse2_20]
+    paddw       %3, %2
 
-		movdqa		%2,	%1
-		psrldq		%2,	3
-		punpcklbw	%2,	%4
-		psllw		%2,	2
-		paddw		%3,	%2
+    movdqa      %2, %1
+    psrldq      %2, 3
+    punpcklbw   %2, %4
+    psllw       %2, 2
+    paddw       %3, %2
 
-		movdqa		%2,	%1
-		psrldq		%2,	4
-		punpcklbw	%2,	%4
-		psllw		%2,	1
-		paddw		%3,	%2
+    movdqa      %2, %1
+    psrldq      %2, 4
+    punpcklbw   %2, %4
+    psllw       %2, 1
+    paddw       %3, %2
 %endmacro
 
 ;***********************************************************************
 ;  BilateralLumaFilter8_sse2(uint8_t *pixels, int stride);
 ;***********************************************************************
-;	1	2	3
-;	4	0	5
-;	6	7	8
-;	0:	the center point
+;   1   2   3
+;   4   0   5
+;   6   7   8
+;   0:  the center point
 
 WELS_EXTERN BilateralLumaFilter8_sse2
 
-        push r3
-        %assign push_num 1
-        LOAD_2_PARA
-        PUSH_XMM 8
+    push r3
+    %assign push_num 1
+    LOAD_2_PARA
+    PUSH_XMM 8
 
-		pxor		xmm7,	xmm7
+    pxor        xmm7,   xmm7
 
-		mov         r3,     r0
+    mov         r3,     r0
 
-		movq        xmm6,   [r0]
-		punpcklbw	xmm6,	xmm7
-		movdqa		xmm3,	[sse2_32]
-		pxor		xmm4,	xmm4		; nTotWeight
-		pxor		xmm5,	xmm5		; nSum
+    movq        xmm6,   [r0]
+    punpcklbw   xmm6,   xmm7
+    movdqa      xmm3,   [sse2_32]
+    pxor        xmm4,   xmm4        ; nTotWeight
+    pxor        xmm5,   xmm5        ; nSum
 
-        dec         r0
-		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [r0]			; pixel 4
-		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [r0 + 2]		; pixel 5
+    dec         r0
+    WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [r0]           ; pixel 4
+    WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [r0 + 2]       ; pixel 5
 
-		sub			r0,	r1
-		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [r0]			; pixel 1
-		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [r0 + 1]		; pixel 2
-		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [r0 + 2]		; pixel 3
+    sub         r0, r1
+    WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [r0]           ; pixel 1
+    WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [r0 + 1]       ; pixel 2
+    WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [r0 + 2]       ; pixel 3
 
-		lea			r0,	[r0 + r1 * 2]
-		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [r0]			; pixel 6
-		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [r0 + 1]		; pixel 7
-		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [r0 + 2]		; pixel 8
+    lea         r0, [r0 + r1 * 2]
+    WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [r0]           ; pixel 6
+    WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [r0 + 1]       ; pixel 7
+    WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [r0 + 2]       ; pixel 8
 
-		pcmpeqw		xmm0,	xmm0
-		psrlw		xmm0,	15
-		psllw		xmm0,	8
-		psubusw		xmm0,	xmm4
-		pmullw		xmm0,	xmm6
-		paddusw		xmm5,	xmm0
-		psrlw		xmm5,	8
-		packuswb	xmm5,	xmm5
-		movq		[r3],	xmm5
+    pcmpeqw     xmm0,   xmm0
+    psrlw       xmm0,   15
+    psllw       xmm0,   8
+    psubusw     xmm0,   xmm4
+    pmullw      xmm0,   xmm6
+    paddusw     xmm5,   xmm0
+    psrlw       xmm5,   8
+    packuswb    xmm5,   xmm5
+    movq        [r3],   xmm5
 
 
-		POP_XMM
-		pop r3
-		%assign push_num 0
+    POP_XMM
+    pop r3
+    %assign push_num 0
 
-		ret
+    ret
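
A scalar model of the weighting BilateralLumaFilter8_sse2 applies to each of the eight pixels it handles per call, read off the WEIGHT_LINE macro above. The sse2_32 constant is assumed to hold 32 per lane; the helper name and pixel addressing below are illustrative only:

    /* Hedged sketch: bilateral weight for one pixel over its 3x3 neighbourhood. */
    static unsigned char bilateral_luma_px(const unsigned char* p, int stride) {
        static const int off[8][2] = {
            { -1, -1 }, { 0, -1 }, { 1, -1 },
            { -1,  0 },            { 1,  0 },
            { -1,  1 }, { 0,  1 }, { 1,  1 },
        };
        int c = p[0], tot_w = 0, sum = 0;
        for (int k = 0; k < 8; k++) {
            int n = p[off[k][1] * stride + off[k][0]];
            int d = n > c ? n - c : c - n;
            int w = d < 32 ? 32 - d : 0;       /* saturating 32 - |n - c| */
            w = (w * w) >> 5;                  /* square to sharpen the falloff */
            tot_w += w;
            sum   += w * n;
        }
        sum += (256 - tot_w) * c;              /* leftover weight stays on the center */
        return (unsigned char)(sum >> 8);
    }
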
 
 ;***********************************************************************
-; void		WaverageChromaFilter8_sse2(uint8_t *pixels, int stride);
+; void      WaverageChromaFilter8_sse2(uint8_t *pixels, int stride);
 ;***********************************************************************
 ;5x5 filter:
-;1	1	2	1	1
-;1	2	4	2	1
-;2	4	20	4	2
-;1	2	4	2	1
-;1	1	2	1	1
+;1  1   2   1   1
+;1  2   4   2   1
+;2  4   20  4   2
+;1  2   4   2   1
+;1  1   2   1   1
 
 WELS_EXTERN WaverageChromaFilter8_sse2
 
-        push r3
+    push r3
 
-        %assign push_num 1
+    %assign push_num 1
 
-        LOAD_2_PARA
+    LOAD_2_PARA
 
-        mov		r3,	r1
-		add		r3,	r3
-		sub		r0,	r3			; pixels - 2 * stride
-		sub		r0,	2
+    mov     r3, r1
+    add     r3, r3
+    sub     r0, r3          ; pixels - 2 * stride
+    sub     r0, 2
 
-		pxor	xmm0,	xmm0
-		pxor	xmm3,	xmm3
+    pxor    xmm0,   xmm0
+    pxor    xmm3,   xmm3
 
-		movdqu		xmm1,	[r0]
-		WEIGHT_LINE1_UV	xmm1,	xmm2,	xmm3,	xmm0
+    movdqu      xmm1,   [r0]
+    WEIGHT_LINE1_UV xmm1,   xmm2,   xmm3,   xmm0
 
-		movdqu		xmm1,	[r0 + r1]
-		WEIGHT_LINE2_UV	xmm1,	xmm2,	xmm3,	xmm0
+    movdqu      xmm1,   [r0 + r1]
+    WEIGHT_LINE2_UV xmm1,   xmm2,   xmm3,   xmm0
 
-		add		r0,	r3
-		movdqu		xmm1,	[r0]
-		WEIGHT_LINE3_UV	xmm1,	xmm2,	xmm3,	xmm0
+    add     r0, r3
+    movdqu      xmm1,   [r0]
+    WEIGHT_LINE3_UV xmm1,   xmm2,   xmm3,   xmm0
 
-		movdqu		xmm1,	[r0 + r1]
-		WEIGHT_LINE2_UV	xmm1,	xmm2,	xmm3,	xmm0
+    movdqu      xmm1,   [r0 + r1]
+    WEIGHT_LINE2_UV xmm1,   xmm2,   xmm3,   xmm0
 
-		movdqu		xmm1,	[r0 + r1 * 2]
-		WEIGHT_LINE1_UV	xmm1,	xmm2,	xmm3,	xmm0
+    movdqu      xmm1,   [r0 + r1 * 2]
+    WEIGHT_LINE1_UV xmm1,   xmm2,   xmm3,   xmm0
 
-		psrlw		xmm3,		6
-		packuswb	xmm3,		xmm3
-		movq		[r0 + 2],		xmm3
+    psrlw       xmm3,       6
+    packuswb    xmm3,       xmm3
+    movq        [r0 + 2],       xmm3
 
 
-        pop r3
+    pop r3
 
-        %assign push_num 0
-		ret
+    %assign push_num 0
+    ret
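
The 5x5 weight table in the comment above sums to 64, which is why WaverageChromaFilter8_sse2 normalizes with psrlw xmm3, 6. A scalar equivalent for a single pixel (illustrative helper, not part of the patch):

    /* Hedged sketch: 5x5 weighted average around p, weights sum to 64 (>> 6). */
    static unsigned char waverage_chroma_px(const unsigned char* p, int stride) {
        static const int w[5][5] = {
            { 1, 1,  2, 1, 1 },
            { 1, 2,  4, 2, 1 },
            { 2, 4, 20, 4, 2 },
            { 1, 2,  4, 2, 1 },
            { 1, 1,  2, 1, 1 },
        };
        int sum = 0;
        for (int j = -2; j <= 2; j++)
            for (int i = -2; i <= 2; i++)
                sum += w[j + 2][i + 2] * p[j * stride + i];
        return (unsigned char)(sum >> 6);
    }
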
--- a/codec/processing/src/x86/downsample_bilinear.asm
+++ b/codec/processing/src/x86/downsample_bilinear.asm
@@ -29,13 +29,13 @@
 ;*     POSSIBILITY OF SUCH DAMAGE.
 ;*
 ;*
-;*	upsampling.asm
+;*  upsampling.asm
 ;*
 ;*  Abstract
-;*		SIMD for pixel domain down sampling
+;*      SIMD for pixel domain down sampling
 ;*
 ;*  History
-;*		10/22/2009	Created
+;*      10/22/2009  Created
 ;*
 ;*************************************************************************/
 %include "asm_inc.asm"
@@ -61,9 +61,9 @@
 
 ALIGN 16
 shufb_mask_low:
-	db 00h, 80h, 02h, 80h, 04h, 80h, 06h, 80h, 08h, 80h, 0ah, 80h, 0ch, 80h, 0eh, 80h
+    db 00h, 80h, 02h, 80h, 04h, 80h, 06h, 80h, 08h, 80h, 0ah, 80h, 0ch, 80h, 0eh, 80h
 shufb_mask_high:
-	db 01h, 80h, 03h, 80h, 05h, 80h, 07h, 80h, 09h, 80h, 0bh, 80h, 0dh, 80h, 0fh, 80h
+    db 01h, 80h, 03h, 80h, 05h, 80h, 07h, 80h, 09h, 80h, 0bh, 80h, 0dh, 80h, 0fh, 80h
 
 
 ;***********************************************************************
@@ -73,737 +73,737 @@
 SECTION .text
 
 ;***********************************************************************
-;	void DyadicBilinearDownsamplerWidthx32_sse(	unsigned char* pDst, const int iDstStride,
-;					unsigned char* pSrc, const int iSrcStride,
-;					const int iSrcWidth, const int iSrcHeight );
+;   void DyadicBilinearDownsamplerWidthx32_sse( unsigned char* pDst, const int iDstStride,
+;                   unsigned char* pSrc, const int iSrcStride,
+;                   const int iSrcWidth, const int iSrcHeight );
 ;***********************************************************************
 WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
+    push ebx
+    push edx
+    push esi
+    push edi
+    push ebp
 
-	mov edi, [esp+24]	; pDst
-	mov edx, [esp+28]	; iDstStride
-	mov esi, [esp+32]	; pSrc
-	mov ecx, [esp+36]	; iSrcStride
-	mov ebp, [esp+44]	; iSrcHeight
+    mov edi, [esp+24]   ; pDst
+    mov edx, [esp+28]   ; iDstStride
+    mov esi, [esp+32]   ; pSrc
+    mov ecx, [esp+36]   ; iSrcStride
+    mov ebp, [esp+44]   ; iSrcHeight
 
-	sar ebp, $01			; iSrcHeight >> 1
+    sar ebp, $01            ; iSrcHeight >> 1
 
 .yloops:
-	mov eax, [esp+40]	; iSrcWidth
-	sar eax, $01			; iSrcWidth >> 1
-	mov ebx, eax		; iDstWidth restored at ebx
-	sar eax, $04			; (iSrcWidth >> 1) / 16		; loop count = num_of_mb
-	neg ebx				; - (iSrcWidth >> 1)
-	; each loop = source bandwidth: 32 bytes
+    mov eax, [esp+40]   ; iSrcWidth
+    sar eax, $01            ; iSrcWidth >> 1
+    mov ebx, eax        ; iDstWidth restored at ebx
+    sar eax, $04            ; (iSrcWidth >> 1) / 16     ; loop count = num_of_mb
+    neg ebx             ; - (iSrcWidth >> 1)
+    ; each loop = source bandwidth: 32 bytes
 .xloops:
-	; 1st part horizonal loop: x16 bytes
-	;               mem  hi<-       ->lo
-	;1st Line Src:	mm0: d D c C b B a A	mm1: h H g G f F e E
-	;2nd Line Src:	mm2: l L k K j J i I   	mm3: p P o O n N m M
-	;=> target:
-	;: H G F E D C B A, P O N M L K J I
-	;: h g f e d c b a, p o n m l k j i
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-	movq mm0, [esi]			; 1st pSrc line
-	movq mm1, [esi+8]		; 1st pSrc line + 8
-	movq mm2, [esi+ecx]		; 2nd pSrc line
-	movq mm3, [esi+ecx+8]	; 2nd pSrc line + 8
+    ; 1st part horizonal loop: x16 bytes
+    ;               mem  hi<-       ->lo
+    ;1st Line Src:  mm0: d D c C b B a A    mm1: h H g G f F e E
+    ;2nd Line Src:  mm2: l L k K j J i I    mm3: p P o O n N m M
+    ;=> target:
+    ;: H G F E D C B A, P O N M L K J I
+    ;: h g f e d c b a, p o n m l k j i
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+    movq mm0, [esi]         ; 1st pSrc line
+    movq mm1, [esi+8]       ; 1st pSrc line + 8
+    movq mm2, [esi+ecx]     ; 2nd pSrc line
+    movq mm3, [esi+ecx+8]   ; 2nd pSrc line + 8
 
-	; to handle mm0, mm1, mm2, mm3
-	pshufw mm4, mm0, 0d8h	; d D b B c C a A ; 11011000 B
-	pshufw mm5, mm4, 04eh	; c C a A d D b B ; 01001110 B
-	punpcklbw mm4, mm5		; d c D C b a B A
-	pshufw mm4, mm4, 0d8h  	; d c b a D C B A ; 11011000 B: mm4
+    ; to handle mm0, mm1, mm2, mm3
+    pshufw mm4, mm0, 0d8h   ; d D b B c C a A ; 11011000 B
+    pshufw mm5, mm4, 04eh   ; c C a A d D b B ; 01001110 B
+    punpcklbw mm4, mm5      ; d c D C b a B A
+    pshufw mm4, mm4, 0d8h   ; d c b a D C B A ; 11011000 B: mm4
 
-	pshufw mm5, mm1, 0d8h	; h H f F g G e E ; 11011000 B
-	pshufw mm6, mm5, 04eh	; g G e E h H f F ; 01001110 B
-	punpcklbw mm5, mm6		; h g H G f e F E
-	pshufw mm5, mm5, 0d8h  	; h g f e H G F E ; 11011000 B: mm5
+    pshufw mm5, mm1, 0d8h   ; h H f F g G e E ; 11011000 B
+    pshufw mm6, mm5, 04eh   ; g G e E h H f F ; 01001110 B
+    punpcklbw mm5, mm6      ; h g H G f e F E
+    pshufw mm5, mm5, 0d8h   ; h g f e H G F E ; 11011000 B: mm5
 
-	pshufw mm6, mm2, 0d8h	; l L j J k K i I ; 11011000 B
-	pshufw mm7, mm6, 04eh	; k K i I l L j J ; 01001110 B
-	punpcklbw mm6, mm7		; l k L K j i J I
-	pshufw mm6, mm6, 0d8h  	; l k j i L K J I ; 11011000 B: mm6
+    pshufw mm6, mm2, 0d8h   ; l L j J k K i I ; 11011000 B
+    pshufw mm7, mm6, 04eh   ; k K i I l L j J ; 01001110 B
+    punpcklbw mm6, mm7      ; l k L K j i J I
+    pshufw mm6, mm6, 0d8h   ; l k j i L K J I ; 11011000 B: mm6
 
-	pshufw mm7, mm3, 0d8h	; p P n N o O m M ; 11011000 B
-	pshufw mm0, mm7, 04eh	; o O m M p P n N ; 01001110 B
-	punpcklbw mm7, mm0 		; p o P O n m N M
-	pshufw mm7, mm7, 0d8h  	; p o n m P O N M ; 11011000 B: mm7
+    pshufw mm7, mm3, 0d8h   ; p P n N o O m M ; 11011000 B
+    pshufw mm0, mm7, 04eh   ; o O m M p P n N ; 01001110 B
+    punpcklbw mm7, mm0      ; p o P O n m N M
+    pshufw mm7, mm7, 0d8h   ; p o n m P O N M ; 11011000 B: mm7
 
-	; to handle mm4, mm5, mm6, mm7
-	movq mm0, mm4		;
-	punpckldq mm0, mm5 	; H G F E D C B A
-	punpckhdq mm4, mm5 	; h g f e d c b a
+    ; to handle mm4, mm5, mm6, mm7
+    movq mm0, mm4       ;
+    punpckldq mm0, mm5  ; H G F E D C B A
+    punpckhdq mm4, mm5  ; h g f e d c b a
 
-	movq mm1, mm6
-	punpckldq mm1, mm7 	; P O N M L K J I
-	punpckhdq mm6, mm7 	; p o n m l k j i
+    movq mm1, mm6
+    punpckldq mm1, mm7  ; P O N M L K J I
+    punpckhdq mm6, mm7  ; p o n m l k j i
 
-	; avg within MB horizon width (16 x 2 lines)
-	pavgb mm0, mm4		; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
-	pavgb mm1, mm6		; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
-	pavgb mm0, mm1		; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once
+    ; avg within MB horizon width (16 x 2 lines)
+    pavgb mm0, mm4      ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
+    pavgb mm1, mm6      ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
+    pavgb mm0, mm1      ; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once
 
-	; 2nd part horizonal loop: x16 bytes
-	;               mem  hi<-       ->lo
-	;1st Line Src:	mm0: d D c C b B a A	mm1: h H g G f F e E
-	;2nd Line Src:	mm2: l L k K j J i I   	mm3: p P o O n N m M
-	;=> target:
-	;: H G F E D C B A, P O N M L K J I
-	;: h g f e d c b a, p o n m l k j i
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-	movq mm1, [esi+16]		; 1st pSrc line + 16
-	movq mm2, [esi+24]		; 1st pSrc line + 24
-	movq mm3, [esi+ecx+16]	; 2nd pSrc line + 16
-	movq mm4, [esi+ecx+24]	; 2nd pSrc line + 24
+    ; 2nd part horizonal loop: x16 bytes
+    ;               mem  hi<-       ->lo
+    ;1st Line Src:  mm0: d D c C b B a A    mm1: h H g G f F e E
+    ;2nd Line Src:  mm2: l L k K j J i I    mm3: p P o O n N m M
+    ;=> target:
+    ;: H G F E D C B A, P O N M L K J I
+    ;: h g f e d c b a, p o n m l k j i
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+    movq mm1, [esi+16]      ; 1st pSrc line + 16
+    movq mm2, [esi+24]      ; 1st pSrc line + 24
+    movq mm3, [esi+ecx+16]  ; 2nd pSrc line + 16
+    movq mm4, [esi+ecx+24]  ; 2nd pSrc line + 24
 
-	; to handle mm1, mm2, mm3, mm4
-	pshufw mm5, mm1, 0d8h	; d D b B c C a A ; 11011000 B
-	pshufw mm6, mm5, 04eh	; c C a A d D b B ; 01001110 B
-	punpcklbw mm5, mm6		; d c D C b a B A
-	pshufw mm5, mm5, 0d8h  	; d c b a D C B A ; 11011000 B: mm5
+    ; to handle mm1, mm2, mm3, mm4
+    pshufw mm5, mm1, 0d8h   ; d D b B c C a A ; 11011000 B
+    pshufw mm6, mm5, 04eh   ; c C a A d D b B ; 01001110 B
+    punpcklbw mm5, mm6      ; d c D C b a B A
+    pshufw mm5, mm5, 0d8h   ; d c b a D C B A ; 11011000 B: mm5
 
-	pshufw mm6, mm2, 0d8h	; h H f F g G e E ; 11011000 B
-	pshufw mm7, mm6, 04eh	; g G e E h H f F ; 01001110 B
-	punpcklbw mm6, mm7		; h g H G f e F E
-	pshufw mm6, mm6, 0d8h  	; h g f e H G F E ; 11011000 B: mm6
+    pshufw mm6, mm2, 0d8h   ; h H f F g G e E ; 11011000 B
+    pshufw mm7, mm6, 04eh   ; g G e E h H f F ; 01001110 B
+    punpcklbw mm6, mm7      ; h g H G f e F E
+    pshufw mm6, mm6, 0d8h   ; h g f e H G F E ; 11011000 B: mm6
 
-	pshufw mm7, mm3, 0d8h	; l L j J k K i I ; 11011000 B
-	pshufw mm1, mm7, 04eh	; k K i I l L j J ; 01001110 B
-	punpcklbw mm7, mm1		; l k L K j i J I
-	pshufw mm7, mm7, 0d8h  	; l k j i L K J I ; 11011000 B: mm7
+    pshufw mm7, mm3, 0d8h   ; l L j J k K i I ; 11011000 B
+    pshufw mm1, mm7, 04eh   ; k K i I l L j J ; 01001110 B
+    punpcklbw mm7, mm1      ; l k L K j i J I
+    pshufw mm7, mm7, 0d8h   ; l k j i L K J I ; 11011000 B: mm7
 
-	pshufw mm1, mm4, 0d8h	; p P n N o O m M ; 11011000 B
-	pshufw mm2, mm1, 04eh	; o O m M p P n N ; 01001110 B
-	punpcklbw mm1, mm2 		; p o P O n m N M
-	pshufw mm1, mm1, 0d8h  	; p o n m P O N M ; 11011000 B: mm1
+    pshufw mm1, mm4, 0d8h   ; p P n N o O m M ; 11011000 B
+    pshufw mm2, mm1, 04eh   ; o O m M p P n N ; 01001110 B
+    punpcklbw mm1, mm2      ; p o P O n m N M
+    pshufw mm1, mm1, 0d8h   ; p o n m P O N M ; 11011000 B: mm1
 
-	; to handle mm5, mm6, mm7, mm1
-	movq mm2, mm5
-	punpckldq mm2, mm6 	; H G F E D C B A
-	punpckhdq mm5, mm6 	; h g f e d c b a
+    ; to handle mm5, mm6, mm7, mm1
+    movq mm2, mm5
+    punpckldq mm2, mm6  ; H G F E D C B A
+    punpckhdq mm5, mm6  ; h g f e d c b a
 
-	movq mm3, mm7
-	punpckldq mm3, mm1 	; P O N M L K J I
-	punpckhdq mm7, mm1 	; p o n m l k j i
+    movq mm3, mm7
+    punpckldq mm3, mm1  ; P O N M L K J I
+    punpckhdq mm7, mm1  ; p o n m l k j i
 
-	; avg within MB horizon width (16 x 2 lines)
-	pavgb mm2, mm5		; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
-	pavgb mm3, mm7		; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
-	pavgb mm2, mm3		; (temp_row1+temp_row2+1)>>1, done in another 2nd horizonal part
+    ; avg within MB horizon width (16 x 2 lines)
+    pavgb mm2, mm5      ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
+    pavgb mm3, mm7      ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
+    pavgb mm2, mm3      ; (temp_row1+temp_row2+1)>>1, done in another 2nd horizonal part
 
-	movq [edi  ], mm0
-	movq [edi+8], mm2
+    movq [edi  ], mm0
+    movq [edi+8], mm2
 
-	; next SMB
-	lea esi, [esi+32]
-	lea edi, [edi+16]
+    ; next SMB
+    lea esi, [esi+32]
+    lea edi, [edi+16]
 
-	dec eax
-	jg near .xloops
+    dec eax
+    jg near .xloops
 
-	; next line
-	lea esi, [esi+2*ecx]	; next end of lines
-	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
-	lea edi, [edi+edx]
-	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
+    ; next line
+    lea esi, [esi+2*ecx]    ; next end of lines
+    lea esi, [esi+2*ebx]    ; reset to base 0 [- 2 * iDstWidth]
+    lea edi, [edi+edx]
+    lea edi, [edi+ebx]      ; reset to base 0 [- iDstWidth]
 
-	dec ebp
-	jg near .yloops
+    dec ebp
+    jg near .yloops
 
-	WELSEMMS
-	pop ebp
-	pop	edi
-	pop esi
-	pop edx
-	pop ebx
-	ret
+    WELSEMMS
+    pop ebp
+    pop edi
+    pop esi
+    pop edx
+    pop ebx
+    ret
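
All of the DyadicBilinearDownsampler* variants in this file compute the same thing: each destination pixel is the average of a 2x2 source block, assembled from rounding pavgb steps (a horizontal pair average per row, then a vertical average). A scalar reference, noting that the composed rounding can differ by 1 from a single (a+b+c+d+2)>>2:

    /* pavgb semantics: average with rounding up. */
    static unsigned char avg_round(unsigned char x, unsigned char y) {
        return (unsigned char)((x + y + 1) >> 1);
    }

    /* Hedged scalar sketch of the dyadic bilinear downsamplers in this file. */
    static void dyadic_downsample_ref(unsigned char* dst, int dst_stride,
                                      const unsigned char* src, int src_stride,
                                      int src_width, int src_height) {
        for (int y = 0; y < src_height / 2; y++) {
            const unsigned char* s0 = src + 2 * y * src_stride;
            const unsigned char* s1 = s0 + src_stride;
            unsigned char* d = dst + y * dst_stride;
            for (int x = 0; x < src_width / 2; x++)
                d[x] = avg_round(avg_round(s0[2 * x], s0[2 * x + 1]),
                                 avg_round(s1[2 * x], s1[2 * x + 1]));
        }
    }
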
 
 ;***********************************************************************
-;	void DyadicBilinearDownsamplerWidthx16_sse( unsigned char* pDst, const int iDstStride,
-;					  unsigned char* pSrc, const int iSrcStride,
-;					  const int iSrcWidth, const int iSrcHeight );
+;   void DyadicBilinearDownsamplerWidthx16_sse( unsigned char* pDst, const int iDstStride,
+;                     unsigned char* pSrc, const int iSrcStride,
+;                     const int iSrcWidth, const int iSrcHeight );
 ;***********************************************************************
 WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
+    push ebx
+    push edx
+    push esi
+    push edi
+    push ebp
 
-	mov edi, [esp+24]	; pDst
-	mov edx, [esp+28]	; iDstStride
-	mov esi, [esp+32]	; pSrc
-	mov ecx, [esp+36]	; iSrcStride
-	mov ebp, [esp+44]	; iSrcHeight
+    mov edi, [esp+24]   ; pDst
+    mov edx, [esp+28]   ; iDstStride
+    mov esi, [esp+32]   ; pSrc
+    mov ecx, [esp+36]   ; iSrcStride
+    mov ebp, [esp+44]   ; iSrcHeight
 
-	sar ebp, $01		; iSrcHeight >> 1
+    sar ebp, $01        ; iSrcHeight >> 1
 
 .yloops:
-	mov eax, [esp+40]	; iSrcWidth
-	sar eax, $01		; iSrcWidth >> 1
-	mov ebx, eax		; iDstWidth restored at ebx
-	sar eax, $03		; (iSrcWidth >> 1) / 8		; loop count = num_of_mb
-	neg ebx			; - (iSrcWidth >> 1)
-	; each loop = source bandwidth: 16 bytes
+    mov eax, [esp+40]   ; iSrcWidth
+    sar eax, $01        ; iSrcWidth >> 1
+    mov ebx, eax        ; iDstWidth restored at ebx
+    sar eax, $03        ; (iSrcWidth >> 1) / 8      ; loop count = num_of_mb
+    neg ebx         ; - (iSrcWidth >> 1)
+    ; each loop = source bandwidth: 16 bytes
 .xloops:
-	; 1st part horizonal loop: x16 bytes
-	;               mem  hi<-       ->lo
-	;1st Line Src:	mm0: d D c C b B a A	mm1: h H g G f F e E
-	;2nd Line Src:	mm2: l L k K j J i I   	mm3: p P o O n N m M
-	;=> target:
-	;: H G F E D C B A, P O N M L K J I
-	;: h g f e d c b a, p o n m l k j i
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-	movq mm0, [esi]			; 1st pSrc line
-	movq mm1, [esi+8]		; 1st pSrc line + 8
-	movq mm2, [esi+ecx]		; 2nd pSrc line
-	movq mm3, [esi+ecx+8]	; 2nd pSrc line + 8
+    ; 1st part horizonal loop: x16 bytes
+    ;               mem  hi<-       ->lo
+    ;1st Line Src:  mm0: d D c C b B a A    mm1: h H g G f F e E
+    ;2nd Line Src:  mm2: l L k K j J i I    mm3: p P o O n N m M
+    ;=> target:
+    ;: H G F E D C B A, P O N M L K J I
+    ;: h g f e d c b a, p o n m l k j i
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+    movq mm0, [esi]         ; 1st pSrc line
+    movq mm1, [esi+8]       ; 1st pSrc line + 8
+    movq mm2, [esi+ecx]     ; 2nd pSrc line
+    movq mm3, [esi+ecx+8]   ; 2nd pSrc line + 8
 
-	; to handle mm0, mm1, mm2, mm3
-	pshufw mm4, mm0, 0d8h	; d D b B c C a A ; 11011000 B
-	pshufw mm5, mm4, 04eh	; c C a A d D b B ; 01001110 B
-	punpcklbw mm4, mm5		; d c D C b a B A
-	pshufw mm4, mm4, 0d8h  	; d c b a D C B A ; 11011000 B: mm4
+    ; to handle mm0, mm1, mm2, mm3
+    pshufw mm4, mm0, 0d8h   ; d D b B c C a A ; 11011000 B
+    pshufw mm5, mm4, 04eh   ; c C a A d D b B ; 01001110 B
+    punpcklbw mm4, mm5      ; d c D C b a B A
+    pshufw mm4, mm4, 0d8h   ; d c b a D C B A ; 11011000 B: mm4
 
-	pshufw mm5, mm1, 0d8h	; h H f F g G e E ; 11011000 B
-	pshufw mm6, mm5, 04eh	; g G e E h H f F ; 01001110 B
-	punpcklbw mm5, mm6		; h g H G f e F E
-	pshufw mm5, mm5, 0d8h  	; h g f e H G F E ; 11011000 B: mm5
+    pshufw mm5, mm1, 0d8h   ; h H f F g G e E ; 11011000 B
+    pshufw mm6, mm5, 04eh   ; g G e E h H f F ; 01001110 B
+    punpcklbw mm5, mm6      ; h g H G f e F E
+    pshufw mm5, mm5, 0d8h   ; h g f e H G F E ; 11011000 B: mm5
 
-	pshufw mm6, mm2, 0d8h	; l L j J k K i I ; 11011000 B
-	pshufw mm7, mm6, 04eh	; k K i I l L j J ; 01001110 B
-	punpcklbw mm6, mm7		; l k L K j i J I
-	pshufw mm6, mm6, 0d8h  	; l k j i L K J I ; 11011000 B: mm6
+    pshufw mm6, mm2, 0d8h   ; l L j J k K i I ; 11011000 B
+    pshufw mm7, mm6, 04eh   ; k K i I l L j J ; 01001110 B
+    punpcklbw mm6, mm7      ; l k L K j i J I
+    pshufw mm6, mm6, 0d8h   ; l k j i L K J I ; 11011000 B: mm6
 
-	pshufw mm7, mm3, 0d8h	; p P n N o O m M ; 11011000 B
-	pshufw mm0, mm7, 04eh	; o O m M p P n N ; 01001110 B
-	punpcklbw mm7, mm0 		; p o P O n m N M
-	pshufw mm7, mm7, 0d8h  	; p o n m P O N M ; 11011000 B: mm7
+    pshufw mm7, mm3, 0d8h   ; p P n N o O m M ; 11011000 B
+    pshufw mm0, mm7, 04eh   ; o O m M p P n N ; 01001110 B
+    punpcklbw mm7, mm0      ; p o P O n m N M
+    pshufw mm7, mm7, 0d8h   ; p o n m P O N M ; 11011000 B: mm7
 
-	; to handle mm4, mm5, mm6, mm7
-	movq mm0, mm4		;
-	punpckldq mm0, mm5 	; H G F E D C B A
-	punpckhdq mm4, mm5 	; h g f e d c b a
+    ; to handle mm4, mm5, mm6, mm7
+    movq mm0, mm4       ;
+    punpckldq mm0, mm5  ; H G F E D C B A
+    punpckhdq mm4, mm5  ; h g f e d c b a
 
-	movq mm1, mm6
-	punpckldq mm1, mm7 	; P O N M L K J I
-	punpckhdq mm6, mm7 	; p o n m l k j i
+    movq mm1, mm6
+    punpckldq mm1, mm7  ; P O N M L K J I
+    punpckhdq mm6, mm7  ; p o n m l k j i
 
-	; avg within MB horizon width (16 x 2 lines)
-	pavgb mm0, mm4		; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
-	pavgb mm1, mm6		; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
-	pavgb mm0, mm1		; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once
+    ; avg within MB horizon width (16 x 2 lines)
+    pavgb mm0, mm4      ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
+    pavgb mm1, mm6      ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
+    pavgb mm0, mm1      ; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once
 
-	movq [edi  ], mm0
+    movq [edi  ], mm0
 
-	; next SMB
-	lea esi, [esi+16]
-	lea edi, [edi+8]
+    ; next SMB
+    lea esi, [esi+16]
+    lea edi, [edi+8]
 
-	dec eax
-	jg near .xloops
+    dec eax
+    jg near .xloops
 
-	; next line
-	lea esi, [esi+2*ecx]	; next end of lines
-	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
-	lea edi, [edi+edx]
-	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
+    ; next line
+    lea esi, [esi+2*ecx]    ; next end of lines
+    lea esi, [esi+2*ebx]    ; reset to base 0 [- 2 * iDstWidth]
+    lea edi, [edi+edx]
+    lea edi, [edi+ebx]      ; reset to base 0 [- iDstWidth]
 
-	dec ebp
-	jg near .yloops
+    dec ebp
+    jg near .yloops
 
-	WELSEMMS
-	pop ebp
-	pop edi
-	pop esi
-	pop edx
-	pop ebx
-	ret
+    WELSEMMS
+    pop ebp
+    pop edi
+    pop esi
+    pop edx
+    pop ebx
+    ret
 
 ;***********************************************************************
-;	void DyadicBilinearDownsamplerWidthx8_sse( unsigned char* pDst, const int iDstStride,
-;					  unsigned char* pSrc, const int iSrcStride,
-;					  const int iSrcWidth, const int iSrcHeight );
+;   void DyadicBilinearDownsamplerWidthx8_sse( unsigned char* pDst, const int iDstStride,
+;                     unsigned char* pSrc, const int iSrcStride,
+;                     const int iSrcWidth, const int iSrcHeight );
 ;***********************************************************************
 WELS_EXTERN DyadicBilinearDownsamplerWidthx8_sse
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
+    push ebx
+    push edx
+    push esi
+    push edi
+    push ebp
 
-	mov edi, [esp+24]	; pDst
-	mov edx, [esp+28]	; iDstStride
-	mov esi, [esp+32]	; pSrc
-	mov ecx, [esp+36]	; iSrcStride
-	mov ebp, [esp+44]	; iSrcHeight
+    mov edi, [esp+24]   ; pDst
+    mov edx, [esp+28]   ; iDstStride
+    mov esi, [esp+32]   ; pSrc
+    mov ecx, [esp+36]   ; iSrcStride
+    mov ebp, [esp+44]   ; iSrcHeight
 
-	sar ebp, $01		; iSrcHeight >> 1
+    sar ebp, $01        ; iSrcHeight >> 1
 
 .yloops:
-	mov eax, [esp+40]	; iSrcWidth
-	sar eax, $01		; iSrcWidth >> 1
-	mov ebx, eax		; iDstWidth restored at ebx
-	sar eax, $02		; (iSrcWidth >> 1) / 4		; loop count = num_of_mb
-	neg ebx			; - (iSrcWidth >> 1)
-	; each loop = source bandwidth: 8 bytes
+    mov eax, [esp+40]   ; iSrcWidth
+    sar eax, $01        ; iSrcWidth >> 1
+    mov ebx, eax        ; iDstWidth restored at ebx
+    sar eax, $02        ; (iSrcWidth >> 1) / 4      ; loop count = num_of_mb
+    neg ebx         ; - (iSrcWidth >> 1)
+    ; each loop = source bandwidth: 8 bytes
 .xloops:
-	; 1st part horizonal loop: x8 bytes
-	;               mem  hi<-       ->lo
-	;1st Line Src:	mm0: d D c C b B a A
-	;2nd Line Src:	mm1: h H g G f F e E
-	;=> target:
-	;: H G F E D C B A
-	;: h g f e d c b a
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-	movq mm0, [esi]			; 1st pSrc line
-	movq mm1, [esi+ecx]		; 2nd pSrc line
+    ; 1st part horizonal loop: x8 bytes
+    ;               mem  hi<-       ->lo
+    ;1st Line Src:  mm0: d D c C b B a A
+    ;2nd Line Src:  mm1: h H g G f F e E
+    ;=> target:
+    ;: H G F E D C B A
+    ;: h g f e d c b a
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+    movq mm0, [esi]         ; 1st pSrc line
+    movq mm1, [esi+ecx]     ; 2nd pSrc line
 
-	; to handle mm0, mm1, mm2, mm3
-	pshufw mm2, mm0, 0d8h	; d D b B c C a A ; 11011000 B
-	pshufw mm3, mm2, 04eh	; c C a A d D b B ; 01001110 B
-	punpcklbw mm2, mm3		; d c D C b a B A
-	pshufw mm2, mm2, 0d8h  	; d c b a D C B A ; 11011000 B: mm4
+    ; to handle mm0, mm1, mm2, mm3
+    pshufw mm2, mm0, 0d8h   ; d D b B c C a A ; 11011000 B
+    pshufw mm3, mm2, 04eh   ; c C a A d D b B ; 01001110 B
+    punpcklbw mm2, mm3      ; d c D C b a B A
+    pshufw mm2, mm2, 0d8h   ; d c b a D C B A ; 11011000 B: mm4
 
-	pshufw mm4, mm1, 0d8h	; h H f F g G e E ; 11011000 B
-	pshufw mm5, mm4, 04eh	; g G e E h H f F ; 01001110 B
-	punpcklbw mm4, mm5		; h g H G f e F E
-	pshufw mm4, mm4, 0d8h  	; h g f e H G F E ; 11011000 B: mm5
+    pshufw mm4, mm1, 0d8h   ; h H f F g G e E ; 11011000 B
+    pshufw mm5, mm4, 04eh   ; g G e E h H f F ; 01001110 B
+    punpcklbw mm4, mm5      ; h g H G f e F E
+    pshufw mm4, mm4, 0d8h   ; h g f e H G F E ; 11011000 B: mm5
 
-	; to handle mm2, mm4
-	movq mm0, mm2		;
-	punpckldq mm0, mm4 	; H G F E D C B A
-	punpckhdq mm2, mm4 	; h g f e d c b a
+    ; to handle mm2, mm4
+    movq mm0, mm2       ;
+    punpckldq mm0, mm4  ; H G F E D C B A
+    punpckhdq mm2, mm4  ; h g f e d c b a
 
-	; avg within MB horizon width (16 x 2 lines)
-	pavgb mm0, mm2		; (H+h+1)>>1, .., (A+a+1)>>1, temp_row1, 2
-	pshufw mm1, mm0, 04eh	; 01001110 B
-	pavgb mm0, mm1		; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once
+    ; avg within MB horizon width (16 x 2 lines)
+    pavgb mm0, mm2      ; (H+h+1)>>1, .., (A+a+1)>>1, temp_row1, 2
+    pshufw mm1, mm0, 04eh   ; 01001110 B
+    pavgb mm0, mm1      ; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once
 
-	movd [edi],	mm0
+    movd [edi], mm0
 
-	; next unit
-	lea esi, [esi+8]
-	lea edi, [edi+4]
+    ; next unit
+    lea esi, [esi+8]
+    lea edi, [edi+4]
 
-	dec eax
-	jg near .xloops
+    dec eax
+    jg near .xloops
 
-	; next line
-	lea esi, [esi+2*ecx]	; next end of lines
-	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
-	lea edi, [edi+edx]
-	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
+    ; next line
+    lea esi, [esi+2*ecx]    ; next end of lines
+    lea esi, [esi+2*ebx]    ; reset to base 0 [- 2 * iDstWidth]
+    lea edi, [edi+edx]
+    lea edi, [edi+ebx]      ; reset to base 0 [- iDstWidth]
 
-	dec ebp
-	jg near .yloops
+    dec ebp
+    jg near .yloops
 
-	WELSEMMS
-	pop ebp
-	pop edi
-	pop esi
-	pop edx
-	pop ebx
-	ret
+    WELSEMMS
+    pop ebp
+    pop edi
+    pop esi
+    pop edx
+    pop ebx
+    ret
 
 
 
 ; got about 50% improvement over DyadicBilinearDownsamplerWidthx32_sse
 ;***********************************************************************
-;	void DyadicBilinearDownsamplerWidthx32_ssse3(	unsigned char* pDst, const int iDstStride,
-;					unsigned char* pSrc, const int iSrcStride,
-;					const int iSrcWidth, const int iSrcHeight );
+;   void DyadicBilinearDownsamplerWidthx32_ssse3(   unsigned char* pDst, const int iDstStride,
+;                   unsigned char* pSrc, const int iSrcStride,
+;                   const int iSrcWidth, const int iSrcHeight );
 ;***********************************************************************
 WELS_EXTERN DyadicBilinearDownsamplerWidthx32_ssse3
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
+    push ebx
+    push edx
+    push esi
+    push edi
+    push ebp
 
-	mov edi, [esp+24]	; pDst
-	mov edx, [esp+28]	; iDstStride
-	mov esi, [esp+32]	; pSrc
-	mov ecx, [esp+36]	; iSrcStride
-	mov ebp, [esp+44]	; iSrcHeight
+    mov edi, [esp+24]   ; pDst
+    mov edx, [esp+28]   ; iDstStride
+    mov esi, [esp+32]   ; pSrc
+    mov ecx, [esp+36]   ; iSrcStride
+    mov ebp, [esp+44]   ; iSrcHeight
 
-	sar ebp, $01			; iSrcHeight >> 1
+    sar ebp, $01            ; iSrcHeight >> 1
 
-	movdqa xmm7, [shufb_mask_low]	; mask low
-	movdqa xmm6, [shufb_mask_high]	; mask high
+    movdqa xmm7, [shufb_mask_low]   ; mask low
+    movdqa xmm6, [shufb_mask_high]  ; mask high
 
 .yloops:
-	mov eax, [esp+40]	; iSrcWidth
-	sar eax, $01			; iSrcWidth >> 1
-	mov ebx, eax		; iDstWidth restored at ebx
-	sar eax, $04			; (iSrcWidth >> 1) / 16		; loop count = num_of_mb
-	neg ebx				; - (iSrcWidth >> 1)
-	; each loop = source bandwidth: 32 bytes
+    mov eax, [esp+40]   ; iSrcWidth
+    sar eax, $01            ; iSrcWidth >> 1
+    mov ebx, eax        ; iDstWidth restored at ebx
+    sar eax, $04            ; (iSrcWidth >> 1) / 16     ; loop count = num_of_mb
+    neg ebx             ; - (iSrcWidth >> 1)
+    ; each loop = source bandwidth: 32 bytes
 .xloops:
-	; 1st part horizonal loop: x16 bytes
-	;               mem  hi<-       ->lo
-	;1st Line Src:	xmm0: h H g G f F e E d D c C b B a A
-	;				xmm1: p P o O n N m M l L k K j J i I
-	;2nd Line Src:	xmm2: h H g G f F e E d D c C b B a A
-	;				xmm3: p P o O n N m M l L k K j J i I
-	;=> target:
-	;: P O N M L K J I H G F E D C B A
-	;: p o n m l k j i h g f e d c b a
-	;: P ..                          A
-	;: p ..                          a
+    ; 1st part horizonal loop: x16 bytes
+    ;               mem  hi<-       ->lo
+    ;1st Line Src:  xmm0: h H g G f F e E d D c C b B a A
+    ;               xmm1: p P o O n N m M l L k K j J i I
+    ;2nd Line Src:  xmm2: h H g G f F e E d D c C b B a A
+    ;               xmm3: p P o O n N m M l L k K j J i I
+    ;=> target:
+    ;: P O N M L K J I H G F E D C B A
+    ;: p o n m l k j i h g f e d c b a
+    ;: P ..                          A
+    ;: p ..                          a
 
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-	movdqa xmm0, [esi]			; 1st_src_line
-	movdqa xmm1, [esi+16]		; 1st_src_line + 16
-	movdqa xmm2, [esi+ecx]		; 2nd_src_line
-	movdqa xmm3, [esi+ecx+16]	; 2nd_src_line + 16
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+    movdqa xmm0, [esi]          ; 1st_src_line
+    movdqa xmm1, [esi+16]       ; 1st_src_line + 16
+    movdqa xmm2, [esi+ecx]      ; 2nd_src_line
+    movdqa xmm3, [esi+ecx+16]   ; 2nd_src_line + 16
 
-	; packing & avg
-	movdqa xmm4, xmm0			; h H g G f F e E d D c C b B a A
-	pshufb xmm0, xmm7			; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
-	pshufb xmm4, xmm6			; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-	; another implementation for xmm4 high bits
-;	psubb xmm4, xmm0			; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-;	psrlw xmm4, 8				; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-	pavgb xmm0, xmm4
+    ; packing & avg
+    movdqa xmm4, xmm0           ; h H g G f F e E d D c C b B a A
+    pshufb xmm0, xmm7           ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
+    pshufb xmm4, xmm6           ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+    ; another implementation for xmm4 high bits
+;   psubb xmm4, xmm0            ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
+;   psrlw xmm4, 8               ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+    pavgb xmm0, xmm4
 
-	movdqa xmm5, xmm1
-	pshufb xmm1, xmm7
-	pshufb xmm5, xmm6
-;	psubb xmm5, xmm1
-;	psrlw xmm5, 8
-	pavgb xmm1, xmm5
+    movdqa xmm5, xmm1
+    pshufb xmm1, xmm7
+    pshufb xmm5, xmm6
+;   psubb xmm5, xmm1
+;   psrlw xmm5, 8
+    pavgb xmm1, xmm5
 
-	movdqa xmm4, xmm2
-	pshufb xmm2, xmm7
-	pshufb xmm4, xmm6
-;	psubb xmm4, xmm2
-;	psrlw xmm4, 8
-	pavgb xmm2, xmm4
+    movdqa xmm4, xmm2
+    pshufb xmm2, xmm7
+    pshufb xmm4, xmm6
+;   psubb xmm4, xmm2
+;   psrlw xmm4, 8
+    pavgb xmm2, xmm4
 
-	movdqa xmm5, xmm3
-	pshufb xmm3, xmm7
-	pshufb xmm5, xmm6
-;	psubb xmm5, xmm3
-;	psrlw xmm5, 8
-	pavgb xmm3, xmm5
+    movdqa xmm5, xmm3
+    pshufb xmm3, xmm7
+    pshufb xmm5, xmm6
+;   psubb xmm5, xmm3
+;   psrlw xmm5, 8
+    pavgb xmm3, xmm5
 
-	packuswb xmm0, xmm1
-	packuswb xmm2, xmm3
-	pavgb xmm0, xmm2
+    packuswb xmm0, xmm1
+    packuswb xmm2, xmm3
+    pavgb xmm0, xmm2
 
-	; write pDst
-	movdqa [edi], xmm0
+    ; write pDst
+    movdqa [edi], xmm0
 
-	; next SMB
-	lea esi, [esi+32]
-	lea edi, [edi+16]
+    ; next SMB
+    lea esi, [esi+32]
+    lea edi, [edi+16]
 
-	dec eax
-	jg near .xloops
+    dec eax
+    jg near .xloops
 
-	; next line
-	lea esi, [esi+2*ecx]	; next end of lines
-	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
-	lea edi, [edi+edx]
-	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
+    ; next line
+    lea esi, [esi+2*ecx]    ; next end of lines
+    lea esi, [esi+2*ebx]    ; reset to base 0 [- 2 * iDstWidth]
+    lea edi, [edi+edx]
+    lea edi, [edi+ebx]      ; reset to base 0 [- iDstWidth]
 
-	dec ebp
-	jg near .yloops
+    dec ebp
+    jg near .yloops
 
-	pop ebp
-	pop	edi
-	pop esi
-	pop edx
-	pop ebx
-	ret
+    pop ebp
+    pop edi
+    pop esi
+    pop edx
+    pop ebx
+    ret
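
The SSSE3 and SSE4 variants reach the same result as the MMX code but separate even and odd source bytes with pshufb and the shufb_mask_low/shufb_mask_high tables instead of pshufw chains. Roughly, per 16-byte row (array indexing here only models the in-register shuffle):

    /* Hedged sketch of the pshufb step: even bytes via shufb_mask_low, odd bytes
     * via shufb_mask_high, each zero-extended into 16-bit lanes before averaging. */
    static void split_even_odd(const unsigned char row[16],
                               unsigned char even[8], unsigned char odd[8]) {
        for (int i = 0; i < 8; i++) {
            even[i] = row[2 * i];
            odd[i]  = row[2 * i + 1];
        }
    }
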
 
 ;***********************************************************************
-;	void DyadicBilinearDownsamplerWidthx16_ssse3( unsigned char* pDst, const int iDstStride,
-;					  unsigned char* pSrc, const int iSrcStride,
-;					  const int iSrcWidth, const int iSrcHeight );
+;   void DyadicBilinearDownsamplerWidthx16_ssse3( unsigned char* pDst, const int iDstStride,
+;                     unsigned char* pSrc, const int iSrcStride,
+;                     const int iSrcWidth, const int iSrcHeight );
 ;***********************************************************************
 WELS_EXTERN DyadicBilinearDownsamplerWidthx16_ssse3
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
+    push ebx
+    push edx
+    push esi
+    push edi
+    push ebp
 
-	mov edi, [esp+24]	; pDst
-	mov edx, [esp+28]	; iDstStride
-	mov esi, [esp+32]	; pSrc
-	mov ecx, [esp+36]	; iSrcStride
-	mov ebp, [esp+44]	; iSrcHeight
+    mov edi, [esp+24]   ; pDst
+    mov edx, [esp+28]   ; iDstStride
+    mov esi, [esp+32]   ; pSrc
+    mov ecx, [esp+36]   ; iSrcStride
+    mov ebp, [esp+44]   ; iSrcHeight
 
-	sar ebp, $01		; iSrcHeight >> 1
-	movdqa xmm7, [shufb_mask_low]	; mask low
-	movdqa xmm6, [shufb_mask_high]	; mask high
+    sar ebp, $01        ; iSrcHeight >> 1
+    movdqa xmm7, [shufb_mask_low]   ; mask low
+    movdqa xmm6, [shufb_mask_high]  ; mask high
 
 .yloops:
-	mov eax, [esp+40]	; iSrcWidth
-	sar eax, $01		; iSrcWidth >> 1
-	mov ebx, eax		; iDstWidth restored at ebx
-	sar eax, $03		; (iSrcWidth >> 1) / 8		; loop count = num_of_mb
-	neg ebx			; - (iSrcWidth >> 1)
-	; each loop = source bandwidth: 16 bytes
+    mov eax, [esp+40]   ; iSrcWidth
+    sar eax, $01        ; iSrcWidth >> 1
+    mov ebx, eax        ; iDstWidth restored at ebx
+    sar eax, $03        ; (iSrcWidth >> 1) / 8      ; loop count = num_of_mb
+    neg ebx         ; - (iSrcWidth >> 1)
+    ; each loop = source bandwidth: 16 bytes
 .xloops:
-	; horizonal loop: x16 bytes by source
-	;               mem  hi<-       ->lo
-	;1st line pSrc:	xmm0: h H g G f F e E d D c C b B a A
-	;2nd line pSrc:  xmm1: p P o O n N m M l L k K j J i I
-	;=> target:
-	;: H G F E D C B A, P O N M L K J I
-	;: h g f e d c b a, p o n m l k j i
+    ; horizonal loop: x16 bytes by source
+    ;               mem  hi<-       ->lo
+    ;1st line pSrc: xmm0: h H g G f F e E d D c C b B a A
+    ;2nd line pSrc:  xmm1: p P o O n N m M l L k K j J i I
+    ;=> target:
+    ;: H G F E D C B A, P O N M L K J I
+    ;: h g f e d c b a, p o n m l k j i
 
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-	movdqa xmm0, [esi]			; 1st_src_line
-	movdqa xmm1, [esi+ecx]		; 2nd_src_line
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+    movdqa xmm0, [esi]          ; 1st_src_line
+    movdqa xmm1, [esi+ecx]      ; 2nd_src_line
 
-	; packing & avg
-	movdqa xmm2, xmm0			; h H g G f F e E d D c C b B a A
-	pshufb xmm0, xmm7			; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
-	pshufb xmm2, xmm6			; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-	; another implementation for xmm2 high bits
-;	psubb xmm2, xmm0			; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-;	psrlw xmm2, 8				; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-	pavgb xmm0, xmm2
+    ; packing & avg
+    movdqa xmm2, xmm0           ; h H g G f F e E d D c C b B a A
+    pshufb xmm0, xmm7           ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
+    pshufb xmm2, xmm6           ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+    ; another implementation for xmm2 high bits
+;   psubb xmm2, xmm0            ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
+;   psrlw xmm2, 8               ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+    pavgb xmm0, xmm2
 
-	movdqa xmm3, xmm1
-	pshufb xmm1, xmm7
-	pshufb xmm3, xmm6
-;	psubb xmm3, xmm1
-;	psrlw xmm3, 8
-	pavgb xmm1, xmm3
+    movdqa xmm3, xmm1
+    pshufb xmm1, xmm7
+    pshufb xmm3, xmm6
+;   psubb xmm3, xmm1
+;   psrlw xmm3, 8
+    pavgb xmm1, xmm3
 
-	pavgb xmm0, xmm1
-	packuswb xmm0, xmm1
+    pavgb xmm0, xmm1
+    packuswb xmm0, xmm1
 
-	; write pDst
-	movq [edi], xmm0
+    ; write pDst
+    movq [edi], xmm0
 
-	; next SMB
-	lea esi, [esi+16]
-	lea edi, [edi+8]
+    ; next SMB
+    lea esi, [esi+16]
+    lea edi, [edi+8]
 
-	dec eax
-	jg near .xloops
+    dec eax
+    jg near .xloops
 
-	; next line
-	lea esi, [esi+2*ecx]	; next end of lines
-	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
-	lea edi, [edi+edx]
-	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
+    ; next line
+    lea esi, [esi+2*ecx]    ; next end of lines
+    lea esi, [esi+2*ebx]    ; reset to base 0 [- 2 * iDstWidth]
+    lea edi, [edi+edx]
+    lea edi, [edi+ebx]      ; reset to base 0 [- iDstWidth]
 
-	dec ebp
-	jg near .yloops
+    dec ebp
+    jg near .yloops
 
-	pop ebp
-	pop edi
-	pop esi
-	pop edx
-	pop ebx
-	ret
+    pop ebp
+    pop edi
+    pop esi
+    pop edx
+    pop ebx
+    ret
 
 ; got about 65% improvement over DyadicBilinearDownsamplerWidthx32_sse
 ;***********************************************************************
-;	void DyadicBilinearDownsamplerWidthx32_sse4(	unsigned char* pDst, const int iDstStride,
-;					unsigned char* pSrc, const int iSrcStride,
-;					const int iSrcWidth, const int iSrcHeight );
+;   void DyadicBilinearDownsamplerWidthx32_sse4(    unsigned char* pDst, const int iDstStride,
+;                   unsigned char* pSrc, const int iSrcStride,
+;                   const int iSrcWidth, const int iSrcHeight );
 ;***********************************************************************
 WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse4
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
+    push ebx
+    push edx
+    push esi
+    push edi
+    push ebp
 
-	mov edi, [esp+24]	; pDst
-	mov edx, [esp+28]	; iDstStride
-	mov esi, [esp+32]	; pSrc
-	mov ecx, [esp+36]	; iSrcStride
-	mov ebp, [esp+44]	; iSrcHeight
+    mov edi, [esp+24]   ; pDst
+    mov edx, [esp+28]   ; iDstStride
+    mov esi, [esp+32]   ; pSrc
+    mov ecx, [esp+36]   ; iSrcStride
+    mov ebp, [esp+44]   ; iSrcHeight
 
-	sar ebp, $01			; iSrcHeight >> 1
+    sar ebp, $01            ; iSrcHeight >> 1
 
-	movdqa xmm7, [shufb_mask_low]	; mask low
-	movdqa xmm6, [shufb_mask_high]	; mask high
+    movdqa xmm7, [shufb_mask_low]   ; mask low
+    movdqa xmm6, [shufb_mask_high]  ; mask high
 
 .yloops:
-	mov eax, [esp+40]	; iSrcWidth
-	sar eax, $01			; iSrcWidth >> 1
-	mov ebx, eax		; iDstWidth restored at ebx
-	sar eax, $04			; (iSrcWidth >> 1) / 16		; loop count = num_of_mb
-	neg ebx				; - (iSrcWidth >> 1)
-	; each loop = source bandwidth: 32 bytes
+    mov eax, [esp+40]   ; iSrcWidth
+    sar eax, $01            ; iSrcWidth >> 1
+    mov ebx, eax        ; iDstWidth restored at ebx
+    sar eax, $04            ; (iSrcWidth >> 1) / 16     ; loop count = num_of_mb
+    neg ebx             ; - (iSrcWidth >> 1)
+    ; each loop = source bandwidth: 32 bytes
 .xloops:
-	; 1st part horizonal loop: x16 bytes
-	;               mem  hi<-       ->lo
-	;1st Line Src:	xmm0: h H g G f F e E d D c C b B a A
-	;				xmm1: p P o O n N m M l L k K j J i I
-	;2nd Line Src:	xmm2: h H g G f F e E d D c C b B a A
-	;				xmm3: p P o O n N m M l L k K j J i I
-	;=> target:
-	;: P O N M L K J I H G F E D C B A
-	;: p o n m l k j i h g f e d c b a
-	;: P ..                          A
-	;: p ..                          a
+    ; 1st part horizonal loop: x16 bytes
+    ;               mem  hi<-       ->lo
+    ;1st Line Src:  xmm0: h H g G f F e E d D c C b B a A
+    ;               xmm1: p P o O n N m M l L k K j J i I
+    ;2nd Line Src:  xmm2: h H g G f F e E d D c C b B a A
+    ;               xmm3: p P o O n N m M l L k K j J i I
+    ;=> target:
+    ;: P O N M L K J I H G F E D C B A
+    ;: p o n m l k j i h g f e d c b a
+    ;: P ..                          A
+    ;: p ..                          a
 
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-	movntdqa xmm0, [esi]			; 1st_src_line
-	movntdqa xmm1, [esi+16]		; 1st_src_line + 16
-	movntdqa xmm2, [esi+ecx]		; 2nd_src_line
-	movntdqa xmm3, [esi+ecx+16]	; 2nd_src_line + 16
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+    movntdqa xmm0, [esi]            ; 1st_src_line
+    movntdqa xmm1, [esi+16]     ; 1st_src_line + 16
+    movntdqa xmm2, [esi+ecx]        ; 2nd_src_line
+    movntdqa xmm3, [esi+ecx+16] ; 2nd_src_line + 16
 
-	; packing & avg
-	movdqa xmm4, xmm0			; h H g G f F e E d D c C b B a A
-	pshufb xmm0, xmm7			; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
-	pshufb xmm4, xmm6			; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-;	psubb xmm4, xmm0			; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-;	psrlw xmm4, 8				; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-	pavgb xmm0, xmm4
+    ; packing & avg
+    movdqa xmm4, xmm0           ; h H g G f F e E d D c C b B a A
+    pshufb xmm0, xmm7           ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
+    pshufb xmm4, xmm6           ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+;   psubb xmm4, xmm0            ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
+;   psrlw xmm4, 8               ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+    pavgb xmm0, xmm4
 
-	movdqa xmm5, xmm1
-	pshufb xmm1, xmm7
-	pshufb xmm5, xmm6
-;	psubb xmm5, xmm1
-;	psrlw xmm5, 8
-	pavgb xmm1, xmm5
+    movdqa xmm5, xmm1
+    pshufb xmm1, xmm7
+    pshufb xmm5, xmm6
+;   psubb xmm5, xmm1
+;   psrlw xmm5, 8
+    pavgb xmm1, xmm5
 
-	movdqa xmm4, xmm2
-	pshufb xmm2, xmm7
-	pshufb xmm4, xmm6
-;	psubb xmm4, xmm2
-;	psrlw xmm4, 8
-	pavgb xmm2, xmm4
+    movdqa xmm4, xmm2
+    pshufb xmm2, xmm7
+    pshufb xmm4, xmm6
+;   psubb xmm4, xmm2
+;   psrlw xmm4, 8
+    pavgb xmm2, xmm4
 
-	movdqa xmm5, xmm3
-	pshufb xmm3, xmm7
-	pshufb xmm5, xmm6
-;	psubb xmm5, xmm3
-;	psrlw xmm5, 8
-	pavgb xmm3, xmm5
+    movdqa xmm5, xmm3
+    pshufb xmm3, xmm7
+    pshufb xmm5, xmm6
+;   psubb xmm5, xmm3
+;   psrlw xmm5, 8
+    pavgb xmm3, xmm5
 
-	packuswb xmm0, xmm1
-	packuswb xmm2, xmm3
-	pavgb xmm0, xmm2
+    packuswb xmm0, xmm1
+    packuswb xmm2, xmm3
+    pavgb xmm0, xmm2
 
-	; write pDst
-	movdqa [edi], xmm0
+    ; write pDst
+    movdqa [edi], xmm0
 
-	; next SMB
-	lea esi, [esi+32]
-	lea edi, [edi+16]
+    ; next SMB
+    lea esi, [esi+32]
+    lea edi, [edi+16]
 
-	dec eax
-	jg near .xloops
+    dec eax
+    jg near .xloops
 
-	; next line
-	lea esi, [esi+2*ecx]	; next end of lines
-	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
-	lea edi, [edi+edx]
-	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
+    ; next line
+    lea esi, [esi+2*ecx]    ; next end of lines
+    lea esi, [esi+2*ebx]    ; reset to base 0 [- 2 * iDstWidth]
+    lea edi, [edi+edx]
+    lea edi, [edi+ebx]      ; reset to base 0 [- iDstWidth]
 
-	dec ebp
-	jg near .yloops
+    dec ebp
+    jg near .yloops
 
-	pop ebp
-	pop	edi
-	pop esi
-	pop edx
-	pop ebx
-	ret
+    pop ebp
+    pop edi
+    pop esi
+    pop edx
+    pop ebx
+    ret
 
 ;***********************************************************************
-;	void DyadicBilinearDownsamplerWidthx16_sse4( unsigned char* pDst, const int iDstStride,
-;					  unsigned char* pSrc, const int iSrcStride,
-;					  const int iSrcWidth, const int iSrcHeight );
+;   void DyadicBilinearDownsamplerWidthx16_sse4( unsigned char* pDst, const int iDstStride,
+;                     unsigned char* pSrc, const int iSrcStride,
+;                     const int iSrcWidth, const int iSrcHeight );
 ;***********************************************************************
 WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse4
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
+    push ebx
+    push edx
+    push esi
+    push edi
+    push ebp
 
-	mov edi, [esp+24]	; pDst
-	mov edx, [esp+28]	; iDstStride
-	mov esi, [esp+32]	; pSrc
-	mov ecx, [esp+36]	; iSrcStride
-	mov ebp, [esp+44]	; iSrcHeight
+    mov edi, [esp+24]   ; pDst
+    mov edx, [esp+28]   ; iDstStride
+    mov esi, [esp+32]   ; pSrc
+    mov ecx, [esp+36]   ; iSrcStride
+    mov ebp, [esp+44]   ; iSrcHeight
 
-	sar ebp, $01		; iSrcHeight >> 1
-	movdqa xmm7, [shufb_mask_low]	; mask low
-	movdqa xmm6, [shufb_mask_high]	; mask high
+    sar ebp, $01        ; iSrcHeight >> 1
+    movdqa xmm7, [shufb_mask_low]   ; mask low
+    movdqa xmm6, [shufb_mask_high]  ; mask high
 
 .yloops:
-	mov eax, [esp+40]	; iSrcWidth
-	sar eax, $01		; iSrcWidth >> 1
-	mov ebx, eax		; iDstWidth restored at ebx
-	sar eax, $03		; (iSrcWidth >> 1) / 8		; loop count = num_of_mb
-	neg ebx			; - (iSrcWidth >> 1)
-	; each loop = source bandwidth: 16 bytes
+    mov eax, [esp+40]   ; iSrcWidth
+    sar eax, $01        ; iSrcWidth >> 1
+    mov ebx, eax        ; iDstWidth restored at ebx
+    sar eax, $03        ; (iSrcWidth >> 1) / 8      ; loop count = num_of_mb
+    neg ebx         ; - (iSrcWidth >> 1)
+    ; each loop = source bandwidth: 16 bytes
 .xloops:
-	; horizonal loop: x16 bytes by source
-	;               mem  hi<-       ->lo
-	;1st line pSrc:	xmm0: h H g G f F e E d D c C b B a A
-	;2nd line pSrc:  xmm1: p P o O n N m M l L k K j J i I
-	;=> target:
-	;: H G F E D C B A, P O N M L K J I
-	;: h g f e d c b a, p o n m l k j i
+    ; horizonal loop: x16 bytes by source
+    ;               mem  hi<-       ->lo
+    ;1st line pSrc: xmm0: h H g G f F e E d D c C b B a A
+    ;2nd line pSrc:  xmm1: p P o O n N m M l L k K j J i I
+    ;=> target:
+    ;: H G F E D C B A, P O N M L K J I
+    ;: h g f e d c b a, p o n m l k j i
 
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-	movntdqa xmm0, [esi]			; 1st_src_line
-	movntdqa xmm1, [esi+ecx]		; 2nd_src_line
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+    movntdqa xmm0, [esi]            ; 1st_src_line
+    movntdqa xmm1, [esi+ecx]        ; 2nd_src_line
 
-	; packing & avg
-	movdqa xmm2, xmm0			; h H g G f F e E d D c C b B a A
-	pshufb xmm0, xmm7			; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
-	pshufb xmm2, xmm6			; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-;	psubb xmm2, xmm0			; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-;	psrlw xmm2, 8				; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-	pavgb xmm0, xmm2
+    ; packing & avg
+    movdqa xmm2, xmm0           ; h H g G f F e E d D c C b B a A
+    pshufb xmm0, xmm7           ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
+    pshufb xmm2, xmm6           ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+;   psubb xmm2, xmm0            ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
+;   psrlw xmm2, 8               ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+    pavgb xmm0, xmm2
 
-	movdqa xmm3, xmm1
-	pshufb xmm1, xmm7
-	pshufb xmm3, xmm6
-;	psubb xmm3, xmm1
-;	psrlw xmm3, 8
-	pavgb xmm1, xmm3
+    movdqa xmm3, xmm1
+    pshufb xmm1, xmm7
+    pshufb xmm3, xmm6
+;   psubb xmm3, xmm1
+;   psrlw xmm3, 8
+    pavgb xmm1, xmm3
 
-	pavgb xmm0, xmm1
-	packuswb xmm0, xmm1
+    pavgb xmm0, xmm1
+    packuswb xmm0, xmm1
 
-	; write pDst
-	movq [edi], xmm0
+    ; write pDst
+    movq [edi], xmm0
 
-	; next SMB
-	lea esi, [esi+16]
-	lea edi, [edi+8]
+    ; next SMB
+    lea esi, [esi+16]
+    lea edi, [edi+8]
 
-	dec eax
-	jg near .xloops
+    dec eax
+    jg near .xloops
 
-	; next line
-	lea esi, [esi+2*ecx]	; next end of lines
-	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
-	lea edi, [edi+edx]
-	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
+    ; next line
+    lea esi, [esi+2*ecx]    ; next end of lines
+    lea esi, [esi+2*ebx]    ; reset to base 0 [- 2 * iDstWidth]
+    lea edi, [edi+edx]
+    lea edi, [edi+ebx]      ; reset to base 0 [- iDstWidth]
 
-	dec ebp
-	jg near .yloops
+    dec ebp
+    jg near .yloops
 
-	pop ebp
-	pop edi
-	pop esi
-	pop edx
-	pop ebx
-	ret
+    pop ebp
+    pop edi
+    pop esi
+    pop edx
+    pop ebx
+    ret
 
 
 
@@ -811,202 +811,202 @@
 
 ;**************************************************************************************************************
 ;int GeneralBilinearAccurateDownsampler_sse2(   unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
-;							unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,
+;                           unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,
 ;                           unsigned int uiScaleX, unsigned int uiScaleY );
 ;{
 ;**************************************************************************************************************
 
-WELS_EXTERN	GeneralBilinearAccurateDownsampler_sse2
-	push	ebp
-	push	esi
-	push	edi
-	push	ebx
-%define		pushsize	16
-%define		localsize	28
-%define		pDstData		esp + pushsize + localsize + 4
-%define		dwDstStride		esp + pushsize + localsize + 8
-%define		dwDstWidth		esp + pushsize + localsize + 12
-%define		dwDstHeight		esp + pushsize + localsize + 16
-%define		pSrcData		esp + pushsize + localsize + 20
-%define		dwSrcStride		esp + pushsize + localsize + 24
-%define		dwSrcWidth		esp + pushsize + localsize + 28
-%define		dwSrcHeight		esp + pushsize + localsize + 32
-%define		scale			esp + 0
-%define		uiScaleX			esp + pushsize + localsize + 36
-%define		uiScaleY			esp + pushsize + localsize + 40
-%define		tmpHeight		esp + 12
-%define		yInverse		esp + 16
-%define		xInverse		esp + 20
-%define		dstStep			esp + 24
-	sub		esp,			localsize
+WELS_EXTERN GeneralBilinearAccurateDownsampler_sse2
+    push    ebp
+    push    esi
+    push    edi
+    push    ebx
+%define     pushsize    16
+%define     localsize   28
+%define     pDstData        esp + pushsize + localsize + 4
+%define     dwDstStride     esp + pushsize + localsize + 8
+%define     dwDstWidth      esp + pushsize + localsize + 12
+%define     dwDstHeight     esp + pushsize + localsize + 16
+%define     pSrcData        esp + pushsize + localsize + 20
+%define     dwSrcStride     esp + pushsize + localsize + 24
+%define     dwSrcWidth      esp + pushsize + localsize + 28
+%define     dwSrcHeight     esp + pushsize + localsize + 32
+%define     scale           esp + 0
+%define     uiScaleX            esp + pushsize + localsize + 36
+%define     uiScaleY            esp + pushsize + localsize + 40
+%define     tmpHeight       esp + 12
+%define     yInverse        esp + 16
+%define     xInverse        esp + 20
+%define     dstStep         esp + 24
+    sub     esp,            localsize
 
-	pxor	xmm0,	xmm0
-	mov		edx,	32767
-	mov		eax,	[uiScaleX]
-	and		eax,	32767
-	mov		ebx,	eax
-	neg		ebx
-	and		ebx,	32767
-	movd	xmm1,		eax						; uinc(uiScaleX mod 32767)
-	movd	xmm2,		ebx						; -uinc
-	psllq	xmm1,		32
-	por		xmm1,		xmm2					; 0 0  uinc  -uinc   (dword)
-	pshufd	xmm7,		xmm1,	01000100b		; xmm7: uinc -uinc uinc -uinc
+    pxor    xmm0,   xmm0
+    mov     edx,    32767
+    mov     eax,    [uiScaleX]
+    and     eax,    32767
+    mov     ebx,    eax
+    neg     ebx
+    and     ebx,    32767
+    movd    xmm1,       eax                     ; uinc(uiScaleX mod 32767)
+    movd    xmm2,       ebx                     ; -uinc
+    psllq   xmm1,       32
+    por     xmm1,       xmm2                    ; 0 0  uinc  -uinc   (dword)
+    pshufd  xmm7,       xmm1,   01000100b       ; xmm7: uinc -uinc uinc -uinc
 
-	mov		eax,	[uiScaleY]
-	and		eax,	32767
-	mov		ebx,	eax
-	neg		ebx
-	and		ebx,	32767
-	movd	xmm6,		eax						; vinc(uiScaleY mod 32767)
-	movd	xmm2,		ebx						; -vinc
-	psllq	xmm6,		32
-	por		xmm6,		xmm2					; 0 0 vinc -vinc (dword)
-	pshufd	xmm6,		xmm6,	01010000b		; xmm6: vinc vinc -vinc -vinc
+    mov     eax,    [uiScaleY]
+    and     eax,    32767
+    mov     ebx,    eax
+    neg     ebx
+    and     ebx,    32767
+    movd    xmm6,       eax                     ; vinc(uiScaleY mod 32767)
+    movd    xmm2,       ebx                     ; -vinc
+    psllq   xmm6,       32
+    por     xmm6,       xmm2                    ; 0 0 vinc -vinc (dword)
+    pshufd  xmm6,       xmm6,   01010000b       ; xmm6: vinc vinc -vinc -vinc
 
-	mov		edx,		40003fffh
-	movd	xmm5,		edx
-	punpcklwd	xmm5,	xmm0					; 16384 16383
-	pshufd	xmm5,		xmm5,	01000100b		; xmm5: 16384 16383 16384 16383
+    mov     edx,        40003fffh
+    movd    xmm5,       edx
+    punpcklwd   xmm5,   xmm0                    ; 16384 16383
+    pshufd  xmm5,       xmm5,   01000100b       ; xmm5: 16384 16383 16384 16383
 
 
 DOWNSAMPLE:
 
-	mov		eax,			[dwDstHeight]
-	mov		edi,			[pDstData]
-	mov		edx,			[dwDstStride]
-	mov		ecx,			[dwDstWidth]
-	sub		edx,			ecx
-	mov		[dstStep],	edx				; stride - width
-	dec		eax
-	mov		[tmpHeight],	eax
-	mov		eax,			16384
-	mov		[yInverse],		eax
+    mov     eax,            [dwDstHeight]
+    mov     edi,            [pDstData]
+    mov     edx,            [dwDstStride]
+    mov     ecx,            [dwDstWidth]
+    sub     edx,            ecx
+    mov     [dstStep],  edx             ; stride - width
+    dec     eax
+    mov     [tmpHeight],    eax
+    mov     eax,            16384
+    mov     [yInverse],     eax
 
-	pshufd	xmm4,		xmm5,	01010000b	; initial v to 16384 16384 16383 16383
+    pshufd  xmm4,       xmm5,   01010000b   ; initial v to 16384 16384 16383 16383
 
 HEIGHT:
-	mov		eax,	[yInverse]
-	mov		esi,	[pSrcData]
-	shr		eax,	15
-	mul		dword [dwSrcStride]
-	add		esi,	eax					; get current row address
-	mov		ebp,	esi
-	add		ebp,	[dwSrcStride]
+    mov     eax,    [yInverse]
+    mov     esi,    [pSrcData]
+    shr     eax,    15
+    mul     dword [dwSrcStride]
+    add     esi,    eax                 ; get current row address
+    mov     ebp,    esi
+    add     ebp,    [dwSrcStride]
 
-	mov		eax,		16384
-	mov		[xInverse],		eax
-	mov		ecx,			[dwDstWidth]
-	dec		ecx
+    mov     eax,        16384
+    mov     [xInverse],     eax
+    mov     ecx,            [dwDstWidth]
+    dec     ecx
 
-	movdqa	xmm3,		xmm5			; initial u to 16384 16383 16384 16383
+    movdqa  xmm3,       xmm5            ; initial u to 16384 16383 16384 16383
 
 WIDTH:
-	mov		eax,		[xInverse]
-	shr		eax,		15
+    mov     eax,        [xInverse]
+    shr     eax,        15
 
-	movd	xmm1,		[esi+eax]		; xxxxxxba
-	movd	xmm2,		[ebp+eax]		; xxxxxxdc
-	pxor	xmm0,		xmm0
-	punpcklwd	xmm1,	xmm2			; xxxxdcba
-	punpcklbw	xmm1,	xmm0			; 0d0c0b0a
-	punpcklwd	xmm1,	xmm0			; 000d000c000b000a
+    movd    xmm1,       [esi+eax]       ; xxxxxxba
+    movd    xmm2,       [ebp+eax]       ; xxxxxxdc
+    pxor    xmm0,       xmm0
+    punpcklwd   xmm1,   xmm2            ; xxxxdcba
+    punpcklbw   xmm1,   xmm0            ; 0d0c0b0a
+    punpcklwd   xmm1,   xmm0            ; 000d000c000b000a
 
-	movdqa	xmm2,	xmm4	; xmm2:  vv(1-v)(1-v)  tmpv
-	pmaddwd	xmm2,	xmm3	; mul u(1-u)u(1-u) on xmm2
-	movdqa	xmm0,	xmm2
-	pmuludq	xmm2,	xmm1
-	psrlq	xmm0,	32
-	psrlq	xmm1,	32
-	pmuludq	xmm0,	xmm1
-	paddq	xmm2,	xmm0
-	pshufd	xmm1,	xmm2,	00001110b
-	paddq	xmm2,	xmm1
-	psrlq	xmm2,	29
+    movdqa  xmm2,   xmm4    ; xmm2:  vv(1-v)(1-v)  tmpv
+    pmaddwd xmm2,   xmm3    ; mul u(1-u)u(1-u) on xmm2
+    movdqa  xmm0,   xmm2
+    pmuludq xmm2,   xmm1
+    psrlq   xmm0,   32
+    psrlq   xmm1,   32
+    pmuludq xmm0,   xmm1
+    paddq   xmm2,   xmm0
+    pshufd  xmm1,   xmm2,   00001110b
+    paddq   xmm2,   xmm1
+    psrlq   xmm2,   29
 
-	movd	eax,	xmm2
-	inc		eax
-	shr		eax,	1
-	mov		[edi],	al
-	inc		edi
+    movd    eax,    xmm2
+    inc     eax
+    shr     eax,    1
+    mov     [edi],  al
+    inc     edi
 
-	mov		eax,		[uiScaleX]
-	add		[xInverse],	eax
+    mov     eax,        [uiScaleX]
+    add     [xInverse], eax
 
-	paddw	xmm3,		xmm7			; inc u
-	psllw	xmm3,		1
-	psrlw	xmm3,		1
+    paddw   xmm3,       xmm7            ; inc u
+    psllw   xmm3,       1
+    psrlw   xmm3,       1
 
-	loop	WIDTH
+    loop    WIDTH
 
 WIDTH_END:
-	mov		eax,		[xInverse]
-	shr		eax,		15
-	mov		cl,			[esi+eax]
-	mov		[edi],		cl
-	inc		edi
+    mov     eax,        [xInverse]
+    shr     eax,        15
+    mov     cl,         [esi+eax]
+    mov     [edi],      cl
+    inc     edi
 
-	mov		eax,		[uiScaleY]
-	add		[yInverse],	eax
-	add		edi,		[dstStep]
+    mov     eax,        [uiScaleY]
+    add     [yInverse], eax
+    add     edi,        [dstStep]
 
-	paddw	xmm4,	xmm6				; inc v
-	psllw	xmm4,	1
-	psrlw	xmm4,	1
+    paddw   xmm4,   xmm6                ; inc v
+    psllw   xmm4,   1
+    psrlw   xmm4,   1
 
-	dec		dword [tmpHeight]
-	jg		HEIGHT
+    dec     dword [tmpHeight]
+    jg      HEIGHT
 
 
 LAST_ROW:
-	mov		eax,	[yInverse]
-	mov		esi,	[pSrcData]
-	shr		eax,	15
-	mul		dword [dwSrcStride]
-	add		esi,	eax					; get current row address
+    mov     eax,    [yInverse]
+    mov     esi,    [pSrcData]
+    shr     eax,    15
+    mul     dword [dwSrcStride]
+    add     esi,    eax                 ; get current row address
 
-	mov		eax,		16384
-	mov		[xInverse],		eax
-	mov		ecx,			[dwDstWidth]
+    mov     eax,        16384
+    mov     [xInverse],     eax
+    mov     ecx,            [dwDstWidth]
 
 LAST_ROW_WIDTH:
-	mov		eax,		[xInverse]
-	shr		eax,		15
+    mov     eax,        [xInverse]
+    shr     eax,        15
 
-	mov		al,			[esi+eax]
-	mov		[edi],	al
-	inc		edi
+    mov     al,         [esi+eax]
+    mov     [edi],  al
+    inc     edi
 
-	mov		eax,		[uiScaleX]
-	add		[xInverse],	eax
+    mov     eax,        [uiScaleX]
+    add     [xInverse], eax
 
-	loop	LAST_ROW_WIDTH
+    loop    LAST_ROW_WIDTH
 
 LAST_ROW_END:
 
-	add		esp,			localsize
-	pop		ebx
-	pop		edi
-	pop		esi
-	pop		ebp
-%undef		pushsize
-%undef		localsize
-%undef		pSrcData
-%undef		dwSrcWidth
-%undef		dwSrcHeight
-%undef		dwSrcStride
-%undef		pDstData
-%undef		dwDstWidth
-%undef		dwDstHeight
-%undef		dwDstStride
-%undef		scale
-%undef		uiScaleX
-%undef		uiScaleY
-%undef		tmpHeight
-%undef		yInverse
-%undef		xInverse
-%undef		dstStep
-	ret
+    add     esp,            localsize
+    pop     ebx
+    pop     edi
+    pop     esi
+    pop     ebp
+%undef      pushsize
+%undef      localsize
+%undef      pSrcData
+%undef      dwSrcWidth
+%undef      dwSrcHeight
+%undef      dwSrcStride
+%undef      pDstData
+%undef      dwDstWidth
+%undef      dwDstHeight
+%undef      dwDstStride
+%undef      scale
+%undef      uiScaleX
+%undef      uiScaleY
+%undef      tmpHeight
+%undef      yInverse
+%undef      xInverse
+%undef      dstStep
+    ret
 
 
 
@@ -1013,193 +1013,193 @@
 
 ;**************************************************************************************************************
 ;int GeneralBilinearFastDownsampler_sse2(   unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
-;				unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,
+;               unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,
 ;               unsigned int uiScaleX, unsigned int uiScaleY );
 ;{
 ;**************************************************************************************************************
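The fast variant trades a little precision for speed; in a loose scalar model (names illustrative), u is carried in Q16 and v in Q15, pmulhuw collapses them into Q15 corner weights, and the blend is normalised with a 15-bit shift and saturated on store:

    #include <stdint.h>

    /* Illustrative fast-path blend: w00..w11 are Q15 corner weights summing to
     * roughly 1 << 15; a..d are the 2x2 source pixels. */
    static uint8_t bilinear_fast_q15(uint32_t w00, uint32_t w01,
                                     uint32_t w10, uint32_t w11,
                                     uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
        uint32_t v = (w00 * a + w01 * b + w10 * c + w11 * d + 16384) >> 15;
        return (uint8_t)(v > 255 ? 255 : v);   /* packuswb-style saturation */
    }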
 
-WELS_EXTERN	GeneralBilinearFastDownsampler_sse2
-	push	ebp
-	push	esi
-	push	edi
-	push	ebx
-%define		pushsize	16
-%define		localsize	28
-%define		pDstData		esp + pushsize + localsize + 4
-%define		dwDstStride		esp + pushsize + localsize + 8
-%define		dwDstWidth		esp + pushsize + localsize + 12
-%define		dwDstHeight		esp + pushsize + localsize + 16
-%define		pSrcData		esp + pushsize + localsize + 20
-%define		dwSrcStride		esp + pushsize + localsize + 24
-%define		dwSrcWidth		esp + pushsize + localsize + 28
-%define		dwSrcHeight		esp + pushsize + localsize + 32
-%define		scale			esp + 0
-%define		uiScaleX			esp + pushsize + localsize + 36
-%define		uiScaleY			esp + pushsize + localsize + 40
-%define		tmpHeight		esp + 12
-%define		yInverse		esp + 16
-%define		xInverse		esp + 20
-%define		dstStep			esp + 24
-	sub		esp,			localsize
+WELS_EXTERN GeneralBilinearFastDownsampler_sse2
+    push    ebp
+    push    esi
+    push    edi
+    push    ebx
+%define     pushsize    16
+%define     localsize   28
+%define     pDstData        esp + pushsize + localsize + 4
+%define     dwDstStride     esp + pushsize + localsize + 8
+%define     dwDstWidth      esp + pushsize + localsize + 12
+%define     dwDstHeight     esp + pushsize + localsize + 16
+%define     pSrcData        esp + pushsize + localsize + 20
+%define     dwSrcStride     esp + pushsize + localsize + 24
+%define     dwSrcWidth      esp + pushsize + localsize + 28
+%define     dwSrcHeight     esp + pushsize + localsize + 32
+%define     scale           esp + 0
+%define     uiScaleX            esp + pushsize + localsize + 36
+%define     uiScaleY            esp + pushsize + localsize + 40
+%define     tmpHeight       esp + 12
+%define     yInverse        esp + 16
+%define     xInverse        esp + 20
+%define     dstStep         esp + 24
+    sub     esp,            localsize
 
-	pxor	xmm0,	xmm0
-	mov		edx,	65535
-	mov		eax,	[uiScaleX]
-	and		eax,	edx
-	mov		ebx,	eax
-	neg		ebx
-	and		ebx,	65535
-	movd	xmm1,		eax						; uinc(uiScaleX mod 65536)
-	movd	xmm2,		ebx						; -uinc
-	psllq	xmm1,		32
-	por		xmm1,		xmm2					; 0 uinc 0 -uinc
-	pshuflw	xmm7,		xmm1,	10001000b		; xmm7: uinc -uinc uinc -uinc
+    pxor    xmm0,   xmm0
+    mov     edx,    65535
+    mov     eax,    [uiScaleX]
+    and     eax,    edx
+    mov     ebx,    eax
+    neg     ebx
+    and     ebx,    65535
+    movd    xmm1,       eax                     ; uinc(uiScaleX mod 65536)
+    movd    xmm2,       ebx                     ; -uinc
+    psllq   xmm1,       32
+    por     xmm1,       xmm2                    ; 0 uinc 0 -uinc
+    pshuflw xmm7,       xmm1,   10001000b       ; xmm7: uinc -uinc uinc -uinc
 
-	mov		eax,	[uiScaleY]
-	and		eax,	32767
-	mov		ebx,	eax
-	neg		ebx
-	and		ebx,	32767
-	movd	xmm6,		eax						; vinc(uiScaleY mod 32767)
-	movd	xmm2,		ebx						; -vinc
-	psllq	xmm6,		32
-	por		xmm6,		xmm2					; 0 vinc 0 -vinc
-	pshuflw	xmm6,		xmm6,	10100000b		; xmm6: vinc vinc -vinc -vinc
+    mov     eax,    [uiScaleY]
+    and     eax,    32767
+    mov     ebx,    eax
+    neg     ebx
+    and     ebx,    32767
+    movd    xmm6,       eax                     ; vinc(uiScaleY mod 32767)
+    movd    xmm2,       ebx                     ; -vinc
+    psllq   xmm6,       32
+    por     xmm6,       xmm2                    ; 0 vinc 0 -vinc
+    pshuflw xmm6,       xmm6,   10100000b       ; xmm6: vinc vinc -vinc -vinc
 
-	mov		edx,		80007fffh				; 32768 32767
-	movd	xmm5,		edx
-	pshuflw	xmm5,		xmm5,		01000100b	; 32768 32767 32768 32767
-	mov		ebx,		16384
+    mov     edx,        80007fffh               ; 32768 32767
+    movd    xmm5,       edx
+    pshuflw xmm5,       xmm5,       01000100b   ; 32768 32767 32768 32767
+    mov     ebx,        16384
 
 
 FAST_DOWNSAMPLE:
 
-	mov		eax,			[dwDstHeight]
-	mov		edi,			[pDstData]
-	mov		edx,			[dwDstStride]
-	mov		ecx,			[dwDstWidth]
-	sub		edx,			ecx
-	mov		[dstStep],	edx				; stride - width
-	dec		eax
-	mov		[tmpHeight],	eax
-	mov		eax,		16384
-	mov		[yInverse],		eax
+    mov     eax,            [dwDstHeight]
+    mov     edi,            [pDstData]
+    mov     edx,            [dwDstStride]
+    mov     ecx,            [dwDstWidth]
+    sub     edx,            ecx
+    mov     [dstStep],  edx             ; stride - width
+    dec     eax
+    mov     [tmpHeight],    eax
+    mov     eax,        16384
+    mov     [yInverse],     eax
 
-	pshuflw	xmm4,		xmm5,	01010000b
-	psrlw	xmm4,		1				; initial v to 16384 16384 16383 16383
+    pshuflw xmm4,       xmm5,   01010000b
+    psrlw   xmm4,       1               ; initial v to 16384 16384 16383 16383
 
 FAST_HEIGHT:
-	mov		eax,	[yInverse]
-	mov		esi,	[pSrcData]
-	shr		eax,	15
-	mul		dword [dwSrcStride]
-	add		esi,	eax					; get current row address
-	mov		ebp,	esi
-	add		ebp,	[dwSrcStride]
+    mov     eax,    [yInverse]
+    mov     esi,    [pSrcData]
+    shr     eax,    15
+    mul     dword [dwSrcStride]
+    add     esi,    eax                 ; get current row address
+    mov     ebp,    esi
+    add     ebp,    [dwSrcStride]
 
-	mov		eax,		32768
-	mov		[xInverse],		eax
-	mov		ecx,			[dwDstWidth]
-	dec		ecx
+    mov     eax,        32768
+    mov     [xInverse],     eax
+    mov     ecx,            [dwDstWidth]
+    dec     ecx
 
-	movdqa	xmm3,		xmm5			; initial u to 32768 32767 32768 32767
+    movdqa  xmm3,       xmm5            ; initial u to 32768 32767 32768 32767
 
 FAST_WIDTH:
-	mov		eax,		[xInverse]
-	shr		eax,		16
+    mov     eax,        [xInverse]
+    shr     eax,        16
 
-	movd	xmm1,		[esi+eax]		; xxxxxxba
-	movd	xmm2,		[ebp+eax]		; xxxxxxdc
-	punpcklwd	xmm1,	xmm2			; xxxxdcba
-	punpcklbw	xmm1,	xmm0			; 0d0c0b0a
+    movd    xmm1,       [esi+eax]       ; xxxxxxba
+    movd    xmm2,       [ebp+eax]       ; xxxxxxdc
+    punpcklwd   xmm1,   xmm2            ; xxxxdcba
+    punpcklbw   xmm1,   xmm0            ; 0d0c0b0a
 
-	movdqa	xmm2,	xmm4	; xmm2:  vv(1-v)(1-v)  tmpv
-	pmulhuw	xmm2,	xmm3	; mul u(1-u)u(1-u) on xmm2
-	pmaddwd		xmm2,	xmm1
-	pshufd	xmm1,	xmm2,	00000001b
-	paddd	xmm2,	xmm1
-	movd	xmm1,	ebx
-	paddd	xmm2,	xmm1
-	psrld	xmm2,	15
+    movdqa  xmm2,   xmm4    ; xmm2:  vv(1-v)(1-v)  tmpv
+    pmulhuw xmm2,   xmm3    ; mul u(1-u)u(1-u) on xmm2
+    pmaddwd     xmm2,   xmm1
+    pshufd  xmm1,   xmm2,   00000001b
+    paddd   xmm2,   xmm1
+    movd    xmm1,   ebx
+    paddd   xmm2,   xmm1
+    psrld   xmm2,   15
 
-	packuswb	xmm2,	xmm0
-	movd	eax,	xmm2
-	mov		[edi],	al
-	inc		edi
+    packuswb    xmm2,   xmm0
+    movd    eax,    xmm2
+    mov     [edi],  al
+    inc     edi
 
-	mov		eax,		[uiScaleX]
-	add		[xInverse],	eax
+    mov     eax,        [uiScaleX]
+    add     [xInverse], eax
 
-	paddw	xmm3,		xmm7			; inc u
+    paddw   xmm3,       xmm7            ; inc u
 
-	loop	FAST_WIDTH
+    loop    FAST_WIDTH
 
 FAST_WIDTH_END:
-	mov		eax,		[xInverse]
-	shr		eax,		16
-	mov		cl,			[esi+eax]
-	mov		[edi],		cl
-	inc		edi
+    mov     eax,        [xInverse]
+    shr     eax,        16
+    mov     cl,         [esi+eax]
+    mov     [edi],      cl
+    inc     edi
 
-	mov		eax,		[uiScaleY]
-	add		[yInverse],	eax
-	add		edi,		[dstStep]
+    mov     eax,        [uiScaleY]
+    add     [yInverse], eax
+    add     edi,        [dstStep]
 
-	paddw	xmm4,	xmm6				; inc v
-	psllw	xmm4,	1
-	psrlw	xmm4,	1
+    paddw   xmm4,   xmm6                ; inc v
+    psllw   xmm4,   1
+    psrlw   xmm4,   1
 
-	dec		dword [tmpHeight]
-	jg		FAST_HEIGHT
+    dec     dword [tmpHeight]
+    jg      FAST_HEIGHT
 
 
 FAST_LAST_ROW:
-	mov		eax,	[yInverse]
-	mov		esi,	[pSrcData]
-	shr		eax,	15
-	mul		dword [dwSrcStride]
-	add		esi,	eax					; get current row address
+    mov     eax,    [yInverse]
+    mov     esi,    [pSrcData]
+    shr     eax,    15
+    mul     dword [dwSrcStride]
+    add     esi,    eax                 ; get current row address
 
-	mov		eax,		32768
-	mov		[xInverse],		eax
-	mov		ecx,			[dwDstWidth]
+    mov     eax,        32768
+    mov     [xInverse],     eax
+    mov     ecx,            [dwDstWidth]
 
 FAST_LAST_ROW_WIDTH:
-	mov		eax,		[xInverse]
-	shr		eax,		16
+    mov     eax,        [xInverse]
+    shr     eax,        16
 
-	mov		al,			[esi+eax]
-	mov		[edi],	al
-	inc		edi
+    mov     al,         [esi+eax]
+    mov     [edi],  al
+    inc     edi
 
-	mov		eax,		[uiScaleX]
-	add		[xInverse],	eax
+    mov     eax,        [uiScaleX]
+    add     [xInverse], eax
 
-	loop	FAST_LAST_ROW_WIDTH
+    loop    FAST_LAST_ROW_WIDTH
 
 FAST_LAST_ROW_END:
 
-	add		esp,			localsize
-	pop		ebx
-	pop		edi
-	pop		esi
-	pop		ebp
-%undef		pushsize
-%undef		localsize
-%undef		pSrcData
-%undef		dwSrcWidth
-%undef		dwSrcHeight
-%undef		dwSrcStride
-%undef		pDstData
-%undef		dwDstWidth
-%undef		dwDstHeight
-%undef		dwDstStride
-%undef		scale
-%undef		uiScaleX
-%undef		uiScaleY
-%undef		tmpHeight
-%undef		yInverse
-%undef		xInverse
-%undef		dstStep
-	ret
+    add     esp,            localsize
+    pop     ebx
+    pop     edi
+    pop     esi
+    pop     ebp
+%undef      pushsize
+%undef      localsize
+%undef      pSrcData
+%undef      dwSrcWidth
+%undef      dwSrcHeight
+%undef      dwSrcStride
+%undef      pDstData
+%undef      dwDstWidth
+%undef      dwDstHeight
+%undef      dwDstStride
+%undef      scale
+%undef      uiScaleX
+%undef      uiScaleY
+%undef      tmpHeight
+%undef      yInverse
+%undef      xInverse
+%undef      dstStep
+    ret
 %endif
--- a/codec/processing/src/x86/vaa.asm
+++ b/codec/processing/src/x86/vaa.asm
@@ -48,100 +48,100 @@
 ; Macros and other preprocessor constants
 ;***********************************************************************
 %macro SUM_SQR_SSE2     3       ; dst, pSrc, zero
-  movdqa %1, %2
-  punpcklbw %1, %3
-  punpckhbw %2, %3
-  pmaddwd %1, %1
-  pmaddwd %2, %2
-  paddd %1, %2
-  pshufd %2, %1, 04Eh   ; 01001110 B
-  paddd %1, %2
-  pshufd %2, %1, 0B1h   ; 10110001 B
-  paddd %1, %2
+    movdqa %1, %2
+    punpcklbw %1, %3
+    punpckhbw %2, %3
+    pmaddwd %1, %1
+    pmaddwd %2, %2
+    paddd %1, %2
+    pshufd %2, %1, 04Eh   ; 01001110 B
+    paddd %1, %2
+    pshufd %2, %1, 0B1h   ; 10110001 B
+    paddd %1, %2
 %endmacro       ; END OF SUM_SQR_SSE2
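In scalar form the macro leaves, in every dword lane of the destination register, the sum of squares of the 16 source bytes (sketch, helper name illustrative):

    #include <stdint.h>

    /* Value computed by SUM_SQR_SSE2 for a 16-byte block. */
    static uint32_t sum_sqr_16(const uint8_t p[16]) {
        uint32_t s = 0;
        for (int i = 0; i < 16; i++)
            s += (uint32_t)p[i] * p[i];
        return s;
    }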
 
 %macro WELS_SAD_16x2_SSE2  3 ;esi :%1 edi:%2 ebx:%3
-  movdqa        xmm1,   [%1]
-  movdqa        xmm2,   [%2]
-  movdqa        xmm3,   [%1+%3]
-  movdqa        xmm4,   [%2+%3]
-  psadbw        xmm1,   xmm2
-  psadbw        xmm3,   xmm4
-  paddd xmm6,   xmm1
-  paddd xmm6,   xmm3
-  lea           %1,     [%1+%3*2]
-  lea           %2,     [%2+%3*2]
+    movdqa        xmm1,   [%1]
+    movdqa        xmm2,   [%2]
+    movdqa        xmm3,   [%1+%3]
+    movdqa        xmm4,   [%2+%3]
+    psadbw        xmm1,   xmm2
+    psadbw        xmm3,   xmm4
+    paddd xmm6,   xmm1
+    paddd xmm6,   xmm3
+    lea           %1,     [%1+%3*2]
+    lea           %2,     [%2+%3*2]
 %endmacro
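Read as scalar code, the macro accumulates the SAD of a 16x2 strip into its running total and advances both pointers by two rows (sketch, names illustrative):

    #include <stdint.h>

    static void sad_16x2(const uint8_t **cur, const uint8_t **ref,
                         int stride, uint32_t *acc) {
        for (int row = 0; row < 2; row++)
            for (int x = 0; x < 16; x++) {
                int d = (*cur)[row * stride + x] - (*ref)[row * stride + x];
                *acc += (uint32_t)(d < 0 ? -d : d);
            }
        *cur += 2 * stride;
        *ref += 2 * stride;
    }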
 
 ; by comparison, this outperforms the equivalent phaddw (SSSE3) sequence
 %macro SUM_WORD_8x2_SSE2        2       ; dst(pSrc), tmp
-  ; @sum_8x2 begin
-  pshufd %2, %1, 04Eh   ; 01001110 B
-  paddw %1, %2
-  pshuflw %2, %1, 04Eh  ; 01001110 B
-  paddw %1, %2
-  pshuflw %2, %1, 0B1h  ; 10110001 B
-  paddw %1, %2
-  ; end of @sum_8x2
+    ; @sum_8x2 begin
+    pshufd %2, %1, 04Eh   ; 01001110 B
+    paddw %1, %2
+    pshuflw %2, %1, 04Eh  ; 01001110 B
+    paddw %1, %2
+    pshuflw %2, %1, 0B1h  ; 10110001 B
+    paddw %1, %2
+    ; end of @sum_8x2
 %endmacro       ; END of SUM_WORD_8x2_SSE2
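Equivalent scalar reduction (sketch, name illustrative); callers mask the result to 16 bits afterwards:

    #include <stdint.h>

    /* Horizontal sum of the eight word lanes, as SUM_WORD_8x2_SSE2 leaves it
     * in the low word of its register. */
    static uint16_t sum_words_8(const uint16_t w[8]) {
        uint32_t s = 0;
        for (int i = 0; i < 8; i++)
            s += w[i];
        return (uint16_t)s;
    }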
 
 %macro WELS_SAD_SUM_SQSUM_16x1_SSE2 3 ;esi:%1,edi:%2,ebx:%3
-  movdqa        xmm1,   [%1]
-  movdqa        xmm2,   [%2]
-  movdqa        xmm3,   xmm1
-  psadbw        xmm3,   xmm2
-  paddd         xmm6,   xmm3
+    movdqa        xmm1,   [%1]
+    movdqa        xmm2,   [%2]
+    movdqa        xmm3,   xmm1
+    psadbw        xmm3,   xmm2
+    paddd         xmm6,   xmm3
 
-  movdqa        xmm3,   xmm1
-  psadbw        xmm3,   xmm0
-  paddd         xmm5,   xmm3
+    movdqa        xmm3,   xmm1
+    psadbw        xmm3,   xmm0
+    paddd         xmm5,   xmm3
 
-  movdqa        xmm2,   xmm1
-  punpcklbw     xmm1,   xmm0
-  punpckhbw     xmm2,   xmm0
-  pmaddwd               xmm1,   xmm1
-  pmaddwd               xmm2,   xmm2
-  paddd         xmm4,   xmm1
-  paddd         xmm4,   xmm2
+    movdqa        xmm2,   xmm1
+    punpcklbw     xmm1,   xmm0
+    punpckhbw     xmm2,   xmm0
+    pmaddwd               xmm1,   xmm1
+    pmaddwd               xmm2,   xmm2
+    paddd         xmm4,   xmm1
+    paddd         xmm4,   xmm2
 
-  add           %1,     %3
-  add           %2,     %3
+    add           %1,     %3
+    add           %2,     %3
 %endmacro
 
 %macro WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 3 ;esi:%1 edi:%2 ebx:%3
-  movdqa        xmm1,   [%1]
-  movdqa        xmm2,   [%2]
-  movdqa        xmm3,   xmm1
-  psadbw        xmm3,   xmm2
-  paddd         xmm7,   xmm3    ; sad
+    movdqa        xmm1,   [%1]
+    movdqa        xmm2,   [%2]
+    movdqa        xmm3,   xmm1
+    psadbw        xmm3,   xmm2
+    paddd         xmm7,   xmm3    ; sad
 
-  movdqa        xmm3,   xmm1
-  pmaxub        xmm3,   xmm2
-  pminub        xmm2,   xmm1
-  psubb xmm3,   xmm2    ; diff
+    movdqa        xmm3,   xmm1
+    pmaxub        xmm3,   xmm2
+    pminub        xmm2,   xmm1
+    psubb xmm3,   xmm2    ; diff
 
-  movdqa        xmm2,   xmm1
-  psadbw        xmm2,   xmm0
-  paddd xmm6,   xmm2    ; sum
+    movdqa        xmm2,   xmm1
+    psadbw        xmm2,   xmm0
+    paddd xmm6,   xmm2    ; sum
 
-  movdqa                xmm2,   xmm1
-  punpcklbw     xmm1,   xmm0
-  punpckhbw     xmm2,   xmm0
-  pmaddwd               xmm1,   xmm1
-  pmaddwd               xmm2,   xmm2
-  paddd         xmm5,   xmm1
-  paddd         xmm5,   xmm2    ; sqsum
+    movdqa                xmm2,   xmm1
+    punpcklbw     xmm1,   xmm0
+    punpckhbw     xmm2,   xmm0
+    pmaddwd               xmm1,   xmm1
+    pmaddwd               xmm2,   xmm2
+    paddd         xmm5,   xmm1
+    paddd         xmm5,   xmm2    ; sqsum
 
-  movdqa                xmm1,   xmm3
-  punpcklbw     xmm1,   xmm0
-  punpckhbw     xmm3,   xmm0
-  pmaddwd               xmm1,   xmm1
-  pmaddwd               xmm3,   xmm3
-  paddd         xmm4,   xmm1
-  paddd         xmm4,   xmm3    ; sqdiff
+    movdqa                xmm1,   xmm3
+    punpcklbw     xmm1,   xmm0
+    punpckhbw     xmm3,   xmm0
+    pmaddwd               xmm1,   xmm1
+    pmaddwd               xmm3,   xmm3
+    paddd         xmm4,   xmm1
+    paddd         xmm4,   xmm3    ; sqdiff
 
-  add           %1,     %3
-  add           %2,     %3
+    add           %1,     %3
+    add           %2,     %3
 %endmacro
 
 %macro WELS_SAD_SD_MAD_16x1_SSE2       7 ;esi:%5 edi:%6 ebx:%7
@@ -149,40 +149,40 @@
 %define sum_cur_reg             %2
 %define sum_ref_reg             %3
 %define mad_reg                 %4
-  movdqa        xmm1,           [%5]
-  movdqa        xmm2,           [%6]
-  movdqa        xmm3,           xmm1
-  psadbw        xmm3,           xmm0
-  paddd         sum_cur_reg,    xmm3    ; sum_cur
-  movdqa        xmm3,           xmm2
-  psadbw        xmm3,           xmm0
-  paddd sum_ref_reg,                    xmm3    ; sum_ref
+    movdqa        xmm1,           [%5]
+    movdqa        xmm2,           [%6]
+    movdqa        xmm3,           xmm1
+    psadbw        xmm3,           xmm0
+    paddd         sum_cur_reg,    xmm3    ; sum_cur
+    movdqa        xmm3,           xmm2
+    psadbw        xmm3,           xmm0
+    paddd sum_ref_reg,                    xmm3    ; sum_ref
 
-  movdqa        xmm3,           xmm1
-  pmaxub        xmm3,           xmm2
-  pminub        xmm2,           xmm1
-  psubb xmm3,           xmm2    ; abs diff
-  pmaxub        mad_reg,        xmm3    ; max abs diff
+    movdqa        xmm3,           xmm1
+    pmaxub        xmm3,           xmm2
+    pminub        xmm2,           xmm1
+    psubb xmm3,           xmm2    ; abs diff
+    pmaxub        mad_reg,        xmm3    ; max abs diff
 
-  psadbw        xmm3,           xmm0
-  paddd sad_reg,        xmm3    ; sad
+    psadbw        xmm3,           xmm0
+    paddd sad_reg,        xmm3    ; sad
 
-  add                   %5,             %7
-  add                   %6,             %7
+    add                   %5,             %7
+    add                   %6,             %7
 %endmacro
 
 
 %macro WELS_MAX_REG_SSE2       1       ; xmm1, xmm2, xmm3 can be used
 %define max_reg  %1
-  movdqa        xmm1,           max_reg
-  psrldq        xmm1,           4
-  pmaxub        max_reg,        xmm1
-  movdqa        xmm1,           max_reg
-  psrldq        xmm1,           2
-  pmaxub        max_reg,        xmm1
-  movdqa        xmm1,           max_reg
-  psrldq        xmm1,           1
-  pmaxub        max_reg,        xmm1
+    movdqa        xmm1,           max_reg
+    psrldq        xmm1,           4
+    pmaxub        max_reg,        xmm1
+    movdqa        xmm1,           max_reg
+    psrldq        xmm1,           2
+    pmaxub        max_reg,        xmm1
+    movdqa        xmm1,           max_reg
+    psrldq        xmm1,           1
+    pmaxub        max_reg,        xmm1
 %endmacro
 
 %macro WELS_SAD_BGD_SQDIFF_16x1_SSE2   7 ;esi:%5 edi:%6 ebx:%7
@@ -190,50 +190,50 @@
 %define sum_reg         %2
 %define mad_reg         %3
 %define sqdiff_reg      %4
-  movdqa                xmm1,           [%5]
-  movdqa                xmm2,           xmm1
-  movdqa                xmm3,           xmm1
-  punpcklbw     xmm2,           xmm0
-  punpckhbw     xmm3,           xmm0
-  pmaddwd               xmm2,           xmm2
-  pmaddwd               xmm3,           xmm3
-  paddd         xmm2,           xmm3
-  movdqa                xmm3,           xmm2
-  psllq         xmm2,           32
-  psrlq         xmm3,           32
-  psllq         xmm3,           32
-  paddd         xmm2,           xmm3
-  paddd         sad_reg,        xmm2            ; sqsum
+    movdqa                xmm1,           [%5]
+    movdqa                xmm2,           xmm1
+    movdqa                xmm3,           xmm1
+    punpcklbw     xmm2,           xmm0
+    punpckhbw     xmm3,           xmm0
+    pmaddwd               xmm2,           xmm2
+    pmaddwd               xmm3,           xmm3
+    paddd         xmm2,           xmm3
+    movdqa                xmm3,           xmm2
+    psllq         xmm2,           32
+    psrlq         xmm3,           32
+    psllq         xmm3,           32
+    paddd         xmm2,           xmm3
+    paddd         sad_reg,        xmm2            ; sqsum
 
-  movdqa        xmm2,           [%6]
-  movdqa        xmm3,           xmm1
-  psadbw        xmm3,           xmm0
-  paddd sum_reg,                        xmm3    ; sum_cur
-  movdqa        xmm3,           xmm2
-  psadbw        xmm3,           xmm0
-  pslldq        xmm3,           4
-  paddd sum_reg,                        xmm3    ; sum_ref
+    movdqa        xmm2,           [%6]
+    movdqa        xmm3,           xmm1
+    psadbw        xmm3,           xmm0
+    paddd sum_reg,                        xmm3    ; sum_cur
+    movdqa        xmm3,           xmm2
+    psadbw        xmm3,           xmm0
+    pslldq        xmm3,           4
+    paddd sum_reg,                        xmm3    ; sum_ref
 
-  movdqa        xmm3,           xmm1
-  pmaxub        xmm3,           xmm2
-  pminub        xmm2,           xmm1
-  psubb xmm3,           xmm2    ; abs diff
-  pmaxub        mad_reg,        xmm3    ; max abs diff
+    movdqa        xmm3,           xmm1
+    pmaxub        xmm3,           xmm2
+    pminub        xmm2,           xmm1
+    psubb xmm3,           xmm2    ; abs diff
+    pmaxub        mad_reg,        xmm3    ; max abs diff
 
-  movdqa        xmm1,           xmm3
-  psadbw        xmm3,           xmm0
-  paddd sad_reg,        xmm3    ; sad
+    movdqa        xmm1,           xmm3
+    psadbw        xmm3,           xmm0
+    paddd sad_reg,        xmm3    ; sad
 
-  movdqa                xmm3,   xmm1
-  punpcklbw     xmm1,   xmm0
-  punpckhbw     xmm3,   xmm0
-  pmaddwd               xmm1,   xmm1
-  pmaddwd               xmm3,   xmm3
-  paddd         sqdiff_reg,     xmm1
-  paddd         sqdiff_reg,     xmm3    ; sqdiff
+    movdqa                xmm3,   xmm1
+    punpcklbw     xmm1,   xmm0
+    punpckhbw     xmm3,   xmm0
+    pmaddwd               xmm1,   xmm1
+    pmaddwd               xmm3,   xmm3
+    paddd         sqdiff_reg,     xmm1
+    paddd         sqdiff_reg,     xmm3    ; sqdiff
 
-  add           %5,     %7
-  add           %6,     %7
+    add           %5,     %7
+    add           %6,     %7
 %endmacro
 
 
@@ -249,99 +249,99 @@
 ;   void SampleVariance16x16_sse2(      uint8_t * y_ref, int32_t y_ref_stride, uint8_t * y_src, int32_t y_src_stride,SMotionTextureUnit* pMotionTexture );
 ;***********************************************************************
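A hedged C model of what the routine computes (struct and function names are illustrative, and the real SMotionTextureUnit layout may differ): over the 16x16 block it accumulates the absolute ref/src difference and the raw source samples, then stores "mean of squares minus squared mean" style indices, with both sums prescaled by >> 8 for the 256 samples:

    #include <stdint.h>

    typedef struct { uint16_t uiMotionIndex, uiTextureIndex; } SMotionTextureModel;

    static void sample_variance_16x16_c(const uint8_t *ref, int ref_stride,
                                        const uint8_t *src, int src_stride,
                                        SMotionTextureModel *out) {
        uint32_t sum = 0, sqr = 0, sum_cur = 0, sqr_cur = 0;
        for (int y = 0; y < 16; y++) {
            for (int x = 0; x < 16; x++) {
                uint32_t d = ref[x] > src[x] ? (uint32_t)(ref[x] - src[x])
                                             : (uint32_t)(src[x] - ref[x]);
                sum     += d;
                sqr     += d * d;
                sum_cur += src[x];
                sqr_cur += (uint32_t)src[x] * src[x];
            }
            ref += ref_stride;
            src += src_stride;
        }
        out->uiMotionIndex  = (uint16_t)((sqr >> 8)     - (sum >> 8) * (sum >> 8));
        out->uiTextureIndex = (uint16_t)((sqr_cur >> 8) - (sum_cur >> 8) * (sum_cur >> 8));
    }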
 WELS_EXTERN SampleVariance16x16_sse2
-  push esi
-  push edi
-  push ebx
+    push esi
+    push edi
+    push ebx
 
-  sub esp, 16
-  %define SUM                   [esp]
-  %define SUM_CUR               [esp+4]
-  %define SQR                   [esp+8]
-  %define SQR_CUR               [esp+12]
-  %define PUSH_SIZE     28      ; 12 + 16
+    sub esp, 16
+    %define SUM                   [esp]
+    %define SUM_CUR               [esp+4]
+    %define SQR                   [esp+8]
+    %define SQR_CUR               [esp+12]
+    %define PUSH_SIZE     28      ; 12 + 16
 
-  mov edi, [esp+PUSH_SIZE+4]    ; y_ref
-  mov edx, [esp+PUSH_SIZE+8]    ; y_ref_stride
-  mov esi, [esp+PUSH_SIZE+12]   ; y_src
-  mov eax, [esp+PUSH_SIZE+16]   ; y_src_stride
-  mov ecx, 010h                         ; height = 16
+    mov edi, [esp+PUSH_SIZE+4]    ; y_ref
+    mov edx, [esp+PUSH_SIZE+8]    ; y_ref_stride
+    mov esi, [esp+PUSH_SIZE+12]   ; y_src
+    mov eax, [esp+PUSH_SIZE+16]   ; y_src_stride
+    mov ecx, 010h                         ; height = 16
 
-  pxor xmm7, xmm7
-  movdqu SUM, xmm7
+    pxor xmm7, xmm7
+    movdqu SUM, xmm7
 
 .hloops:
-  movdqa xmm0, [edi]            ; y_ref
-  movdqa xmm1, [esi]            ; y_src
-  movdqa xmm2, xmm0             ; store first for future process
-  movdqa xmm3, xmm1
-  ; sum += diff;
-  movdqa xmm4, xmm0
-  psadbw xmm4, xmm1             ; 2 parts, [0,..,15], [64,..,79]
-  ; to be continued for sum
-  pshufd xmm5, xmm4, 0C6h       ; 11000110 B
-  paddw xmm4, xmm5
-  movd ebx, xmm4
-  add SUM, ebx
+    movdqa xmm0, [edi]            ; y_ref
+    movdqa xmm1, [esi]            ; y_src
+    movdqa xmm2, xmm0             ; store first for future process
+    movdqa xmm3, xmm1
+    ; sum += diff;
+    movdqa xmm4, xmm0
+    psadbw xmm4, xmm1             ; 2 parts, [0,..,15], [64,..,79]
+    ; to be continued for sum
+    pshufd xmm5, xmm4, 0C6h       ; 11000110 B
+    paddw xmm4, xmm5
+    movd ebx, xmm4
+    add SUM, ebx
 
-  ; sqr += diff * diff;
-  pmaxub xmm0, xmm1
-  pminub xmm1, xmm2
-  psubb xmm0, xmm1                              ; diff
-  SUM_SQR_SSE2 xmm1, xmm0, xmm7 ; dst, pSrc, zero
-  movd ebx, xmm1
-  add SQR, ebx
+    ; sqr += diff * diff;
+    pmaxub xmm0, xmm1
+    pminub xmm1, xmm2
+    psubb xmm0, xmm1                              ; diff
+    SUM_SQR_SSE2 xmm1, xmm0, xmm7 ; dst, pSrc, zero
+    movd ebx, xmm1
+    add SQR, ebx
 
-  ; sum_cur += y_src[x];
-  movdqa xmm0, xmm3             ; cur_orig
-  movdqa xmm1, xmm0
-  punpcklbw xmm0, xmm7
-  punpckhbw xmm1, xmm7
-  paddw xmm0, xmm1              ; 8x2
-  SUM_WORD_8x2_SSE2 xmm0, xmm1
-  movd ebx, xmm0
-  and ebx, 0ffffh
-  add SUM_CUR, ebx
+    ; sum_cur += y_src[x];
+    movdqa xmm0, xmm3             ; cur_orig
+    movdqa xmm1, xmm0
+    punpcklbw xmm0, xmm7
+    punpckhbw xmm1, xmm7
+    paddw xmm0, xmm1              ; 8x2
+    SUM_WORD_8x2_SSE2 xmm0, xmm1
+    movd ebx, xmm0
+    and ebx, 0ffffh
+    add SUM_CUR, ebx
 
-  ; sqr_cur += y_src[x] * y_src[x];
-  SUM_SQR_SSE2 xmm0, xmm3, xmm7 ; dst, pSrc, zero
-  movd ebx, xmm0
-  add SQR_CUR, ebx
+    ; sqr_cur += y_src[x] * y_src[x];
+    SUM_SQR_SSE2 xmm0, xmm3, xmm7 ; dst, pSrc, zero
+    movd ebx, xmm0
+    add SQR_CUR, ebx
 
-  lea edi, [edi+edx]
-  lea esi, [esi+eax]
-  dec ecx
-  jnz near .hloops
+    lea edi, [edi+edx]
+    lea esi, [esi+eax]
+    dec ecx
+    jnz near .hloops
 
-  mov ebx, 0
-  mov bx, word SUM
-  sar ebx, 8
-  imul ebx, ebx
-  mov ecx, SQR
-  sar ecx, 8
-  sub ecx, ebx
-  mov edi, [esp+PUSH_SIZE+20]   ; pMotionTexture
-  mov [edi], cx                         ; to store uiMotionIndex
-  mov ebx, 0
-  mov bx, word SUM_CUR
-  sar ebx, 8
-  imul ebx, ebx
-  mov ecx, SQR_CUR
-  sar ecx, 8
-  sub ecx, ebx
-  mov [edi+2], cx                               ; to store uiTextureIndex
+    mov ebx, 0
+    mov bx, word SUM
+    sar ebx, 8
+    imul ebx, ebx
+    mov ecx, SQR
+    sar ecx, 8
+    sub ecx, ebx
+    mov edi, [esp+PUSH_SIZE+20]   ; pMotionTexture
+    mov [edi], cx                         ; to store uiMotionIndex
+    mov ebx, 0
+    mov bx, word SUM_CUR
+    sar ebx, 8
+    imul ebx, ebx
+    mov ecx, SQR_CUR
+    sar ecx, 8
+    sub ecx, ebx
+    mov [edi+2], cx                               ; to store uiTextureIndex
 
-  %undef SUM
-  %undef SUM_CUR
-  %undef SQR
-  %undef SQR_CUR
-  %undef PUSH_SIZE
+    %undef SUM
+    %undef SUM_CUR
+    %undef SQR
+    %undef SQR_CUR
+    %undef PUSH_SIZE
 
-  add esp, 16
-  pop ebx
-  pop edi
-  pop esi
+    add esp, 16
+    pop ebx
+    pop edi
+    pop esi
 
-  ret
+    ret
 
 
 
@@ -360,67 +360,67 @@
 %define         psadframe                       esp + pushsize + 24
 %define         psad8x8                         esp + pushsize + 28
 %define         pushsize        12
-  push  esi
-  push  edi
-  push  ebx
-  mov           esi,    [cur_data]
-  mov           edi,    [ref_data]
-  mov           ebx,    [iPicStride]
-  mov           edx,    [psad8x8]
-  mov           eax,    ebx
+    push  esi
+    push  edi
+    push  ebx
+    mov           esi,    [cur_data]
+    mov           edi,    [ref_data]
+    mov           ebx,    [iPicStride]
+    mov           edx,    [psad8x8]
+    mov           eax,    ebx
 
-  shr           dword [iPicWidth],      4                                       ; iPicWidth/16
-  shr           dword [iPicHeight],     4                                       ; iPicHeight/16
-  shl           eax,    4                                                               ; iPicStride*16
-  pxor  xmm0,   xmm0
-  pxor  xmm7,   xmm7            ; iFrameSad
+    shr           dword [iPicWidth],      4                                       ; iPicWidth/16
+    shr           dword [iPicHeight],     4                                       ; iPicHeight/16
+    shl           eax,    4                                                               ; iPicStride*16
+    pxor  xmm0,   xmm0
+    pxor  xmm7,   xmm7            ; iFrameSad
 height_loop:
-  mov           ecx,    dword [iPicWidth]
-  push  esi
-  push  edi
+    mov           ecx,    dword [iPicWidth]
+    push  esi
+    push  edi
 width_loop:
-  pxor  xmm6,   xmm6            ;
-  WELS_SAD_16x2_SSE2 esi,edi,ebx
-  WELS_SAD_16x2_SSE2 esi,edi,ebx
-  WELS_SAD_16x2_SSE2 esi,edi,ebx
-  WELS_SAD_16x2_SSE2 esi,edi,ebx
-  paddd xmm7,           xmm6
-  movd  [edx],          xmm6
-  psrldq        xmm6,           8
-  movd  [edx+4],        xmm6
+    pxor  xmm6,   xmm6            ;
+    WELS_SAD_16x2_SSE2 esi,edi,ebx
+    WELS_SAD_16x2_SSE2 esi,edi,ebx
+    WELS_SAD_16x2_SSE2 esi,edi,ebx
+    WELS_SAD_16x2_SSE2 esi,edi,ebx
+    paddd xmm7,           xmm6
+    movd  [edx],          xmm6
+    psrldq        xmm6,           8
+    movd  [edx+4],        xmm6
 
-  pxor  xmm6,   xmm6
-  WELS_SAD_16x2_SSE2 esi,edi,ebx
-  WELS_SAD_16x2_SSE2 esi,edi,ebx
-  WELS_SAD_16x2_SSE2 esi,edi,ebx
-  WELS_SAD_16x2_SSE2 esi,edi,ebx
-  paddd xmm7,           xmm6
-  movd  [edx+8],        xmm6
-  psrldq        xmm6,           8
-  movd  [edx+12],       xmm6
+    pxor  xmm6,   xmm6
+    WELS_SAD_16x2_SSE2 esi,edi,ebx
+    WELS_SAD_16x2_SSE2 esi,edi,ebx
+    WELS_SAD_16x2_SSE2 esi,edi,ebx
+    WELS_SAD_16x2_SSE2 esi,edi,ebx
+    paddd xmm7,           xmm6
+    movd  [edx+8],        xmm6
+    psrldq        xmm6,           8
+    movd  [edx+12],       xmm6
 
-  add           edx,    16
-  sub           esi,    eax
-  sub           edi,    eax
-  add           esi,    16
-  add           edi,    16
+    add           edx,    16
+    sub           esi,    eax
+    sub           edi,    eax
+    add           esi,    16
+    add           edi,    16
 
-  dec           ecx
-  jnz           width_loop
+    dec           ecx
+    jnz           width_loop
 
-  pop           edi
-  pop           esi
-  add           esi,    eax
-  add           edi,    eax
+    pop           edi
+    pop           esi
+    add           esi,    eax
+    add           edi,    eax
 
-  dec   dword [iPicHeight]
-  jnz           height_loop
+    dec   dword [iPicHeight]
+    jnz           height_loop
 
-  mov           edx,    [psadframe]
-  movdqa        xmm5,   xmm7
-  psrldq        xmm7,   8
-  paddd xmm7,   xmm5
-  movd  [edx],  xmm7
+    mov           edx,    [psadframe]
+    movdqa        xmm5,   xmm7
+    psrldq        xmm7,   8
+    paddd xmm7,   xmm5
+    movd  [edx],  xmm7
 
 %undef          cur_data
 %undef          ref_data
@@ -430,10 +430,10 @@
 %undef          psadframe
 %undef          psad8x8
 %undef          pushsize
-  pop           ebx
-  pop           edi
-  pop           esi
-  ret
+    pop           ebx
+    pop           edi
+    pop           esi
+    ret
 
 %else  ;64-bit
 
@@ -441,98 +441,98 @@
 ;   void SampleVariance16x16_sse2(      uint8_t * y_ref, int32_t y_ref_stride, uint8_t * y_src, int32_t y_src_stride,SMotionTextureUnit* pMotionTexture );
 ;***********************************************************************
 WELS_EXTERN SampleVariance16x16_sse2
-  %define SUM                   r10;[esp]
-  %define SUM_CUR               r11;[esp+4]
-  %define SQR                   r13;[esp+8]
-  %define SQR_CUR               r15;[esp+12]
+    %define SUM                   r10;[esp]
+    %define SUM_CUR               r11;[esp+4]
+    %define SQR                   r13;[esp+8]
+    %define SQR_CUR               r15;[esp+12]
 
-  push r12
-  push r13
-  push r14
-  push r15
-  %assign push_num 4
-  LOAD_5_PARA
-  PUSH_XMM 8
-  SIGN_EXTENSION r1,r1d
-  SIGN_EXTENSION r3,r3d
+    push r12
+    push r13
+    push r14
+    push r15
+    %assign push_num 4
+    LOAD_5_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r1,r1d
+    SIGN_EXTENSION r3,r3d
 
-  mov r12,010h
-  pxor xmm7, xmm7
-  movq SUM, xmm7
-  movq SUM_CUR,xmm7
-  movq SQR,xmm7
-  movq SQR_CUR,xmm7
+    mov r12,010h
+    pxor xmm7, xmm7
+    movq SUM, xmm7
+    movq SUM_CUR,xmm7
+    movq SQR,xmm7
+    movq SQR_CUR,xmm7
 
 .hloops:
-  mov r14,0
-  movdqa xmm0, [r0]             ; y_ref
-  movdqa xmm1, [r2]             ; y_src
-  movdqa xmm2, xmm0             ; store first for future process
-  movdqa xmm3, xmm1
-  ; sum += diff;
-  movdqa xmm4, xmm0
-  psadbw xmm4, xmm1             ; 2 parts, [0,..,15], [64,..,79]
-  ; to be continued for sum
-  pshufd xmm5, xmm4, 0C6h       ; 11000110 B
-  paddw xmm4, xmm5
-  movd r14d, xmm4
-  add SUM, r14
+    mov r14,0
+    movdqa xmm0, [r0]             ; y_ref
+    movdqa xmm1, [r2]             ; y_src
+    movdqa xmm2, xmm0             ; store first for future process
+    movdqa xmm3, xmm1
+    ; sum += diff;
+    movdqa xmm4, xmm0
+    psadbw xmm4, xmm1             ; 2 parts, [0,..,15], [64,..,79]
+    ; to be continued for sum
+    pshufd xmm5, xmm4, 0C6h       ; 11000110 B
+    paddw xmm4, xmm5
+    movd r14d, xmm4
+    add SUM, r14
 
-  ; sqr += diff * diff;
-  pmaxub xmm0, xmm1
-  pminub xmm1, xmm2
-  psubb xmm0, xmm1                              ; diff
-  SUM_SQR_SSE2 xmm1, xmm0, xmm7 ; dst, pSrc, zero
-  movd r14d, xmm1
-  add SQR, r14
+    ; sqr += diff * diff;
+    pmaxub xmm0, xmm1
+    pminub xmm1, xmm2
+    psubb xmm0, xmm1                              ; diff
+    SUM_SQR_SSE2 xmm1, xmm0, xmm7 ; dst, pSrc, zero
+    movd r14d, xmm1
+    add SQR, r14
 
-  ; sum_cur += y_src[x];
-  movdqa xmm0, xmm3             ; cur_orig
-  movdqa xmm1, xmm0
-  punpcklbw xmm0, xmm7
-  punpckhbw xmm1, xmm7
-  paddw xmm0, xmm1              ; 8x2
-  SUM_WORD_8x2_SSE2 xmm0, xmm1
-  movd r14d, xmm0
-  and r14, 0ffffh
-  add SUM_CUR, r14
+    ; sum_cur += y_src[x];
+    movdqa xmm0, xmm3             ; cur_orig
+    movdqa xmm1, xmm0
+    punpcklbw xmm0, xmm7
+    punpckhbw xmm1, xmm7
+    paddw xmm0, xmm1              ; 8x2
+    SUM_WORD_8x2_SSE2 xmm0, xmm1
+    movd r14d, xmm0
+    and r14, 0ffffh
+    add SUM_CUR, r14
 
-  ; sqr_cur += y_src[x] * y_src[x];
-  SUM_SQR_SSE2 xmm0, xmm3, xmm7 ; dst, pSrc, zero
-  movd r14d, xmm0
-  add SQR_CUR, r14
+    ; sqr_cur += y_src[x] * y_src[x];
+    SUM_SQR_SSE2 xmm0, xmm3, xmm7 ; dst, pSrc, zero
+    movd r14d, xmm0
+    add SQR_CUR, r14
 
-  lea r0, [r0+r1]
-  lea r2, [r2+r3]
-  dec r12
-  jnz near .hloops
+    lea r0, [r0+r1]
+    lea r2, [r2+r3]
+    dec r12
+    jnz near .hloops
 
-  mov r0, SUM
-  sar r0, 8
-  imul r0, r0
-  mov r1, SQR
-  sar r1, 8
-  sub r1, r0
-  mov [r4], r1w                         ; to store uiMotionIndex
-  mov r0, SUM_CUR
-  sar r0, 8
-  imul r0, r0
-  mov r1, SQR_CUR
-  sar r1, 8
-  sub r1, r0
-  mov [r4+2], r1w                               ; to store uiTextureIndex
+    mov r0, SUM
+    sar r0, 8
+    imul r0, r0
+    mov r1, SQR
+    sar r1, 8
+    sub r1, r0
+    mov [r4], r1w                         ; to store uiMotionIndex
+    mov r0, SUM_CUR
+    sar r0, 8
+    imul r0, r0
+    mov r1, SQR_CUR
+    sar r1, 8
+    sub r1, r0
+    mov [r4+2], r1w                               ; to store uiTextureIndex
 
-  POP_XMM
-  LOAD_5_PARA_POP
-  pop r15
-  pop r14
-  pop r13
-  pop r12
+    POP_XMM
+    LOAD_5_PARA_POP
+    pop r15
+    pop r14
+    pop r13
+    pop r12
 
 
-  %assign push_num 0
+    %assign push_num 0
 
-  ret
+    ret
 
 
 ;*************************************************************************************************************
@@ -550,69 +550,69 @@
 %define         psadframe                       r5
 %define         psad8x8                         r6
 
-  push r12
-  push r13
-  %assign push_num 2
-  LOAD_7_PARA
-  PUSH_XMM 8
-  SIGN_EXTENSION r2,r2d
-  SIGN_EXTENSION r3,r3d
-  SIGN_EXTENSION r4,r4d
+    push r12
+    push r13
+    %assign push_num 2
+    LOAD_7_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r2,r2d
+    SIGN_EXTENSION r3,r3d
+    SIGN_EXTENSION r4,r4d
 
-  mov   r12,r4
-  shr           r2,     4                                       ; iPicWidth/16
-  shr           r3,     4                                       ; iPicHeight/16
+    mov   r12,r4
+    shr           r2,     4                                       ; iPicWidth/16
+    shr           r3,     4                                       ; iPicHeight/16
 
-  shl           r12,    4                                                               ; iPicStride*16
-  pxor  xmm0,   xmm0
-  pxor  xmm7,   xmm7            ; iFrameSad
+    shl           r12,    4                                                               ; iPicStride*16
+    pxor  xmm0,   xmm0
+    pxor  xmm7,   xmm7            ; iFrameSad
 height_loop:
-  mov           r13,    r2
-  push  r0
-  push  r1
+    mov           r13,    r2
+    push  r0
+    push  r1
 width_loop:
-  pxor  xmm6,   xmm6
-  WELS_SAD_16x2_SSE2 r0,r1,r4
-  WELS_SAD_16x2_SSE2 r0,r1,r4
-  WELS_SAD_16x2_SSE2 r0,r1,r4
-  WELS_SAD_16x2_SSE2 r0,r1,r4
-  paddd xmm7,           xmm6
-  movd  [r6],           xmm6
-  psrldq        xmm6,           8
-  movd  [r6+4], xmm6
+    pxor  xmm6,   xmm6
+    WELS_SAD_16x2_SSE2 r0,r1,r4
+    WELS_SAD_16x2_SSE2 r0,r1,r4
+    WELS_SAD_16x2_SSE2 r0,r1,r4
+    WELS_SAD_16x2_SSE2 r0,r1,r4
+    paddd xmm7,           xmm6
+    movd  [r6],           xmm6
+    psrldq        xmm6,           8
+    movd  [r6+4], xmm6
 
-  pxor  xmm6,   xmm6
-  WELS_SAD_16x2_SSE2 r0,r1,r4
-  WELS_SAD_16x2_SSE2 r0,r1,r4
-  WELS_SAD_16x2_SSE2 r0,r1,r4
-  WELS_SAD_16x2_SSE2 r0,r1,r4
-  paddd xmm7,           xmm6
-  movd  [r6+8], xmm6
-  psrldq        xmm6,           8
-  movd  [r6+12],        xmm6
+    pxor  xmm6,   xmm6
+    WELS_SAD_16x2_SSE2 r0,r1,r4
+    WELS_SAD_16x2_SSE2 r0,r1,r4
+    WELS_SAD_16x2_SSE2 r0,r1,r4
+    WELS_SAD_16x2_SSE2 r0,r1,r4
+    paddd xmm7,           xmm6
+    movd  [r6+8], xmm6
+    psrldq        xmm6,           8
+    movd  [r6+12],        xmm6
 
-  add           r6,     16
-  sub           r0,     r12
-  sub           r1,     r12
-  add           r0,     16
-  add           r1,     16
+    add           r6,     16
+    sub           r0,     r12
+    sub           r1,     r12
+    add           r0,     16
+    add           r1,     16
 
-  dec           r13
-  jnz           width_loop
+    dec           r13
+    jnz           width_loop
 
-  pop           r1
-  pop           r0
-  add           r0,     r12
-  add           r1,     r12
+    pop           r1
+    pop           r0
+    add           r0,     r12
+    add           r1,     r12
 
-  dec   r3
-  jnz           height_loop
+    dec   r3
+    jnz           height_loop
 
-  ;mov          r13,    [psadframe]
-  movdqa        xmm5,   xmm7
-  psrldq        xmm7,   8
-  paddd xmm7,   xmm5
-  movd  [psadframe],    xmm7
+    ;mov          r13,    [psadframe]
+    movdqa        xmm5,   xmm7
+    psrldq        xmm7,   8
+    paddd xmm7,   xmm5
+    movd  [psadframe],    xmm7
 
 %undef          cur_data
 %undef          ref_data
@@ -622,12 +622,12 @@
 %undef          psadframe
 %undef          psad8x8
 %undef          pushsize
-  POP_XMM
-  LOAD_7_PARA_POP
-  pop r13
-  pop r12
-  %assign push_num 0
-  ret
+    POP_XMM
+    LOAD_7_PARA_POP
+    pop r13
+    pop r12
+    %assign push_num 0
+    ret
 
 %endif
 
@@ -653,103 +653,103 @@
 %define         tmp_esi                         esp + 0
 %define         tmp_edi                         esp + 4
 %define         pushsize                16
-  push  ebp
-  push  esi
-  push  edi
-  push  ebx
-  sub           esp,    localsize
-  mov           esi,    [cur_data]
-  mov           edi,    [ref_data]
-  mov           ebx,    [iPicStride]
-  mov           edx,    [psad8x8]
-  mov           eax,    ebx
+    push  ebp
+    push  esi
+    push  edi
+    push  ebx
+    sub           esp,    localsize
+    mov           esi,    [cur_data]
+    mov           edi,    [ref_data]
+    mov           ebx,    [iPicStride]
+    mov           edx,    [psad8x8]
+    mov           eax,    ebx
 
-  shr           dword [iPicWidth],      4                                       ; iPicWidth/16
-  shr           dword [iPicHeight],     4                                       ; iPicHeight/16
-  shl           eax,    4                                                       ; iPicStride*16
-  pxor  xmm0,   xmm0
-  pxor  xmm7,   xmm7            ; iFrameSad
+    shr           dword [iPicWidth],      4                                       ; iPicWidth/16
+    shr           dword [iPicHeight],     4                                       ; iPicHeight/16
+    shl           eax,    4                                                       ; iPicStride*16
+    pxor  xmm0,   xmm0
+    pxor  xmm7,   xmm7            ; iFrameSad
 var_height_loop:
-  mov           ecx,    dword [iPicWidth]
-  mov           [tmp_esi],      esi
-  mov           [tmp_edi],      edi
+    mov           ecx,    dword [iPicWidth]
+    mov           [tmp_esi],      esi
+    mov           [tmp_edi],      edi
 var_width_loop:
-  pxor  xmm6,   xmm6            ; hiQuad_loQuad pSad8x8
-  pxor  xmm5,   xmm5            ; pSum16x16
-  pxor  xmm4,   xmm4            ; sqsum_16x16
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
-  paddd xmm7,           xmm6
-  movd  [edx],          xmm6
-  psrldq        xmm6,           8
-  movd  [edx+4],        xmm6
+    pxor  xmm6,   xmm6            ; hiQuad_loQuad pSad8x8
+    pxor  xmm5,   xmm5            ; pSum16x16
+    pxor  xmm4,   xmm4            ; sqsum_16x16
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+    paddd xmm7,           xmm6
+    movd  [edx],          xmm6
+    psrldq        xmm6,           8
+    movd  [edx+4],        xmm6
 
-  pxor  xmm6,   xmm6
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
-  paddd xmm7,           xmm6
-  movd  [edx+8],        xmm6
-  psrldq        xmm6,           8
-  movd  [edx+12],       xmm6
+    pxor  xmm6,   xmm6
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+    paddd xmm7,           xmm6
+    movd  [edx+8],        xmm6
+    psrldq        xmm6,           8
+    movd  [edx+12],       xmm6
 
-  mov           ebp,    [psum16x16]
-  movdqa        xmm1,   xmm5
-  psrldq        xmm1,   8
-  paddd xmm5,   xmm1
-  movd  [ebp],  xmm5
-  add           dword [psum16x16], 4
+    mov           ebp,    [psum16x16]
+    movdqa        xmm1,   xmm5
+    psrldq        xmm1,   8
+    paddd xmm5,   xmm1
+    movd  [ebp],  xmm5
+    add           dword [psum16x16], 4
 
-  movdqa        xmm5,   xmm4
-  psrldq        xmm5,   8
-  paddd xmm4,   xmm5
-  movdqa        xmm3,   xmm4
-  psrldq        xmm3,   4
-  paddd xmm4,   xmm3
+    movdqa        xmm5,   xmm4
+    psrldq        xmm5,   8
+    paddd xmm4,   xmm5
+    movdqa        xmm3,   xmm4
+    psrldq        xmm3,   4
+    paddd xmm4,   xmm3
 
-  mov           ebp,    [psqsum16x16]
-  movd  [ebp],  xmm4
-  add           dword [psqsum16x16], 4
+    mov           ebp,    [psqsum16x16]
+    movd  [ebp],  xmm4
+    add           dword [psqsum16x16], 4
 
-  add           edx,    16
-  sub           esi,    eax
-  sub           edi,    eax
-  add           esi,    16
-  add           edi,    16
+    add           edx,    16
+    sub           esi,    eax
+    sub           edi,    eax
+    add           esi,    16
+    add           edi,    16
 
-  dec           ecx
-  jnz           var_width_loop
+    dec           ecx
+    jnz           var_width_loop
 
-  mov           esi,    [tmp_esi]
-  mov           edi,    [tmp_edi]
-  add           esi,    eax
-  add           edi,    eax
+    mov           esi,    [tmp_esi]
+    mov           edi,    [tmp_edi]
+    add           esi,    eax
+    add           edi,    eax
 
-  dec   dword [iPicHeight]
-  jnz           var_height_loop
+    dec   dword [iPicHeight]
+    jnz           var_height_loop
 
-  mov           edx,    [psadframe]
-  movdqa        xmm5,   xmm7
-  psrldq        xmm7,   8
-  paddd xmm7,   xmm5
-  movd  [edx],  xmm7
+    mov           edx,    [psadframe]
+    movdqa        xmm5,   xmm7
+    psrldq        xmm7,   8
+    paddd xmm7,   xmm5
+    movd  [edx],  xmm7
 
-  add           esp,    localsize
-  pop           ebx
-  pop           edi
-  pop           esi
-  pop           ebp
+    add           esp,    localsize
+    pop           ebx
+    pop           edi
+    pop           esi
+    pop           ebp
 %undef          cur_data
 %undef          ref_data
 %undef          iPicWidth
@@ -763,7 +763,7 @@
 %undef          tmp_edi
 %undef          pushsize
 %undef          localsize
-  ret
+    ret
 
 %else  ;64-bit
 
@@ -784,112 +784,112 @@
 %define         psum16x16                       arg8
 %define         psqsum16x16                 arg9
 
-  push r12
-  push r13
-  push r14
-  push r15
-  %assign push_num 4
-  PUSH_XMM 8
+    push r12
+    push r13
+    push r14
+    push r15
+    %assign push_num 4
+    PUSH_XMM 8
 
 %ifdef WIN64
-  mov r4, arg5  ;iPicStride
-  mov r5, arg6  ;psad8x8
+    mov r4, arg5  ;iPicStride
+    mov r5, arg6  ;psad8x8
 %endif
-  mov r14,arg7
-  SIGN_EXTENSION r2,r2d
-  SIGN_EXTENSION r3,r3d
-  SIGN_EXTENSION r4,r4d
+    mov r14,arg7
+    SIGN_EXTENSION r2,r2d
+    SIGN_EXTENSION r3,r3d
+    SIGN_EXTENSION r4,r4d
 
-  mov   r13,r4
-  shr   r2,4
-  shr   r3,4
+    mov   r13,r4
+    shr   r2,4
+    shr   r3,4
 
-  shl   r13,4   ; iPicStride*16
-  pxor  xmm0,   xmm0
-  pxor  xmm7,   xmm7            ; iFrameSad
+    shl   r13,4   ; iPicStride*16
+    pxor  xmm0,   xmm0
+    pxor  xmm7,   xmm7            ; iFrameSad
 var_height_loop:
-  push    r2
-  %assign push_num push_num+1
-  mov           r11,    r0
-  mov           r12,    r1
+    push    r2
+    %assign push_num push_num+1
+    mov           r11,    r0
+    mov           r12,    r1
 var_width_loop:
-  pxor  xmm6,   xmm6            ; hiQuad_loQuad pSad8x8
-  pxor  xmm5,   xmm5            ; pSum16x16
-  pxor  xmm4,   xmm4            ; sqsum_16x16
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
-  paddd xmm7,           xmm6
-  movd  [r14],          xmm6
-  psrldq        xmm6,           8
-  movd  [r14+4],        xmm6
+    pxor  xmm6,   xmm6            ; hiQuad_loQuad pSad8x8
+    pxor  xmm5,   xmm5            ; pSum16x16
+    pxor  xmm4,   xmm4            ; sqsum_16x16
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+    paddd xmm7,           xmm6
+    movd  [r14],          xmm6
+    psrldq        xmm6,           8
+    movd  [r14+4],        xmm6
 
-  pxor  xmm6,   xmm6
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
-  paddd   xmm7,           xmm6
-  movd    [r14+8],        xmm6
-  psrldq  xmm6,           8
-  movd    [r14+12],       xmm6
+    pxor  xmm6,   xmm6
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+    paddd   xmm7,           xmm6
+    movd    [r14+8],        xmm6
+    psrldq  xmm6,           8
+    movd    [r14+12],       xmm6
 
-  mov             r15,    psum16x16
-  movdqa  xmm1,   xmm5
-  psrldq  xmm1,   8
-  paddd   xmm5,   xmm1
-  movd    [r15],  xmm5
-  add             dword psum16x16, 4
+    mov             r15,    psum16x16
+    movdqa  xmm1,   xmm5
+    psrldq  xmm1,   8
+    paddd   xmm5,   xmm1
+    movd    [r15],  xmm5
+    add             dword psum16x16, 4
 
-  movdqa  xmm5,   xmm4
-  psrldq  xmm5,   8
-  paddd   xmm4,   xmm5
-  movdqa  xmm3,   xmm4
-  psrldq  xmm3,   4
-  paddd   xmm4,   xmm3
+    movdqa  xmm5,   xmm4
+    psrldq  xmm5,   8
+    paddd   xmm4,   xmm5
+    movdqa  xmm3,   xmm4
+    psrldq  xmm3,   4
+    paddd   xmm4,   xmm3
 
-  mov             r15,    psqsum16x16
-  movd    [r15],  xmm4
-  add             dword psqsum16x16, 4
+    mov             r15,    psqsum16x16
+    movd    [r15],  xmm4
+    add             dword psqsum16x16, 4
 
-  add             r14,16
-  sub             r0,     r13
-  sub             r1,     r13
-  add             r0,     16
-  add             r1,     16
+    add             r14,16
+    sub             r0,     r13
+    sub             r1,     r13
+    add             r0,     16
+    add             r1,     16
 
-  dec             r2
-  jnz             var_width_loop
+    dec             r2
+    jnz             var_width_loop
 
-  pop     r2
-  %assign push_num push_num-1
-  mov             r0,     r11
-  mov             r1,     r12
-  add             r0,     r13
-  add             r1,     r13
-  dec     r3
-  jnz             var_height_loop
+    pop     r2
+    %assign push_num push_num-1
+    mov             r0,     r11
+    mov             r1,     r12
+    add             r0,     r13
+    add             r1,     r13
+    dec     r3
+    jnz             var_height_loop
 
-  mov             r15,    psadframe
-  movdqa  xmm5,   xmm7
-  psrldq  xmm7,   8
-  paddd   xmm7,   xmm5
-  movd    [r15],  xmm7
+    mov             r15,    psadframe
+    movdqa  xmm5,   xmm7
+    psrldq  xmm7,   8
+    paddd   xmm7,   xmm5
+    movd    [r15],  xmm7
 
-  POP_XMM
-  pop r15
-  pop r14
-  pop r13
-  pop r12
+    POP_XMM
+    pop r15
+    pop r14
+    pop r13
+    pop r12
 %assign push_num 0
 %undef          cur_data
 %undef          ref_data
@@ -904,7 +904,7 @@
 %undef          tmp_edi
 %undef          pushsize
 %undef          localsize
-  ret
+    ret
 
 %endif
 
@@ -932,118 +932,118 @@
 %define         tmp_edi                         esp + 4
 %define         tmp_sadframe            esp + 8
 %define         pushsize                16
-  push    ebp
-  push    esi
-  push    edi
-  push    ebx
-  sub             esp,    localsize
+    push    ebp
+    push    esi
+    push    edi
+    push    ebx
+    sub             esp,    localsize
 
-  mov             ecx,    [iPicWidth]
-  mov             ecx,    [iPicHeight]
-  mov             esi,    [cur_data]
-  mov             edi,    [ref_data]
-  mov             ebx,    [iPicStride]
-  mov             edx,    [psad8x8]
-  mov             eax,    ebx
+    mov             ecx,    [iPicWidth]
+    mov             ecx,    [iPicHeight]
+    mov             esi,    [cur_data]
+    mov             edi,    [ref_data]
+    mov             ebx,    [iPicStride]
+    mov             edx,    [psad8x8]
+    mov             eax,    ebx
 
-  shr             dword [iPicWidth],      4                                       ; iPicWidth/16
-  shr             dword [iPicHeight],     4                                       ; iPicHeight/16
-  shl             eax,    4                                                       ; iPicStride*16
-  mov             ecx,    [iPicWidth]
-  mov             ecx,    [iPicHeight]
-  pxor    xmm0,   xmm0
-  movd    [tmp_sadframe], xmm0
+    shr             dword [iPicWidth],      4                                       ; iPicWidth/16
+    shr             dword [iPicHeight],     4                                       ; iPicHeight/16
+    shl             eax,    4                                                       ; iPicStride*16
+    mov             ecx,    [iPicWidth]
+    mov             ecx,    [iPicHeight]
+    pxor    xmm0,   xmm0
+    movd    [tmp_sadframe], xmm0
 sqdiff_height_loop:
-  mov             ecx,    dword [iPicWidth]
-  mov             [tmp_esi],      esi
-  mov             [tmp_edi],      edi
+    mov             ecx,    dword [iPicWidth]
+    mov             [tmp_esi],      esi
+    mov             [tmp_edi],      edi
 sqdiff_width_loop:
-  pxor    xmm7,   xmm7            ; hiQuad_loQuad pSad8x8
-  pxor    xmm6,   xmm6            ; pSum16x16
-  pxor    xmm5,   xmm5            ; sqsum_16x16  four dword
-  pxor    xmm4,   xmm4            ; sqdiff_16x16  four Dword
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
-  movdqa  xmm1,           xmm7
-  movd    [edx],          xmm7
-  psrldq  xmm7,           8
-  paddd   xmm1,           xmm7
-  movd    [edx+4],        xmm7
-  movd    ebp,            xmm1
-  add             [tmp_sadframe], ebp
+    pxor    xmm7,   xmm7            ; hiQuad_loQuad pSad8x8
+    pxor    xmm6,   xmm6            ; pSum16x16
+    pxor    xmm5,   xmm5            ; sqsum_16x16  four dword
+    pxor    xmm4,   xmm4            ; sqdiff_16x16  four Dword
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+    movdqa  xmm1,           xmm7
+    movd    [edx],          xmm7
+    psrldq  xmm7,           8
+    paddd   xmm1,           xmm7
+    movd    [edx+4],        xmm7
+    movd    ebp,            xmm1
+    add             [tmp_sadframe], ebp
 
-  pxor    xmm7,   xmm7
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
-  movdqa  xmm1,           xmm7
-  movd    [edx+8],        xmm7
-  psrldq  xmm7,           8
-  paddd   xmm1,           xmm7
-  movd    [edx+12],       xmm7
-  movd    ebp,            xmm1
-  add             [tmp_sadframe], ebp
+    pxor    xmm7,   xmm7
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+    movdqa  xmm1,           xmm7
+    movd    [edx+8],        xmm7
+    psrldq  xmm7,           8
+    paddd   xmm1,           xmm7
+    movd    [edx+12],       xmm7
+    movd    ebp,            xmm1
+    add             [tmp_sadframe], ebp
 
-  mov             ebp,    [psum16x16]
-  movdqa  xmm1,   xmm6
-  psrldq  xmm1,   8
-  paddd   xmm6,   xmm1
-  movd    [ebp],  xmm6
-  add             dword [psum16x16], 4
+    mov             ebp,    [psum16x16]
+    movdqa  xmm1,   xmm6
+    psrldq  xmm1,   8
+    paddd   xmm6,   xmm1
+    movd    [ebp],  xmm6
+    add             dword [psum16x16], 4
 
-  mov             ebp,    [psqsum16x16]
-  pshufd  xmm6,   xmm5,   14 ;00001110
-  paddd   xmm6,   xmm5
-  pshufd  xmm5,   xmm6,   1  ;00000001
-  paddd   xmm5,   xmm6
-  movd    [ebp],  xmm5
-  add             dword [psqsum16x16], 4
+    mov             ebp,    [psqsum16x16]
+    pshufd  xmm6,   xmm5,   14 ;00001110
+    paddd   xmm6,   xmm5
+    pshufd  xmm5,   xmm6,   1  ;00000001
+    paddd   xmm5,   xmm6
+    movd    [ebp],  xmm5
+    add             dword [psqsum16x16], 4
 
-  mov             ebp,    [psqdiff16x16]
-  pshufd  xmm5,   xmm4,   14      ; 00001110
-  paddd   xmm5,   xmm4
-  pshufd  xmm4,   xmm5,   1       ; 00000001
-  paddd   xmm4,   xmm5
-  movd    [ebp],  xmm4
-  add             dword   [psqdiff16x16], 4
+    mov             ebp,    [psqdiff16x16]
+    pshufd  xmm5,   xmm4,   14      ; 00001110
+    paddd   xmm5,   xmm4
+    pshufd  xmm4,   xmm5,   1       ; 00000001
+    paddd   xmm4,   xmm5
+    movd    [ebp],  xmm4
+    add             dword   [psqdiff16x16], 4
 
-  add             edx,    16
-  sub             esi,    eax
-  sub             edi,    eax
-  add             esi,    16
-  add             edi,    16
+    add             edx,    16
+    sub             esi,    eax
+    sub             edi,    eax
+    add             esi,    16
+    add             edi,    16
 
-  dec             ecx
-  jnz             sqdiff_width_loop
+    dec             ecx
+    jnz             sqdiff_width_loop
 
-  mov             esi,    [tmp_esi]
-  mov             edi,    [tmp_edi]
-  add             esi,    eax
-  add             edi,    eax
+    mov             esi,    [tmp_esi]
+    mov             edi,    [tmp_edi]
+    add             esi,    eax
+    add             edi,    eax
 
-  dec     dword [iPicHeight]
-  jnz             sqdiff_height_loop
+    dec     dword [iPicHeight]
+    jnz             sqdiff_height_loop
 
-  mov             ebx,    [tmp_sadframe]
-  mov             eax,    [psadframe]
-  mov             [eax],  ebx
+    mov             ebx,    [tmp_sadframe]
+    mov             eax,    [psadframe]
+    mov             [eax],  ebx
 
-  add             esp,    localsize
-  pop             ebx
-  pop             edi
-  pop             esi
-  pop             ebp
+    add             esp,    localsize
+    pop             ebx
+    pop             edi
+    pop             esi
+    pop             ebp
 %undef          cur_data
 %undef          ref_data
 %undef          iPicWidth
@@ -1059,7 +1059,7 @@
 %undef          tmp_sadframe
 %undef          pushsize
 %undef          localsize
-  ret
+    ret
 
 %else
 
@@ -1083,128 +1083,128 @@
 %define         psqsum16x16                     arg9;
 %define         psqdiff16x16                    arg10
 
-  push r12
-  push r13
-  push r14
-  push r15
-  %assign push_num 4
-  PUSH_XMM 10
+    push r12
+    push r13
+    push r14
+    push r15
+    %assign push_num 4
+    PUSH_XMM 10
 
 %ifdef WIN64
-  mov r4,arg5
+    mov r4,arg5
 %endif
-  mov r14,arg7
-  SIGN_EXTENSION r2,r2d
-  SIGN_EXTENSION r3,r3d
-  SIGN_EXTENSION r4,r4d
+    mov r14,arg7
+    SIGN_EXTENSION r2,r2d
+    SIGN_EXTENSION r3,r3d
+    SIGN_EXTENSION r4,r4d
 
-  mov        r13,r4
-  shr     r2,4   ; iPicWidth/16
-  shr     r3,4   ; iPicHeight/16
-  shl     r13,4   ; iPicStride*16
-  pxor    xmm0,   xmm0
-  pxor  xmm8, xmm8  ;framesad
-  pxor  xmm9, xmm9
+    mov        r13,r4
+    shr     r2,4   ; iPicWidth/16
+    shr     r3,4   ; iPicHeight/16
+    shl     r13,4   ; iPicStride*16
+    pxor    xmm0,   xmm0
+    pxor  xmm8, xmm8  ;framesad
+    pxor  xmm9, xmm9
 sqdiff_height_loop:
-  ;mov            ecx,    dword [iPicWidth]
-  ;mov      r14,r2
-  push r2
-  %assign push_num push_num +1
-  mov             r10,    r0
-  mov             r11,    r1
+    ;mov            ecx,    dword [iPicWidth]
+    ;mov      r14,r2
+    push r2
+    %assign push_num push_num +1
+    mov             r10,    r0
+    mov             r11,    r1
 sqdiff_width_loop:
-  pxor    xmm7,   xmm7            ; hiQuad_loQuad pSad8x8
-  pxor    xmm6,   xmm6            ; pSum16x16
-  pxor    xmm5,   xmm5            ; sqsum_16x16  four dword
-  pxor    xmm4,   xmm4            ; sqdiff_16x16  four Dword
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
-  movdqa  xmm1,           xmm7
-  movd    [r14],          xmm7
-  psrldq  xmm7,           8
-  paddd   xmm1,           xmm7
-  movd    [r14+4],        xmm7
-  movd    r15d,           xmm1
-  movd  xmm9, r15d
-  paddd xmm8,xmm9
+    pxor    xmm7,   xmm7            ; hiQuad_loQuad pSad8x8
+    pxor    xmm6,   xmm6            ; pSum16x16
+    pxor    xmm5,   xmm5            ; sqsum_16x16  four dword
+    pxor    xmm4,   xmm4            ; sqdiff_16x16  four Dword
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+    movdqa  xmm1,           xmm7
+    movd    [r14],          xmm7
+    psrldq  xmm7,           8
+    paddd   xmm1,           xmm7
+    movd    [r14+4],        xmm7
+    movd    r15d,           xmm1
+    movd  xmm9, r15d
+    paddd xmm8,xmm9
 
 
-  pxor    xmm7,   xmm7
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
-  movdqa  xmm1,           xmm7
-  movd    [r14+8],        xmm7
-  psrldq  xmm7,           8
-  paddd   xmm1,           xmm7
-  movd    [r14+12],       xmm7
-  movd    r15d,           xmm1
-  movd  xmm9, r15d
-  paddd xmm8,xmm9
+    pxor    xmm7,   xmm7
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+    movdqa  xmm1,           xmm7
+    movd    [r14+8],        xmm7
+    psrldq  xmm7,           8
+    paddd   xmm1,           xmm7
+    movd    [r14+12],       xmm7
+    movd    r15d,           xmm1
+    movd  xmm9, r15d
+    paddd xmm8,xmm9
 
-  mov             r15,    psum16x16
-  movdqa  xmm1,   xmm6
-  psrldq  xmm1,   8
-  paddd   xmm6,   xmm1
-  movd    [r15],  xmm6
-  add             dword psum16x16, 4
+    mov             r15,    psum16x16
+    movdqa  xmm1,   xmm6
+    psrldq  xmm1,   8
+    paddd   xmm6,   xmm1
+    movd    [r15],  xmm6
+    add             dword psum16x16, 4
 
-  mov             r15,    psqsum16x16
-  pshufd  xmm6,   xmm5,   14 ;00001110
-  paddd   xmm6,   xmm5
-  pshufd  xmm5,   xmm6,   1  ;00000001
-  paddd   xmm5,   xmm6
-  movd    [r15],  xmm5
-  add             dword psqsum16x16, 4
+    mov             r15,    psqsum16x16
+    pshufd  xmm6,   xmm5,   14 ;00001110
+    paddd   xmm6,   xmm5
+    pshufd  xmm5,   xmm6,   1  ;00000001
+    paddd   xmm5,   xmm6
+    movd    [r15],  xmm5
+    add             dword psqsum16x16, 4
 
-  mov             r15,    psqdiff16x16
-  pshufd  xmm5,   xmm4,   14      ; 00001110
-  paddd   xmm5,   xmm4
-  pshufd  xmm4,   xmm5,   1       ; 00000001
-  paddd   xmm4,   xmm5
-  movd    [r15],  xmm4
-  add             dword   psqdiff16x16,   4
+    mov             r15,    psqdiff16x16
+    pshufd  xmm5,   xmm4,   14      ; 00001110
+    paddd   xmm5,   xmm4
+    pshufd  xmm4,   xmm5,   1       ; 00000001
+    paddd   xmm4,   xmm5
+    movd    [r15],  xmm4
+    add             dword   psqdiff16x16,   4
 
-  add             r14,16
-  sub             r0,     r13
-  sub             r1,     r13
-  add             r0,     16
-  add             r1,     16
+    add             r14,16
+    sub             r0,     r13
+    sub             r1,     r13
+    add             r0,     16
+    add             r1,     16
 
-  dec             r2
-  jnz             sqdiff_width_loop
+    dec             r2
+    jnz             sqdiff_width_loop
 
-  pop r2
-  %assign push_num push_num -1
+    pop r2
+    %assign push_num push_num -1
 
-  mov             r0,     r10
-  mov             r1,     r11
-  add             r0,     r13
-  add             r1,     r13
+    mov             r0,     r10
+    mov             r1,     r11
+    add             r0,     r13
+    add             r1,     r13
 
-  dec     r3
-  jnz             sqdiff_height_loop
+    dec     r3
+    jnz             sqdiff_height_loop
 
-  mov             r13,    psadframe
-  movd    [r13],  xmm8
+    mov             r13,    psadframe
+    movd    [r13],  xmm8
 
-  POP_XMM
-  pop r15
-  pop r14
-  pop r13
-  pop r12
-  %assign push_num 0
+    POP_XMM
+    pop r15
+    pop r14
+    pop r13
+    pop r12
+    %assign push_num 0
 
 %undef          cur_data
 %undef          ref_data
@@ -1221,7 +1221,7 @@
 %undef          tmp_sadframe
 %undef          pushsize
 %undef          localsize
-  ret
+    ret
 
 
 
@@ -1249,145 +1249,145 @@
 %define         tmp_edi                         esp + 4
 %define         tmp_ecx                         esp + 8
 %define         pushsize                16
-  push    ebp
-  push    esi
-  push    edi
-  push    ebx
-  sub             esp,    localsize
-  mov             esi,    [cur_data]
-  mov             edi,    [ref_data]
-  mov             ebx,    [iPicStride]
-  mov             eax,    ebx
+    push    ebp
+    push    esi
+    push    edi
+    push    ebx
+    sub             esp,    localsize
+    mov             esi,    [cur_data]
+    mov             edi,    [ref_data]
+    mov             ebx,    [iPicStride]
+    mov             eax,    ebx
 
-  shr             dword [iPicWidth],      4                                       ; iPicWidth/16
-  shr             dword [iPicHeight],     4                                       ; iPicHeight/16
-  shl             eax,    4                                                       ; iPicStride*16
-  xor             ebp,    ebp
-  pxor    xmm0,   xmm0
+    shr             dword [iPicWidth],      4                                       ; iPicWidth/16
+    shr             dword [iPicHeight],     4                                       ; iPicHeight/16
+    shl             eax,    4                                                       ; iPicStride*16
+    xor             ebp,    ebp
+    pxor    xmm0,   xmm0
 bgd_height_loop:
-  mov             ecx,    dword [iPicWidth]
-  mov             [tmp_esi],      esi
-  mov             [tmp_edi],      edi
+    mov             ecx,    dword [iPicWidth]
+    mov             [tmp_esi],      esi
+    mov             [tmp_edi],      edi
 bgd_width_loop:
-  pxor    xmm7,   xmm7            ; pSad8x8
-  pxor    xmm6,   xmm6            ; sum_cur_8x8
-  pxor    xmm5,   xmm5            ; sum_ref_8x8
-  pxor    xmm4,   xmm4            ; pMad8x8
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
+    pxor    xmm7,   xmm7            ; pSad8x8
+    pxor    xmm6,   xmm6            ; sum_cur_8x8
+    pxor    xmm5,   xmm5            ; sum_ref_8x8
+    pxor    xmm4,   xmm4            ; pMad8x8
+    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
+    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
+    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
+    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
+    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
+    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
+    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
+    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
 
 
-  mov                     edx,            [p_mad8x8]
-  WELS_MAX_REG_SSE2       xmm4
+    mov                     edx,            [p_mad8x8]
+    WELS_MAX_REG_SSE2       xmm4
 
-  ;movdqa         xmm1,   xmm4
-  ;punpcklbw      xmm1,   xmm0
-  ;punpcklwd      xmm1,   xmm0
-  ;movd           [edx],  xmm1
-  ;punpckhbw      xmm4,   xmm0
-  ;punpcklwd      xmm4,   xmm0
-  ;movd           [edx+4],        xmm4
-  ;add                    edx,            8
-  ;mov                    [p_mad8x8],     edx
-  mov                     [tmp_ecx],      ecx
-  movhlps         xmm1,   xmm4
-  movd            ecx,    xmm4
-  mov                     [edx],  cl
-  movd            ecx,    xmm1
-  mov                     [edx+1],cl
-  add                     edx,    2
-  mov                     [p_mad8x8],     edx
+    ;movdqa         xmm1,   xmm4
+    ;punpcklbw      xmm1,   xmm0
+    ;punpcklwd      xmm1,   xmm0
+    ;movd           [edx],  xmm1
+    ;punpckhbw      xmm4,   xmm0
+    ;punpcklwd      xmm4,   xmm0
+    ;movd           [edx+4],        xmm4
+    ;add                    edx,            8
+    ;mov                    [p_mad8x8],     edx
+    mov                     [tmp_ecx],      ecx
+    movhlps         xmm1,   xmm4
+    movd            ecx,    xmm4
+    mov                     [edx],  cl
+    movd            ecx,    xmm1
+    mov                     [edx+1],cl
+    add                     edx,    2
+    mov                     [p_mad8x8],     edx
 
 
-  pslldq          xmm7,   4
-  pslldq          xmm6,   4
-  pslldq          xmm5,   4
+    pslldq          xmm7,   4
+    pslldq          xmm6,   4
+    pslldq          xmm5,   4
 
 
-  pxor    xmm4,   xmm4            ; pMad8x8
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
+    pxor    xmm4,   xmm4            ; pMad8x8
+    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
+    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
+    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
+    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
+    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
+    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
+    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
+    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
 
-  mov                     edx,            [p_mad8x8]
-  WELS_MAX_REG_SSE2       xmm4
+    mov                     edx,            [p_mad8x8]
+    WELS_MAX_REG_SSE2       xmm4
 
-  ;movdqa         xmm1,   xmm4
-  ;punpcklbw      xmm1,   xmm0
-  ;punpcklwd      xmm1,   xmm0
-  ;movd           [edx],  xmm1
-  ;punpckhbw      xmm4,   xmm0
-  ;punpcklwd      xmm4,   xmm0
-  ;movd           [edx+4],        xmm4
-  ;add                    edx,            8
-  ;mov                    [p_mad8x8],     edx
-  movhlps         xmm1,   xmm4
-  movd            ecx,    xmm4
-  mov                     [edx],  cl
-  movd            ecx,    xmm1
-  mov                     [edx+1],cl
-  add                     edx,    2
-  mov                     [p_mad8x8],     edx
+    ;movdqa         xmm1,   xmm4
+    ;punpcklbw      xmm1,   xmm0
+    ;punpcklwd      xmm1,   xmm0
+    ;movd           [edx],  xmm1
+    ;punpckhbw      xmm4,   xmm0
+    ;punpcklwd      xmm4,   xmm0
+    ;movd           [edx+4],        xmm4
+    ;add                    edx,            8
+    ;mov                    [p_mad8x8],     edx
+    movhlps         xmm1,   xmm4
+    movd            ecx,    xmm4
+    mov                     [edx],  cl
+    movd            ecx,    xmm1
+    mov                     [edx+1],cl
+    add                     edx,    2
+    mov                     [p_mad8x8],     edx
 
-  ; data in xmm7, xmm6, xmm5:  D1 D3 D0 D2
+    ; data in xmm7, xmm6, xmm5:  D1 D3 D0 D2
 
-  mov             edx,    [psad8x8]
-  pshufd  xmm1,   xmm7,   10001101b               ; D3 D2 D1 D0
-  movdqa  [edx],  xmm1
-  add             edx,    16
-  mov             [psad8x8],      edx                                     ; sad8x8
+    mov             edx,    [psad8x8]
+    pshufd  xmm1,   xmm7,   10001101b               ; D3 D2 D1 D0
+    movdqa  [edx],  xmm1
+    add             edx,    16
+    mov             [psad8x8],      edx                                     ; sad8x8
 
-  paddd   xmm1,   xmm7                                    ; D1+3 D3+2 D0+1 D2+0
-  pshufd  xmm2,   xmm1,   00000011b
-  paddd   xmm1,   xmm2
-  movd    edx,    xmm1
-  add             ebp,    edx                                             ; sad frame
+    paddd   xmm1,   xmm7                                    ; D1+3 D3+2 D0+1 D2+0
+    pshufd  xmm2,   xmm1,   00000011b
+    paddd   xmm1,   xmm2
+    movd    edx,    xmm1
+    add             ebp,    edx                                             ; sad frame
 
-  mov             edx,    [p_sd8x8]
-  psubd   xmm6,   xmm5
-  pshufd  xmm1,   xmm6,   10001101b
-  movdqa  [edx],  xmm1
-  add             edx,    16
-  mov             [p_sd8x8],      edx
+    mov             edx,    [p_sd8x8]
+    psubd   xmm6,   xmm5
+    pshufd  xmm1,   xmm6,   10001101b
+    movdqa  [edx],  xmm1
+    add             edx,    16
+    mov             [p_sd8x8],      edx
 
 
-  add             edx,    16
-  sub             esi,    eax
-  sub             edi,    eax
-  add             esi,    16
-  add             edi,    16
+    add             edx,    16
+    sub             esi,    eax
+    sub             edi,    eax
+    add             esi,    16
+    add             edi,    16
 
-  mov             ecx,    [tmp_ecx]
-  dec             ecx
-  jnz             bgd_width_loop
+    mov             ecx,    [tmp_ecx]
+    dec             ecx
+    jnz             bgd_width_loop
 
-  mov             esi,    [tmp_esi]
-  mov             edi,    [tmp_edi]
-  add             esi,    eax
-  add             edi,    eax
+    mov             esi,    [tmp_esi]
+    mov             edi,    [tmp_edi]
+    add             esi,    eax
+    add             edi,    eax
 
-  dec             dword [iPicHeight]
-  jnz             bgd_height_loop
+    dec             dword [iPicHeight]
+    jnz             bgd_height_loop
 
-  mov             edx,    [psadframe]
-  mov             [edx],  ebp
+    mov             edx,    [psadframe]
+    mov             [edx],  ebp
 
-  add             esp,    localsize
-  pop             ebx
-  pop             edi
-  pop             esi
-  pop             ebp
+    add             esp,    localsize
+    pop             ebx
+    pop             edi
+    pop             esi
+    pop             ebp
 %undef          cur_data
 %undef          ref_data
 %undef          iPicWidth
@@ -1401,7 +1401,7 @@
 %undef          tmp_edi
 %undef          pushsize
 %undef          localsize
-  ret
+    ret
 
 
 
@@ -1431,190 +1431,190 @@
 %define         tmp_sadframe            esp + 8
 %define         tmp_ecx                         esp + 12
 %define         pushsize                16
-  push    ebp
-  push    esi
-  push    edi
-  push    ebx
-  sub             esp,    localsize
-  mov             esi,    [cur_data]
-  mov             edi,    [ref_data]
-  mov             ebx,    [iPicStride]
-  mov             eax,    ebx
+    push    ebp
+    push    esi
+    push    edi
+    push    ebx
+    sub             esp,    localsize
+    mov             esi,    [cur_data]
+    mov             edi,    [ref_data]
+    mov             ebx,    [iPicStride]
+    mov             eax,    ebx
 
-  shr             dword [iPicWidth],      4                                       ; iPicWidth/16
-  shr             dword [iPicHeight],     4                                       ; iPicHeight/16
-  shl             eax,    4                                                       ; iPicStride*16
-  pxor    xmm0,   xmm0
-  movd    [tmp_sadframe], xmm0
+    shr             dword [iPicWidth],      4                                       ; iPicWidth/16
+    shr             dword [iPicHeight],     4                                       ; iPicHeight/16
+    shl             eax,    4                                                       ; iPicStride*16
+    pxor    xmm0,   xmm0
+    movd    [tmp_sadframe], xmm0
 sqdiff_bgd_height_loop:
-  mov             ecx,    dword [iPicWidth]
-  mov             [tmp_esi],      esi
-  mov             [tmp_edi],      edi
+    mov             ecx,    dword [iPicWidth]
+    mov             [tmp_esi],      esi
+    mov             [tmp_edi],      edi
 sqdiff_bgd_width_loop:
-  pxor    xmm7,   xmm7            ; pSad8x8 interleaves sqsum16x16:  sqsum1 sad1 sqsum0 sad0
-  pxor    xmm6,   xmm6            ; sum_8x8 interleaves cur and pRef in Dword,  Sref1 Scur1 Sref0 Scur0
-  pxor    xmm5,   xmm5            ; pMad8x8
-  pxor    xmm4,   xmm4            ; sqdiff_16x16  four Dword
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
+    pxor    xmm7,   xmm7            ; pSad8x8 interleaves sqsum16x16:  sqsum1 sad1 sqsum0 sad0
+    pxor    xmm6,   xmm6            ; sum_8x8 interleaves cur and pRef in Dword,  Sref1 Scur1 Sref0 Scur0
+    pxor    xmm5,   xmm5            ; pMad8x8
+    pxor    xmm4,   xmm4            ; sqdiff_16x16  four Dword
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
 
-  mov             edx,            [psad8x8]
-  movdqa  xmm2,           xmm7
-  pshufd  xmm1,           xmm2,           00001110b
-  movd    [edx],          xmm2
-  movd    [edx+4],        xmm1
-  add             edx,            8
-  mov             [psad8x8],      edx                     ; sad8x8
+    mov             edx,            [psad8x8]
+    movdqa  xmm2,           xmm7
+    pshufd  xmm1,           xmm2,           00001110b
+    movd    [edx],          xmm2
+    movd    [edx+4],        xmm1
+    add             edx,            8
+    mov             [psad8x8],      edx                     ; sad8x8
 
-  paddd   xmm1,                           xmm2
-  movd    edx,                            xmm1
-  add             [tmp_sadframe],         edx                     ; iFrameSad
+    paddd   xmm1,                           xmm2
+    movd    edx,                            xmm1
+    add             [tmp_sadframe],         edx                     ; iFrameSad
 
-  mov             edx,            [psum16x16]
-  movdqa  xmm1,           xmm6
-  pshufd  xmm2,           xmm1,           00001110b
-  paddd   xmm1,           xmm2
-  movd    [edx],          xmm1                            ; sum
+    mov             edx,            [psum16x16]
+    movdqa  xmm1,           xmm6
+    pshufd  xmm2,           xmm1,           00001110b
+    paddd   xmm1,           xmm2
+    movd    [edx],          xmm1                            ; sum
 
-  mov             edx,            [p_sd8x8]
-  pshufd  xmm1,           xmm6,           11110101b                       ; Sref1 Sref1 Sref0 Sref0
-  psubd   xmm6,           xmm1            ; 00 diff1 00 diff0
-  pshufd  xmm1,           xmm6,           00001000b                       ;  xx xx diff1 diff0
-  movq    [edx],          xmm1
-  add             edx,            8
-  mov             [p_sd8x8],      edx
+    mov             edx,            [p_sd8x8]
+    pshufd  xmm1,           xmm6,           11110101b                       ; Sref1 Sref1 Sref0 Sref0
+    psubd   xmm6,           xmm1            ; 00 diff1 00 diff0
+    pshufd  xmm1,           xmm6,           00001000b                       ;  xx xx diff1 diff0
+    movq    [edx],          xmm1
+    add             edx,            8
+    mov             [p_sd8x8],      edx
 
-  mov                     edx,            [p_mad8x8]
-  WELS_MAX_REG_SSE2       xmm5
-  ;movdqa         xmm1,   xmm5
-  ;punpcklbw      xmm1,   xmm0
-  ;punpcklwd      xmm1,   xmm0
-  ;movd           [edx],  xmm1
-  ;punpckhbw      xmm5,   xmm0
-  ;punpcklwd      xmm5,   xmm0
-  ;movd           [edx+4],        xmm5
-  ;add                    edx,            8
-  ;mov                    [p_mad8x8],     edx
-  mov                     [tmp_ecx],      ecx
-  movhlps         xmm1,   xmm5
-  movd            ecx,    xmm5
-  mov                     [edx],  cl
-  movd            ecx,    xmm1
-  mov                     [edx+1],cl
-  add                     edx,    2
-  mov                     [p_mad8x8],     edx
+    mov                     edx,            [p_mad8x8]
+    WELS_MAX_REG_SSE2       xmm5
+    ;movdqa         xmm1,   xmm5
+    ;punpcklbw      xmm1,   xmm0
+    ;punpcklwd      xmm1,   xmm0
+    ;movd           [edx],  xmm1
+    ;punpckhbw      xmm5,   xmm0
+    ;punpcklwd      xmm5,   xmm0
+    ;movd           [edx+4],        xmm5
+    ;add                    edx,            8
+    ;mov                    [p_mad8x8],     edx
+    mov                     [tmp_ecx],      ecx
+    movhlps         xmm1,   xmm5
+    movd            ecx,    xmm5
+    mov                     [edx],  cl
+    movd            ecx,    xmm1
+    mov                     [edx+1],cl
+    add                     edx,    2
+    mov                     [p_mad8x8],     edx
 
-  psrlq   xmm7,   32
-  psllq   xmm7,   32                      ; clear sad
-  pxor    xmm6,   xmm6            ; sum_8x8 interleaves cur and pRef in Dword,  Sref1 Scur1 Sref0 Scur0
-  pxor    xmm5,   xmm5            ; pMad8x8
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
+    psrlq   xmm7,   32
+    psllq   xmm7,   32                      ; clear sad
+    pxor    xmm6,   xmm6            ; sum_8x8 interleaves cur and pRef in Dword,  Sref1 Scur1 Sref0 Scur0
+    pxor    xmm5,   xmm5            ; pMad8x8
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
 
-  mov             edx,            [psad8x8]
-  movdqa  xmm2,           xmm7
-  pshufd  xmm1,           xmm2,           00001110b
-  movd    [edx],          xmm2
-  movd    [edx+4],        xmm1
-  add             edx,            8
-  mov             [psad8x8],      edx                     ; sad8x8
+    mov             edx,            [psad8x8]
+    movdqa  xmm2,           xmm7
+    pshufd  xmm1,           xmm2,           00001110b
+    movd    [edx],          xmm2
+    movd    [edx+4],        xmm1
+    add             edx,            8
+    mov             [psad8x8],      edx                     ; sad8x8
 
-  paddd   xmm1,                           xmm2
-  movd    edx,                            xmm1
-  add             [tmp_sadframe],         edx                     ; iFrameSad
+    paddd   xmm1,                           xmm2
+    movd    edx,                            xmm1
+    add             [tmp_sadframe],         edx                     ; iFrameSad
 
-  mov             edx,                    [psum16x16]
-  movdqa  xmm1,                   xmm6
-  pshufd  xmm2,                   xmm1,           00001110b
-  paddd   xmm1,                   xmm2
-  movd    ebp,                    xmm1                            ; sum
-  add             [edx],                  ebp
-  add             edx,                    4
-  mov             [psum16x16],    edx
+    mov             edx,                    [psum16x16]
+    movdqa  xmm1,                   xmm6
+    pshufd  xmm2,                   xmm1,           00001110b
+    paddd   xmm1,                   xmm2
+    movd    ebp,                    xmm1                            ; sum
+    add             [edx],                  ebp
+    add             edx,                    4
+    mov             [psum16x16],    edx
 
-  mov             edx,                    [psqsum16x16]
-  psrlq   xmm7,                   32
-  pshufd  xmm2,                   xmm7,           00001110b
-  paddd   xmm2,                   xmm7
-  movd    [edx],                  xmm2                            ; sqsum
-  add             edx,                    4
-  mov             [psqsum16x16],  edx
+    mov             edx,                    [psqsum16x16]
+    psrlq   xmm7,                   32
+    pshufd  xmm2,                   xmm7,           00001110b
+    paddd   xmm2,                   xmm7
+    movd    [edx],                  xmm2                            ; sqsum
+    add             edx,                    4
+    mov             [psqsum16x16],  edx
 
-  mov             edx,            [p_sd8x8]
-  pshufd  xmm1,           xmm6,           11110101b                       ; Sref1 Sref1 Sref0 Sref0
-  psubd   xmm6,           xmm1            ; 00 diff1 00 diff0
-  pshufd  xmm1,           xmm6,           00001000b                       ;  xx xx diff1 diff0
-  movq    [edx],          xmm1
-  add             edx,            8
-  mov             [p_sd8x8],      edx
+    mov             edx,            [p_sd8x8]
+    pshufd  xmm1,           xmm6,           11110101b                       ; Sref1 Sref1 Sref0 Sref0
+    psubd   xmm6,           xmm1            ; 00 diff1 00 diff0
+    pshufd  xmm1,           xmm6,           00001000b                       ;  xx xx diff1 diff0
+    movq    [edx],          xmm1
+    add             edx,            8
+    mov             [p_sd8x8],      edx
 
-  mov             edx,            [p_mad8x8]
-  WELS_MAX_REG_SSE2       xmm5
-  ;movdqa         xmm1,   xmm5
-  ;punpcklbw      xmm1,   xmm0
-  ;punpcklwd      xmm1,   xmm0
-  ;movd           [edx],  xmm1
-  ;punpckhbw      xmm5,   xmm0
-  ;punpcklwd      xmm5,   xmm0
-  ;movd           [edx+4],        xmm5
-  ;add                    edx,            8
-  ;mov                    [p_mad8x8],     edx
-  movhlps         xmm1,   xmm5
-  movd            ecx,    xmm5
-  mov                     [edx],  cl
-  movd            ecx,    xmm1
-  mov                     [edx+1],cl
-  add                     edx,    2
-  mov                     [p_mad8x8],     edx
+    mov             edx,            [p_mad8x8]
+    WELS_MAX_REG_SSE2       xmm5
+    ;movdqa         xmm1,   xmm5
+    ;punpcklbw      xmm1,   xmm0
+    ;punpcklwd      xmm1,   xmm0
+    ;movd           [edx],  xmm1
+    ;punpckhbw      xmm5,   xmm0
+    ;punpcklwd      xmm5,   xmm0
+    ;movd           [edx+4],        xmm5
+    ;add                    edx,            8
+    ;mov                    [p_mad8x8],     edx
+    movhlps         xmm1,   xmm5
+    movd            ecx,    xmm5
+    mov                     [edx],  cl
+    movd            ecx,    xmm1
+    mov                     [edx+1],cl
+    add                     edx,    2
+    mov                     [p_mad8x8],     edx
 
-  mov             edx,            [psqdiff16x16]
-  pshufd  xmm1,           xmm4,           00001110b
-  paddd   xmm4,           xmm1
-  pshufd  xmm1,           xmm4,           00000001b
-  paddd   xmm4,           xmm1
-  movd    [edx],          xmm4
-  add             edx,            4
-  mov             [psqdiff16x16], edx
+    mov             edx,            [psqdiff16x16]
+    pshufd  xmm1,           xmm4,           00001110b
+    paddd   xmm4,           xmm1
+    pshufd  xmm1,           xmm4,           00000001b
+    paddd   xmm4,           xmm1
+    movd    [edx],          xmm4
+    add             edx,            4
+    mov             [psqdiff16x16], edx
 
-  add             edx,    16
-  sub             esi,    eax
-  sub             edi,    eax
-  add             esi,    16
-  add             edi,    16
+    add             edx,    16
+    sub             esi,    eax
+    sub             edi,    eax
+    add             esi,    16
+    add             edi,    16
 
-  mov             ecx,    [tmp_ecx]
-  dec             ecx
-  jnz             sqdiff_bgd_width_loop
+    mov             ecx,    [tmp_ecx]
+    dec             ecx
+    jnz             sqdiff_bgd_width_loop
 
-  mov             esi,    [tmp_esi]
-  mov             edi,    [tmp_edi]
-  add             esi,    eax
-  add             edi,    eax
+    mov             esi,    [tmp_esi]
+    mov             edi,    [tmp_edi]
+    add             esi,    eax
+    add             edi,    eax
 
-  dec     dword [iPicHeight]
-  jnz             sqdiff_bgd_height_loop
+    dec     dword [iPicHeight]
+    jnz             sqdiff_bgd_height_loop
 
-  mov             edx,    [psadframe]
-  mov             ebp,    [tmp_sadframe]
-  mov             [edx],  ebp
+    mov             edx,    [psadframe]
+    mov             ebp,    [tmp_sadframe]
+    mov             [edx],  ebp
 
-  add             esp,    localsize
-  pop             ebx
-  pop             edi
-  pop             esi
-  pop             ebp
+    add             esp,    localsize
+    pop             ebx
+    pop             edi
+    pop             esi
+    pop             ebp
 %undef          cur_data
 %undef          ref_data
 %undef          iPicWidth
@@ -1631,7 +1631,7 @@
 %undef          tmp_edi
 %undef          pushsize
 %undef          localsize
-   ret
+    ret
 %else
 
 ;*************************************************************************************************************
@@ -1651,142 +1651,142 @@
 %define         p_sd8x8                         arg8;
 %define         p_mad8x8                        arg9;
 
-  push r12
-  push r13
-  push r14
-  push r15
+    push r12
+    push r13
+    push r14
+    push r15
 %assign push_num 4
-  PUSH_XMM 10
+    PUSH_XMM 10
 %ifdef WIN64
-  mov r4,arg5
-  ;  mov r5,arg6
+    mov r4,arg5
+    ;  mov r5,arg6
 %endif
-  mov r14,arg7
-  SIGN_EXTENSION r2,r2d
-  SIGN_EXTENSION r3,r3d
-  SIGN_EXTENSION r4,r4d
+    mov r14,arg7
+    SIGN_EXTENSION r2,r2d
+    SIGN_EXTENSION r3,r3d
+    SIGN_EXTENSION r4,r4d
 
 
-  mov     r13,r4
-  mov     r15,r0
-  shr     r2,4
-  shr     r3,4
-  shl     r13,4
-  pxor    xmm0,   xmm0
-  pxor    xmm8,   xmm8
-  pxor    xmm9,   xmm9
+    mov     r13,r4
+    mov     r15,r0
+    shr     r2,4
+    shr     r3,4
+    shl     r13,4
+    pxor    xmm0,   xmm0
+    pxor    xmm8,   xmm8
+    pxor    xmm9,   xmm9
 bgd_height_loop:
-  ;mov            ecx,    dword [iPicWidth]
-  push r2
-  %assign push_num push_num+1
-  mov             r10,    r15
-  mov             r11,    r1
+    ;mov            ecx,    dword [iPicWidth]
+    push r2
+    %assign push_num push_num+1
+    mov             r10,    r15
+    mov             r11,    r1
 bgd_width_loop:
-  pxor    xmm7,   xmm7            ; pSad8x8
-  pxor    xmm6,   xmm6            ; sum_cur_8x8
-  pxor    xmm5,   xmm5            ; sum_ref_8x8
-  pxor    xmm4,   xmm4            ; pMad8x8
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
+    pxor    xmm7,   xmm7            ; pSad8x8
+    pxor    xmm6,   xmm6            ; sum_cur_8x8
+    pxor    xmm5,   xmm5            ; sum_ref_8x8
+    pxor    xmm4,   xmm4            ; pMad8x8
+    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
+    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
+    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
+    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
+    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
+    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
+    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
+    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
 
 
-  mov                     r14,            p_mad8x8
-  WELS_MAX_REG_SSE2       xmm4
+    mov                     r14,            p_mad8x8
+    WELS_MAX_REG_SSE2       xmm4
 
-  ;mov                    [tmp_ecx],      ecx
-  movhlps         xmm1,   xmm4
-  movd            r0d,    xmm4
+    ;mov                    [tmp_ecx],      ecx
+    movhlps         xmm1,   xmm4
+    movd            r0d,    xmm4
 
 
-  mov                     [r14],  r0b
-  movd            r0d,    xmm1
-  mov                     [r14+1],r0b
-  add                     r14,    2
-  ;mov                     p_mad8x8,       r14
+    mov                     [r14],  r0b
+    movd            r0d,    xmm1
+    mov                     [r14+1],r0b
+    add                     r14,    2
+    ;mov                     p_mad8x8,       r14
 
 
-  pslldq          xmm7,   4
-  pslldq          xmm6,   4
-  pslldq          xmm5,   4
+    pslldq          xmm7,   4
+    pslldq          xmm6,   4
+    pslldq          xmm5,   4
 
 
-  pxor    xmm4,   xmm4            ; pMad8x8
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
+    pxor    xmm4,   xmm4            ; pMad8x8
+    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
+    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
+    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
+    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
+    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
+    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
+    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
+    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
 
-  ;mov                     r14,            [p_mad8x8]
-  WELS_MAX_REG_SSE2       xmm4
+    ;mov                     r14,            [p_mad8x8]
+    WELS_MAX_REG_SSE2       xmm4
 
-  movhlps         xmm1,   xmm4
-  movd            r0d,    xmm4
-  mov                     [r14],  r0b
-  movd            r0d,    xmm1
-  mov                     [r14+1],r0b
-  add                     r14,    2
-  mov                     p_mad8x8,       r14
+    movhlps         xmm1,   xmm4
+    movd            r0d,    xmm4
+    mov                     [r14],  r0b
+    movd            r0d,    xmm1
+    mov                     [r14+1],r0b
+    add                     r14,    2
+    mov                     p_mad8x8,       r14
 
-  ; data in xmm7, xmm6, xmm5:  D1 D3 D0 D2
+    ; data in xmm7, xmm6, xmm5:  D1 D3 D0 D2
 
-  mov             r14,    psad8x8
-  pshufd  xmm1,   xmm7,   10001101b               ; D3 D2 D1 D0
-  movdqa  [r14],  xmm1
-  add             r14,    16
-  mov             psad8x8,        r14                                     ; sad8x8
+    mov             r14,    psad8x8
+    pshufd  xmm1,   xmm7,   10001101b               ; D3 D2 D1 D0
+    movdqa  [r14],  xmm1
+    add             r14,    16
+    mov             psad8x8,        r14                                     ; sad8x8
 
-  paddd   xmm1,   xmm7                                    ; D1+3 D3+2 D0+1 D2+0
-  pshufd  xmm2,   xmm1,   00000011b
-  paddd   xmm1,   xmm2
-  movd    r14d,   xmm1
-  movd    xmm9, r14d
-  paddd   xmm8,   xmm9                                            ; sad frame
+    paddd   xmm1,   xmm7                                    ; D1+3 D3+2 D0+1 D2+0
+    pshufd  xmm2,   xmm1,   00000011b
+    paddd   xmm1,   xmm2
+    movd    r14d,   xmm1
+    movd    xmm9, r14d
+    paddd   xmm8,   xmm9                                            ; sad frame
 
-  mov             r14,    p_sd8x8
-  psubd   xmm6,   xmm5
-  pshufd  xmm1,   xmm6,   10001101b
-  movdqa  [r14],  xmm1
-  add             r14,    16
-  mov             p_sd8x8,        r14
+    mov             r14,    p_sd8x8
+    psubd   xmm6,   xmm5
+    pshufd  xmm1,   xmm6,   10001101b
+    movdqa  [r14],  xmm1
+    add             r14,    16
+    mov             p_sd8x8,        r14
 
 
-  ;add            edx,    16
-  sub             r15,    r13
-  sub             r1,     r13
-  add             r15,    16
-  add             r1,     16
+    ;add            edx,    16
+    sub             r15,    r13
+    sub             r1,     r13
+    add             r15,    16
+    add             r1,     16
 
 
-  dec             r2
-  jnz             bgd_width_loop
-  pop     r2
+    dec             r2
+    jnz             bgd_width_loop
+    pop     r2
 %assign push_num push_num-1
-  mov             r15,    r10
-  mov             r1,     r11
-  add             r15,    r13
-  add             r1,     r13
+    mov             r15,    r10
+    mov             r1,     r11
+    add             r15,    r13
+    add             r1,     r13
 
-  dec             r3
-  jnz             bgd_height_loop
+    dec             r3
+    jnz             bgd_height_loop
 
-  mov             r13,    psadframe
-  movd    [r13],  xmm8
+    mov             r13,    psadframe
+    movd    [r13],  xmm8
 
-  POP_XMM
-  pop r15
-  pop r14
-  pop r13
-  pop r12
+    POP_XMM
+    pop r15
+    pop r14
+    pop r13
+    pop r12
 %assign push_num 0
 %undef          cur_data
 %undef          ref_data
@@ -1801,7 +1801,7 @@
 %undef          tmp_edi
 %undef          pushsize
 %undef          localsize
-  ret
+    ret
 
 
 
@@ -1826,189 +1826,189 @@
 %define         p_sd8x8                         arg11
 %define         p_mad8x8                        arg12
 
-  push r12
-  push r13
-  push r14
-  push r15
+    push r12
+    push r13
+    push r14
+    push r15
 %assign push_num 4
-  PUSH_XMM 10
+    PUSH_XMM 10
 %ifdef WIN64
-  mov r4,arg5
-  ;mov r5,arg6
+    mov r4,arg5
+    ;mov r5,arg6
 %endif
-  SIGN_EXTENSION r2,r2d
-  SIGN_EXTENSION r3,r3d
-  SIGN_EXTENSION r4,r4d
+    SIGN_EXTENSION r2,r2d
+    SIGN_EXTENSION r3,r3d
+    SIGN_EXTENSION r4,r4d
 
-  mov     r13,r4
-  shr             r2,     4                                       ; iPicWidth/16
-  shr             r3,     4                                       ; iPicHeight/16
-  shl             r13,    4                                                       ; iPicStride*16
-  pxor    xmm0,   xmm0
-  pxor    xmm8,   xmm8
-  pxor    xmm9,   xmm9
+    mov     r13,r4
+    shr             r2,     4                                       ; iPicWidth/16
+    shr             r3,     4                                       ; iPicHeight/16
+    shl             r13,    4                                                       ; iPicStride*16
+    pxor    xmm0,   xmm0
+    pxor    xmm8,   xmm8
+    pxor    xmm9,   xmm9
 
 
 sqdiff_bgd_height_loop:
-  mov             r10,    r0
-  mov             r11,    r1
-  push r2
+    mov             r10,    r0
+    mov             r11,    r1
+    push r2
 %assign push_num push_num+1
 sqdiff_bgd_width_loop:
 
-  pxor    xmm7,   xmm7            ; pSad8x8 interleaves sqsum16x16:  sqsum1 sad1 sqsum0 sad0
-  pxor    xmm6,   xmm6            ; sum_8x8 interleaves cur and pRef in Dword,  Sref1 Scur1 Sref0 Scur0
-  pxor    xmm5,   xmm5            ; pMad8x8
-  pxor    xmm4,   xmm4            ; sqdiff_16x16  four Dword
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
+    pxor    xmm7,   xmm7            ; pSad8x8 interleaves sqsum16x16:  sqsum1 sad1 sqsum0 sad0
+    pxor    xmm6,   xmm6            ; sum_8x8 interleaves cur and pRef in Dword,  Sref1 Scur1 Sref0 Scur0
+    pxor    xmm5,   xmm5            ; pMad8x8
+    pxor    xmm4,   xmm4            ; sqdiff_16x16  four Dword
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
 
-  mov             r14,            psad8x8
-  movdqa  xmm2,           xmm7
-  pshufd  xmm1,           xmm2,           00001110b
-  movd    [r14],          xmm2
-  movd    [r14+4],        xmm1
-  add             r14,            8
-  mov             psad8x8,        r14                     ; sad8x8
+    mov             r14,            psad8x8
+    movdqa  xmm2,           xmm7
+    pshufd  xmm1,           xmm2,           00001110b
+    movd    [r14],          xmm2
+    movd    [r14+4],        xmm1
+    add             r14,            8
+    mov             psad8x8,        r14                     ; sad8x8
 
-  paddd   xmm1,                           xmm2
-  movd    r14d,                           xmm1
-  movd    xmm9,r14d
-  paddd           xmm8,           xmm9                    ; iFrameSad
+    paddd   xmm1,                           xmm2
+    movd    r14d,                           xmm1
+    movd    xmm9,r14d
+    paddd           xmm8,           xmm9                    ; iFrameSad
 
-  mov             r14,            psum16x16
-  movdqa  xmm1,           xmm6
-  pshufd  xmm2,           xmm1,           00001110b
-  paddd   xmm1,           xmm2
-  movd    [r14],          xmm1                            ; sum
+    mov             r14,            psum16x16
+    movdqa  xmm1,           xmm6
+    pshufd  xmm2,           xmm1,           00001110b
+    paddd   xmm1,           xmm2
+    movd    [r14],          xmm1                            ; sum
 
-  mov             r14,            p_sd8x8
-  pshufd  xmm1,           xmm6,           11110101b                       ; Sref1 Sref1 Sref0 Sref0
-  psubd   xmm6,           xmm1            ; 00 diff1 00 diff0
-  pshufd  xmm1,           xmm6,           00001000b                       ;  xx xx diff1 diff0
-  movq    [r14],          xmm1
-  add             r14,            8
-  mov             p_sd8x8,        r14
+    mov             r14,            p_sd8x8
+    pshufd  xmm1,           xmm6,           11110101b                       ; Sref1 Sref1 Sref0 Sref0
+    psubd   xmm6,           xmm1            ; 00 diff1 00 diff0
+    pshufd  xmm1,           xmm6,           00001000b                       ;  xx xx diff1 diff0
+    movq    [r14],          xmm1
+    add             r14,            8
+    mov             p_sd8x8,        r14
 
-  mov                     r14,            p_mad8x8
-  WELS_MAX_REG_SSE2       xmm5
+    mov                     r14,            p_mad8x8
+    WELS_MAX_REG_SSE2       xmm5
 
-  movhlps         xmm1,   xmm5
-  push r0
-  movd            r0d,    xmm5
-  mov                     [r14],  r0b
-  movd            r0d,    xmm1
-  mov                     [r14+1],r0b
-  pop r0
-  add                     r14,    2
-  mov                     p_mad8x8,       r14
+    movhlps         xmm1,   xmm5
+    push r0
+    movd            r0d,    xmm5
+    mov                     [r14],  r0b
+    movd            r0d,    xmm1
+    mov                     [r14+1],r0b
+    pop r0
+    add                     r14,    2
+    mov                     p_mad8x8,       r14
 
-  psrlq   xmm7,   32
-  psllq   xmm7,   32                      ; clear sad
-  pxor    xmm6,   xmm6            ; sum_8x8 interleaves cur and pRef in Dword,  Sref1 Scur1 Sref0 Scur0
-  pxor    xmm5,   xmm5            ; pMad8x8
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
+    psrlq   xmm7,   32
+    psllq   xmm7,   32                      ; clear sad
+    pxor    xmm6,   xmm6            ; sum_8x8 interleaves cur and pRef in Dword,  Sref1 Scur1 Sref0 Scur0
+    pxor    xmm5,   xmm5            ; pMad8x8
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
 
-  mov             r14,            psad8x8
-  movdqa  xmm2,           xmm7
-  pshufd  xmm1,           xmm2,           00001110b
-  movd    [r14],          xmm2
-  movd    [r14+4],        xmm1
-  add             r14,            8
-  mov             psad8x8,        r14                     ; sad8x8
+    mov             r14,            psad8x8
+    movdqa  xmm2,           xmm7
+    pshufd  xmm1,           xmm2,           00001110b
+    movd    [r14],          xmm2
+    movd    [r14+4],        xmm1
+    add             r14,            8
+    mov             psad8x8,        r14                     ; sad8x8
 
-  paddd   xmm1,                           xmm2
-  movd    r14d,                           xmm1
-  movd    xmm9, r14d
-  paddd   xmm8,           xmm9            ; iFrameSad
+    paddd   xmm1,                           xmm2
+    movd    r14d,                           xmm1
+    movd    xmm9, r14d
+    paddd   xmm8,           xmm9            ; iFrameSad
 
-  mov             r14,                    psum16x16
-  movdqa  xmm1,                   xmm6
-  pshufd  xmm2,                   xmm1,           00001110b
-  paddd   xmm1,                   xmm2
-  movd    r15d,                   xmm1                            ; sum
-  add             [r14],                  r15d
-  add             r14,                    4
-  mov             psum16x16,      r14
+    mov             r14,                    psum16x16
+    movdqa  xmm1,                   xmm6
+    pshufd  xmm2,                   xmm1,           00001110b
+    paddd   xmm1,                   xmm2
+    movd    r15d,                   xmm1                            ; sum
+    add             [r14],                  r15d
+    add             r14,                    4
+    mov             psum16x16,      r14
 
-  mov             r14,                    psqsum16x16
-  psrlq   xmm7,                   32
-  pshufd  xmm2,                   xmm7,           00001110b
-  paddd   xmm2,                   xmm7
-  movd    [r14],                  xmm2                            ; sqsum
-  add             r14,                    4
-  mov             psqsum16x16,    r14
+    mov             r14,                    psqsum16x16
+    psrlq   xmm7,                   32
+    pshufd  xmm2,                   xmm7,           00001110b
+    paddd   xmm2,                   xmm7
+    movd    [r14],                  xmm2                            ; sqsum
+    add             r14,                    4
+    mov             psqsum16x16,    r14
 
-  mov             r14,            p_sd8x8
-  pshufd  xmm1,           xmm6,           11110101b                       ; Sref1 Sref1 Sref0 Sref0
-  psubd   xmm6,           xmm1            ; 00 diff1 00 diff0
-  pshufd  xmm1,           xmm6,           00001000b                       ;  xx xx diff1 diff0
-  movq    [r14],          xmm1
-  add             r14,            8
-  mov             p_sd8x8,        r14
+    mov             r14,            p_sd8x8
+    pshufd  xmm1,           xmm6,           11110101b                       ; Sref1 Sref1 Sref0 Sref0
+    psubd   xmm6,           xmm1            ; 00 diff1 00 diff0
+    pshufd  xmm1,           xmm6,           00001000b                       ;  xx xx diff1 diff0
+    movq    [r14],          xmm1
+    add             r14,            8
+    mov             p_sd8x8,        r14
 
-  mov             r14,            p_mad8x8
-  WELS_MAX_REG_SSE2       xmm5
+    mov             r14,            p_mad8x8
+    WELS_MAX_REG_SSE2       xmm5
 
 
-  movhlps         xmm1,   xmm5
-  push r0
-  movd            r0d,    xmm5
-  mov                     [r14],  r0b
-  movd            r0d,    xmm1
-  mov                     [r14+1],r0b
-  pop r0
-  add                     r14,    2
-  mov                     p_mad8x8,       r14
+    movhlps         xmm1,   xmm5
+    push r0
+    movd            r0d,    xmm5
+    mov                     [r14],  r0b
+    movd            r0d,    xmm1
+    mov                     [r14+1],r0b
+    pop r0
+    add                     r14,    2
+    mov                     p_mad8x8,       r14
 
-  mov             r14,            psqdiff16x16
-  pshufd  xmm1,           xmm4,           00001110b
-  paddd   xmm4,           xmm1
-  pshufd  xmm1,           xmm4,           00000001b
-  paddd   xmm4,           xmm1
-  movd    [r14],          xmm4
-  add             r14,            4
-  mov             psqdiff16x16,   r14
+    mov             r14,            psqdiff16x16
+    pshufd  xmm1,           xmm4,           00001110b
+    paddd   xmm4,           xmm1
+    pshufd  xmm1,           xmm4,           00000001b
+    paddd   xmm4,           xmm1
+    movd    [r14],          xmm4
+    add             r14,            4
+    mov             psqdiff16x16,   r14
 
-  add             r14,    16
-  sub             r0,     r13
-  sub             r1,     r13
-  add             r0,     16
-  add             r1,     16
+    add             r14,    16
+    sub             r0,     r13
+    sub             r1,     r13
+    add             r0,     16
+    add             r1,     16
 
-  dec             r2
-  jnz             sqdiff_bgd_width_loop
-  pop r2
-  %assign push_num push_num-1
-  mov             r0,     r10
-  mov             r1,     r11
-  add             r0,     r13
-  add             r1,     r13
+    dec             r2
+    jnz             sqdiff_bgd_width_loop
+    pop r2
+    %assign push_num push_num-1
+    mov             r0,     r10
+    mov             r1,     r11
+    add             r0,     r13
+    add             r1,     r13
 
-  dec     r3
-  jnz             sqdiff_bgd_height_loop
+    dec     r3
+    jnz             sqdiff_bgd_height_loop
 
-  mov             r14,    psadframe
-  movd    [r14],  xmm8
+    mov             r14,    psadframe
+    movd    [r14],  xmm8
 
-  POP_XMM
-  pop r15
-  pop r14
-  pop r13
-  pop r12
+    POP_XMM
+    pop r15
+    pop r14
+    pop r13
+    pop r12
 %assign push_num 0
 %undef          cur_data
 %undef          ref_data
@@ -2026,5 +2026,5 @@
 %undef          tmp_edi
 %undef          pushsize
 %undef          localsize
-  ret
+    ret
 %endif