shithub: openh264

ref: c2c355b62341874eae06f281386be73452d266da
parent: d9998335174171f0b37b0d963f8081cf13545972
parent: 57f6bcc4b0da529101c25fd97349e9e55a6a5cee
author: Ethan Hugg <[email protected]>
date: Mon Jun 2 03:12:30 EDT 2014

Merge pull request #911 from mstorsjo/reformat-asm

Convert all tabs to spaces in assembly sources, unify indentation
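A whitespace reformat of this size is normally scripted rather than edited by hand. Below is a minimal sketch, in Python with only the standard library, of how tabs in the .S sources might be expanded to spaces and trailing whitespace trimmed. It is an assumption for illustration only: the tab stop of 4, the directory walked, and the use of a script at all are guesses, not the actual tool or settings behind this pull request.

#!/usr/bin/env python3
# Illustrative sketch only: expand tabs to spaces in assembly sources.
# The tab stop (4) and the "codec" directory root are assumptions, not
# taken from this pull request.
import os

TAB_STOP = 4  # assumed tab width

def expand_line(line, tabstop=TAB_STOP):
    # str.expandtabs() pads each tab out to the next tab stop, which keeps
    # column alignment instead of substituting a fixed run of spaces; the
    # rstrip() also drops any trailing whitespace before the newline.
    return line.expandtabs(tabstop).rstrip() + "\n"

def reformat_file(path):
    with open(path, "r", encoding="utf-8") as f:
        lines = f.readlines()
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(expand_line(l) for l in lines)

if __name__ == "__main__":
    # Walk an assumed source tree and rewrite every assembly file in place.
    for root, _, files in os.walk("codec"):
        for name in files:
            if name.endswith(".S"):
                reformat_file(os.path.join(root, name))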

--- a/codec/common/arm/copy_mb_neon.S
+++ b/codec/common/arm/copy_mb_neon.S
@@ -35,76 +35,76 @@
 #include "arm_arch_common_macro.S"
 
 #ifdef __APPLE__
-.macro	LOAD_ALIGNED_DATA_WITH_STRIDE
-//	{	//	input: $0~$3, src*, src_stride
-    vld1.64	{$0}, [$4,:128], $5
-    vld1.64	{$1}, [$4,:128], $5
-    vld1.64	{$2}, [$4,:128], $5
-    vld1.64	{$3}, [$4,:128], $5
-//	}
+.macro LOAD_ALIGNED_DATA_WITH_STRIDE
+//  {   //  input: $0~$3, src*, src_stride
+    vld1.64 {$0}, [$4,:128], $5
+    vld1.64 {$1}, [$4,:128], $5
+    vld1.64 {$2}, [$4,:128], $5
+    vld1.64 {$3}, [$4,:128], $5
+//  }
 .endm
 
-.macro	STORE_ALIGNED_DATA_WITH_STRIDE
-//	{	//	input: $0~$3, dst*, dst_stride
-    vst1.64	{$0}, [$4,:128], $5
-    vst1.64	{$1}, [$4,:128], $5
-    vst1.64	{$2}, [$4,:128], $5
-    vst1.64	{$3}, [$4,:128], $5
-//	}
+.macro STORE_ALIGNED_DATA_WITH_STRIDE
+//  {   //  input: $0~$3, dst*, dst_stride
+    vst1.64 {$0}, [$4,:128], $5
+    vst1.64 {$1}, [$4,:128], $5
+    vst1.64 {$2}, [$4,:128], $5
+    vst1.64 {$3}, [$4,:128], $5
+//  }
 .endm
 
-.macro	LOAD_UNALIGNED_DATA_WITH_STRIDE
-//	{	//	input: $0~$3, src*, src_stride
-    vld1.64	{$0}, [$4], $5
-    vld1.64	{$1}, [$4], $5
-    vld1.64	{$2}, [$4], $5
-    vld1.64	{$3}, [$4], $5
-//	}
+.macro LOAD_UNALIGNED_DATA_WITH_STRIDE
+//  {   //  input: $0~$3, src*, src_stride
+    vld1.64 {$0}, [$4], $5
+    vld1.64 {$1}, [$4], $5
+    vld1.64 {$2}, [$4], $5
+    vld1.64 {$3}, [$4], $5
+//  }
 .endm
 
-.macro	STORE_UNALIGNED_DATA_WITH_STRIDE
-//	{	//	input: $0~$3, dst*, dst_stride
-    vst1.64	{$0}, [$4], $5
-    vst1.64	{$1}, [$4], $5
-    vst1.64	{$2}, [$4], $5
-    vst1.64	{$3}, [$4], $5
-//	}
+.macro STORE_UNALIGNED_DATA_WITH_STRIDE
+//  {   //  input: $0~$3, dst*, dst_stride
+    vst1.64 {$0}, [$4], $5
+    vst1.64 {$1}, [$4], $5
+    vst1.64 {$2}, [$4], $5
+    vst1.64 {$3}, [$4], $5
+//  }
 .endm
 #else
-.macro	LOAD_ALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
-//	{	//	input: \arg0~\arg3, src*, src_stride
-    vld1.64	{\arg0}, [\arg4,:128], \arg5
-    vld1.64	{\arg1}, [\arg4,:128], \arg5
-    vld1.64	{\arg2}, [\arg4,:128], \arg5
-    vld1.64	{\arg3}, [\arg4,:128], \arg5
-//	}
+.macro LOAD_ALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
+//  {   //  input: \arg0~\arg3, src*, src_stride
+    vld1.64 {\arg0}, [\arg4,:128], \arg5
+    vld1.64 {\arg1}, [\arg4,:128], \arg5
+    vld1.64 {\arg2}, [\arg4,:128], \arg5
+    vld1.64 {\arg3}, [\arg4,:128], \arg5
+//  }
 .endm
 
-.macro	STORE_ALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
-//	{	//	input: \arg0~\arg3, dst*, dst_stride
-    vst1.64	{\arg0}, [\arg4,:128], \arg5
-    vst1.64	{\arg1}, [\arg4,:128], \arg5
-    vst1.64	{\arg2}, [\arg4,:128], \arg5
-    vst1.64	{\arg3}, [\arg4,:128], \arg5
-//	}
+.macro STORE_ALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
+//  {   //  input: \arg0~\arg3, dst*, dst_stride
+    vst1.64 {\arg0}, [\arg4,:128], \arg5
+    vst1.64 {\arg1}, [\arg4,:128], \arg5
+    vst1.64 {\arg2}, [\arg4,:128], \arg5
+    vst1.64 {\arg3}, [\arg4,:128], \arg5
+//  }
 .endm
 
-.macro	LOAD_UNALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
-//	{	//	input: \arg0~\arg3, src*, src_stride
-    vld1.64	{\arg0}, [\arg4], \arg5
-    vld1.64	{\arg1}, [\arg4], \arg5
-    vld1.64	{\arg2}, [\arg4], \arg5
-    vld1.64	{\arg3}, [\arg4], \arg5
-//	}
+.macro LOAD_UNALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
+//  {   //  input: \arg0~\arg3, src*, src_stride
+    vld1.64 {\arg0}, [\arg4], \arg5
+    vld1.64 {\arg1}, [\arg4], \arg5
+    vld1.64 {\arg2}, [\arg4], \arg5
+    vld1.64 {\arg3}, [\arg4], \arg5
+//  }
 .endm
 
-.macro	STORE_UNALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
-//	{	//	input: \arg0~\arg3, dst*, dst_stride
-    vst1.64	{\arg0}, [\arg4], \arg5
-    vst1.64	{\arg1}, [\arg4], \arg5
-    vst1.64	{\arg2}, [\arg4], \arg5
-    vst1.64	{\arg3}, [\arg4], \arg5
-//	}
+.macro STORE_UNALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
+//  {   //  input: \arg0~\arg3, dst*, dst_stride
+    vst1.64 {\arg0}, [\arg4], \arg5
+    vst1.64 {\arg1}, [\arg4], \arg5
+    vst1.64 {\arg2}, [\arg4], \arg5
+    vst1.64 {\arg3}, [\arg4], \arg5
+//  }
 .endm
 
 #endif
@@ -112,13 +112,13 @@
 
 WELS_ASM_FUNC_BEGIN WelsCopy8x8_neon
 
-	LOAD_UNALIGNED_DATA_WITH_STRIDE	d0, d1, d2, d3, r2, r3
+    LOAD_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r2, r3
 
-	STORE_UNALIGNED_DATA_WITH_STRIDE	d0, d1, d2, d3, r0, r1
+    STORE_UNALIGNED_DATA_WITH_STRIDE    d0, d1, d2, d3, r0, r1
 
-	LOAD_UNALIGNED_DATA_WITH_STRIDE	d4, d5, d6, d7, r2, r3
+    LOAD_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r2, r3
 
-	STORE_UNALIGNED_DATA_WITH_STRIDE	d4, d5, d6, d7, r0, r1
+    STORE_UNALIGNED_DATA_WITH_STRIDE    d4, d5, d6, d7, r0, r1
 
 WELS_ASM_FUNC_END
 
@@ -125,21 +125,21 @@
 
 WELS_ASM_FUNC_BEGIN WelsCopy16x16_neon
 
-	LOAD_ALIGNED_DATA_WITH_STRIDE	q0, q1, q2, q3, r2, r3
+    LOAD_ALIGNED_DATA_WITH_STRIDE   q0, q1, q2, q3, r2, r3
 
-	STORE_ALIGNED_DATA_WITH_STRIDE	q0, q1, q2, q3, r0, r1
+    STORE_ALIGNED_DATA_WITH_STRIDE  q0, q1, q2, q3, r0, r1
 
-	LOAD_ALIGNED_DATA_WITH_STRIDE	q8, q9, q10, q11, r2, r3
+    LOAD_ALIGNED_DATA_WITH_STRIDE   q8, q9, q10, q11, r2, r3
 
-	STORE_ALIGNED_DATA_WITH_STRIDE	q8, q9, q10, q11, r0, r1
+    STORE_ALIGNED_DATA_WITH_STRIDE  q8, q9, q10, q11, r0, r1
 
-	LOAD_ALIGNED_DATA_WITH_STRIDE	q0, q1, q2, q3, r2, r3
+    LOAD_ALIGNED_DATA_WITH_STRIDE   q0, q1, q2, q3, r2, r3
 
-	STORE_ALIGNED_DATA_WITH_STRIDE	q0, q1, q2, q3, r0, r1
+    STORE_ALIGNED_DATA_WITH_STRIDE  q0, q1, q2, q3, r0, r1
 
-	LOAD_ALIGNED_DATA_WITH_STRIDE	q8, q9, q10, q11, r2, r3
+    LOAD_ALIGNED_DATA_WITH_STRIDE   q8, q9, q10, q11, r2, r3
 
-	STORE_ALIGNED_DATA_WITH_STRIDE	q8, q9, q10, q11, r0, r1
+    STORE_ALIGNED_DATA_WITH_STRIDE  q8, q9, q10, q11, r0, r1
 
 WELS_ASM_FUNC_END
 
@@ -146,21 +146,21 @@
 
 WELS_ASM_FUNC_BEGIN WelsCopy16x16NotAligned_neon
 
-	LOAD_UNALIGNED_DATA_WITH_STRIDE	q0, q1, q2, q3, r2, r3
+    LOAD_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3
 
-	STORE_UNALIGNED_DATA_WITH_STRIDE	q0, q1, q2, q3, r0, r1
+    STORE_UNALIGNED_DATA_WITH_STRIDE    q0, q1, q2, q3, r0, r1
 
-	LOAD_UNALIGNED_DATA_WITH_STRIDE	q8, q9, q10, q11, r2, r3
+    LOAD_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r2, r3
 
-	STORE_UNALIGNED_DATA_WITH_STRIDE	q8, q9, q10, q11, r0, r1
+    STORE_UNALIGNED_DATA_WITH_STRIDE    q8, q9, q10, q11, r0, r1
 
-	LOAD_UNALIGNED_DATA_WITH_STRIDE	q0, q1, q2, q3, r2, r3
+    LOAD_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3
 
-	STORE_UNALIGNED_DATA_WITH_STRIDE	q0, q1, q2, q3, r0, r1
+    STORE_UNALIGNED_DATA_WITH_STRIDE    q0, q1, q2, q3, r0, r1
 
-	LOAD_UNALIGNED_DATA_WITH_STRIDE	q8, q9, q10, q11, r2, r3
+    LOAD_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r2, r3
 
-	STORE_UNALIGNED_DATA_WITH_STRIDE	q8, q9, q10, q11, r0, r1
+    STORE_UNALIGNED_DATA_WITH_STRIDE    q8, q9, q10, q11, r0, r1
 
 WELS_ASM_FUNC_END
 
@@ -167,13 +167,13 @@
 
 WELS_ASM_FUNC_BEGIN WelsCopy16x8NotAligned_neon
 
-	LOAD_UNALIGNED_DATA_WITH_STRIDE	q0, q1, q2, q3, r2, r3
+    LOAD_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3
 
-	STORE_UNALIGNED_DATA_WITH_STRIDE	q0, q1, q2, q3, r0, r1
+    STORE_UNALIGNED_DATA_WITH_STRIDE    q0, q1, q2, q3, r0, r1
 
-	LOAD_UNALIGNED_DATA_WITH_STRIDE	q8, q9, q10, q11, r2, r3
+    LOAD_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r2, r3
 
-	STORE_UNALIGNED_DATA_WITH_STRIDE	q8, q9, q10, q11, r0, r1
+    STORE_UNALIGNED_DATA_WITH_STRIDE    q8, q9, q10, q11, r0, r1
 
 WELS_ASM_FUNC_END
 
@@ -180,21 +180,21 @@
 
 WELS_ASM_FUNC_BEGIN WelsCopy8x16_neon
 
-	LOAD_UNALIGNED_DATA_WITH_STRIDE	d0, d1, d2, d3, r2, r3
+    LOAD_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r2, r3
 
-	STORE_UNALIGNED_DATA_WITH_STRIDE	d0, d1, d2, d3, r0, r1
+    STORE_UNALIGNED_DATA_WITH_STRIDE    d0, d1, d2, d3, r0, r1
 
-	LOAD_UNALIGNED_DATA_WITH_STRIDE	d4, d5, d6, d7, r2, r3
+    LOAD_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r2, r3
 
-	STORE_UNALIGNED_DATA_WITH_STRIDE	d4, d5, d6, d7, r0, r1
+    STORE_UNALIGNED_DATA_WITH_STRIDE    d4, d5, d6, d7, r0, r1
 
-	LOAD_UNALIGNED_DATA_WITH_STRIDE	d0, d1, d2, d3, r2, r3
+    LOAD_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r2, r3
 
-	STORE_UNALIGNED_DATA_WITH_STRIDE	d0, d1, d2, d3, r0, r1
+    STORE_UNALIGNED_DATA_WITH_STRIDE    d0, d1, d2, d3, r0, r1
 
-	LOAD_UNALIGNED_DATA_WITH_STRIDE	d4, d5, d6, d7, r2, r3
+    LOAD_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r2, r3
 
-	STORE_UNALIGNED_DATA_WITH_STRIDE	d4, d5, d6, d7, r0, r1
+    STORE_UNALIGNED_DATA_WITH_STRIDE    d4, d5, d6, d7, r0, r1
 
 WELS_ASM_FUNC_END
 
--- a/codec/common/arm/deblocking_neon.S
+++ b/codec/common/arm/deblocking_neon.S
@@ -1,35 +1,35 @@
 /*!
-* \copy
-*     Copyright (c)  2013, Cisco Systems
-*     All rights reserved.
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
 
-*     Redistribution and use in source and binary forms, with or without
-*     modification, are permitted provided that the following conditions
-*     are met:
-
-*        * Redistributions of source code must retain the above copyright
-*          notice, this list of conditions and the following disclaimer.
-
-*        * Redistributions in binary form must reproduce the above copyright
-*          notice, this list of conditions and the following disclaimer in
-*          the documentation and/or other materials provided with the
-*          distribution.
-
-*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-*     POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
 #ifdef HAVE_NEON
 .text
 
@@ -36,815 +36,815 @@
 #include "arm_arch_common_macro.S"
 
 #ifdef __APPLE__
-.macro	JMP_IF_128BITS_IS_ZERO
-    vorr.s16	$2, $0, $1
-    vmov		r3, r2, $2
-    orr			r3, r3, r2
-    cmp			r3, #0
+.macro JMP_IF_128BITS_IS_ZERO
+    vorr.s16    $2, $0, $1
+    vmov        r3, r2, $2
+    orr         r3, r3, r2
+    cmp         r3, #0
 .endm
 
-.macro	MASK_MATRIX
-    vabd.u8	$6, $1, $2
-    vcgt.u8	$6, $4, $6
+.macro MASK_MATRIX
+    vabd.u8 $6, $1, $2
+    vcgt.u8 $6, $4, $6
 
-    vabd.u8	$4, $0, $1
-    vclt.u8	$4, $4, $5
-    vand.u8	$6, $6, $4
+    vabd.u8 $4, $0, $1
+    vclt.u8 $4, $4, $5
+    vand.u8 $6, $6, $4
 
-    vabd.u8	$4, $3, $2
-    vclt.u8	$4, $4, $5
-    vand.u8	$6, $6, $4
+    vabd.u8 $4, $3, $2
+    vclt.u8 $4, $4, $5
+    vand.u8 $6, $6, $4
 .endm
 
 
-.macro	DIFF_LUMA_LT4_P1_Q1
+.macro DIFF_LUMA_LT4_P1_Q1
     vmov.i8 $9, #128
-    vrhadd.u8	$8, $2, $3
-    vhadd.u8	$8, $0, $8
-    vsub.s8	$8, $8, $9
-    vsub.s8	$9, $1, $9
-    vqsub.s8	$8, $8, $9
-    vmax.s8	$8, $8, $5
-    vmin.s8	$8, $8, $6
-    vabd.u8	$9, $0, $2
-    vclt.u8	$9, $9, $4
-    vand.s8	$8, $8, $9
-    vand.s8	$8, $8, $7
-    vadd.u8	$8, $1, $8
-    vabs.s8	$9, $9
+    vrhadd.u8   $8, $2, $3
+    vhadd.u8    $8, $0, $8
+    vsub.s8 $8, $8, $9
+    vsub.s8 $9, $1, $9
+    vqsub.s8    $8, $8, $9
+    vmax.s8 $8, $8, $5
+    vmin.s8 $8, $8, $6
+    vabd.u8 $9, $0, $2
+    vclt.u8 $9, $9, $4
+    vand.s8 $8, $8, $9
+    vand.s8 $8, $8, $7
+    vadd.u8 $8, $1, $8
+    vabs.s8 $9, $9
 .endm
 
-.macro	DIFF_LUMA_LT4_P0_Q0
-    vsubl.u8	$5, $0, $3
-    vsubl.u8	$6, $2, $1
-    vshl.s16	$6, $6, #2
-    vadd.s16	$5, $5, $6
-    vqrshrn.s16		$4, $5, #3
+.macro DIFF_LUMA_LT4_P0_Q0
+    vsubl.u8    $5, $0, $3
+    vsubl.u8    $6, $2, $1
+    vshl.s16    $6, $6, #2
+    vadd.s16    $5, $5, $6
+    vqrshrn.s16     $4, $5, #3
 .endm
 
-.macro	DIFF_LUMA_EQ4_P2P1P0
-    vaddl.u8	q4, $1, $2
-    vaddl.u8	q5, $3, $4
-    vadd.u16	q5, q4, q5
+.macro DIFF_LUMA_EQ4_P2P1P0
+    vaddl.u8    q4, $1, $2
+    vaddl.u8    q5, $3, $4
+    vadd.u16    q5, q4, q5
 
-    vaddl.u8	q4, $0, $1
-    vshl.u16	q4, q4, #1
-    vadd.u16	q4, q5, q4
+    vaddl.u8    q4, $0, $1
+    vshl.u16    q4, q4, #1
+    vadd.u16    q4, q5, q4
 
-    vrshrn.u16		$0, q5, #2
-    vrshrn.u16		$7, q4, #3
+    vrshrn.u16      $0, q5, #2
+    vrshrn.u16      $7, q4, #3
 
-    vshl.u16	q5, q5, #1
-    vsubl.u8	q4, $5, $1
-    vadd.u16	q5, q4,q5
+    vshl.u16    q5, q5, #1
+    vsubl.u8    q4, $5, $1
+    vadd.u16    q5, q4,q5
 
-    vaddl.u8	q4, $2, $5
-    vaddw.u8	q4, q4, $2
-    vaddw.u8	q4, q4, $3
+    vaddl.u8    q4, $2, $5
+    vaddw.u8    q4, q4, $2
+    vaddw.u8    q4, q4, $3
 
-    vrshrn.u16		d10,q5, #3
-    vrshrn.u16		d8, q4, #2
-    vbsl.u8		$6, d10, d8
+    vrshrn.u16      d10,q5, #3
+    vrshrn.u16      d8, q4, #2
+    vbsl.u8     $6, d10, d8
 .endm
 
-.macro	DIFF_LUMA_EQ4_MASK
-    vmov	$3, $2
-    vbsl.u8	$3, $0, $1
+.macro DIFF_LUMA_EQ4_MASK
+    vmov    $3, $2
+    vbsl.u8 $3, $0, $1
 .endm
 
-.macro	DIFF_CHROMA_EQ4_P0Q0
-    vaddl.u8	$4, $0, $3
-    vaddw.u8	$5, $4, $1
-    vaddw.u8	$6, $4, $2
-    vaddw.u8	$5, $5, $0
+.macro DIFF_CHROMA_EQ4_P0Q0
+    vaddl.u8    $4, $0, $3
+    vaddw.u8    $5, $4, $1
+    vaddw.u8    $6, $4, $2
+    vaddw.u8    $5, $5, $0
 
-    vaddw.u8	$6, $6, $3
-    vrshrn.u16		$7, $5, #2
-    vrshrn.u16		$8, $6, #2
+    vaddw.u8    $6, $6, $3
+    vrshrn.u16      $7, $5, #2
+    vrshrn.u16      $8, $6, #2
 .endm
 
-.macro	LOAD_CHROMA_DATA_4
-    vld4.u8	{$0[$8],$1[$8],$2[$8],$3[$8]}, [r0], r2
-    vld4.u8	{$4[$8],$5[$8],$6[$8],$7[$8]}, [r1], r2
+.macro LOAD_CHROMA_DATA_4
+    vld4.u8 {$0[$8],$1[$8],$2[$8],$3[$8]}, [r0], r2
+    vld4.u8 {$4[$8],$5[$8],$6[$8],$7[$8]}, [r1], r2
 .endm
 
-.macro	STORE_CHROMA_DATA_4
-    vst4.u8	{$0[$8],$1[$8],$2[$8],$3[$8]}, [r0], r2
-    vst4.u8	{$4[$8],$5[$8],$6[$8],$7[$8]}, [r1], r2
+.macro STORE_CHROMA_DATA_4
+    vst4.u8 {$0[$8],$1[$8],$2[$8],$3[$8]}, [r0], r2
+    vst4.u8 {$4[$8],$5[$8],$6[$8],$7[$8]}, [r1], r2
 .endm
 
-.macro	LOAD_LUMA_DATA_3
-    vld3.u8	{$0[$6],$1[$6],$2[$6]}, [r2], r1
-    vld3.u8	{$3[$6],$4[$6],$5[$6]}, [r0], r1
+.macro LOAD_LUMA_DATA_3
+    vld3.u8 {$0[$6],$1[$6],$2[$6]}, [r2], r1
+    vld3.u8 {$3[$6],$4[$6],$5[$6]}, [r0], r1
 .endm
 
-.macro	STORE_LUMA_DATA_4
-    vst4.u8	{$0[$4],$1[$4],$2[$4],$3[$4]}, [r0], r1
-    vst4.u8	{$0[$5],$1[$5],$2[$5],$3[$5]}, [r2], r1
+.macro STORE_LUMA_DATA_4
+    vst4.u8 {$0[$4],$1[$4],$2[$4],$3[$4]}, [r0], r1
+    vst4.u8 {$0[$5],$1[$5],$2[$5],$3[$5]}, [r2], r1
 .endm
 
-.macro	STORE_LUMA_DATA_3
-    vst3.u8	{$0[$6],$1[$6],$2[$6]}, [r3], r1
-    vst3.u8	{$3[$6],$4[$6],$5[$6]}, [r0], r1
+.macro STORE_LUMA_DATA_3
+    vst3.u8 {$0[$6],$1[$6],$2[$6]}, [r3], r1
+    vst3.u8 {$3[$6],$4[$6],$5[$6]}, [r0], r1
 .endm
 
-.macro	EXTRACT_DELTA_INTO_TWO_PART
-    vcge.s8	$1, $0, #0
-    vand	$1, $0, $1
-    vsub.s8	$0, $1, $0
+.macro EXTRACT_DELTA_INTO_TWO_PART
+    vcge.s8 $1, $0, #0
+    vand    $1, $0, $1
+    vsub.s8 $0, $1, $0
 .endm
 #else
-.macro	JMP_IF_128BITS_IS_ZERO arg0, arg1, arg2
-    vorr.s16	\arg2, \arg0, \arg1
-    vmov		r3, r2, \arg2
-    orr			r3, r3, r2
-    cmp			r3, #0
+.macro JMP_IF_128BITS_IS_ZERO arg0, arg1, arg2
+    vorr.s16    \arg2, \arg0, \arg1
+    vmov        r3, r2, \arg2
+    orr         r3, r3, r2
+    cmp         r3, #0
 .endm
 
-.macro	MASK_MATRIX arg0, arg1, arg2, arg3, arg4, arg5, arg6
-    vabd.u8	\arg6, \arg1, \arg2
-    vcgt.u8	\arg6, \arg4, \arg6
+.macro MASK_MATRIX arg0, arg1, arg2, arg3, arg4, arg5, arg6
+    vabd.u8 \arg6, \arg1, \arg2
+    vcgt.u8 \arg6, \arg4, \arg6
 
-    vabd.u8	\arg4, \arg0, \arg1
-    vclt.u8	\arg4, \arg4, \arg5
-    vand.u8	\arg6, \arg6, \arg4
+    vabd.u8 \arg4, \arg0, \arg1
+    vclt.u8 \arg4, \arg4, \arg5
+    vand.u8 \arg6, \arg6, \arg4
 
-    vabd.u8	\arg4, \arg3, \arg2
-    vclt.u8	\arg4, \arg4, \arg5
-    vand.u8	\arg6, \arg6, \arg4
+    vabd.u8 \arg4, \arg3, \arg2
+    vclt.u8 \arg4, \arg4, \arg5
+    vand.u8 \arg6, \arg6, \arg4
 .endm
 
-.macro	DIFF_LUMA_LT4_P1_Q1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
+.macro DIFF_LUMA_LT4_P1_Q1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
     vmov.i8 \arg9, #128
-    vrhadd.u8	\arg8, \arg2, \arg3
-    vhadd.u8	\arg8, \arg0, \arg8
-    vsub.s8	\arg8, \arg8, \arg9
-    vsub.s8	\arg9, \arg1, \arg9
+    vrhadd.u8   \arg8, \arg2, \arg3
+    vhadd.u8    \arg8, \arg0, \arg8
+    vsub.s8 \arg8, \arg8, \arg9
+    vsub.s8 \arg9, \arg1, \arg9
     vqsub.s8    \arg8, \arg8, \arg9
-    vmax.s8	\arg8, \arg8, \arg5
-    vmin.s8	\arg8, \arg8, \arg6
-    vabd.u8	\arg9, \arg0, \arg2
-    vclt.u8	\arg9, \arg9, \arg4
-    vand.s8	\arg8, \arg8, \arg9
-    vand.s8	\arg8, \arg8, \arg7
-    vadd.u8	\arg8, \arg1, \arg8
-    vabs.s8	\arg9, \arg9
+    vmax.s8 \arg8, \arg8, \arg5
+    vmin.s8 \arg8, \arg8, \arg6
+    vabd.u8 \arg9, \arg0, \arg2
+    vclt.u8 \arg9, \arg9, \arg4
+    vand.s8 \arg8, \arg8, \arg9
+    vand.s8 \arg8, \arg8, \arg7
+    vadd.u8 \arg8, \arg1, \arg8
+    vabs.s8 \arg9, \arg9
 .endm
 
-.macro	DIFF_LUMA_LT4_P0_Q0 arg0, arg1, arg2, arg3, arg4, arg5, arg6
-    vsubl.u8	\arg5, \arg0, \arg3
-    vsubl.u8	\arg6, \arg2, \arg1
-    vshl.s16	\arg6, \arg6, #2
-    vadd.s16	\arg5, \arg5, \arg6
-    vqrshrn.s16		\arg4, \arg5, #3
+.macro DIFF_LUMA_LT4_P0_Q0 arg0, arg1, arg2, arg3, arg4, arg5, arg6
+    vsubl.u8    \arg5, \arg0, \arg3
+    vsubl.u8    \arg6, \arg2, \arg1
+    vshl.s16    \arg6, \arg6, #2
+    vadd.s16    \arg5, \arg5, \arg6
+    vqrshrn.s16     \arg4, \arg5, #3
 .endm
 
 
-.macro	DIFF_LUMA_EQ4_P2P1P0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
-    vaddl.u8	q4, \arg1, \arg2
-    vaddl.u8	q5, \arg3, \arg4
-    vadd.u16	q5, q4, q5
+.macro DIFF_LUMA_EQ4_P2P1P0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
+    vaddl.u8    q4, \arg1, \arg2
+    vaddl.u8    q5, \arg3, \arg4
+    vadd.u16    q5, q4, q5
 
-    vaddl.u8	q4, \arg0, \arg1
-    vshl.u16	q4, q4, #1
-    vadd.u16	q4, q5, q4
+    vaddl.u8    q4, \arg0, \arg1
+    vshl.u16    q4, q4, #1
+    vadd.u16    q4, q5, q4
 
-    vrshrn.u16		\arg0, q5, #2
-    vrshrn.u16		\arg7, q4, #3
+    vrshrn.u16      \arg0, q5, #2
+    vrshrn.u16      \arg7, q4, #3
 
-    vshl.u16	q5, q5, #1
-    vsubl.u8	q4, \arg5, \arg1
-    vadd.u16	q5, q4,q5
+    vshl.u16    q5, q5, #1
+    vsubl.u8    q4, \arg5, \arg1
+    vadd.u16    q5, q4,q5
 
-    vaddl.u8	q4, \arg2, \arg5
-    vaddw.u8	q4, q4, \arg2
-    vaddw.u8	q4, q4, \arg3
+    vaddl.u8    q4, \arg2, \arg5
+    vaddw.u8    q4, q4, \arg2
+    vaddw.u8    q4, q4, \arg3
 
-    vrshrn.u16		d10,q5, #3
-    vrshrn.u16		d8, q4, #2
-    vbsl.u8		\arg6, d10, d8
+    vrshrn.u16      d10,q5, #3
+    vrshrn.u16      d8, q4, #2
+    vbsl.u8     \arg6, d10, d8
 .endm
 
-.macro	DIFF_LUMA_EQ4_MASK arg0, arg1, arg2, arg3
-    vmov	\arg3, \arg2
-    vbsl.u8	\arg3, \arg0, \arg1
+.macro DIFF_LUMA_EQ4_MASK arg0, arg1, arg2, arg3
+    vmov    \arg3, \arg2
+    vbsl.u8 \arg3, \arg0, \arg1
 .endm
 
-.macro	DIFF_CHROMA_EQ4_P0Q0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-    vaddl.u8	\arg4, \arg0, \arg3
-    vaddw.u8	\arg5, \arg4, \arg1
-    vaddw.u8	\arg6, \arg4, \arg2
-    vaddw.u8	\arg5, \arg5, \arg0
-    vaddw.u8	\arg6, \arg6, \arg3
-    vrshrn.u16		\arg7, \arg5, #2
-    vrshrn.u16		\arg8, \arg6, #2
+.macro DIFF_CHROMA_EQ4_P0Q0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+    vaddl.u8    \arg4, \arg0, \arg3
+    vaddw.u8    \arg5, \arg4, \arg1
+    vaddw.u8    \arg6, \arg4, \arg2
+    vaddw.u8    \arg5, \arg5, \arg0
+    vaddw.u8    \arg6, \arg6, \arg3
+    vrshrn.u16      \arg7, \arg5, #2
+    vrshrn.u16      \arg8, \arg6, #2
 .endm
 
-.macro	LOAD_CHROMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-    vld4.u8	{\arg0[\arg8],\arg1[\arg8],\arg2[\arg8],\arg3[\arg8]}, [r0], r2
-    vld4.u8	{\arg4[\arg8],\arg5[\arg8],\arg6[\arg8],\arg7[\arg8]}, [r1], r2
+.macro LOAD_CHROMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+    vld4.u8 {\arg0[\arg8],\arg1[\arg8],\arg2[\arg8],\arg3[\arg8]}, [r0], r2
+    vld4.u8 {\arg4[\arg8],\arg5[\arg8],\arg6[\arg8],\arg7[\arg8]}, [r1], r2
 .endm
 
-.macro	STORE_CHROMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-    vst4.u8	{\arg0[\arg8],\arg1[\arg8],\arg2[\arg8],\arg3[\arg8]}, [r0], r2
-    vst4.u8	{\arg4[\arg8],\arg5[\arg8],\arg6[\arg8],\arg7[\arg8]}, [r1], r2
+.macro STORE_CHROMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+    vst4.u8 {\arg0[\arg8],\arg1[\arg8],\arg2[\arg8],\arg3[\arg8]}, [r0], r2
+    vst4.u8 {\arg4[\arg8],\arg5[\arg8],\arg6[\arg8],\arg7[\arg8]}, [r1], r2
 .endm
 
-.macro	LOAD_LUMA_DATA_3 arg0, arg1, arg2, arg3, arg4, arg5, arg6
-    vld3.u8	{\arg0[\arg6],\arg1[\arg6],\arg2[\arg6]}, [r2], r1
-    vld3.u8	{\arg3[\arg6],\arg4[\arg6],\arg5[\arg6]}, [r0], r1
+.macro LOAD_LUMA_DATA_3 arg0, arg1, arg2, arg3, arg4, arg5, arg6
+    vld3.u8 {\arg0[\arg6],\arg1[\arg6],\arg2[\arg6]}, [r2], r1
+    vld3.u8 {\arg3[\arg6],\arg4[\arg6],\arg5[\arg6]}, [r0], r1
 .endm
 
-.macro	STORE_LUMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5
-    vst4.u8	{\arg0[\arg4],\arg1[\arg4],\arg2[\arg4],\arg3[\arg4]}, [r0], r1
-    vst4.u8	{\arg0[\arg5],\arg1[\arg5],\arg2[\arg5],\arg3[\arg5]}, [r2], r1
+.macro STORE_LUMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5
+    vst4.u8 {\arg0[\arg4],\arg1[\arg4],\arg2[\arg4],\arg3[\arg4]}, [r0], r1
+    vst4.u8 {\arg0[\arg5],\arg1[\arg5],\arg2[\arg5],\arg3[\arg5]}, [r2], r1
 .endm
 
-.macro	STORE_LUMA_DATA_3 arg0, arg1, arg2, arg3, arg4, arg5, arg6
-    vst3.u8	{\arg0[\arg6],\arg1[\arg6],\arg2[\arg6]}, [r3], r1
-    vst3.u8	{\arg3[\arg6],\arg4[\arg6],\arg5[\arg6]}, [r0], r1
+.macro STORE_LUMA_DATA_3 arg0, arg1, arg2, arg3, arg4, arg5, arg6
+    vst3.u8 {\arg0[\arg6],\arg1[\arg6],\arg2[\arg6]}, [r3], r1
+    vst3.u8 {\arg3[\arg6],\arg4[\arg6],\arg5[\arg6]}, [r0], r1
 .endm
 
-.macro	EXTRACT_DELTA_INTO_TWO_PART arg0, arg1
-    vcge.s8	\arg1, \arg0, #0
-    vand	\arg1, \arg0, \arg1
-    vsub.s8	\arg0, \arg1, \arg0
+.macro EXTRACT_DELTA_INTO_TWO_PART arg0, arg1
+    vcge.s8 \arg1, \arg0, #0
+    vand    \arg1, \arg0, \arg1
+    vsub.s8 \arg0, \arg1, \arg0
 .endm
 #endif
 
 WELS_ASM_FUNC_BEGIN DeblockLumaLt4V_neon
-    vpush	{q4-q7}
-    vdup.u8	q11, r2
-    vdup.u8	q9, r3
+    vpush   {q4-q7}
+    vdup.u8 q11, r2
+    vdup.u8 q9, r3
 
-    add			r2, r1, r1, lsl #1
-    sub			r2, r0, r2
-    vld1.u8	{q0}, [r2], r1
-    vld1.u8	{q3}, [r0], r1
-    vld1.u8	{q1}, [r2], r1
-    vld1.u8	{q4}, [r0], r1
-    vld1.u8	{q2}, [r2]
-    vld1.u8	{q5}, [r0]
-    sub			r2, r2, r1
+    add         r2, r1, r1, lsl #1
+    sub         r2, r0, r2
+    vld1.u8 {q0}, [r2], r1
+    vld1.u8 {q3}, [r0], r1
+    vld1.u8 {q1}, [r2], r1
+    vld1.u8 {q4}, [r0], r1
+    vld1.u8 {q2}, [r2]
+    vld1.u8 {q5}, [r0]
+    sub         r2, r2, r1
 
-    ldr			r3, [sp, #64]
-    vld1.s8	{d31}, [r3]
-    vdup.s8	d28, d31[0]
-    vdup.s8	d30, d31[1]
-    vdup.s8	d29, d31[2]
-    vdup.s8	d31, d31[3]
-    vtrn.32	d28, d30
-    vtrn.32	d29, d31
-    vcge.s8	q10, q14, #0
+    ldr         r3, [sp, #64]
+    vld1.s8 {d31}, [r3]
+    vdup.s8 d28, d31[0]
+    vdup.s8 d30, d31[1]
+    vdup.s8 d29, d31[2]
+    vdup.s8 d31, d31[3]
+    vtrn.32 d28, d30
+    vtrn.32 d29, d31
+    vcge.s8 q10, q14, #0
 
-    MASK_MATRIX	q1, q2, q3, q4, q11, q9, q15
-    vand.u8	q10, q10, q15
+    MASK_MATRIX q1, q2, q3, q4, q11, q9, q15
+    vand.u8 q10, q10, q15
 
-    veor		q15, q15
-    vsub.i8	q15,q15,q14
+    veor        q15, q15
+    vsub.i8 q15,q15,q14
 
-    DIFF_LUMA_LT4_P1_Q1	q0, q1, q2, q3, q9, q15, q14, q10, q6, q12
-    vst1.u8	{q6}, [r2], r1
+    DIFF_LUMA_LT4_P1_Q1 q0, q1, q2, q3, q9, q15, q14, q10, q6, q12
+    vst1.u8 {q6}, [r2], r1
 
-    DIFF_LUMA_LT4_P1_Q1	q5, q4, q3, q2, q9, q15, q14, q10, q7, q13
+    DIFF_LUMA_LT4_P1_Q1 q5, q4, q3, q2, q9, q15, q14, q10, q7, q13
 
-    vabs.s8	q12, q12
-    vabs.s8	q13, q13
-    vadd.u8	q14,q14,q12
-    vadd.u8	q14,q14,q13
-    veor		q15, q15
-    vsub.i8	q15,q15,q14
+    vabs.s8 q12, q12
+    vabs.s8 q13, q13
+    vadd.u8 q14,q14,q12
+    vadd.u8 q14,q14,q13
+    veor        q15, q15
+    vsub.i8 q15,q15,q14
 
-    DIFF_LUMA_LT4_P0_Q0	d2, d4, d6, d8, d16, q12, q13
-    DIFF_LUMA_LT4_P0_Q0	d3, d5, d7, d9, d17, q12, q13
-    vmax.s8	q8, q8, q15
-    vmin.s8	q8, q8, q14
-    vand.s8	q8, q8, q10
-    EXTRACT_DELTA_INTO_TWO_PART	q8, q9
-    vqadd.u8	q2, q2, q9
-    vqsub.u8	q2, q2, q8
-    vst1.u8	{q2}, [r2], r1
-    vqsub.u8	q3, q3, q9
-    vqadd.u8	q3, q3, q8
-    vst1.u8	{q3}, [r2]	, r1
-    vst1.u8	{q7}, [r2]
+    DIFF_LUMA_LT4_P0_Q0 d2, d4, d6, d8, d16, q12, q13
+    DIFF_LUMA_LT4_P0_Q0 d3, d5, d7, d9, d17, q12, q13
+    vmax.s8 q8, q8, q15
+    vmin.s8 q8, q8, q14
+    vand.s8 q8, q8, q10
+    EXTRACT_DELTA_INTO_TWO_PART q8, q9
+    vqadd.u8    q2, q2, q9
+    vqsub.u8    q2, q2, q8
+    vst1.u8 {q2}, [r2], r1
+    vqsub.u8    q3, q3, q9
+    vqadd.u8    q3, q3, q8
+    vst1.u8 {q3}, [r2]  , r1
+    vst1.u8 {q7}, [r2]
 
-    vpop	{q4-q7}
+    vpop    {q4-q7}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN DeblockLumaEq4V_neon
-    vpush	{q4-q7}
+    vpush   {q4-q7}
 
-    vdup.u8	q5, r2
-    vdup.u8	q4, r3
+    vdup.u8 q5, r2
+    vdup.u8 q4, r3
 
-    sub			r3, r0, r1, lsl #2
-    vld1.u8	{q8},  [r3], r1
-    vld1.u8	{q12}, [r0], r1
-    vld1.u8	{q9},  [r3], r1
-    vld1.u8	{q13}, [r0], r1
-    vld1.u8	{q10}, [r3], r1
-    vld1.u8	{q14}, [r0], r1
-    vld1.u8	{q11}, [r3]
-    vld1.u8	{q15}, [r0]
-    sub			r3, r3, r1	, lsl #1
+    sub         r3, r0, r1, lsl #2
+    vld1.u8 {q8},  [r3], r1
+    vld1.u8 {q12}, [r0], r1
+    vld1.u8 {q9},  [r3], r1
+    vld1.u8 {q13}, [r0], r1
+    vld1.u8 {q10}, [r3], r1
+    vld1.u8 {q14}, [r0], r1
+    vld1.u8 {q11}, [r3]
+    vld1.u8 {q15}, [r0]
+    sub         r3, r3, r1  , lsl #1
 
-    MASK_MATRIX	q10, q11, q12, q13, q5, q4, q6
+    MASK_MATRIX q10, q11, q12, q13, q5, q4, q6
 
-    mov			r2, r2, lsr #2
-    add			r2, r2, #2
-    vdup.u8	q5, r2
-    vabd.u8	q0, q11, q12
-    vclt.u8	q7, q0, q5
+    mov         r2, r2, lsr #2
+    add         r2, r2, #2
+    vdup.u8 q5, r2
+    vabd.u8 q0, q11, q12
+    vclt.u8 q7, q0, q5
 
-    vabd.u8	q1, q9, q11
-    vclt.u8	q1, q1, q4
-    vand.s8	q1, q1, q7
+    vabd.u8 q1, q9, q11
+    vclt.u8 q1, q1, q4
+    vand.s8 q1, q1, q7
 
-    vabd.u8	q2, q14,q12
-    vclt.u8	q2, q2, q4
-    vand.s8	q2, q2, q7
-    vand.u8	q7, q7, q6
+    vabd.u8 q2, q14,q12
+    vclt.u8 q2, q2, q4
+    vand.s8 q2, q2, q7
+    vand.u8 q7, q7, q6
 
-    vmov		q3, q1
+    vmov        q3, q1
 
-    DIFF_LUMA_EQ4_P2P1P0		d16, d18, d20, d22, d24, d26, d2, d0
-    DIFF_LUMA_EQ4_P2P1P0		d17, d19, d21, d23, d25, d27, d3, d1
+    DIFF_LUMA_EQ4_P2P1P0        d16, d18, d20, d22, d24, d26, d2, d0
+    DIFF_LUMA_EQ4_P2P1P0        d17, d19, d21, d23, d25, d27, d3, d1
 
-    vand.u8	q3, q7, q3
-    DIFF_LUMA_EQ4_MASK	q0, q9, q3, q4
-    vst1.u8	{q4}, [r3], r1
-    DIFF_LUMA_EQ4_MASK	q8,q10, q3, q4
-    vst1.u8	{q4}, [r3], r1
-    DIFF_LUMA_EQ4_MASK	q1,q11, q6, q4
-    vst1.u8	{q4}, [r3], r1
+    vand.u8 q3, q7, q3
+    DIFF_LUMA_EQ4_MASK  q0, q9, q3, q4
+    vst1.u8 {q4}, [r3], r1
+    DIFF_LUMA_EQ4_MASK  q8,q10, q3, q4
+    vst1.u8 {q4}, [r3], r1
+    DIFF_LUMA_EQ4_MASK  q1,q11, q6, q4
+    vst1.u8 {q4}, [r3], r1
 
-    vmov		q0, q2
-    DIFF_LUMA_EQ4_P2P1P0		d30, d28, d26, d24, d22, d20, d4, d6
-    DIFF_LUMA_EQ4_P2P1P0		d31, d29, d27, d25, d23, d21, d5, d7
+    vmov        q0, q2
+    DIFF_LUMA_EQ4_P2P1P0        d30, d28, d26, d24, d22, d20, d4, d6
+    DIFF_LUMA_EQ4_P2P1P0        d31, d29, d27, d25, d23, d21, d5, d7
 
-    vand.u8	q0, q7, q0
-    DIFF_LUMA_EQ4_MASK	q2,  q12, q6, q4
-    vst1.u8	{q4}, [r3], r1
-    DIFF_LUMA_EQ4_MASK	q15, q13, q0, q4
-    vst1.u8	{q4}, [r3], r1
-    DIFF_LUMA_EQ4_MASK	q3,  q14, q0, q4
-    vst1.u8	{q4}, [r3], r1
+    vand.u8 q0, q7, q0
+    DIFF_LUMA_EQ4_MASK  q2,  q12, q6, q4
+    vst1.u8 {q4}, [r3], r1
+    DIFF_LUMA_EQ4_MASK  q15, q13, q0, q4
+    vst1.u8 {q4}, [r3], r1
+    DIFF_LUMA_EQ4_MASK  q3,  q14, q0, q4
+    vst1.u8 {q4}, [r3], r1
 
-    vpop	{q4-q7}
+    vpop    {q4-q7}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN DeblockLumaLt4H_neon
-    vpush	{q4-q7}
+    vpush   {q4-q7}
 
-    vdup.u8	q11, r2
-    vdup.u8	q9, r3
+    vdup.u8 q11, r2
+    vdup.u8 q9, r3
 
-    sub			r2, r0, #3
-    LOAD_LUMA_DATA_3		d0, d1, d2, d6, d7, d8, 0
-    LOAD_LUMA_DATA_3		d0, d1, d2, d6, d7, d8, 1
-    LOAD_LUMA_DATA_3		d0, d1, d2, d6, d7, d8, 2
-    LOAD_LUMA_DATA_3		d0, d1, d2, d6, d7, d8, 3
-    LOAD_LUMA_DATA_3		d0, d1, d2, d6, d7, d8, 4
-    LOAD_LUMA_DATA_3		d0, d1, d2, d6, d7, d8, 5
-    LOAD_LUMA_DATA_3		d0, d1, d2, d6, d7, d8, 6
-    LOAD_LUMA_DATA_3		d0, d1, d2, d6, d7, d8, 7
+    sub         r2, r0, #3
+    LOAD_LUMA_DATA_3        d0, d1, d2, d6, d7, d8, 0
+    LOAD_LUMA_DATA_3        d0, d1, d2, d6, d7, d8, 1
+    LOAD_LUMA_DATA_3        d0, d1, d2, d6, d7, d8, 2
+    LOAD_LUMA_DATA_3        d0, d1, d2, d6, d7, d8, 3
+    LOAD_LUMA_DATA_3        d0, d1, d2, d6, d7, d8, 4
+    LOAD_LUMA_DATA_3        d0, d1, d2, d6, d7, d8, 5
+    LOAD_LUMA_DATA_3        d0, d1, d2, d6, d7, d8, 6
+    LOAD_LUMA_DATA_3        d0, d1, d2, d6, d7, d8, 7
 
-    LOAD_LUMA_DATA_3		d3, d4, d5, d9, d10, d11, 0
-    LOAD_LUMA_DATA_3		d3, d4, d5, d9, d10, d11, 1
-    LOAD_LUMA_DATA_3		d3, d4, d5, d9, d10, d11, 2
-    LOAD_LUMA_DATA_3		d3, d4, d5, d9, d10, d11, 3
-    LOAD_LUMA_DATA_3		d3, d4, d5, d9, d10, d11, 4
-    LOAD_LUMA_DATA_3		d3, d4, d5, d9, d10, d11, 5
-    LOAD_LUMA_DATA_3		d3, d4, d5, d9, d10, d11, 6
-    LOAD_LUMA_DATA_3		d3, d4, d5, d9, d10, d11, 7
+    LOAD_LUMA_DATA_3        d3, d4, d5, d9, d10, d11, 0
+    LOAD_LUMA_DATA_3        d3, d4, d5, d9, d10, d11, 1
+    LOAD_LUMA_DATA_3        d3, d4, d5, d9, d10, d11, 2
+    LOAD_LUMA_DATA_3        d3, d4, d5, d9, d10, d11, 3
+    LOAD_LUMA_DATA_3        d3, d4, d5, d9, d10, d11, 4
+    LOAD_LUMA_DATA_3        d3, d4, d5, d9, d10, d11, 5
+    LOAD_LUMA_DATA_3        d3, d4, d5, d9, d10, d11, 6
+    LOAD_LUMA_DATA_3        d3, d4, d5, d9, d10, d11, 7
 
-    vswp		d1, d2
-    vswp		d3, d4
-    vswp		d1, d4
-    vswp		d7, d8
-    vswp		d9, d10
-    vswp		d7, d10
+    vswp        d1, d2
+    vswp        d3, d4
+    vswp        d1, d4
+    vswp        d7, d8
+    vswp        d9, d10
+    vswp        d7, d10
 
-    sub			r0, r0, r1, lsl #4
+    sub         r0, r0, r1, lsl #4
 
-    ldr			r3, [sp, #64]
-    vld1.s8	{d31}, [r3]
-    vdup.s8	d28, d31[0]
-    vdup.s8	d30, d31[1]
-    vdup.s8	d29, d31[2]
-    vdup.s8	d31, d31[3]
-    vtrn.32	d28, d30
-    vtrn.32	d29, d31
-    vcge.s8	q10, q14, #0
+    ldr         r3, [sp, #64]
+    vld1.s8 {d31}, [r3]
+    vdup.s8 d28, d31[0]
+    vdup.s8 d30, d31[1]
+    vdup.s8 d29, d31[2]
+    vdup.s8 d31, d31[3]
+    vtrn.32 d28, d30
+    vtrn.32 d29, d31
+    vcge.s8 q10, q14, #0
 
-    MASK_MATRIX	q1, q2, q3, q4, q11, q9, q15
-    vand.u8	q10, q10, q15
+    MASK_MATRIX q1, q2, q3, q4, q11, q9, q15
+    vand.u8 q10, q10, q15
 
-    veor		q15, q15
-    vsub.i8	q15,q15,q14
+    veor        q15, q15
+    vsub.i8 q15,q15,q14
 
-    DIFF_LUMA_LT4_P1_Q1	q0, q1, q2, q3, q9, q15, q14, q10, q6, q12
-    DIFF_LUMA_LT4_P1_Q1	q5, q4, q3, q2, q9, q15, q14, q10, q7, q13
+    DIFF_LUMA_LT4_P1_Q1 q0, q1, q2, q3, q9, q15, q14, q10, q6, q12
+    DIFF_LUMA_LT4_P1_Q1 q5, q4, q3, q2, q9, q15, q14, q10, q7, q13
 
-    vabs.s8	q12, q12
-    vabs.s8	q13, q13
-    vadd.u8	q14,q14,q12
-    vadd.u8	q14,q14,q13
-    veor		q15, q15
-    vsub.i8	q15,q15,q14
+    vabs.s8 q12, q12
+    vabs.s8 q13, q13
+    vadd.u8 q14,q14,q12
+    vadd.u8 q14,q14,q13
+    veor        q15, q15
+    vsub.i8 q15,q15,q14
 
-    DIFF_LUMA_LT4_P0_Q0	d2, d4, d6, d8, d16, q12, q13
-    DIFF_LUMA_LT4_P0_Q0	d3, d5, d7, d9, d17, q12, q13
-    vmax.s8	q8, q8, q15
-    vmin.s8	q8, q8, q14
-    vand.s8	q8, q8, q10
-    EXTRACT_DELTA_INTO_TWO_PART	q8, q9
-    vqadd.u8	q2, q2, q9
-    vqsub.u8	q2, q2, q8
+    DIFF_LUMA_LT4_P0_Q0 d2, d4, d6, d8, d16, q12, q13
+    DIFF_LUMA_LT4_P0_Q0 d3, d5, d7, d9, d17, q12, q13
+    vmax.s8 q8, q8, q15
+    vmin.s8 q8, q8, q14
+    vand.s8 q8, q8, q10
+    EXTRACT_DELTA_INTO_TWO_PART q8, q9
+    vqadd.u8    q2, q2, q9
+    vqsub.u8    q2, q2, q8
 
-    vqsub.u8	q3, q3, q9
-    vqadd.u8	q3, q3, q8
+    vqsub.u8    q3, q3, q9
+    vqadd.u8    q3, q3, q8
 
-    sub		r0, #2
-    add		r2, r0, r1
-    lsl		r1, #1
+    sub     r0, #2
+    add     r2, r0, r1
+    lsl     r1, #1
 
-    vmov		q1, q6
-    vmov		q4, q7
+    vmov        q1, q6
+    vmov        q4, q7
 
-    vswp		q2, q3
-    vswp		d3, d6
-    vswp		d5, d8
+    vswp        q2, q3
+    vswp        d3, d6
+    vswp        d5, d8
 
-    STORE_LUMA_DATA_4		d2, d3, d4, d5, 0, 1
-    STORE_LUMA_DATA_4		d2, d3, d4, d5, 2, 3
-    STORE_LUMA_DATA_4		d2, d3, d4, d5, 4, 5
-    STORE_LUMA_DATA_4		d2, d3, d4, d5, 6, 7
+    STORE_LUMA_DATA_4       d2, d3, d4, d5, 0, 1
+    STORE_LUMA_DATA_4       d2, d3, d4, d5, 2, 3
+    STORE_LUMA_DATA_4       d2, d3, d4, d5, 4, 5
+    STORE_LUMA_DATA_4       d2, d3, d4, d5, 6, 7
 
-    STORE_LUMA_DATA_4		d6, d7, d8, d9, 0, 1
-    STORE_LUMA_DATA_4		d6, d7, d8, d9, 2, 3
-    STORE_LUMA_DATA_4		d6, d7, d8, d9, 4, 5
-    STORE_LUMA_DATA_4		d6, d7, d8, d9, 6, 7
+    STORE_LUMA_DATA_4       d6, d7, d8, d9, 0, 1
+    STORE_LUMA_DATA_4       d6, d7, d8, d9, 2, 3
+    STORE_LUMA_DATA_4       d6, d7, d8, d9, 4, 5
+    STORE_LUMA_DATA_4       d6, d7, d8, d9, 6, 7
 
-    vpop	{q4-q7}
+    vpop    {q4-q7}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN DeblockLumaEq4H_neon
-    vpush	{q4-q7}
-    vdup.u8	q5, r2
-    vdup.u8	q4, r3
+    vpush   {q4-q7}
+    vdup.u8 q5, r2
+    vdup.u8 q4, r3
 
-    sub			r3, r0, #4				//	pix -= 4
+    sub         r3, r0, #4              //  pix -= 4
 
-    vld1.u8	{d16}, [r3], r1
-    vld1.u8	{d17}, [r3], r1
-    vld1.u8	{d18}, [r3], r1
-    vld1.u8	{d19}, [r3], r1
-    vld1.u8	{d20}, [r3], r1
-    vld1.u8	{d21}, [r3], r1
-    vld1.u8	{d22}, [r3], r1
-    vld1.u8	{d23}, [r3], r1
-    vld1.u8	{d24}, [r3], r1
-    vld1.u8	{d25}, [r3], r1
-    vld1.u8	{d26}, [r3], r1
-    vld1.u8	{d27}, [r3], r1
-    vld1.u8	{d28}, [r3], r1
-    vld1.u8	{d29}, [r3], r1
-    vld1.u8	{d30}, [r3], r1
-    vld1.u8	{d31}, [r3], r1
+    vld1.u8 {d16}, [r3], r1
+    vld1.u8 {d17}, [r3], r1
+    vld1.u8 {d18}, [r3], r1
+    vld1.u8 {d19}, [r3], r1
+    vld1.u8 {d20}, [r3], r1
+    vld1.u8 {d21}, [r3], r1
+    vld1.u8 {d22}, [r3], r1
+    vld1.u8 {d23}, [r3], r1
+    vld1.u8 {d24}, [r3], r1
+    vld1.u8 {d25}, [r3], r1
+    vld1.u8 {d26}, [r3], r1
+    vld1.u8 {d27}, [r3], r1
+    vld1.u8 {d28}, [r3], r1
+    vld1.u8 {d29}, [r3], r1
+    vld1.u8 {d30}, [r3], r1
+    vld1.u8 {d31}, [r3], r1
 
-    vtrn.u32	d16, d20
-    vtrn.u32	d17, d21
-    vtrn.u32	d18, d22
-    vtrn.u32	d19, d23
-    vtrn.u32	d24, d28
-    vtrn.u32	d25, d29
-    vtrn.u32	d26, d30
-    vtrn.u32	d27, d31
+    vtrn.u32    d16, d20
+    vtrn.u32    d17, d21
+    vtrn.u32    d18, d22
+    vtrn.u32    d19, d23
+    vtrn.u32    d24, d28
+    vtrn.u32    d25, d29
+    vtrn.u32    d26, d30
+    vtrn.u32    d27, d31
 
-    vtrn.u16	d16, d18
-    vtrn.u16	d17, d19
-    vtrn.u16	d20, d22
-    vtrn.u16	d21, d23
-    vtrn.u16	d24, d26
-    vtrn.u16	d25, d27
-    vtrn.u16	d28, d30
-    vtrn.u16	d29, d31
+    vtrn.u16    d16, d18
+    vtrn.u16    d17, d19
+    vtrn.u16    d20, d22
+    vtrn.u16    d21, d23
+    vtrn.u16    d24, d26
+    vtrn.u16    d25, d27
+    vtrn.u16    d28, d30
+    vtrn.u16    d29, d31
 
-    vtrn.u8	d16, d17
-    vtrn.u8	d18, d19
-    vtrn.u8	d20, d21
-    vtrn.u8	d22, d23
-    vtrn.u8	d24, d25
-    vtrn.u8	d26, d27
-    vtrn.u8	d28, d29
-    vtrn.u8	d30, d31
+    vtrn.u8 d16, d17
+    vtrn.u8 d18, d19
+    vtrn.u8 d20, d21
+    vtrn.u8 d22, d23
+    vtrn.u8 d24, d25
+    vtrn.u8 d26, d27
+    vtrn.u8 d28, d29
+    vtrn.u8 d30, d31
 
-    vswp	d17, d24
-    vswp	d19, d26
-    vswp	d21, d28
-    vswp	d23, d30
+    vswp    d17, d24
+    vswp    d19, d26
+    vswp    d21, d28
+    vswp    d23, d30
 
-    vswp	q12, q9
-    vswp	q14, q11
+    vswp    q12, q9
+    vswp    q14, q11
 
-    vswp	q12, q10
-    vswp	q13, q11
+    vswp    q12, q10
+    vswp    q13, q11
 
-    MASK_MATRIX	q10, q11, q12, q13, q5, q4, q6
+    MASK_MATRIX q10, q11, q12, q13, q5, q4, q6
 
-    mov			r2, r2, lsr #2
-    add			r2, r2, #2
-    vdup.u8	q5, r2
-    vabd.u8	q0, q11, q12
-    vclt.u8	q7, q0, q5
+    mov         r2, r2, lsr #2
+    add         r2, r2, #2
+    vdup.u8 q5, r2
+    vabd.u8 q0, q11, q12
+    vclt.u8 q7, q0, q5
 
-    vabd.u8	q1, q9, q11
-    vclt.u8	q1, q1, q4
-    vand.s8	q1, q1, q7
+    vabd.u8 q1, q9, q11
+    vclt.u8 q1, q1, q4
+    vand.s8 q1, q1, q7
 
-    vabd.u8	q2, q14,q12
-    vclt.u8	q2, q2, q4
-    vand.s8	q2, q2, q7
-    vand.u8	q7, q7, q6
+    vabd.u8 q2, q14,q12
+    vclt.u8 q2, q2, q4
+    vand.s8 q2, q2, q7
+    vand.u8 q7, q7, q6
 
-    vmov		q3, q1
+    vmov        q3, q1
 
-    DIFF_LUMA_EQ4_P2P1P0		d16, d18, d20, d22, d24, d26, d2, d0
-    DIFF_LUMA_EQ4_P2P1P0		d17, d19, d21, d23, d25, d27, d3, d1
+    DIFF_LUMA_EQ4_P2P1P0        d16, d18, d20, d22, d24, d26, d2, d0
+    DIFF_LUMA_EQ4_P2P1P0        d17, d19, d21, d23, d25, d27, d3, d1
 
-    vand.u8	q3, q7, q3
-    DIFF_LUMA_EQ4_MASK	q0, q9, q3, q4
-    vmov		q9, q4
-    vbsl.u8	q3, q8, q10
-    DIFF_LUMA_EQ4_MASK	q1,q11, q6, q8
+    vand.u8 q3, q7, q3
+    DIFF_LUMA_EQ4_MASK  q0, q9, q3, q4
+    vmov        q9, q4
+    vbsl.u8 q3, q8, q10
+    DIFF_LUMA_EQ4_MASK  q1,q11, q6, q8
 
-    vand.u8	q7, q7, q2
+    vand.u8 q7, q7, q2
 
-    DIFF_LUMA_EQ4_P2P1P0		d30, d28, d26, d24, d22, d20, d4, d0
-    DIFF_LUMA_EQ4_P2P1P0		d31, d29, d27, d25, d23, d21, d5, d1
+    DIFF_LUMA_EQ4_P2P1P0        d30, d28, d26, d24, d22, d20, d4, d0
+    DIFF_LUMA_EQ4_P2P1P0        d31, d29, d27, d25, d23, d21, d5, d1
 
-    vbsl.u8	q6, q2, q12
-    DIFF_LUMA_EQ4_MASK	q15, q13, q7, q4
+    vbsl.u8 q6, q2, q12
+    DIFF_LUMA_EQ4_MASK  q15, q13, q7, q4
 
-    vbsl.u8	q7, q0, q14
+    vbsl.u8 q7, q0, q14
 
-    vmov		q5, q6
-    vmov		q2, q9
-    vmov		q6, q4
-    vmov		q4, q8
+    vmov        q5, q6
+    vmov        q2, q9
+    vmov        q6, q4
+    vmov        q4, q8
 
-    vswp	d8, d6
-    vswp	d5, d7
-    vswp	d5, d8
-    vswp	d14, d12
-    vswp	d11, d13
-    vswp	d11, d14
+    vswp    d8, d6
+    vswp    d5, d7
+    vswp    d5, d8
+    vswp    d14, d12
+    vswp    d11, d13
+    vswp    d11, d14
 
-    sub		r3, r0, #3
-    STORE_LUMA_DATA_3		d4,d5,d6,d10,d11,d12,0
-    STORE_LUMA_DATA_3		d4,d5,d6,d10,d11,d12,1
-    STORE_LUMA_DATA_3		d4,d5,d6,d10,d11,d12,2
-    STORE_LUMA_DATA_3		d4,d5,d6,d10,d11,d12,3
-    STORE_LUMA_DATA_3		d4,d5,d6,d10,d11,d12,4
-    STORE_LUMA_DATA_3		d4,d5,d6,d10,d11,d12,5
-    STORE_LUMA_DATA_3		d4,d5,d6,d10,d11,d12,6
-    STORE_LUMA_DATA_3		d4,d5,d6,d10,d11,d12,7
+    sub     r3, r0, #3
+    STORE_LUMA_DATA_3       d4,d5,d6,d10,d11,d12,0
+    STORE_LUMA_DATA_3       d4,d5,d6,d10,d11,d12,1
+    STORE_LUMA_DATA_3       d4,d5,d6,d10,d11,d12,2
+    STORE_LUMA_DATA_3       d4,d5,d6,d10,d11,d12,3
+    STORE_LUMA_DATA_3       d4,d5,d6,d10,d11,d12,4
+    STORE_LUMA_DATA_3       d4,d5,d6,d10,d11,d12,5
+    STORE_LUMA_DATA_3       d4,d5,d6,d10,d11,d12,6
+    STORE_LUMA_DATA_3       d4,d5,d6,d10,d11,d12,7
 
-    STORE_LUMA_DATA_3		d7,d8,d9,d13,d14,d15,0
-    STORE_LUMA_DATA_3		d7,d8,d9,d13,d14,d15,1
-    STORE_LUMA_DATA_3		d7,d8,d9,d13,d14,d15,2
-    STORE_LUMA_DATA_3		d7,d8,d9,d13,d14,d15,3
-    STORE_LUMA_DATA_3		d7,d8,d9,d13,d14,d15,4
-    STORE_LUMA_DATA_3		d7,d8,d9,d13,d14,d15,5
-    STORE_LUMA_DATA_3		d7,d8,d9,d13,d14,d15,6
-    STORE_LUMA_DATA_3		d7,d8,d9,d13,d14,d15,7
+    STORE_LUMA_DATA_3       d7,d8,d9,d13,d14,d15,0
+    STORE_LUMA_DATA_3       d7,d8,d9,d13,d14,d15,1
+    STORE_LUMA_DATA_3       d7,d8,d9,d13,d14,d15,2
+    STORE_LUMA_DATA_3       d7,d8,d9,d13,d14,d15,3
+    STORE_LUMA_DATA_3       d7,d8,d9,d13,d14,d15,4
+    STORE_LUMA_DATA_3       d7,d8,d9,d13,d14,d15,5
+    STORE_LUMA_DATA_3       d7,d8,d9,d13,d14,d15,6
+    STORE_LUMA_DATA_3       d7,d8,d9,d13,d14,d15,7
 
-    vpop	{q4-q7}
+    vpop    {q4-q7}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN DeblockChromaLt4V_neon
-    vdup.u8	q11, r3
-    ldr			r3, [sp, #0]
+    vdup.u8 q11, r3
+    ldr         r3, [sp, #0]
 
-    sub			r0, r0, r2	, lsl #1
-    sub			r1, r1, r2, lsl #1
-    vdup.u8	    q9, r3
-    ldr			r3, [sp, #4]
+    sub         r0, r0, r2  , lsl #1
+    sub         r1, r1, r2, lsl #1
+    vdup.u8     q9, r3
+    ldr         r3, [sp, #4]
 
-    vld1.u8	{d0}, [r0], r2
-    vld1.u8	{d1}, [r1], r2
-    vld1.u8	{d2}, [r0], r2
-    vld1.u8	{d3}, [r1], r2
-    vld1.u8	{d4}, [r0], r2
-    vld1.u8	{d5}, [r1], r2
-    vld1.u8	{d6}, [r0]
-    vld1.u8	{d7}, [r1]
+    vld1.u8 {d0}, [r0], r2
+    vld1.u8 {d1}, [r1], r2
+    vld1.u8 {d2}, [r0], r2
+    vld1.u8 {d3}, [r1], r2
+    vld1.u8 {d4}, [r0], r2
+    vld1.u8 {d5}, [r1], r2
+    vld1.u8 {d6}, [r0]
+    vld1.u8 {d7}, [r1]
 
-    sub			r0, r0, r2, lsl #1
-    sub			r1, r1, r2, lsl #1
+    sub         r0, r0, r2, lsl #1
+    sub         r1, r1, r2, lsl #1
 
-    vld1.s8	{d31}, [r3]
-    vmovl.u8	q14,d31
-    vshl.u64	d29,d28,#8
-    vorr		d28,d29
-    vmov		d29, d28
-    veor		q15, q15
-    vsub.i8	q15,q15,q14
+    vld1.s8 {d31}, [r3]
+    vmovl.u8    q14,d31
+    vshl.u64    d29,d28,#8
+    vorr        d28,d29
+    vmov        d29, d28
+    veor        q15, q15
+    vsub.i8 q15,q15,q14
 
-    MASK_MATRIX	q0, q1, q2, q3, q11, q9, q10
+    MASK_MATRIX q0, q1, q2, q3, q11, q9, q10
 
-    DIFF_LUMA_LT4_P0_Q0	d0, d2, d4, d6, d16, q12, q13
-    DIFF_LUMA_LT4_P0_Q0	d1, d3, d5, d7, d17, q12, q13
-    vmax.s8	q8, q8, q15
-    vmin.s8	q8, q8, q14
+    DIFF_LUMA_LT4_P0_Q0 d0, d2, d4, d6, d16, q12, q13
+    DIFF_LUMA_LT4_P0_Q0 d1, d3, d5, d7, d17, q12, q13
+    vmax.s8 q8, q8, q15
+    vmin.s8 q8, q8, q14
 
-    vand.s8	q8, q8, q10
-    vcge.s8	q14, q14, #0
-    vand.s8	q8, q8, q14
-    EXTRACT_DELTA_INTO_TWO_PART	q8, q10
-    vqadd.u8	q1, q1, q10
-    vqsub.u8	q1, q1, q8
-    vst1.u8	{d2}, [r0], r2
-    vst1.u8	{d3}, [r1], r2
-    vqsub.u8	q2, q2, q10
-    vqadd.u8	q2, q2, q8
-    vst1.u8	{d4}, [r0]
-    vst1.u8	{d5}, [r1]
+    vand.s8 q8, q8, q10
+    vcge.s8 q14, q14, #0
+    vand.s8 q8, q8, q14
+    EXTRACT_DELTA_INTO_TWO_PART q8, q10
+    vqadd.u8    q1, q1, q10
+    vqsub.u8    q1, q1, q8
+    vst1.u8 {d2}, [r0], r2
+    vst1.u8 {d3}, [r1], r2
+    vqsub.u8    q2, q2, q10
+    vqadd.u8    q2, q2, q8
+    vst1.u8 {d4}, [r0]
+    vst1.u8 {d5}, [r1]
 
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN DeblockChromaEq4V_neon
-    vpush	{q4-q5}
+    vpush   {q4-q5}
 
-    vdup.u8	q11, r3
-    ldr			r3, [sp, #32]
+    vdup.u8 q11, r3
+    ldr         r3, [sp, #32]
 
-    sub			r0, r0, r2	, lsl #1
-    sub			r1, r1, r2, lsl #1
-    vdup.u8	q9, r3
-    vld1.u8	{d0}, [r0], r2		//	q0::p1
-    vld1.u8	{d1}, [r1], r2
-    vld1.u8	{d2}, [r0], r2		//	q1::p0
-    vld1.u8	{d3}, [r1], r2
-    vld1.u8	{d4}, [r0], r2		//	q2::q0
-    vld1.u8	{d5}, [r1], r2
-    vld1.u8	{d6}, [r0]				//	q3::q1
-    vld1.u8	{d7}, [r1]
+    sub         r0, r0, r2  , lsl #1
+    sub         r1, r1, r2, lsl #1
+    vdup.u8 q9, r3
+    vld1.u8 {d0}, [r0], r2      //  q0::p1
+    vld1.u8 {d1}, [r1], r2
+    vld1.u8 {d2}, [r0], r2      //  q1::p0
+    vld1.u8 {d3}, [r1], r2
+    vld1.u8 {d4}, [r0], r2      //  q2::q0
+    vld1.u8 {d5}, [r1], r2
+    vld1.u8 {d6}, [r0]              //  q3::q1
+    vld1.u8 {d7}, [r1]
 
-    sub			r0, r0, r2, lsl #1	//	pix = [-1*src_stride]
-    sub			r1, r1, r2, lsl #1
+    sub         r0, r0, r2, lsl #1  //  pix = [-1*src_stride]
+    sub         r1, r1, r2, lsl #1
 
-    MASK_MATRIX	q0, q1, q2, q3, q11, q9, q10
+    MASK_MATRIX q0, q1, q2, q3, q11, q9, q10
 
-    vmov			q11, q10
+    vmov            q11, q10
 
-    DIFF_CHROMA_EQ4_P0Q0		d0, d2, d4, d6, q4, q5, q8, d30, d0		// Cb::p0' q0'
-    DIFF_CHROMA_EQ4_P0Q0		d1, d3, d5, d7, q12, q13, q14, d31, d1	// Cr::p0' q0'
+    DIFF_CHROMA_EQ4_P0Q0        d0, d2, d4, d6, q4, q5, q8, d30, d0     // Cb::p0' q0'
+    DIFF_CHROMA_EQ4_P0Q0        d1, d3, d5, d7, q12, q13, q14, d31, d1  // Cr::p0' q0'
 
-    vbsl.u8	q10, q15, q1
-    vst1.u8	{d20}, [r0], r2
-    vst1.u8	{d21}, [r1], r2
+    vbsl.u8 q10, q15, q1
+    vst1.u8 {d20}, [r0], r2
+    vst1.u8 {d21}, [r1], r2
 
-    vbsl.u8	q11, q0, q2
-    vst1.u8	{d22}, [r0]
-    vst1.u8	{d23}, [r1]
+    vbsl.u8 q11, q0, q2
+    vst1.u8 {d22}, [r0]
+    vst1.u8 {d23}, [r1]
 
-    vpop	{q4-q5}
+    vpop    {q4-q5}
 WELS_ASM_FUNC_END
 
 WELS_ASM_FUNC_BEGIN DeblockChromaLt4H_neon
 
-    vdup.u8	q11, r3
-    ldr			r3, [sp, #0]
+    vdup.u8 q11, r3
+    ldr         r3, [sp, #0]
 
-    sub			r0, r0, #2
-    vdup.u8	q9, r3
-    ldr			r3, [sp, #4]
-    sub			r1, r1, #2
-    vld1.s8	{d31}, [r3]
+    sub         r0, r0, #2
+    vdup.u8 q9, r3
+    ldr         r3, [sp, #4]
+    sub         r1, r1, #2
+    vld1.s8 {d31}, [r3]
 
-    LOAD_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 0
-    LOAD_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 1
-    LOAD_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 2
-    LOAD_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 3
-    LOAD_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 4
-    LOAD_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 5
-    LOAD_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 6
-    LOAD_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 7
-    vswp		q1, q2
-    vswp		d1, d2
-    vswp		d6, d5
+    LOAD_CHROMA_DATA_4  d0, d1, d2, d3, d4, d5, d6, d7, 0
+    LOAD_CHROMA_DATA_4  d0, d1, d2, d3, d4, d5, d6, d7, 1
+    LOAD_CHROMA_DATA_4  d0, d1, d2, d3, d4, d5, d6, d7, 2
+    LOAD_CHROMA_DATA_4  d0, d1, d2, d3, d4, d5, d6, d7, 3
+    LOAD_CHROMA_DATA_4  d0, d1, d2, d3, d4, d5, d6, d7, 4
+    LOAD_CHROMA_DATA_4  d0, d1, d2, d3, d4, d5, d6, d7, 5
+    LOAD_CHROMA_DATA_4  d0, d1, d2, d3, d4, d5, d6, d7, 6
+    LOAD_CHROMA_DATA_4  d0, d1, d2, d3, d4, d5, d6, d7, 7
+    vswp        q1, q2
+    vswp        d1, d2
+    vswp        d6, d5
 
-    vmovl.u8	q14, d31
-    vshl.u64	d29,d28,#8
-    vorr		d28,d29
-    vmov		d29, d28
-    veor		q15, q15
-    vsub.i8	q15,q15,q14
+    vmovl.u8    q14, d31
+    vshl.u64    d29,d28,#8
+    vorr        d28,d29
+    vmov        d29, d28
+    veor        q15, q15
+    vsub.i8 q15,q15,q14
 
-    MASK_MATRIX	q0, q1, q2, q3, q11, q9, q10
+    MASK_MATRIX q0, q1, q2, q3, q11, q9, q10
 
-    DIFF_LUMA_LT4_P0_Q0	d0, d2, d4, d6, d16, q12, q13
-    DIFF_LUMA_LT4_P0_Q0	d1, d3, d5, d7, d17, q12, q13
-    vmax.s8	q8, q8, q15
-    vmin.s8	q8, q8, q14
+    DIFF_LUMA_LT4_P0_Q0 d0, d2, d4, d6, d16, q12, q13
+    DIFF_LUMA_LT4_P0_Q0 d1, d3, d5, d7, d17, q12, q13
+    vmax.s8 q8, q8, q15
+    vmin.s8 q8, q8, q14
 
-    vand.s8	q8, q8, q10
-    vcge.s8	q14, q14, #0
-    vand.s8	q8, q8, q14
-    EXTRACT_DELTA_INTO_TWO_PART	q8, q10
-    vqadd.u8	q1, q1, q10
-    vqsub.u8	q1, q1, q8
-    vqsub.u8	q2, q2, q10
-    vqadd.u8	q2, q2, q8
+    vand.s8 q8, q8, q10
+    vcge.s8 q14, q14, #0
+    vand.s8 q8, q8, q14
+    EXTRACT_DELTA_INTO_TWO_PART q8, q10
+    vqadd.u8    q1, q1, q10
+    vqsub.u8    q1, q1, q8
+    vqsub.u8    q2, q2, q10
+    vqadd.u8    q2, q2, q8
 
-    sub			r0, r0, r2, lsl #3
-    sub			r1, r1, r2, lsl #3
-    vswp		d1, d2
-    vswp		d6, d5
-    vswp		q1, q2
+    sub         r0, r0, r2, lsl #3
+    sub         r1, r1, r2, lsl #3
+    vswp        d1, d2
+    vswp        d6, d5
+    vswp        q1, q2
 
-    STORE_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 0
-    STORE_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 1
-    STORE_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 2
-    STORE_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 3
-    STORE_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 4
-    STORE_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 5
-    STORE_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 6
-    STORE_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 7
+    STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 0
+    STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 1
+    STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 2
+    STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 3
+    STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 4
+    STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 5
+    STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 6
+    STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 7
 
 WELS_ASM_FUNC_END
 
 WELS_ASM_FUNC_BEGIN DeblockChromaEq4H_neon
-    vpush	{q4-q5}
-    vdup.u8	q11, r3
-    ldr			r3, [sp, #32]
+    vpush   {q4-q5}
+    vdup.u8 q11, r3
+    ldr         r3, [sp, #32]
 
-    sub			r0, r0, #2
-    sub			r1, r1, #2
+    sub         r0, r0, #2
+    sub         r1, r1, #2
 
-    LOAD_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 0
-    LOAD_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 1
-    LOAD_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 2
-    LOAD_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 3
-    LOAD_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 4
-    LOAD_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 5
-    LOAD_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 6
-    LOAD_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 7
-    vswp		q1, q2
-    vswp		d1, d2
-    vswp		d6, d5
+    LOAD_CHROMA_DATA_4  d0, d1, d2, d3, d4, d5, d6, d7, 0
+    LOAD_CHROMA_DATA_4  d0, d1, d2, d3, d4, d5, d6, d7, 1
+    LOAD_CHROMA_DATA_4  d0, d1, d2, d3, d4, d5, d6, d7, 2
+    LOAD_CHROMA_DATA_4  d0, d1, d2, d3, d4, d5, d6, d7, 3
+    LOAD_CHROMA_DATA_4  d0, d1, d2, d3, d4, d5, d6, d7, 4
+    LOAD_CHROMA_DATA_4  d0, d1, d2, d3, d4, d5, d6, d7, 5
+    LOAD_CHROMA_DATA_4  d0, d1, d2, d3, d4, d5, d6, d7, 6
+    LOAD_CHROMA_DATA_4  d0, d1, d2, d3, d4, d5, d6, d7, 7
+    vswp        q1, q2
+    vswp        d1, d2
+    vswp        d6, d5
 
-    vdup.u8	q9, r3
-    MASK_MATRIX	q0, q1, q2, q3, q11, q9, q10
-    vmov			q11, q10
+    vdup.u8 q9, r3
+    MASK_MATRIX q0, q1, q2, q3, q11, q9, q10
+    vmov            q11, q10
 
-    DIFF_CHROMA_EQ4_P0Q0		d0, d2, d4, d6, q8, q9, q12, d8, d10
-    DIFF_CHROMA_EQ4_P0Q0		d1, d3, d5, d7, q13, q14, q15, d9, d11
+    DIFF_CHROMA_EQ4_P0Q0        d0, d2, d4, d6, q8, q9, q12, d8, d10
+    DIFF_CHROMA_EQ4_P0Q0        d1, d3, d5, d7, q13, q14, q15, d9, d11
 
-    vbsl.u8	q10, q4, q1
-    vbsl.u8	q11, q5, q2
-    sub			r0, r0, r2, lsl #3	//	pix: 0th row	[-2]
-    sub			r1, r1, r2, lsl #3
+    vbsl.u8 q10, q4, q1
+    vbsl.u8 q11, q5, q2
+    sub         r0, r0, r2, lsl #3  //  pix: 0th row    [-2]
+    sub         r1, r1, r2, lsl #3
 
-    vmov		q1, q10
-    vmov		q2, q11
-    vswp		d1, d2
-    vswp		d6, d5
-    vswp		q1, q2
-    //	Cb:d0d1d2d3, Cr:d4d5d6d7
-    STORE_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 0
-    STORE_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 1
-    STORE_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 2
-    STORE_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 3
-    STORE_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 4
-    STORE_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 5
-    STORE_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 6
-    STORE_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 7
+    vmov        q1, q10
+    vmov        q2, q11
+    vswp        d1, d2
+    vswp        d6, d5
+    vswp        q1, q2
+    //  Cb:d0d1d2d3, Cr:d4d5d6d7
+    STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 0
+    STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 1
+    STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 2
+    STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 3
+    STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 4
+    STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 5
+    STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 6
+    STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 7
 
-    vpop	{q4-q5}
+    vpop    {q4-q5}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN WelsNonZeroCount_neon
 
-    vld1.64	{d0-d2}, [r0]
+    vld1.64 {d0-d2}, [r0]
 
-    vceq.s8	q0, q0, #0
-    vceq.s8	d2, d2, #0
-    vmvn	q0, q0
-    vmvn	d2, d2
-    vabs.s8	q0, q0
-    vabs.s8	d2, d2
+    vceq.s8 q0, q0, #0
+    vceq.s8 d2, d2, #0
+    vmvn    q0, q0
+    vmvn    d2, d2
+    vabs.s8 q0, q0
+    vabs.s8 d2, d2
 
-    vst1.64	{d0-d2}, [r0]
+    vst1.64 {d0-d2}, [r0]
 WELS_ASM_FUNC_END
 
 #ifdef __APPLE__
@@ -851,37 +851,37 @@
 .macro BS_NZC_CHECK
     vld1.8   {d0,d1}, [$0]
     /* Arrenge the input data --- TOP */
-	ands     r6, $1, #2
-	beq      bs_nzc_check_jump0
+    ands     r6, $1, #2
+    beq      bs_nzc_check_jump0
 
     sub      r6, $0, $2, lsl #4
-	sub      r6, $2, lsl #3
+    sub      r6, $2, lsl #3
     add      r6, #12
     vld1.32  d3[1], [r6]
 
 bs_nzc_check_jump0:
     vext.8   q1, q1, q0, #12
-	vadd.u8  $3, q0, q1
+    vadd.u8  $3, q0, q1
 
 
     /* Arrenge the input data --- LEFT */
-	ands     r6, $1, #1
-	beq      bs_nzc_check_jump1
+    ands     r6, $1, #1
+    beq      bs_nzc_check_jump1
 
     sub      r6, $0, #21
-	add      r7, r6, #4
+    add      r7, r6, #4
     vld1.8   d3[4], [r6]
-	add      r6, r7, #4
+    add      r6, r7, #4
     vld1.8   d3[5], [r7]
-	add      r7, r6, #4
+    add      r7, r6, #4
     vld1.8   d3[6], [r6]
     vld1.8   d3[7], [r7]
 
 bs_nzc_check_jump1:
-	vzip.8   d0, d1
-	vzip.8   d0, d1
+    vzip.8   d0, d1
+    vzip.8   d0, d1
     vext.8   q1, q1, q0, #12
-	vadd.u8  $4, q0, q1
+    vadd.u8  $4, q0, q1
 .endm
 
 .macro BS_COMPARE_MV //in: $0,$1(const),$2(const),$3(const),$4(const); out:$5, $6
@@ -888,7 +888,7 @@
     mov       r6, #4
     vabd.s16  q8, $0, $1
     vabd.s16  q9, $1, $2
-	vdup.s16  $0, r6
+    vdup.s16  $0, r6
     vabd.s16  q10, $2, $3
     vabd.s16  q11, $3, $4
 
@@ -897,7 +897,7 @@
     vcge.s16  q10, $0
     vcge.s16  q11, $0
 
-	vpadd.i16 d16, d16, d17
+    vpadd.i16 d16, d16, d17
     vpadd.i16 d17, d18, d19
     vpadd.i16 d18, d20, d21
     vpadd.i16 d19, d22, d23
@@ -910,8 +910,8 @@
     vldm   $0, {q0,q1,q2,q3}
 
     /* Arrenge the input data --- TOP */
-	ands     r6, $1, #2
-	beq      bs_mv_check_jump0
+    ands     r6, $1, #2
+    beq      bs_mv_check_jump0
 
     sub      r6, $0, $2, lsl #6
     add      r6, #48
@@ -921,22 +921,22 @@
     BS_COMPARE_MV  q4, q0, q1, q2, q3, $3, $4
 
     /* Arrenge the input data --- LEFT */
-	ands     r6, $1, #1
-	beq      bs_mv_check_jump1
+    ands     r6, $1, #1
+    beq      bs_mv_check_jump1
 
     sub      r6, $0, #52
     add      r7, r6, #16
-	vld1.32   d8[0], [r6]
-	add      r6, r7, #16
+    vld1.32   d8[0], [r6]
+    add      r6, r7, #16
     vld1.32   d8[1], [r7]
-	add      r7, r6, #16
+    add      r7, r6, #16
     vld1.32   d9[0], [r6]
     vld1.32   d9[1], [r7]
 
 bs_mv_check_jump1:
-	vzip.32   q0, q2
-	vzip.32   q1, q3
-	vzip.32   q0, q1
+    vzip.32   q0, q2
+    vzip.32   q1, q3
+    vzip.32   q0, q1
     vzip.32   q2, q3
     BS_COMPARE_MV  q4, q0, q1, q2, q3, $5, $6
 .endm
@@ -1038,41 +1038,41 @@
 
 WELS_ASM_FUNC_BEGIN DeblockingBSCalcEnc_neon
 
-	stmdb sp!, {r5-r7}
-	vpush {q4}
+    stmdb sp!, {r5-r7}
+    vpush {q4}
 
-	ldr  r5, [sp, #28]	//Save BS to r5
+    ldr  r5, [sp, #28]  //Save BS to r5
 
-	/* Checking the nzc status */
-	BS_NZC_CHECK r0, r2, r3, q14, q15 //q14,q15 save the nzc status
+    /* Checking the nzc status */
+    BS_NZC_CHECK r0, r2, r3, q14, q15 //q14,q15 save the nzc status
 
-	/* For checking bS[I] = 2 */
-	mov      r6, #2
-	vcgt.s8  q14, q14, #0
-	vdup.u8  q0, r6
-	vcgt.s8  q15, q15, #0
+    /* For checking bS[I] = 2 */
+    mov      r6, #2
+    vcgt.s8  q14, q14, #0
+    vdup.u8  q0, r6
+    vcgt.s8  q15, q15, #0
 
-	vand.u8  q14, q14, q0 //q14 save the nzc check result all the time --- for dir is top
-	vand.u8  q15, q15, q0 //q15 save the nzc check result all the time --- for dir is left
+    vand.u8  q14, q14, q0 //q14 save the nzc check result all the time --- for dir is top
+    vand.u8  q15, q15, q0 //q15 save the nzc check result all the time --- for dir is left
 
-	/* Checking the mv status*/
-	BS_MV_CHECK r1, r2, r3, d24, d25, d26, d27//q12, q13 save the mv status
+    /* Checking the mv status*/
+    BS_MV_CHECK r1, r2, r3, d24, d25, d26, d27//q12, q13 save the mv status
 
-	/* For checking bS[I] = 1 */
+    /* For checking bS[I] = 1 */
     mov      r6, #1
-	vdup.u8  q0, r6
+    vdup.u8  q0, r6
 
-	vand.u8  q12, q12, q0 //q12 save the nzc check result all the time --- for dir is top
-	vand.u8  q13, q13, q0 //q13 save the nzc check result all the time --- for dir is left
+    vand.u8  q12, q12, q0 //q12 save the nzc check result all the time --- for dir is top
+    vand.u8  q13, q13, q0 //q13 save the nzc check result all the time --- for dir is left
 
 
-	/* Check bS[I] is '1' or '2' */
-	vmax.u8 q1, q12, q14
-	vmax.u8 q0, q13, q15
+    /* Check bS[I] is '1' or '2' */
+    vmax.u8 q1, q12, q14
+    vmax.u8 q0, q13, q15
 
-	//vstm r5, {q0, q1}
+    //vstm r5, {q0, q1}
     vst1.32 {q0, q1}, [r5]
-	vpop {q4}
-	ldmia sp!, {r5-r7}
+    vpop {q4}
+    ldmia sp!, {r5-r7}
 WELS_ASM_FUNC_END
 #endif
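
For reference, the two routines reformatted above cover the non-zero-coefficient flattening step and the encoder-side boundary-strength (bS) computation for the deblocking filter. WelsNonZeroCount_neon loads 24 bytes of per-4x4-block coefficient counts and clamps each to a 0/1 flag (the vceq/vmvn/vabs sequence), while DeblockingBSCalcEnc_neon appears to mark bS=2 where either adjacent block has residual coefficients and bS=1 where motion vectors differ by at least four quarter-pel units. A rough scalar C sketch of the first routine, with an illustrative name and the 24-byte length inferred from the d0-d2 load:

    #include <stdint.h>

    /* Illustrative sketch of WelsNonZeroCount_neon: collapse each of the
     * 24 per-block non-zero-coefficient counts to a 0/1 flag in place. */
    static void NonZeroCountFlags_c (int8_t* pNonZeroCount) {
        for (int i = 0; i < 24; i++)
            pNonZeroCount[i] = (pNonZeroCount[i] != 0);
    }

The NEON version gets the same effect without branches: compare-equal-to-zero yields 0xFF/0x00, bitwise NOT flips it, and a signed absolute value turns 0xFF (-1) into 1.
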
--- a/codec/common/arm/expand_picture_neon.S
+++ b/codec/common/arm/expand_picture_neon.S
@@ -37,119 +37,119 @@
 
 WELS_ASM_FUNC_BEGIN ExpandPictureLuma_neon
     stmdb sp!, {r4-r8}
-	//Save the dst
-	mov r7, r0
-	mov r8, r3
+    //Save the dst
+    mov r7, r0
+    mov r8, r3
 
-	add r4, r7, r2
-	sub r4, #1
+    add r4, r7, r2
+    sub r4, #1
     //For the left and right expand
 _expand_picture_luma_loop2:
-	sub r5, r7, #32
-	add r6, r4, #1
+    sub r5, r7, #32
+    add r6, r4, #1
 
-	vld1.8 {d0[], d1[]}, [r7], r1
-	vld1.8 {d2[], d3[]}, [r4], r1
+    vld1.8 {d0[], d1[]}, [r7], r1
+    vld1.8 {d2[], d3[]}, [r4], r1
 
-	vst1.8 {q0}, [r5]!
-	vst1.8 {q0}, [r5]
-	vst1.8 {q1}, [r6]!
-	vst1.8 {q1}, [r6]
-	subs r8, #1
-	bne	_expand_picture_luma_loop2
+    vst1.8 {q0}, [r5]!
+    vst1.8 {q0}, [r5]
+    vst1.8 {q1}, [r6]!
+    vst1.8 {q1}, [r6]
+    subs r8, #1
+    bne _expand_picture_luma_loop2
 
-	//for the top and bottom expand
-	add r2, #64
-	sub r0, #32
-	mla r4, r1, r3, r0
-	sub r4, r1
+    //for the top and bottom expand
+    add r2, #64
+    sub r0, #32
+    mla r4, r1, r3, r0
+    sub r4, r1
 _expand_picture_luma_loop0:
-	mov r5, #32
+    mov r5, #32
     mls r5, r5, r1, r0
-	add r6, r4, r1
-	vld1.8 {q0}, [r0]!
-	vld1.8 {q1}, [r4]!
+    add r6, r4, r1
+    vld1.8 {q0}, [r0]!
+    vld1.8 {q1}, [r4]!
 
-	mov r8, #32
+    mov r8, #32
 _expand_picture_luma_loop1:
-	vst1.8 {q0}, [r5], r1
-	vst1.8 {q1}, [r6], r1
-	subs r8, #1
+    vst1.8 {q0}, [r5], r1
+    vst1.8 {q1}, [r6], r1
+    subs r8, #1
     bne _expand_picture_luma_loop1
 
-	subs r2, #16
-	bne	_expand_picture_luma_loop0
+    subs r2, #16
+    bne _expand_picture_luma_loop0
 
     //vldreq.32 d0, [r0]
 
-	ldmia sp!, {r4-r8}
+    ldmia sp!, {r4-r8}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN ExpandPictureChroma_neon
     stmdb sp!, {r4-r9}
-	//Save the dst
-	mov r7, r0
-	mov r8, r3
+    //Save the dst
+    mov r7, r0
+    mov r8, r3
 
-	add r4, r7, r2
-	sub r4, #1
+    add r4, r7, r2
+    sub r4, #1
     //For the left and right expand
 _expand_picture_chroma_loop2:
-	sub r5, r7, #16
-	add r6, r4, #1
+    sub r5, r7, #16
+    add r6, r4, #1
 
-	vld1.8 {d0[], d1[]}, [r7], r1
-	vld1.8 {d2[], d3[]}, [r4], r1
+    vld1.8 {d0[], d1[]}, [r7], r1
+    vld1.8 {d2[], d3[]}, [r4], r1
 
-	vst1.8 {q0}, [r5]
-	vst1.8 {q1}, [r6]
-	subs r8, #1
-	bne	_expand_picture_chroma_loop2
+    vst1.8 {q0}, [r5]
+    vst1.8 {q1}, [r6]
+    subs r8, #1
+    bne _expand_picture_chroma_loop2
 
-	//for the top and bottom expand
-	add r2, #32
-        mov r9, r2
-        bic r2, #15
-	sub r0, #16
-	mla r4, r1, r3, r0
-	sub r4, r1
+    //for the top and bottom expand
+    add r2, #32
+    mov r9, r2
+    bic r2, #15
+    sub r0, #16
+    mla r4, r1, r3, r0
+    sub r4, r1
 _expand_picture_chroma_loop0:
-	mov r5, #16
-        mls r5, r5, r1, r0
-	add r6, r4, r1
-	vld1.8 {q0}, [r0]!
-	vld1.8 {q1}, [r4]!
+    mov r5, #16
+    mls r5, r5, r1, r0
+    add r6, r4, r1
+    vld1.8 {q0}, [r0]!
+    vld1.8 {q1}, [r4]!
 
-	mov r8, #16
+    mov r8, #16
 _expand_picture_chroma_loop1:
-	vst1.8 {q0}, [r5], r1
-	vst1.8 {q1}, [r6], r1
-	subs r8, #1
-        bne _expand_picture_chroma_loop1
+    vst1.8 {q0}, [r5], r1
+    vst1.8 {q1}, [r6], r1
+    subs r8, #1
+    bne _expand_picture_chroma_loop1
 
-	subs r2, #16
-	bne	_expand_picture_chroma_loop0
+    subs r2, #16
+    bne _expand_picture_chroma_loop0
 
     //vldreq.32 d0, [r0]
 
-        and r9, #15
-        cmp r9, #8
-        bne _expand_picture_chroma_end
-	mov r5, #16
-        mls r5, r5, r1, r0
-	add r6, r4, r1
-	vld1.8 {d0}, [r0]!
-	vld1.8 {d2}, [r4]!
-	mov r8, #16
+    and r9, #15
+    cmp r9, #8
+    bne _expand_picture_chroma_end
+    mov r5, #16
+    mls r5, r5, r1, r0
+    add r6, r4, r1
+    vld1.8 {d0}, [r0]!
+    vld1.8 {d2}, [r4]!
+    mov r8, #16
 _expand_picture_chroma_loop3:
-	vst1.8 {d0}, [r5], r1
-	vst1.8 {d2}, [r6], r1
-	subs r8, #1
-        bne _expand_picture_chroma_loop3
+    vst1.8 {d0}, [r5], r1
+    vst1.8 {d2}, [r6], r1
+    subs r8, #1
+    bne _expand_picture_chroma_loop3
 _expand_picture_chroma_end:
 
-	ldmia sp!, {r4-r9}
+    ldmia sp!, {r4-r9}
 WELS_ASM_FUNC_END
 
 #endif
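
The two functions above pad a reconstructed plane outward by replicating its border pixels; the 32- and 16-byte offsets in the code suggest a 32-pixel border for luma and a 16-pixel border for chroma, with an extra half-width pass when the chroma width is not a multiple of 16. A rough C sketch of the same padding, assuming the destination buffer already has iPadding pixels allocated on every side (names are illustrative):

    #include <stdint.h>
    #include <string.h>

    /* Illustrative sketch: replicate edge pixels iPadding columns/rows outward. */
    static void ExpandPlane_c (uint8_t* pPic, int iStride, int iWidth, int iHeight, int iPadding) {
        /* Left and right borders: repeat the first/last pixel of every row. */
        for (int y = 0; y < iHeight; y++) {
            uint8_t* pRow = pPic + y * iStride;
            memset (pRow - iPadding, pRow[0], iPadding);
            memset (pRow + iWidth, pRow[iWidth - 1], iPadding);
        }
        /* Top and bottom borders: repeat the first/last padded row. */
        const uint8_t* pTop    = pPic - iPadding;
        const uint8_t* pBottom = pPic + (iHeight - 1) * iStride - iPadding;
        for (int y = 1; y <= iPadding; y++) {
            memcpy (pPic - iPadding - y * iStride, pTop, iWidth + 2 * iPadding);
            memcpy (pPic - iPadding + (iHeight - 1 + y) * iStride, pBottom, iWidth + 2 * iPadding);
        }
    }

Under that reading, ExpandPictureLuma_neon would correspond to iPadding = 32 and ExpandPictureChroma_neon to iPadding = 16.
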
--- a/codec/common/arm/mc_neon.S
+++ b/codec/common/arm/mc_neon.S
@@ -35,2176 +35,2176 @@
 #include "arm_arch_common_macro.S"
 
 #ifdef __APPLE__
-.macro	AVERAGE_TWO_8BITS
-//	{	// input:dst_d, src_d A and B; working: q13
-    vaddl.u8	q13, $2, $1
-    vrshrn.u16		$0, q13, #1
-//	}
+.macro AVERAGE_TWO_8BITS
+//  {   // input:dst_d, src_d A and B; working: q13
+    vaddl.u8    q13, $2, $1
+    vrshrn.u16      $0, q13, #1
+//  }
 .endm
 
-.macro	FILTER_6TAG_8BITS
-//	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
-    vaddl.u8	q12, $0, $5	//q12=src[-2]+src[3]
-    vaddl.u8	q13, $2, $3	//src[0]+src[1]
-    vmla.u16	q12, q13, $7	//q12 += 20*(src[0]+src[1]), 2 cycles
-    vaddl.u8	q13, $1, $4	//src[-1]+src[2]
-    vmls.s16	q12, q13, $8	//q12 -= 5*(src[-1]+src[2]), 2 cycles
-    vqrshrun.s16		$6, q12, #5
-//	}
+.macro FILTER_6TAG_8BITS
+//  {   // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
+    vaddl.u8    q12, $0, $5 //q12=src[-2]+src[3]
+    vaddl.u8    q13, $2, $3 //src[0]+src[1]
+    vmla.u16    q12, q13, $7    //q12 += 20*(src[0]+src[1]), 2 cycles
+    vaddl.u8    q13, $1, $4 //src[-1]+src[2]
+    vmls.s16    q12, q13, $8    //q12 -= 5*(src[-1]+src[2]), 2 cycles
+    vqrshrun.s16        $6, q12, #5
+//  }
 .endm
 
-.macro	FILTER_SINGLE_TAG_8BITS		// when width=17/9, used
-//	{	// input: src_d{Y[0][1][2][3][4][5]X, the even of working_q2},
-    vrev64.8	$2, $0				// X[5][4][3][2][1][0]O
-    vaddl.u8	$3, $0, $2			// each 16bits, *[50][41][32][23][14][05]*
-    vmul.s16	$0, $2, $1			// 0+1*[50]-5*[41]+20[32]
-    vpadd.s16	$0, $0, $0
-    vpadd.s16	$0, $0, $0
-    vqrshrun.s16	$0, $4, #5
-//	}
+.macro FILTER_SINGLE_TAG_8BITS      // when width=17/9, used
+//  {   // input: src_d{Y[0][1][2][3][4][5]X, the even of working_q2},
+    vrev64.8    $2, $0              // X[5][4][3][2][1][0]O
+    vaddl.u8    $3, $0, $2          // each 16bits, *[50][41][32][23][14][05]*
+    vmul.s16    $0, $2, $1          // 0+1*[50]-5*[41]+20[32]
+    vpadd.s16   $0, $0, $0
+    vpadd.s16   $0, $0, $0
+    vqrshrun.s16    $0, $4, #5
+//  }
 .endm
 
-.macro	FILTER_6TAG_8BITS_AVERAGE_WITH_0
-//	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
-    vaddl.u8	q12, $0, $5	//q12=src[-2]+src[3]
-    vaddl.u8	q13, $2, $3	//src[0]+src[1]
-    vmla.u16	q12, q13, $7	//q12 += 20*(src[0]+src[1]), 2 cycles
-    vaddl.u8	q13, $1, $4	//src[-1]+src[2]
-    vmls.s16	q12, q13, $8	//q12 -= 5*(src[-1]+src[2]), 2 cycles
-    vqrshrun.s16		$6, q12, #5
-    vaddl.u8	q13, $2, $6
-    vrshrn.u16		$6, q13, #1
-//	}
+.macro FILTER_6TAG_8BITS_AVERAGE_WITH_0
+//  {   // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
+    vaddl.u8    q12, $0, $5 //q12=src[-2]+src[3]
+    vaddl.u8    q13, $2, $3 //src[0]+src[1]
+    vmla.u16    q12, q13, $7    //q12 += 20*(src[0]+src[1]), 2 cycles
+    vaddl.u8    q13, $1, $4 //src[-1]+src[2]
+    vmls.s16    q12, q13, $8    //q12 -= 5*(src[-1]+src[2]), 2 cycles
+    vqrshrun.s16        $6, q12, #5
+    vaddl.u8    q13, $2, $6
+    vrshrn.u16      $6, q13, #1
+//  }
 .endm
 
-.macro	FILTER_6TAG_8BITS_AVERAGE_WITH_1
-//	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
-    vaddl.u8	q12, $0, $5	//q12=src[-2]+src[3]
-    vaddl.u8	q13, $2, $3	//src[0]+src[1]
-    vmla.u16	q12, q13, $7	//q12 += 20*(src[0]+src[1]), 2 cycles
-    vaddl.u8	q13, $1, $4	//src[-1]+src[2]
-    vmls.s16	q12, q13, $8	//q12 -= 5*(src[-1]+src[2]), 2 cycles
-    vqrshrun.s16		$6, q12, #5
-    vaddl.u8	q13, $3, $6
-    vrshrn.u16		$6, q13, #1
-//	}
+.macro FILTER_6TAG_8BITS_AVERAGE_WITH_1
+//  {   // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
+    vaddl.u8    q12, $0, $5 //q12=src[-2]+src[3]
+    vaddl.u8    q13, $2, $3 //src[0]+src[1]
+    vmla.u16    q12, q13, $7    //q12 += 20*(src[0]+src[1]), 2 cycles
+    vaddl.u8    q13, $1, $4 //src[-1]+src[2]
+    vmls.s16    q12, q13, $8    //q12 -= 5*(src[-1]+src[2]), 2 cycles
+    vqrshrun.s16        $6, q12, #5
+    vaddl.u8    q13, $3, $6
+    vrshrn.u16      $6, q13, #1
+//  }
 .endm
 
-.macro	FILTER_6TAG_8BITS_TO_16BITS
-//	{	// input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:q13
-    vaddl.u8	$6, $0, $5		//dst_q=src[-2]+src[3]
-    vaddl.u8	q13, $2, $3	//src[0]+src[1]
-    vmla.u16	$6, q13, $7	//dst_q += 20*(src[0]+src[1]), 2 cycles
-    vaddl.u8	q13, $1, $4	//src[-1]+src[2]
-    vmls.s16	$6, q13, $8	//dst_q -= 5*(src[-1]+src[2]), 2 cycles
-//	}
+.macro FILTER_6TAG_8BITS_TO_16BITS
+//  {   // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:q13
+    vaddl.u8    $6, $0, $5      //dst_q=src[-2]+src[3]
+    vaddl.u8    q13, $2, $3 //src[0]+src[1]
+    vmla.u16    $6, q13, $7 //dst_q += 20*(src[0]+src[1]), 2 cycles
+    vaddl.u8    q13, $1, $4 //src[-1]+src[2]
+    vmls.s16    $6, q13, $8 //dst_q -= 5*(src[-1]+src[2]), 2 cycles
+//  }
 .endm
 
-.macro	FILTER_3_IN_16BITS_TO_8BITS
-//	{	// input:a, b, c, dst_d;
-    vsub.s16	$0, $0, $1			//a-b
-    vshr.s16	$0, $0, #2			//(a-b)/4
-    vsub.s16	$0, $0, $1			//(a-b)/4-b
-    vadd.s16	$0, $0, $2			//(a-b)/4-b+c
-    vshr.s16	$0, $0, #2			//((a-b)/4-b+c)/4
-    vadd.s16	$0, $0, $2			//((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
-    vqrshrun.s16	$3, $0, #6		//(+32)>>6
-//	}
+.macro FILTER_3_IN_16BITS_TO_8BITS
+//  {   // input:a, b, c, dst_d;
+    vsub.s16    $0, $0, $1          //a-b
+    vshr.s16    $0, $0, #2          //(a-b)/4
+    vsub.s16    $0, $0, $1          //(a-b)/4-b
+    vadd.s16    $0, $0, $2          //(a-b)/4-b+c
+    vshr.s16    $0, $0, #2          //((a-b)/4-b+c)/4
+    vadd.s16    $0, $0, $2          //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+    vqrshrun.s16    $3, $0, #6      //(+32)>>6
+//  }
 .endm
 
-.macro	UNPACK_2_16BITS_TO_ABC
-//	{	// input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;
-    vext.16	$4, $0, $1, #2		//src[0]
-    vext.16	$3, $0, $1, #3		//src[1]
-    vadd.s16	$4, $3					//c=src[0]+src[1]
+.macro UNPACK_2_16BITS_TO_ABC
+//  {   // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;
+    vext.16 $4, $0, $1, #2      //src[0]
+    vext.16 $3, $0, $1, #3      //src[1]
+    vadd.s16    $4, $3                  //c=src[0]+src[1]
 
-    vext.16	$3, $0, $1, #1		//src[-1]
-    vext.16	$2, $0, $1, #4		//src[2]
-    vadd.s16	$3, $2					//b=src[-1]+src[2]
+    vext.16 $3, $0, $1, #1      //src[-1]
+    vext.16 $2, $0, $1, #4      //src[2]
+    vadd.s16    $3, $2                  //b=src[-1]+src[2]
 
-    vext.16	$2, $0, $1, #5		//src[3]
-    vadd.s16	$2, $0					//a=src[-2]+src[3]
-//	}
+    vext.16 $2, $0, $1, #5      //src[3]
+    vadd.s16    $2, $0                  //a=src[-2]+src[3]
+//  }
 .endm
 
-.macro	UNPACK_1_IN_8x16BITS_TO_8BITS
-//	{	// each 16bits; input: d_dst, d_src[0:3] (even), d_src[4:5]+%% (odd)
-    vext.16	$3, $3, $3, #7	// 0x????, [0][1][2][3][4][5],
-    vrev64.16	$1, $1
-    vadd.u16	$2, $1				// C[2+3],B[1+4],A[0+5],
-    vshr.s64	$1, $2, #16
-    vshr.s64	$0, $2, #32		// Output: C $2, B $1, A $0
+.macro UNPACK_1_IN_8x16BITS_TO_8BITS
+//  {   // each 16bits; input: d_dst, d_src[0:3] (even), d_src[4:5]+%% (odd)
+    vext.16 $3, $3, $3, #7  // 0x????, [0][1][2][3][4][5],
+    vrev64.16   $1, $1
+    vadd.u16    $2, $1              // C[2+3],B[1+4],A[0+5],
+    vshr.s64    $1, $2, #16
+    vshr.s64    $0, $2, #32     // Output: C $2, B $1, A $0
 
-    vsub.s16	$0, $0, $1			//a-b
-    vshr.s16	$0, $0, #2			//(a-b)/4
-    vsub.s16	$0, $0, $1			//(a-b)/4-b
-    vadd.s16	$0, $0, $2			//(a-b)/4-b+c
-    vshr.s16	$0, $0, #2			//((a-b)/4-b+c)/4
-    vadd.s16	$1, $0, $2			//((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
-    vqrshrun.s16	$0, $3, #6		//(+32)>>6
-//	}
+    vsub.s16    $0, $0, $1          //a-b
+    vshr.s16    $0, $0, #2          //(a-b)/4
+    vsub.s16    $0, $0, $1          //(a-b)/4-b
+    vadd.s16    $0, $0, $2          //(a-b)/4-b+c
+    vshr.s16    $0, $0, #2          //((a-b)/4-b+c)/4
+    vadd.s16    $1, $0, $2          //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+    vqrshrun.s16    $0, $3, #6      //(+32)>>6
+//  }
 .endm
 #else
-.macro	AVERAGE_TWO_8BITS arg0, arg1, arg2
-//	{	// input:dst_d, src_d A and B; working: q13
-    vaddl.u8	q13, \arg2, \arg1
-    vrshrn.u16		\arg0, q13, #1
-//	}
+.macro AVERAGE_TWO_8BITS arg0, arg1, arg2
+//  {   // input:dst_d, src_d A and B; working: q13
+    vaddl.u8    q13, \arg2, \arg1
+    vrshrn.u16      \arg0, q13, #1
+//  }
 .endm
 
-.macro	FILTER_6TAG_8BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-//	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
-    vaddl.u8	q12, \arg0, \arg5	//q12=src[-2]+src[3]
-    vaddl.u8	q13, \arg2, \arg3	//src[0]+src[1]
-    vmla.u16	q12, q13, \arg7	//q12 += 20*(src[0]+src[1]), 2 cycles
-    vaddl.u8	q13, \arg1, \arg4	//src[-1]+src[2]
-    vmls.s16	q12, q13, \arg8	//q12 -= 5*(src[-1]+src[2]), 2 cycles
-    vqrshrun.s16		\arg6, q12, #5
-//	}
+.macro FILTER_6TAG_8BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+//  {   // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
+    vaddl.u8    q12, \arg0, \arg5   //q12=src[-2]+src[3]
+    vaddl.u8    q13, \arg2, \arg3   //src[0]+src[1]
+    vmla.u16    q12, q13, \arg7 //q12 += 20*(src[0]+src[1]), 2 cycles
+    vaddl.u8    q13, \arg1, \arg4   //src[-1]+src[2]
+    vmls.s16    q12, q13, \arg8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
+    vqrshrun.s16        \arg6, q12, #5
+//  }
 .endm
 
-.macro	FILTER_SINGLE_TAG_8BITS arg0, arg1,arg2, arg3, arg4,arg5		// when width=17/9, used
-//	{	// input: src_d{Y[0][1][2][3][4][5]X, the even of working_q2}
-    vrev64.8	\arg2, \arg0				// X[5][4][3][2][1][0]O
-    vaddl.u8	\arg3, \arg0, \arg2			// each 16bits, *[50][41][32][23][14][05]*
-    vmul.s16	\arg0, \arg2, \arg1			// 0+1*[50]-5*[41]+20[32]
-    vpadd.s16	\arg0, \arg0, \arg0
-    vpadd.s16	\arg0, \arg0, \arg0
-    vqrshrun.s16	\arg0, \arg4, #5
-//	}
+.macro FILTER_SINGLE_TAG_8BITS arg0, arg1,arg2, arg3, arg4,arg5     // when width=17/9, used
+//  {   // input: src_d{Y[0][1][2][3][4][5]X, the even of working_q2}
+    vrev64.8    \arg2, \arg0                // X[5][4][3][2][1][0]O
+    vaddl.u8    \arg3, \arg0, \arg2         // each 16bits, *[50][41][32][23][14][05]*
+    vmul.s16    \arg0, \arg2, \arg1         // 0+1*[50]-5*[41]+20[32]
+    vpadd.s16   \arg0, \arg0, \arg0
+    vpadd.s16   \arg0, \arg0, \arg0
+    vqrshrun.s16    \arg0, \arg4, #5
+//  }
 .endm
 
-.macro	FILTER_6TAG_8BITS_AVERAGE_WITH_0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-//	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
-    vaddl.u8	q12, \arg0, \arg5	//q12=src[-2]+src[3]
-    vaddl.u8	q13, \arg2, \arg3	//src[0]+src[1]
-    vmla.u16	q12, q13, \arg7	//q12 += 20*(src[0]+src[1]), 2 cycles
-    vaddl.u8	q13, \arg1, \arg4	//src[-1]+src[2]
-    vmls.s16	q12, q13, \arg8	//q12 -= 5*(src[-1]+src[2]), 2 cycles
-    vqrshrun.s16		\arg6, q12, #5
-    vaddl.u8	q13, \arg2, \arg6
-    vrshrn.u16		\arg6, q13, #1
-//	}
+.macro FILTER_6TAG_8BITS_AVERAGE_WITH_0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+//  {   // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
+    vaddl.u8    q12, \arg0, \arg5   //q12=src[-2]+src[3]
+    vaddl.u8    q13, \arg2, \arg3   //src[0]+src[1]
+    vmla.u16    q12, q13, \arg7 //q12 += 20*(src[0]+src[1]), 2 cycles
+    vaddl.u8    q13, \arg1, \arg4   //src[-1]+src[2]
+    vmls.s16    q12, q13, \arg8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
+    vqrshrun.s16        \arg6, q12, #5
+    vaddl.u8    q13, \arg2, \arg6
+    vrshrn.u16      \arg6, q13, #1
+//  }
 .endm
 
-.macro	FILTER_6TAG_8BITS_AVERAGE_WITH_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-//	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
-    vaddl.u8	q12, \arg0, \arg5	//q12=src[-2]+src[3]
-    vaddl.u8	q13, \arg2, \arg3	//src[0]+src[1]
-    vmla.u16	q12, q13, \arg7	//q12 += 20*(src[0]+src[1]), 2 cycles
-    vaddl.u8	q13, \arg1, \arg4	//src[-1]+src[2]
-    vmls.s16	q12, q13, \arg8	//q12 -= 5*(src[-1]+src[2]), 2 cycles
-    vqrshrun.s16		\arg6, q12, #5
-    vaddl.u8	q13, \arg3, \arg6
-    vrshrn.u16		\arg6, q13, #1
-//	}
+.macro FILTER_6TAG_8BITS_AVERAGE_WITH_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+//  {   // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
+    vaddl.u8    q12, \arg0, \arg5   //q12=src[-2]+src[3]
+    vaddl.u8    q13, \arg2, \arg3   //src[0]+src[1]
+    vmla.u16    q12, q13, \arg7 //q12 += 20*(src[0]+src[1]), 2 cycles
+    vaddl.u8    q13, \arg1, \arg4   //src[-1]+src[2]
+    vmls.s16    q12, q13, \arg8 //q12 -= 5*(src[-1]+src[2]), 2 cycles
+    vqrshrun.s16        \arg6, q12, #5
+    vaddl.u8    q13, \arg3, \arg6
+    vrshrn.u16      \arg6, q13, #1
+//  }
 .endm
 
-.macro	FILTER_6TAG_8BITS_TO_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-//	{	// input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:q13
-    vaddl.u8	\arg6, \arg0, \arg5		//dst_q=src[-2]+src[3]
-    vaddl.u8	q13, \arg2, \arg3	//src[0]+src[1]
-    vmla.u16	\arg6, q13, \arg7	//dst_q += 20*(src[0]+src[1]), 2 cycles
-    vaddl.u8	q13, \arg1, \arg4	//src[-1]+src[2]
-    vmls.s16	\arg6, q13, \arg8	//dst_q -= 5*(src[-1]+src[2]), 2 cycles
-//	}
+.macro FILTER_6TAG_8BITS_TO_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+//  {   // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:q13
+    vaddl.u8    \arg6, \arg0, \arg5     //dst_q=src[-2]+src[3]
+    vaddl.u8    q13, \arg2, \arg3   //src[0]+src[1]
+    vmla.u16    \arg6, q13, \arg7   //dst_q += 20*(src[0]+src[1]), 2 cycles
+    vaddl.u8    q13, \arg1, \arg4   //src[-1]+src[2]
+    vmls.s16    \arg6, q13, \arg8   //dst_q -= 5*(src[-1]+src[2]), 2 cycles
+//  }
 .endm
 
-.macro	FILTER_3_IN_16BITS_TO_8BITS arg0, arg1, arg2, arg3
-//	{	// input:a, b, c, dst_d;
-    vsub.s16	\arg0, \arg0, \arg1			//a-b
-    vshr.s16	\arg0, \arg0, #2			//(a-b)/4
-    vsub.s16	\arg0, \arg0, \arg1			//(a-b)/4-b
-    vadd.s16	\arg0, \arg0, \arg2			//(a-b)/4-b+c
-    vshr.s16	\arg0, \arg0, #2			//((a-b)/4-b+c)/4
-    vadd.s16	\arg0, \arg0, \arg2			//((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
-    vqrshrun.s16	\arg3, \arg0, #6		//(+32)>>6
-//	}
+.macro FILTER_3_IN_16BITS_TO_8BITS arg0, arg1, arg2, arg3
+//  {   // input:a, b, c, dst_d;
+    vsub.s16    \arg0, \arg0, \arg1         //a-b
+    vshr.s16    \arg0, \arg0, #2            //(a-b)/4
+    vsub.s16    \arg0, \arg0, \arg1         //(a-b)/4-b
+    vadd.s16    \arg0, \arg0, \arg2         //(a-b)/4-b+c
+    vshr.s16    \arg0, \arg0, #2            //((a-b)/4-b+c)/4
+    vadd.s16    \arg0, \arg0, \arg2         //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+    vqrshrun.s16    \arg3, \arg0, #6        //(+32)>>6
+//  }
 .endm
 
-.macro	UNPACK_2_16BITS_TO_ABC arg0, arg1, arg2, arg3, arg4
-//	{	// input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;
-    vext.16	\arg4, \arg0, \arg1, #2		//src[0]
-    vext.16	\arg3, \arg0, \arg1, #3		//src[1]
-    vadd.s16	\arg4, \arg3					//c=src[0]+src[1]
+.macro UNPACK_2_16BITS_TO_ABC arg0, arg1, arg2, arg3, arg4
+//  {   // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;
+    vext.16 \arg4, \arg0, \arg1, #2     //src[0]
+    vext.16 \arg3, \arg0, \arg1, #3     //src[1]
+    vadd.s16    \arg4, \arg3                    //c=src[0]+src[1]
 
-    vext.16	\arg3, \arg0, \arg1, #1		//src[-1]
-    vext.16	\arg2, \arg0, \arg1, #4		//src[2]
-    vadd.s16	\arg3,\arg2					//b=src[-1]+src[2]
+    vext.16 \arg3, \arg0, \arg1, #1     //src[-1]
+    vext.16 \arg2, \arg0, \arg1, #4     //src[2]
+    vadd.s16    \arg3,\arg2                 //b=src[-1]+src[2]
 
-    vext.16	\arg2, \arg0, \arg1, #5		//src[3]
-    vadd.s16	\arg2, \arg0					//a=src[-2]+src[3]
-//	}
+    vext.16 \arg2, \arg0, \arg1, #5     //src[3]
+    vadd.s16    \arg2, \arg0                    //a=src[-2]+src[3]
+//  }
 .endm
 
-.macro	UNPACK_1_IN_8x16BITS_TO_8BITS arg0, arg1,arg2, arg3
-//	{	// each 16bits; input: d_dst, d_src[0:3] (even), d_src[4:5]+%% (odd)
-    vext.16	\arg3, \arg3, \arg3, #7	// 0x????, [0][1][2][3][4][5]
-    vrev64.16	\arg1, \arg1
-    vadd.u16	\arg2, \arg1				// C[2+3],B[1+4],A[0+5]
-    vshr.s64	\arg1, \arg2, #16
-    vshr.s64	\arg0, \arg2, #32		// Output: C \arg2, B \arg1, A \arg0
+.macro UNPACK_1_IN_8x16BITS_TO_8BITS arg0, arg1,arg2, arg3
+//  {   // each 16bits; input: d_dst, d_src[0:3] (even), d_src[4:5]+%% (odd)
+    vext.16 \arg3, \arg3, \arg3, #7 // 0x????, [0][1][2][3][4][5]
+    vrev64.16   \arg1, \arg1
+    vadd.u16    \arg2, \arg1                // C[2+3],B[1+4],A[0+5]
+    vshr.s64    \arg1, \arg2, #16
+    vshr.s64    \arg0, \arg2, #32       // Output: C \arg2, B \arg1, A \arg0
 
-    vsub.s16	\arg0, \arg0, \arg1			//a-b
-    vshr.s16	\arg0, \arg0, #2			//(a-b)/4
-    vsub.s16	\arg0, \arg0, \arg1			//(a-b)/4-b
-    vadd.s16	\arg0, \arg0, \arg2			//(a-b)/4-b+c
-    vshr.s16	\arg0, \arg0, #2			//((a-b)/4-b+c)/4
-    vadd.s16	\arg1, \arg0, \arg2			//((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
-    vqrshrun.s16	\arg0, \arg3, #6		//(+32)>>6
-//	}
+    vsub.s16    \arg0, \arg0, \arg1         //a-b
+    vshr.s16    \arg0, \arg0, #2            //(a-b)/4
+    vsub.s16    \arg0, \arg0, \arg1         //(a-b)/4-b
+    vadd.s16    \arg0, \arg0, \arg2         //(a-b)/4-b+c
+    vshr.s16    \arg0, \arg0, #2            //((a-b)/4-b+c)/4
+    vadd.s16    \arg1, \arg0, \arg2         //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+    vqrshrun.s16    \arg0, \arg3, #6        //(+32)>>6
+//  }
 .endm
 #endif
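
The macros above implement the H.264 six-tap luma interpolation filter (1, -5, 20, 20, -5, 1). FILTER_6TAG_8BITS produces one half-pel sample directly; the _AVERAGE_WITH_0/_1 variants additionally take a rounded average of that result with src[0] or src[1] for the quarter-pel positions; FILTER_6TAG_8BITS_TO_16BITS and FILTER_3_IN_16BITS_TO_8BITS split the same filter into a 16-bit intermediate pass for the two-dimensional (centre) positions. A scalar C sketch of one filtered pixel, matching the rounding and unsigned saturation that vqrshrun.s16 #5 performs (the function name is illustrative):

    #include <stdint.h>

    /* Illustrative sketch of the 6-tap filter evaluated by FILTER_6TAG_8BITS. */
    static uint8_t Filter6Tap_c (const uint8_t* pSrc) {
        int a = pSrc[-2] + pSrc[3];   /* outer taps:   1 */
        int b = pSrc[-1] + pSrc[2];   /* middle taps: -5 */
        int c = pSrc[0]  + pSrc[1];   /* inner taps:  20 */
        int v = (a - 5 * b + 20 * c + 16) >> 5;      /* add rounding bias, shift by 5 */
        return (uint8_t) (v < 0 ? 0 : (v > 255 ? 255 : v));  /* saturate to [0,255] */
    }

The grouping into a, b and c is exactly what the vaddl.u8/vmla/vmls sequence computes before the final narrowing shift.
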
 
 WELS_ASM_FUNC_BEGIN McHorVer20WidthEq16_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
+    push        {r4}
+    ldr         r4, [sp, #4]
 
-	sub			r0, #2
-	vmov.u16	q14, #0x0014				// 20
-	vshr.u16	q15, q14, #2				// 5
+    sub         r0, #2
+    vmov.u16    q14, #0x0014                // 20
+    vshr.u16    q15, q14, #2                // 5
 
 w16_h_mc_luma_loop:
-	vld1.u8	{d0,d1,d2}, [r0], r1	//only use 21(16+5); q0=src[-2]
-	pld			[r0]
-	pld			[r0, #16]
+    vld1.u8 {d0,d1,d2}, [r0], r1    //only use 21(16+5); q0=src[-2]
+    pld         [r0]
+    pld         [r0, #16]
 
-	vext.8		q2, q0, q1, #1		//q2=src[-1]
-	vext.8		q3, q0, q1, #2		//q3=src[0]
-	vext.8		q8, q0, q1, #3		//q8=src[1]
-	vext.8		q9, q0, q1, #4		//q9=src[2]
-	vext.8		q10, q0, q1, #5		//q10=src[3]
+    vext.8      q2, q0, q1, #1      //q2=src[-1]
+    vext.8      q3, q0, q1, #2      //q3=src[0]
+    vext.8      q8, q0, q1, #3      //q8=src[1]
+    vext.8      q9, q0, q1, #4      //q9=src[2]
+    vext.8      q10, q0, q1, #5     //q10=src[3]
 
-	FILTER_6TAG_8BITS 	d0, d4, d6, d16, d18, d20, d2, q14, q15
+    FILTER_6TAG_8BITS   d0, d4, d6, d16, d18, d20, d2, q14, q15
 
-	FILTER_6TAG_8BITS 	d1, d5, d7, d17, d19, d21, d3, q14, q15
+    FILTER_6TAG_8BITS   d1, d5, d7, d17, d19, d21, d3, q14, q15
 
-	sub		r4, #1
-	vst1.u8	{d2, d3}, [r2], r3		//write 16Byte
+    sub     r4, #1
+    vst1.u8 {d2, d3}, [r2], r3      //write 16Byte
 
-	cmp		r4, #0
-	bne		w16_h_mc_luma_loop
-	pop		{r4}
+    cmp     r4, #0
+    bne     w16_h_mc_luma_loop
+    pop     {r4}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN McHorVer20WidthEq8_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
+    push        {r4}
+    ldr         r4, [sp, #4]
 
-	sub			r0, #2
-	vmov.u16	q14, #0x0014				// 20
-	vshr.u16	q15, q14, #2				// 5
+    sub         r0, #2
+    vmov.u16    q14, #0x0014                // 20
+    vshr.u16    q15, q14, #2                // 5
 
 w8_h_mc_luma_loop:
-	vld1.u8	{d0,d1}, [r0], r1	//only use 13(8+5); q0=src[-2]
-	pld			[r0]
+    vld1.u8 {d0,d1}, [r0], r1   //only use 13(8+5); q0=src[-2]
+    pld         [r0]
 
-	vext.8		d2, d0, d1, #1		//d2=src[-1]
-	vext.8		d3, d0, d1, #2		//d3=src[0]
-	vext.8		d4, d0, d1, #3		//d4=src[1]
-	vext.8		d5, d0, d1, #4		//d5=src[2]
-	vext.8		d6, d0, d1, #5		//d6=src[3]
+    vext.8      d2, d0, d1, #1      //d2=src[-1]
+    vext.8      d3, d0, d1, #2      //d3=src[0]
+    vext.8      d4, d0, d1, #3      //d4=src[1]
+    vext.8      d5, d0, d1, #4      //d5=src[2]
+    vext.8      d6, d0, d1, #5      //d6=src[3]
 
-	FILTER_6TAG_8BITS 	d0, d2, d3, d4, d5, d6, d1, q14, q15
+    FILTER_6TAG_8BITS   d0, d2, d3, d4, d5, d6, d1, q14, q15
 
-	sub		r4, #1
-	vst1.u8	{d1}, [r2], r3
+    sub     r4, #1
+    vst1.u8 {d1}, [r2], r3
 
-	cmp		r4, #0
-	bne		w8_h_mc_luma_loop
-	pop		{r4}
+    cmp     r4, #0
+    bne     w8_h_mc_luma_loop
+    pop     {r4}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN McHorVer20WidthEq4_neon
-	push		{r4, r5, r6}
-	ldr			r6, [sp, #12]
+    push        {r4, r5, r6}
+    ldr         r6, [sp, #12]
 
-	sub			r0, #2
-	vmov.u16	q14, #0x0014				// 20
-	vshr.u16	q15, q14, #2				// 5
+    sub         r0, #2
+    vmov.u16    q14, #0x0014                // 20
+    vshr.u16    q15, q14, #2                // 5
 
 w4_h_mc_luma_loop:
-	vld1.u8	{d0, d1}, [r0], r1	//only use 9(4+5);d0: 1st row src[-2:5]
-	pld			[r0]
-	vld1.u8	{d2, d3}, [r0], r1	//d2: 2nd row src[-2:5]
-	pld			[r0]
+    vld1.u8 {d0, d1}, [r0], r1  //only use 9(4+5);d0: 1st row src[-2:5]
+    pld         [r0]
+    vld1.u8 {d2, d3}, [r0], r1  //d2: 2nd row src[-2:5]
+    pld         [r0]
 
-	vext.8		d4, d0, d1, #1		//d4: 1st row src[-1:6]
-	vext.8		d5, d2, d3, #1		//d5: 2nd row src[-1:6]
-	vext.8		q3, q2, q2, #1		//src[0:6 *]
-	vext.8		q8, q2, q2, #2		//src[1:6 * *]
+    vext.8      d4, d0, d1, #1      //d4: 1st row src[-1:6]
+    vext.8      d5, d2, d3, #1      //d5: 2nd row src[-1:6]
+    vext.8      q3, q2, q2, #1      //src[0:6 *]
+    vext.8      q8, q2, q2, #2      //src[1:6 * *]
 
-	vtrn.32	q3, q8					//q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4]
-	vtrn.32	d6, d7					//d6:[0:3]; d7[1:4]
-	vtrn.32		d0, d2				//d0:[-2:1]; d2[2:5]
-	vtrn.32		d4, d5				//d4:[-1:2]; d5[3:6]
+    vtrn.32 q3, q8                  //q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4]
+    vtrn.32 d6, d7                  //d6:[0:3]; d7[1:4]
+    vtrn.32     d0, d2              //d0:[-2:1]; d2[2:5]
+    vtrn.32     d4, d5              //d4:[-1:2]; d5[3:6]
 
-	FILTER_6TAG_8BITS 	d0, d4, d6, d7, d2, d5, d1, q14, q15
+    FILTER_6TAG_8BITS   d0, d4, d6, d7, d2, d5, d1, q14, q15
 
-	vmov		r4, r5, d1
-	str	r4, [r2], r3
-	str	r5, [r2], r3
+    vmov        r4, r5, d1
+    str r4, [r2], r3
+    str r5, [r2], r3
 
-	sub		r6, #2
-	cmp		r6, #0
-	bne		w4_h_mc_luma_loop
+    sub     r6, #2
+    cmp     r6, #0
+    bne     w4_h_mc_luma_loop
 
-	pop		{r4, r5, r6}
+    pop     {r4, r5, r6}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN McHorVer10WidthEq16_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
+    push        {r4}
+    ldr         r4, [sp, #4]
 
-	sub			r0, #2
-	vmov.u16	q14, #0x0014				// 20
-	vshr.u16	q15, q14, #2				// 5
+    sub         r0, #2
+    vmov.u16    q14, #0x0014                // 20
+    vshr.u16    q15, q14, #2                // 5
 
 w16_xy_10_mc_luma_loop:
-	vld1.u8	{d0,d1,d2}, [r0], r1	//only use 21(16+5); q0=src[-2]
-	pld			[r0]
-	pld			[r0, #16]
+    vld1.u8 {d0,d1,d2}, [r0], r1    //only use 21(16+5); q0=src[-2]
+    pld         [r0]
+    pld         [r0, #16]
 
-	vext.8		q2, q0, q1, #1		//q2=src[-1]
-	vext.8		q3, q0, q1, #2		//q3=src[0]
-	vext.8		q8, q0, q1, #3		//q8=src[1]
-	vext.8		q9, q0, q1, #4		//q9=src[2]
-	vext.8		q10, q0, q1, #5		//q10=src[3]
+    vext.8      q2, q0, q1, #1      //q2=src[-1]
+    vext.8      q3, q0, q1, #2      //q3=src[0]
+    vext.8      q8, q0, q1, #3      //q8=src[1]
+    vext.8      q9, q0, q1, #4      //q9=src[2]
+    vext.8      q10, q0, q1, #5     //q10=src[3]
 
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d0, d4, d6, d16, d18, d20, d2, q14, q15
+    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d0, d4, d6, d16, d18, d20, d2, q14, q15
 
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d1, d5, d7, d17, d19, d21, d3, q14, q15
+    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d1, d5, d7, d17, d19, d21, d3, q14, q15
 
-	sub		r4, #1
-	vst1.u8	{d2, d3}, [r2], r3		//write 16Byte
+    sub     r4, #1
+    vst1.u8 {d2, d3}, [r2], r3      //write 16Byte
 
-	cmp		r4, #0
-	bne		w16_xy_10_mc_luma_loop
-	pop		{r4}
+    cmp     r4, #0
+    bne     w16_xy_10_mc_luma_loop
+    pop     {r4}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN McHorVer10WidthEq8_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
+    push        {r4}
+    ldr         r4, [sp, #4]
 
-	sub			r0, #2
-	vmov.u16	q14, #0x0014				// 20
-	vshr.u16	q15, q14, #2				// 5
+    sub         r0, #2
+    vmov.u16    q14, #0x0014                // 20
+    vshr.u16    q15, q14, #2                // 5
 
 w8_xy_10_mc_luma_loop:
-	vld1.u8	{d0,d1}, [r0], r1	//only use 13(8+5); q0=src[-2]
-	pld			[r0]
+    vld1.u8 {d0,d1}, [r0], r1   //only use 13(8+5); q0=src[-2]
+    pld         [r0]
 
-	vext.8		d2, d0, d1, #1		//d2=src[-1]
-	vext.8		d3, d0, d1, #2		//d3=src[0]
-	vext.8		d4, d0, d1, #3		//d4=src[1]
-	vext.8		d5, d0, d1, #4		//d5=src[2]
-	vext.8		d6, d0, d1, #5		//d6=src[3]
+    vext.8      d2, d0, d1, #1      //d2=src[-1]
+    vext.8      d3, d0, d1, #2      //d3=src[0]
+    vext.8      d4, d0, d1, #3      //d4=src[1]
+    vext.8      d5, d0, d1, #4      //d5=src[2]
+    vext.8      d6, d0, d1, #5      //d6=src[3]
 
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d0, d2, d3, d4, d5, d6, d1, q14, q15
+    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d0, d2, d3, d4, d5, d6, d1, q14, q15
 
-	sub		r4, #1
-	vst1.u8	{d1}, [r2], r3
+    sub     r4, #1
+    vst1.u8 {d1}, [r2], r3
 
-	cmp		r4, #0
-	bne		w8_xy_10_mc_luma_loop
-	pop		{r4}
+    cmp     r4, #0
+    bne     w8_xy_10_mc_luma_loop
+    pop     {r4}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN McHorVer10WidthEq4_neon
-	push		{r4, r5, r6}
-	ldr			r6, [sp, #12]
+    push        {r4, r5, r6}
+    ldr         r6, [sp, #12]
 
-	sub			r0, #2
-	vmov.u16	q14, #0x0014				// 20
-	vshr.u16	q15, q14, #2				// 5
+    sub         r0, #2
+    vmov.u16    q14, #0x0014                // 20
+    vshr.u16    q15, q14, #2                // 5
 
 w4_xy_10_mc_luma_loop:
-	vld1.u8	{d0, d1}, [r0], r1	//only use 9(4+5);d0: 1st row src[-2:5]
-	pld			[r0]
-	vld1.u8	{d2, d3}, [r0], r1	//d2: 2nd row src[-2:5]
-	pld			[r0]
+    vld1.u8 {d0, d1}, [r0], r1  //only use 9(4+5);d0: 1st row src[-2:5]
+    pld         [r0]
+    vld1.u8 {d2, d3}, [r0], r1  //d2: 2nd row src[-2:5]
+    pld         [r0]
 
-	vext.8		d4, d0, d1, #1		//d4: 1st row src[-1:6]
-	vext.8		d5, d2, d3, #1		//d5: 2nd row src[-1:6]
-	vext.8		q3, q2, q2, #1		//src[0:6 *]
-	vext.8		q8, q2, q2, #2		//src[1:6 * *]
+    vext.8      d4, d0, d1, #1      //d4: 1st row src[-1:6]
+    vext.8      d5, d2, d3, #1      //d5: 2nd row src[-1:6]
+    vext.8      q3, q2, q2, #1      //src[0:6 *]
+    vext.8      q8, q2, q2, #2      //src[1:6 * *]
 
-	vtrn.32	q3, q8					//q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4]
-	vtrn.32	d6, d7					//d6:[0:3]; d7[1:4]
-	vtrn.32		d0, d2				//d0:[-2:1]; d2[2:5]
-	vtrn.32		d4, d5				//d4:[-1:2]; d5[3:6]
+    vtrn.32 q3, q8                  //q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4]
+    vtrn.32 d6, d7                  //d6:[0:3]; d7[1:4]
+    vtrn.32     d0, d2              //d0:[-2:1]; d2[2:5]
+    vtrn.32     d4, d5              //d4:[-1:2]; d5[3:6]
 
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d0, d4, d6, d7, d2, d5, d1, q14, q15
+    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d0, d4, d6, d7, d2, d5, d1, q14, q15
 
-	vmov		r4, r5, d1
-	str	r4, [r2], r3
-	str	r5, [r2], r3
+    vmov        r4, r5, d1
+    str r4, [r2], r3
+    str r5, [r2], r3
 
-	sub		r6, #2
-	cmp		r6, #0
-	bne		w4_xy_10_mc_luma_loop
+    sub     r6, #2
+    cmp     r6, #0
+    bne     w4_xy_10_mc_luma_loop
 
-	pop		{r4, r5, r6}
+    pop     {r4, r5, r6}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN McHorVer30WidthEq16_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
+    push        {r4}
+    ldr         r4, [sp, #4]
 
-	sub			r0, #2
-	vmov.u16	q14, #0x0014				// 20
-	vshr.u16	q15, q14, #2				// 5
+    sub         r0, #2
+    vmov.u16    q14, #0x0014                // 20
+    vshr.u16    q15, q14, #2                // 5
 
 w16_xy_30_mc_luma_loop:
-	vld1.u8	{d0,d1,d2}, [r0], r1	//only use 21(16+5); q0=src[-2]
-	pld			[r0]
-	pld			[r0, #16]
+    vld1.u8 {d0,d1,d2}, [r0], r1    //only use 21(16+5); q0=src[-2]
+    pld         [r0]
+    pld         [r0, #16]
 
-	vext.8		q2, q0, q1, #1		//q2=src[-1]
-	vext.8		q3, q0, q1, #2		//q3=src[0]
-	vext.8		q8, q0, q1, #3		//q8=src[1]
-	vext.8		q9, q0, q1, #4		//q9=src[2]
-	vext.8		q10, q0, q1, #5		//q10=src[3]
+    vext.8      q2, q0, q1, #1      //q2=src[-1]
+    vext.8      q3, q0, q1, #2      //q3=src[0]
+    vext.8      q8, q0, q1, #3      //q8=src[1]
+    vext.8      q9, q0, q1, #4      //q9=src[2]
+    vext.8      q10, q0, q1, #5     //q10=src[3]
 
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d0, d4, d6, d16, d18, d20, d2, q14, q15
+    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d0, d4, d6, d16, d18, d20, d2, q14, q15
 
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d1, d5, d7, d17, d19, d21, d3, q14, q15
+    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d1, d5, d7, d17, d19, d21, d3, q14, q15
 
-	sub		r4, #1
-	vst1.u8	{d2, d3}, [r2], r3		//write 16Byte
+    sub     r4, #1
+    vst1.u8 {d2, d3}, [r2], r3      //write 16Byte
 
-	cmp		r4, #0
-	bne		w16_xy_30_mc_luma_loop
-	pop		{r4}
+    cmp     r4, #0
+    bne     w16_xy_30_mc_luma_loop
+    pop     {r4}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN McHorVer30WidthEq8_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
+    push        {r4}
+    ldr         r4, [sp, #4]
 
-	sub			r0, #2
-	vmov.u16	q14, #0x0014				// 20
-	vshr.u16	q15, q14, #2				// 5
+    sub         r0, #2
+    vmov.u16    q14, #0x0014                // 20
+    vshr.u16    q15, q14, #2                // 5
 
 w8_xy_30_mc_luma_loop:
-	vld1.u8	{d0,d1}, [r0], r1	//only use 13(8+5); q0=src[-2]
-	pld			[r0]
+    vld1.u8 {d0,d1}, [r0], r1   //only use 13(8+5); q0=src[-2]
+    pld         [r0]
 
-	vext.8		d2, d0, d1, #1		//d2=src[-1]
-	vext.8		d3, d0, d1, #2		//d3=src[0]
-	vext.8		d4, d0, d1, #3		//d4=src[1]
-	vext.8		d5, d0, d1, #4		//d5=src[2]
-	vext.8		d6, d0, d1, #5		//d6=src[3]
+    vext.8      d2, d0, d1, #1      //d2=src[-1]
+    vext.8      d3, d0, d1, #2      //d3=src[0]
+    vext.8      d4, d0, d1, #3      //d4=src[1]
+    vext.8      d5, d0, d1, #4      //d5=src[2]
+    vext.8      d6, d0, d1, #5      //d6=src[3]
 
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d0, d2, d3, d4, d5, d6, d1, q14, q15
+    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d0, d2, d3, d4, d5, d6, d1, q14, q15
 
-	sub		r4, #1
-	vst1.u8	{d1}, [r2], r3
+    sub     r4, #1
+    vst1.u8 {d1}, [r2], r3
 
-	cmp		r4, #0
-	bne		w8_xy_30_mc_luma_loop
-	pop		{r4}
+    cmp     r4, #0
+    bne     w8_xy_30_mc_luma_loop
+    pop     {r4}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN McHorVer30WidthEq4_neon
-	push		{r4, r5, r6}
-	ldr			r6, [sp, #12]
+    push        {r4, r5, r6}
+    ldr         r6, [sp, #12]
 
-	sub			r0, #2
-	vmov.u16	q14, #0x0014				// 20
-	vshr.u16	q15, q14, #2				// 5
+    sub         r0, #2
+    vmov.u16    q14, #0x0014                // 20
+    vshr.u16    q15, q14, #2                // 5
 
 w4_xy_30_mc_luma_loop:
-	vld1.u8	{d0, d1}, [r0], r1	//only use 9(4+5);d0: 1st row src[-2:5]
-	pld			[r0]
-	vld1.u8	{d2, d3}, [r0], r1	//d2: 2nd row src[-2:5]
-	pld			[r0]
+    vld1.u8 {d0, d1}, [r0], r1  //only use 9(4+5);d0: 1st row src[-2:5]
+    pld         [r0]
+    vld1.u8 {d2, d3}, [r0], r1  //d2: 2nd row src[-2:5]
+    pld         [r0]
 
-	vext.8		d4, d0, d1, #1		//d4: 1st row src[-1:6]
-	vext.8		d5, d2, d3, #1		//d5: 2nd row src[-1:6]
-	vext.8		q3, q2, q2, #1		//src[0:6 *]
-	vext.8		q8, q2, q2, #2		//src[1:6 * *]
+    vext.8      d4, d0, d1, #1      //d4: 1st row src[-1:6]
+    vext.8      d5, d2, d3, #1      //d5: 2nd row src[-1:6]
+    vext.8      q3, q2, q2, #1      //src[0:6 *]
+    vext.8      q8, q2, q2, #2      //src[1:6 * *]
 
-	vtrn.32	q3, q8					//q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4]
-	vtrn.32	d6, d7					//d6:[0:3]; d7[1:4]
-	vtrn.32		d0, d2				//d0:[-2:1]; d2[2:5]
-	vtrn.32		d4, d5				//d4:[-1:2]; d5[3:6]
+    vtrn.32 q3, q8                  //q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4]
+    vtrn.32 d6, d7                  //d6:[0:3]; d7[1:4]
+    vtrn.32     d0, d2              //d0:[-2:1]; d2[2:5]
+    vtrn.32     d4, d5              //d4:[-1:2]; d5[3:6]
 
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d0, d4, d6, d7, d2, d5, d1, q14, q15
+    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d0, d4, d6, d7, d2, d5, d1, q14, q15
 
-	vmov		r4, r5, d1
-	str	r4, [r2], r3
-	str	r5, [r2], r3
+    vmov        r4, r5, d1
+    str r4, [r2], r3
+    str r5, [r2], r3
 
-	sub		r6, #2
-	cmp		r6, #0
-	bne		w4_xy_30_mc_luma_loop
+    sub     r6, #2
+    cmp     r6, #0
+    bne     w4_xy_30_mc_luma_loop
 
-	pop		{r4, r5, r6}
+    pop     {r4, r5, r6}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN McHorVer01WidthEq16_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
+    push        {r4}
+    ldr         r4, [sp, #4]
 
-	sub			r0, r0, r1, lsl #1		//src[-2*src_stride]
-	pld			[r0]
-	pld			[r0, r1]
-	vmov.u16	q14, #0x0014			// 20
-	vld1.u8	{q0}, [r0], r1		//q0=src[-2]
-	vld1.u8	{q1}, [r0], r1		//q1=src[-1]
+    sub         r0, r0, r1, lsl #1      //src[-2*src_stride]
+    pld         [r0]
+    pld         [r0, r1]
+    vmov.u16    q14, #0x0014            // 20
+    vld1.u8 {q0}, [r0], r1      //q0=src[-2]
+    vld1.u8 {q1}, [r0], r1      //q1=src[-1]
 
-	pld			[r0]
-	pld			[r0, r1]
-	vshr.u16	q15, q14, #2			// 5
-	vld1.u8	{q2}, [r0], r1		//q2=src[0]
-	vld1.u8	{q3}, [r0], r1		//q3=src[1]
-	vld1.u8	{q8}, [r0], r1		//q8=src[2]
+    pld         [r0]
+    pld         [r0, r1]
+    vshr.u16    q15, q14, #2            // 5
+    vld1.u8 {q2}, [r0], r1      //q2=src[0]
+    vld1.u8 {q3}, [r0], r1      //q3=src[1]
+    vld1.u8 {q8}, [r0], r1      //q8=src[2]
 
 w16_xy_01_luma_loop:
 
-	vld1.u8	{q9}, [r0], r1		//q9=src[3]
+    vld1.u8 {q9}, [r0], r1      //q9=src[3]
 
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d0, d2, d4, d6, d16, d18, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d1, d3, d5, d7, d17, d19, d21, q14, q15
-	vld1.u8	{q0}, [r0], r1		//read 2nd row
-	vst1.u8	{q10}, [r2], r3			//write 1st 16Byte
+    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d0, d2, d4, d6, d16, d18, d20, q14, q15
+    pld         [r0]
+    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d1, d3, d5, d7, d17, d19, d21, q14, q15
+    vld1.u8 {q0}, [r0], r1      //read 2nd row
+    vst1.u8 {q10}, [r2], r3         //write 1st 16Byte
 
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d2, d4, d6, d16, d18, d0, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d3, d5, d7, d17, d19, d1, d21, q14, q15
-	vld1.u8	{q1}, [r0], r1		//read 3rd row
-	vst1.u8	{q10}, [r2], r3			//write 2nd 16Byte
+    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d2, d4, d6, d16, d18, d0, d20, q14, q15
+    pld         [r0]
+    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d3, d5, d7, d17, d19, d1, d21, q14, q15
+    vld1.u8 {q1}, [r0], r1      //read 3rd row
+    vst1.u8 {q10}, [r2], r3         //write 2nd 16Byte
 
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d4, d6, d16, d18, d0, d2, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d5, d7, d17, d19, d1, d3, d21, q14, q15
-	vld1.u8	{q2}, [r0], r1		//read 4th row
-	vst1.u8	{q10}, [r2], r3			//write 3rd 16Byte
+    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d4, d6, d16, d18, d0, d2, d20, q14, q15
+    pld         [r0]
+    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d5, d7, d17, d19, d1, d3, d21, q14, q15
+    vld1.u8 {q2}, [r0], r1      //read 4th row
+    vst1.u8 {q10}, [r2], r3         //write 3rd 16Byte
 
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d6, d16, d18, d0, d2, d4, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d7, d17, d19, d1, d3, d5, d21, q14, q15
-	vld1.u8	{q3}, [r0], r1		//read 5th row
-	vst1.u8	{q10}, [r2], r3			//write 4th 16Byte
+    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d6, d16, d18, d0, d2, d4, d20, q14, q15
+    pld         [r0]
+    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d7, d17, d19, d1, d3, d5, d21, q14, q15
+    vld1.u8 {q3}, [r0], r1      //read 5th row
+    vst1.u8 {q10}, [r2], r3         //write 4th 16Byte
 
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d16, d18, d0, d2, d4, d6, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d17, d19, d1, d3, d5, d7, d21, q14, q15
-	vld1.u8	{q8}, [r0], r1		//read 6th row
-	vst1.u8	{q10}, [r2], r3			//write 5th 16Byte
+    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d16, d18, d0, d2, d4, d6, d20, q14, q15
+    pld         [r0]
+    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d17, d19, d1, d3, d5, d7, d21, q14, q15
+    vld1.u8 {q8}, [r0], r1      //read 6th row
+    vst1.u8 {q10}, [r2], r3         //write 5th 16Byte
 
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d18, d0, d2, d4, d6, d16, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d19, d1, d3, d5, d7, d17, d21, q14, q15
-	vld1.u8	{q9}, [r0], r1		//read 7th row
-	vst1.u8	{q10}, [r2], r3			//write 6th 16Byte
+    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d18, d0, d2, d4, d6, d16, d20, q14, q15
+    pld         [r0]
+    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d19, d1, d3, d5, d7, d17, d21, q14, q15
+    vld1.u8 {q9}, [r0], r1      //read 7th row
+    vst1.u8 {q10}, [r2], r3         //write 6th 16Byte
 
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d0, d2, d4, d6, d16, d18, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d1, d3, d5, d7, d17, d19, d21, q14, q15
-	vld1.u8	{q0}, [r0], r1		//read 8th row
-	vst1.u8	{q10}, [r2], r3			//write 7th 16Byte
+    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d0, d2, d4, d6, d16, d18, d20, q14, q15
+    pld         [r0]
+    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d1, d3, d5, d7, d17, d19, d21, q14, q15
+    vld1.u8 {q0}, [r0], r1      //read 8th row
+    vst1.u8 {q10}, [r2], r3         //write 7th 16Byte
 
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d2, d4, d6, d16, d18, d0, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d3, d5, d7, d17, d19, d1, d21, q14, q15
-	vst1.u8	{q10}, [r2], r3			//write 8th 16Byte
+    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d2, d4, d6, d16, d18, d0, d20, q14, q15
+    pld         [r0]
+    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d3, d5, d7, d17, d19, d1, d21, q14, q15
+    vst1.u8 {q10}, [r2], r3         //write 8th 16Byte
 
-	//q2, q3, q4, q5, q0 --> q0~q4
-	vswp	q0, q8
-	vswp	q0, q2
-	vmov	q1, q3
-	vmov	q3, q9						//q0~q4
+    //q2, q3, q4, q5, q0 --> q0~q4
+    vswp    q0, q8
+    vswp    q0, q2
+    vmov    q1, q3
+    vmov    q3, q9                      //q0~q4
 
-	sub		r4, #8
-	cmp		r4, #0
-	bne		w16_xy_01_luma_loop
-	pop		{r4}
+    sub     r4, #8
+    cmp     r4, #0
+    bne     w16_xy_01_luma_loop
+    pop     {r4}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN McHorVer01WidthEq8_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
+    push        {r4}
+    ldr         r4, [sp, #4]
 
-	sub			r0, r0, r1, lsl #1		//src[-2*src_stride]
-	pld			[r0]
-	pld			[r0, r1]
-	vmov.u16	q14, #0x0014			// 20
-	vld1.u8	{d0}, [r0], r1		//d0=src[-2]
-	vld1.u8	{d1}, [r0], r1		//d1=src[-1]
+    sub         r0, r0, r1, lsl #1      //src[-2*src_stride]
+    pld         [r0]
+    pld         [r0, r1]
+    vmov.u16    q14, #0x0014            // 20
+    vld1.u8 {d0}, [r0], r1      //d0=src[-2]
+    vld1.u8 {d1}, [r0], r1      //d1=src[-1]
 
-	pld			[r0]
-	pld			[r0, r1]
-	vshr.u16	q15, q14, #2			// 5
-	vld1.u8	{d2}, [r0], r1		//d2=src[0]
-	vld1.u8	{d3}, [r0], r1		//d3=src[1]
+    pld         [r0]
+    pld         [r0, r1]
+    vshr.u16    q15, q14, #2            // 5
+    vld1.u8 {d2}, [r0], r1      //d2=src[0]
+    vld1.u8 {d3}, [r0], r1      //d3=src[1]
 
-	vld1.u8	{d4}, [r0], r1		//d4=src[2]
-	vld1.u8	{d5}, [r0], r1		//d5=src[3]
+    vld1.u8 {d4}, [r0], r1      //d4=src[2]
+    vld1.u8 {d5}, [r0], r1      //d5=src[3]
 
 w8_xy_01_mc_luma_loop:
 
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d0, d1, d2, d3, d4, d5, d16, q14, q15
-	vld1.u8	{d0}, [r0], r1		//read 2nd row
-	vst1.u8	{d16}, [r2], r3		//write 1st 8Byte
+    pld         [r0]
+    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d0, d1, d2, d3, d4, d5, d16, q14, q15
+    vld1.u8 {d0}, [r0], r1      //read 2nd row
+    vst1.u8 {d16}, [r2], r3     //write 1st 8Byte
 
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d1, d2, d3, d4, d5, d0, d16, q14, q15
-	vld1.u8	{d1}, [r0], r1		//read 3rd row
-	vst1.u8	{d16}, [r2], r3		//write 2nd 8Byte
+    pld         [r0]
+    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d1, d2, d3, d4, d5, d0, d16, q14, q15
+    vld1.u8 {d1}, [r0], r1      //read 3rd row
+    vst1.u8 {d16}, [r2], r3     //write 2nd 8Byte
 
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d2, d3, d4, d5, d0, d1, d16, q14, q15
-	vld1.u8	{d2}, [r0], r1		//read 4th row
-	vst1.u8	{d16}, [r2], r3		//write 3rd 8Byte
+    pld         [r0]
+    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d2, d3, d4, d5, d0, d1, d16, q14, q15
+    vld1.u8 {d2}, [r0], r1      //read 4th row
+    vst1.u8 {d16}, [r2], r3     //write 3rd 8Byte
 
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d3, d4, d5, d0, d1, d2, d16, q14, q15
-	vld1.u8	{d3}, [r0], r1		//read 5th row
-	vst1.u8	{d16}, [r2], r3		//write 4th 8Byte
+    pld         [r0]
+    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d3, d4, d5, d0, d1, d2, d16, q14, q15
+    vld1.u8 {d3}, [r0], r1      //read 5th row
+    vst1.u8 {d16}, [r2], r3     //write 4th 8Byte
 
-	//d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
-	vswp	q0, q2
-	vswp	q1, q2
+    //d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
+    vswp    q0, q2
+    vswp    q1, q2
 
-	sub		r4, #4
-	cmp		r4, #0
-	bne		w8_xy_01_mc_luma_loop
+    sub     r4, #4
+    cmp     r4, #0
+    bne     w8_xy_01_mc_luma_loop
 
-	pop		{r4}
+    pop     {r4}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN McHorVer01WidthEq4_neon
-	push		{r4, r5, r6, r7}
-	sub			r0, r0, r1, lsl #1		//src[-2*src_stride]
-	pld			[r0]
-	pld			[r0, r1]
-	vmov.u16	q14, #0x0014			// 20
-	ldr		r4, [r0], r1		//r4=src[-2]
-	ldr		r5, [r0], r1		//r5=src[-1]
+    push        {r4, r5, r6, r7}
+    sub         r0, r0, r1, lsl #1      //src[-2*src_stride]
+    pld         [r0]
+    pld         [r0, r1]
+    vmov.u16    q14, #0x0014            // 20
+    ldr     r4, [r0], r1        //r4=src[-2]
+    ldr     r5, [r0], r1        //r5=src[-1]
 
-	pld			[r0]
-	pld			[r0, r1]
-	vshr.u16	q15, q14, #2			// 5
-	ldr		r6, [r0], r1		//r6=src[0]
-	ldr		r7, [r0], r1		//r7=src[1]
+    pld         [r0]
+    pld         [r0, r1]
+    vshr.u16    q15, q14, #2            // 5
+    ldr     r6, [r0], r1        //r6=src[0]
+    ldr     r7, [r0], r1        //r7=src[1]
 
-	vmov		d0, r4, r5
-	vmov		d1, r5, r6
-	vmov		d2, r6, r7
+    vmov        d0, r4, r5
+    vmov        d1, r5, r6
+    vmov        d2, r6, r7
 
-	ldr		r4, [r0], r1		//r4=src[2]
-	vmov		d3, r7, r4
-	ldr			r7, [sp, #16]
+    ldr     r4, [r0], r1        //r4=src[2]
+    vmov        d3, r7, r4
+    ldr         r7, [sp, #16]
 
 w4_xy_01_mc_luma_loop:
 
-//	pld			[r0]
-	//using reserving r4
-	ldr		r5, [r0], r1		//r5=src[3]
-	ldr		r6, [r0], r1		//r6=src[0]
-	vmov		d4, r4, r5
-	vmov		d5, r5, r6			//reserved r6
+//  pld         [r0]
+    //using reserving r4
+    ldr     r5, [r0], r1        //r5=src[3]
+    ldr     r6, [r0], r1        //r6=src[0]
+    vmov        d4, r4, r5
+    vmov        d5, r5, r6          //reserved r6
 
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d0, d1, d2, d3, d4, d5, d16, q14, q15
-	vmov		r4, r5, d16
-	str	r4, [r2], r3			//write 1st 4Byte
-	str	r5, [r2], r3			//write 2nd 4Byte
+    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d0, d1, d2, d3, d4, d5, d16, q14, q15
+    vmov        r4, r5, d16
+    str r4, [r2], r3            //write 1st 4Byte
+    str r5, [r2], r3            //write 2nd 4Byte
 
-	ldr		r5, [r0], r1		//r5=src[1]
-	ldr		r4, [r0], r1		//r4=src[2]
-	vmov		d0, r6, r5
-	vmov		d1, r5, r4			//reserved r4
+    ldr     r5, [r0], r1        //r5=src[1]
+    ldr     r4, [r0], r1        //r4=src[2]
+    vmov        d0, r6, r5
+    vmov        d1, r5, r4          //reserved r4
 
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d2, d3, d4, d5, d0, d1, d16, q14, q15
-	vmov		r5, r6, d16
-	str	r5, [r2], r3			//write 3rd 4Byte
-	str	r6, [r2], r3			//write 4th 4Byte
+    FILTER_6TAG_8BITS_AVERAGE_WITH_0    d2, d3, d4, d5, d0, d1, d16, q14, q15
+    vmov        r5, r6, d16
+    str r5, [r2], r3            //write 3rd 4Byte
+    str r6, [r2], r3            //write 4th 4Byte
 
-	//d4, d5, d0, d1 --> d0, d1, d2, d3
-	vmov	q1, q0
-	vmov	q0, q2
+    //d4, d5, d0, d1 --> d0, d1, d2, d3
+    vmov    q1, q0
+    vmov    q0, q2
 
-	sub		r7, #4
-	cmp		r7, #0
-	bne		w4_xy_01_mc_luma_loop
+    sub     r7, #4
+    cmp     r7, #0
+    bne     w4_xy_01_mc_luma_loop
 
-	pop		{r4, r5, r6, r7}
+    pop     {r4, r5, r6, r7}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN McHorVer03WidthEq16_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
+    push        {r4}
+    ldr         r4, [sp, #4]
 
-	sub			r0, r0, r1, lsl #1		//src[-2*src_stride]
-	pld			[r0]
-	pld			[r0, r1]
-	vmov.u16	q14, #0x0014			// 20
-	vld1.u8	{q0}, [r0], r1		//q0=src[-2]
-	vld1.u8	{q1}, [r0], r1		//q1=src[-1]
+    sub         r0, r0, r1, lsl #1      //src[-2*src_stride]
+    pld         [r0]
+    pld         [r0, r1]
+    vmov.u16    q14, #0x0014            // 20
+    vld1.u8 {q0}, [r0], r1      //q0=src[-2]
+    vld1.u8 {q1}, [r0], r1      //q1=src[-1]
 
-	pld			[r0]
-	pld			[r0, r1]
-	vshr.u16	q15, q14, #2			// 5
-	vld1.u8	{q2}, [r0], r1		//q2=src[0]
-	vld1.u8	{q3}, [r0], r1		//q3=src[1]
-	vld1.u8	{q8}, [r0], r1		//q8=src[2]
+    pld         [r0]
+    pld         [r0, r1]
+    vshr.u16    q15, q14, #2            // 5
+    vld1.u8 {q2}, [r0], r1      //q2=src[0]
+    vld1.u8 {q3}, [r0], r1      //q3=src[1]
+    vld1.u8 {q8}, [r0], r1      //q8=src[2]
 
 w16_xy_03_luma_loop:
 
-	vld1.u8	{q9}, [r0], r1		//q9=src[3]
+    vld1.u8 {q9}, [r0], r1      //q9=src[3]
 
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d0, d2, d4, d6, d16, d18, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d1, d3, d5, d7, d17, d19, d21, q14, q15
-	vld1.u8	{q0}, [r0], r1		//read 2nd row
-	vst1.u8	{q10}, [r2], r3			//write 1st 16Byte
+    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d0, d2, d4, d6, d16, d18, d20, q14, q15
+    pld         [r0]
+    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d1, d3, d5, d7, d17, d19, d21, q14, q15
+    vld1.u8 {q0}, [r0], r1      //read 2nd row
+    vst1.u8 {q10}, [r2], r3         //write 1st 16Byte
 
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d2, d4, d6, d16, d18, d0, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d3, d5, d7, d17, d19, d1, d21, q14, q15
-	vld1.u8	{q1}, [r0], r1		//read 3rd row
-	vst1.u8	{q10}, [r2], r3			//write 2nd 16Byte
+    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d2, d4, d6, d16, d18, d0, d20, q14, q15
+    pld         [r0]
+    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d3, d5, d7, d17, d19, d1, d21, q14, q15
+    vld1.u8 {q1}, [r0], r1      //read 3rd row
+    vst1.u8 {q10}, [r2], r3         //write 2nd 16Byte
 
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d4, d6, d16, d18, d0, d2, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d5, d7, d17, d19, d1, d3, d21, q14, q15
-	vld1.u8	{q2}, [r0], r1		//read 4th row
-	vst1.u8	{q10}, [r2], r3			//write 3rd 16Byte
+    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d4, d6, d16, d18, d0, d2, d20, q14, q15
+    pld         [r0]
+    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d5, d7, d17, d19, d1, d3, d21, q14, q15
+    vld1.u8 {q2}, [r0], r1      //read 4th row
+    vst1.u8 {q10}, [r2], r3         //write 3rd 16Byte
 
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d6, d16, d18, d0, d2, d4, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d7, d17, d19, d1, d3, d5, d21, q14, q15
-	vld1.u8	{q3}, [r0], r1		//read 5th row
-	vst1.u8	{q10}, [r2], r3			//write 4th 16Byte
+    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d6, d16, d18, d0, d2, d4, d20, q14, q15
+    pld         [r0]
+    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d7, d17, d19, d1, d3, d5, d21, q14, q15
+    vld1.u8 {q3}, [r0], r1      //read 5th row
+    vst1.u8 {q10}, [r2], r3         //write 4th 16Byte
 
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d16, d18, d0, d2, d4, d6, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d17, d19, d1, d3, d5, d7, d21, q14, q15
-	vld1.u8	{q8}, [r0], r1		//read 6th row
-	vst1.u8	{q10}, [r2], r3			//write 5th 16Byte
+    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d16, d18, d0, d2, d4, d6, d20, q14, q15
+    pld         [r0]
+    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d17, d19, d1, d3, d5, d7, d21, q14, q15
+    vld1.u8 {q8}, [r0], r1      //read 6th row
+    vst1.u8 {q10}, [r2], r3         //write 5th 16Byte
 
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d18, d0, d2, d4, d6, d16, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d19, d1, d3, d5, d7, d17, d21, q14, q15
-	vld1.u8	{q9}, [r0], r1		//read 7th row
-	vst1.u8	{q10}, [r2], r3			//write 6th 16Byte
+    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d18, d0, d2, d4, d6, d16, d20, q14, q15
+    pld         [r0]
+    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d19, d1, d3, d5, d7, d17, d21, q14, q15
+    vld1.u8 {q9}, [r0], r1      //read 7th row
+    vst1.u8 {q10}, [r2], r3         //write 6th 16Byte
 
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d0, d2, d4, d6, d16, d18, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d1, d3, d5, d7, d17, d19, d21, q14, q15
-	vld1.u8	{q0}, [r0], r1		//read 8th row
-	vst1.u8	{q10}, [r2], r3			//write 7th 16Byte
+    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d0, d2, d4, d6, d16, d18, d20, q14, q15
+    pld         [r0]
+    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d1, d3, d5, d7, d17, d19, d21, q14, q15
+    vld1.u8 {q0}, [r0], r1      //read 8th row
+    vst1.u8 {q10}, [r2], r3         //write 7th 16Byte
 
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d2, d4, d6, d16, d18, d0, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d3, d5, d7, d17, d19, d1, d21, q14, q15
-	vst1.u8	{q10}, [r2], r3			//write 8th 16Byte
+    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d2, d4, d6, d16, d18, d0, d20, q14, q15
+    pld         [r0]
+    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d3, d5, d7, d17, d19, d1, d21, q14, q15
+    vst1.u8 {q10}, [r2], r3         //write 8th 16Byte
 
-	//q2, q3, q8, q9, q0 --> q0~q8
-	vswp	q0, q8
-	vswp	q0, q2
-	vmov	q1, q3
-	vmov	q3, q9						//q0~q8
+    //q2, q3, q8, q9, q0 --> q0~q8
+    vswp    q0, q8
+    vswp    q0, q2
+    vmov    q1, q3
+    vmov    q3, q9                      //q0~q8
 
-	sub		r4, #8
-	cmp		r4, #0
-	bne		w16_xy_03_luma_loop
-	pop		{r4}
+    sub     r4, #8
+    cmp     r4, #0
+    bne     w16_xy_03_luma_loop
+    pop     {r4}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN McHorVer03WidthEq8_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
+    push        {r4}
+    ldr         r4, [sp, #4]
 
-	sub			r0, r0, r1, lsl #1		//src[-2*src_stride]
-	pld			[r0]
-	pld			[r0, r1]
-	vmov.u16	q14, #0x0014			// 20
-	vld1.u8	{d0}, [r0], r1		//d0=src[-2]
-	vld1.u8	{d1}, [r0], r1		//d1=src[-1]
+    sub         r0, r0, r1, lsl #1      //src[-2*src_stride]
+    pld         [r0]
+    pld         [r0, r1]
+    vmov.u16    q14, #0x0014            // 20
+    vld1.u8 {d0}, [r0], r1      //d0=src[-2]
+    vld1.u8 {d1}, [r0], r1      //d1=src[-1]
 
-	pld			[r0]
-	pld			[r0, r1]
-	vshr.u16	q15, q14, #2			// 5
-	vld1.u8	{d2}, [r0], r1		//d2=src[0]
-	vld1.u8	{d3}, [r0], r1		//d3=src[1]
+    pld         [r0]
+    pld         [r0, r1]
+    vshr.u16    q15, q14, #2            // 5
+    vld1.u8 {d2}, [r0], r1      //d2=src[0]
+    vld1.u8 {d3}, [r0], r1      //d3=src[1]
 
-	vld1.u8	{d4}, [r0], r1		//d4=src[2]
-	vld1.u8	{d5}, [r0], r1		//d5=src[3]
+    vld1.u8 {d4}, [r0], r1      //d4=src[2]
+    vld1.u8 {d5}, [r0], r1      //d5=src[3]
 
 w8_xy_03_mc_luma_loop:
 
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d0, d1, d2, d3, d4, d5, d16, q14, q15
-	vld1.u8	{d0}, [r0], r1		//read 2nd row
-	vst1.u8	{d16}, [r2], r3		//write 1st 8Byte
+    pld         [r0]
+    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d0, d1, d2, d3, d4, d5, d16, q14, q15
+    vld1.u8 {d0}, [r0], r1      //read 2nd row
+    vst1.u8 {d16}, [r2], r3     //write 1st 8Byte
 
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d1, d2, d3, d4, d5, d0, d16, q14, q15
-	vld1.u8	{d1}, [r0], r1		//read 3rd row
-	vst1.u8	{d16}, [r2], r3		//write 2nd 8Byte
+    pld         [r0]
+    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d1, d2, d3, d4, d5, d0, d16, q14, q15
+    vld1.u8 {d1}, [r0], r1      //read 3rd row
+    vst1.u8 {d16}, [r2], r3     //write 2nd 8Byte
 
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d2, d3, d4, d5, d0, d1, d16, q14, q15
-	vld1.u8	{d2}, [r0], r1		//read 4th row
-	vst1.u8	{d16}, [r2], r3		//write 3rd 8Byte
+    pld         [r0]
+    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d2, d3, d4, d5, d0, d1, d16, q14, q15
+    vld1.u8 {d2}, [r0], r1      //read 4th row
+    vst1.u8 {d16}, [r2], r3     //write 3rd 8Byte
 
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d3, d4, d5, d0, d1, d2, d16, q14, q15
-	vld1.u8	{d3}, [r0], r1		//read 5th row
-	vst1.u8	{d16}, [r2], r3		//write 4th 8Byte
+    pld         [r0]
+    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d3, d4, d5, d0, d1, d2, d16, q14, q15
+    vld1.u8 {d3}, [r0], r1      //read 5th row
+    vst1.u8 {d16}, [r2], r3     //write 4th 8Byte
 
-	//d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
-	vswp	q0, q2
-	vswp	q1, q2
+    //d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
+    vswp    q0, q2
+    vswp    q1, q2
 
-	sub		r4, #4
-	cmp		r4, #0
-	bne		w8_xy_03_mc_luma_loop
+    sub     r4, #4
+    cmp     r4, #0
+    bne     w8_xy_03_mc_luma_loop
 
-	pop		{r4}
+    pop     {r4}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN McHorVer03WidthEq4_neon
-	push		{r4, r5, r6, r7}
-	sub			r0, r0, r1, lsl #1		//src[-2*src_stride]
-	pld			[r0]
-	pld			[r0, r1]
-	vmov.u16	q14, #0x0014			// 20
-	ldr		r4, [r0], r1		//r4=src[-2]
-	ldr		r5, [r0], r1		//r5=src[-1]
+    push        {r4, r5, r6, r7}
+    sub         r0, r0, r1, lsl #1      //src[-2*src_stride]
+    pld         [r0]
+    pld         [r0, r1]
+    vmov.u16    q14, #0x0014            // 20
+    ldr     r4, [r0], r1        //r4=src[-2]
+    ldr     r5, [r0], r1        //r5=src[-1]
 
-	pld			[r0]
-	pld			[r0, r1]
-	vshr.u16	q15, q14, #2			// 5
-	ldr		r6, [r0], r1		//r6=src[0]
-	ldr		r7, [r0], r1		//r7=src[1]
+    pld         [r0]
+    pld         [r0, r1]
+    vshr.u16    q15, q14, #2            // 5
+    ldr     r6, [r0], r1        //r6=src[0]
+    ldr     r7, [r0], r1        //r7=src[1]
 
-	vmov		d0, r4, r5
-	vmov		d1, r5, r6
-	vmov		d2, r6, r7
+    vmov        d0, r4, r5
+    vmov        d1, r5, r6
+    vmov        d2, r6, r7
 
-	ldr		r4, [r0], r1		//r4=src[2]
-	vmov		d3, r7, r4
-	ldr			r7, [sp, #16]
+    ldr     r4, [r0], r1        //r4=src[2]
+    vmov        d3, r7, r4
+    ldr         r7, [sp, #16]
 
 w4_xy_03_mc_luma_loop:
 
-//	pld			[r0]
-	//using reserving r4
-	ldr		r5, [r0], r1		//r5=src[3]
-	ldr		r6, [r0], r1		//r6=src[0]
-	vmov		d4, r4, r5
-	vmov		d5, r5, r6			//reserved r6
+//  pld         [r0]
+    //using reserving r4
+    ldr     r5, [r0], r1        //r5=src[3]
+    ldr     r6, [r0], r1        //r6=src[0]
+    vmov        d4, r4, r5
+    vmov        d5, r5, r6          //reserved r6
 
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d0, d1, d2, d3, d4, d5, d16, q14, q15
-	vmov		r4, r5, d16
-	str	r4, [r2], r3			//write 1st 4Byte
-	str	r5, [r2], r3			//write 2nd 4Byte
+    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d0, d1, d2, d3, d4, d5, d16, q14, q15
+    vmov        r4, r5, d16
+    str r4, [r2], r3            //write 1st 4Byte
+    str r5, [r2], r3            //write 2nd 4Byte
 
-	ldr		r5, [r0], r1		//r5=src[1]
-	ldr		r4, [r0], r1		//r4=src[2]
-	vmov		d0, r6, r5
-	vmov		d1, r5, r4			//reserved r4
+    ldr     r5, [r0], r1        //r5=src[1]
+    ldr     r4, [r0], r1        //r4=src[2]
+    vmov        d0, r6, r5
+    vmov        d1, r5, r4          //reserved r4
 
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d2, d3, d4, d5, d0, d1, d16, q14, q15
-	vmov		r5, r6, d16
-	str	r5, [r2], r3			//write 3rd 4Byte
-	str	r6, [r2], r3			//write 4th 4Byte
+    FILTER_6TAG_8BITS_AVERAGE_WITH_1    d2, d3, d4, d5, d0, d1, d16, q14, q15
+    vmov        r5, r6, d16
+    str r5, [r2], r3            //write 3rd 4Byte
+    str r6, [r2], r3            //write 4th 4Byte
 
-	//d4, d5, d0, d1 --> d0, d1, d2, d3
-	vmov	q1, q0
-	vmov	q0, q2
+    //d4, d5, d0, d1 --> d0, d1, d2, d3
+    vmov    q1, q0
+    vmov    q0, q2
 
-	sub		r7, #4
-	cmp		r7, #0
-	bne		w4_xy_03_mc_luma_loop
+    sub     r7, #4
+    cmp     r7, #0
+    bne     w4_xy_03_mc_luma_loop
 
-	pop		{r4, r5, r6, r7}
+    pop     {r4, r5, r6, r7}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN McHorVer02WidthEq16_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
+    push        {r4}
+    ldr         r4, [sp, #4]
 
-	sub			r0, r0, r1, lsl #1		//src[-2*src_stride]
-	pld			[r0]
-	pld			[r0, r1]
-	vmov.u16	q14, #0x0014			// 20
-	vld1.u8	{q0}, [r0], r1		//q0=src[-2]
-	vld1.u8	{q1}, [r0], r1		//q1=src[-1]
+    sub         r0, r0, r1, lsl #1      //src[-2*src_stride]
+    pld         [r0]
+    pld         [r0, r1]
+    vmov.u16    q14, #0x0014            // 20
+    vld1.u8 {q0}, [r0], r1      //q0=src[-2]
+    vld1.u8 {q1}, [r0], r1      //q1=src[-1]
 
-	pld			[r0]
-	pld			[r0, r1]
-	vshr.u16	q15, q14, #2			// 5
-	vld1.u8	{q2}, [r0], r1		//q2=src[0]
-	vld1.u8	{q3}, [r0], r1		//q3=src[1]
-	vld1.u8	{q8}, [r0], r1		//q8=src[2]
+    pld         [r0]
+    pld         [r0, r1]
+    vshr.u16    q15, q14, #2            // 5
+    vld1.u8 {q2}, [r0], r1      //q2=src[0]
+    vld1.u8 {q3}, [r0], r1      //q3=src[1]
+    vld1.u8 {q8}, [r0], r1      //q8=src[2]
 
 w16_v_mc_luma_loop:
 
-	vld1.u8	{q9}, [r0], r1		//q9=src[3]
+    vld1.u8 {q9}, [r0], r1      //q9=src[3]
 
-	FILTER_6TAG_8BITS 	d0, d2, d4, d6, d16, d18, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d1, d3, d5, d7, d17, d19, d21, q14, q15
-	vld1.u8	{q0}, [r0], r1		//read 2nd row
-	vst1.u8	{q10}, [r2], r3			//write 1st 16Byte
+    FILTER_6TAG_8BITS   d0, d2, d4, d6, d16, d18, d20, q14, q15
+    pld         [r0]
+    FILTER_6TAG_8BITS   d1, d3, d5, d7, d17, d19, d21, q14, q15
+    vld1.u8 {q0}, [r0], r1      //read 2nd row
+    vst1.u8 {q10}, [r2], r3         //write 1st 16Byte
 
-	FILTER_6TAG_8BITS 	d2, d4, d6, d16, d18, d0, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d3, d5, d7, d17, d19, d1, d21, q14, q15
-	vld1.u8	{q1}, [r0], r1		//read 3rd row
-	vst1.u8	{q10}, [r2], r3			//write 2nd 16Byte
+    FILTER_6TAG_8BITS   d2, d4, d6, d16, d18, d0, d20, q14, q15
+    pld         [r0]
+    FILTER_6TAG_8BITS   d3, d5, d7, d17, d19, d1, d21, q14, q15
+    vld1.u8 {q1}, [r0], r1      //read 3rd row
+    vst1.u8 {q10}, [r2], r3         //write 2nd 16Byte
 
-	FILTER_6TAG_8BITS 	d4, d6, d16, d18, d0, d2, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d5, d7, d17, d19, d1, d3, d21, q14, q15
-	vld1.u8	{q2}, [r0], r1		//read 4th row
-	vst1.u8	{q10}, [r2], r3			//write 3rd 16Byte
+    FILTER_6TAG_8BITS   d4, d6, d16, d18, d0, d2, d20, q14, q15
+    pld         [r0]
+    FILTER_6TAG_8BITS   d5, d7, d17, d19, d1, d3, d21, q14, q15
+    vld1.u8 {q2}, [r0], r1      //read 4th row
+    vst1.u8 {q10}, [r2], r3         //write 3rd 16Byte
 
-	FILTER_6TAG_8BITS 	d6, d16, d18, d0, d2, d4, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d7, d17, d19, d1, d3, d5, d21, q14, q15
-	vld1.u8	{q3}, [r0], r1		//read 5th row
-	vst1.u8	{q10}, [r2], r3			//write 4th 16Byte
+    FILTER_6TAG_8BITS   d6, d16, d18, d0, d2, d4, d20, q14, q15
+    pld         [r0]
+    FILTER_6TAG_8BITS   d7, d17, d19, d1, d3, d5, d21, q14, q15
+    vld1.u8 {q3}, [r0], r1      //read 5th row
+    vst1.u8 {q10}, [r2], r3         //write 4th 16Byte
 
-	FILTER_6TAG_8BITS 	d16, d18, d0, d2, d4, d6, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d17, d19, d1, d3, d5, d7, d21, q14, q15
-	vld1.u8	{q8}, [r0], r1		//read 6th row
-	vst1.u8	{q10}, [r2], r3			//write 5th 16Byte
+    FILTER_6TAG_8BITS   d16, d18, d0, d2, d4, d6, d20, q14, q15
+    pld         [r0]
+    FILTER_6TAG_8BITS   d17, d19, d1, d3, d5, d7, d21, q14, q15
+    vld1.u8 {q8}, [r0], r1      //read 6th row
+    vst1.u8 {q10}, [r2], r3         //write 5th 16Byte
 
-	FILTER_6TAG_8BITS 	d18, d0, d2, d4, d6, d16, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d19, d1, d3, d5, d7, d17, d21, q14, q15
-	vld1.u8	{q9}, [r0], r1		//read 7th row
-	vst1.u8	{q10}, [r2], r3			//write 6th 16Byte
+    FILTER_6TAG_8BITS   d18, d0, d2, d4, d6, d16, d20, q14, q15
+    pld         [r0]
+    FILTER_6TAG_8BITS   d19, d1, d3, d5, d7, d17, d21, q14, q15
+    vld1.u8 {q9}, [r0], r1      //read 7th row
+    vst1.u8 {q10}, [r2], r3         //write 6th 16Byte
 
-	FILTER_6TAG_8BITS 	d0, d2, d4, d6, d16, d18, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d1, d3, d5, d7, d17, d19, d21, q14, q15
-	vld1.u8	{q0}, [r0], r1		//read 8th row
-	vst1.u8	{q10}, [r2], r3			//write 7th 16Byte
+    FILTER_6TAG_8BITS   d0, d2, d4, d6, d16, d18, d20, q14, q15
+    pld         [r0]
+    FILTER_6TAG_8BITS   d1, d3, d5, d7, d17, d19, d21, q14, q15
+    vld1.u8 {q0}, [r0], r1      //read 8th row
+    vst1.u8 {q10}, [r2], r3         //write 7th 16Byte
 
-	FILTER_6TAG_8BITS 	d2, d4, d6, d16, d18, d0, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d3, d5, d7, d17, d19, d1, d21, q14, q15
-	vst1.u8	{q10}, [r2], r3			//write 8th 16Byte
+    FILTER_6TAG_8BITS   d2, d4, d6, d16, d18, d0, d20, q14, q15
+    pld         [r0]
+    FILTER_6TAG_8BITS   d3, d5, d7, d17, d19, d1, d21, q14, q15
+    vst1.u8 {q10}, [r2], r3         //write 8th 16Byte
 
-	//q2, q3, q8, q9, q0 --> q0~q8
-	vswp	q0, q8
-	vswp	q0, q2
-	vmov	q1, q3
-	vmov	q3, q9						//q0~q8
+    //q2, q3, q8, q9, q0 --> q0~q8
+    vswp    q0, q8
+    vswp    q0, q2
+    vmov    q1, q3
+    vmov    q3, q9                      //q0~q8
 
-	sub		r4, #8
-	cmp		r4, #0
-	bne		w16_v_mc_luma_loop
-	pop		{r4}
+    sub     r4, #8
+    cmp     r4, #0
+    bne     w16_v_mc_luma_loop
+    pop     {r4}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN McHorVer02WidthEq8_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
+    push        {r4}
+    ldr         r4, [sp, #4]
 
-	sub			r0, r0, r1, lsl #1		//src[-2*src_stride]
-	pld			[r0]
-	pld			[r0, r1]
-	vmov.u16	q14, #0x0014			// 20
-	vld1.u8	{d0}, [r0], r1		//d0=src[-2]
-	vld1.u8	{d1}, [r0], r1		//d1=src[-1]
+    sub         r0, r0, r1, lsl #1      //src[-2*src_stride]
+    pld         [r0]
+    pld         [r0, r1]
+    vmov.u16    q14, #0x0014            // 20
+    vld1.u8 {d0}, [r0], r1      //d0=src[-2]
+    vld1.u8 {d1}, [r0], r1      //d1=src[-1]
 
-	pld			[r0]
-	pld			[r0, r1]
-	vshr.u16	q15, q14, #2			// 5
-	vld1.u8	{d2}, [r0], r1		//d2=src[0]
-	vld1.u8	{d3}, [r0], r1		//d3=src[1]
+    pld         [r0]
+    pld         [r0, r1]
+    vshr.u16    q15, q14, #2            // 5
+    vld1.u8 {d2}, [r0], r1      //d2=src[0]
+    vld1.u8 {d3}, [r0], r1      //d3=src[1]
 
-	vld1.u8	{d4}, [r0], r1		//d4=src[2]
-	vld1.u8	{d5}, [r0], r1		//d5=src[3]
+    vld1.u8 {d4}, [r0], r1      //d4=src[2]
+    vld1.u8 {d5}, [r0], r1      //d5=src[3]
 
 w8_v_mc_luma_loop:
 
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d0, d1, d2, d3, d4, d5, d16, q14, q15
-	vld1.u8	{d0}, [r0], r1		//read 2nd row
-	vst1.u8	{d16}, [r2], r3		//write 1st 8Byte
+    pld         [r0]
+    FILTER_6TAG_8BITS   d0, d1, d2, d3, d4, d5, d16, q14, q15
+    vld1.u8 {d0}, [r0], r1      //read 2nd row
+    vst1.u8 {d16}, [r2], r3     //write 1st 8Byte
 
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d1, d2, d3, d4, d5, d0, d16, q14, q15
-	vld1.u8	{d1}, [r0], r1		//read 3rd row
-	vst1.u8	{d16}, [r2], r3		//write 2nd 8Byte
+    pld         [r0]
+    FILTER_6TAG_8BITS   d1, d2, d3, d4, d5, d0, d16, q14, q15
+    vld1.u8 {d1}, [r0], r1      //read 3rd row
+    vst1.u8 {d16}, [r2], r3     //write 2nd 8Byte
 
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d2, d3, d4, d5, d0, d1, d16, q14, q15
-	vld1.u8	{d2}, [r0], r1		//read 4th row
-	vst1.u8	{d16}, [r2], r3		//write 3rd 8Byte
+    pld         [r0]
+    FILTER_6TAG_8BITS   d2, d3, d4, d5, d0, d1, d16, q14, q15
+    vld1.u8 {d2}, [r0], r1      //read 4th row
+    vst1.u8 {d16}, [r2], r3     //write 3rd 8Byte
 
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d3, d4, d5, d0, d1, d2, d16, q14, q15
-	vld1.u8	{d3}, [r0], r1		//read 5th row
-	vst1.u8	{d16}, [r2], r3		//write 4th 8Byte
+    pld         [r0]
+    FILTER_6TAG_8BITS   d3, d4, d5, d0, d1, d2, d16, q14, q15
+    vld1.u8 {d3}, [r0], r1      //read 5th row
+    vst1.u8 {d16}, [r2], r3     //write 4th 8Byte
 
-	//d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
-	vswp	q0, q2
-	vswp	q1, q2
+    //d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
+    vswp    q0, q2
+    vswp    q1, q2
 
-	sub		r4, #4
-	cmp		r4, #0
-	bne		w8_v_mc_luma_loop
+    sub     r4, #4
+    cmp     r4, #0
+    bne     w8_v_mc_luma_loop
 
-	pop		{r4}
+    pop     {r4}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN McHorVer02WidthEq4_neon
-	push		{r4, r5, r6, r7}
-	sub			r0, r0, r1, lsl #1		//src[-2*src_stride]
-	pld			[r0]
-	pld			[r0, r1]
-	vmov.u16	q14, #0x0014			// 20
-	ldr		r4, [r0], r1		//r4=src[-2]
-	ldr		r5, [r0], r1		//r5=src[-1]
+    push        {r4, r5, r6, r7}
+    sub         r0, r0, r1, lsl #1      //src[-2*src_stride]
+    pld         [r0]
+    pld         [r0, r1]
+    vmov.u16    q14, #0x0014            // 20
+    ldr     r4, [r0], r1        //r4=src[-2]
+    ldr     r5, [r0], r1        //r5=src[-1]
 
-	pld			[r0]
-	pld			[r0, r1]
-	vshr.u16	q15, q14, #2			// 5
-	ldr		r6, [r0], r1		//r6=src[0]
-	ldr		r7, [r0], r1		//r7=src[1]
+    pld         [r0]
+    pld         [r0, r1]
+    vshr.u16    q15, q14, #2            // 5
+    ldr     r6, [r0], r1        //r6=src[0]
+    ldr     r7, [r0], r1        //r7=src[1]
 
-	vmov		d0, r4, r5
-	vmov		d1, r5, r6
-	vmov		d2, r6, r7
+    vmov        d0, r4, r5
+    vmov        d1, r5, r6
+    vmov        d2, r6, r7
 
-	ldr		r4, [r0], r1		//r4=src[2]
-	vmov		d3, r7, r4
-	ldr			r7, [sp, #16]
+    ldr     r4, [r0], r1        //r4=src[2]
+    vmov        d3, r7, r4
+    ldr         r7, [sp, #16]
 
 w4_v_mc_luma_loop:
 
-//	pld			[r0]
-	//using reserving r4
-	ldr		r5, [r0], r1		//r5=src[3]
-	ldr		r6, [r0], r1		//r6=src[0]
-	vmov		d4, r4, r5
-	vmov		d5, r5, r6			//reserved r6
+//  pld         [r0]
+    //using reserving r4
+    ldr     r5, [r0], r1        //r5=src[3]
+    ldr     r6, [r0], r1        //r6=src[0]
+    vmov        d4, r4, r5
+    vmov        d5, r5, r6          //reserved r6
 
-	FILTER_6TAG_8BITS 	d0, d1, d2, d3, d4, d5, d16, q14, q15
-	vmov		r4, r5, d16
-	str	r4, [r2], r3			//write 1st 4Byte
-	str	r5, [r2], r3			//write 2nd 4Byte
+    FILTER_6TAG_8BITS   d0, d1, d2, d3, d4, d5, d16, q14, q15
+    vmov        r4, r5, d16
+    str r4, [r2], r3            //write 1st 4Byte
+    str r5, [r2], r3            //write 2nd 4Byte
 
-	ldr		r5, [r0], r1		//r5=src[1]
-	ldr		r4, [r0], r1		//r4=src[2]
-	vmov		d0, r6, r5
-	vmov		d1, r5, r4			//reserved r4
+    ldr     r5, [r0], r1        //r5=src[1]
+    ldr     r4, [r0], r1        //r4=src[2]
+    vmov        d0, r6, r5
+    vmov        d1, r5, r4          //reserved r4
 
-	FILTER_6TAG_8BITS 	d2, d3, d4, d5, d0, d1, d16, q14, q15
-	vmov		r5, r6, d16
-	str	r5, [r2], r3			//write 3rd 4Byte
-	str	r6, [r2], r3			//write 4th 4Byte
+    FILTER_6TAG_8BITS   d2, d3, d4, d5, d0, d1, d16, q14, q15
+    vmov        r5, r6, d16
+    str r5, [r2], r3            //write 3rd 4Byte
+    str r6, [r2], r3            //write 4th 4Byte
 
-	//d4, d5, d0, d1 --> d0, d1, d2, d3
-	vmov	q1, q0
-	vmov	q0, q2
+    //d4, d5, d0, d1 --> d0, d1, d2, d3
+    vmov    q1, q0
+    vmov    q0, q2
 
-	sub		r7, #4
-	cmp		r7, #0
-	bne		w4_v_mc_luma_loop
+    sub     r7, #4
+    cmp     r7, #0
+    bne     w4_v_mc_luma_loop
 
-	pop		{r4, r5, r6, r7}
+    pop     {r4, r5, r6, r7}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN McHorVer22WidthEq16_neon
-	push		{r4}
-	vpush		{q4-q7}
-	ldr			r4, [sp, #68]
+    push        {r4}
+    vpush       {q4-q7}
+    ldr         r4, [sp, #68]
 
-	sub			r0, #2					//src[-2]
-	sub			r0, r0, r1, lsl #1		//src[-2*src_stride-2]
-	pld			[r0]
-	pld			[r0, r1]
+    sub         r0, #2                  //src[-2]
+    sub         r0, r0, r1, lsl #1      //src[-2*src_stride-2]
+    pld         [r0]
+    pld         [r0, r1]
 
-	vmov.u16	q14, #0x0014			// 20
-	vld1.u8	{d0-d2}, [r0], r1		//use 21(16+5), =src[-2]
-	vld1.u8	{d3-d5}, [r0], r1		//use 21(16+5), =src[-1]
+    vmov.u16    q14, #0x0014            // 20
+    vld1.u8 {d0-d2}, [r0], r1       //use 21(16+5), =src[-2]
+    vld1.u8 {d3-d5}, [r0], r1       //use 21(16+5), =src[-1]
 
-	pld			[r0]
-	pld			[r0, r1]
-	vshr.u16	q15, q14, #2			// 5
+    pld         [r0]
+    pld         [r0, r1]
+    vshr.u16    q15, q14, #2            // 5
 
-	vld1.u8	{d6-d8}, [r0], r1		//use 21(16+5), =src[0]
-	vld1.u8	{d9-d11}, [r0], r1	//use 21(16+5), =src[1]
-	pld			[r0]
-	pld			[r0, r1]
-	vld1.u8	{d12-d14}, [r0], r1	//use 21(16+5), =src[2]
+    vld1.u8 {d6-d8}, [r0], r1       //use 21(16+5), =src[0]
+    vld1.u8 {d9-d11}, [r0], r1  //use 21(16+5), =src[1]
+    pld         [r0]
+    pld         [r0, r1]
+    vld1.u8 {d12-d14}, [r0], r1 //use 21(16+5), =src[2]
 
 w16_hv_mc_luma_loop:
 
-	vld1.u8	{d15-d17}, [r0], r1	//use 21(16+5), =src[3]
-	//the 1st row
-	pld			[r0]
-	// vertical filtered into q9/q10
-	FILTER_6TAG_8BITS_TO_16BITS 	d0, d3, d6, d9, d12, d15, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d1, d4, d7,d10, d13, d16,q10, q14, q15	// 8 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0	//output to q0[0]
+    vld1.u8 {d15-d17}, [r0], r1 //use 21(16+5), =src[3]
+    //the 1st row
+    pld         [r0]
+    // vertical filtered into q9/q10
+    FILTER_6TAG_8BITS_TO_16BITS     d0, d3, d6, d9, d12, d15, q9, q14, q15  // 8 avail
+    FILTER_6TAG_8BITS_TO_16BITS     d1, d4, d7,d10, d13, d16,q10, q14, q15  // 8 avail
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC  q9, q10, q11, q12, q13
+    FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0   //output to q0[0]
 
-	// vertical filtered into q10/q11
-	FILTER_6TAG_8BITS_TO_16BITS 	d2, d5, d8,d11, d14, d17,q11, q14, q15	// only 5 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q10, q11, q9, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1	//output to q0[1]
-	vst1.u8	{q0}, [r2], r3		//write 16Byte
+    // vertical filtered into q10/q11
+    FILTER_6TAG_8BITS_TO_16BITS     d2, d5, d8,d11, d14, d17,q11, q14, q15  // only 5 avail
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC  q10, q11, q9, q12, q13
+    FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1    //output to q0[1]
+    vst1.u8 {q0}, [r2], r3      //write 16Byte
 
 
-	vld1.u8	{d0-d2}, [r0], r1		//read 2nd row
-	//the 2nd row
-	pld			[r0]
-	// vertical filtered into q9/q10
-	FILTER_6TAG_8BITS_TO_16BITS 	d3, d6, d9, d12, d15, d0, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d4, d7,d10, d13, d16, d1,q10, q14, q15	// 8 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d3	//output to d3
+    vld1.u8 {d0-d2}, [r0], r1       //read 2nd row
+    //the 2nd row
+    pld         [r0]
+    // vertical filtered into q9/q10
+    FILTER_6TAG_8BITS_TO_16BITS     d3, d6, d9, d12, d15, d0, q9, q14, q15  // 8 avail
+    FILTER_6TAG_8BITS_TO_16BITS     d4, d7,d10, d13, d16, d1,q10, q14, q15  // 8 avail
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC  q9, q10, q11, q12, q13
+    FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d3   //output to d3
 
-	// vertical filtered into q10/q11
-	FILTER_6TAG_8BITS_TO_16BITS 	d5, d8,d11, d14, d17, d2,q11, q14, q15	// only 5 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q10, q11, q9, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d4	//output to d4
+    // vertical filtered into q10/q11
+    FILTER_6TAG_8BITS_TO_16BITS     d5, d8,d11, d14, d17, d2,q11, q14, q15  // only 5 avail
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC  q10, q11, q9, q12, q13
+    FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d4    //output to d4
 
-	vst1.u8	{d3, d4}, [r2], r3		//write 16Byte
+    vst1.u8 {d3, d4}, [r2], r3      //write 16Byte
 
-	vld1.u8	{d3-d5}, [r0], r1		//read 3rd row
-	//the 3rd row
-	pld			[r0]
-	// vertical filtered into q9/q10
-	FILTER_6TAG_8BITS_TO_16BITS 	d6, d9, d12, d15, d0, d3, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d7,d10, d13, d16, d1, d4,q10, q14, q15	// 8 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d6	//output to d6
+    vld1.u8 {d3-d5}, [r0], r1       //read 3rd row
+    //the 3rd row
+    pld         [r0]
+    // vertical filtered into q9/q10
+    FILTER_6TAG_8BITS_TO_16BITS     d6, d9, d12, d15, d0, d3, q9, q14, q15  // 8 avail
+    FILTER_6TAG_8BITS_TO_16BITS     d7,d10, d13, d16, d1, d4,q10, q14, q15  // 8 avail
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC  q9, q10, q11, q12, q13
+    FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d6   //output to d6
 
-	// vertical filtered into q10/q11
-	FILTER_6TAG_8BITS_TO_16BITS 	d8,d11, d14, d17, d2, d5,q11, q14, q15	// only 5 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q10, q11, q9, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d7	//output to d7
-	vst1.u8	{d6, d7}, [r2], r3		//write 16Byte
+    // vertical filtered into q10/q11
+    FILTER_6TAG_8BITS_TO_16BITS     d8,d11, d14, d17, d2, d5,q11, q14, q15  // only 5 avail
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC  q10, q11, q9, q12, q13
+    FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d7    //output to d7
+    vst1.u8 {d6, d7}, [r2], r3      //write 16Byte
 
-	vld1.u8	{d6-d8}, [r0], r1		//read 4th row
-	//the 4th row
-	pld			[r0]
-	// vertical filtered into q9/q10
-	FILTER_6TAG_8BITS_TO_16BITS 	 d9, d12, d15, d0, d3, d6, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS		d10, d13, d16, d1, d4, d7,q10, q14, q15	// 8 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d9	//output to d9
-	// vertical filtered into q10/q11
-	FILTER_6TAG_8BITS_TO_16BITS 	d11, d14, d17, d2, d5, d8,q11, q14, q15	// only 5 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q10, q11, q9, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d10	//output to d10
-	vst1.u8	{d9, d10}, [r2], r3		//write 16Byte
+    vld1.u8 {d6-d8}, [r0], r1       //read 4th row
+    //the 4th row
+    pld         [r0]
+    // vertical filtered into q9/q10
+    FILTER_6TAG_8BITS_TO_16BITS      d9, d12, d15, d0, d3, d6, q9, q14, q15 // 8 avail
+    FILTER_6TAG_8BITS_TO_16BITS     d10, d13, d16, d1, d4, d7,q10, q14, q15 // 8 avail
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC  q9, q10, q11, q12, q13
+    FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d9   //output to d9
+    // vertical filtered into q10/q11
+    FILTER_6TAG_8BITS_TO_16BITS     d11, d14, d17, d2, d5, d8,q11, q14, q15 // only 5 avail
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC  q10, q11, q9, q12, q13
+    FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d10   //output to d10
+    vst1.u8 {d9, d10}, [r2], r3     //write 16Byte
 
-	//d12~d17(q6~q8), d0~d8(q0~q3+d8), --> d0~d14
-	vswp	q0, q6
-	vswp	q6, q3
-	vmov	q5, q2
-	vmov	q2, q8
+    //d12~d17(q6~q8), d0~d8(q0~q3+d8), --> d0~d14
+    vswp    q0, q6
+    vswp    q6, q3
+    vmov    q5, q2
+    vmov    q2, q8
 
-	vmov	d20,d8
-	vmov	q4, q1
-	vmov	q1, q7
-	vmov	d14,d20
+    vmov    d20,d8
+    vmov    q4, q1
+    vmov    q1, q7
+    vmov    d14,d20
 
-	sub		r4, #4
-	cmp		r4, #0
-	bne		w16_hv_mc_luma_loop
-	vpop		{q4-q7}
-	pop		{r4}
+    sub     r4, #4
+    cmp     r4, #0
+    bne     w16_hv_mc_luma_loop
+    vpop        {q4-q7}
+    pop     {r4}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN McHorVer22WidthEq8_neon
-	push		{r4}
-	vpush		{q4}
-	ldr			r4, [sp, #20]
+    push        {r4}
+    vpush       {q4}
+    ldr         r4, [sp, #20]
 
-	sub			r0, #2				//src[-2]
-	sub			r0, r0, r1, lsl #1	//src[-2*src_stride-2]
-	pld			[r0]
-	pld			[r0, r1]
+    sub         r0, #2              //src[-2]
+    sub         r0, r0, r1, lsl #1  //src[-2*src_stride-2]
+    pld         [r0]
+    pld         [r0, r1]
 
-	vmov.u16	q14, #0x0014		// 20
-	vld1.u8	{q0}, [r0], r1	//use 13(8+5), =src[-2]
-	vld1.u8	{q1}, [r0], r1	//use 13(8+5), =src[-1]
+    vmov.u16    q14, #0x0014        // 20
+    vld1.u8 {q0}, [r0], r1  //use 13(8+5), =src[-2]
+    vld1.u8 {q1}, [r0], r1  //use 13(8+5), =src[-1]
 
-	pld			[r0]
-	pld			[r0, r1]
-	vshr.u16	q15, q14, #2		// 5
+    pld         [r0]
+    pld         [r0, r1]
+    vshr.u16    q15, q14, #2        // 5
 
-	vld1.u8	{q2}, [r0], r1	//use 13(8+5), =src[0]
-	vld1.u8	{q3}, [r0], r1	//use 13(8+5), =src[1]
-	pld			[r0]
-	pld			[r0, r1]
-	vld1.u8	{q4}, [r0], r1	//use 13(8+5), =src[2]
+    vld1.u8 {q2}, [r0], r1  //use 13(8+5), =src[0]
+    vld1.u8 {q3}, [r0], r1  //use 13(8+5), =src[1]
+    pld         [r0]
+    pld         [r0, r1]
+    vld1.u8 {q4}, [r0], r1  //use 13(8+5), =src[2]
 
 w8_hv_mc_luma_loop:
 
-	vld1.u8	{q8}, [r0], r1	//use 13(8+5), =src[3]
-	//the 1st row
-	pld			[r0]
-	// vertical filtered into q9/q10
-	FILTER_6TAG_8BITS_TO_16BITS 	d0, d2, d4, d6, d8, d16, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d1, d3, d5, d7, d9, d17, q10, q14, q15	// 5 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18	//output to q9[0]
-	vst1.u8	d18, [r2], r3			//write 8Byte
+    vld1.u8 {q8}, [r0], r1  //use 13(8+5), =src[3]
+    //the 1st row
+    pld         [r0]
+    // vertical filtered into q9/q10
+    FILTER_6TAG_8BITS_TO_16BITS     d0, d2, d4, d6, d8, d16, q9, q14, q15   // 8 avail
+    FILTER_6TAG_8BITS_TO_16BITS     d1, d3, d5, d7, d9, d17, q10, q14, q15  // 5 avail
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC  q9, q10, q11, q12, q13
+    FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18  //output to q9[0]
+    vst1.u8 d18, [r2], r3           //write 8Byte
 
-	vld1.u8	{q0}, [r0], r1		//read 2nd row
-	//the 2nd row
-	pld			[r0]
-	// vertical filtered into q9/q10
-	FILTER_6TAG_8BITS_TO_16BITS 	d2, d4, d6, d8, d16, d0, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d3, d5, d7, d9, d17, d1, q10, q14, q15	// 5 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18	//output to q9[0]
-	vst1.u8	d18, [r2], r3		//write 8Byte
+    vld1.u8 {q0}, [r0], r1      //read 2nd row
+    //the 2nd row
+    pld         [r0]
+    // vertical filtered into q9/q10
+    FILTER_6TAG_8BITS_TO_16BITS     d2, d4, d6, d8, d16, d0, q9, q14, q15   // 8 avail
+    FILTER_6TAG_8BITS_TO_16BITS     d3, d5, d7, d9, d17, d1, q10, q14, q15  // 5 avail
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC  q9, q10, q11, q12, q13
+    FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18  //output to q9[0]
+    vst1.u8 d18, [r2], r3       //write 8Byte
 
-	vld1.u8	{q1}, [r0], r1		//read 3rd row
-	//the 3rd row
-	pld			[r0]
-	// vertical filtered into q9/q10
-	FILTER_6TAG_8BITS_TO_16BITS 	d4, d6, d8, d16, d0, d2, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d5, d7, d9, d17, d1, d3, q10, q14, q15	// 5 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18	//output to q9[0]
-	vst1.u8	d18, [r2], r3			//write 8Byte
+    vld1.u8 {q1}, [r0], r1      //read 3rd row
+    //the 3rd row
+    pld         [r0]
+    // vertical filtered into q9/q10
+    FILTER_6TAG_8BITS_TO_16BITS     d4, d6, d8, d16, d0, d2, q9, q14, q15   // 8 avail
+    FILTER_6TAG_8BITS_TO_16BITS     d5, d7, d9, d17, d1, d3, q10, q14, q15  // 5 avail
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC  q9, q10, q11, q12, q13
+    FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18  //output to q9[0]
+    vst1.u8 d18, [r2], r3           //write 8Byte
 
-	vld1.u8	{q2}, [r0], r1		//read 4th row
-	//the 4th row
-	pld			[r0]
-	// vertical filtered into q9/q10
-	FILTER_6TAG_8BITS_TO_16BITS 	d6, d8, d16, d0, d2, d4, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d7, d9, d17, d1, d3, d5, q10, q14, q15	// 5 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18	//output to q9[0]
-	vst1.u8	d18, [r2], r3			//write 8Byte
+    vld1.u8 {q2}, [r0], r1      //read 4th row
+    //the 4th row
+    pld         [r0]
+    // vertical filtered into q9/q10
+    FILTER_6TAG_8BITS_TO_16BITS     d6, d8, d16, d0, d2, d4, q9, q14, q15   // 8 avail
+    FILTER_6TAG_8BITS_TO_16BITS     d7, d9, d17, d1, d3, d5, q10, q14, q15  // 5 avail
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC  q9, q10, q11, q12, q13
+    FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18  //output to q9[0]
+    vst1.u8 d18, [r2], r3           //write 8Byte
 
-	//q4~q5, q0~q2, --> q0~q4
-	vswp	q0, q4
-	vswp	q2, q4
-	vmov	q3, q1
-	vmov	q1, q8
+    //q4~q5, q0~q2, --> q0~q4
+    vswp    q0, q4
+    vswp    q2, q4
+    vmov    q3, q1
+    vmov    q1, q8
 
-	sub		r4, #4
-	cmp		r4, #0
-	bne		w8_hv_mc_luma_loop
-	vpop		{q4}
-	pop		{r4}
+    sub     r4, #4
+    cmp     r4, #0
+    bne     w8_hv_mc_luma_loop
+    vpop        {q4}
+    pop     {r4}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN McHorVer22WidthEq4_neon
-	push		{r4 ,r5, r6}
-	vpush		{q4-q7}
-	ldr			r6, [sp, #76]
+    push        {r4 ,r5, r6}
+    vpush       {q4-q7}
+    ldr         r6, [sp, #76]
 
-	sub			r0, #2				//src[-2]
-	sub			r0, r0, r1, lsl #1	//src[-2*src_stride-2]
-	pld			[r0]
-	pld			[r0, r1]
+    sub         r0, #2              //src[-2]
+    sub         r0, r0, r1, lsl #1  //src[-2*src_stride-2]
+    pld         [r0]
+    pld         [r0, r1]
 
-	vmov.u16	q14, #0x0014		// 20
-	vld1.u8	{q0}, [r0], r1	//use 9(4+5), =src[-2]
-	vld1.u8	{q1}, [r0], r1	//use 9(4+5), =src[-1]
+    vmov.u16    q14, #0x0014        // 20
+    vld1.u8 {q0}, [r0], r1  //use 9(4+5), =src[-2]
+    vld1.u8 {q1}, [r0], r1  //use 9(4+5), =src[-1]
 
-	pld			[r0]
-	pld			[r0, r1]
-	vshr.u16	q15, q14, #2		// 5
+    pld         [r0]
+    pld         [r0, r1]
+    vshr.u16    q15, q14, #2        // 5
 
-	vld1.u8	{q2}, [r0], r1	//use 9(4+5), =src[0]
-	vld1.u8	{q3}, [r0], r1	//use 9(4+5), =src[1]
-	pld			[r0]
-	pld			[r0, r1]
-	vld1.u8	{q4}, [r0], r1	//use 9(4+5), =src[2]
+    vld1.u8 {q2}, [r0], r1  //use 9(4+5), =src[0]
+    vld1.u8 {q3}, [r0], r1  //use 9(4+5), =src[1]
+    pld         [r0]
+    pld         [r0, r1]
+    vld1.u8 {q4}, [r0], r1  //use 9(4+5), =src[2]
 
 w4_hv_mc_luma_loop:
 
-	vld1.u8	{q5}, [r0], r1	//use 9(4+5), =src[3]
-	vld1.u8	{q6}, [r0], r1	//use 9(4+5), =src[4]
+    vld1.u8 {q5}, [r0], r1  //use 9(4+5), =src[3]
+    vld1.u8 {q6}, [r0], r1  //use 9(4+5), =src[4]
 
-	//the 1st&2nd row
-	pld			[r0]
-	pld			[r0, r1]
-	// vertical filtered
-	FILTER_6TAG_8BITS_TO_16BITS 	d0, d2, d4, d6, d8, d10, q7, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d1, d3, d5, d7, d9, d11, q8, q14, q15	// 1 avail
+    //the 1st&2nd row
+    pld         [r0]
+    pld         [r0, r1]
+    // vertical filtered
+    FILTER_6TAG_8BITS_TO_16BITS     d0, d2, d4, d6, d8, d10, q7, q14, q15   // 8 avail
+    FILTER_6TAG_8BITS_TO_16BITS     d1, d3, d5, d7, d9, d11, q8, q14, q15   // 1 avail
 
-	FILTER_6TAG_8BITS_TO_16BITS 	d2, d4, d6, d8,d10, d12, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d3, d5, d7, d9,d11, d13,q10, q14, q15	// 1 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q7, q8, q11, q12, q13	//4 avail
-	UNPACK_2_16BITS_TO_ABC	q9,q10, q0, q7, q8		//4 avail
+    FILTER_6TAG_8BITS_TO_16BITS     d2, d4, d6, d8,d10, d12, q9, q14, q15   // 8 avail
+    FILTER_6TAG_8BITS_TO_16BITS     d3, d5, d7, d9,d11, d13,q10, q14, q15   // 1 avail
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC  q7, q8, q11, q12, q13   //4 avail
+    UNPACK_2_16BITS_TO_ABC  q9,q10, q0, q7, q8      //4 avail
 
-	vmov	d23, d0
-	vmov	d25, d14
-	vmov	d27, d16
+    vmov    d23, d0
+    vmov    d25, d14
+    vmov    d27, d16
 
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d22	//output to q11[0]
-	vmov		r4 ,r5, d22
-	str		r4, [r2], r3				//write 4Byte
-	str		r5, [r2], r3				//write 4Byte
+    FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d22  //output to q11[0]
+    vmov        r4 ,r5, d22
+    str     r4, [r2], r3                //write 4Byte
+    str     r5, [r2], r3                //write 4Byte
 
-	//the 3rd&4th row
-	vld1.u8	{q0}, [r0], r1	//use 9(4+5), =src[3]
-	vld1.u8	{q1}, [r0], r1	//use 9(4+5), =src[4]
-	pld			[r0]
-	pld			[r0, r1]
-	// vertical filtered
-	FILTER_6TAG_8BITS_TO_16BITS 	d4, d6, d8, d10, d12, d0, q7, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d5, d7, d9, d11, d13, d1, q8, q14, q15	// 1 avail
+    //the 3rd&4th row
+    vld1.u8 {q0}, [r0], r1  //use 9(4+5), =src[3]
+    vld1.u8 {q1}, [r0], r1  //use 9(4+5), =src[4]
+    pld         [r0]
+    pld         [r0, r1]
+    // vertical filtered
+    FILTER_6TAG_8BITS_TO_16BITS     d4, d6, d8, d10, d12, d0, q7, q14, q15  // 8 avail
+    FILTER_6TAG_8BITS_TO_16BITS     d5, d7, d9, d11, d13, d1, q8, q14, q15  // 1 avail
 
-	FILTER_6TAG_8BITS_TO_16BITS 	d6, d8,d10, d12, d0, d2, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d7, d9,d11, d13, d1, d3,q10, q14, q15	// 1 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q7, q8, q11, q12, q13	//4 avail
-	UNPACK_2_16BITS_TO_ABC	q9,q10, q2, q7, q8		//4 avail
+    FILTER_6TAG_8BITS_TO_16BITS     d6, d8,d10, d12, d0, d2, q9, q14, q15   // 8 avail
+    FILTER_6TAG_8BITS_TO_16BITS     d7, d9,d11, d13, d1, d3,q10, q14, q15   // 1 avail
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC  q7, q8, q11, q12, q13   //4 avail
+    UNPACK_2_16BITS_TO_ABC  q9,q10, q2, q7, q8      //4 avail
 
-	vmov	d23, d4
-	vmov	d25, d14
-	vmov	d27, d16
+    vmov    d23, d4
+    vmov    d25, d14
+    vmov    d27, d16
 
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d22	//output to q11[0]
-	vmov		r4 ,r5, d22
-	str		r4, [r2], r3				//write 4Byte
-	str		r5, [r2], r3				//write 4Byte
+    FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d22  //output to q11[0]
+    vmov        r4 ,r5, d22
+    str     r4, [r2], r3                //write 4Byte
+    str     r5, [r2], r3                //write 4Byte
 
-	//q4~q6, q0~q1, --> q0~q4
-	vswp	q4, q0
-	vmov	q3, q4
-	vmov	q4, q1
-	vmov	q1, q5
-	vmov	q2, q6
+    //q4~q6, q0~q1, --> q0~q4
+    vswp    q4, q0
+    vmov    q3, q4
+    vmov    q4, q1
+    vmov    q1, q5
+    vmov    q2, q6
 
-	sub		r6, #4
-	cmp		r6, #0
-	bne		w4_hv_mc_luma_loop
+    sub     r6, #4
+    cmp     r6, #0
+    bne     w4_hv_mc_luma_loop
 
-	vpop		{q4-q7}
-	pop		{r4, r5, r6}
+    vpop        {q4-q7}
+    pop     {r4, r5, r6}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN McCopyWidthEq16_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
+    push        {r4}
+    ldr         r4, [sp, #4]
 w16_copy_loop:
-	vld1.u8		{q0}, [r0], r1
-	sub			r4, #2
-	vld1.u8		{q1}, [r0], r1
-	vst1.u8		{q0}, [r2], r3
-	cmp			r4, #0
-	vst1.u8		{q1}, [r2], r3
-	bne			w16_copy_loop
+    vld1.u8     {q0}, [r0], r1
+    sub         r4, #2
+    vld1.u8     {q1}, [r0], r1
+    vst1.u8     {q0}, [r2], r3
+    cmp         r4, #0
+    vst1.u8     {q1}, [r2], r3
+    bne         w16_copy_loop
 
-	pop		{r4}
+    pop     {r4}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN McCopyWidthEq8_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
+    push        {r4}
+    ldr         r4, [sp, #4]
 w8_copy_loop:
-	vld1.u8		{d0}, [r0], r1
-	vld1.u8		{d1}, [r0], r1
-	vst1.u8		{d0}, [r2], r3
-	vst1.u8		{d1}, [r2], r3
-	sub			r4, #2
-	cmp			r4, #0
-	bne			w8_copy_loop
+    vld1.u8     {d0}, [r0], r1
+    vld1.u8     {d1}, [r0], r1
+    vst1.u8     {d0}, [r2], r3
+    vst1.u8     {d1}, [r2], r3
+    sub         r4, #2
+    cmp         r4, #0
+    bne         w8_copy_loop
 
-	pop		{r4}
+    pop     {r4}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN McCopyWidthEq4_neon
-	push		{r4, r5, r6}
-	ldr			r4, [sp, #12]
+    push        {r4, r5, r6}
+    ldr         r4, [sp, #12]
 w4_copy_loop:
-	ldr		r5, [r0], r1
-	ldr		r6, [r0], r1
-	str		r5, [r2], r3
-	str		r6, [r2], r3
+    ldr     r5, [r0], r1
+    ldr     r6, [r0], r1
+    str     r5, [r2], r3
+    str     r6, [r2], r3
 
-	sub			r4, #2
-	cmp			r4, #0
-	bne			w4_copy_loop
+    sub         r4, #2
+    cmp         r4, #0
+    bne         w4_copy_loop
 
-	pop		{r4, r5, r6}
+    pop     {r4, r5, r6}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN PixelAvgWidthEq16_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
+    push        {r4}
+    ldr         r4, [sp, #4]
 w16_pix_avg_loop:
-	vld1.u8		{q0}, [r2]!
-	vld1.u8		{q1}, [r3]!
-	vld1.u8		{q2}, [r2]!
-	vld1.u8		{q3}, [r3]!
+    vld1.u8     {q0}, [r2]!
+    vld1.u8     {q1}, [r3]!
+    vld1.u8     {q2}, [r2]!
+    vld1.u8     {q3}, [r3]!
 
-	vld1.u8		{q8}, [r2]!
-	vld1.u8		{q9}, [r3]!
-	vld1.u8		{q10}, [r2]!
-	vld1.u8		{q11}, [r3]!
+    vld1.u8     {q8}, [r2]!
+    vld1.u8     {q9}, [r3]!
+    vld1.u8     {q10}, [r2]!
+    vld1.u8     {q11}, [r3]!
 
-	AVERAGE_TWO_8BITS		d0, d0, d2
-	AVERAGE_TWO_8BITS		d1, d1, d3
-	vst1.u8		{q0}, [r0], r1
+    AVERAGE_TWO_8BITS       d0, d0, d2
+    AVERAGE_TWO_8BITS       d1, d1, d3
+    vst1.u8     {q0}, [r0], r1
 
-	AVERAGE_TWO_8BITS		d4, d4, d6
-	AVERAGE_TWO_8BITS		d5, d5, d7
-	vst1.u8		{q2}, [r0], r1
+    AVERAGE_TWO_8BITS       d4, d4, d6
+    AVERAGE_TWO_8BITS       d5, d5, d7
+    vst1.u8     {q2}, [r0], r1
 
-	AVERAGE_TWO_8BITS		d16, d16, d18
-	AVERAGE_TWO_8BITS		d17, d17, d19
-	vst1.u8		{q8}, [r0], r1
+    AVERAGE_TWO_8BITS       d16, d16, d18
+    AVERAGE_TWO_8BITS       d17, d17, d19
+    vst1.u8     {q8}, [r0], r1
 
-	AVERAGE_TWO_8BITS		d20, d20, d22
-	AVERAGE_TWO_8BITS		d21, d21, d23
-	vst1.u8		{q10}, [r0], r1
+    AVERAGE_TWO_8BITS       d20, d20, d22
+    AVERAGE_TWO_8BITS       d21, d21, d23
+    vst1.u8     {q10}, [r0], r1
 
-	sub			r4, #4
-	cmp			r4, #0
-	bne			w16_pix_avg_loop
+    sub         r4, #4
+    cmp         r4, #0
+    bne         w16_pix_avg_loop
 
-	pop		{r4}
+    pop     {r4}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN PixelAvgWidthEq8_neon
-	push		{r4, r5}
-	ldr			r4, [sp, #8]
-	mov			r5, #16
+    push        {r4, r5}
+    ldr         r4, [sp, #8]
+    mov         r5, #16
 w8_pix_avg_loop:
 
-	vld1.u8		{d0}, [r2], r5
-	vld1.u8		{d2}, [r3], r5
-	vld1.u8		{d1}, [r2], r5
-	vld1.u8		{d3}, [r3], r5
+    vld1.u8     {d0}, [r2], r5
+    vld1.u8     {d2}, [r3], r5
+    vld1.u8     {d1}, [r2], r5
+    vld1.u8     {d3}, [r3], r5
 
-	AVERAGE_TWO_8BITS		d0, d0, d2
-	AVERAGE_TWO_8BITS		d1, d1, d3
-	vst1.u8		{d0}, [r0], r1
-	vst1.u8		{d1}, [r0], r1
+    AVERAGE_TWO_8BITS       d0, d0, d2
+    AVERAGE_TWO_8BITS       d1, d1, d3
+    vst1.u8     {d0}, [r0], r1
+    vst1.u8     {d1}, [r0], r1
 
-	vld1.u8		{d4}, [r2], r5
-	vld1.u8		{d6}, [r3], r5
-	vld1.u8		{d5}, [r2], r5
-	vld1.u8		{d7}, [r3], r5
+    vld1.u8     {d4}, [r2], r5
+    vld1.u8     {d6}, [r3], r5
+    vld1.u8     {d5}, [r2], r5
+    vld1.u8     {d7}, [r3], r5
 
-	AVERAGE_TWO_8BITS		d4, d4, d6
-	AVERAGE_TWO_8BITS		d5, d5, d7
-	vst1.u8		{d4}, [r0], r1
-	vst1.u8		{d5}, [r0], r1
+    AVERAGE_TWO_8BITS       d4, d4, d6
+    AVERAGE_TWO_8BITS       d5, d5, d7
+    vst1.u8     {d4}, [r0], r1
+    vst1.u8     {d5}, [r0], r1
 
-	sub			r4, #4
-	cmp			r4, #0
-	bne			w8_pix_avg_loop
+    sub         r4, #4
+    cmp         r4, #0
+    bne         w8_pix_avg_loop
 
-	pop		{r4, r5}
+    pop     {r4, r5}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN PixelAvgWidthEq4_neon
-	push		{r4-r8}
-	ldr			r4, [sp, #20]
+    push        {r4-r8}
+    ldr         r4, [sp, #20]
 w4_pix_avg_loop:
 
-	ldr		r5, [r2]
-	ldr		r6, [r2, #16]
-	ldr		r7, [r3]
-	ldr		r8, [r3, #16]
-	add		r2, #32
-	add		r3, #32
+    ldr     r5, [r2]
+    ldr     r6, [r2, #16]
+    ldr     r7, [r3]
+    ldr     r8, [r3, #16]
+    add     r2, #32
+    add     r3, #32
 
-	vmov		d0, r5, r6
-	vmov		d1, r7, r8
-	AVERAGE_TWO_8BITS		d0, d0, d1
-	vmov		r5, r6, d0
+    vmov        d0, r5, r6
+    vmov        d1, r7, r8
+    AVERAGE_TWO_8BITS       d0, d0, d1
+    vmov        r5, r6, d0
 
-	str		r5, [r0], r1
-	str		r6, [r0], r1
+    str     r5, [r0], r1
+    str     r6, [r0], r1
 
-	sub			r4, #2
-	cmp			r4, #0
-	bne			w4_pix_avg_loop
+    sub         r4, #2
+    cmp         r4, #0
+    bne         w4_pix_avg_loop
 
-	pop		{r4-r8}
+    pop     {r4-r8}
 WELS_ASM_FUNC_END
 
 WELS_ASM_FUNC_BEGIN McChromaWidthEq8_neon
-	push		{r4, r5}
-	ldr			r4, [sp, #8]
-	ldr			r5, [sp, #12]
-//	normal case: {cA*src[x]  + cB*src[x+1]} + {cC*src[x+stride] + cD*srcp[x+stride+1]}
-//	we can opti it by adding vert only/ hori only cases, to be continue
-	vld1.u8	{d31}, [r4]		//load A/B/C/D
-	vld1.u8		{q0}, [r0], r1	//src[x]
+    push        {r4, r5}
+    ldr         r4, [sp, #8]
+    ldr         r5, [sp, #12]
+//  normal case: {cA*src[x]  + cB*src[x+1]} + {cC*src[x+stride] + cD*srcp[x+stride+1]}
+//  we can opti it by adding vert only/ hori only cases, to be continue
+    vld1.u8 {d31}, [r4]     //load A/B/C/D
+    vld1.u8     {q0}, [r0], r1  //src[x]
 
-	vdup.u8	d28, d31[0]			//A
-	vdup.u8	d29, d31[1]			//B
-	vdup.u8	d30, d31[2]			//C
-	vdup.u8	d31, d31[3]			//D
+    vdup.u8 d28, d31[0]         //A
+    vdup.u8 d29, d31[1]         //B
+    vdup.u8 d30, d31[2]         //C
+    vdup.u8 d31, d31[3]         //D
 
-	vext.u8		d1, d0, d1, #1		//src[x+1]
+    vext.u8     d1, d0, d1, #1      //src[x+1]
 
-w8_mc_chroma_loop:	// each two pxl row
-	vld1.u8		{q1}, [r0], r1	//src[x+stride]
-	vld1.u8		{q2}, [r0], r1	//src[x+2*stride]
-	vext.u8		d3, d2, d3, #1		//src[x+stride+1]
-	vext.u8		d5, d4, d5, #1		//src[x+2*stride+1]
+w8_mc_chroma_loop:  // each two pxl row
+    vld1.u8     {q1}, [r0], r1  //src[x+stride]
+    vld1.u8     {q2}, [r0], r1  //src[x+2*stride]
+    vext.u8     d3, d2, d3, #1      //src[x+stride+1]
+    vext.u8     d5, d4, d5, #1      //src[x+2*stride+1]
 
-	vmull.u8		q3, d0, d28			//(src[x] * A)
-	vmlal.u8		q3, d1, d29			//+=(src[x+1] * B)
-	vmlal.u8		q3, d2, d30			//+=(src[x+stride] * C)
-	vmlal.u8		q3, d3, d31			//+=(src[x+stride+1] * D)
-	vrshrn.u16		d6, q3, #6
-	vst1.u8	d6, [r2], r3
+    vmull.u8        q3, d0, d28         //(src[x] * A)
+    vmlal.u8        q3, d1, d29         //+=(src[x+1] * B)
+    vmlal.u8        q3, d2, d30         //+=(src[x+stride] * C)
+    vmlal.u8        q3, d3, d31         //+=(src[x+stride+1] * D)
+    vrshrn.u16      d6, q3, #6
+    vst1.u8 d6, [r2], r3
 
-	vmull.u8		q3, d2, d28			//(src[x] * A)
-	vmlal.u8		q3, d3, d29			//+=(src[x+1] * B)
-	vmlal.u8		q3, d4, d30			//+=(src[x+stride] * C)
-	vmlal.u8		q3, d5, d31			//+=(src[x+stride+1] * D)
-	vrshrn.u16		d6, q3, #6
-	vst1.u8	d6, [r2], r3
+    vmull.u8        q3, d2, d28         //(src[x] * A)
+    vmlal.u8        q3, d3, d29         //+=(src[x+1] * B)
+    vmlal.u8        q3, d4, d30         //+=(src[x+stride] * C)
+    vmlal.u8        q3, d5, d31         //+=(src[x+stride+1] * D)
+    vrshrn.u16      d6, q3, #6
+    vst1.u8 d6, [r2], r3
 
-	vmov		q0, q2
-	sub			r5, #2
-	cmp			r5, #0
-	bne			w8_mc_chroma_loop
+    vmov        q0, q2
+    sub         r5, #2
+    cmp         r5, #0
+    bne         w8_mc_chroma_loop
 
-	pop		{r4, r5}
+    pop     {r4, r5}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN McChromaWidthEq4_neon
 
-	push		{r4, r5, r6}
-	ldr			r4, [sp, #12]
-	ldr			r6, [sp, #16]
-//	normal case: {cA*src[x]  + cB*src[x+1]} + {cC*src[x+stride] + cD*srcp[x+stride+1]}
-//	we can opti it by adding vert only/ hori only cases, to be continue
-	vld1.u8	{d31}, [r4]		//load A/B/C/D
+    push        {r4, r5, r6}
+    ldr         r4, [sp, #12]
+    ldr         r6, [sp, #16]
+//  normal case: {cA*src[x]  + cB*src[x+1]} + {cC*src[x+stride] + cD*srcp[x+stride+1]}
+//  we can opti it by adding vert only/ hori only cases, to be continue
+    vld1.u8 {d31}, [r4]     //load A/B/C/D
 
-	vdup.u8	d28, d31[0]			//A
-	vdup.u8	d29, d31[1]			//B
-	vdup.u8	d30, d31[2]			//C
-	vdup.u8	d31, d31[3]			//D
+    vdup.u8 d28, d31[0]         //A
+    vdup.u8 d29, d31[1]         //B
+    vdup.u8 d30, d31[2]         //C
+    vdup.u8 d31, d31[3]         //D
 
-w4_mc_chroma_loop:	// each two pxl row
-	vld1.u8		{d0}, [r0], r1	//a::src[x]
-	vld1.u8		{d2}, [r0], r1	//b::src[x+stride]
-	vld1.u8		{d4}, [r0]			//c::src[x+2*stride]
+w4_mc_chroma_loop:  // each two pxl row
+    vld1.u8     {d0}, [r0], r1  //a::src[x]
+    vld1.u8     {d2}, [r0], r1  //b::src[x+stride]
+    vld1.u8     {d4}, [r0]          //c::src[x+2*stride]
 
-	vshr.u64		d1, d0, #8
-	vshr.u64		d3, d2, #8
-	vshr.u64		d5, d4, #8
+    vshr.u64        d1, d0, #8
+    vshr.u64        d3, d2, #8
+    vshr.u64        d5, d4, #8
 
-	vmov			q3, q1				//b::[0:7]+b::[1~8]
-	vtrn.32		q0, q1				//d0{a::[0:3]+b::[0:3]}; d1{a::[1:4]+b::[1:4]}
-	vtrn.32		q3, q2				//d6{b::[0:3]+c::[0:3]}; d7{b::[1:4]+c::[1:4]}
+    vmov            q3, q1              //b::[0:7]+b::[1~8]
+    vtrn.32     q0, q1              //d0{a::[0:3]+b::[0:3]}; d1{a::[1:4]+b::[1:4]}
+    vtrn.32     q3, q2              //d6{b::[0:3]+c::[0:3]}; d7{b::[1:4]+c::[1:4]}
 
-	vmull.u8		q1, d0, d28			//(src[x] * A)
-	vmlal.u8		q1, d1, d29			//+=(src[x+1] * B)
-	vmlal.u8		q1, d6, d30			//+=(src[x+stride] * C)
-	vmlal.u8		q1, d7, d31			//+=(src[x+stride+1] * D)
+    vmull.u8        q1, d0, d28         //(src[x] * A)
+    vmlal.u8        q1, d1, d29         //+=(src[x+1] * B)
+    vmlal.u8        q1, d6, d30         //+=(src[x+stride] * C)
+    vmlal.u8        q1, d7, d31         //+=(src[x+stride+1] * D)
 
-	vrshrn.u16		d2, q1, #6
-	vmov		r4, r5, d2
-	str	r4, [r2], r3
-	str	r5, [r2], r3
+    vrshrn.u16      d2, q1, #6
+    vmov        r4, r5, d2
+    str r4, [r2], r3
+    str r5, [r2], r3
 
-	sub			r6, #2
-	cmp			r6, #0
-	bne			w4_mc_chroma_loop
+    sub         r6, #2
+    cmp         r6, #0
+    bne         w4_mc_chroma_loop
 
-	pop		{r4, r5, r6}
+    pop     {r4, r5, r6}
 WELS_ASM_FUNC_END
 
 WELS_ASM_FUNC_BEGIN McHorVer20Width17_neon
-	push		{r4-r5}
-	mov			r4, #20
-	mov			r5, #1
-	sub			r4, r4, r4, lsl #(16-2)
-	lsl			r5, #16
-	ror			r4, #16
-	vmov		d3, r5, r4					// 0x0014FFFB00010000
+    push        {r4-r5}
+    mov         r4, #20
+    mov         r5, #1
+    sub         r4, r4, r4, lsl #(16-2)
+    lsl         r5, #16
+    ror         r4, #16
+    vmov        d3, r5, r4                  // 0x0014FFFB00010000
 
-	sub			r3, #16
-	ldr			r4, [sp, #8]
+    sub         r3, #16
+    ldr         r4, [sp, #8]
 
-	sub			r0, #2
-	vmov.u16	q14, #0x0014				// 20
-	vshr.u16	q15, q14, #2				// 5
+    sub         r0, #2
+    vmov.u16    q14, #0x0014                // 20
+    vshr.u16    q15, q14, #2                // 5
 
 w17_h_mc_luma_loop:
-	vld1.u8	{d0,d1,d2}, [r0], r1	//only use 22(17+5); q0=src[-2]
+    vld1.u8 {d0,d1,d2}, [r0], r1    //only use 22(17+5); q0=src[-2]
 
-	vext.8		q2, q0, q1, #1		//q2=src[-1]
-	vext.8		q3, q0, q1, #2		//q3=src[0]
-	vext.8		q8, q0, q1, #3		//q8=src[1]
-	vext.8		q9, q0, q1, #4		//q9=src[2]
-	vext.8		q10, q0, q1, #5		//q10=src[3]
+    vext.8      q2, q0, q1, #1      //q2=src[-1]
+    vext.8      q3, q0, q1, #2      //q3=src[0]
+    vext.8      q8, q0, q1, #3      //q8=src[1]
+    vext.8      q9, q0, q1, #4      //q9=src[2]
+    vext.8      q10, q0, q1, #5     //q10=src[3]
 
-	FILTER_6TAG_8BITS 	d0, d4, d6, d16, d18, d20, d22, q14, q15
+    FILTER_6TAG_8BITS   d0, d4, d6, d16, d18, d20, d22, q14, q15
 
-	FILTER_6TAG_8BITS 	d1, d5, d7, d17, d19, d21, d23, q14, q15
+    FILTER_6TAG_8BITS   d1, d5, d7, d17, d19, d21, d23, q14, q15
 
-	vst1.u8	{d22, d23}, [r2]!		//write [0:15] Byte
+    vst1.u8 {d22, d23}, [r2]!       //write [0:15] Byte
 
-	vsli.64	d2, d2, #8				// [0][1][2][3][4][5]XO-->O[0][1][2][3][4][5]X
-	FILTER_SINGLE_TAG_8BITS	d2, d3, d22, q11, q1
+    vsli.64 d2, d2, #8              // [0][1][2][3][4][5]XO-->O[0][1][2][3][4][5]X
+    FILTER_SINGLE_TAG_8BITS d2, d3, d22, q11, q1
 
-	vst1.u8	{d2[0]}, [r2], r3		//write 16th Byte
+    vst1.u8 {d2[0]}, [r2], r3       //write 16th Byte
 
-	sub		r4, #1
-	cmp		r4, #0
-	bne		w17_h_mc_luma_loop
-	pop		{r4-r5}
+    sub     r4, #1
+    cmp     r4, #0
+    bne     w17_h_mc_luma_loop
+    pop     {r4-r5}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN McHorVer20Width9_neon
-	push		{r4-r5}
-	mov			r4, #20
-	mov			r5, #1
-	sub			r4, r4, r4, lsl #(16-2)
-	lsl			r5, #16
-	ror			r4, #16
-	vmov		d7, r5, r4					// 0x0014FFFB00010000
+    push        {r4-r5}
+    mov         r4, #20
+    mov         r5, #1
+    sub         r4, r4, r4, lsl #(16-2)
+    lsl         r5, #16
+    ror         r4, #16
+    vmov        d7, r5, r4                  // 0x0014FFFB00010000
 
-	sub			r3, #8
-	ldr			r4, [sp, #8]
+    sub         r3, #8
+    ldr         r4, [sp, #8]
 
-	sub			r0, #2
-	vmov.u16	q14, #0x0014				// 20
-	vshr.u16	q15, q14, #2				// 5
+    sub         r0, #2
+    vmov.u16    q14, #0x0014                // 20
+    vshr.u16    q15, q14, #2                // 5
 
 w9_h_mc_luma_loop:
-	vld1.u8	{d0,d1}, [r0], r1	//only use 14(9+5); q0=src[-2]
-	pld			[r0]
+    vld1.u8 {d0,d1}, [r0], r1   //only use 14(9+5); q0=src[-2]
+    pld         [r0]
 
-	vext.8		d2, d0, d1, #1		//d2=src[-1]
-	vext.8		d3, d0, d1, #2		//d3=src[0]
-	vext.8		d4, d0, d1, #3		//d4=src[1]
-	vext.8		d5, d0, d1, #4		//d5=src[2]
-	vext.8		d6, d0, d1, #5		//d6=src[3]
+    vext.8      d2, d0, d1, #1      //d2=src[-1]
+    vext.8      d3, d0, d1, #2      //d3=src[0]
+    vext.8      d4, d0, d1, #3      //d4=src[1]
+    vext.8      d5, d0, d1, #4      //d5=src[2]
+    vext.8      d6, d0, d1, #5      //d6=src[3]
 
-	FILTER_6TAG_8BITS 	d0, d2, d3, d4, d5, d6, d16, q14, q15
+    FILTER_6TAG_8BITS   d0, d2, d3, d4, d5, d6, d16, q14, q15
 
-	sub		r4, #1
-	vst1.u8	{d16}, [r2]!		//write [0:7] Byte
+    sub     r4, #1
+    vst1.u8 {d16}, [r2]!        //write [0:7] Byte
 
-	vsli.64	d2, d1, #8				// [0][1][2][3][4][5]XO-->O[0][1][2][3][4][5]X
-	FILTER_SINGLE_TAG_8BITS	d2, d7, d18, q9, q1
-	vst1.u8	{d2[0]}, [r2], r3		//write 8th Byte
+    vsli.64 d2, d1, #8              // [0][1][2][3][4][5]XO-->O[0][1][2][3][4][5]X
+    FILTER_SINGLE_TAG_8BITS d2, d7, d18, q9, q1
+    vst1.u8 {d2[0]}, [r2], r3       //write 8th Byte
 
-	cmp		r4, #0
-	bne		w9_h_mc_luma_loop
-	pop		{r4-r5}
+    cmp     r4, #0
+    bne     w9_h_mc_luma_loop
+    pop     {r4-r5}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN McHorVer02Height17_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
+    push        {r4}
+    ldr         r4, [sp, #4]
 
-	sub			r0, r0, r1, lsl #1		//src[-2*src_stride]
-	pld			[r0]
-	pld			[r0, r1]
-	vmov.u16	q14, #0x0014			// 20
-	vld1.u8	{q0}, [r0], r1		//q0=src[-2]
-	vld1.u8	{q1}, [r0], r1		//q1=src[-1]
+    sub         r0, r0, r1, lsl #1      //src[-2*src_stride]
+    pld         [r0]
+    pld         [r0, r1]
+    vmov.u16    q14, #0x0014            // 20
+    vld1.u8 {q0}, [r0], r1      //q0=src[-2]
+    vld1.u8 {q1}, [r0], r1      //q1=src[-1]
 
-	pld			[r0]
-	pld			[r0, r1]
-	vshr.u16	q15, q14, #2			// 5
-	vld1.u8	{q2}, [r0], r1		//q2=src[0]
-	vld1.u8	{q3}, [r0], r1		//q3=src[1]
-	vld1.u8	{q8}, [r0], r1		//q8=src[2]
+    pld         [r0]
+    pld         [r0, r1]
+    vshr.u16    q15, q14, #2            // 5
+    vld1.u8 {q2}, [r0], r1      //q2=src[0]
+    vld1.u8 {q3}, [r0], r1      //q3=src[1]
+    vld1.u8 {q8}, [r0], r1      //q8=src[2]
 
 w17_v_mc_luma_loop:
 
-	vld1.u8	{q9}, [r0], r1		//q9=src[3]
+    vld1.u8 {q9}, [r0], r1      //q9=src[3]
 
-	FILTER_6TAG_8BITS 	d0, d2, d4, d6, d16, d18, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d1, d3, d5, d7, d17, d19, d21, q14, q15
-	vld1.u8	{q0}, [r0], r1		//read 2nd row
-	vst1.u8	{q10}, [r2], r3			//write 1st 16Byte
+    FILTER_6TAG_8BITS   d0, d2, d4, d6, d16, d18, d20, q14, q15
+    pld         [r0]
+    FILTER_6TAG_8BITS   d1, d3, d5, d7, d17, d19, d21, q14, q15
+    vld1.u8 {q0}, [r0], r1      //read 2nd row
+    vst1.u8 {q10}, [r2], r3         //write 1st 16Byte
 
-	FILTER_6TAG_8BITS 	d2, d4, d6, d16, d18, d0, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d3, d5, d7, d17, d19, d1, d21, q14, q15
-	vld1.u8	{q1}, [r0], r1		//read 3rd row
-	vst1.u8	{q10}, [r2], r3			//write 2nd 16Byte
+    FILTER_6TAG_8BITS   d2, d4, d6, d16, d18, d0, d20, q14, q15
+    pld         [r0]
+    FILTER_6TAG_8BITS   d3, d5, d7, d17, d19, d1, d21, q14, q15
+    vld1.u8 {q1}, [r0], r1      //read 3rd row
+    vst1.u8 {q10}, [r2], r3         //write 2nd 16Byte
 
-	FILTER_6TAG_8BITS 	d4, d6, d16, d18, d0, d2, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d5, d7, d17, d19, d1, d3, d21, q14, q15
-	vld1.u8	{q2}, [r0], r1		//read 4th row
-	vst1.u8	{q10}, [r2], r3			//write 3rd 16Byte
+    FILTER_6TAG_8BITS   d4, d6, d16, d18, d0, d2, d20, q14, q15
+    pld         [r0]
+    FILTER_6TAG_8BITS   d5, d7, d17, d19, d1, d3, d21, q14, q15
+    vld1.u8 {q2}, [r0], r1      //read 4th row
+    vst1.u8 {q10}, [r2], r3         //write 3rd 16Byte
 
-	FILTER_6TAG_8BITS 	d6, d16, d18, d0, d2, d4, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d7, d17, d19, d1, d3, d5, d21, q14, q15
-	vld1.u8	{q3}, [r0], r1		//read 5th row
-	vst1.u8	{q10}, [r2], r3			//write 4th 16Byte
+    FILTER_6TAG_8BITS   d6, d16, d18, d0, d2, d4, d20, q14, q15
+    pld         [r0]
+    FILTER_6TAG_8BITS   d7, d17, d19, d1, d3, d5, d21, q14, q15
+    vld1.u8 {q3}, [r0], r1      //read 5th row
+    vst1.u8 {q10}, [r2], r3         //write 4th 16Byte
 
-	FILTER_6TAG_8BITS 	d16, d18, d0, d2, d4, d6, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d17, d19, d1, d3, d5, d7, d21, q14, q15
-	vld1.u8	{q8}, [r0], r1		//read 6th row
-	vst1.u8	{q10}, [r2], r3			//write 5th 16Byte
+    FILTER_6TAG_8BITS   d16, d18, d0, d2, d4, d6, d20, q14, q15
+    pld         [r0]
+    FILTER_6TAG_8BITS   d17, d19, d1, d3, d5, d7, d21, q14, q15
+    vld1.u8 {q8}, [r0], r1      //read 6th row
+    vst1.u8 {q10}, [r2], r3         //write 5th 16Byte
 
-	FILTER_6TAG_8BITS 	d18, d0, d2, d4, d6, d16, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d19, d1, d3, d5, d7, d17, d21, q14, q15
-	vld1.u8	{q9}, [r0], r1		//read 7th row
-	vst1.u8	{q10}, [r2], r3			//write 6th 16Byte
+    FILTER_6TAG_8BITS   d18, d0, d2, d4, d6, d16, d20, q14, q15
+    pld         [r0]
+    FILTER_6TAG_8BITS   d19, d1, d3, d5, d7, d17, d21, q14, q15
+    vld1.u8 {q9}, [r0], r1      //read 7th row
+    vst1.u8 {q10}, [r2], r3         //write 6th 16Byte
 
-	FILTER_6TAG_8BITS 	d0, d2, d4, d6, d16, d18, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d1, d3, d5, d7, d17, d19, d21, q14, q15
-	vld1.u8	{q0}, [r0], r1		//read 8th row
-	vst1.u8	{q10}, [r2], r3			//write 7th 16Byte
+    FILTER_6TAG_8BITS   d0, d2, d4, d6, d16, d18, d20, q14, q15
+    pld         [r0]
+    FILTER_6TAG_8BITS   d1, d3, d5, d7, d17, d19, d21, q14, q15
+    vld1.u8 {q0}, [r0], r1      //read 8th row
+    vst1.u8 {q10}, [r2], r3         //write 7th 16Byte
 
-	FILTER_6TAG_8BITS 	d2, d4, d6, d16, d18, d0, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d3, d5, d7, d17, d19, d1, d21, q14, q15
-	vst1.u8	{q10}, [r2], r3			//write 8th 16Byte
+    FILTER_6TAG_8BITS   d2, d4, d6, d16, d18, d0, d20, q14, q15
+    pld         [r0]
+    FILTER_6TAG_8BITS   d3, d5, d7, d17, d19, d1, d21, q14, q15
+    vst1.u8 {q10}, [r2], r3         //write 8th 16Byte
 
-	//q2, q3, q8, q9, q0 --> q0~q8
-	vswp	q0, q8
-	vswp	q0, q2
-	vmov	q1, q3
-	vmov	q3, q9						//q0~q8
+    //q2, q3, q8, q9, q0 --> q0~q8
+    vswp    q0, q8
+    vswp    q0, q2
+    vmov    q1, q3
+    vmov    q3, q9                      //q0~q8
 
-	sub		r4, #8
-	cmp		r4, #1
-	bne		w17_v_mc_luma_loop
-	// the last 16Bytes
-	vld1.u8	{q9}, [r0], r1		//q9=src[3]
-	FILTER_6TAG_8BITS 	d0, d2, d4, d6, d16, d18, d20, q14, q15
-	FILTER_6TAG_8BITS 	d1, d3, d5, d7, d17, d19, d21, q14, q15
-	vst1.u8	{q10}, [r2], r3			//write 1st 16Byte
+    sub     r4, #8
+    cmp     r4, #1
+    bne     w17_v_mc_luma_loop
+    // the last 16Bytes
+    vld1.u8 {q9}, [r0], r1      //q9=src[3]
+    FILTER_6TAG_8BITS   d0, d2, d4, d6, d16, d18, d20, q14, q15
+    FILTER_6TAG_8BITS   d1, d3, d5, d7, d17, d19, d21, q14, q15
+    vst1.u8 {q10}, [r2], r3         //write 1st 16Byte
 
-	pop		{r4}
+    pop     {r4}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN McHorVer02Height9_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
+    push        {r4}
+    ldr         r4, [sp, #4]
 
-	sub			r0, r0, r1, lsl #1		//src[-2*src_stride]
-	pld			[r0]
-	pld			[r0, r1]
-	vmov.u16	q14, #0x0014			// 20
-	vld1.u8	{d0}, [r0], r1		//d0=src[-2]
-	vld1.u8	{d1}, [r0], r1		//d1=src[-1]
+    sub         r0, r0, r1, lsl #1      //src[-2*src_stride]
+    pld         [r0]
+    pld         [r0, r1]
+    vmov.u16    q14, #0x0014            // 20
+    vld1.u8 {d0}, [r0], r1      //d0=src[-2]
+    vld1.u8 {d1}, [r0], r1      //d1=src[-1]
 
-	pld			[r0]
-	pld			[r0, r1]
-	vshr.u16	q15, q14, #2			// 5
-	vld1.u8	{d2}, [r0], r1		//d2=src[0]
-	vld1.u8	{d3}, [r0], r1		//d3=src[1]
+    pld         [r0]
+    pld         [r0, r1]
+    vshr.u16    q15, q14, #2            // 5
+    vld1.u8 {d2}, [r0], r1      //d2=src[0]
+    vld1.u8 {d3}, [r0], r1      //d3=src[1]
 
-	vld1.u8	{d4}, [r0], r1		//d4=src[2]
-	vld1.u8	{d5}, [r0], r1		//d5=src[3]
+    vld1.u8 {d4}, [r0], r1      //d4=src[2]
+    vld1.u8 {d5}, [r0], r1      //d5=src[3]
 
 w9_v_mc_luma_loop:
 
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d0, d1, d2, d3, d4, d5, d16, q14, q15
-	vld1.u8	{d0}, [r0], r1		//read 2nd row
-	vst1.u8	{d16}, [r2], r3		//write 1st 8Byte
+    pld         [r0]
+    FILTER_6TAG_8BITS   d0, d1, d2, d3, d4, d5, d16, q14, q15
+    vld1.u8 {d0}, [r0], r1      //read 2nd row
+    vst1.u8 {d16}, [r2], r3     //write 1st 8Byte
 
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d1, d2, d3, d4, d5, d0, d16, q14, q15
-	vld1.u8	{d1}, [r0], r1		//read 3rd row
-	vst1.u8	{d16}, [r2], r3		//write 2nd 8Byte
+    pld         [r0]
+    FILTER_6TAG_8BITS   d1, d2, d3, d4, d5, d0, d16, q14, q15
+    vld1.u8 {d1}, [r0], r1      //read 3rd row
+    vst1.u8 {d16}, [r2], r3     //write 2nd 8Byte
 
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d2, d3, d4, d5, d0, d1, d16, q14, q15
-	vld1.u8	{d2}, [r0], r1		//read 4th row
-	vst1.u8	{d16}, [r2], r3		//write 3rd 8Byte
+    pld         [r0]
+    FILTER_6TAG_8BITS   d2, d3, d4, d5, d0, d1, d16, q14, q15
+    vld1.u8 {d2}, [r0], r1      //read 4th row
+    vst1.u8 {d16}, [r2], r3     //write 3rd 8Byte
 
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d3, d4, d5, d0, d1, d2, d16, q14, q15
-	vld1.u8	{d3}, [r0], r1		//read 5th row
-	vst1.u8	{d16}, [r2], r3		//write 4th 8Byte
+    pld         [r0]
+    FILTER_6TAG_8BITS   d3, d4, d5, d0, d1, d2, d16, q14, q15
+    vld1.u8 {d3}, [r0], r1      //read 5th row
+    vst1.u8 {d16}, [r2], r3     //write 4th 8Byte
 
-	//d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
-	vswp	q0, q2
-	vswp	q1, q2
+    //d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
+    vswp    q0, q2
+    vswp    q1, q2
 
-	sub		r4, #4
-	cmp		r4, #1
-	bne		w9_v_mc_luma_loop
+    sub     r4, #4
+    cmp     r4, #1
+    bne     w9_v_mc_luma_loop
 
-	FILTER_6TAG_8BITS 	d0, d1, d2, d3, d4, d5, d16, q14, q15
-	vst1.u8	{d16}, [r2], r3		//write last 8Byte
+    FILTER_6TAG_8BITS   d0, d1, d2, d3, d4, d5, d16, q14, q15
+    vst1.u8 {d16}, [r2], r3     //write last 8Byte
 
-	pop		{r4}
+    pop     {r4}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN McHorVer22Width17_neon
-	push		{r4}
-	vpush		{q4-q7}
-	ldr			r4, [sp, #68]
+    push        {r4}
+    vpush       {q4-q7}
+    ldr         r4, [sp, #68]
 
-	sub			r0, #2					//src[-2]
-	sub			r0, r0, r1, lsl #1		//src[-2*src_stride-2]
-	pld			[r0]
-	pld			[r0, r1]
+    sub         r0, #2                  //src[-2]
+    sub         r0, r0, r1, lsl #1      //src[-2*src_stride-2]
+    pld         [r0]
+    pld         [r0, r1]
 
-	vmov.u16	q14, #0x0014			// 20
-	vld1.u8	{d0-d2}, [r0], r1		//use 21(17+5), =src[-2]
-	vld1.u8	{d3-d5}, [r0], r1		//use 21(17+5), =src[-1]
+    vmov.u16    q14, #0x0014            // 20
+    vld1.u8 {d0-d2}, [r0], r1       //use 21(17+5), =src[-2]
+    vld1.u8 {d3-d5}, [r0], r1       //use 21(17+5), =src[-1]
 
-	pld			[r0]
-	pld			[r0, r1]
-	vshr.u16	q15, q14, #2			// 5
+    pld         [r0]
+    pld         [r0, r1]
+    vshr.u16    q15, q14, #2            // 5
 
-	vld1.u8	{d6-d8}, [r0], r1		//use 21(17+5), =src[0]
-	vld1.u8	{d9-d11}, [r0], r1	//use 21(17+5), =src[1]
-	pld			[r0]
-	pld			[r0, r1]
-	vld1.u8	{d12-d14}, [r0], r1	//use 21(17+5), =src[2]
-	sub			r3, #16
+    vld1.u8 {d6-d8}, [r0], r1       //use 21(17+5), =src[0]
+    vld1.u8 {d9-d11}, [r0], r1  //use 21(17+5), =src[1]
+    pld         [r0]
+    pld         [r0, r1]
+    vld1.u8 {d12-d14}, [r0], r1 //use 21(17+5), =src[2]
+    sub         r3, #16
 
 w17_hv_mc_luma_loop:
 
-	vld1.u8	{d15-d17}, [r0], r1	//use 21(17+5), =src[3]
-	//the 1st row
-	pld			[r0]
-	// vertical filtered into q9/q10
-	FILTER_6TAG_8BITS_TO_16BITS 	d0, d3, d6, d9, d12, d15, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d1, d4, d7,d10, d13, d16,q10, q14, q15	// 8 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0	//output to q0[0]
-	// vertical filtered into q10/q11
-	FILTER_6TAG_8BITS_TO_16BITS 	d2, d5, d8,d11, d14, d17,q11, q14, q15	// only 6 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q10, q11, q9, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1	//output to q0[1]
-	vst1.u8	{d0, d1}, [r2]!			//write 16Byte
-	UNPACK_1_IN_8x16BITS_TO_8BITS	d2, d22, d23, q11 //output to d2[0]
-	vst1.u8	{d2[0]}, [r2], r3		//write 16th Byte
+    vld1.u8 {d15-d17}, [r0], r1 //use 21(17+5), =src[3]
+    //the 1st row
+    pld         [r0]
+    // vertical filtered into q9/q10
+    FILTER_6TAG_8BITS_TO_16BITS     d0, d3, d6, d9, d12, d15, q9, q14, q15  // 8 avail
+    FILTER_6TAG_8BITS_TO_16BITS     d1, d4, d7,d10, d13, d16,q10, q14, q15  // 8 avail
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC  q9, q10, q11, q12, q13
+    FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0   //output to q0[0]
+    // vertical filtered into q10/q11
+    FILTER_6TAG_8BITS_TO_16BITS     d2, d5, d8,d11, d14, d17,q11, q14, q15  // only 6 avail
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC  q10, q11, q9, q12, q13
+    FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1    //output to q0[1]
+    vst1.u8 {d0, d1}, [r2]!         //write 16Byte
+    UNPACK_1_IN_8x16BITS_TO_8BITS   d2, d22, d23, q11 //output to d2[0]
+    vst1.u8 {d2[0]}, [r2], r3       //write 16th Byte
 
-	vld1.u8	{d0-d2}, [r0], r1		//read 2nd row
-	//the 2nd row
-	pld			[r0]
-	// vertical filtered into q9/q10
-	FILTER_6TAG_8BITS_TO_16BITS 	d3, d6, d9, d12, d15, d0, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d4, d7,d10, d13, d16, d1,q10, q14, q15	// 8 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d3	//output to d3
-	// vertical filtered into q10/q11
-	FILTER_6TAG_8BITS_TO_16BITS 	d5, d8,d11, d14, d17, d2,q11, q14, q15	// only 6 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q10, q11, q9, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d4	//output to d4
-	vst1.u8	{d3, d4}, [r2]!		//write 16Byte
-	UNPACK_1_IN_8x16BITS_TO_8BITS	d5, d22, d23, q11 //output to d5[0]
-	vst1.u8	{d5[0]}, [r2], r3		//write 16th Byte
+    vld1.u8 {d0-d2}, [r0], r1       //read 2nd row
+    //the 2nd row
+    pld         [r0]
+    // vertical filtered into q9/q10
+    FILTER_6TAG_8BITS_TO_16BITS     d3, d6, d9, d12, d15, d0, q9, q14, q15  // 8 avail
+    FILTER_6TAG_8BITS_TO_16BITS     d4, d7,d10, d13, d16, d1,q10, q14, q15  // 8 avail
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC  q9, q10, q11, q12, q13
+    FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d3   //output to d3
+    // vertical filtered into q10/q11
+    FILTER_6TAG_8BITS_TO_16BITS     d5, d8,d11, d14, d17, d2,q11, q14, q15  // only 6 avail
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC  q10, q11, q9, q12, q13
+    FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d4    //output to d4
+    vst1.u8 {d3, d4}, [r2]!     //write 16Byte
+    UNPACK_1_IN_8x16BITS_TO_8BITS   d5, d22, d23, q11 //output to d5[0]
+    vst1.u8 {d5[0]}, [r2], r3       //write 16th Byte
 
-	vld1.u8	{d3-d5}, [r0], r1		//read 3rd row
-	//the 3rd row
-	pld			[r0]
-	// vertical filtered into q9/q10
-	FILTER_6TAG_8BITS_TO_16BITS 	d6, d9, d12, d15, d0, d3, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d7,d10, d13, d16, d1, d4,q10, q14, q15	// 8 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d6	//output to d6
-	// vertical filtered into q10/q11
-	FILTER_6TAG_8BITS_TO_16BITS 	d8,d11, d14, d17, d2, d5,q11, q14, q15	// only 6 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q10, q11, q9, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d7	//output to d7
-	vst1.u8	{d6, d7}, [r2]!		//write 16Byte
-	UNPACK_1_IN_8x16BITS_TO_8BITS	d8, d22, d23, q11 //output to d8[0]
-	vst1.u8	{d8[0]}, [r2], r3		//write 16th Byte
+    vld1.u8 {d3-d5}, [r0], r1       //read 3rd row
+    //the 3rd row
+    pld         [r0]
+    // vertical filtered into q9/q10
+    FILTER_6TAG_8BITS_TO_16BITS     d6, d9, d12, d15, d0, d3, q9, q14, q15  // 8 avail
+    FILTER_6TAG_8BITS_TO_16BITS     d7,d10, d13, d16, d1, d4,q10, q14, q15  // 8 avail
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC  q9, q10, q11, q12, q13
+    FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d6   //output to d6
+    // vertical filtered into q10/q11
+    FILTER_6TAG_8BITS_TO_16BITS     d8,d11, d14, d17, d2, d5,q11, q14, q15  // only 6 avail
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC  q10, q11, q9, q12, q13
+    FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d7    //output to d7
+    vst1.u8 {d6, d7}, [r2]!     //write 16Byte
+    UNPACK_1_IN_8x16BITS_TO_8BITS   d8, d22, d23, q11 //output to d8[0]
+    vst1.u8 {d8[0]}, [r2], r3       //write 16th Byte
 
-	vld1.u8	{d6-d8}, [r0], r1		//read 4th row
-	//the 4th row
-	pld			[r0]
-	// vertical filtered into q9/q10
-	FILTER_6TAG_8BITS_TO_16BITS 	 d9, d12, d15, d0, d3, d6, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS		d10, d13, d16, d1, d4, d7,q10, q14, q15	// 8 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d9	//output to d9
-	// vertical filtered into q10/q11
-	FILTER_6TAG_8BITS_TO_16BITS 	d11, d14, d17, d2, d5, d8,q11, q14, q15	// only 6 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q10, q11, q9, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d10	//output to d10
-	vst1.u8	{d9, d10}, [r2]!		//write 16Byte
-	UNPACK_1_IN_8x16BITS_TO_8BITS	d11, d22, d23, q11 //output to d11[0]
-	vst1.u8	{d11[0]}, [r2], r3		//write 16th Byte
+    vld1.u8 {d6-d8}, [r0], r1       //read 4th row
+    //the 4th row
+    pld         [r0]
+    // vertical filtered into q9/q10
+    FILTER_6TAG_8BITS_TO_16BITS      d9, d12, d15, d0, d3, d6, q9, q14, q15 // 8 avail
+    FILTER_6TAG_8BITS_TO_16BITS     d10, d13, d16, d1, d4, d7,q10, q14, q15 // 8 avail
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC  q9, q10, q11, q12, q13
+    FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d9   //output to d9
+    // vertical filtered into q10/q11
+    FILTER_6TAG_8BITS_TO_16BITS     d11, d14, d17, d2, d5, d8,q11, q14, q15 // only 6 avail
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC  q10, q11, q9, q12, q13
+    FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d10   //output to d10
+    vst1.u8 {d9, d10}, [r2]!        //write 16Byte
+    UNPACK_1_IN_8x16BITS_TO_8BITS   d11, d22, d23, q11 //output to d11[0]
+    vst1.u8 {d11[0]}, [r2], r3      //write 16th Byte
 
-	//d12~d17(q6~q8), d0~d8(q0~q3+d8), --> d0~d14
-	vswp	q0, q6
-	vswp	q6, q3
-	vmov	q5, q2
-	vmov	q2, q8
+    //d12~d17(q6~q8), d0~d8(q0~q3+d8), --> d0~d14
+    vswp    q0, q6
+    vswp    q6, q3
+    vmov    q5, q2
+    vmov    q2, q8
 
-	vmov	d20,d8
-	vmov	q4, q1
-	vmov	q1, q7
-	vmov	d14,d20
+    vmov    d20,d8
+    vmov    q4, q1
+    vmov    q1, q7
+    vmov    d14,d20
 
-	sub		r4, #4
-	cmp		r4, #1
-	bne		w17_hv_mc_luma_loop
-	//the last row
-	vld1.u8	{d15-d17}, [r0], r1	//use 21(17+5), =src[3]
-	// vertical filtered into q9/q10
-	FILTER_6TAG_8BITS_TO_16BITS 	d0, d3, d6, d9, d12, d15, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d1, d4, d7,d10, d13, d16,q10, q14, q15	// 8 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0	//output to q0[0]
-	// vertical filtered into q10/q11
-	FILTER_6TAG_8BITS_TO_16BITS 	d2, d5, d8,d11, d14, d17,q11, q14, q15	// only 6 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q10, q11, q9, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1	//output to q0[1]
-	vst1.u8	{q0}, [r2]!			//write 16Byte
-	UNPACK_1_IN_8x16BITS_TO_8BITS	d2, d22, d23, q11 //output to d2[0]
-	vst1.u8	{d2[0]}, [r2], r3		//write 16th Byte
+    sub     r4, #4
+    cmp     r4, #1
+    bne     w17_hv_mc_luma_loop
+    //the last row
+    vld1.u8 {d15-d17}, [r0], r1 //use 21(17+5), =src[3]
+    // vertical filtered into q9/q10
+    FILTER_6TAG_8BITS_TO_16BITS     d0, d3, d6, d9, d12, d15, q9, q14, q15  // 8 avail
+    FILTER_6TAG_8BITS_TO_16BITS     d1, d4, d7,d10, d13, d16,q10, q14, q15  // 8 avail
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC  q9, q10, q11, q12, q13
+    FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0   //output to q0[0]
+    // vertical filtered into q10/q11
+    FILTER_6TAG_8BITS_TO_16BITS     d2, d5, d8,d11, d14, d17,q11, q14, q15  // only 6 avail
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC  q10, q11, q9, q12, q13
+    FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1    //output to q0[1]
+    vst1.u8 {q0}, [r2]!         //write 16Byte
+    UNPACK_1_IN_8x16BITS_TO_8BITS   d2, d22, d23, q11 //output to d2[0]
+    vst1.u8 {d2[0]}, [r2], r3       //write 16th Byte
 
-	vpop		{q4-q7}
-	pop		{r4}
+    vpop        {q4-q7}
+    pop     {r4}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN McHorVer22Width9_neon
-	push		{r4}
-	vpush		{q4}
-	ldr			r4, [sp, #20]
+    push        {r4}
+    vpush       {q4}
+    ldr         r4, [sp, #20]
 
-	sub			r0, #2				//src[-2]
-	sub			r0, r0, r1, lsl #1	//src[-2*src_stride-2]
-	pld			[r0]
-	pld			[r0, r1]
+    sub         r0, #2              //src[-2]
+    sub         r0, r0, r1, lsl #1  //src[-2*src_stride-2]
+    pld         [r0]
+    pld         [r0, r1]
 
-	vmov.u16	q14, #0x0014		// 20
-	vld1.u8	{q0}, [r0], r1	//use 14(9+5), =src[-2]
-	vld1.u8	{q1}, [r0], r1	//use 14(9+5), =src[-1]
+    vmov.u16    q14, #0x0014        // 20
+    vld1.u8 {q0}, [r0], r1  //use 14(9+5), =src[-2]
+    vld1.u8 {q1}, [r0], r1  //use 14(9+5), =src[-1]
 
-	pld			[r0]
-	pld			[r0, r1]
-	vshr.u16	q15, q14, #2		// 5
+    pld         [r0]
+    pld         [r0, r1]
+    vshr.u16    q15, q14, #2        // 5
 
-	vld1.u8	{q2}, [r0], r1	//use 14(9+5), =src[0]
-	vld1.u8	{q3}, [r0], r1	//use 14(9+5), =src[1]
-	pld			[r0]
-	pld			[r0, r1]
-	vld1.u8	{q4}, [r0], r1	//use 14(9+5), =src[2]
-	sub			r3, #8
+    vld1.u8 {q2}, [r0], r1  //use 14(9+5), =src[0]
+    vld1.u8 {q3}, [r0], r1  //use 14(9+5), =src[1]
+    pld         [r0]
+    pld         [r0, r1]
+    vld1.u8 {q4}, [r0], r1  //use 14(9+5), =src[2]
+    sub         r3, #8
 
 w9_hv_mc_luma_loop:
 
-	vld1.u8	{q8}, [r0], r1	//use 14(9+5), =src[3]
-	//the 1st row
-	pld			[r0]
-	// vertical filtered into q9/q10
-	FILTER_6TAG_8BITS_TO_16BITS 	d0, d2, d4, d6, d8, d16, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d1, d3, d5, d7, d9, d17, q10, q14, q15	// 6 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18	//output to q9[0]
-	vst1.u8	d18, [r2]!				//write 8Byte
-	UNPACK_1_IN_8x16BITS_TO_8BITS	d19, d20, d21, q10 //output to d19[0]
-	vst1.u8	{d19[0]}, [r2], r3	//write 8th Byte
+    vld1.u8 {q8}, [r0], r1  //use 14(9+5), =src[3]
+    //the 1st row
+    pld         [r0]
+    // vertical filtered into q9/q10
+    FILTER_6TAG_8BITS_TO_16BITS     d0, d2, d4, d6, d8, d16, q9, q14, q15   // 8 avail
+    FILTER_6TAG_8BITS_TO_16BITS     d1, d3, d5, d7, d9, d17, q10, q14, q15  // 6 avail
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC  q9, q10, q11, q12, q13
+    FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18  //output to q9[0]
+    vst1.u8 d18, [r2]!              //write 8Byte
+    UNPACK_1_IN_8x16BITS_TO_8BITS   d19, d20, d21, q10 //output to d19[0]
+    vst1.u8 {d19[0]}, [r2], r3  //write 8th Byte
 
-	vld1.u8	{q0}, [r0], r1		//read 2nd row
-	//the 2nd row
-	pld			[r0]
-	// vertical filtered into q9/q10
-	FILTER_6TAG_8BITS_TO_16BITS 	d2, d4, d6, d8, d16, d0, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d3, d5, d7, d9, d17, d1, q10, q14, q15	// 6 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18	//output to q9[0]
-	vst1.u8	d18, [r2]!				//write 8Byte
-	UNPACK_1_IN_8x16BITS_TO_8BITS	d19, d20, d21, q10 //output to d19[0]
-	vst1.u8	{d19[0]}, [r2], r3	//write 8th Byte
+    vld1.u8 {q0}, [r0], r1      //read 2nd row
+    //the 2nd row
+    pld         [r0]
+    // vertical filtered into q9/q10
+    FILTER_6TAG_8BITS_TO_16BITS     d2, d4, d6, d8, d16, d0, q9, q14, q15   // 8 avail
+    FILTER_6TAG_8BITS_TO_16BITS     d3, d5, d7, d9, d17, d1, q10, q14, q15  // 6 avail
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC  q9, q10, q11, q12, q13
+    FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18  //output to q9[0]
+    vst1.u8 d18, [r2]!              //write 8Byte
+    UNPACK_1_IN_8x16BITS_TO_8BITS   d19, d20, d21, q10 //output to d19[0]
+    vst1.u8 {d19[0]}, [r2], r3  //write 8th Byte
 
-	vld1.u8	{q1}, [r0], r1		//read 3rd row
-	//the 3rd row
-	pld			[r0]
-	// vertical filtered into q9/q10
-	FILTER_6TAG_8BITS_TO_16BITS 	d4, d6, d8, d16, d0, d2, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d5, d7, d9, d17, d1, d3, q10, q14, q15	// 6 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18	//output to q9[0]
-	vst1.u8	d18, [r2]!				//write 8Byte
-	UNPACK_1_IN_8x16BITS_TO_8BITS	d19, d20, d21, q10 //output to d19[0]
-	vst1.u8	{d19[0]}, [r2], r3	//write 8th Byte
+    vld1.u8 {q1}, [r0], r1      //read 3rd row
+    //the 3rd row
+    pld         [r0]
+    // vertical filtered into q9/q10
+    FILTER_6TAG_8BITS_TO_16BITS     d4, d6, d8, d16, d0, d2, q9, q14, q15   // 8 avail
+    FILTER_6TAG_8BITS_TO_16BITS     d5, d7, d9, d17, d1, d3, q10, q14, q15  // 6 avail
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC  q9, q10, q11, q12, q13
+    FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18  //output to q9[0]
+    vst1.u8 d18, [r2]!              //write 8Byte
+    UNPACK_1_IN_8x16BITS_TO_8BITS   d19, d20, d21, q10 //output to d19[0]
+    vst1.u8 {d19[0]}, [r2], r3  //write 8th Byte
 
-	vld1.u8	{q2}, [r0], r1		//read 4th row
-	//the 4th row
-	pld			[r0]
-	// vertical filtered into q9/q10
-	FILTER_6TAG_8BITS_TO_16BITS 	d6, d8, d16, d0, d2, d4, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d7, d9, d17, d1, d3, d5, q10, q14, q15	// 6 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18	//output to q9[0]
-	vst1.u8	d18, [r2]!			//write 8Byte
-	UNPACK_1_IN_8x16BITS_TO_8BITS	d19, d20, d21, q10 //output to d19[0]
-	vst1.u8	{d19[0]}, [r2], r3	//write 8th Byte
+    vld1.u8 {q2}, [r0], r1      //read 4th row
+    //the 4th row
+    pld         [r0]
+    // vertical filtered into q9/q10
+    FILTER_6TAG_8BITS_TO_16BITS     d6, d8, d16, d0, d2, d4, q9, q14, q15   // 8 avail
+    FILTER_6TAG_8BITS_TO_16BITS     d7, d9, d17, d1, d3, d5, q10, q14, q15  // 6 avail
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC  q9, q10, q11, q12, q13
+    FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18  //output to q9[0]
+    vst1.u8 d18, [r2]!          //write 8Byte
+    UNPACK_1_IN_8x16BITS_TO_8BITS   d19, d20, d21, q10 //output to d19[0]
+    vst1.u8 {d19[0]}, [r2], r3  //write 8th Byte
 
-	//q4~q8, q0~q2, --> q0~q4
-	vswp	q0, q4
-	vswp	q2, q4
-	vmov	q3, q1
-	vmov	q1, q8
+    //q4~q8, q0~q2, --> q0~q4
+    vswp    q0, q4
+    vswp    q2, q4
+    vmov    q3, q1
+    vmov    q1, q8
 
-	sub		r4, #4
-	cmp		r4, #1
-	bne		w9_hv_mc_luma_loop
-	//the last row
-	vld1.u8	{q8}, [r0], r1	//use 14(9+5), =src[3]
-	// vertical filtered into q9/q10
-	FILTER_6TAG_8BITS_TO_16BITS 	d0, d2, d4, d6, d8, d16, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d1, d3, d5, d7, d9, d17, q10, q14, q15	// 6 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18	//output to q9[0]
-	vst1.u8	d18, [r2]!				//write 8Byte
-	UNPACK_1_IN_8x16BITS_TO_8BITS	d19, d20, d21, q10 //output to d19[0]
-	vst1.u8	{d19[0]}, [r2], r3	//write 8th Byte
-	vpop		{q4}
-	pop		{r4}
+    sub     r4, #4
+    cmp     r4, #1
+    bne     w9_hv_mc_luma_loop
+    //the last row
+    vld1.u8 {q8}, [r0], r1  //use 14(9+5), =src[3]
+    // vertical filtered into q9/q10
+    FILTER_6TAG_8BITS_TO_16BITS     d0, d2, d4, d6, d8, d16, q9, q14, q15   // 8 avail
+    FILTER_6TAG_8BITS_TO_16BITS     d1, d3, d5, d7, d9, d17, q10, q14, q15  // 6 avail
+    // horizon filtered
+    UNPACK_2_16BITS_TO_ABC  q9, q10, q11, q12, q13
+    FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18  //output to q9[0]
+    vst1.u8 d18, [r2]!              //write 8Byte
+    UNPACK_1_IN_8x16BITS_TO_8BITS   d19, d20, d21, q10 //output to d19[0]
+    vst1.u8 {d19[0]}, [r2], r3  //write 8th Byte
+    vpop        {q4}
+    pop     {r4}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN PixStrideAvgWidthEq16_neon
-	push		{r4, r5, r6}
-	ldr			r4, [sp, #12]
-	ldr			r5, [sp, #16]
-	ldr			r6, [sp, #20]
+    push        {r4, r5, r6}
+    ldr         r4, [sp, #12]
+    ldr         r5, [sp, #16]
+    ldr         r6, [sp, #20]
 
 enc_w16_pix_avg_loop:
-	vld1.u8		{q0}, [r2], r3
-	vld1.u8		{q1}, [r4], r5
-	vld1.u8		{q2}, [r2], r3
-	vld1.u8		{q3}, [r4], r5
+    vld1.u8     {q0}, [r2], r3
+    vld1.u8     {q1}, [r4], r5
+    vld1.u8     {q2}, [r2], r3
+    vld1.u8     {q3}, [r4], r5
 
-	vld1.u8		{q8}, [r2], r3
-	vld1.u8		{q9}, [r4], r5
-	vld1.u8		{q10}, [r2], r3
-	vld1.u8		{q11}, [r4], r5
+    vld1.u8     {q8}, [r2], r3
+    vld1.u8     {q9}, [r4], r5
+    vld1.u8     {q10}, [r2], r3
+    vld1.u8     {q11}, [r4], r5
 
-	AVERAGE_TWO_8BITS		d0, d0, d2
-	AVERAGE_TWO_8BITS		d1, d1, d3
-	vst1.u8		{q0}, [r0], r1
+    AVERAGE_TWO_8BITS       d0, d0, d2
+    AVERAGE_TWO_8BITS       d1, d1, d3
+    vst1.u8     {q0}, [r0], r1
 
-	AVERAGE_TWO_8BITS		d4, d4, d6
-	AVERAGE_TWO_8BITS		d5, d5, d7
-	vst1.u8		{q2}, [r0], r1
+    AVERAGE_TWO_8BITS       d4, d4, d6
+    AVERAGE_TWO_8BITS       d5, d5, d7
+    vst1.u8     {q2}, [r0], r1
 
-	AVERAGE_TWO_8BITS		d16, d16, d18
-	AVERAGE_TWO_8BITS		d17, d17, d19
-	vst1.u8		{q8}, [r0], r1
+    AVERAGE_TWO_8BITS       d16, d16, d18
+    AVERAGE_TWO_8BITS       d17, d17, d19
+    vst1.u8     {q8}, [r0], r1
 
-	AVERAGE_TWO_8BITS		d20, d20, d22
-	AVERAGE_TWO_8BITS		d21, d21, d23
-	vst1.u8		{q10}, [r0], r1
+    AVERAGE_TWO_8BITS       d20, d20, d22
+    AVERAGE_TWO_8BITS       d21, d21, d23
+    vst1.u8     {q10}, [r0], r1
 
-	sub			r6, #4
-	cmp			r6, #0
-	bne			enc_w16_pix_avg_loop
+    sub         r6, #4
+    cmp         r6, #0
+    bne         enc_w16_pix_avg_loop
 
-	pop		{r4, r5, r6}
+    pop     {r4, r5, r6}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN PixStrideAvgWidthEq8_neon
-	push		{r4, r5, r6}
-	ldr			r4, [sp, #12]
-	ldr			r5, [sp, #16]
-	ldr			r6, [sp, #20]
+    push        {r4, r5, r6}
+    ldr         r4, [sp, #12]
+    ldr         r5, [sp, #16]
+    ldr         r6, [sp, #20]
 enc_w8_pix_avg_loop:
 
-	vld1.u8		{d0}, [r2], r3
-	vld1.u8		{d2}, [r4], r5
-	vld1.u8		{d1}, [r2], r3
-	vld1.u8		{d3}, [r4], r5
+    vld1.u8     {d0}, [r2], r3
+    vld1.u8     {d2}, [r4], r5
+    vld1.u8     {d1}, [r2], r3
+    vld1.u8     {d3}, [r4], r5
 
-	AVERAGE_TWO_8BITS		d0, d0, d2
-	AVERAGE_TWO_8BITS		d1, d1, d3
-	vst1.u8		{d0}, [r0], r1
-	vst1.u8		{d1}, [r0], r1
+    AVERAGE_TWO_8BITS       d0, d0, d2
+    AVERAGE_TWO_8BITS       d1, d1, d3
+    vst1.u8     {d0}, [r0], r1
+    vst1.u8     {d1}, [r0], r1
 
-	vld1.u8		{d4}, [r2], r3
-	vld1.u8		{d6}, [r4], r5
-	vld1.u8		{d5}, [r2], r3
-	vld1.u8		{d7}, [r4], r5
+    vld1.u8     {d4}, [r2], r3
+    vld1.u8     {d6}, [r4], r5
+    vld1.u8     {d5}, [r2], r3
+    vld1.u8     {d7}, [r4], r5
 
-	AVERAGE_TWO_8BITS		d4, d4, d6
-	AVERAGE_TWO_8BITS		d5, d5, d7
-	vst1.u8		{d4}, [r0], r1
-	vst1.u8		{d5}, [r0], r1
+    AVERAGE_TWO_8BITS       d4, d4, d6
+    AVERAGE_TWO_8BITS       d5, d5, d7
+    vst1.u8     {d4}, [r0], r1
+    vst1.u8     {d5}, [r0], r1
 
-	sub			r6, #4
-	cmp			r6, #0
-	bne			enc_w8_pix_avg_loop
+    sub         r6, #4
+    cmp         r6, #0
+    bne         enc_w8_pix_avg_loop
 
-	pop		{r4, r5, r6}
+    pop     {r4, r5, r6}
 WELS_ASM_FUNC_END
 
 #endif
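
For reference: the FILTER_6TAG_8BITS* macros used by the motion-compensation functions
above implement the H.264 six-tap (1,-5,20,20,-5,1) half-pel luma filter exactly as their
inline comments spell out, and AVERAGE_TWO_8BITS is the rounded average used by the
PixStrideAvgWidthEq* helpers. The scalar C sketch below mirrors that arithmetic; the
function names are illustrative only, not part of this patch or of the library's API.

    #include <stdint.h>

    /* One output pixel of FILTER_6TAG_8BITS: the 6-tap filter followed by the
     * saturating rounded narrowing shift performed by sqrshrun ..., #5. */
    static uint8_t filter_6tag_8bits(const uint8_t s[6]) {
        int v = (s[0] + s[5]) + 20 * (s[2] + s[3]) - 5 * (s[1] + s[4]);
        v = (v + 16) >> 5;
        return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
    }

    /* Second pass of the 2D (hor+ver) path: FILTER_6TAG_8BITS_TO_16BITS keeps the
     * vertical result in 16 bits, then FILTER_3_IN_16BITS_TO_8BITS combines the
     * horizontal sums a, b, c using the shift decomposition
     * ((a-b)/4-b+c)/4+c = (a-5b+20c)/16 followed by (+32)>>6. Up to the
     * truncation of those intermediate shifts, that is: */
    static uint8_t filter_3_in_16bits(int a, int b, int c) {
        int v = (a - 5 * b + 20 * c + 512) >> 10;
        return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
    }

    /* AVERAGE_TWO_8BITS (uaddl + rshrn #1): rounded average of two predictions. */
    static uint8_t average_two_8bits(uint8_t a, uint8_t b) {
        return (uint8_t)((a + b + 1) >> 1);
    }
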
--- a/codec/common/arm64/expand_picture_aarch64_neon.S
+++ b/codec/common/arm64/expand_picture_aarch64_neon.S
@@ -53,88 +53,88 @@
     sub x8, x8, #1
     cbnz x8, _expand_picture_luma_loop2
     //for the top and bottom expand
-	add x2, x2, #64
-	sub x0, x0, #32
+    add x2, x2, #64
+    sub x0, x0, #32
     madd x4, x1, x3, x0
     sub x4, x4, x1
 _expand_picture_luma_loop0:
-	mov x5, #32
+    mov x5, #32
     msub x5, x5, x1, x0
-	add x6, x4, x1
+    add x6, x4, x1
     ld1 {v0.16b}, [x0], x10
     ld1 {v1.16b}, [x4], x10
-	mov x8, #32
+    mov x8, #32
 _expand_picture_luma_loop1:
-	st1 {v0.16b}, [x5], x1
-	st1 {v1.16b}, [x6], x1
-	sub x8, x8, #1
+    st1 {v0.16b}, [x5], x1
+    st1 {v1.16b}, [x6], x1
+    sub x8, x8, #1
     cbnz x8, _expand_picture_luma_loop1
 
-	sub x2, x2, #16
-	cbnz x2, _expand_picture_luma_loop0
+    sub x2, x2, #16
+    cbnz x2, _expand_picture_luma_loop0
 WELS_ASM_ARCH64_FUNC_END
 
 WELS_ASM_ARCH64_FUNC_BEGIN ExpandPictureChroma_AArch64_neon
-	//Save the dst
-	mov x7, x0
-	mov x8, x3
+    //Save the dst
+    mov x7, x0
+    mov x8, x3
     mov x10, #16
-	add x4, x7, x2
-	sub x4, x4, #1
+    add x4, x7, x2
+    sub x4, x4, #1
     //For the left and right expand
 _expand_picture_chroma_loop2:
-	sub x5, x7, #16
-	add x6, x4, #1
+    sub x5, x7, #16
+    add x6, x4, #1
 
-	ld1r {v0.16b}, [x7], x1
-	ld1r {v1.16b}, [x4], x1
+    ld1r {v0.16b}, [x7], x1
+    ld1r {v1.16b}, [x4], x1
 
-	st1 {v0.16b}, [x5]
-	st1 {v1.16b}, [x6]
-	sub x8, x8, #1
-	cbnz x8, _expand_picture_chroma_loop2
+    st1 {v0.16b}, [x5]
+    st1 {v1.16b}, [x6]
+    sub x8, x8, #1
+    cbnz x8, _expand_picture_chroma_loop2
 
-	//for the top and bottom expand
-	add x2, x2, #32
+    //for the top and bottom expand
+    add x2, x2, #32
     //
     mov x9, x2
     mov x11, #15
     bic x2, x2, x11
     //
-	sub x0, x0, #16
-	madd x4, x1, x3, x0
-	sub x4, x4, x1
+    sub x0, x0, #16
+    madd x4, x1, x3, x0
+    sub x4, x4, x1
 _expand_picture_chroma_loop0:
-	mov x5, #16
+    mov x5, #16
     msub x5, x5, x1, x0
-	add x6, x4, x1
-	ld1 {v0.16b}, [x0], x10
-	ld1 {v1.16b}, [x4], x10
+    add x6, x4, x1
+    ld1 {v0.16b}, [x0], x10
+    ld1 {v1.16b}, [x4], x10
 
-	mov x8, #16
+    mov x8, #16
 _expand_picture_chroma_loop1:
-	st1 {v0.16b}, [x5], x1
-	st1 {v1.16b}, [x6], x1
-	sub x8, x8, #1
+    st1 {v0.16b}, [x5], x1
+    st1 {v1.16b}, [x6], x1
+    sub x8, x8, #1
     cbnz x8, _expand_picture_chroma_loop1
 
-	sub x2, x2, #16
-	cbnz x2, _expand_picture_chroma_loop0
+    sub x2, x2, #16
+    cbnz x2, _expand_picture_chroma_loop0
 
     and x9, x9, #15
     sub x9, x9, #8
     cbnz x9, _expand_picture_chroma_end
-	mov x5, #16
+    mov x5, #16
     msub x5, x5, x1, x0
-	add x6, x4, x1
-	ld1 {v0.8b}, [x0]
-	ld1 {v1.8b}, [x4]
+    add x6, x4, x1
+    ld1 {v0.8b}, [x0]
+    ld1 {v1.8b}, [x4]
 
-	mov x8, #16
+    mov x8, #16
 _expand_picture_chroma_loop3:
-	st1 {v0.8b}, [x5], x1
-	st1 {v1.8b}, [x6], x1
-	sub x8, x8, #1
+    st1 {v0.8b}, [x5], x1
+    st1 {v1.8b}, [x6], x1
+    sub x8, x8, #1
     cbnz x8, _expand_picture_chroma_loop3
 _expand_picture_chroma_end:
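
For reference: the ExpandPictureLuma/Chroma_AArch64_neon routines above pad the picture
borders by replication: ld1r broadcasts the left/right edge byte across a vector for the
side borders, and the top/bottom loops copy the first/last (already side-padded) row
outward, 32 rows for luma and 16 for chroma. A scalar sketch of the same behaviour, with
an illustrative helper name and assuming pad is 32 for luma and 16 for chroma as in the
code:

    #include <stdint.h>
    #include <string.h>

    static void expand_picture_c(uint8_t *pic, int stride, int width, int height, int pad) {
        for (int y = 0; y < height; y++) {                 /* left/right: repeat edge pixel */
            uint8_t *row = pic + y * stride;
            memset(row - pad, row[0], (size_t)pad);
            memset(row + width, row[width - 1], (size_t)pad);
        }
        uint8_t *top = pic - pad;                          /* first row, including side pad */
        uint8_t *bot = pic + (height - 1) * stride - pad;  /* last row, including side pad  */
        for (int y = 1; y <= pad; y++) {                   /* top/bottom: repeat whole rows */
            memcpy(top - y * stride, top, (size_t)(width + 2 * pad));
            memcpy(bot + y * stride, bot, (size_t)(width + 2 * pad));
        }
    }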
 
--- a/codec/common/arm64/mc_aarch64_neon.S
+++ b/codec/common/arm64/mc_aarch64_neon.S
@@ -38,32 +38,32 @@
 
 #ifdef __APPLE__
 
-.macro	FILTER_6TAG_8BITS1
-//	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
+.macro FILTER_6TAG_8BITS1
+//  {   // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
     uaddl v18.8h, $0.8b, $5.8b //v18=src[-2]+src[3]
-    uaddl v19.8h, $2.8b, $3.8b	//src[0]+src[1]
+    uaddl v19.8h, $2.8b, $3.8b  //src[0]+src[1]
     mla v18.8h, v19.8h, $7.8h  //v18 += 20*(src[0]+src[1]), 2 cycles
     uaddl v19.8h, $1.8b, $4.8b  //src[-1]+src[2]
     mls v18.8h, v19.8h, $8.8h  //v18 -= 5*(src[-1]+src[2]), 2 cycles
     sqrshrun $6.8b, v18.8h, #5
-//	}
+//  }
 .endm
 
-.macro	FILTER_6TAG_8BITS2
-//	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
+.macro FILTER_6TAG_8BITS2
+//  {   // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
     uaddl2 v18.8h, $0.16b, $5.16b //v18=src[-2]+src[3]
-    uaddl2 v19.8h, $2.16b, $3.16b	//src[0]+src[1]
+    uaddl2 v19.8h, $2.16b, $3.16b   //src[0]+src[1]
     mla v18.8h, v19.8h, $7.8h  //v18 += 20*(src[0]+src[1]), 2 cycles
     uaddl2 v19.8h, $1.16b, $4.16b  //src[-1]+src[2]
     mls v18.8h, v19.8h, $8.8h  //v18 -= 5*(src[-1]+src[2]), 2 cycles
     sqrshrun2 $6.16b, v18.8h, #5
-//	}
+//  }
 .endm
 
-.macro	FILTER_6TAG_8BITS1_AVERAGE_WITH_0
-//	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
+.macro FILTER_6TAG_8BITS1_AVERAGE_WITH_0
+//  {   // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
     uaddl v18.8h, $0.8b, $5.8b //v18=src[-2]+src[3]
-    uaddl v19.8h, $2.8b, $3.8b	//src[0]+src[1]
+    uaddl v19.8h, $2.8b, $3.8b  //src[0]+src[1]
     mla v18.8h, v19.8h, $7.8h  //v18 += 20*(src[0]+src[1]), 2 cycles
     uaddl v19.8h, $1.8b, $4.8b  //src[-1]+src[2]
     mls v18.8h, v19.8h, $8.8h  //v18 -= 5*(src[-1]+src[2]), 2 cycles
@@ -70,13 +70,13 @@
     sqrshrun $6.8b, v18.8h, #5
     uaddl  v19.8h, $2.8b, $6.8b
     rshrn $6.8b, v19.8h, #1
-//	}
+//  }
 .endm
 
-.macro	FILTER_6TAG_8BITS2_AVERAGE_WITH_0
-//	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
+.macro FILTER_6TAG_8BITS2_AVERAGE_WITH_0
+//  {   // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
     uaddl2 v18.8h, $0.16b, $5.16b //v18=src[-2]+src[3]
-    uaddl2 v19.8h, $2.16b, $3.16b	//src[0]+src[1]
+    uaddl2 v19.8h, $2.16b, $3.16b   //src[0]+src[1]
     mla v18.8h, v19.8h, $7.8h  //v18 += 20*(src[0]+src[1]), 2 cycles
     uaddl2 v19.8h, $1.16b, $4.16b  //src[-1]+src[2]
     mls v18.8h, v19.8h, $8.8h  //v18 -= 5*(src[-1]+src[2]), 2 cycles
@@ -83,13 +83,13 @@
     sqrshrun2 $6.16b, v18.8h, #5
     uaddl2  v19.8h, $2.16b, $6.16b
     rshrn2 $6.16b, v19.8h, #1
-//	}
+//  }
 .endm
 
-.macro	FILTER_6TAG_8BITS1_AVERAGE_WITH_1
-//	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
+.macro FILTER_6TAG_8BITS1_AVERAGE_WITH_1
+//  {   // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
     uaddl v18.8h, $0.8b, $5.8b //v18=src[-2]+src[3]
-    uaddl v19.8h, $2.8b, $3.8b	//src[0]+src[1]
+    uaddl v19.8h, $2.8b, $3.8b  //src[0]+src[1]
     mla v18.8h, v19.8h, $7.8h  //v18 += 20*(src[0]+src[1]), 2 cycles
     uaddl v19.8h, $1.8b, $4.8b  //src[-1]+src[2]
     mls v18.8h, v19.8h, $8.8h  //v18 -= 5*(src[-1]+src[2]), 2 cycles
@@ -96,13 +96,13 @@
     sqrshrun $6.8b, v18.8h, #5
     uaddl  v19.8h, $3.8b, $6.8b
     rshrn $6.8b, v19.8h, #1
-//	}
+//  }
 .endm
 
-.macro	FILTER_6TAG_8BITS2_AVERAGE_WITH_1
-//	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
+.macro FILTER_6TAG_8BITS2_AVERAGE_WITH_1
+//  {   // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
     uaddl2 v18.8h, $0.16b, $5.16b //v18=src[-2]+src[3]
-    uaddl2 v19.8h, $2.16b, $3.16b	//src[0]+src[1]
+    uaddl2 v19.8h, $2.16b, $3.16b   //src[0]+src[1]
     mla v18.8h, v19.8h, $7.8h  //v18 += 20*(src[0]+src[1]), 2 cycles
     uaddl2 v19.8h, $1.16b, $4.16b  //src[-1]+src[2]
     mls v18.8h, v19.8h, $8.8h  //v18 -= 5*(src[-1]+src[2]), 2 cycles
@@ -109,134 +109,134 @@
     sqrshrun2 $6.16b, v18.8h, #5
     uaddl2  v19.8h, $3.16b, $6.16b
     rshrn2 $6.16b, v19.8h, #1
-//	}
+//  }
 .endm
 
-.macro	FILTER_6TAG_8BITS_TO_16BITS1
-//	{	// input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31
-    uaddl	$6.8h, $0.8b, $5.8b		//dst_q=src[-2]+src[3]
-    uaddl	v31.8h, $2.8b, $3.8b	//src[0]+src[1]
-    mla	$6.8h, v31.8h, $7.8h	//dst_q += 20*(src[0]+src[1]), 2 cycles
-    uaddl	v31.8h, $1.8b, $4.8b	//src[-1]+src[2]
-    mls	$6.8h, v31.8h, $8.8h	//dst_q -= 5*(src[-1]+src[2]), 2 cycles
-//	}
+.macro FILTER_6TAG_8BITS_TO_16BITS1
+//  {   // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31
+    uaddl   $6.8h, $0.8b, $5.8b     //dst_q=src[-2]+src[3]
+    uaddl   v31.8h, $2.8b, $3.8b    //src[0]+src[1]
+    mla $6.8h, v31.8h, $7.8h    //dst_q += 20*(src[0]+src[1]), 2 cycles
+    uaddl   v31.8h, $1.8b, $4.8b    //src[-1]+src[2]
+    mls $6.8h, v31.8h, $8.8h    //dst_q -= 5*(src[-1]+src[2]), 2 cycles
+//  }
 .endm
 
-.macro	FILTER_6TAG_8BITS_TO_16BITS2
-//	{	// input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31
-    uaddl2	$6.8h, $0.16b, $5.16b		//dst_q=src[-2]+src[3]
-    uaddl2	v31.8h, $2.16b, $3.16b	//src[0]+src[1]
-    mla	$6.8h, v31.8h, $7.8h	//dst_q += 20*(src[0]+src[1]), 2 cycles
-    uaddl2	v31.8h, $1.16b, $4.16b	//src[-1]+src[2]
-    mls	$6.8h, v31.8h, $8.8h	//dst_q -= 5*(src[-1]+src[2]), 2 cycles
-//	}
+.macro FILTER_6TAG_8BITS_TO_16BITS2
+//  {   // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31
+    uaddl2  $6.8h, $0.16b, $5.16b       //dst_q=src[-2]+src[3]
+    uaddl2  v31.8h, $2.16b, $3.16b  //src[0]+src[1]
+    mla $6.8h, v31.8h, $7.8h    //dst_q += 20*(src[0]+src[1]), 2 cycles
+    uaddl2  v31.8h, $1.16b, $4.16b  //src[-1]+src[2]
+    mls $6.8h, v31.8h, $8.8h    //dst_q -= 5*(src[-1]+src[2]), 2 cycles
+//  }
 .endm
 
-.macro	FILTER_3_IN_16BITS_TO_8BITS1
-//	{	// input:a, b, c, dst_d;
-    sub	$0.8h, $0.8h, $1.8h			//a-b
-    sshr	$0.8h, $0.8h, #2			//(a-b)/4
-    sub	$0.8h, $0.8h, $1.8h			//(a-b)/4-b
-    add	$0.8h, $0.8h, $2.8h			//(a-b)/4-b+c
-    sshr	$0.8h, $0.8h, #2			//((a-b)/4-b+c)/4
-    add	$0.8h, $0.8h, $2.8h			//((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
-    sqrshrun	$3.8b, $0.8h, #6		//(+32)>>6
-//	}
+.macro FILTER_3_IN_16BITS_TO_8BITS1
+//  {   // input:a, b, c, dst_d;
+    sub $0.8h, $0.8h, $1.8h         //a-b
+    sshr    $0.8h, $0.8h, #2            //(a-b)/4
+    sub $0.8h, $0.8h, $1.8h         //(a-b)/4-b
+    add $0.8h, $0.8h, $2.8h         //(a-b)/4-b+c
+    sshr    $0.8h, $0.8h, #2            //((a-b)/4-b+c)/4
+    add $0.8h, $0.8h, $2.8h         //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+    sqrshrun    $3.8b, $0.8h, #6        //(+32)>>6
+//  }
 .endm
 
-.macro	FILTER_3_IN_16BITS_TO_8BITS2
-//	{	// input:a, b, c, dst_d;
-    sub	$0.8h, $0.8h, $1.8h			//a-b
-    sshr	$0.8h, $0.8h, #2			//(a-b)/4
-    sub	$0.8h, $0.8h, $1.8h			//(a-b)/4-b
-    add	$0.8h, $0.8h, $2.8h			//(a-b)/4-b+c
-    sshr	$0.8h, $0.8h, #2			//((a-b)/4-b+c)/4
-    add	$0.8h, $0.8h, $2.8h			//((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
-    sqrshrun2	$3.16b, $0.8h, #6		//(+32)>>6
-//	}
+.macro FILTER_3_IN_16BITS_TO_8BITS2
+//  {   // input:a, b, c, dst_d;
+    sub $0.8h, $0.8h, $1.8h         //a-b
+    sshr    $0.8h, $0.8h, #2            //(a-b)/4
+    sub $0.8h, $0.8h, $1.8h         //(a-b)/4-b
+    add $0.8h, $0.8h, $2.8h         //(a-b)/4-b+c
+    sshr    $0.8h, $0.8h, #2            //((a-b)/4-b+c)/4
+    add $0.8h, $0.8h, $2.8h         //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+    sqrshrun2   $3.16b, $0.8h, #6       //(+32)>>6
+//  }
 .endm
 
-.macro	UNPACK_2_16BITS_TO_ABC
-//	{	// input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;
-    ext	$4.16b, $0.16b, $1.16b, #4		//src[0]
-    ext	$3.16b, $0.16b, $1.16b, #6		//src[1]
-    add	$4.8h, $4.8h, $3.8h					//c=src[0]+src[1]
+.macro UNPACK_2_16BITS_TO_ABC
+//  {   // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;
+    ext $4.16b, $0.16b, $1.16b, #4      //src[0]
+    ext $3.16b, $0.16b, $1.16b, #6      //src[1]
+    add $4.8h, $4.8h, $3.8h                 //c=src[0]+src[1]
 
-    ext	$3.16b, $0.16b, $1.16b, #2		//src[-1]
-    ext	$2.16b, $0.16b, $1.16b, #8		//src[2]
-    add	$3.8h, $3.8h, $2.8h					//b=src[-1]+src[2]
+    ext $3.16b, $0.16b, $1.16b, #2      //src[-1]
+    ext $2.16b, $0.16b, $1.16b, #8      //src[2]
+    add $3.8h, $3.8h, $2.8h                 //b=src[-1]+src[2]
 
-    ext	$2.16b, $0.16b, $1.16b, #10		//src[3]
-    add	$2.8h, $2.8h, $0.8h					//a=src[-2]+src[3]
-//	}
+    ext $2.16b, $0.16b, $1.16b, #10     //src[3]
+    add $2.8h, $2.8h, $0.8h                 //a=src[-2]+src[3]
+//  }
 .endm
 
-.macro	AVERAGE_TWO_8BITS1
-//	{	// input:dst_d, src_d A and B; working: v5
-    uaddl	v30.8h, $2.8b, $1.8b
-    rshrn	$0.8b, v30.8h, #1
-//	}
+.macro AVERAGE_TWO_8BITS1
+//  {   // input:dst_d, src_d A and B; working: v5
+    uaddl   v30.8h, $2.8b, $1.8b
+    rshrn   $0.8b, v30.8h, #1
+//  }
 .endm
 
-.macro	AVERAGE_TWO_8BITS2
-//	{	// input:dst_d, src_d A and B; working: v5
-    uaddl2	v30.8h, $2.16b, $1.16b
-    rshrn2	$0.16b, v30.8h, #1
-//	}
+.macro AVERAGE_TWO_8BITS2
+//  {   // input:dst_d, src_d A and B; working: v5
+    uaddl2  v30.8h, $2.16b, $1.16b
+    rshrn2  $0.16b, v30.8h, #1
+//  }
 .endm
 
-.macro	FILTER_SINGLE_TAG_8BITS		// when width=17/9, used
-//	{	// input: src_d{Y[0][1][2][3][4][5]X},
-    rev64	$2.8b, $0.8b				// X[5][4][3][2][1][0]O
-    uaddl	$2.8h, $0.8b, $2.8b			// each 16bits, *[50][41][32][23][14][05]*
-    mul	$2.4h, $2.4h, $1.4h			// 0+1*[50]-5*[41]+20[32]
+.macro FILTER_SINGLE_TAG_8BITS      // when width=17/9, used
+//  {   // input: src_d{Y[0][1][2][3][4][5]X},
+    rev64   $2.8b, $0.8b                // X[5][4][3][2][1][0]O
+    uaddl   $2.8h, $0.8b, $2.8b         // each 16bits, *[50][41][32][23][14][05]*
+    mul $2.4h, $2.4h, $1.4h         // 0+1*[50]-5*[41]+20[32]
     addv $3, $2.4h
     sqrshrun $0.8b, $0.8h, #5
-//	}
+//  }
 .endm
 
-.macro	UNPACK_FILTER_SINGLE_TAG_16BITS // v0, v1, v22, v23
-//	{	// each 16bits; input: d_dst, d_src[0:5], para, working, working, d(low part of d_dst)
+.macro UNPACK_FILTER_SINGLE_TAG_16BITS // v0, v1, v22, v23
+//  {   // each 16bits; input: d_dst, d_src[0:5], para, working, working, d(low part of d_dst)
     ext.16b $3, $1, $1, #14       // X[0][1][2][3][4][5]O
     ext.16b $4, $3, $3, #8      // [3][4][5]OX[0][1][2]
-    rev64  $4.8h, $4.8h			// X[5][4][3][2][1][0]O
+    rev64  $4.8h, $4.8h         // X[5][4][3][2][1][0]O
     add   $3.8h, $3.8h, $4.8h    // each 16bits, *[50][41][32][23][14][05]*
-    smull $3.4s, $3.4h, $2.4h			// 0+1*[50]-5*[41]+20[32]
+    smull $3.4s, $3.4h, $2.4h           // 0+1*[50]-5*[41]+20[32]
     saddlv $5, $3.4s
     //sshr $0.2d, $0.2d, #4
     sqrshrun $0.2s, $0.2d, #10
     uqxtn $0.4h, $0.4s
     uqxtn $0.8b, $0.8h
-   //	}
+   //   }
 .endm
 
 #else
-.macro	FILTER_6TAG_8BITS1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-//	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
+.macro FILTER_6TAG_8BITS1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+//  {   // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
     uaddl v18.8h, \arg0\().8b, \arg5\().8b //v18=src[-2]+src[3]
-    uaddl v19.8h, \arg2\().8b, \arg3\().8b	//src[0]+src[1]
+    uaddl v19.8h, \arg2\().8b, \arg3\().8b  //src[0]+src[1]
     mla v18.8h, v19.8h, \arg7\().8h  //v18 += 20*(src[0]+src[1]), 2 cycles
     uaddl v19.8h, \arg1\().8b, \arg4\().8b  //src[-1]+src[2]
     mls v18.8h, v19.8h, \arg8\().8h  //v18 -= 5*(src[-1]+src[2]), 2 cycles
     sqrshrun \arg6\().8b, v18.8h, #5
-//	}
+//  }
 .endm
 
-.macro	FILTER_6TAG_8BITS2 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-//	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
+.macro FILTER_6TAG_8BITS2 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+//  {   // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
     uaddl2 v18.8h, \arg0\().16b, \arg5\().16b //v18=src[-2]+src[3]
-    uaddl2 v19.8h, \arg2\().16b, \arg3\().16b	//src[0]+src[1]
+    uaddl2 v19.8h, \arg2\().16b, \arg3\().16b   //src[0]+src[1]
     mla v18.8h, v19.8h, \arg7\().8h  //v18 += 20*(src[0]+src[1]), 2 cycles
     uaddl2 v19.8h, \arg1\().16b, \arg4\().16b  //src[-1]+src[2]
     mls v18.8h, v19.8h, \arg8\().8h  //v18 -= 5*(src[-1]+src[2]), 2 cycles
     sqrshrun2 \arg6\().16b, v18.8h, #5
-//	}
+//  }
 .endm
 
-.macro	FILTER_6TAG_8BITS1_AVERAGE_WITH_0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-//	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
+.macro FILTER_6TAG_8BITS1_AVERAGE_WITH_0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+//  {   // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
     uaddl v18.8h, \arg0\().8b, \arg5\().8b //v18=src[-2]+src[3]
-    uaddl v19.8h, \arg2\().8b, \arg3\().8b	//src[0]+src[1]
+    uaddl v19.8h, \arg2\().8b, \arg3\().8b  //src[0]+src[1]
     mla v18.8h, v19.8h, \arg7\().8h  //v18 += 20*(src[0]+src[1]), 2 cycles
     uaddl v19.8h, \arg1\().8b, \arg4\().8b  //src[-1]+src[2]
     mls v18.8h, v19.8h, \arg8\().8h  //v18 -= 5*(src[-1]+src[2]), 2 cycles
@@ -243,13 +243,13 @@
     sqrshrun \arg6\().8b, v18.8h, #5
     uaddl  v19.8h, \arg2\().8b, \arg6\().8b
     rshrn \arg6\().8b, v19.8h, #1
-//	}
+//  }
 .endm
 
-.macro	FILTER_6TAG_8BITS2_AVERAGE_WITH_0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-//	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
+.macro FILTER_6TAG_8BITS2_AVERAGE_WITH_0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+//  {   // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
     uaddl2 v18.8h, \arg0\().16b, \arg5\().16b //v18=src[-2]+src[3]
-    uaddl2 v19.8h, \arg2\().16b, \arg3\().16b	//src[0]+src[1]
+    uaddl2 v19.8h, \arg2\().16b, \arg3\().16b   //src[0]+src[1]
     mla v18.8h, v19.8h, \arg7\().8h  //v18 += 20*(src[0]+src[1]), 2 cycles
     uaddl2 v19.8h, \arg1\().16b, \arg4\().16b  //src[-1]+src[2]
     mls v18.8h, v19.8h, \arg8\().8h  //v18 -= 5*(src[-1]+src[2]), 2 cycles
@@ -256,13 +256,13 @@
     sqrshrun2 \arg6\().16b, v18.8h, #5
     uaddl2  v19.8h, \arg2\().16b, \arg6\().16b
     rshrn2 \arg6\().16b, v19.8h, #1
-//	}
+//  }
 .endm
 
-.macro	FILTER_6TAG_8BITS1_AVERAGE_WITH_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-//	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
+.macro FILTER_6TAG_8BITS1_AVERAGE_WITH_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+//  {   // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
     uaddl v18.8h, \arg0\().8b, \arg5\().8b //v18=src[-2]+src[3]
-    uaddl v19.8h, \arg2\().8b, \arg3\().8b	//src[0]+src[1]
+    uaddl v19.8h, \arg2\().8b, \arg3\().8b  //src[0]+src[1]
     mla v18.8h, v19.8h, \arg7\().8h  //v18 += 20*(src[0]+src[1]), 2 cycles
     uaddl v19.8h, \arg1\().8b, \arg4\().8b  //src[-1]+src[2]
     mls v18.8h, v19.8h, \arg8\().8h  //v18 -= 5*(src[-1]+src[2]), 2 cycles
@@ -269,13 +269,13 @@
     sqrshrun \arg6\().8b, v18.8h, #5
     uaddl  v19.8h, \arg3\().8b, \arg6\().8b
     rshrn \arg6\().8b, v19.8h, #1
-//	}
+//  }
 .endm
 
-.macro	FILTER_6TAG_8BITS2_AVERAGE_WITH_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-//	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
+.macro FILTER_6TAG_8BITS2_AVERAGE_WITH_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+//  {   // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
     uaddl2 v18.8h, \arg0\().16b, \arg5\().16b //v18=src[-2]+src[3]
-    uaddl2 v19.8h, \arg2\().16b, \arg3\().16b	//src[0]+src[1]
+    uaddl2 v19.8h, \arg2\().16b, \arg3\().16b   //src[0]+src[1]
     mla v18.8h, v19.8h, \arg7\().8h  //v18 += 20*(src[0]+src[1]), 2 cycles
     uaddl2 v19.8h, \arg1\().16b, \arg4\().16b  //src[-1]+src[2]
     mls v18.8h, v19.8h, \arg8\().8h  //v18 -= 5*(src[-1]+src[2]), 2 cycles
@@ -282,106 +282,106 @@
     sqrshrun2 \arg6\().16b, v18.8h, #5
     uaddl2  v19.8h, \arg3\().16b, \arg6\().16b
     rshrn2 \arg6\().16b, v19.8h, #1
-//	}
+//  }
 .endm
 
-.macro	FILTER_6TAG_8BITS_TO_16BITS1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-//	{	// input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31
-    uaddl	\arg6\().8h, \arg0\().8b, \arg5\().8b		//dst_q=src[-2]+src[3]
-    uaddl	v31.8h, \arg2\().8b, \arg3\().8b	//src[0]+src[1]
-    mla	\arg6\().8h, v31.8h, \arg7\().8h	//dst_q += 20*(src[0]+src[1]), 2 cycles
-    uaddl	v31.8h, \arg1\().8b, \arg4\().8b	//src[-1]+src[2]
-    mls	\arg6\().8h, v31.8h, \arg8\().8h	//dst_q -= 5*(src[-1]+src[2]), 2 cycles
-//	}
+.macro FILTER_6TAG_8BITS_TO_16BITS1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+//  {   // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31
+    uaddl   \arg6\().8h, \arg0\().8b, \arg5\().8b       //dst_q=src[-2]+src[3]
+    uaddl   v31.8h, \arg2\().8b, \arg3\().8b    //src[0]+src[1]
+    mla \arg6\().8h, v31.8h, \arg7\().8h    //dst_q += 20*(src[0]+src[1]), 2 cycles
+    uaddl   v31.8h, \arg1\().8b, \arg4\().8b    //src[-1]+src[2]
+    mls \arg6\().8h, v31.8h, \arg8\().8h    //dst_q -= 5*(src[-1]+src[2]), 2 cycles
+//  }
 .endm
 
-.macro	FILTER_6TAG_8BITS_TO_16BITS2 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-//	{	// input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31
-    uaddl2	\arg6\().8h, \arg0\().16b, \arg5\().16b		//dst_q=src[-2]+src[3]
-    uaddl2	v31.8h, \arg2\().16b, \arg3\().16b	//src[0]+src[1]
-    mla	\arg6\().8h, v31.8h, \arg7\().8h	//dst_q += 20*(src[0]+src[1]), 2 cycles
-    uaddl2	v31.8h, \arg1\().16b, \arg4\().16b	//src[-1]+src[2]
-    mls	\arg6\().8h, v31.8h, \arg8\().8h	//dst_q -= 5*(src[-1]+src[2]), 2 cycles
-//	}
+.macro FILTER_6TAG_8BITS_TO_16BITS2 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+//  {   // input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:v31
+    uaddl2  \arg6\().8h, \arg0\().16b, \arg5\().16b     //dst_q=src[-2]+src[3]
+    uaddl2  v31.8h, \arg2\().16b, \arg3\().16b  //src[0]+src[1]
+    mla \arg6\().8h, v31.8h, \arg7\().8h    //dst_q += 20*(src[0]+src[1]), 2 cycles
+    uaddl2  v31.8h, \arg1\().16b, \arg4\().16b  //src[-1]+src[2]
+    mls \arg6\().8h, v31.8h, \arg8\().8h    //dst_q -= 5*(src[-1]+src[2]), 2 cycles
+//  }
 .endm
 
-.macro	FILTER_3_IN_16BITS_TO_8BITS1 arg0, arg1, arg2, arg3
-//	{	// input:a, b, c, dst_d;
-    sub	\arg0\().8h, \arg0\().8h, \arg1\().8h			//a-b
-    sshr	\arg0\().8h, \arg0\().8h, #2			//(a-b)/4
-    sub	\arg0\().8h, \arg0\().8h, \arg1\().8h			//(a-b)/4-b
-    add	\arg0\().8h, \arg0\().8h, \arg2\().8h			//(a-b)/4-b+c
-    sshr	\arg0\().8h, \arg0\().8h, #2			//((a-b)/4-b+c)/4
-    add	\arg0\().8h, \arg0\().8h, \arg2\().8h			//((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
-    sqrshrun	\arg3\().8b, \arg0\().8h, #6		//(+32)>>6
-//	}
+.macro FILTER_3_IN_16BITS_TO_8BITS1 arg0, arg1, arg2, arg3
+//  {   // input:a, b, c, dst_d;
+    sub \arg0\().8h, \arg0\().8h, \arg1\().8h           //a-b
+    sshr    \arg0\().8h, \arg0\().8h, #2            //(a-b)/4
+    sub \arg0\().8h, \arg0\().8h, \arg1\().8h           //(a-b)/4-b
+    add \arg0\().8h, \arg0\().8h, \arg2\().8h           //(a-b)/4-b+c
+    sshr    \arg0\().8h, \arg0\().8h, #2            //((a-b)/4-b+c)/4
+    add \arg0\().8h, \arg0\().8h, \arg2\().8h           //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+    sqrshrun    \arg3\().8b, \arg0\().8h, #6        //(+32)>>6
+//  }
 .endm
 
-.macro	FILTER_3_IN_16BITS_TO_8BITS2 arg0, arg1, arg2, arg3
-//	{	// input:a, b, c, dst_d;
-    sub	\arg0\().8h, \arg0\().8h, \arg1\().8h			//a-b
-    sshr	\arg0\().8h, \arg0\().8h, #2			//(a-b)/4
-    sub	\arg0\().8h, \arg0\().8h, \arg1\().8h			//(a-b)/4-b
-    add	\arg0\().8h, \arg0\().8h, \arg2\().8h			//(a-b)/4-b+c
-    sshr	\arg0\().8h, \arg0\().8h, #2			//((a-b)/4-b+c)/4
-    add	\arg0\().8h, \arg0\().8h, \arg2\().8h			//((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
-    sqrshrun2	\arg3\().16b, \arg0\().8h, #6		//(+32)>>6
-//	}
+.macro FILTER_3_IN_16BITS_TO_8BITS2 arg0, arg1, arg2, arg3
+//  {   // input:a, b, c, dst_d;
+    sub \arg0\().8h, \arg0\().8h, \arg1\().8h           //a-b
+    sshr    \arg0\().8h, \arg0\().8h, #2            //(a-b)/4
+    sub \arg0\().8h, \arg0\().8h, \arg1\().8h           //(a-b)/4-b
+    add \arg0\().8h, \arg0\().8h, \arg2\().8h           //(a-b)/4-b+c
+    sshr    \arg0\().8h, \arg0\().8h, #2            //((a-b)/4-b+c)/4
+    add \arg0\().8h, \arg0\().8h, \arg2\().8h           //((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+    sqrshrun2   \arg3\().16b, \arg0\().8h, #6       //(+32)>>6
+//  }
 .endm
 
-.macro	UNPACK_2_16BITS_TO_ABC arg0, arg1, arg2, arg3, arg4
-//	{	// input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;
-    ext	\arg4\().16b, \arg0\().16b, \arg1\().16b, #4		//src[0]
-    ext	\arg3\().16b, \arg0\().16b, \arg1\().16b, #6		//src[1]
-    add	\arg4\().8h, \arg4\().8h, \arg3\().8h					//c=src[0]+src[1]
+.macro UNPACK_2_16BITS_TO_ABC arg0, arg1, arg2, arg3, arg4
+//  {   // input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;
+    ext \arg4\().16b, \arg0\().16b, \arg1\().16b, #4        //src[0]
+    ext \arg3\().16b, \arg0\().16b, \arg1\().16b, #6        //src[1]
+    add \arg4\().8h, \arg4\().8h, \arg3\().8h                   //c=src[0]+src[1]
 
-    ext	\arg3\().16b, \arg0\().16b, \arg1\().16b, #2		//src[-1]
-    ext	\arg2\().16b, \arg0\().16b, \arg1\().16b, #8		//src[2]
-    add	\arg3\().8h, \arg3\().8h, \arg2\().8h					//b=src[-1]+src[2]
+    ext \arg3\().16b, \arg0\().16b, \arg1\().16b, #2        //src[-1]
+    ext \arg2\().16b, \arg0\().16b, \arg1\().16b, #8        //src[2]
+    add \arg3\().8h, \arg3\().8h, \arg2\().8h                   //b=src[-1]+src[2]
 
-    ext	\arg2\().16b, \arg0\().16b, \arg1\().16b, #10		//src[3]
-    add	\arg2\().8h, \arg2\().8h, \arg0\().8h					//a=src[-2]+src[3]
-//	}
+    ext \arg2\().16b, \arg0\().16b, \arg1\().16b, #10       //src[3]
+    add \arg2\().8h, \arg2\().8h, \arg0\().8h                   //a=src[-2]+src[3]
+//  }
 .endm
 
-.macro	AVERAGE_TWO_8BITS1 arg0, arg1, arg2
-//	{	// input:dst_d, src_d A and B; working: v5
-    uaddl	v30.8h, \arg2\().8b, \arg1\().8b
-    rshrn	\arg0\().8b, v30.8h, #1
-//	}
+.macro AVERAGE_TWO_8BITS1 arg0, arg1, arg2
+//  {   // input:dst_d, src_d A and B; working: v5
+    uaddl   v30.8h, \arg2\().8b, \arg1\().8b
+    rshrn   \arg0\().8b, v30.8h, #1
+//  }
 .endm
 
-.macro	AVERAGE_TWO_8BITS2 arg0, arg1, arg2
-//	{	// input:dst_d, src_d A and B; working: v5
-    uaddl2	v30.8h, \arg2\().16b, \arg1\().16b
-    rshrn2	\arg0\().16b, v30.8h, #1
-//	}
+.macro AVERAGE_TWO_8BITS2 arg0, arg1, arg2
+//  {   // input:dst_d, src_d A and B; working: v5
+    uaddl2  v30.8h, \arg2\().16b, \arg1\().16b
+    rshrn2  \arg0\().16b, v30.8h, #1
+//  }
 .endm
 
-.macro	FILTER_SINGLE_TAG_8BITS arg0, arg1, arg2, arg3
+.macro FILTER_SINGLE_TAG_8BITS arg0, arg1, arg2, arg3
 // when width=17/9, used
-//	{	// input: src_d{Y[0][1][2][3][4][5]X},
-    rev64	\arg2\().8b, \arg0\().8b				// X[5][4][3][2][1][0]O
-    uaddl	\arg2\().8h, \arg0\().8b, \arg2\().8b			// each 16bits, *[50][41][32][23][14][05]*
-    mul	\arg2\().4h, \arg2\().4h, \arg1\().4h			// 0+1*[50]-5*[41]+20[32]
+//  {   // input: src_d{Y[0][1][2][3][4][5]X},
+    rev64   \arg2\().8b, \arg0\().8b                // X[5][4][3][2][1][0]O
+    uaddl   \arg2\().8h, \arg0\().8b, \arg2\().8b           // each 16bits, *[50][41][32][23][14][05]*
+    mul \arg2\().4h, \arg2\().4h, \arg1\().4h           // 0+1*[50]-5*[41]+20[32]
     addv \arg3, \arg2\().4h
     sqrshrun \arg0\().8b, \arg0\().8h, #5
-//	}
+//  }
 .endm
 
-.macro	UNPACK_FILTER_SINGLE_TAG_16BITS arg0, arg1, arg2, arg3, arg4, arg5
-//	{	// each 16bits; input: d_dst, d_src[0:5], para, working, working, d(low part of d_dst)
+.macro UNPACK_FILTER_SINGLE_TAG_16BITS arg0, arg1, arg2, arg3, arg4, arg5
+//  {   // each 16bits; input: d_dst, d_src[0:5], para, working, working, d(low part of d_dst)
     ext \arg3\().16b, \arg1\().16b, \arg1\().16b, #14       // X[0][1][2][3][4][5]O
     ext \arg4\().16b, \arg3\().16b, \arg3\().16b, #8      // [3][4][5]OX[0][1][2]
-    rev64  \arg4\().8h, \arg4\().8h			// X[5][4][3][2][1][0]O
+    rev64  \arg4\().8h, \arg4\().8h         // X[5][4][3][2][1][0]O
     add   \arg3\().8h, \arg3\().8h, \arg4\().8h    // each 16bits, *[50][41][32][23][14][05]*
-    smull \arg3\().4s, \arg3\().4h, \arg2\().4h			// 0+1*[50]-5*[41]+20[32]
+    smull \arg3\().4s, \arg3\().4h, \arg2\().4h         // 0+1*[50]-5*[41]+20[32]
     saddlv \arg5, \arg3\().4s
     //sshr \arg0\().2d, \arg0\().2d, #4
     sqrshrun \arg0\().2s, \arg0\().2d, #10
     uqxtn \arg0\().4h, \arg0\().4s
     uqxtn \arg0\().8b, \arg0\().8h
-   //	}
+   //   }
 .endm
 #endif
 
@@ -405,7 +405,7 @@
 
     sub x4, x4, #1
     st1 {v20.16b}, [x2], x3 //write 16Byte
-	cbnz x4, w16_h_mc_luma_loop
+    cbnz x4, w16_h_mc_luma_loop
 WELS_ASM_ARCH64_FUNC_END
 
 WELS_ASM_ARCH64_FUNC_BEGIN McHorVer20WidthEq8_AArch64_neon
@@ -426,7 +426,7 @@
 
     sub x4, x4, #1
     st1 {v20.8b}, [x2], x3 //write 8Byte
-	cbnz x4, w8_h_mc_luma_loop
+    cbnz x4, w8_h_mc_luma_loop
 WELS_ASM_ARCH64_FUNC_END
 
 WELS_ASM_ARCH64_FUNC_BEGIN McHorVer20WidthEq4_AArch64_neon
@@ -461,7 +461,7 @@
     st1 {v20.s}[0], [x2], x3 //write 4Byte
     st1 {v20.s}[1], [x2], x3 //write 4Byte
     sub x4, x4, #1
-	cbnz x4, w4_h_mc_luma_loop
+    cbnz x4, w4_h_mc_luma_loop
 WELS_ASM_ARCH64_FUNC_END
 
 WELS_ASM_ARCH64_FUNC_BEGIN McHorVer10WidthEq16_AArch64_neon
@@ -483,7 +483,7 @@
 
     sub x4, x4, #1
     st1 {v20.16b}, [x2], x3 //write 16Byte
-	cbnz x4, w16_xy_10_mc_luma_loop
+    cbnz x4, w16_xy_10_mc_luma_loop
 WELS_ASM_ARCH64_FUNC_END
 
 
@@ -505,7 +505,7 @@
 
     sub x4, x4, #1
     st1 {v20.8b}, [x2], x3 //write 8Byte
-	cbnz x4, w8_xy_10_mc_luma_loop
+    cbnz x4, w8_xy_10_mc_luma_loop
 WELS_ASM_ARCH64_FUNC_END
 
 WELS_ASM_ARCH64_FUNC_BEGIN McHorVer10WidthEq4_AArch64_neon
@@ -540,7 +540,7 @@
     st1 {v20.s}[0], [x2], x3 //write 4Byte
     st1 {v20.s}[1], [x2], x3 //write 4Byte
     sub x4, x4, #1
-	cbnz x4, w4_xy_10_mc_luma_loop
+    cbnz x4, w4_xy_10_mc_luma_loop
 WELS_ASM_ARCH64_FUNC_END
 
 
@@ -563,7 +563,7 @@
 
     sub x4, x4, #1
     st1 {v20.16b}, [x2], x3 //write 16Byte
-	cbnz x4, w16_xy_30_mc_luma_loop
+    cbnz x4, w16_xy_30_mc_luma_loop
 WELS_ASM_ARCH64_FUNC_END
 
 
@@ -585,7 +585,7 @@
 
     sub x4, x4, #1
     st1 {v20.8b}, [x2], x3 //write 8Byte
-	cbnz x4, w8_xy_30_mc_luma_loop
+    cbnz x4, w8_xy_30_mc_luma_loop
 WELS_ASM_ARCH64_FUNC_END
 
 WELS_ASM_ARCH64_FUNC_BEGIN McHorVer30WidthEq4_AArch64_neon
@@ -620,7 +620,7 @@
     st1 {v20.s}[0], [x2], x3 //write 4Byte
     st1 {v20.s}[1], [x2], x3 //write 4Byte
     sub x4, x4, #1
-	cbnz x4, w4_xy_30_mc_luma_loop
+    cbnz x4, w4_xy_30_mc_luma_loop
 WELS_ASM_ARCH64_FUNC_END
 
 
@@ -703,7 +703,7 @@
     mov.16b v4, v6
     mov.16b v6, v7
     sub x4, x4, #8
-	cbnz x4, w16_xy_01_mc_luma_loop
+    cbnz x4, w16_xy_01_mc_luma_loop
 WELS_ASM_ARCH64_FUNC_END
 
 
@@ -753,7 +753,7 @@
     mov.16b v6, v4
     mov.16b v4, v7
     sub x4, x4, #4
-	cbnz x4, w8_xy_01_mc_luma_loop
+    cbnz x4, w8_xy_01_mc_luma_loop
 WELS_ASM_ARCH64_FUNC_END
 
 
@@ -809,7 +809,7 @@
     mov.8b v5, v21
 
     sub x4, x4, #4
-	cbnz x4, w4_xy_01_mc_luma_loop
+    cbnz x4, w4_xy_01_mc_luma_loop
 WELS_ASM_ARCH64_FUNC_END
 
 
@@ -892,7 +892,7 @@
     mov.16b v4, v6
     mov.16b v6, v7
     sub x4, x4, #8
-	cbnz x4, w16_xy_03_mc_luma_loop
+    cbnz x4, w16_xy_03_mc_luma_loop
 WELS_ASM_ARCH64_FUNC_END
 
 
@@ -942,7 +942,7 @@
     mov.16b v6, v4
     mov.16b v4, v7
     sub x4, x4, #4
-	cbnz x4, w8_xy_03_mc_luma_loop
+    cbnz x4, w8_xy_03_mc_luma_loop
 WELS_ASM_ARCH64_FUNC_END
 
 
@@ -998,7 +998,7 @@
     mov.8b v5, v21
 
     sub x4, x4, #4
-	cbnz x4, w4_xy_03_mc_luma_loop
+    cbnz x4, w4_xy_03_mc_luma_loop
 WELS_ASM_ARCH64_FUNC_END
 
 
@@ -1081,7 +1081,7 @@
     mov.16b v4, v6
     mov.16b v6, v7
     sub x4, x4, #8
-	cbnz x4, w16_xy_02_mc_luma_loop
+    cbnz x4, w16_xy_02_mc_luma_loop
 WELS_ASM_ARCH64_FUNC_END
 
 
@@ -1131,7 +1131,7 @@
     mov.16b v6, v4
     mov.16b v4, v7
     sub x4, x4, #4
-	cbnz x4, w8_xy_02_mc_luma_loop
+    cbnz x4, w8_xy_02_mc_luma_loop
 WELS_ASM_ARCH64_FUNC_END
 
 
@@ -1187,7 +1187,7 @@
     mov.8b v5, v21
 
     sub x4, x4, #4
-	cbnz x4, w4_xy_02_mc_luma_loop
+    cbnz x4, w4_xy_02_mc_luma_loop
 WELS_ASM_ARCH64_FUNC_END
 
 
@@ -1220,12 +1220,12 @@
     FILTER_6TAG_8BITS_TO_16BITS1 v2, v5, v8, v11, v14, v17, v20, v0, v1
     FILTER_6TAG_8BITS_TO_16BITS1 v3, v6, v9, v12, v15, v18, v21, v0, v1
     // horizon filtered
-	UNPACK_2_16BITS_TO_ABC	v20, v21, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26	//output to v26[0]
+    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
     // vertical filtered into v21/v22
     FILTER_6TAG_8BITS_TO_16BITS1 v4, v7, v10, v13, v16, v19, v22, v0, v1
-	UNPACK_2_16BITS_TO_ABC	v21, v22, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26	//output to v26[1]
+    UNPACK_2_16BITS_TO_ABC  v21, v22, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
     st1 {v26.16b}, [x2], x3 //write 16Byte : 0 line
 
     //prfm pldl1strm, [x0, x1]
@@ -1234,12 +1234,12 @@
     FILTER_6TAG_8BITS_TO_16BITS1 v5, v8, v11, v14, v17, v2, v20, v0, v1
     FILTER_6TAG_8BITS_TO_16BITS1 v6, v9, v12, v15, v18, v3, v21, v0, v1
     // horizon filtered
-	UNPACK_2_16BITS_TO_ABC	v20, v21, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26	//output to v26[0]
+    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
     // vertical filtered into v21/v22
     FILTER_6TAG_8BITS_TO_16BITS1 v7, v10, v13, v16, v19, v4, v22, v0, v1
-	UNPACK_2_16BITS_TO_ABC	v21, v22, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26	//output to v26[1]
+    UNPACK_2_16BITS_TO_ABC  v21, v22, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
     st1 {v26.16b}, [x2], x3 //write 16Byte : 1 line
 
     //prfm pldl1strm, [x0, x1]
@@ -1248,12 +1248,12 @@
     FILTER_6TAG_8BITS_TO_16BITS1 v8, v11, v14, v17, v2, v5, v20, v0, v1
     FILTER_6TAG_8BITS_TO_16BITS1 v9, v12, v15, v18, v3, v6, v21, v0, v1
     // horizon filtered
-	UNPACK_2_16BITS_TO_ABC	v20, v21, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26	//output to v26[0]
+    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
     // vertical filtered into v21/v22
     FILTER_6TAG_8BITS_TO_16BITS1 v10, v13, v16, v19, v4, v7, v22, v0, v1
-	UNPACK_2_16BITS_TO_ABC	v21, v22, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26	//output to v26[1]
+    UNPACK_2_16BITS_TO_ABC  v21, v22, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
     st1 {v26.16b}, [x2], x3 //write 16Byte : 2 line
 
     //prfm pldl1strm, [x0, x1]
@@ -1262,12 +1262,12 @@
     FILTER_6TAG_8BITS_TO_16BITS1 v11, v14, v17, v2, v5, v8, v20, v0, v1
     FILTER_6TAG_8BITS_TO_16BITS1 v12, v15, v18, v3, v6, v9, v21, v0, v1
     // horizon filtered
-	UNPACK_2_16BITS_TO_ABC	v20, v21, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26	//output to v26[0]
+    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
     // vertical filtered into v21/v22
     FILTER_6TAG_8BITS_TO_16BITS1 v13, v16, v19, v4, v7, v10, v22, v0, v1
-	UNPACK_2_16BITS_TO_ABC	v21, v22, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26	//output to v26[1]
+    UNPACK_2_16BITS_TO_ABC  v21, v22, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
     st1 {v26.16b}, [x2], x3 //write 16Byte : 3 line
 
     //prfm pldl1strm, [x0, x1]
@@ -1276,12 +1276,12 @@
     FILTER_6TAG_8BITS_TO_16BITS1 v14, v17, v2, v5, v8, v11, v20, v0, v1
     FILTER_6TAG_8BITS_TO_16BITS1 v15, v18, v3, v6, v9, v12, v21, v0, v1
     // horizon filtered
-	UNPACK_2_16BITS_TO_ABC	v20, v21, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26	//output to v26[0]
+    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
     // vertical filtered into v21/v22
     FILTER_6TAG_8BITS_TO_16BITS1 v16, v19, v4, v7, v10, v13, v22, v0, v1
-	UNPACK_2_16BITS_TO_ABC	v21, v22, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26	//output to v26[1]
+    UNPACK_2_16BITS_TO_ABC  v21, v22, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
     st1 {v26.16b}, [x2], x3 //write 16Byte : 4 line
 
     //prfm pldl1strm, [x0, x1]
@@ -1290,12 +1290,12 @@
     FILTER_6TAG_8BITS_TO_16BITS1 v17, v2, v5, v8, v11, v14, v20, v0, v1
     FILTER_6TAG_8BITS_TO_16BITS1 v18, v3, v6, v9, v12, v15, v21, v0, v1
     // horizon filtered
-	UNPACK_2_16BITS_TO_ABC	v20, v21, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26	//output to v26[0]
+    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
     // vertical filtered into v21/v22
     FILTER_6TAG_8BITS_TO_16BITS1 v19, v4, v7, v10, v13, v16, v22, v0, v1
-	UNPACK_2_16BITS_TO_ABC	v21, v22, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26	//output to v26[1]
+    UNPACK_2_16BITS_TO_ABC  v21, v22, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
     st1 {v26.16b}, [x2], x3 //write 16Byte : 5 line
 
     //prfm pldl1strm, [x0, x1]
@@ -1304,12 +1304,12 @@
     FILTER_6TAG_8BITS_TO_16BITS1 v2, v5, v8, v11, v14, v17, v20, v0, v1
     FILTER_6TAG_8BITS_TO_16BITS1 v3, v6, v9, v12, v15, v18, v21, v0, v1
     // horizon filtered
-	UNPACK_2_16BITS_TO_ABC	v20, v21, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26	//output to v26[0]
+    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
     // vertical filtered into v21/v22
     FILTER_6TAG_8BITS_TO_16BITS1 v4, v7, v10, v13, v16, v19, v22, v0, v1
-	UNPACK_2_16BITS_TO_ABC	v21, v22, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26	//output to v26[1]
+    UNPACK_2_16BITS_TO_ABC  v21, v22, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
     st1 {v26.16b}, [x2], x3 //write 16Byte : 6 line
 
     //prfm pldl1strm, [x0, x1]
@@ -1318,12 +1318,12 @@
     FILTER_6TAG_8BITS_TO_16BITS1 v5, v8, v11, v14, v17, v2, v20, v0, v1
     FILTER_6TAG_8BITS_TO_16BITS1 v6, v9, v12, v15, v18, v3, v21, v0, v1
     // horizon filtered
-	UNPACK_2_16BITS_TO_ABC	v20, v21, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26	//output to v26[0]
+    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
     // vertical filtered into v21/v22
     FILTER_6TAG_8BITS_TO_16BITS1 v7, v10, v13, v16, v19, v4, v22, v0, v1
-	UNPACK_2_16BITS_TO_ABC	v21, v22, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26	//output to v26[1]
+    UNPACK_2_16BITS_TO_ABC  v21, v22, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
     st1 {v26.16b}, [x2], x3 //write 16Byte : 7 line
 
     mov.16b v5, v11
@@ -1348,7 +1348,7 @@
     mov.16b v16, v30
 
     sub x4, x4, #8
-	cbnz x4, w16_hv_mc_luma_loop
+    cbnz x4, w16_hv_mc_luma_loop
 
     ldp d14, d15, [sp], #16
     ldp d12, d13, [sp], #16
@@ -1381,8 +1381,8 @@
     FILTER_6TAG_8BITS_TO_16BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1
     FILTER_6TAG_8BITS_TO_16BITS2 v2, v3, v4, v5, v6, v7, v21, v0, v1
     // horizon filtered
-	UNPACK_2_16BITS_TO_ABC	v20, v21, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26	//output to v26[0]
+    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
     st1 {v26.8b}, [x2], x3 //write 8Byte : 0 line
 
     //prfm pldl1strm, [x0, x1]
@@ -1391,8 +1391,8 @@
     FILTER_6TAG_8BITS_TO_16BITS1 v3, v4, v5, v6, v7, v2, v20, v0, v1
     FILTER_6TAG_8BITS_TO_16BITS2 v3, v4, v5, v6, v7, v2, v21, v0, v1
     // horizon filtered
-	UNPACK_2_16BITS_TO_ABC	v20, v21, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26	//output to v26[0]
+    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
     st1 {v26.8b}, [x2], x3 //write 8Byte : 1 line
 
     //prfm pldl1strm, [x0, x1]
@@ -1401,8 +1401,8 @@
     FILTER_6TAG_8BITS_TO_16BITS1 v4, v5, v6, v7, v2, v3, v20, v0, v1
     FILTER_6TAG_8BITS_TO_16BITS2 v4, v5, v6, v7, v2, v3, v21, v0, v1
     // horizon filtered
-	UNPACK_2_16BITS_TO_ABC	v20, v21, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26	//output to v26[0]
+    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
     st1 {v26.8b}, [x2], x3 //write 8Byte : 2 line
 
     //prfm pldl1strm, [x0, x1]
@@ -1411,8 +1411,8 @@
     FILTER_6TAG_8BITS_TO_16BITS1 v5, v6, v7, v2, v3, v4, v20, v0, v1
     FILTER_6TAG_8BITS_TO_16BITS2 v5, v6, v7, v2, v3, v4, v21, v0, v1
     // horizon filtered
-	UNPACK_2_16BITS_TO_ABC	v20, v21, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26	//output to v26[0]
+    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
     st1 {v26.8b}, [x2], x3 //write 8Byte : 3 line
 
 
@@ -1424,7 +1424,7 @@
     mov.16b v4, v30
 
     sub x4, x4, #4
-	cbnz x4, w8_hv_mc_luma_loop
+    cbnz x4, w8_hv_mc_luma_loop
 WELS_ASM_ARCH64_FUNC_END
 
 
@@ -1458,12 +1458,12 @@
     FILTER_6TAG_8BITS_TO_16BITS1 v3, v4, v5, v6, v7, v2, v22, v0, v1
     FILTER_6TAG_8BITS_TO_16BITS2 v3, v4, v5, v6, v7, v2, v23, v0, v1
     // horizon filtered
-	UNPACK_2_16BITS_TO_ABC	v20, v21, v24, v25, v26
-    UNPACK_2_16BITS_TO_ABC	v22, v23, v28, v29, v30
+    UNPACK_2_16BITS_TO_ABC  v20, v21, v24, v25, v26
+    UNPACK_2_16BITS_TO_ABC  v22, v23, v28, v29, v30
     zip1 v24.2d, v24.2d, v28.2d
     zip1 v25.2d, v25.2d, v29.2d
     zip1 v26.2d, v26.2d, v30.2d
-	FILTER_3_IN_16BITS_TO_8BITS1 v24, v25, v26, v27	//output to v27[0]
+    FILTER_3_IN_16BITS_TO_8BITS1 v24, v25, v26, v27 //output to v27[0]
     st1 {v27.s}[0], [x2], x3 //write 4Byte : 0 line
     st1 {v27.s}[1], [x2], x3 //write 4Byte : 1 line
 
@@ -1478,12 +1478,12 @@
     FILTER_6TAG_8BITS_TO_16BITS1 v5, v6, v7, v2, v3, v4, v22, v0, v1
     FILTER_6TAG_8BITS_TO_16BITS2 v5, v6, v7, v2, v3, v4, v23, v0, v1
     // horizon filtered
-	UNPACK_2_16BITS_TO_ABC	v20, v21, v24, v25, v26
-    UNPACK_2_16BITS_TO_ABC	v22, v23, v28, v29, v30
+    UNPACK_2_16BITS_TO_ABC  v20, v21, v24, v25, v26
+    UNPACK_2_16BITS_TO_ABC  v22, v23, v28, v29, v30
     zip1 v24.2d, v24.2d, v28.2d
     zip1 v25.2d, v25.2d, v29.2d
     zip1 v26.2d, v26.2d, v30.2d
-	FILTER_3_IN_16BITS_TO_8BITS1 v24, v25, v26, v27	//output to v27[0]
+    FILTER_3_IN_16BITS_TO_8BITS1 v24, v25, v26, v27 //output to v27[0]
     st1 {v27.s}[0], [x2], x3 //write 4Byte : 2 line
     st1 {v27.s}[1], [x2], x3 //write 4Byte : 3 line
 
@@ -1495,7 +1495,7 @@
     mov.16b v4, v30
 
     sub x4, x4, #4
-	cbnz x4, w4_hv_mc_luma_loop
+    cbnz x4, w4_hv_mc_luma_loop
 WELS_ASM_ARCH64_FUNC_END
 
 WELS_ASM_ARCH64_FUNC_BEGIN McCopyWidthEq16_AArch64_neon
@@ -1509,7 +1509,7 @@
     st1 {v1.16b}, [x2], x3 //write 16Byte : 1 line
 
     sub x4, x4, #2
-	cbnz x4, w16_copy_loop
+    cbnz x4, w16_copy_loop
 WELS_ASM_ARCH64_FUNC_END
 
 WELS_ASM_ARCH64_FUNC_BEGIN McCopyWidthEq8_AArch64_neon
@@ -1523,7 +1523,7 @@
     st1 {v1.8b}, [x2], x3 //write 8Byte : 1 line
 
     sub x4, x4, #2
-	cbnz x4, w8_copy_loop
+    cbnz x4, w8_copy_loop
 WELS_ASM_ARCH64_FUNC_END
 
 WELS_ASM_ARCH64_FUNC_BEGIN McCopyWidthEq4_AArch64_neon
@@ -1537,7 +1537,7 @@
     st1 {v1.s}[0], [x2], x3 //write 4Byte : 1 line
 
     sub x4, x4, #2
-	cbnz x4, w4_copy_loop
+    cbnz x4, w4_copy_loop
 WELS_ASM_ARCH64_FUNC_END
 
 WELS_ASM_ARCH64_FUNC_BEGIN PixStrideAvgWidthEq16_AArch64_neon
@@ -1570,7 +1570,7 @@
     st1 {v16.16b}, [x0], x1 //write 16Byte : 3 line
 
     sub x6, x6, #4
-	cbnz x6, enc_w16_pix_avg_loop
+    cbnz x6, enc_w16_pix_avg_loop
 WELS_ASM_ARCH64_FUNC_END
 
 WELS_ASM_ARCH64_FUNC_BEGIN PixStrideAvgWidthEq8_AArch64_neon
@@ -1607,7 +1607,7 @@
     st1 {v16.8b}, [x0], x1 //write 8Byte : 3 line
 
     sub x6, x6, #4
-	cbnz x6, enc_w8_pix_avg_loop
+    cbnz x6, enc_w8_pix_avg_loop
 WELS_ASM_ARCH64_FUNC_END
 
 WELS_ASM_ARCH64_FUNC_BEGIN PixelAvgWidthEq16_AArch64_neon
@@ -1649,7 +1649,7 @@
     st1 {v16.16b}, [x0], x1 //write 16Byte : 3 line
 
     sub x6, x6, #4
-	cbnz x6, w16_pix_avg_loop
+    cbnz x6, w16_pix_avg_loop
 WELS_ASM_ARCH64_FUNC_END
 
 WELS_ASM_ARCH64_FUNC_BEGIN PixelAvgWidthEq8_AArch64_neon
@@ -1686,7 +1686,7 @@
     st1 {v16.8b}, [x0], x1 //write 8Byte : 3 line
 
     sub x6, x6, #4
-	cbnz x6, w8_pix_avg_loop
+    cbnz x6, w8_pix_avg_loop
 WELS_ASM_ARCH64_FUNC_END
 
 
@@ -1707,7 +1707,7 @@
     st1 {v2.s}[1], [x0], x1 //write 4Byte : 1 line
 
     sub x6, x6, #2
-	cbnz x6, w4_pix_avg_loop
+    cbnz x6, w4_pix_avg_loop
 WELS_ASM_ARCH64_FUNC_END
 
 WELS_ASM_ARCH64_FUNC_BEGIN McChromaWidthEq8_AArch64_neon
@@ -1738,7 +1738,7 @@
     mov.16b v0, v18
     mov.16b v1, v19
     sub x5, x5, #2
-	cbnz x5, w8_mc_chroma_loop
+    cbnz x5, w8_mc_chroma_loop
 WELS_ASM_ARCH64_FUNC_END
 
 WELS_ASM_ARCH64_FUNC_BEGIN McChromaWidthEq4_AArch64_neon
@@ -1767,7 +1767,7 @@
     mov.8b v0, v18
     mov.8b v1, v19
     sub x5, x5, #2
-	cbnz x5, w4_mc_chroma_loop
+    cbnz x5, w4_mc_chroma_loop
 WELS_ASM_ARCH64_FUNC_END
 
 
@@ -1793,11 +1793,11 @@
     st1 {v20.16b}, [x2], x5 //write 16Byte
 
     ext.8b v21, v4, v4, #7 // [0][1][2][3][4][5]XY-->O[0][1][2][3][4][5]X
-	FILTER_SINGLE_TAG_8BITS	v21, v22, v23, h21
-	st1 {v21.b}[0], [x2], x3 //write 16th Byte
+    FILTER_SINGLE_TAG_8BITS v21, v22, v23, h21
+    st1 {v21.b}[0], [x2], x3 //write 16th Byte
 
     sub x4, x4, #1
-	cbnz x4, w17_h_mc_luma_loop
+    cbnz x4, w17_h_mc_luma_loop
 WELS_ASM_ARCH64_FUNC_END
 
 WELS_ASM_ARCH64_FUNC_BEGIN McHorVer20Width9_AArch64_neon
@@ -1821,11 +1821,11 @@
     st1 {v20.8b}, [x2], x5 //write 8Byte
 
     ext.8b v21, v3, v3, #7 // [0][1][2][3][4][5]XY-->O[0][1][2][3][4][5]X
-	FILTER_SINGLE_TAG_8BITS	v21, v22, v23, h21
-	st1 {v21.b}[0], [x2], x3 //write 9th Byte
+    FILTER_SINGLE_TAG_8BITS v21, v22, v23, h21
+    st1 {v21.b}[0], [x2], x3 //write 9th Byte
 
     sub x4, x4, #1
-	cbnz x4, w9_h_mc_luma_loop
+    cbnz x4, w9_h_mc_luma_loop
 WELS_ASM_ARCH64_FUNC_END
 
 
@@ -1863,12 +1863,12 @@
     FILTER_6TAG_8BITS_TO_16BITS1 v2, v5, v8, v11, v14, v17, v20, v0, v1
     FILTER_6TAG_8BITS_TO_16BITS1 v3, v6, v9, v12, v15, v18, v21, v0, v1
     // horizon filtered
-	UNPACK_2_16BITS_TO_ABC	v20, v21, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26	//output to v26[0]
+    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
     // vertical filtered into v21/v22
     FILTER_6TAG_8BITS_TO_16BITS1 v4, v7, v10, v13, v16, v19, v22, v0, v1
-	UNPACK_2_16BITS_TO_ABC	v21, v22, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26	//output to v26[1]
+    UNPACK_2_16BITS_TO_ABC  v21, v22, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
     st1 {v26.16b}, [x2], x5 //write 0:15 Byte : 0 line
     UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
     st1 {v26.b}[0], [x2], x3 //write 16th Byte : 0 line
@@ -1879,12 +1879,12 @@
     FILTER_6TAG_8BITS_TO_16BITS1 v5, v8, v11, v14, v17, v2, v20, v0, v1
     FILTER_6TAG_8BITS_TO_16BITS1 v6, v9, v12, v15, v18, v3, v21, v0, v1
     // horizon filtered
-	UNPACK_2_16BITS_TO_ABC	v20, v21, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26	//output to v26[0]
+    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
     // vertical filtered into v21/v22
     FILTER_6TAG_8BITS_TO_16BITS1 v7, v10, v13, v16, v19, v4, v22, v0, v1
-	UNPACK_2_16BITS_TO_ABC	v21, v22, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26	//output to v26[1]
+    UNPACK_2_16BITS_TO_ABC  v21, v22, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
     st1 {v26.16b}, [x2], x5 //write 0:15Byte : 1 line
     UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
     st1 {v26.b}[0], [x2], x3 //write 16th Byte : 1 line
@@ -1895,12 +1895,12 @@
     FILTER_6TAG_8BITS_TO_16BITS1 v8, v11, v14, v17, v2, v5, v20, v0, v1
     FILTER_6TAG_8BITS_TO_16BITS1 v9, v12, v15, v18, v3, v6, v21, v0, v1
     // horizon filtered
-	UNPACK_2_16BITS_TO_ABC	v20, v21, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26	//output to v26[0]
+    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
     // vertical filtered into v21/v22
     FILTER_6TAG_8BITS_TO_16BITS1 v10, v13, v16, v19, v4, v7, v22, v0, v1
-	UNPACK_2_16BITS_TO_ABC	v21, v22, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26	//output to v26[1]
+    UNPACK_2_16BITS_TO_ABC  v21, v22, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
     st1 {v26.16b}, [x2], x5 //write 0:15Byte : 2 line
     UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
     st1 {v26.b}[0], [x2], x3 //write 16th Byte : 2 line
@@ -1911,12 +1911,12 @@
     FILTER_6TAG_8BITS_TO_16BITS1 v11, v14, v17, v2, v5, v8, v20, v0, v1
     FILTER_6TAG_8BITS_TO_16BITS1 v12, v15, v18, v3, v6, v9, v21, v0, v1
     // horizon filtered
-	UNPACK_2_16BITS_TO_ABC	v20, v21, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26	//output to v26[0]
+    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
     // vertical filtered into v21/v22
     FILTER_6TAG_8BITS_TO_16BITS1 v13, v16, v19, v4, v7, v10, v22, v0, v1
-	UNPACK_2_16BITS_TO_ABC	v21, v22, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26	//output to v26[1]
+    UNPACK_2_16BITS_TO_ABC  v21, v22, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
     st1 {v26.16b}, [x2], x5 //write 0:15Byte : 3 line
     UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
     st1 {v26.b}[0], [x2], x3 //write 16th Byte : 3 line
@@ -1927,12 +1927,12 @@
     FILTER_6TAG_8BITS_TO_16BITS1 v14, v17, v2, v5, v8, v11, v20, v0, v1
     FILTER_6TAG_8BITS_TO_16BITS1 v15, v18, v3, v6, v9, v12, v21, v0, v1
     // horizon filtered
-	UNPACK_2_16BITS_TO_ABC	v20, v21, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26	//output to v26[0]
+    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
     // vertical filtered into v21/v22
     FILTER_6TAG_8BITS_TO_16BITS1 v16, v19, v4, v7, v10, v13, v22, v0, v1
-	UNPACK_2_16BITS_TO_ABC	v21, v22, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26	//output to v26[1]
+    UNPACK_2_16BITS_TO_ABC  v21, v22, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
     st1 {v26.16b}, [x2], x5 //write 0:15Byte : 4 line
     UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
     st1 {v26.b}[0], [x2], x3 //write 16th Byte : 4 line
@@ -1943,12 +1943,12 @@
     FILTER_6TAG_8BITS_TO_16BITS1 v17, v2, v5, v8, v11, v14, v20, v0, v1
     FILTER_6TAG_8BITS_TO_16BITS1 v18, v3, v6, v9, v12, v15, v21, v0, v1
     // horizon filtered
-	UNPACK_2_16BITS_TO_ABC	v20, v21, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26	//output to v26[0]
+    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
     // vertical filtered into v21/v22
     FILTER_6TAG_8BITS_TO_16BITS1 v19, v4, v7, v10, v13, v16, v22, v0, v1
-	UNPACK_2_16BITS_TO_ABC	v21, v22, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26	//output to v26[1]
+    UNPACK_2_16BITS_TO_ABC  v21, v22, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
     st1 {v26.16b}, [x2], x5 //write 0:15Byte : 5 line
     UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
     st1 {v26.b}[0], [x2], x3 //write 16th Byte : 5 line
@@ -1959,12 +1959,12 @@
     FILTER_6TAG_8BITS_TO_16BITS1 v2, v5, v8, v11, v14, v17, v20, v0, v1
     FILTER_6TAG_8BITS_TO_16BITS1 v3, v6, v9, v12, v15, v18, v21, v0, v1
     // horizon filtered
-	UNPACK_2_16BITS_TO_ABC	v20, v21, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26	//output to v26[0]
+    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
     // vertical filtered into v21/v22
     FILTER_6TAG_8BITS_TO_16BITS1 v4, v7, v10, v13, v16, v19, v22, v0, v1
-	UNPACK_2_16BITS_TO_ABC	v21, v22, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26	//output to v26[1]
+    UNPACK_2_16BITS_TO_ABC  v21, v22, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
     st1 {v26.16b}, [x2], x5 //write 0:15Byte : 6 line
     UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
     st1 {v26.b}[0], [x2], x3 //write 16th Byte : 6 line
@@ -1975,12 +1975,12 @@
     FILTER_6TAG_8BITS_TO_16BITS1 v5, v8, v11, v14, v17, v2, v20, v0, v1
     FILTER_6TAG_8BITS_TO_16BITS1 v6, v9, v12, v15, v18, v3, v21, v0, v1
     // horizon filtered
-	UNPACK_2_16BITS_TO_ABC	v20, v21, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26	//output to v26[0]
+    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
     // vertical filtered into v21/v22
     FILTER_6TAG_8BITS_TO_16BITS1 v7, v10, v13, v16, v19, v4, v22, v0, v1
-	UNPACK_2_16BITS_TO_ABC	v21, v22, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26	//output to v26[1]
+    UNPACK_2_16BITS_TO_ABC  v21, v22, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
     st1 {v26.16b}, [x2], x5 //write 0:15Byte : 7 line
     UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
     st1 {v26.b}[0], [x2], x3 //write 16th Byte : 7 line
@@ -2007,7 +2007,7 @@
     mov.16b v16, v30
 
     sub x4, x4, #8
-	cbnz x4, w17_hv_mc_luma_loop
+    cbnz x4, w17_hv_mc_luma_loop
 
     //prfm pldl1strm, [x0, x1]
     ld1 {v17.8b, v18.8b, v19.8b}, [x0], x1 // v17=src[3*stride]
@@ -2015,12 +2015,12 @@
     FILTER_6TAG_8BITS_TO_16BITS1 v2, v5, v8, v11, v14, v17, v20, v0, v1
     FILTER_6TAG_8BITS_TO_16BITS1 v3, v6, v9, v12, v15, v18, v21, v0, v1
     // horizon filtered
-	UNPACK_2_16BITS_TO_ABC	v20, v21, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26	//output to v26[0]
+    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
     // vertical filtered into v21/v22
     FILTER_6TAG_8BITS_TO_16BITS1 v4, v7, v10, v13, v16, v19, v22, v0, v1
-	UNPACK_2_16BITS_TO_ABC	v21, v22, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26	//output to v26[1]
+    UNPACK_2_16BITS_TO_ABC  v21, v22, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS2 v23, v24, v25, v26 //output to v26[1]
     st1 {v26.16b}, [x2], x5 //write 0:15 Byte : 0 line
     UNPACK_FILTER_SINGLE_TAG_16BITS v26, v22, v29, v27, v28, d26
     st1 {v26.b}[0], [x2], x3 //write 16th Byte : 0 line
@@ -2061,8 +2061,8 @@
     FILTER_6TAG_8BITS_TO_16BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1
     FILTER_6TAG_8BITS_TO_16BITS2 v2, v3, v4, v5, v6, v7, v21, v0, v1
     // horizon filtered
-	UNPACK_2_16BITS_TO_ABC	v20, v21, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26	//output to v26[0]
+    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
     st1 {v26.8b}, [x2], x5 //write 0:7Byte : 0 line
     UNPACK_FILTER_SINGLE_TAG_16BITS v26, v21, v29, v27, v28, d26
     st1 {v26.b}[0], [x2], x3 //write 8th Byte : 0 line
@@ -2073,8 +2073,8 @@
     FILTER_6TAG_8BITS_TO_16BITS1 v3, v4, v5, v6, v7, v2, v20, v0, v1
     FILTER_6TAG_8BITS_TO_16BITS2 v3, v4, v5, v6, v7, v2, v21, v0, v1
     // horizon filtered
-	UNPACK_2_16BITS_TO_ABC	v20, v21, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26	//output to v26[0]
+    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
     st1 {v26.8b}, [x2], x5 //write 0:7Byte : 1 line
     UNPACK_FILTER_SINGLE_TAG_16BITS v26, v21, v29, v27, v28, d26
     st1 {v26.b}[0], [x2], x3 //write 8th Byte : 1 line
@@ -2085,8 +2085,8 @@
     FILTER_6TAG_8BITS_TO_16BITS1 v4, v5, v6, v7, v2, v3, v20, v0, v1
     FILTER_6TAG_8BITS_TO_16BITS2 v4, v5, v6, v7, v2, v3, v21, v0, v1
     // horizon filtered
-	UNPACK_2_16BITS_TO_ABC	v20, v21, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26	//output to v26[0]
+    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
     st1 {v26.8b}, [x2], x5 //write 0:7Byte : 2 line
     UNPACK_FILTER_SINGLE_TAG_16BITS v26, v21, v29, v27, v28, d26
     st1 {v26.b}[0], [x2], x3 //write 8th Byte : 2 line
@@ -2097,8 +2097,8 @@
     FILTER_6TAG_8BITS_TO_16BITS1 v5, v6, v7, v2, v3, v4, v20, v0, v1
     FILTER_6TAG_8BITS_TO_16BITS2 v5, v6, v7, v2, v3, v4, v21, v0, v1
     // horizon filtered
-	UNPACK_2_16BITS_TO_ABC	v20, v21, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26	//output to v26[0]
+    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
     st1 {v26.8b}, [x2], x5 //write 0:7Byte : 3 line
     UNPACK_FILTER_SINGLE_TAG_16BITS v26, v21, v29, v27, v28, d26
     st1 {v26.b}[0], [x2], x3 //write 8th Byte : 3 line
@@ -2112,7 +2112,7 @@
     mov.16b v4, v30
 
     sub x4, x4, #4
-	cbnz x4, w9_hv_mc_luma_loop
+    cbnz x4, w9_hv_mc_luma_loop
 
     //prfm pldl1strm, [x0, x1]
     ld1 {v7.16b}, [x0], x1 // v7=src[3*stride]
@@ -2120,8 +2120,8 @@
     FILTER_6TAG_8BITS_TO_16BITS1 v2, v3, v4, v5, v6, v7, v20, v0, v1
     FILTER_6TAG_8BITS_TO_16BITS2 v2, v3, v4, v5, v6, v7, v21, v0, v1
     // horizon filtered
-	UNPACK_2_16BITS_TO_ABC	v20, v21, v23, v24, v25
-	FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26	//output to v26[0]
+    UNPACK_2_16BITS_TO_ABC  v20, v21, v23, v24, v25
+    FILTER_3_IN_16BITS_TO_8BITS1 v23, v24, v25, v26 //output to v26[0]
     st1 {v26.8b}, [x2], x5 //write 0:7Byte : 0 line
     UNPACK_FILTER_SINGLE_TAG_16BITS v26, v21, v29, v27, v28, d26
     st1 {v26.b}[0], [x2], x3 //write 8th Byte : 0 line
@@ -2207,7 +2207,7 @@
     mov.16b v4, v6
     mov.16b v6, v7
     sub x4, x4, #8
-	cbnz x4, w17_v_mc_luma_loop
+    cbnz x4, w17_v_mc_luma_loop
 
     //prfm pldl1strm, [x0, x1]
     ld1 {v7.16b}, [x0], x1 // v7=src[3*stride]
@@ -2262,7 +2262,7 @@
     mov.16b v6, v4
     mov.16b v4, v7
     sub x4, x4, #4
-	cbnz x4, w9_v_mc_luma_loop
+    cbnz x4, w9_v_mc_luma_loop
 
     //prfm pldl1strm, [x0, x1]
     ld1 {v7.8b}, [x0], x1 // v7=src[3*stride]
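For reference, the FILTER_6TAG_* macros used throughout the AArch64 motion-compensation code above apply the standard H.264 6-tap (1, -5, 20, 20, -5, 1) half-pel luma filter; the w*_hv_* loops keep a 16-bit intermediate and run the filter a second time in the other direction before the final rounding shift. A minimal scalar sketch, assuming illustrative names that are not taken from this patch:

    /* Scalar reference for the 6-tap half-pel filter that the
     * FILTER_6TAG_* NEON macros vectorize (illustrative names). */
    #include <stdint.h>

    static uint8_t Clip255(int v) {
        return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }

    /* out = clip(((A + F) - 5*(B + E) + 20*(C + D) + 16) >> 5) */
    static uint8_t SixTapHalfPel(uint8_t A, uint8_t B, uint8_t C,
                                 uint8_t D, uint8_t E, uint8_t F) {
        int sum = (A + F) - 5 * (B + E) + 20 * (C + D);
        return Clip255((sum + 16) >> 5);
    }
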
--- a/codec/common/x86/asm_inc.asm
+++ b/codec/common/x86/asm_inc.asm
@@ -44,15 +44,15 @@
 ;***********************************************************************
 
 %if 1
-	%define MOVDQ movdqa
+    %define MOVDQ movdqa
 %else
-	%define MOVDQ movdqu
+    %define MOVDQ movdqu
 %endif
 
 %if 1
-	%define WELSEMMS	emms
+    %define WELSEMMS emms
 %else
-	%define WELSEMMS
+    %define WELSEMMS
 %endif
 
 
@@ -220,7 +220,7 @@
 
 %macro LOAD_1_PARA 0
     %ifdef X86_32
-	mov r0, [esp + push_num*4 + 4]
+        mov r0, [esp + push_num*4 + 4]
     %endif
 %endmacro
 
@@ -234,8 +234,8 @@
 %macro LOAD_3_PARA 0
     %ifdef X86_32
         mov r0, [esp + push_num*4 + 4]
-	mov r1, [esp + push_num*4 + 8]
-	mov r2, [esp + push_num*4 + 12]
+        mov r1, [esp + push_num*4 + 8]
+        mov r2, [esp + push_num*4 + 12]
     %endif
 %endmacro
 
@@ -267,7 +267,7 @@
 
 %macro LOAD_6_PARA 0
     %ifdef X86_32
-	push r3
+        push r3
         push r4
         push r5
         %assign  push_num push_num+3
@@ -310,7 +310,7 @@
 
 %macro LOAD_4_PARA_POP 0
     %ifdef X86_32
-	pop r3
+        pop r3
     %endif
 %endmacro
 
@@ -317,7 +317,7 @@
 %macro LOAD_5_PARA_POP 0
     %ifdef X86_32
         pop r4
-	pop r3
+        pop r3
     %endif
 %endmacro
 
@@ -324,8 +324,8 @@
 %macro LOAD_6_PARA_POP 0
     %ifdef X86_32
         pop r5
-  	pop r4
- 	pop r3
+        pop r4
+        pop r3
     %endif
 %endmacro
 
@@ -416,13 +416,13 @@
 
 %macro SIGN_EXTENSION 2
     %ifndef X86_32
-            movsxd %1, %2
+        movsxd %1, %2
     %endif
 %endmacro
 
 %macro SIGN_EXTENSIONW 2
     %ifndef X86_32
-            movsx %1, %2
+        movsx %1, %2
     %endif
 %endmacro
 
@@ -438,13 +438,13 @@
 %endmacro
 
 %macro WELS_AbsW 2
-	pxor        %2, %2
+    pxor        %2, %2
     psubw       %2, %1
     pmaxsw      %1, %2
 %endmacro
 
 %macro MMX_XSwap  4
-    movq		%4, %2
+    movq        %4, %2
     punpckh%1   %4, %3
     punpckl%1   %2, %3
 %endmacro
@@ -485,35 +485,35 @@
 ;in:  m1, m2, m3, m4, m5, m6, m7, m8
 ;pOut: m5, m3, m4, m8, m6, m2, m7, m1
 %macro SSE2_TransTwo8x8B 9
-	movdqa	%9,	%8
-	SSE2_XSawp bw,  %1, %2, %8
-	SSE2_XSawp bw,  %3, %4, %2
-	SSE2_XSawp bw,  %5, %6, %4
-	movdqa	%6, %9
-	movdqa	%9, %4
-	SSE2_XSawp bw,  %7, %6, %4
+    movdqa  %9,     %8
+    SSE2_XSawp bw,  %1, %2, %8
+    SSE2_XSawp bw,  %3, %4, %2
+    SSE2_XSawp bw,  %5, %6, %4
+    movdqa  %6, %9
+    movdqa  %9, %4
+    SSE2_XSawp bw,  %7, %6, %4
 
-	SSE2_XSawp wd,  %1, %3, %6
-	SSE2_XSawp wd,  %8, %2, %3
-	SSE2_XSawp wd,  %5, %7, %2
-	movdqa	%7, %9
-	movdqa	%9, %3
-	SSE2_XSawp wd,  %7, %4, %3
+    SSE2_XSawp wd,  %1, %3, %6
+    SSE2_XSawp wd,  %8, %2, %3
+    SSE2_XSawp wd,  %5, %7, %2
+    movdqa  %7, %9
+    movdqa  %9, %3
+    SSE2_XSawp wd,  %7, %4, %3
 
-	SSE2_XSawp dq,  %1, %5, %4
-	SSE2_XSawp dq,  %6, %2, %5
-	SSE2_XSawp dq,  %8, %7, %2
-	movdqa	%7, %9
-	movdqa	%9, %5
-	SSE2_XSawp dq,  %7, %3, %5
+    SSE2_XSawp dq,  %1, %5, %4
+    SSE2_XSawp dq,  %6, %2, %5
+    SSE2_XSawp dq,  %8, %7, %2
+    movdqa  %7, %9
+    movdqa  %9, %5
+    SSE2_XSawp dq,  %7, %3, %5
 
-	SSE2_XSawp qdq,  %1, %8, %3
-	SSE2_XSawp qdq,  %4, %2, %8
-	SSE2_XSawp qdq,  %6, %7, %2
-	movdqa	%7, %9
-	movdqa	%9, %1
-	SSE2_XSawp qdq,  %7, %5, %1
-	movdqa	%5, %9
+    SSE2_XSawp qdq,  %1, %8, %3
+    SSE2_XSawp qdq,  %4, %2, %8
+    SSE2_XSawp qdq,  %6, %7, %2
+    movdqa  %7, %9
+    movdqa  %9, %1
+    SSE2_XSawp qdq,  %7, %5, %1
+    movdqa  %5, %9
 %endmacro
 
 ;xmm0, xmm6, xmm7, [eax], [ecx]
@@ -528,32 +528,32 @@
 
 ; m2 = m1 + m2, m1 = m1 - m2
 %macro SSE2_SumSub 3
-	movdqa  %3, %2
+    movdqa  %3, %2
     paddw   %2, %1
     psubw   %1, %3
 %endmacro
 
 
-%macro butterfly_1to16_sse	3	; xmm? for dst, xmm? for tmp, one byte for pSrc [generic register name: a/b/c/d]
-	mov %3h, %3l
-	movd %1, e%3x		; i.e, 1% = eax (=b0)
-	pshuflw %2, %1, 00h	; ..., b0 b0 b0 b0 b0 b0 b0 b0
-	pshufd %1, %2, 00h	; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0
+%macro butterfly_1to16_sse      3       ; xmm? for dst, xmm? for tmp, one byte for pSrc [generic register name: a/b/c/d]
+    mov %3h, %3l
+    movd %1, e%3x           ; i.e, 1% = eax (=b0)
+    pshuflw %2, %1, 00h     ; ..., b0 b0 b0 b0 b0 b0 b0 b0
+    pshufd %1, %2, 00h      ; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0
 %endmacro
 
 ;copy a dw into a xmm for 8 times
-%macro  SSE2_Copy8Times 2
-		movd	%1, %2
-		punpcklwd %1, %1
-		pshufd	%1,	%1,	0
+%macro SSE2_Copy8Times 2
+    movd    %1, %2
+    punpcklwd %1, %1
+    pshufd  %1,     %1,     0
 %endmacro
 
 ;copy a db into a xmm for 16 times
-%macro  SSE2_Copy16Times 2
-		movd		%1, %2
-		pshuflw		%1, %1, 0
-		punpcklqdq	%1, %1
-		packuswb	%1,	%1
+%macro SSE2_Copy16Times 2
+    movd            %1, %2
+    pshuflw         %1, %1, 0
+    punpcklqdq      %1, %1
+    packuswb        %1,     %1
 %endmacro
 
 
@@ -564,35 +564,35 @@
 ;dw 32,32,32,32,32,32,32,32 for xmm
 ;dw 32,32,32,32 for mm
 %macro WELS_DW32 1
-	pcmpeqw %1,%1
-	psrlw %1,15
-	psllw %1,5
+    pcmpeqw %1,%1
+    psrlw %1,15
+    psllw %1,5
 %endmacro
 
 ;dw 1, 1, 1, 1, 1, 1, 1, 1 for xmm
 ;dw 1, 1, 1, 1 for mm
 %macro WELS_DW1 1
-	pcmpeqw %1,%1
-	psrlw %1,15
+    pcmpeqw %1,%1
+    psrlw %1,15
 %endmacro
 
 ;all 0 for xmm and mm
-%macro	WELS_Zero 1
-	pxor %1, %1
+%macro WELS_Zero 1
+    pxor %1, %1
 %endmacro
 
 ;dd 1, 1, 1, 1 for xmm
 ;dd 1, 1 for mm
 %macro WELS_DD1 1
-	pcmpeqw %1,%1
-	psrld %1,31
+    pcmpeqw %1,%1
+    psrld %1,31
 %endmacro
 
 ;dB 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
 %macro WELS_DB1 1
-	pcmpeqw %1,%1
-	psrlw %1,15
-	packuswb %1,%1
+    pcmpeqw %1,%1
+    psrlw %1,15
+    packuswb %1,%1
 %endmacro
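A side note on the constant-building macros in asm_inc.asm: WELS_DW32, WELS_DW1, WELS_DD1 and WELS_DB1 synthesize small per-lane constants from an all-ones register instead of loading them from memory. The same trick expressed with SSE2 intrinsics (a sketch; the function name is illustrative, not part of this patch):

    /* How WELS_DW32 builds the per-word constant 32 without a memory load. */
    #include <emmintrin.h>

    static __m128i BuildDw32(void) {
        __m128i x = _mm_setzero_si128();
        x = _mm_cmpeq_epi16(x, x);   /* 0xFFFF in every 16-bit lane */
        x = _mm_srli_epi16(x, 15);   /* 0x0001 in every lane        */
        x = _mm_slli_epi16(x, 5);    /* 0x0020 = 32 in every lane   */
        return x;
    }
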
 
 
--- a/codec/common/x86/cpuid.asm
+++ b/codec/common/x86/cpuid.asm
@@ -29,13 +29,13 @@
 ;*     POSSIBILITY OF SUCH DAMAGE.
 ;*
 ;*
-;*	cpu_mmx.asm
+;*  cpu_mmx.asm
 ;*
 ;*  Abstract
-;*		verify cpuid feature support and cpuid detection
+;*      verify cpuid feature support and cpuid detection
 ;*
 ;*  History
-;*      04/29/2009	Created
+;*      04/29/2009  Created
 ;*
 ;*************************************************************************/
 
@@ -115,13 +115,13 @@
 %elifdef     X86_32
 
 WELS_EXTERN WelsCPUId
-    push	ebx
-    push	edi
+    push    ebx
+    push    edi
 
-    mov     eax, [esp+12]	; operating index
+    mov     eax, [esp+12]   ; operating index
     mov     edi, [esp+24]
     mov     ecx, [edi]
-    cpuid					; cpuid
+    cpuid                   ; cpuid
 
     ; processing various information return
     mov     edi, [esp+16]
@@ -133,7 +133,7 @@
     mov     edi, [esp+28]
     mov     [edi], edx
 
-    pop	    edi
+    pop     edi
     pop     ebx
     ret
 
@@ -145,31 +145,31 @@
 ;****************************************************************************************************
 WELS_EXTERN WelsCPUSupportAVX
 %ifdef     WIN64
-        mov   eax,    ecx
-        mov   ecx,    edx
+    mov   eax,    ecx
+    mov   ecx,    edx
 %elifdef   UNIX64
-        mov eax, edi
-        mov ecx, esi
+    mov eax, edi
+    mov ecx, esi
 %else
-        mov eax, [esp+4]
-        mov ecx, [esp+8]
+    mov eax, [esp+4]
+    mov ecx, [esp+8]
 %endif
 
-        ; refer to detection of AVX addressed in INTEL AVX manual document
-        and ecx, 018000000H
-        cmp ecx, 018000000H             ; check both OSXSAVE and AVX feature flags
-        jne avx_not_supported
-        ; processor supports AVX instructions and XGETBV is enabled by OS
-        mov ecx, 0                              ; specify 0 for XFEATURE_ENABLED_MASK register
-        XGETBV                                  ; result in EDX:EAX
-        and eax, 06H
-        cmp eax, 06H                    ; check OS has enabled both XMM and YMM state support
-        jne avx_not_supported
-        mov eax, 1
-        ret
+    ; refer to detection of AVX addressed in INTEL AVX manual document
+    and ecx, 018000000H
+    cmp ecx, 018000000H             ; check both OSXSAVE and AVX feature flags
+    jne avx_not_supported
+    ; processor supports AVX instructions and XGETBV is enabled by OS
+    mov ecx, 0                              ; specify 0 for XFEATURE_ENABLED_MASK register
+    XGETBV                                  ; result in EDX:EAX
+    and eax, 06H
+    cmp eax, 06H                    ; check OS has enabled both XMM and YMM state support
+    jne avx_not_supported
+    mov eax, 1
+    ret
 avx_not_supported:
-        mov eax, 0
-        ret
+    mov eax, 0
+    ret
 
 
 ; need call after cpuid=1 and eax, ecx flag got then
@@ -178,35 +178,35 @@
 ;****************************************************************************************************
 WELS_EXTERN  WelsCPUSupportFMA
 %ifdef     WIN64
-        mov   eax,   ecx
-        mov   ecx,   edx
+    mov   eax,   ecx
+    mov   ecx,   edx
 %elifdef   UNIX64
-        mov   eax,   edi
-        mov   ecx,   esi
+    mov   eax,   edi
+    mov   ecx,   esi
 %else
-	mov eax, [esp+4]
-	mov ecx, [esp+8]
+    mov eax, [esp+4]
+    mov ecx, [esp+8]
 %endif
-	; refer to detection of FMA addressed in INTEL AVX manual document
-	and ecx, 018001000H
-	cmp ecx, 018001000H		; check OSXSAVE, AVX, FMA feature flags
-	jne fma_not_supported
-	; processor supports AVX,FMA instructions and XGETBV is enabled by OS
-	mov ecx, 0				; specify 0 for XFEATURE_ENABLED_MASK register
-	XGETBV					; result in EDX:EAX
-	and eax, 06H
-	cmp eax, 06H			; check OS has enabled both XMM and YMM state support
-	jne fma_not_supported
-	mov eax, 1
-	ret
+    ; refer to detection of FMA addressed in INTEL AVX manual document
+    and ecx, 018001000H
+    cmp ecx, 018001000H     ; check OSXSAVE, AVX, FMA feature flags
+    jne fma_not_supported
+    ; processor supports AVX,FMA instructions and XGETBV is enabled by OS
+    mov ecx, 0              ; specify 0 for XFEATURE_ENABLED_MASK register
+    XGETBV                  ; result in EDX:EAX
+    and eax, 06H
+    cmp eax, 06H            ; check OS has enabled both XMM and YMM state support
+    jne fma_not_supported
+    mov eax, 1
+    ret
 fma_not_supported:
-	mov eax, 0
-	ret
+    mov eax, 0
+    ret
 
 ;******************************************************************************************
 ;   void WelsEmms()
 ;******************************************************************************************
 WELS_EXTERN WelsEmms
-	emms	; empty mmx technology states
-	ret
+    emms    ; empty mmx technology states
+    ret
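The WelsCPUSupportAVX / WelsCPUSupportFMA routines above follow Intel's documented AVX detection sequence: check the OSXSAVE and AVX bits of CPUID.1:ECX, then confirm via XGETBV that the OS saves both XMM and YMM state. A hedged C equivalent for GCC/Clang on x86 (the helper name is illustrative, not an OpenH264 API):

    /* Roughly equivalent check to WelsCPUSupportAVX, sketched in C. */
    #include <cpuid.h>
    #include <stdint.h>

    static int CpuSupportsAvx(void) {
        unsigned a, b, c, d;
        if (!__get_cpuid(1, &a, &b, &c, &d))
            return 0;
        if ((c & 0x18000000u) != 0x18000000u)  /* OSXSAVE (bit 27) + AVX (bit 28) */
            return 0;
        uint32_t lo, hi;
        __asm__ volatile ("xgetbv" : "=a"(lo), "=d"(hi) : "c"(0));  /* read XCR0 */
        return (lo & 0x6u) == 0x6u;            /* OS saves XMM and YMM state */
    }
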
 
--- a/codec/common/x86/deblock.asm
+++ b/codec/common/x86/deblock.asm
@@ -57,1032 +57,1032 @@
 
 
 WELS_EXTERN   DeblockLumaLt4V_ssse3
-  push        rbp
-  mov         r11,[rsp + 16 + 20h]  ; pTC
-  PUSH_XMM 16
-  sub         rsp,1B0h
-  lea         rbp,[rsp+20h]
-  movd        xmm4,r8d
-  movd        xmm2,r9d
-  mov         qword [rbp+180h],r12
-  mov         r10,rcx
-  movsxd      r12,edx
-  add         edx,edx
-  movsxd      rdx,edx
-  sub         r10,r12
-  movsx       r8d,byte [r11]
-  pxor        xmm3,xmm3
-  punpcklwd   xmm2,xmm2
-  movaps      [rbp+50h],xmm14
-  lea         rax,[r12+r12*2]
-  movdqa      xmm14,[rdx+rcx]
-  neg         rax
-  pshufd      xmm0,xmm2,0
-  movd        xmm2,r8d
-  movsx       edx,byte [r11+1]
-  movsx       r8d,byte [r11+2]
-  movsx       r11d,byte [r11+3]
-  movaps      [rbp+70h],xmm12
-  movd        xmm1,edx
-  movaps      [rbp+80h],xmm11
-  movd        xmm12,r8d
-  movd        xmm11,r11d
-  movdqa      xmm5, [rax+rcx]
-  lea         rax,[r12+r12]
-  punpcklwd   xmm12,xmm12
-  neg         rax
-  punpcklwd   xmm11,xmm11
-  movaps      [rbp],xmm8
-  movdqa      xmm8, [r10]
-  punpcklwd   xmm2,xmm2
-  punpcklwd   xmm1,xmm1
-  punpcklqdq  xmm12,xmm12
-  punpcklqdq  xmm11,xmm11
-  punpcklqdq  xmm2,xmm2
-  punpcklqdq  xmm1,xmm1
-  shufps      xmm12,xmm11,88h
-  movdqa      xmm11,xmm8
-  movaps      [rbp+30h],xmm9
-  movdqa      xmm9,[rcx]
-  shufps      xmm2,xmm1,88h
-  movdqa      xmm1,xmm5
-  punpcklbw   xmm11,xmm3
-  movaps      [rbp+20h],xmm6
-  movaps      [rbp+60h],xmm13
-  movdqa      xmm13,xmm11
-  movaps      [rbp+90h],xmm10
-  movdqa      xmm10,xmm9
-  movdqa      xmm6,[rax+rcx]
-  punpcklbw   xmm1,xmm3
-  movaps      [rbp+0A0h],xmm12
-  psubw       xmm13,xmm1
-  movaps      [rbp+40h],xmm15
-  movdqa      xmm15,xmm14
-  movaps      [rbp+10h],xmm7
-  movdqa      xmm7,xmm6
-  punpcklbw   xmm10,xmm3
-  movdqa      xmm12,[r12+rcx]
-  punpcklbw   xmm7,xmm3
-  punpcklbw   xmm12,xmm3
-  punpcklbw   xmm15,xmm3
-  pabsw       xmm3,xmm13
-  movdqa      xmm13,xmm10
-  psubw       xmm13,xmm15
-  movdqa      [rbp+0F0h],xmm15
-  pabsw       xmm15,xmm13
-  movdqa      xmm13,xmm11
-  movdqa      [rbp+0B0h],xmm1
-  movdqa      xmm1,xmm0
-  pavgw       xmm13,xmm10
-  pcmpgtw     xmm1,xmm3
-  movdqa      [rbp+120h],xmm13
-  movaps      xmm13,xmm2
-  punpcklwd   xmm4,xmm4
-  movdqa      xmm3,xmm0
-  movdqa      [rbp+100h],xmm1
-  psubw       xmm13,xmm1
-  movdqa      xmm1,xmm10
-  pcmpgtw     xmm3,xmm15
-  pshufd      xmm4,xmm4,0
-  psubw       xmm1,xmm11
-  movdqa      [rbp+0D0h],xmm10
-  psubw       xmm13,xmm3
-  movdqa      [rbp+110h],xmm3
-  pabsw       xmm15,xmm1
-  movdqa      xmm3,xmm4
-  psubw       xmm10,xmm12
-  pcmpgtw     xmm3,xmm15
-  pabsw       xmm15,xmm10
-  movdqa      xmm10,xmm0
-  psllw       xmm1,2
-  movdqa      [rbp+0C0h],xmm11
-  psubw       xmm11,xmm7
-  pcmpgtw     xmm10,xmm15
-  pabsw       xmm11,xmm11
-  movdqa      xmm15,xmm0
-  pand        xmm3,xmm10
-  pcmpgtw     xmm15,xmm11
-  movaps      xmm11,xmm2
-  pxor        xmm10,xmm10
-  pand        xmm3,xmm15
-  pcmpgtw     xmm11,xmm10
-  pcmpeqw     xmm10,xmm2
-  por         xmm11,xmm10
-  pand        xmm3,xmm11
-  movdqa      xmm11,xmm7
-  psubw       xmm11,xmm12
-  pxor        xmm15,xmm15
-  paddw       xmm11,xmm1
-  psubw       xmm15,xmm13
-  movdqa      [rbp+0E0h],xmm12
-  paddw       xmm11,[FOUR_16B_SSE2]
-  pxor        xmm12,xmm12
-  psraw       xmm11,3
-  punpckhbw   xmm8,xmm12
-  pmaxsw      xmm15,xmm11
-  punpckhbw   xmm5,xmm12
-  movdqa      xmm11,xmm8
-  pminsw      xmm13,xmm15
-  psubw       xmm11,xmm5
-  punpckhbw   xmm9,xmm12
-  pand        xmm13,xmm3
-  movdqa      [rbp+130h],xmm13
-  pabsw       xmm13,xmm11
-  punpckhbw   xmm14,xmm12
-  movdqa      xmm11,xmm9
-  psubw       xmm11,xmm14
-  movdqa      xmm15,xmm0
-  movdqa      [rbp+140h],xmm14
-  pabsw       xmm14,xmm11
-  movdqa      xmm11,xmm8
-  pcmpgtw     xmm15,xmm14
-  movdqa      xmm1,[r12+rcx]
-  pavgw       xmm11,xmm9
-  movdqa      [rbp+170h],xmm11
-  movdqa      xmm10,xmm9
-  punpckhbw   xmm6,xmm12
-  psubw       xmm10,xmm8
-  punpckhbw   xmm1,xmm12
-  movdqa      xmm12,xmm0
-  movaps      xmm11,[rbp+0A0h]
-  pcmpgtw     xmm12,xmm13
-  movaps      xmm13,xmm11
-  psubw       xmm13,xmm12
-  movdqa      [rbp+160h],xmm15
-  psubw       xmm13,xmm15
-  movdqa      xmm15,xmm9
-  psubw       xmm15,xmm1
-  movdqa      [rbp+150h],xmm12
-  pabsw       xmm12,xmm10
-  pabsw       xmm14,xmm15
-  movdqa      xmm15,xmm8
-  pcmpgtw     xmm4,xmm12
-  movdqa      xmm12,xmm0
-  psubw       xmm15,xmm6
-  pcmpgtw     xmm12,xmm14
-  pabsw       xmm14,xmm15
-  psllw       xmm10,2
-  pcmpgtw     xmm0,xmm14
-  movdqa      xmm14,xmm6
-  psubw       xmm14,xmm1
-  pand        xmm4,xmm12
-  paddw       xmm14,xmm10
-  pand        xmm4,xmm0
-  paddw       xmm14,[FOUR_16B_SSE2]
-  pxor        xmm15,xmm15
-  movaps      xmm12,xmm11
-  psubw       xmm15,xmm13
-  pxor        xmm0,xmm0
-  psraw       xmm14,3
-  pcmpgtw     xmm12,xmm0
-  pcmpeqw     xmm0,xmm11
-  pmaxsw      xmm15,xmm14
-  por         xmm12,xmm0
-  movdqa      xmm0,[rbp+120h]
-  pminsw      xmm13,xmm15
-  movdqa      xmm15,[rbp+0B0h]
-  movdqa      xmm10,xmm7
-  pand        xmm4,xmm12
-  paddw       xmm15,xmm0
-  pxor        xmm12,xmm12
-  paddw       xmm10,xmm7
-  movdqa      xmm14,xmm12
-  psubw       xmm15,xmm10
-  psubw       xmm14,xmm2
-  psraw       xmm15,1
-  pmaxsw      xmm15,xmm14
-  movdqa      xmm10,xmm6
-  pminsw      xmm15,xmm2
-  paddw       xmm10,xmm6
-  pand        xmm15,xmm3
-  psubw       xmm12,xmm11
-  pand        xmm15,[rbp+100h]
-  pand        xmm13,xmm4
-  paddw       xmm7,xmm15
-  paddw       xmm8,xmm13
-  movdqa      xmm15,[rbp+170h]
-  psubw       xmm9,xmm13
-  paddw       xmm5,xmm15
-  psubw       xmm5,xmm10
-  psraw       xmm5,1
-  pmaxsw      xmm5,xmm12
-  pminsw      xmm5,xmm11
-  pand        xmm5,xmm4
-  pand        xmm5,[rbp+150h]
-  paddw       xmm6,xmm5
-  movdqa      xmm5,[rbp+0C0h]
-  packuswb    xmm7,xmm6
-  movdqa      xmm6,[rbp+130h]
-  paddw       xmm5,xmm6
-  packuswb    xmm5,xmm8
-  movdqa      xmm8,[rbp+0D0h]
-  psubw       xmm8,xmm6
-  movdqa      xmm6,[rbp+0F0h]
-  paddw       xmm6,xmm0
-  movdqa      xmm0,[rbp+0E0h]
-  packuswb    xmm8,xmm9
-  movdqa      xmm9,xmm0
-  paddw       xmm9,xmm0
-  psubw       xmm6,xmm9
-  psraw       xmm6,1
-  pmaxsw      xmm14,xmm6
-  pminsw      xmm2,xmm14
-  pand        xmm2,xmm3
-  pand        xmm2,[rbp+110h]
-  paddw       xmm0,xmm2
-  movdqa      xmm2,[rbp+140h]
-  paddw       xmm2,xmm15
-  movdqa      xmm15,xmm1
-  paddw       xmm15,xmm1
-  psubw       xmm2,xmm15
-  psraw       xmm2,1
-  pmaxsw      xmm12,xmm2
-  pminsw      xmm11,xmm12
-  pand        xmm11,xmm4
-  pand        xmm11,[rbp+160h]
-  paddw       xmm1,xmm11
-  movdqa      [rax+rcx],xmm7
-  movdqa      [r10],xmm5
-  packuswb    xmm0,xmm1
-  movdqa      [rcx],xmm8
-  movdqa      [r12+rcx],xmm0
-  mov         r12,qword [rbp+180h]
-  lea         rsp,[rbp+190h]
-  POP_XMM
-  pop         rbp
-  ret
+    push        rbp
+    mov         r11,[rsp + 16 + 20h]  ; pTC
+    PUSH_XMM 16
+    sub         rsp,1B0h
+    lea         rbp,[rsp+20h]
+    movd        xmm4,r8d
+    movd        xmm2,r9d
+    mov         qword [rbp+180h],r12
+    mov         r10,rcx
+    movsxd      r12,edx
+    add         edx,edx
+    movsxd      rdx,edx
+    sub         r10,r12
+    movsx       r8d,byte [r11]
+    pxor        xmm3,xmm3
+    punpcklwd   xmm2,xmm2
+    movaps      [rbp+50h],xmm14
+    lea         rax,[r12+r12*2]
+    movdqa      xmm14,[rdx+rcx]
+    neg         rax
+    pshufd      xmm0,xmm2,0
+    movd        xmm2,r8d
+    movsx       edx,byte [r11+1]
+    movsx       r8d,byte [r11+2]
+    movsx       r11d,byte [r11+3]
+    movaps      [rbp+70h],xmm12
+    movd        xmm1,edx
+    movaps      [rbp+80h],xmm11
+    movd        xmm12,r8d
+    movd        xmm11,r11d
+    movdqa      xmm5, [rax+rcx]
+    lea         rax,[r12+r12]
+    punpcklwd   xmm12,xmm12
+    neg         rax
+    punpcklwd   xmm11,xmm11
+    movaps      [rbp],xmm8
+    movdqa      xmm8, [r10]
+    punpcklwd   xmm2,xmm2
+    punpcklwd   xmm1,xmm1
+    punpcklqdq  xmm12,xmm12
+    punpcklqdq  xmm11,xmm11
+    punpcklqdq  xmm2,xmm2
+    punpcklqdq  xmm1,xmm1
+    shufps      xmm12,xmm11,88h
+    movdqa      xmm11,xmm8
+    movaps      [rbp+30h],xmm9
+    movdqa      xmm9,[rcx]
+    shufps      xmm2,xmm1,88h
+    movdqa      xmm1,xmm5
+    punpcklbw   xmm11,xmm3
+    movaps      [rbp+20h],xmm6
+    movaps      [rbp+60h],xmm13
+    movdqa      xmm13,xmm11
+    movaps      [rbp+90h],xmm10
+    movdqa      xmm10,xmm9
+    movdqa      xmm6,[rax+rcx]
+    punpcklbw   xmm1,xmm3
+    movaps      [rbp+0A0h],xmm12
+    psubw       xmm13,xmm1
+    movaps      [rbp+40h],xmm15
+    movdqa      xmm15,xmm14
+    movaps      [rbp+10h],xmm7
+    movdqa      xmm7,xmm6
+    punpcklbw   xmm10,xmm3
+    movdqa      xmm12,[r12+rcx]
+    punpcklbw   xmm7,xmm3
+    punpcklbw   xmm12,xmm3
+    punpcklbw   xmm15,xmm3
+    pabsw       xmm3,xmm13
+    movdqa      xmm13,xmm10
+    psubw       xmm13,xmm15
+    movdqa      [rbp+0F0h],xmm15
+    pabsw       xmm15,xmm13
+    movdqa      xmm13,xmm11
+    movdqa      [rbp+0B0h],xmm1
+    movdqa      xmm1,xmm0
+    pavgw       xmm13,xmm10
+    pcmpgtw     xmm1,xmm3
+    movdqa      [rbp+120h],xmm13
+    movaps      xmm13,xmm2
+    punpcklwd   xmm4,xmm4
+    movdqa      xmm3,xmm0
+    movdqa      [rbp+100h],xmm1
+    psubw       xmm13,xmm1
+    movdqa      xmm1,xmm10
+    pcmpgtw     xmm3,xmm15
+    pshufd      xmm4,xmm4,0
+    psubw       xmm1,xmm11
+    movdqa      [rbp+0D0h],xmm10
+    psubw       xmm13,xmm3
+    movdqa      [rbp+110h],xmm3
+    pabsw       xmm15,xmm1
+    movdqa      xmm3,xmm4
+    psubw       xmm10,xmm12
+    pcmpgtw     xmm3,xmm15
+    pabsw       xmm15,xmm10
+    movdqa      xmm10,xmm0
+    psllw       xmm1,2
+    movdqa      [rbp+0C0h],xmm11
+    psubw       xmm11,xmm7
+    pcmpgtw     xmm10,xmm15
+    pabsw       xmm11,xmm11
+    movdqa      xmm15,xmm0
+    pand        xmm3,xmm10
+    pcmpgtw     xmm15,xmm11
+    movaps      xmm11,xmm2
+    pxor        xmm10,xmm10
+    pand        xmm3,xmm15
+    pcmpgtw     xmm11,xmm10
+    pcmpeqw     xmm10,xmm2
+    por         xmm11,xmm10
+    pand        xmm3,xmm11
+    movdqa      xmm11,xmm7
+    psubw       xmm11,xmm12
+    pxor        xmm15,xmm15
+    paddw       xmm11,xmm1
+    psubw       xmm15,xmm13
+    movdqa      [rbp+0E0h],xmm12
+    paddw       xmm11,[FOUR_16B_SSE2]
+    pxor        xmm12,xmm12
+    psraw       xmm11,3
+    punpckhbw   xmm8,xmm12
+    pmaxsw      xmm15,xmm11
+    punpckhbw   xmm5,xmm12
+    movdqa      xmm11,xmm8
+    pminsw      xmm13,xmm15
+    psubw       xmm11,xmm5
+    punpckhbw   xmm9,xmm12
+    pand        xmm13,xmm3
+    movdqa      [rbp+130h],xmm13
+    pabsw       xmm13,xmm11
+    punpckhbw   xmm14,xmm12
+    movdqa      xmm11,xmm9
+    psubw       xmm11,xmm14
+    movdqa      xmm15,xmm0
+    movdqa      [rbp+140h],xmm14
+    pabsw       xmm14,xmm11
+    movdqa      xmm11,xmm8
+    pcmpgtw     xmm15,xmm14
+    movdqa      xmm1,[r12+rcx]
+    pavgw       xmm11,xmm9
+    movdqa      [rbp+170h],xmm11
+    movdqa      xmm10,xmm9
+    punpckhbw   xmm6,xmm12
+    psubw       xmm10,xmm8
+    punpckhbw   xmm1,xmm12
+    movdqa      xmm12,xmm0
+    movaps      xmm11,[rbp+0A0h]
+    pcmpgtw     xmm12,xmm13
+    movaps      xmm13,xmm11
+    psubw       xmm13,xmm12
+    movdqa      [rbp+160h],xmm15
+    psubw       xmm13,xmm15
+    movdqa      xmm15,xmm9
+    psubw       xmm15,xmm1
+    movdqa      [rbp+150h],xmm12
+    pabsw       xmm12,xmm10
+    pabsw       xmm14,xmm15
+    movdqa      xmm15,xmm8
+    pcmpgtw     xmm4,xmm12
+    movdqa      xmm12,xmm0
+    psubw       xmm15,xmm6
+    pcmpgtw     xmm12,xmm14
+    pabsw       xmm14,xmm15
+    psllw       xmm10,2
+    pcmpgtw     xmm0,xmm14
+    movdqa      xmm14,xmm6
+    psubw       xmm14,xmm1
+    pand        xmm4,xmm12
+    paddw       xmm14,xmm10
+    pand        xmm4,xmm0
+    paddw       xmm14,[FOUR_16B_SSE2]
+    pxor        xmm15,xmm15
+    movaps      xmm12,xmm11
+    psubw       xmm15,xmm13
+    pxor        xmm0,xmm0
+    psraw       xmm14,3
+    pcmpgtw     xmm12,xmm0
+    pcmpeqw     xmm0,xmm11
+    pmaxsw      xmm15,xmm14
+    por         xmm12,xmm0
+    movdqa      xmm0,[rbp+120h]
+    pminsw      xmm13,xmm15
+    movdqa      xmm15,[rbp+0B0h]
+    movdqa      xmm10,xmm7
+    pand        xmm4,xmm12
+    paddw       xmm15,xmm0
+    pxor        xmm12,xmm12
+    paddw       xmm10,xmm7
+    movdqa      xmm14,xmm12
+    psubw       xmm15,xmm10
+    psubw       xmm14,xmm2
+    psraw       xmm15,1
+    pmaxsw      xmm15,xmm14
+    movdqa      xmm10,xmm6
+    pminsw      xmm15,xmm2
+    paddw       xmm10,xmm6
+    pand        xmm15,xmm3
+    psubw       xmm12,xmm11
+    pand        xmm15,[rbp+100h]
+    pand        xmm13,xmm4
+    paddw       xmm7,xmm15
+    paddw       xmm8,xmm13
+    movdqa      xmm15,[rbp+170h]
+    psubw       xmm9,xmm13
+    paddw       xmm5,xmm15
+    psubw       xmm5,xmm10
+    psraw       xmm5,1
+    pmaxsw      xmm5,xmm12
+    pminsw      xmm5,xmm11
+    pand        xmm5,xmm4
+    pand        xmm5,[rbp+150h]
+    paddw       xmm6,xmm5
+    movdqa      xmm5,[rbp+0C0h]
+    packuswb    xmm7,xmm6
+    movdqa      xmm6,[rbp+130h]
+    paddw       xmm5,xmm6
+    packuswb    xmm5,xmm8
+    movdqa      xmm8,[rbp+0D0h]
+    psubw       xmm8,xmm6
+    movdqa      xmm6,[rbp+0F0h]
+    paddw       xmm6,xmm0
+    movdqa      xmm0,[rbp+0E0h]
+    packuswb    xmm8,xmm9
+    movdqa      xmm9,xmm0
+    paddw       xmm9,xmm0
+    psubw       xmm6,xmm9
+    psraw       xmm6,1
+    pmaxsw      xmm14,xmm6
+    pminsw      xmm2,xmm14
+    pand        xmm2,xmm3
+    pand        xmm2,[rbp+110h]
+    paddw       xmm0,xmm2
+    movdqa      xmm2,[rbp+140h]
+    paddw       xmm2,xmm15
+    movdqa      xmm15,xmm1
+    paddw       xmm15,xmm1
+    psubw       xmm2,xmm15
+    psraw       xmm2,1
+    pmaxsw      xmm12,xmm2
+    pminsw      xmm11,xmm12
+    pand        xmm11,xmm4
+    pand        xmm11,[rbp+160h]
+    paddw       xmm1,xmm11
+    movdqa      [rax+rcx],xmm7
+    movdqa      [r10],xmm5
+    packuswb    xmm0,xmm1
+    movdqa      [rcx],xmm8
+    movdqa      [r12+rcx],xmm0
+    mov         r12,qword [rbp+180h]
+    lea         rsp,[rbp+190h]
+    POP_XMM
+    pop         rbp
+    ret
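
DeblockLumaLt4V_ssse3 above vectorizes the bS < 4 luma edge filter from the H.264 spec, whose core per-pixel update is delta = Clip3(-tC, tC, (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3), applied with opposite signs to p0 and q0. A scalar sketch under that assumption (helper names are illustrative, not taken from this patch):

    /* Core p0/q0 update of the bS < 4 luma deblocking filter (scalar sketch). */
    #include <stdint.h>

    static int Clip3i(int lo, int hi, int v) {
        return v < lo ? lo : (v > hi ? hi : v);
    }

    static void FilterP0Q0(uint8_t *p0, uint8_t *q0, int p1, int q1, int tc) {
        int d = Clip3i(-tc, tc, (((*q0 - *p0) << 2) + (p1 - q1) + 4) >> 3);
        *p0 = (uint8_t)Clip3i(0, 255, *p0 + d);
        *q0 = (uint8_t)Clip3i(0, 255, *q0 - d);
    }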
 
 
 WELS_EXTERN   DeblockLumaEq4V_ssse3
-  mov         rax,rsp
-  push        rbx
-  push        rbp
-  push        rsi
-  push        rdi
-  sub         rsp,1D8h
-  movaps      [rax-38h],xmm6
-  movaps      [rax-48h],xmm7
-  movaps      [rax-58h],xmm8
-  pxor        xmm1,xmm1
-  movsxd      r10,edx
-  mov         rbp,rcx
-  mov         r11d,r8d
-  mov         rdx,rcx
-  mov         rdi,rbp
-  mov         rbx,rbp
-  movdqa      xmm5,[rbp]
-  movaps      [rax-68h],xmm9
-  movaps      [rax-78h],xmm10
-  punpcklbw   xmm5,xmm1
-  movaps      [rax-88h],xmm11
-  movaps      [rax-98h],xmm12
-  movaps      [rax-0A8h],xmm13
-  movaps      [rax-0B8h],xmm14
-  movdqa      xmm14,[r10+rbp]
-  movaps      [rax-0C8h],xmm15
-  lea         eax,[r10*4]
-  movsxd      r8,eax
-  lea         eax,[r10+r10*2]
-  movsxd      rcx,eax
-  lea         eax,[r10+r10]
-  sub         rdx,r8
-  punpcklbw   xmm14,xmm1
-  movdqa      [rsp+90h],xmm5
-  movdqa      [rsp+30h],xmm14
-  movsxd      rsi,eax
-  movsx       eax,r11w
-  sub         rdi,rcx
-  sub         rbx,rsi
-  mov         r8,rbp
-  sub         r8,r10
-  movd        xmm0,eax
-  movsx       eax,r9w
-  movdqa      xmm12,[rdi]
-  movdqa      xmm6, [rsi+rbp]
-  movdqa      xmm13,[rbx]
-  punpcklwd   xmm0,xmm0
-  pshufd      xmm11,xmm0,0
-  punpcklbw   xmm13,xmm1
-  punpcklbw   xmm6,xmm1
-  movdqa      xmm8,[r8]
-  movd        xmm0,eax
-  movdqa      xmm10,xmm11
-  mov         eax,2
-  punpcklbw   xmm8,xmm1
-  punpcklbw   xmm12,xmm1
-  cwde
-  punpcklwd   xmm0,xmm0
-  psraw       xmm10,2
-  movdqa      xmm1,xmm8
-  movdqa      [rsp+0F0h],xmm13
-  movdqa      [rsp+0B0h],xmm8
-  pshufd      xmm7,xmm0,0
-  psubw       xmm1,xmm13
-  movdqa      xmm0,xmm5
-  movdqa      xmm4,xmm7
-  movdqa      xmm2,xmm7
-  psubw       xmm0,xmm8
-  pabsw       xmm3,xmm0
-  pabsw       xmm0,xmm1
-  movdqa      xmm1,xmm5
-  movdqa      [rsp+40h],xmm7
-  movdqa      [rsp+60h],xmm6
-  pcmpgtw     xmm4,xmm0
-  psubw       xmm1,xmm14
-  pabsw       xmm0,xmm1
-  pcmpgtw     xmm2,xmm0
-  pand        xmm4,xmm2
-  movdqa      xmm0,xmm11
-  pcmpgtw     xmm0,xmm3
-  pand        xmm4,xmm0
-  movd        xmm0,eax
-  movdqa      [rsp+20h],xmm4
-  punpcklwd   xmm0,xmm0
-  pshufd      xmm2,xmm0,0
-  paddw       xmm10,xmm2
-  movdqa      [rsp+0A0h],xmm2
-  movdqa      xmm15,xmm7
-  pxor        xmm4,xmm4
-  movdqa      xmm0,xmm8
-  psubw       xmm0,xmm12
-  mov         eax,4
-  pabsw       xmm0,xmm0
-  movdqa      xmm1,xmm10
-  cwde
-  pcmpgtw     xmm15,xmm0
-  pcmpgtw     xmm1,xmm3
-  movdqa      xmm3,xmm7
-  movdqa      xmm7,[rdx]
-  movdqa      xmm0,xmm5
-  psubw       xmm0,xmm6
-  pand        xmm15,xmm1
-  punpcklbw   xmm7,xmm4
-  movdqa      xmm9,xmm15
-  pabsw       xmm0,xmm0
-  psllw       xmm7,1
-  pandn       xmm9,xmm12
-  pcmpgtw     xmm3,xmm0
-  paddw       xmm7,xmm12
-  movd        xmm0,eax
-  pand        xmm3,xmm1
-  paddw       xmm7,xmm12
-  punpcklwd   xmm0,xmm0
-  paddw       xmm7,xmm12
-  pshufd      xmm1,xmm0,0
-  paddw       xmm7,xmm13
-  movdqa      xmm0,xmm3
-  pandn       xmm0,xmm6
-  paddw       xmm7,xmm8
-  movdqa      [rsp+70h],xmm1
-  paddw       xmm7,xmm5
-  movdqa      [rsp+120h],xmm0
-  movdqa      xmm0,[rcx+rbp]
-  punpcklbw   xmm0,xmm4
-  paddw       xmm7,xmm1
-  movdqa      xmm4,xmm15
-  psllw       xmm0,1
-  psraw       xmm7,3
-  paddw       xmm0,xmm6
-  pand        xmm7,xmm15
-  paddw       xmm0,xmm6
-  paddw       xmm0,xmm6
-  paddw       xmm0,xmm14
-  movdqa      xmm6,xmm15
-  paddw       xmm0,xmm5
-  pandn       xmm6,xmm13
-  paddw       xmm0,xmm8
-  paddw       xmm0,xmm1
-  psraw       xmm0,3
-  movdqa      xmm1,xmm12
-  paddw       xmm1,xmm13
-  pand        xmm0,xmm3
-  movdqa      [rsp+100h],xmm0
-  movdqa      xmm0,xmm8
-  paddw       xmm0,xmm5
-  paddw       xmm1,xmm0
-  movdqa      xmm0,xmm3
-  paddw       xmm1,xmm2
-  psraw       xmm1,2
-  pandn       xmm0,xmm14
-  pand        xmm4,xmm1
-  movdqa      [rsp+0E0h],xmm0
-  movdqa      xmm0,xmm5
-  paddw       xmm0,xmm8
-  movdqa      xmm1,[rsp+60h]
-  paddw       xmm1,xmm14
-  movdqa      xmm14,xmm3
-  paddw       xmm1,xmm0
-  movdqa      xmm0,xmm8
-  paddw       xmm0,[rsp+30h]
-  paddw       xmm1,xmm2
-  psraw       xmm1,2
-  pand        xmm14,xmm1
-  movdqa      xmm1,xmm13
-  paddw       xmm1,xmm13
-  paddw       xmm1,xmm0
-  paddw       xmm1,xmm2
-  psraw       xmm1,2
-  movdqa      xmm0,[rsp+30h]
-  movdqa      xmm2,xmm13
-  movdqa      xmm5,xmm15
-  paddw       xmm0,[rsp+70h]
-  pandn       xmm5,xmm1
-  paddw       xmm2,xmm8
-  movdqa      xmm8,[rsp+90h]
-  movdqa      xmm1,xmm12
-  paddw       xmm2,xmm8
-  psllw       xmm2,1
-  paddw       xmm2,xmm0
-  paddw       xmm1,xmm2
-  movdqa      xmm0,xmm8
-  movdqa      xmm8,xmm3
-  movdqa      xmm2,[rsp+30h]
-  paddw       xmm0,xmm13
-  psraw       xmm1,3
-  pand        xmm15,xmm1
-  movdqa      xmm1,xmm2
-  paddw       xmm1,xmm2
-  paddw       xmm2,[rsp+90h]
-  paddw       xmm2,[rsp+0B0h]
-  paddw       xmm1,xmm0
-  movdqa      xmm0,xmm13
-  movdqa      xmm13,[r8]
-  paddw       xmm0, [rsp+70h]
-  paddw       xmm1, [rsp+0A0h]
-  psllw       xmm2,1
-  paddw       xmm2,xmm0
-  psraw       xmm1,2
-  movdqa      xmm0, [rdi]
-  pandn       xmm8,xmm1
-  movdqa      xmm1, [rsp+60h]
-  paddw       xmm1,xmm2
-  movdqa      xmm2, [rbx]
-  psraw       xmm1,3
-  pand        xmm3,xmm1
-  movdqa      xmm1, [rbp]
-  movdqa      [rsp+0D0h],xmm3
-  pxor        xmm3,xmm3
-  punpckhbw   xmm0,xmm3
-  punpckhbw   xmm1,xmm3
-  punpckhbw   xmm13,xmm3
-  movdqa      [rsp+0C0h],xmm0
-  movdqa      xmm0,[r10+rbp]
-  movdqa      [rsp],xmm1
-  punpckhbw   xmm0,xmm3
-  punpckhbw   xmm2,xmm3
-  movdqa      [rsp+80h],xmm0
-  movdqa      xmm0,[rsi+rbp]
-  movdqa      [rsp+10h],xmm13
-  punpckhbw   xmm0,xmm3
-  movdqa      [rsp+50h],xmm0
-  movdqa      xmm0,xmm1
-  movdqa      xmm1,xmm13
-  psubw       xmm0,xmm13
-  psubw       xmm1,xmm2
-  pabsw       xmm3,xmm0
-  pabsw       xmm0,xmm1
-  movdqa      xmm1,[rsp]
-  movdqa      xmm13,[rsp+40h]
-  movdqa      [rsp+110h],xmm2
-  psubw       xmm1, [rsp+80h]
-  pcmpgtw     xmm13,xmm0
-  pcmpgtw     xmm11,xmm3
-  pabsw       xmm0,xmm1
-  pcmpgtw     xmm10,xmm3
-  movdqa      xmm1, [rsp+40h]
-  movdqa      xmm2,xmm1
-  movdqa      xmm3,xmm1
-  pcmpgtw     xmm2,xmm0
-  movdqa      xmm0, [rsp+10h]
-  pand        xmm13,xmm2
-  pand        xmm13,xmm11
-  movdqa      xmm11,[rsp+0C0h]
-  psubw       xmm0,xmm11
-  pabsw       xmm0,xmm0
-  pcmpgtw     xmm3,xmm0
-  pand        xmm3,xmm10
-  movdqa      xmm0,[rsp]
-  psubw       xmm0,[rsp+50h]
-  movdqa      xmm2,[rdx]
-  pabsw       xmm0,xmm0
-  por         xmm7,xmm9
-  movdqa      xmm9,[rsp+20h]
-  pcmpgtw     xmm1,xmm0
-  pand        xmm9,xmm7
-  movdqa      xmm7,[rsp+20h]
-  movdqa      xmm0,xmm7
-  pandn       xmm0,xmm12
-  movdqa      xmm12,[rsp+110h]
-  pand        xmm1,xmm10
-  movdqa      xmm10,[rsp+70h]
-  movdqa      [rsp+40h],xmm1
-  movdqa      xmm1,xmm13
-  por         xmm9,xmm0
-  pxor        xmm0,xmm0
-  por         xmm4,xmm6
-  movdqa      xmm6,xmm7
-  punpckhbw   xmm2,xmm0
-  por         xmm15,xmm5
-  movdqa      xmm5,[rsp+20h]
-  movdqa      xmm0,xmm3
-  psllw       xmm2,1
-  pandn       xmm0,xmm11
-  pand        xmm6,xmm4
-  movdqa      xmm4,[rsp]
-  paddw       xmm2,xmm11
-  pand        xmm5,xmm15
-  movdqa      xmm15,[rsp+20h]
-  paddw       xmm2,xmm11
-  paddw       xmm2,xmm11
-  paddw       xmm2,xmm12
-  paddw       xmm2,[rsp+10h]
-  paddw       xmm2,[rsp]
-  paddw       xmm2,xmm10
-  psraw       xmm2,3
-  pand        xmm2,xmm3
-  por         xmm2,xmm0
-  pand        xmm1,xmm2
-  movdqa      xmm0,xmm13
-  movdqa      xmm2,xmm11
-  pandn       xmm0,xmm11
-  paddw       xmm2,xmm12
-  por         xmm1,xmm0
-  packuswb    xmm9,xmm1
-  movdqa      xmm0,xmm7
-  movdqa      xmm7,[rsp+0A0h]
-  pandn       xmm0,[rsp+0F0h]
-  movdqa      xmm1,xmm3
-  por         xmm6,xmm0
-  movdqa      xmm0,[rsp+10h]
-  paddw       xmm0,xmm4
-  paddw       xmm2,xmm0
-  paddw       xmm2,xmm7
-  movdqa      xmm0,xmm3
-  pandn       xmm0,xmm12
-  psraw       xmm2,2
-  pand        xmm1,xmm2
-  por         xmm1,xmm0
-  movdqa      xmm2,xmm13
-  movdqa      xmm0,xmm13
-  pand        xmm2,xmm1
-  pandn       xmm0,xmm12
-  movdqa      xmm1,xmm12
-  paddw       xmm1,[rsp+10h]
-  por         xmm2,xmm0
-  movdqa      xmm0,xmm15
-  pandn       xmm0,[rsp+0B0h]
-  paddw       xmm1,xmm4
-  packuswb    xmm6,xmm2
-  movdqa      xmm2,xmm3
-  psllw       xmm1,1
-  por         xmm5,xmm0
-  movdqa      xmm0,[rsp+80h]
-  paddw       xmm0,xmm10
-  paddw       xmm1,xmm0
-  paddw       xmm11,xmm1
-  psraw       xmm11,3
-  movdqa      xmm1,xmm12
-  pand        xmm2,xmm11
-  paddw       xmm1,xmm12
-  movdqa      xmm11,[rsp+80h]
-  movdqa      xmm0, [rsp+10h]
-  por         xmm14,[rsp+0E0h]
-  paddw       xmm0,xmm11
-  movdqa      xmm4,xmm15
-  paddw       xmm1,xmm0
-  movdqa      xmm0,xmm13
-  paddw       xmm1,xmm7
-  psraw       xmm1,2
-  pandn       xmm3,xmm1
-  por         xmm2,xmm3
-  movdqa      xmm1,xmm13
-  movdqa      xmm3,[rsp+10h]
-  pandn       xmm0,xmm3
-  pand        xmm1,xmm2
-  movdqa      xmm2,xmm11
-  paddw       xmm2,[rsp]
-  por         xmm1,xmm0
-  movdqa      xmm0,[rsp+0D0h]
-  por         xmm0,xmm8
-  paddw       xmm2,xmm3
-  packuswb    xmm5,xmm1
-  movdqa      xmm8,[rsp+40h]
-  movdqa      xmm1,[rsp+50h]
-  movdqa      xmm3,xmm8
-  pand        xmm4,xmm0
-  psllw       xmm2,1
-  movdqa      xmm0,xmm15
-  pandn       xmm0,[rsp+90h]
-  por         xmm4,xmm0
-  movdqa      xmm0,xmm12
-  paddw       xmm0,xmm10
-  paddw       xmm2,xmm0
-  paddw       xmm1,xmm2
-  movdqa      xmm0,[rsp]
-  movdqa      xmm2,xmm11
-  paddw       xmm0,xmm12
-  movdqa      xmm12,[rsp]
-  paddw       xmm2,xmm11
-  paddw       xmm2,xmm0
-  psraw       xmm1,3
-  movdqa      xmm0,xmm8
-  pand        xmm3,xmm1
-  paddw       xmm2,xmm7
-  movdqa      xmm1,xmm13
-  psraw       xmm2,2
-  pandn       xmm0,xmm2
-  por         xmm3,xmm0
-  movdqa      xmm2,[rsp+50h]
-  movdqa      xmm0,xmm13
-  pandn       xmm0,xmm12
-  pand        xmm1,xmm3
-  paddw       xmm2,xmm11
-  movdqa      xmm3,xmm15
-  por         xmm1,xmm0
-  pand        xmm3,xmm14
-  movdqa      xmm14,[rsp+10h]
-  movdqa      xmm0,xmm15
-  pandn       xmm0,[rsp+30h]
-  packuswb    xmm4,xmm1
-  movdqa      xmm1,xmm8
-  por         xmm3,xmm0
-  movdqa      xmm0,xmm12
-  paddw       xmm0,xmm14
-  paddw       xmm2,xmm0
-  paddw       xmm2,xmm7
-  movdqa      xmm0,xmm8
-  pandn       xmm0,xmm11
-  psraw       xmm2,2
-  pand        xmm1,xmm2
-  por         xmm1,xmm0
-  movdqa      xmm2,xmm13
-  movdqa      xmm0,xmm13
-  pandn       xmm0,xmm11
-  pand        xmm2,xmm1
-  movdqa      xmm1,xmm15
-  por         xmm2,xmm0
-  packuswb    xmm3,xmm2
-  movdqa      xmm0,[rsp+100h]
-  por         xmm0,[rsp+120h]
-  pand        xmm1,xmm0
-  movdqa      xmm2,[rcx+rbp]
-  movdqa      xmm7,[rsp+50h]
-  pandn       xmm15,[rsp+60h]
-  lea         r11,[rsp+1D8h]
-  pxor        xmm0,xmm0
-  por         xmm1,xmm15
-  movaps      xmm15,[r11-0A8h]
-  movdqa      [rdi],xmm9
-  movaps      xmm9,[r11-48h]
-  punpckhbw   xmm2,xmm0
-  psllw       xmm2,1
-  paddw       xmm2,xmm7
-  paddw       xmm2,xmm7
-  movdqa      [rbx],xmm6
-  movaps      xmm6,[r11-18h]
-  paddw       xmm2,xmm7
-  paddw       xmm2,xmm11
-  movaps      xmm11,[r11-68h]
-  paddw       xmm2,xmm12
-  movaps      xmm12,[r11-78h]
-  paddw       xmm2,xmm14
-  paddw       xmm2,xmm10
-  psraw       xmm2,3
-  movaps      xmm10,[r11-58h]
-  movaps      xmm14,[r11-98h]
-  movdqa      xmm0,xmm13
-  pand        xmm2,xmm8
-  pandn       xmm8,xmm7
-  pandn       xmm13,xmm7
-  por         xmm2,xmm8
-  movaps      xmm7,[r11-28h]
-  movaps      xmm8,[r11-38h]
-  movdqa      [r8],xmm5
-  pand        xmm0,xmm2
-  por         xmm0,xmm13
-  packuswb    xmm1,xmm0
-  movaps      xmm13,[r11-88h]
-  movdqa      [rbp],xmm4
-  movdqa      [r10+rbp],xmm3
-  movdqa      [rsi+rbp],xmm1
-  mov         rsp,r11
-  pop         rdi
-  pop         rsi
-  pop         rbp
-  pop         rbx
-  ret
+    mov         rax,rsp
+    push        rbx
+    push        rbp
+    push        rsi
+    push        rdi
+    sub         rsp,1D8h
+    movaps      [rax-38h],xmm6
+    movaps      [rax-48h],xmm7
+    movaps      [rax-58h],xmm8
+    pxor        xmm1,xmm1
+    movsxd      r10,edx
+    mov         rbp,rcx
+    mov         r11d,r8d
+    mov         rdx,rcx
+    mov         rdi,rbp
+    mov         rbx,rbp
+    movdqa      xmm5,[rbp]
+    movaps      [rax-68h],xmm9
+    movaps      [rax-78h],xmm10
+    punpcklbw   xmm5,xmm1
+    movaps      [rax-88h],xmm11
+    movaps      [rax-98h],xmm12
+    movaps      [rax-0A8h],xmm13
+    movaps      [rax-0B8h],xmm14
+    movdqa      xmm14,[r10+rbp]
+    movaps      [rax-0C8h],xmm15
+    lea         eax,[r10*4]
+    movsxd      r8,eax
+    lea         eax,[r10+r10*2]
+    movsxd      rcx,eax
+    lea         eax,[r10+r10]
+    sub         rdx,r8
+    punpcklbw   xmm14,xmm1
+    movdqa      [rsp+90h],xmm5
+    movdqa      [rsp+30h],xmm14
+    movsxd      rsi,eax
+    movsx       eax,r11w
+    sub         rdi,rcx
+    sub         rbx,rsi
+    mov         r8,rbp
+    sub         r8,r10
+    movd        xmm0,eax
+    movsx       eax,r9w
+    movdqa      xmm12,[rdi]
+    movdqa      xmm6, [rsi+rbp]
+    movdqa      xmm13,[rbx]
+    punpcklwd   xmm0,xmm0
+    pshufd      xmm11,xmm0,0
+    punpcklbw   xmm13,xmm1
+    punpcklbw   xmm6,xmm1
+    movdqa      xmm8,[r8]
+    movd        xmm0,eax
+    movdqa      xmm10,xmm11
+    mov         eax,2
+    punpcklbw   xmm8,xmm1
+    punpcklbw   xmm12,xmm1
+    cwde
+    punpcklwd   xmm0,xmm0
+    psraw       xmm10,2
+    movdqa      xmm1,xmm8
+    movdqa      [rsp+0F0h],xmm13
+    movdqa      [rsp+0B0h],xmm8
+    pshufd      xmm7,xmm0,0
+    psubw       xmm1,xmm13
+    movdqa      xmm0,xmm5
+    movdqa      xmm4,xmm7
+    movdqa      xmm2,xmm7
+    psubw       xmm0,xmm8
+    pabsw       xmm3,xmm0
+    pabsw       xmm0,xmm1
+    movdqa      xmm1,xmm5
+    movdqa      [rsp+40h],xmm7
+    movdqa      [rsp+60h],xmm6
+    pcmpgtw     xmm4,xmm0
+    psubw       xmm1,xmm14
+    pabsw       xmm0,xmm1
+    pcmpgtw     xmm2,xmm0
+    pand        xmm4,xmm2
+    movdqa      xmm0,xmm11
+    pcmpgtw     xmm0,xmm3
+    pand        xmm4,xmm0
+    movd        xmm0,eax
+    movdqa      [rsp+20h],xmm4
+    punpcklwd   xmm0,xmm0
+    pshufd      xmm2,xmm0,0
+    paddw       xmm10,xmm2
+    movdqa      [rsp+0A0h],xmm2
+    movdqa      xmm15,xmm7
+    pxor        xmm4,xmm4
+    movdqa      xmm0,xmm8
+    psubw       xmm0,xmm12
+    mov         eax,4
+    pabsw       xmm0,xmm0
+    movdqa      xmm1,xmm10
+    cwde
+    pcmpgtw     xmm15,xmm0
+    pcmpgtw     xmm1,xmm3
+    movdqa      xmm3,xmm7
+    movdqa      xmm7,[rdx]
+    movdqa      xmm0,xmm5
+    psubw       xmm0,xmm6
+    pand        xmm15,xmm1
+    punpcklbw   xmm7,xmm4
+    movdqa      xmm9,xmm15
+    pabsw       xmm0,xmm0
+    psllw       xmm7,1
+    pandn       xmm9,xmm12
+    pcmpgtw     xmm3,xmm0
+    paddw       xmm7,xmm12
+    movd        xmm0,eax
+    pand        xmm3,xmm1
+    paddw       xmm7,xmm12
+    punpcklwd   xmm0,xmm0
+    paddw       xmm7,xmm12
+    pshufd      xmm1,xmm0,0
+    paddw       xmm7,xmm13
+    movdqa      xmm0,xmm3
+    pandn       xmm0,xmm6
+    paddw       xmm7,xmm8
+    movdqa      [rsp+70h],xmm1
+    paddw       xmm7,xmm5
+    movdqa      [rsp+120h],xmm0
+    movdqa      xmm0,[rcx+rbp]
+    punpcklbw   xmm0,xmm4
+    paddw       xmm7,xmm1
+    movdqa      xmm4,xmm15
+    psllw       xmm0,1
+    psraw       xmm7,3
+    paddw       xmm0,xmm6
+    pand        xmm7,xmm15
+    paddw       xmm0,xmm6
+    paddw       xmm0,xmm6
+    paddw       xmm0,xmm14
+    movdqa      xmm6,xmm15
+    paddw       xmm0,xmm5
+    pandn       xmm6,xmm13
+    paddw       xmm0,xmm8
+    paddw       xmm0,xmm1
+    psraw       xmm0,3
+    movdqa      xmm1,xmm12
+    paddw       xmm1,xmm13
+    pand        xmm0,xmm3
+    movdqa      [rsp+100h],xmm0
+    movdqa      xmm0,xmm8
+    paddw       xmm0,xmm5
+    paddw       xmm1,xmm0
+    movdqa      xmm0,xmm3
+    paddw       xmm1,xmm2
+    psraw       xmm1,2
+    pandn       xmm0,xmm14
+    pand        xmm4,xmm1
+    movdqa      [rsp+0E0h],xmm0
+    movdqa      xmm0,xmm5
+    paddw       xmm0,xmm8
+    movdqa      xmm1,[rsp+60h]
+    paddw       xmm1,xmm14
+    movdqa      xmm14,xmm3
+    paddw       xmm1,xmm0
+    movdqa      xmm0,xmm8
+    paddw       xmm0,[rsp+30h]
+    paddw       xmm1,xmm2
+    psraw       xmm1,2
+    pand        xmm14,xmm1
+    movdqa      xmm1,xmm13
+    paddw       xmm1,xmm13
+    paddw       xmm1,xmm0
+    paddw       xmm1,xmm2
+    psraw       xmm1,2
+    movdqa      xmm0,[rsp+30h]
+    movdqa      xmm2,xmm13
+    movdqa      xmm5,xmm15
+    paddw       xmm0,[rsp+70h]
+    pandn       xmm5,xmm1
+    paddw       xmm2,xmm8
+    movdqa      xmm8,[rsp+90h]
+    movdqa      xmm1,xmm12
+    paddw       xmm2,xmm8
+    psllw       xmm2,1
+    paddw       xmm2,xmm0
+    paddw       xmm1,xmm2
+    movdqa      xmm0,xmm8
+    movdqa      xmm8,xmm3
+    movdqa      xmm2,[rsp+30h]
+    paddw       xmm0,xmm13
+    psraw       xmm1,3
+    pand        xmm15,xmm1
+    movdqa      xmm1,xmm2
+    paddw       xmm1,xmm2
+    paddw       xmm2,[rsp+90h]
+    paddw       xmm2,[rsp+0B0h]
+    paddw       xmm1,xmm0
+    movdqa      xmm0,xmm13
+    movdqa      xmm13,[r8]
+    paddw       xmm0, [rsp+70h]
+    paddw       xmm1, [rsp+0A0h]
+    psllw       xmm2,1
+    paddw       xmm2,xmm0
+    psraw       xmm1,2
+    movdqa      xmm0, [rdi]
+    pandn       xmm8,xmm1
+    movdqa      xmm1, [rsp+60h]
+    paddw       xmm1,xmm2
+    movdqa      xmm2, [rbx]
+    psraw       xmm1,3
+    pand        xmm3,xmm1
+    movdqa      xmm1, [rbp]
+    movdqa      [rsp+0D0h],xmm3
+    pxor        xmm3,xmm3
+    punpckhbw   xmm0,xmm3
+    punpckhbw   xmm1,xmm3
+    punpckhbw   xmm13,xmm3
+    movdqa      [rsp+0C0h],xmm0
+    movdqa      xmm0,[r10+rbp]
+    movdqa      [rsp],xmm1
+    punpckhbw   xmm0,xmm3
+    punpckhbw   xmm2,xmm3
+    movdqa      [rsp+80h],xmm0
+    movdqa      xmm0,[rsi+rbp]
+    movdqa      [rsp+10h],xmm13
+    punpckhbw   xmm0,xmm3
+    movdqa      [rsp+50h],xmm0
+    movdqa      xmm0,xmm1
+    movdqa      xmm1,xmm13
+    psubw       xmm0,xmm13
+    psubw       xmm1,xmm2
+    pabsw       xmm3,xmm0
+    pabsw       xmm0,xmm1
+    movdqa      xmm1,[rsp]
+    movdqa      xmm13,[rsp+40h]
+    movdqa      [rsp+110h],xmm2
+    psubw       xmm1, [rsp+80h]
+    pcmpgtw     xmm13,xmm0
+    pcmpgtw     xmm11,xmm3
+    pabsw       xmm0,xmm1
+    pcmpgtw     xmm10,xmm3
+    movdqa      xmm1, [rsp+40h]
+    movdqa      xmm2,xmm1
+    movdqa      xmm3,xmm1
+    pcmpgtw     xmm2,xmm0
+    movdqa      xmm0, [rsp+10h]
+    pand        xmm13,xmm2
+    pand        xmm13,xmm11
+    movdqa      xmm11,[rsp+0C0h]
+    psubw       xmm0,xmm11
+    pabsw       xmm0,xmm0
+    pcmpgtw     xmm3,xmm0
+    pand        xmm3,xmm10
+    movdqa      xmm0,[rsp]
+    psubw       xmm0,[rsp+50h]
+    movdqa      xmm2,[rdx]
+    pabsw       xmm0,xmm0
+    por         xmm7,xmm9
+    movdqa      xmm9,[rsp+20h]
+    pcmpgtw     xmm1,xmm0
+    pand        xmm9,xmm7
+    movdqa      xmm7,[rsp+20h]
+    movdqa      xmm0,xmm7
+    pandn       xmm0,xmm12
+    movdqa      xmm12,[rsp+110h]
+    pand        xmm1,xmm10
+    movdqa      xmm10,[rsp+70h]
+    movdqa      [rsp+40h],xmm1
+    movdqa      xmm1,xmm13
+    por         xmm9,xmm0
+    pxor        xmm0,xmm0
+    por         xmm4,xmm6
+    movdqa      xmm6,xmm7
+    punpckhbw   xmm2,xmm0
+    por         xmm15,xmm5
+    movdqa      xmm5,[rsp+20h]
+    movdqa      xmm0,xmm3
+    psllw       xmm2,1
+    pandn       xmm0,xmm11
+    pand        xmm6,xmm4
+    movdqa      xmm4,[rsp]
+    paddw       xmm2,xmm11
+    pand        xmm5,xmm15
+    movdqa      xmm15,[rsp+20h]
+    paddw       xmm2,xmm11
+    paddw       xmm2,xmm11
+    paddw       xmm2,xmm12
+    paddw       xmm2,[rsp+10h]
+    paddw       xmm2,[rsp]
+    paddw       xmm2,xmm10
+    psraw       xmm2,3
+    pand        xmm2,xmm3
+    por         xmm2,xmm0
+    pand        xmm1,xmm2
+    movdqa      xmm0,xmm13
+    movdqa      xmm2,xmm11
+    pandn       xmm0,xmm11
+    paddw       xmm2,xmm12
+    por         xmm1,xmm0
+    packuswb    xmm9,xmm1
+    movdqa      xmm0,xmm7
+    movdqa      xmm7,[rsp+0A0h]
+    pandn       xmm0,[rsp+0F0h]
+    movdqa      xmm1,xmm3
+    por         xmm6,xmm0
+    movdqa      xmm0,[rsp+10h]
+    paddw       xmm0,xmm4
+    paddw       xmm2,xmm0
+    paddw       xmm2,xmm7
+    movdqa      xmm0,xmm3
+    pandn       xmm0,xmm12
+    psraw       xmm2,2
+    pand        xmm1,xmm2
+    por         xmm1,xmm0
+    movdqa      xmm2,xmm13
+    movdqa      xmm0,xmm13
+    pand        xmm2,xmm1
+    pandn       xmm0,xmm12
+    movdqa      xmm1,xmm12
+    paddw       xmm1,[rsp+10h]
+    por         xmm2,xmm0
+    movdqa      xmm0,xmm15
+    pandn       xmm0,[rsp+0B0h]
+    paddw       xmm1,xmm4
+    packuswb    xmm6,xmm2
+    movdqa      xmm2,xmm3
+    psllw       xmm1,1
+    por         xmm5,xmm0
+    movdqa      xmm0,[rsp+80h]
+    paddw       xmm0,xmm10
+    paddw       xmm1,xmm0
+    paddw       xmm11,xmm1
+    psraw       xmm11,3
+    movdqa      xmm1,xmm12
+    pand        xmm2,xmm11
+    paddw       xmm1,xmm12
+    movdqa      xmm11,[rsp+80h]
+    movdqa      xmm0, [rsp+10h]
+    por         xmm14,[rsp+0E0h]
+    paddw       xmm0,xmm11
+    movdqa      xmm4,xmm15
+    paddw       xmm1,xmm0
+    movdqa      xmm0,xmm13
+    paddw       xmm1,xmm7
+    psraw       xmm1,2
+    pandn       xmm3,xmm1
+    por         xmm2,xmm3
+    movdqa      xmm1,xmm13
+    movdqa      xmm3,[rsp+10h]
+    pandn       xmm0,xmm3
+    pand        xmm1,xmm2
+    movdqa      xmm2,xmm11
+    paddw       xmm2,[rsp]
+    por         xmm1,xmm0
+    movdqa      xmm0,[rsp+0D0h]
+    por         xmm0,xmm8
+    paddw       xmm2,xmm3
+    packuswb    xmm5,xmm1
+    movdqa      xmm8,[rsp+40h]
+    movdqa      xmm1,[rsp+50h]
+    movdqa      xmm3,xmm8
+    pand        xmm4,xmm0
+    psllw       xmm2,1
+    movdqa      xmm0,xmm15
+    pandn       xmm0,[rsp+90h]
+    por         xmm4,xmm0
+    movdqa      xmm0,xmm12
+    paddw       xmm0,xmm10
+    paddw       xmm2,xmm0
+    paddw       xmm1,xmm2
+    movdqa      xmm0,[rsp]
+    movdqa      xmm2,xmm11
+    paddw       xmm0,xmm12
+    movdqa      xmm12,[rsp]
+    paddw       xmm2,xmm11
+    paddw       xmm2,xmm0
+    psraw       xmm1,3
+    movdqa      xmm0,xmm8
+    pand        xmm3,xmm1
+    paddw       xmm2,xmm7
+    movdqa      xmm1,xmm13
+    psraw       xmm2,2
+    pandn       xmm0,xmm2
+    por         xmm3,xmm0
+    movdqa      xmm2,[rsp+50h]
+    movdqa      xmm0,xmm13
+    pandn       xmm0,xmm12
+    pand        xmm1,xmm3
+    paddw       xmm2,xmm11
+    movdqa      xmm3,xmm15
+    por         xmm1,xmm0
+    pand        xmm3,xmm14
+    movdqa      xmm14,[rsp+10h]
+    movdqa      xmm0,xmm15
+    pandn       xmm0,[rsp+30h]
+    packuswb    xmm4,xmm1
+    movdqa      xmm1,xmm8
+    por         xmm3,xmm0
+    movdqa      xmm0,xmm12
+    paddw       xmm0,xmm14
+    paddw       xmm2,xmm0
+    paddw       xmm2,xmm7
+    movdqa      xmm0,xmm8
+    pandn       xmm0,xmm11
+    psraw       xmm2,2
+    pand        xmm1,xmm2
+    por         xmm1,xmm0
+    movdqa      xmm2,xmm13
+    movdqa      xmm0,xmm13
+    pandn       xmm0,xmm11
+    pand        xmm2,xmm1
+    movdqa      xmm1,xmm15
+    por         xmm2,xmm0
+    packuswb    xmm3,xmm2
+    movdqa      xmm0,[rsp+100h]
+    por         xmm0,[rsp+120h]
+    pand        xmm1,xmm0
+    movdqa      xmm2,[rcx+rbp]
+    movdqa      xmm7,[rsp+50h]
+    pandn       xmm15,[rsp+60h]
+    lea         r11,[rsp+1D8h]
+    pxor        xmm0,xmm0
+    por         xmm1,xmm15
+    movaps      xmm15,[r11-0A8h]
+    movdqa      [rdi],xmm9
+    movaps      xmm9,[r11-48h]
+    punpckhbw   xmm2,xmm0
+    psllw       xmm2,1
+    paddw       xmm2,xmm7
+    paddw       xmm2,xmm7
+    movdqa      [rbx],xmm6
+    movaps      xmm6,[r11-18h]
+    paddw       xmm2,xmm7
+    paddw       xmm2,xmm11
+    movaps      xmm11,[r11-68h]
+    paddw       xmm2,xmm12
+    movaps      xmm12,[r11-78h]
+    paddw       xmm2,xmm14
+    paddw       xmm2,xmm10
+    psraw       xmm2,3
+    movaps      xmm10,[r11-58h]
+    movaps      xmm14,[r11-98h]
+    movdqa      xmm0,xmm13
+    pand        xmm2,xmm8
+    pandn       xmm8,xmm7
+    pandn       xmm13,xmm7
+    por         xmm2,xmm8
+    movaps      xmm7,[r11-28h]
+    movaps      xmm8,[r11-38h]
+    movdqa      [r8],xmm5
+    pand        xmm0,xmm2
+    por         xmm0,xmm13
+    packuswb    xmm1,xmm0
+    movaps      xmm13,[r11-88h]
+    movdqa      [rbp],xmm4
+    movdqa      [r10+rbp],xmm3
+    movdqa      [rsi+rbp],xmm1
+    mov         rsp,r11
+    pop         rdi
+    pop         rsi
+    pop         rbp
+    pop         rbx
+    ret
 
 
 WELS_EXTERN  DeblockChromaLt4V_ssse3
-  mov         rax,rsp
-  push        rbx
-  push        rdi
-  PUSH_XMM 16
-  sub         rsp,0C8h
-  mov         r10,qword [rax + 30h]  ; pTC
-  pxor        xmm1,xmm1
-  mov         rbx,rcx
-  movsxd      r11,r8d
-  movsx       ecx,byte [r10]
-  movsx       r8d,byte [r10+2]
-  mov         rdi,rdx
-  movq        xmm2,[rbx]
-  movq        xmm9,[r11+rbx]
-  movsx       edx,byte [r10+1]
-  mov         word [rsp+2],cx
-  mov         word [rsp],cx
-  movsx       eax,byte [r10+3]
-  mov         word [rsp+6],dx
-  mov         word [rsp+4],dx
-  movdqa      xmm11,xmm1
-  mov         word [rsp+0Eh],ax
-  mov         word [rsp+0Ch],ax
-  lea         eax,[r11+r11]
-  movsxd      rcx,eax
-  mov         rax,rbx
-  mov         rdx,rdi
-  sub         rax,rcx
-  mov         word [rsp+0Ah],r8w
-  mov         word [rsp+8],r8w
-  movdqa      xmm6,[rsp]
-  movdqa      xmm7,xmm6
-  movq        xmm13, [rax]
-  mov         rax,rdi
-  sub         rax,rcx
-  mov         rcx,rbx
-  pcmpgtw     xmm7,xmm1
-  psubw       xmm11,xmm6
-  sub         rcx,r11
-  sub         rdx,r11
-  movq        xmm0,[rax]
-  movsx       eax,r9w
-  movq        xmm15,[rcx]
-  punpcklqdq  xmm13,xmm0
-  movq        xmm0, [rdx]
-  movdqa      xmm4,xmm13
-  punpcklqdq  xmm15,xmm0
-  movq        xmm0, [rdi]
-  punpcklbw   xmm4,xmm1
-  movdqa      xmm12,xmm15
-  punpcklqdq  xmm2,xmm0
-  movq        xmm0, [r11+rdi]
-  punpcklbw   xmm12,xmm1
-  movdqa      xmm14,xmm2
-  punpcklqdq  xmm9,xmm0
-  punpckhbw   xmm2,xmm1
-  punpcklbw   xmm14,xmm1
-  movd        xmm0,eax
-  movsx       eax,word [rsp + 0C8h + 38h + 160] ; iBeta
-  punpckhbw   xmm13,xmm1
-  punpckhbw   xmm15,xmm1
-  movdqa      xmm3,xmm9
-  movdqa      [rsp+10h],xmm2
-  punpcklwd   xmm0,xmm0
-  punpckhbw   xmm9,xmm1
-  punpcklbw   xmm3,xmm1
-  movdqa      xmm1,xmm14
-  pshufd      xmm10,xmm0,0
-  movd        xmm0,eax
-  mov         eax,4
-  cwde
-  punpcklwd   xmm0,xmm0
-  pshufd      xmm8,xmm0,0
-  movd        xmm0,eax
-  punpcklwd   xmm0,xmm0
-  pshufd      xmm5,xmm0,0
-  psubw       xmm1,xmm12
-  movdqa      xmm2,xmm10
-  lea         r11,[rsp+0C8h]
-  psllw       xmm1,2
-  movdqa      xmm0,xmm4
-  psubw       xmm4,xmm12
-  psubw       xmm0,xmm3
-  psubw       xmm3,xmm14
-  paddw       xmm1,xmm0
-  paddw       xmm1,xmm5
-  movdqa      xmm0,xmm11
-  psraw       xmm1,3
-  pmaxsw      xmm0,xmm1
-  pminsw      xmm6,xmm0
-  movdqa      xmm1,xmm8
-  movdqa      xmm0,xmm12
-  psubw       xmm0,xmm14
-  pabsw       xmm0,xmm0
-  pcmpgtw     xmm2,xmm0
-  pabsw       xmm0,xmm4
-  pcmpgtw     xmm1,xmm0
-  pabsw       xmm0,xmm3
-  movdqa      xmm3,[rsp]
-  pand        xmm2,xmm1
-  movdqa      xmm1,xmm8
-  pcmpgtw     xmm1,xmm0
-  movdqa      xmm0,xmm13
-  pand        xmm2,xmm1
-  psubw       xmm0,xmm9
-  psubw       xmm13,xmm15
-  pand        xmm2,xmm7
-  pand        xmm6,xmm2
-  paddw       xmm12,xmm6
-  psubw       xmm14,xmm6
-  movdqa      xmm2,[rsp+10h]
-  movaps      xmm6,[r11-18h]
-  movdqa      xmm1,xmm2
-  psubw       xmm1,xmm15
-  psubw       xmm9,xmm2
-  psllw       xmm1,2
-  paddw       xmm1,xmm0
-  paddw       xmm1,xmm5
-  movdqa      xmm0,xmm15
-  psubw       xmm0,xmm2
-  psraw       xmm1,3
-  pmaxsw      xmm11,xmm1
-  pabsw       xmm0,xmm0
-  movdqa      xmm1,xmm8
-  pcmpgtw     xmm10,xmm0
-  pabsw       xmm0,xmm13
-  pminsw      xmm3,xmm11
-  movaps      xmm11,[r11-68h]
-  movaps      xmm13,[rsp+40h]
-  pcmpgtw     xmm1,xmm0
-  pabsw       xmm0,xmm9
-  movaps      xmm9, [r11-48h]
-  pand        xmm10,xmm1
-  pcmpgtw     xmm8,xmm0
-  pand        xmm10,xmm8
-  pand        xmm10,xmm7
-  movaps      xmm8,[r11-38h]
-  movaps      xmm7,[r11-28h]
-  pand        xmm3,xmm10
-  paddw       xmm15,xmm3
-  psubw       xmm2,xmm3
-  movaps      xmm10,[r11-58h]
-  packuswb    xmm12,xmm15
-  movaps      xmm15,[rsp+20h]
-  packuswb    xmm14,xmm2
-  movq        [rcx],xmm12
-  movq        [rbx],xmm14
-  psrldq      xmm12,8
-  psrldq      xmm14,8
-  movq        [rdx],xmm12
-  movaps      xmm12,[r11-78h]
-  movq        [rdi],xmm14
-  movaps      xmm14,[rsp+30h]
-  mov         rsp,r11
-  POP_XMM
-  pop         rdi
-  pop         rbx
-  ret
+    mov         rax,rsp
+    push        rbx
+    push        rdi
+    PUSH_XMM 16
+    sub         rsp,0C8h
+    mov         r10,qword [rax + 30h]  ; pTC
+    pxor        xmm1,xmm1
+    mov         rbx,rcx
+    movsxd      r11,r8d
+    movsx       ecx,byte [r10]
+    movsx       r8d,byte [r10+2]
+    mov         rdi,rdx
+    movq        xmm2,[rbx]
+    movq        xmm9,[r11+rbx]
+    movsx       edx,byte [r10+1]
+    mov         word [rsp+2],cx
+    mov         word [rsp],cx
+    movsx       eax,byte [r10+3]
+    mov         word [rsp+6],dx
+    mov         word [rsp+4],dx
+    movdqa      xmm11,xmm1
+    mov         word [rsp+0Eh],ax
+    mov         word [rsp+0Ch],ax
+    lea         eax,[r11+r11]
+    movsxd      rcx,eax
+    mov         rax,rbx
+    mov         rdx,rdi
+    sub         rax,rcx
+    mov         word [rsp+0Ah],r8w
+    mov         word [rsp+8],r8w
+    movdqa      xmm6,[rsp]
+    movdqa      xmm7,xmm6
+    movq        xmm13, [rax]
+    mov         rax,rdi
+    sub         rax,rcx
+    mov         rcx,rbx
+    pcmpgtw     xmm7,xmm1
+    psubw       xmm11,xmm6
+    sub         rcx,r11
+    sub         rdx,r11
+    movq        xmm0,[rax]
+    movsx       eax,r9w
+    movq        xmm15,[rcx]
+    punpcklqdq  xmm13,xmm0
+    movq        xmm0, [rdx]
+    movdqa      xmm4,xmm13
+    punpcklqdq  xmm15,xmm0
+    movq        xmm0, [rdi]
+    punpcklbw   xmm4,xmm1
+    movdqa      xmm12,xmm15
+    punpcklqdq  xmm2,xmm0
+    movq        xmm0, [r11+rdi]
+    punpcklbw   xmm12,xmm1
+    movdqa      xmm14,xmm2
+    punpcklqdq  xmm9,xmm0
+    punpckhbw   xmm2,xmm1
+    punpcklbw   xmm14,xmm1
+    movd        xmm0,eax
+    movsx       eax,word [rsp + 0C8h + 38h + 160] ; iBeta
+    punpckhbw   xmm13,xmm1
+    punpckhbw   xmm15,xmm1
+    movdqa      xmm3,xmm9
+    movdqa      [rsp+10h],xmm2
+    punpcklwd   xmm0,xmm0
+    punpckhbw   xmm9,xmm1
+    punpcklbw   xmm3,xmm1
+    movdqa      xmm1,xmm14
+    pshufd      xmm10,xmm0,0
+    movd        xmm0,eax
+    mov         eax,4
+    cwde
+    punpcklwd   xmm0,xmm0
+    pshufd      xmm8,xmm0,0
+    movd        xmm0,eax
+    punpcklwd   xmm0,xmm0
+    pshufd      xmm5,xmm0,0
+    psubw       xmm1,xmm12
+    movdqa      xmm2,xmm10
+    lea         r11,[rsp+0C8h]
+    psllw       xmm1,2
+    movdqa      xmm0,xmm4
+    psubw       xmm4,xmm12
+    psubw       xmm0,xmm3
+    psubw       xmm3,xmm14
+    paddw       xmm1,xmm0
+    paddw       xmm1,xmm5
+    movdqa      xmm0,xmm11
+    psraw       xmm1,3
+    pmaxsw      xmm0,xmm1
+    pminsw      xmm6,xmm0
+    movdqa      xmm1,xmm8
+    movdqa      xmm0,xmm12
+    psubw       xmm0,xmm14
+    pabsw       xmm0,xmm0
+    pcmpgtw     xmm2,xmm0
+    pabsw       xmm0,xmm4
+    pcmpgtw     xmm1,xmm0
+    pabsw       xmm0,xmm3
+    movdqa      xmm3,[rsp]
+    pand        xmm2,xmm1
+    movdqa      xmm1,xmm8
+    pcmpgtw     xmm1,xmm0
+    movdqa      xmm0,xmm13
+    pand        xmm2,xmm1
+    psubw       xmm0,xmm9
+    psubw       xmm13,xmm15
+    pand        xmm2,xmm7
+    pand        xmm6,xmm2
+    paddw       xmm12,xmm6
+    psubw       xmm14,xmm6
+    movdqa      xmm2,[rsp+10h]
+    movaps      xmm6,[r11-18h]
+    movdqa      xmm1,xmm2
+    psubw       xmm1,xmm15
+    psubw       xmm9,xmm2
+    psllw       xmm1,2
+    paddw       xmm1,xmm0
+    paddw       xmm1,xmm5
+    movdqa      xmm0,xmm15
+    psubw       xmm0,xmm2
+    psraw       xmm1,3
+    pmaxsw      xmm11,xmm1
+    pabsw       xmm0,xmm0
+    movdqa      xmm1,xmm8
+    pcmpgtw     xmm10,xmm0
+    pabsw       xmm0,xmm13
+    pminsw      xmm3,xmm11
+    movaps      xmm11,[r11-68h]
+    movaps      xmm13,[rsp+40h]
+    pcmpgtw     xmm1,xmm0
+    pabsw       xmm0,xmm9
+    movaps      xmm9, [r11-48h]
+    pand        xmm10,xmm1
+    pcmpgtw     xmm8,xmm0
+    pand        xmm10,xmm8
+    pand        xmm10,xmm7
+    movaps      xmm8,[r11-38h]
+    movaps      xmm7,[r11-28h]
+    pand        xmm3,xmm10
+    paddw       xmm15,xmm3
+    psubw       xmm2,xmm3
+    movaps      xmm10,[r11-58h]
+    packuswb    xmm12,xmm15
+    movaps      xmm15,[rsp+20h]
+    packuswb    xmm14,xmm2
+    movq        [rcx],xmm12
+    movq        [rbx],xmm14
+    psrldq      xmm12,8
+    psrldq      xmm14,8
+    movq        [rdx],xmm12
+    movaps      xmm12,[r11-78h]
+    movq        [rdi],xmm14
+    movaps      xmm14,[rsp+30h]
+    mov         rsp,r11
+    POP_XMM
+    pop         rdi
+    pop         rbx
+    ret
 
 
 WELS_EXTERN   DeblockChromaEq4V_ssse3
-  mov         rax,rsp
-  push        rbx
-  PUSH_XMM 15
-  sub         rsp,90h
-  pxor        xmm1,xmm1
-  mov         r11,rcx
-  mov         rbx,rdx
-  mov         r10d,r9d
-  movq        xmm13,[r11]
-  lea         eax,[r8+r8]
-  movsxd      r9,eax
-  mov         rax,rcx
-  sub         rax,r9
-  movq        xmm14,[rax]
-  mov         rax,rdx
-  sub         rax,r9
-  movq        xmm0,[rax]
-  movsxd      rax,r8d
-  sub         rcx,rax
-  sub         rdx,rax
-  movq        xmm12,[rax+r11]
-  movq        xmm10,[rcx]
-  punpcklqdq  xmm14,xmm0
-  movdqa      xmm8,xmm14
-  movq        xmm0,[rdx]
-  punpcklbw   xmm8,xmm1
-  punpckhbw   xmm14,xmm1
-  punpcklqdq  xmm10,xmm0
-  movq        xmm0,[rbx]
-  movdqa      xmm5,xmm10
-  punpcklqdq  xmm13,xmm0
-  movq        xmm0, [rax+rbx]
-  punpcklbw   xmm5,xmm1
-  movsx       eax,r10w
-  movdqa      xmm9,xmm13
-  punpcklqdq  xmm12,xmm0
-  punpcklbw   xmm9,xmm1
-  punpckhbw   xmm10,xmm1
-  movd        xmm0,eax
-  movsx       eax,word [rsp + 90h + 8h + 28h + 144]   ; iBeta
-  punpckhbw   xmm13,xmm1
-  movdqa      xmm7,xmm12
-  punpcklwd   xmm0,xmm0
-  punpckhbw   xmm12,xmm1
-  pshufd      xmm11,xmm0,0
-  punpcklbw   xmm7,xmm1
-  movd        xmm0,eax
-  movdqa      xmm1,xmm8
-  psubw       xmm1,xmm5
-  punpcklwd   xmm0,xmm0
-  movdqa      xmm6,xmm11
-  pshufd      xmm3,xmm0,0
-  movdqa      xmm0,xmm5
-  psubw       xmm0,xmm9
-  movdqa      xmm2,xmm3
-  pabsw       xmm0,xmm0
-  pcmpgtw     xmm6,xmm0
-  pabsw       xmm0,xmm1
-  movdqa      xmm1,xmm3
-  pcmpgtw     xmm2,xmm0
-  pand        xmm6,xmm2
-  movdqa      xmm0,xmm7
-  movdqa      xmm2,xmm3
-  psubw       xmm0,xmm9
-  pabsw       xmm0,xmm0
-  pcmpgtw     xmm1,xmm0
-  pand        xmm6,xmm1
-  movdqa      xmm0,xmm10
-  movdqa      xmm1,xmm14
-  psubw       xmm0,xmm13
-  psubw       xmm1,xmm10
-  pabsw       xmm0,xmm0
-  pcmpgtw     xmm11,xmm0
-  pabsw       xmm0,xmm1
-  pcmpgtw     xmm2,xmm0
-  pand        xmm11,xmm2
-  movdqa      xmm0,xmm12
-  movdqa      xmm4,xmm6
-  movdqa      xmm1,xmm8
-  mov         eax,2
-  cwde
-  paddw       xmm1,xmm8
-  psubw       xmm0,xmm13
-  paddw       xmm1,xmm5
-  pabsw       xmm0,xmm0
-  movdqa      xmm2,xmm14
-  paddw       xmm1,xmm7
-  pcmpgtw     xmm3,xmm0
-  paddw       xmm2,xmm14
-  movd        xmm0,eax
-  pand        xmm11,xmm3
-  paddw       xmm7,xmm7
-  paddw       xmm2,xmm10
-  punpcklwd   xmm0,xmm0
-  paddw       xmm2,xmm12
-  paddw       xmm12,xmm12
-  pshufd      xmm3,xmm0,0
-  paddw       xmm7,xmm9
-  paddw       xmm12,xmm13
-  movdqa      xmm0,xmm6
-  paddw       xmm1,xmm3
-  pandn       xmm0,xmm5
-  paddw       xmm7,xmm8
-  psraw       xmm1,2
-  paddw       xmm12,xmm14
-  paddw       xmm7,xmm3
-  movaps      xmm14,[rsp]
-  pand        xmm4,xmm1
-  paddw       xmm12,xmm3
-  psraw       xmm7,2
-  movdqa      xmm1,xmm11
-  por         xmm4,xmm0
-  psraw       xmm12,2
-  paddw       xmm2,xmm3
-  movdqa      xmm0,xmm11
-  pandn       xmm0,xmm10
-  psraw       xmm2,2
-  pand        xmm1,xmm2
-  por         xmm1,xmm0
-  packuswb    xmm4,xmm1
-  movdqa      xmm0,xmm11
-  movdqa      xmm1,xmm6
-  pand        xmm1,xmm7
-  movaps      xmm7,[rsp+70h]
-  movq        [rcx],xmm4
-  pandn       xmm6,xmm9
-  pandn       xmm11,xmm13
-  pand        xmm0,xmm12
-  por         xmm1,xmm6
-  por         xmm0,xmm11
-  psrldq      xmm4,8
-  packuswb    xmm1,xmm0
-  movq        [r11],xmm1
-  psrldq      xmm1,8
-  movq        [rdx],xmm4
-  lea         r11,[rsp+90h]
-  movaps      xmm6,[r11-10h]
-  movaps      xmm8,[r11-30h]
-  movaps      xmm9,[r11-40h]
-  movq        [rbx],xmm1
-  movaps      xmm10,[r11-50h]
-  movaps      xmm11,[r11-60h]
-  movaps      xmm12,[r11-70h]
-  movaps      xmm13,[r11-80h]
-  mov         rsp,r11
-  POP_XMM
-  pop         rbx
-  ret
+    mov         rax,rsp
+    push        rbx
+    PUSH_XMM 15
+    sub         rsp,90h
+    pxor        xmm1,xmm1
+    mov         r11,rcx
+    mov         rbx,rdx
+    mov         r10d,r9d
+    movq        xmm13,[r11]
+    lea         eax,[r8+r8]
+    movsxd      r9,eax
+    mov         rax,rcx
+    sub         rax,r9
+    movq        xmm14,[rax]
+    mov         rax,rdx
+    sub         rax,r9
+    movq        xmm0,[rax]
+    movsxd      rax,r8d
+    sub         rcx,rax
+    sub         rdx,rax
+    movq        xmm12,[rax+r11]
+    movq        xmm10,[rcx]
+    punpcklqdq  xmm14,xmm0
+    movdqa      xmm8,xmm14
+    movq        xmm0,[rdx]
+    punpcklbw   xmm8,xmm1
+    punpckhbw   xmm14,xmm1
+    punpcklqdq  xmm10,xmm0
+    movq        xmm0,[rbx]
+    movdqa      xmm5,xmm10
+    punpcklqdq  xmm13,xmm0
+    movq        xmm0, [rax+rbx]
+    punpcklbw   xmm5,xmm1
+    movsx       eax,r10w
+    movdqa      xmm9,xmm13
+    punpcklqdq  xmm12,xmm0
+    punpcklbw   xmm9,xmm1
+    punpckhbw   xmm10,xmm1
+    movd        xmm0,eax
+    movsx       eax,word [rsp + 90h + 8h + 28h + 144]   ; iBeta
+    punpckhbw   xmm13,xmm1
+    movdqa      xmm7,xmm12
+    punpcklwd   xmm0,xmm0
+    punpckhbw   xmm12,xmm1
+    pshufd      xmm11,xmm0,0
+    punpcklbw   xmm7,xmm1
+    movd        xmm0,eax
+    movdqa      xmm1,xmm8
+    psubw       xmm1,xmm5
+    punpcklwd   xmm0,xmm0
+    movdqa      xmm6,xmm11
+    pshufd      xmm3,xmm0,0
+    movdqa      xmm0,xmm5
+    psubw       xmm0,xmm9
+    movdqa      xmm2,xmm3
+    pabsw       xmm0,xmm0
+    pcmpgtw     xmm6,xmm0
+    pabsw       xmm0,xmm1
+    movdqa      xmm1,xmm3
+    pcmpgtw     xmm2,xmm0
+    pand        xmm6,xmm2
+    movdqa      xmm0,xmm7
+    movdqa      xmm2,xmm3
+    psubw       xmm0,xmm9
+    pabsw       xmm0,xmm0
+    pcmpgtw     xmm1,xmm0
+    pand        xmm6,xmm1
+    movdqa      xmm0,xmm10
+    movdqa      xmm1,xmm14
+    psubw       xmm0,xmm13
+    psubw       xmm1,xmm10
+    pabsw       xmm0,xmm0
+    pcmpgtw     xmm11,xmm0
+    pabsw       xmm0,xmm1
+    pcmpgtw     xmm2,xmm0
+    pand        xmm11,xmm2
+    movdqa      xmm0,xmm12
+    movdqa      xmm4,xmm6
+    movdqa      xmm1,xmm8
+    mov         eax,2
+    cwde
+    paddw       xmm1,xmm8
+    psubw       xmm0,xmm13
+    paddw       xmm1,xmm5
+    pabsw       xmm0,xmm0
+    movdqa      xmm2,xmm14
+    paddw       xmm1,xmm7
+    pcmpgtw     xmm3,xmm0
+    paddw       xmm2,xmm14
+    movd        xmm0,eax
+    pand        xmm11,xmm3
+    paddw       xmm7,xmm7
+    paddw       xmm2,xmm10
+    punpcklwd   xmm0,xmm0
+    paddw       xmm2,xmm12
+    paddw       xmm12,xmm12
+    pshufd      xmm3,xmm0,0
+    paddw       xmm7,xmm9
+    paddw       xmm12,xmm13
+    movdqa      xmm0,xmm6
+    paddw       xmm1,xmm3
+    pandn       xmm0,xmm5
+    paddw       xmm7,xmm8
+    psraw       xmm1,2
+    paddw       xmm12,xmm14
+    paddw       xmm7,xmm3
+    movaps      xmm14,[rsp]
+    pand        xmm4,xmm1
+    paddw       xmm12,xmm3
+    psraw       xmm7,2
+    movdqa      xmm1,xmm11
+    por         xmm4,xmm0
+    psraw       xmm12,2
+    paddw       xmm2,xmm3
+    movdqa      xmm0,xmm11
+    pandn       xmm0,xmm10
+    psraw       xmm2,2
+    pand        xmm1,xmm2
+    por         xmm1,xmm0
+    packuswb    xmm4,xmm1
+    movdqa      xmm0,xmm11
+    movdqa      xmm1,xmm6
+    pand        xmm1,xmm7
+    movaps      xmm7,[rsp+70h]
+    movq        [rcx],xmm4
+    pandn       xmm6,xmm9
+    pandn       xmm11,xmm13
+    pand        xmm0,xmm12
+    por         xmm1,xmm6
+    por         xmm0,xmm11
+    psrldq      xmm4,8
+    packuswb    xmm1,xmm0
+    movq        [r11],xmm1
+    psrldq      xmm1,8
+    movq        [rdx],xmm4
+    lea         r11,[rsp+90h]
+    movaps      xmm6,[r11-10h]
+    movaps      xmm8,[r11-30h]
+    movaps      xmm9,[r11-40h]
+    movq        [rbx],xmm1
+    movaps      xmm10,[r11-50h]
+    movaps      xmm11,[r11-60h]
+    movaps      xmm12,[r11-70h]
+    movaps      xmm13,[r11-80h]
+    mov         rsp,r11
+    POP_XMM
+    pop         rbx
+    ret
 
 
 
@@ -1089,548 +1089,548 @@
 
 
 WELS_EXTERN   DeblockChromaEq4H_ssse3
-  mov         rax,rsp
-  mov         [rax+20h],rbx
-  push        rdi
-  PUSH_XMM 16
-  sub         rsp,140h
-  mov         rdi,rdx
-  lea         eax,[r8*4]
-  movsxd      r10,eax
-  mov         eax,[rcx-2]
-  mov         [rsp+10h],eax
-  lea         rbx,[r10+rdx-2]
-  lea         r11,[r10+rcx-2]
-  movdqa      xmm5,[rsp+10h]
-  movsxd      r10,r8d
-  mov         eax,[r10+rcx-2]
-  lea         rdx,[r10+r10*2]
-  mov         [rsp+20h],eax
-  mov         eax,[rcx+r10*2-2]
-  mov         [rsp+30h],eax
-  mov         eax,[rdx+rcx-2]
-  movdqa      xmm2,[rsp+20h]
-  mov         [rsp+40h],eax
-  mov         eax, [rdi-2]
-  movdqa      xmm4,[rsp+30h]
-  mov         [rsp+50h],eax
-  mov         eax,[r10+rdi-2]
-  movdqa      xmm3,[rsp+40h]
-  mov         [rsp+60h],eax
-  mov         eax,[rdi+r10*2-2]
-  punpckldq   xmm5,[rsp+50h]
-  mov         [rsp+70h],eax
-  mov         eax, [rdx+rdi-2]
-  punpckldq   xmm2, [rsp+60h]
-  mov          [rsp+80h],eax
-  mov         eax,[r11]
-  punpckldq   xmm4, [rsp+70h]
-  mov         [rsp+50h],eax
-  mov         eax,[rbx]
-  punpckldq   xmm3,[rsp+80h]
-  mov         [rsp+60h],eax
-  mov         eax,[r10+r11]
-  movdqa      xmm0, [rsp+50h]
-  punpckldq   xmm0, [rsp+60h]
-  punpcklqdq  xmm5,xmm0
-  movdqa      [rsp+50h],xmm0
-  mov         [rsp+50h],eax
-  mov         eax,[r10+rbx]
-  movdqa      xmm0,[rsp+50h]
-  movdqa      xmm1,xmm5
-  mov         [rsp+60h],eax
-  mov         eax,[r11+r10*2]
-  punpckldq   xmm0, [rsp+60h]
-  punpcklqdq  xmm2,xmm0
-  punpcklbw   xmm1,xmm2
-  punpckhbw   xmm5,xmm2
-  movdqa      [rsp+50h],xmm0
-  mov         [rsp+50h],eax
-  mov         eax,[rbx+r10*2]
-  movdqa      xmm0,[rsp+50h]
-  mov         [rsp+60h],eax
-  mov         eax, [rdx+r11]
-  movdqa      xmm15,xmm1
-  punpckldq   xmm0,[rsp+60h]
-  punpcklqdq  xmm4,xmm0
-  movdqa      [rsp+50h],xmm0
-  mov         [rsp+50h],eax
-  mov         eax, [rdx+rbx]
-  movdqa      xmm0,[rsp+50h]
-  mov         [rsp+60h],eax
-  punpckldq   xmm0, [rsp+60h]
-  punpcklqdq  xmm3,xmm0
-  movdqa      xmm0,xmm4
-  punpcklbw   xmm0,xmm3
-  punpckhbw   xmm4,xmm3
-  punpcklwd   xmm15,xmm0
-  punpckhwd   xmm1,xmm0
-  movdqa      xmm0,xmm5
-  movdqa      xmm12,xmm15
-  punpcklwd   xmm0,xmm4
-  punpckhwd   xmm5,xmm4
-  punpckldq   xmm12,xmm0
-  punpckhdq   xmm15,xmm0
-  movdqa      xmm0,xmm1
-  movdqa      xmm11,xmm12
-  punpckldq   xmm0,xmm5
-  punpckhdq   xmm1,xmm5
-  punpcklqdq  xmm11,xmm0
-  punpckhqdq  xmm12,xmm0
-  movsx       eax,r9w
-  movdqa      xmm14,xmm15
-  punpcklqdq  xmm14,xmm1
-  punpckhqdq  xmm15,xmm1
-  pxor        xmm1,xmm1
-  movd        xmm0,eax
-  movdqa      xmm4,xmm12
-  movdqa      xmm8,xmm11
-  movsx       eax,word [rsp+170h + 160] ; iBeta
-  punpcklwd   xmm0,xmm0
-  punpcklbw   xmm4,xmm1
-  punpckhbw   xmm12,xmm1
-  movdqa      xmm9,xmm14
-  movdqa      xmm7,xmm15
-  movdqa      xmm10,xmm15
-  pshufd      xmm13,xmm0,0
-  punpcklbw   xmm9,xmm1
-  punpckhbw   xmm14,xmm1
-  movdqa      xmm6,xmm13
-  movd        xmm0,eax
-  movdqa      [rsp],xmm11
-  mov         eax,2
-  cwde
-  punpckhbw   xmm11,xmm1
-  punpckhbw   xmm10,xmm1
-  punpcklbw   xmm7,xmm1
-  punpcklwd   xmm0,xmm0
-  punpcklbw   xmm8,xmm1
-  pshufd      xmm3,xmm0,0
-  movdqa      xmm1,xmm8
-  movdqa      xmm0,xmm4
-  psubw       xmm0,xmm9
-  psubw       xmm1,xmm4
-  movdqa      xmm2,xmm3
-  pabsw       xmm0,xmm0
-  pcmpgtw     xmm6,xmm0
-  pabsw       xmm0,xmm1
-  movdqa      xmm1,xmm3
-  pcmpgtw     xmm2,xmm0
-  pand        xmm6,xmm2
-  movdqa      xmm0,xmm7
-  movdqa      xmm2,xmm3
-  psubw       xmm0,xmm9
-  pabsw       xmm0,xmm0
-  pcmpgtw     xmm1,xmm0
-  pand        xmm6,xmm1
-  movdqa      xmm0,xmm12
-  movdqa      xmm1,xmm11
-  psubw       xmm0,xmm14
-  psubw       xmm1,xmm12
-  movdqa      xmm5,xmm6
-  pabsw       xmm0,xmm0
-  pcmpgtw     xmm13,xmm0
-  pabsw       xmm0,xmm1
-  movdqa      xmm1,xmm8
-  pcmpgtw     xmm2,xmm0
-  paddw       xmm1,xmm8
-  movdqa      xmm0,xmm10
-  pand        xmm13,xmm2
-  psubw       xmm0,xmm14
-  paddw       xmm1,xmm4
-  movdqa      xmm2,xmm11
-  pabsw       xmm0,xmm0
-  paddw       xmm2,xmm11
-  paddw       xmm1,xmm7
-  pcmpgtw     xmm3,xmm0
-  paddw       xmm2,xmm12
-  movd        xmm0,eax
-  pand        xmm13,xmm3
-  paddw       xmm2,xmm10
-  punpcklwd   xmm0,xmm0
-  pshufd      xmm3,xmm0,0
-  movdqa      xmm0,xmm6
-  paddw       xmm1,xmm3
-  pandn       xmm0,xmm4
-  paddw       xmm2,xmm3
-  psraw       xmm1,2
-  pand        xmm5,xmm1
-  por         xmm5,xmm0
-  paddw       xmm7,xmm7
-  paddw       xmm10,xmm10
-  psraw       xmm2,2
-  movdqa      xmm1,xmm13
-  movdqa      xmm0,xmm13
-  pandn       xmm0,xmm12
-  pand        xmm1,xmm2
-  paddw       xmm7,xmm9
-  por         xmm1,xmm0
-  paddw       xmm10,xmm14
-  paddw       xmm7,xmm8
-  movdqa      xmm0,xmm13
-  packuswb    xmm5,xmm1
-  paddw       xmm7,xmm3
-  paddw       xmm10,xmm11
-  movdqa      xmm1,xmm6
-  paddw       xmm10,xmm3
-  pandn       xmm6,xmm9
-  psraw       xmm7,2
-  pand        xmm1,xmm7
-  psraw       xmm10,2
-  pandn       xmm13,xmm14
-  pand        xmm0,xmm10
-  por         xmm1,xmm6
-  movdqa      xmm6,[rsp]
-  movdqa      xmm4,xmm6
-  por         xmm0,xmm13
-  punpcklbw   xmm4,xmm5
-  punpckhbw   xmm6,xmm5
-  movdqa      xmm3,xmm4
-  packuswb    xmm1,xmm0
-  movdqa      xmm0,xmm1
-  punpckhbw   xmm1,xmm15
-  punpcklbw   xmm0,xmm15
-  punpcklwd   xmm3,xmm0
-  punpckhwd   xmm4,xmm0
-  movdqa      xmm0,xmm6
-  movdqa      xmm2,xmm3
-  punpcklwd   xmm0,xmm1
-  punpckhwd   xmm6,xmm1
-  movdqa      xmm1,xmm4
-  punpckldq   xmm2,xmm0
-  punpckhdq   xmm3,xmm0
-  punpckldq   xmm1,xmm6
-  movdqa      xmm0,xmm2
-  punpcklqdq  xmm0,xmm1
-  punpckhdq   xmm4,xmm6
-  punpckhqdq  xmm2,xmm1
-  movdqa      [rsp+10h],xmm0
-  movdqa      [rsp+60h],xmm2
-  movdqa      xmm0,xmm3
-  mov         eax,[rsp+10h]
-  mov         [rcx-2],eax
-  mov         eax,[rsp+60h]
-  punpcklqdq  xmm0,xmm4
-  punpckhqdq  xmm3,xmm4
-  mov         [r10+rcx-2],eax
-  movdqa      [rsp+20h],xmm0
-  mov         eax, [rsp+20h]
-  movdqa      [rsp+70h],xmm3
-  mov         [rcx+r10*2-2],eax
-  mov         eax,[rsp+70h]
-  mov         [rdx+rcx-2],eax
-  mov         eax,[rsp+18h]
-  mov         [r11],eax
-  mov         eax,[rsp+68h]
-  mov         [r10+r11],eax
-  mov         eax,[rsp+28h]
-  mov         [r11+r10*2],eax
-  mov         eax,[rsp+78h]
-  mov         [rdx+r11],eax
-  mov         eax,[rsp+14h]
-  mov         [rdi-2],eax
-  mov         eax,[rsp+64h]
-  mov         [r10+rdi-2],eax
-  mov         eax,[rsp+24h]
-  mov         [rdi+r10*2-2],eax
-  mov         eax, [rsp+74h]
-  mov         [rdx+rdi-2],eax
-  mov         eax, [rsp+1Ch]
-  mov         [rbx],eax
-  mov         eax, [rsp+6Ch]
-  mov         [r10+rbx],eax
-  mov         eax,[rsp+2Ch]
-  mov         [rbx+r10*2],eax
-  mov         eax,[rsp+7Ch]
-  mov         [rdx+rbx],eax
-  lea         rsp,[rsp+140h]
-  POP_XMM
-  mov         rbx, [rsp+28h]
-  pop         rdi
-  ret
+    mov         rax,rsp
+    mov         [rax+20h],rbx
+    push        rdi
+    PUSH_XMM 16
+    sub         rsp,140h
+    mov         rdi,rdx
+    lea         eax,[r8*4]
+    movsxd      r10,eax
+    mov         eax,[rcx-2]
+    mov         [rsp+10h],eax
+    lea         rbx,[r10+rdx-2]
+    lea         r11,[r10+rcx-2]
+    movdqa      xmm5,[rsp+10h]
+    movsxd      r10,r8d
+    mov         eax,[r10+rcx-2]
+    lea         rdx,[r10+r10*2]
+    mov         [rsp+20h],eax
+    mov         eax,[rcx+r10*2-2]
+    mov         [rsp+30h],eax
+    mov         eax,[rdx+rcx-2]
+    movdqa      xmm2,[rsp+20h]
+    mov         [rsp+40h],eax
+    mov         eax, [rdi-2]
+    movdqa      xmm4,[rsp+30h]
+    mov         [rsp+50h],eax
+    mov         eax,[r10+rdi-2]
+    movdqa      xmm3,[rsp+40h]
+    mov         [rsp+60h],eax
+    mov         eax,[rdi+r10*2-2]
+    punpckldq   xmm5,[rsp+50h]
+    mov         [rsp+70h],eax
+    mov         eax, [rdx+rdi-2]
+    punpckldq   xmm2, [rsp+60h]
+    mov          [rsp+80h],eax
+    mov         eax,[r11]
+    punpckldq   xmm4, [rsp+70h]
+    mov         [rsp+50h],eax
+    mov         eax,[rbx]
+    punpckldq   xmm3,[rsp+80h]
+    mov         [rsp+60h],eax
+    mov         eax,[r10+r11]
+    movdqa      xmm0, [rsp+50h]
+    punpckldq   xmm0, [rsp+60h]
+    punpcklqdq  xmm5,xmm0
+    movdqa      [rsp+50h],xmm0
+    mov         [rsp+50h],eax
+    mov         eax,[r10+rbx]
+    movdqa      xmm0,[rsp+50h]
+    movdqa      xmm1,xmm5
+    mov         [rsp+60h],eax
+    mov         eax,[r11+r10*2]
+    punpckldq   xmm0, [rsp+60h]
+    punpcklqdq  xmm2,xmm0
+    punpcklbw   xmm1,xmm2
+    punpckhbw   xmm5,xmm2
+    movdqa      [rsp+50h],xmm0
+    mov         [rsp+50h],eax
+    mov         eax,[rbx+r10*2]
+    movdqa      xmm0,[rsp+50h]
+    mov         [rsp+60h],eax
+    mov         eax, [rdx+r11]
+    movdqa      xmm15,xmm1
+    punpckldq   xmm0,[rsp+60h]
+    punpcklqdq  xmm4,xmm0
+    movdqa      [rsp+50h],xmm0
+    mov         [rsp+50h],eax
+    mov         eax, [rdx+rbx]
+    movdqa      xmm0,[rsp+50h]
+    mov         [rsp+60h],eax
+    punpckldq   xmm0, [rsp+60h]
+    punpcklqdq  xmm3,xmm0
+    movdqa      xmm0,xmm4
+    punpcklbw   xmm0,xmm3
+    punpckhbw   xmm4,xmm3
+    punpcklwd   xmm15,xmm0
+    punpckhwd   xmm1,xmm0
+    movdqa      xmm0,xmm5
+    movdqa      xmm12,xmm15
+    punpcklwd   xmm0,xmm4
+    punpckhwd   xmm5,xmm4
+    punpckldq   xmm12,xmm0
+    punpckhdq   xmm15,xmm0
+    movdqa      xmm0,xmm1
+    movdqa      xmm11,xmm12
+    punpckldq   xmm0,xmm5
+    punpckhdq   xmm1,xmm5
+    punpcklqdq  xmm11,xmm0
+    punpckhqdq  xmm12,xmm0
+    movsx       eax,r9w
+    movdqa      xmm14,xmm15
+    punpcklqdq  xmm14,xmm1
+    punpckhqdq  xmm15,xmm1
+    pxor        xmm1,xmm1
+    movd        xmm0,eax
+    movdqa      xmm4,xmm12
+    movdqa      xmm8,xmm11
+    movsx       eax,word [rsp+170h + 160] ; iBeta
+    punpcklwd   xmm0,xmm0
+    punpcklbw   xmm4,xmm1
+    punpckhbw   xmm12,xmm1
+    movdqa      xmm9,xmm14
+    movdqa      xmm7,xmm15
+    movdqa      xmm10,xmm15
+    pshufd      xmm13,xmm0,0
+    punpcklbw   xmm9,xmm1
+    punpckhbw   xmm14,xmm1
+    movdqa      xmm6,xmm13
+    movd        xmm0,eax
+    movdqa      [rsp],xmm11
+    mov         eax,2
+    cwde
+    punpckhbw   xmm11,xmm1
+    punpckhbw   xmm10,xmm1
+    punpcklbw   xmm7,xmm1
+    punpcklwd   xmm0,xmm0
+    punpcklbw   xmm8,xmm1
+    pshufd      xmm3,xmm0,0
+    movdqa      xmm1,xmm8
+    movdqa      xmm0,xmm4
+    psubw       xmm0,xmm9
+    psubw       xmm1,xmm4
+    movdqa      xmm2,xmm3
+    pabsw       xmm0,xmm0
+    pcmpgtw     xmm6,xmm0
+    pabsw       xmm0,xmm1
+    movdqa      xmm1,xmm3
+    pcmpgtw     xmm2,xmm0
+    pand        xmm6,xmm2
+    movdqa      xmm0,xmm7
+    movdqa      xmm2,xmm3
+    psubw       xmm0,xmm9
+    pabsw       xmm0,xmm0
+    pcmpgtw     xmm1,xmm0
+    pand        xmm6,xmm1
+    movdqa      xmm0,xmm12
+    movdqa      xmm1,xmm11
+    psubw       xmm0,xmm14
+    psubw       xmm1,xmm12
+    movdqa      xmm5,xmm6
+    pabsw       xmm0,xmm0
+    pcmpgtw     xmm13,xmm0
+    pabsw       xmm0,xmm1
+    movdqa      xmm1,xmm8
+    pcmpgtw     xmm2,xmm0
+    paddw       xmm1,xmm8
+    movdqa      xmm0,xmm10
+    pand        xmm13,xmm2
+    psubw       xmm0,xmm14
+    paddw       xmm1,xmm4
+    movdqa      xmm2,xmm11
+    pabsw       xmm0,xmm0
+    paddw       xmm2,xmm11
+    paddw       xmm1,xmm7
+    pcmpgtw     xmm3,xmm0
+    paddw       xmm2,xmm12
+    movd        xmm0,eax
+    pand        xmm13,xmm3
+    paddw       xmm2,xmm10
+    punpcklwd   xmm0,xmm0
+    pshufd      xmm3,xmm0,0
+    movdqa      xmm0,xmm6
+    paddw       xmm1,xmm3
+    pandn       xmm0,xmm4
+    paddw       xmm2,xmm3
+    psraw       xmm1,2
+    pand        xmm5,xmm1
+    por         xmm5,xmm0
+    paddw       xmm7,xmm7
+    paddw       xmm10,xmm10
+    psraw       xmm2,2
+    movdqa      xmm1,xmm13
+    movdqa      xmm0,xmm13
+    pandn       xmm0,xmm12
+    pand        xmm1,xmm2
+    paddw       xmm7,xmm9
+    por         xmm1,xmm0
+    paddw       xmm10,xmm14
+    paddw       xmm7,xmm8
+    movdqa      xmm0,xmm13
+    packuswb    xmm5,xmm1
+    paddw       xmm7,xmm3
+    paddw       xmm10,xmm11
+    movdqa      xmm1,xmm6
+    paddw       xmm10,xmm3
+    pandn       xmm6,xmm9
+    psraw       xmm7,2
+    pand        xmm1,xmm7
+    psraw       xmm10,2
+    pandn       xmm13,xmm14
+    pand        xmm0,xmm10
+    por         xmm1,xmm6
+    movdqa      xmm6,[rsp]
+    movdqa      xmm4,xmm6
+    por         xmm0,xmm13
+    punpcklbw   xmm4,xmm5
+    punpckhbw   xmm6,xmm5
+    movdqa      xmm3,xmm4
+    packuswb    xmm1,xmm0
+    movdqa      xmm0,xmm1
+    punpckhbw   xmm1,xmm15
+    punpcklbw   xmm0,xmm15
+    punpcklwd   xmm3,xmm0
+    punpckhwd   xmm4,xmm0
+    movdqa      xmm0,xmm6
+    movdqa      xmm2,xmm3
+    punpcklwd   xmm0,xmm1
+    punpckhwd   xmm6,xmm1
+    movdqa      xmm1,xmm4
+    punpckldq   xmm2,xmm0
+    punpckhdq   xmm3,xmm0
+    punpckldq   xmm1,xmm6
+    movdqa      xmm0,xmm2
+    punpcklqdq  xmm0,xmm1
+    punpckhdq   xmm4,xmm6
+    punpckhqdq  xmm2,xmm1
+    movdqa      [rsp+10h],xmm0
+    movdqa      [rsp+60h],xmm2
+    movdqa      xmm0,xmm3
+    mov         eax,[rsp+10h]
+    mov         [rcx-2],eax
+    mov         eax,[rsp+60h]
+    punpcklqdq  xmm0,xmm4
+    punpckhqdq  xmm3,xmm4
+    mov         [r10+rcx-2],eax
+    movdqa      [rsp+20h],xmm0
+    mov         eax, [rsp+20h]
+    movdqa      [rsp+70h],xmm3
+    mov         [rcx+r10*2-2],eax
+    mov         eax,[rsp+70h]
+    mov         [rdx+rcx-2],eax
+    mov         eax,[rsp+18h]
+    mov         [r11],eax
+    mov         eax,[rsp+68h]
+    mov         [r10+r11],eax
+    mov         eax,[rsp+28h]
+    mov         [r11+r10*2],eax
+    mov         eax,[rsp+78h]
+    mov         [rdx+r11],eax
+    mov         eax,[rsp+14h]
+    mov         [rdi-2],eax
+    mov         eax,[rsp+64h]
+    mov         [r10+rdi-2],eax
+    mov         eax,[rsp+24h]
+    mov         [rdi+r10*2-2],eax
+    mov         eax, [rsp+74h]
+    mov         [rdx+rdi-2],eax
+    mov         eax, [rsp+1Ch]
+    mov         [rbx],eax
+    mov         eax, [rsp+6Ch]
+    mov         [r10+rbx],eax
+    mov         eax,[rsp+2Ch]
+    mov         [rbx+r10*2],eax
+    mov         eax,[rsp+7Ch]
+    mov         [rdx+rbx],eax
+    lea         rsp,[rsp+140h]
+    POP_XMM
+    mov         rbx, [rsp+28h]
+    pop         rdi
+    ret
 
 
 
 WELS_EXTERN DeblockChromaLt4H_ssse3
-  mov         rax,rsp
-  push        rbx
-  push        rbp
-  push        rsi
-  push        rdi
-  push        r12
-  PUSH_XMM 16
-  sub         rsp,170h
+    mov         rax,rsp
+    push        rbx
+    push        rbp
+    push        rsi
+    push        rdi
+    push        r12
+    PUSH_XMM 16
+    sub         rsp,170h
 
-  movsxd      rsi,r8d
-  lea         eax,[r8*4]
-  mov         r11d,r9d
-  movsxd      r10,eax
-  mov         eax, [rcx-2]
-  mov         r12,rdx
-  mov         [rsp+40h],eax
-  mov         eax, [rsi+rcx-2]
-  lea         rbx,[r10+rcx-2]
-  movdqa      xmm5,[rsp+40h]
-  mov         [rsp+50h],eax
-  mov         eax, [rcx+rsi*2-2]
-  lea         rbp,[r10+rdx-2]
-  movdqa      xmm2, [rsp+50h]
-  mov         [rsp+60h],eax
-  lea         r10,[rsi+rsi*2]
-  mov         rdi,rcx
-  mov         eax,[r10+rcx-2]
-  movdqa      xmm4,[rsp+60h]
-  mov         [rsp+70h],eax
-  mov         eax,[rdx-2]
-  mov         [rsp+80h],eax
-  mov         eax, [rsi+rdx-2]
-  movdqa      xmm3,[rsp+70h]
-  mov         [rsp+90h],eax
-  mov         eax,[rdx+rsi*2-2]
-  punpckldq   xmm5,[rsp+80h]
-  mov         [rsp+0A0h],eax
-  mov         eax, [r10+rdx-2]
-  punpckldq   xmm2,[rsp+90h]
-  mov         [rsp+0B0h],eax
-  mov         eax, [rbx]
-  punpckldq   xmm4,[rsp+0A0h]
-  mov         [rsp+80h],eax
-  mov         eax,[rbp]
-  punpckldq   xmm3,[rsp+0B0h]
-  mov         [rsp+90h],eax
-  mov         eax,[rsi+rbx]
-  movdqa      xmm0,[rsp+80h]
-  punpckldq   xmm0,[rsp+90h]
-  punpcklqdq  xmm5,xmm0
-  movdqa      [rsp+80h],xmm0
-  mov         [rsp+80h],eax
-  mov         eax,[rsi+rbp]
-  movdqa      xmm0,[rsp+80h]
-  movdqa      xmm1,xmm5
-  mov         [rsp+90h],eax
-  mov         eax,[rbx+rsi*2]
-  punpckldq   xmm0,[rsp+90h]
-  punpcklqdq  xmm2,xmm0
-  punpcklbw   xmm1,xmm2
-  punpckhbw   xmm5,xmm2
-  movdqa      [rsp+80h],xmm0
-  mov         [rsp+80h],eax
-  mov         eax,[rbp+rsi*2]
-  movdqa      xmm0, [rsp+80h]
-  mov         [rsp+90h],eax
-  mov         eax,[r10+rbx]
-  movdqa      xmm7,xmm1
-  punpckldq   xmm0,[rsp+90h]
-  punpcklqdq  xmm4,xmm0
-  movdqa      [rsp+80h],xmm0
-  mov         [rsp+80h],eax
-  mov         eax, [r10+rbp]
-  movdqa      xmm0,[rsp+80h]
-  mov         [rsp+90h],eax
-  punpckldq   xmm0,[rsp+90h]
-  punpcklqdq  xmm3,xmm0
-  movdqa      xmm0,xmm4
-  punpcklbw   xmm0,xmm3
-  punpckhbw   xmm4,xmm3
-  punpcklwd   xmm7,xmm0
-  punpckhwd   xmm1,xmm0
-  movdqa      xmm0,xmm5
-  movdqa      xmm6,xmm7
-  punpcklwd   xmm0,xmm4
-  punpckhwd   xmm5,xmm4
-  punpckldq   xmm6,xmm0
-  punpckhdq   xmm7,xmm0
-  movdqa      xmm0,xmm1
-  punpckldq   xmm0,xmm5
-  mov         rax, [rsp+1C8h+160]    ; pTC
-  punpckhdq   xmm1,xmm5
-  movdqa      xmm9,xmm6
-  punpckhqdq  xmm6,xmm0
-  punpcklqdq  xmm9,xmm0
-  movdqa      xmm2,xmm7
-  movdqa      xmm13,xmm6
-  movdqa      xmm4,xmm9
-  movdqa      [rsp+10h],xmm9
-  punpcklqdq  xmm2,xmm1
-  punpckhqdq  xmm7,xmm1
-  pxor        xmm1,xmm1
-  movsx       ecx,byte [rax+3]
-  movsx       edx,byte [rax+2]
-  movsx       r8d,byte [rax+1]
-  movsx       r9d,byte [rax]
-  movdqa      xmm10,xmm1
-  movdqa      xmm15,xmm2
-  punpckhbw   xmm2,xmm1
-  punpckhbw   xmm6,xmm1
-  punpcklbw   xmm4,xmm1
-  movsx       eax,r11w
-  mov         word [rsp+0Eh],cx
-  mov         word [rsp+0Ch],cx
-  movdqa      xmm3,xmm7
-  movdqa      xmm8,xmm7
-  movdqa      [rsp+20h],xmm7
-  punpcklbw   xmm15,xmm1
-  punpcklbw   xmm13,xmm1
-  punpcklbw   xmm3,xmm1
-  mov         word [rsp+0Ah],dx
-  mov         word [rsp+8],dx
-  mov         word [rsp+6],r8w
-  movd        xmm0,eax
-  movdqa      [rsp+30h],xmm6
-  punpckhbw   xmm9,xmm1
-  punpckhbw   xmm8,xmm1
-  punpcklwd   xmm0,xmm0
-  movsx       eax,word [rsp+1C0h+160]   ; iBeta
-  mov         word [rsp+4],r8w
-  mov         word [rsp+2],r9w
-  pshufd      xmm12,xmm0,0
-  mov         word [rsp],r9w
-  movd        xmm0,eax
-  mov         eax,4
-  cwde
-  movdqa      xmm14, [rsp]
-  movdqa      [rsp],xmm2
-  movdqa      xmm2,xmm12
-  punpcklwd   xmm0,xmm0
-  pshufd      xmm11,xmm0,0
-  psubw       xmm10,xmm14
-  movd        xmm0,eax
-  movdqa      xmm7,xmm14
-  movdqa      xmm6,xmm14
-  pcmpgtw     xmm7,xmm1
-  punpcklwd   xmm0,xmm0
-  pshufd      xmm5,xmm0,0
-  movdqa      xmm0,xmm4
-  movdqa      xmm1,xmm15
-  psubw       xmm4,xmm13
-  psubw       xmm0,xmm3
-  psubw       xmm1,xmm13
-  psubw       xmm3,xmm15
-  psllw       xmm1,2
-  paddw       xmm1,xmm0
-  paddw       xmm1,xmm5
-  movdqa      xmm0,xmm10
-  psraw       xmm1,3
-  pmaxsw      xmm0,xmm1
-  pminsw      xmm6,xmm0
-  movdqa      xmm1,xmm11
-  movdqa      xmm0,xmm13
-  psubw       xmm0,xmm15
-  pabsw       xmm0,xmm0
-  pcmpgtw     xmm2,xmm0
-  pabsw       xmm0,xmm4
-  pcmpgtw     xmm1,xmm0
-  pabsw       xmm0,xmm3
-  pand        xmm2,xmm1
-  movdqa      xmm1,xmm11
-  movdqa      xmm3,[rsp+30h]
-  pcmpgtw     xmm1,xmm0
-  movdqa      xmm0,xmm9
-  pand        xmm2,xmm1
-  psubw       xmm0,xmm8
-  psubw       xmm9,xmm3
-  pand        xmm2,xmm7
-  pand        xmm6,xmm2
-  psubw       xmm15,xmm6
-  paddw       xmm13,xmm6
-  movdqa      xmm2,[rsp]
-  movdqa      xmm1,xmm2
-  psubw       xmm1,xmm3
-  psubw       xmm8,xmm2
-  psllw       xmm1,2
-  paddw       xmm1,xmm0
-  paddw       xmm1,xmm5
-  movdqa      xmm0,xmm3
-  movdqa      xmm5,[rsp+10h]
-  psubw       xmm0,xmm2
-  psraw       xmm1,3
-  movdqa      xmm4,xmm5
-  pabsw       xmm0,xmm0
-  pmaxsw      xmm10,xmm1
-  movdqa      xmm1,xmm11
-  pcmpgtw     xmm12,xmm0
-  pabsw       xmm0,xmm9
-  pminsw      xmm14,xmm10
-  pcmpgtw     xmm1,xmm0
-  pabsw       xmm0,xmm8
-  pcmpgtw     xmm11,xmm0
-  pand        xmm12,xmm1
-  movdqa      xmm1,[rsp+20h]
-  pand        xmm12,xmm11
-  pand        xmm12,xmm7
-  pand        xmm14,xmm12
-  paddw       xmm3,xmm14
-  psubw       xmm2,xmm14
-  packuswb    xmm13,xmm3
-  packuswb    xmm15,xmm2
-  punpcklbw   xmm4,xmm13
-  punpckhbw   xmm5,xmm13
-  movdqa      xmm0,xmm15
-  punpcklbw   xmm0,xmm1
-  punpckhbw   xmm15,xmm1
-  movdqa      xmm3,xmm4
-  punpcklwd   xmm3,xmm0
-  punpckhwd   xmm4,xmm0
-  movdqa      xmm0,xmm5
-  movdqa      xmm2,xmm3
-  movdqa      xmm1,xmm4
-  punpcklwd   xmm0,xmm15
-  punpckhwd   xmm5,xmm15
-  punpckldq   xmm2,xmm0
-  punpckhdq   xmm3,xmm0
-  punpckldq   xmm1,xmm5
-  movdqa      xmm0,xmm2
-  punpcklqdq  xmm0,xmm1
-  punpckhdq   xmm4,xmm5
-  punpckhqdq  xmm2,xmm1
-  movdqa      [rsp+40h],xmm0
-  movdqa      xmm0,xmm3
-  movdqa      [rsp+90h],xmm2
-  mov         eax,[rsp+40h]
-  mov         [rdi-2],eax
-  mov         eax, [rsp+90h]
-  punpcklqdq  xmm0,xmm4
-  punpckhqdq  xmm3,xmm4
-  mov         [rsi+rdi-2],eax
-  movdqa      [rsp+50h],xmm0
-  mov         eax,[rsp+50h]
-  movdqa      [rsp+0A0h],xmm3
-  mov         [rdi+rsi*2-2],eax
-  mov         eax,[rsp+0A0h]
-  mov         [r10+rdi-2],eax
-  mov         eax,[rsp+48h]
-  mov         [rbx],eax
-  mov         eax,[rsp+98h]
-  mov         [rsi+rbx],eax
-  mov         eax,[rsp+58h]
-  mov         [rbx+rsi*2],eax
-  mov         eax, [rsp+0A8h]
-  mov         [r10+rbx],eax
-  mov         eax, [rsp+44h]
-  mov         [r12-2],eax
-  mov         eax,[rsp+94h]
-  mov         [rsi+r12-2],eax
-  mov         eax,[rsp+54h]
-  mov         [r12+rsi*2-2],eax
-  mov         eax, [rsp+0A4h]
-  mov         [r10+r12-2],eax
-  mov         eax,[rsp+4Ch]
-  mov         [rbp],eax
-  mov         eax,[rsp+9Ch]
-  mov         [rsi+rbp],eax
-  mov         eax, [rsp+5Ch]
-  mov         [rbp+rsi*2],eax
-  mov         eax,[rsp+0ACh]
-  mov         [r10+rbp],eax
-  lea         r11,[rsp+170h]
-  mov         rsp,r11
-  POP_XMM
-  pop         r12
-  pop         rdi
-  pop         rsi
-  pop         rbp
-  pop         rbx
-  ret
+    movsxd      rsi,r8d
+    lea         eax,[r8*4]
+    mov         r11d,r9d
+    movsxd      r10,eax
+    mov         eax, [rcx-2]
+    mov         r12,rdx
+    mov         [rsp+40h],eax
+    mov         eax, [rsi+rcx-2]
+    lea         rbx,[r10+rcx-2]
+    movdqa      xmm5,[rsp+40h]
+    mov         [rsp+50h],eax
+    mov         eax, [rcx+rsi*2-2]
+    lea         rbp,[r10+rdx-2]
+    movdqa      xmm2, [rsp+50h]
+    mov         [rsp+60h],eax
+    lea         r10,[rsi+rsi*2]
+    mov         rdi,rcx
+    mov         eax,[r10+rcx-2]
+    movdqa      xmm4,[rsp+60h]
+    mov         [rsp+70h],eax
+    mov         eax,[rdx-2]
+    mov         [rsp+80h],eax
+    mov         eax, [rsi+rdx-2]
+    movdqa      xmm3,[rsp+70h]
+    mov         [rsp+90h],eax
+    mov         eax,[rdx+rsi*2-2]
+    punpckldq   xmm5,[rsp+80h]
+    mov         [rsp+0A0h],eax
+    mov         eax, [r10+rdx-2]
+    punpckldq   xmm2,[rsp+90h]
+    mov         [rsp+0B0h],eax
+    mov         eax, [rbx]
+    punpckldq   xmm4,[rsp+0A0h]
+    mov         [rsp+80h],eax
+    mov         eax,[rbp]
+    punpckldq   xmm3,[rsp+0B0h]
+    mov         [rsp+90h],eax
+    mov         eax,[rsi+rbx]
+    movdqa      xmm0,[rsp+80h]
+    punpckldq   xmm0,[rsp+90h]
+    punpcklqdq  xmm5,xmm0
+    movdqa      [rsp+80h],xmm0
+    mov         [rsp+80h],eax
+    mov         eax,[rsi+rbp]
+    movdqa      xmm0,[rsp+80h]
+    movdqa      xmm1,xmm5
+    mov         [rsp+90h],eax
+    mov         eax,[rbx+rsi*2]
+    punpckldq   xmm0,[rsp+90h]
+    punpcklqdq  xmm2,xmm0
+    punpcklbw   xmm1,xmm2
+    punpckhbw   xmm5,xmm2
+    movdqa      [rsp+80h],xmm0
+    mov         [rsp+80h],eax
+    mov         eax,[rbp+rsi*2]
+    movdqa      xmm0, [rsp+80h]
+    mov         [rsp+90h],eax
+    mov         eax,[r10+rbx]
+    movdqa      xmm7,xmm1
+    punpckldq   xmm0,[rsp+90h]
+    punpcklqdq  xmm4,xmm0
+    movdqa      [rsp+80h],xmm0
+    mov         [rsp+80h],eax
+    mov         eax, [r10+rbp]
+    movdqa      xmm0,[rsp+80h]
+    mov         [rsp+90h],eax
+    punpckldq   xmm0,[rsp+90h]
+    punpcklqdq  xmm3,xmm0
+    movdqa      xmm0,xmm4
+    punpcklbw   xmm0,xmm3
+    punpckhbw   xmm4,xmm3
+    punpcklwd   xmm7,xmm0
+    punpckhwd   xmm1,xmm0
+    movdqa      xmm0,xmm5
+    movdqa      xmm6,xmm7
+    punpcklwd   xmm0,xmm4
+    punpckhwd   xmm5,xmm4
+    punpckldq   xmm6,xmm0
+    punpckhdq   xmm7,xmm0
+    movdqa      xmm0,xmm1
+    punpckldq   xmm0,xmm5
+    mov         rax, [rsp+1C8h+160]    ; pTC
+    punpckhdq   xmm1,xmm5
+    movdqa      xmm9,xmm6
+    punpckhqdq  xmm6,xmm0
+    punpcklqdq  xmm9,xmm0
+    movdqa      xmm2,xmm7
+    movdqa      xmm13,xmm6
+    movdqa      xmm4,xmm9
+    movdqa      [rsp+10h],xmm9
+    punpcklqdq  xmm2,xmm1
+    punpckhqdq  xmm7,xmm1
+    pxor        xmm1,xmm1
+    movsx       ecx,byte [rax+3]
+    movsx       edx,byte [rax+2]
+    movsx       r8d,byte [rax+1]
+    movsx       r9d,byte [rax]
+    movdqa      xmm10,xmm1
+    movdqa      xmm15,xmm2
+    punpckhbw   xmm2,xmm1
+    punpckhbw   xmm6,xmm1
+    punpcklbw   xmm4,xmm1
+    movsx       eax,r11w
+    mov         word [rsp+0Eh],cx
+    mov         word [rsp+0Ch],cx
+    movdqa      xmm3,xmm7
+    movdqa      xmm8,xmm7
+    movdqa      [rsp+20h],xmm7
+    punpcklbw   xmm15,xmm1
+    punpcklbw   xmm13,xmm1
+    punpcklbw   xmm3,xmm1
+    mov         word [rsp+0Ah],dx
+    mov         word [rsp+8],dx
+    mov         word [rsp+6],r8w
+    movd        xmm0,eax
+    movdqa      [rsp+30h],xmm6
+    punpckhbw   xmm9,xmm1
+    punpckhbw   xmm8,xmm1
+    punpcklwd   xmm0,xmm0
+    movsx       eax,word [rsp+1C0h+160]   ; iBeta
+    mov         word [rsp+4],r8w
+    mov         word [rsp+2],r9w
+    pshufd      xmm12,xmm0,0
+    mov         word [rsp],r9w
+    movd        xmm0,eax
+    mov         eax,4
+    cwde
+    movdqa      xmm14, [rsp]
+    movdqa      [rsp],xmm2
+    movdqa      xmm2,xmm12
+    punpcklwd   xmm0,xmm0
+    pshufd      xmm11,xmm0,0
+    psubw       xmm10,xmm14
+    movd        xmm0,eax
+    movdqa      xmm7,xmm14
+    movdqa      xmm6,xmm14
+    pcmpgtw     xmm7,xmm1
+    punpcklwd   xmm0,xmm0
+    pshufd      xmm5,xmm0,0
+    movdqa      xmm0,xmm4
+    movdqa      xmm1,xmm15
+    psubw       xmm4,xmm13
+    psubw       xmm0,xmm3
+    psubw       xmm1,xmm13
+    psubw       xmm3,xmm15
+    psllw       xmm1,2
+    paddw       xmm1,xmm0
+    paddw       xmm1,xmm5
+    movdqa      xmm0,xmm10
+    psraw       xmm1,3
+    pmaxsw      xmm0,xmm1
+    pminsw      xmm6,xmm0
+    movdqa      xmm1,xmm11
+    movdqa      xmm0,xmm13
+    psubw       xmm0,xmm15
+    pabsw       xmm0,xmm0
+    pcmpgtw     xmm2,xmm0
+    pabsw       xmm0,xmm4
+    pcmpgtw     xmm1,xmm0
+    pabsw       xmm0,xmm3
+    pand        xmm2,xmm1
+    movdqa      xmm1,xmm11
+    movdqa      xmm3,[rsp+30h]
+    pcmpgtw     xmm1,xmm0
+    movdqa      xmm0,xmm9
+    pand        xmm2,xmm1
+    psubw       xmm0,xmm8
+    psubw       xmm9,xmm3
+    pand        xmm2,xmm7
+    pand        xmm6,xmm2
+    psubw       xmm15,xmm6
+    paddw       xmm13,xmm6
+    movdqa      xmm2,[rsp]
+    movdqa      xmm1,xmm2
+    psubw       xmm1,xmm3
+    psubw       xmm8,xmm2
+    psllw       xmm1,2
+    paddw       xmm1,xmm0
+    paddw       xmm1,xmm5
+    movdqa      xmm0,xmm3
+    movdqa      xmm5,[rsp+10h]
+    psubw       xmm0,xmm2
+    psraw       xmm1,3
+    movdqa      xmm4,xmm5
+    pabsw       xmm0,xmm0
+    pmaxsw      xmm10,xmm1
+    movdqa      xmm1,xmm11
+    pcmpgtw     xmm12,xmm0
+    pabsw       xmm0,xmm9
+    pminsw      xmm14,xmm10
+    pcmpgtw     xmm1,xmm0
+    pabsw       xmm0,xmm8
+    pcmpgtw     xmm11,xmm0
+    pand        xmm12,xmm1
+    movdqa      xmm1,[rsp+20h]
+    pand        xmm12,xmm11
+    pand        xmm12,xmm7
+    pand        xmm14,xmm12
+    paddw       xmm3,xmm14
+    psubw       xmm2,xmm14
+    packuswb    xmm13,xmm3
+    packuswb    xmm15,xmm2
+    punpcklbw   xmm4,xmm13
+    punpckhbw   xmm5,xmm13
+    movdqa      xmm0,xmm15
+    punpcklbw   xmm0,xmm1
+    punpckhbw   xmm15,xmm1
+    movdqa      xmm3,xmm4
+    punpcklwd   xmm3,xmm0
+    punpckhwd   xmm4,xmm0
+    movdqa      xmm0,xmm5
+    movdqa      xmm2,xmm3
+    movdqa      xmm1,xmm4
+    punpcklwd   xmm0,xmm15
+    punpckhwd   xmm5,xmm15
+    punpckldq   xmm2,xmm0
+    punpckhdq   xmm3,xmm0
+    punpckldq   xmm1,xmm5
+    movdqa      xmm0,xmm2
+    punpcklqdq  xmm0,xmm1
+    punpckhdq   xmm4,xmm5
+    punpckhqdq  xmm2,xmm1
+    movdqa      [rsp+40h],xmm0
+    movdqa      xmm0,xmm3
+    movdqa      [rsp+90h],xmm2
+    mov         eax,[rsp+40h]
+    mov         [rdi-2],eax
+    mov         eax, [rsp+90h]
+    punpcklqdq  xmm0,xmm4
+    punpckhqdq  xmm3,xmm4
+    mov         [rsi+rdi-2],eax
+    movdqa      [rsp+50h],xmm0
+    mov         eax,[rsp+50h]
+    movdqa      [rsp+0A0h],xmm3
+    mov         [rdi+rsi*2-2],eax
+    mov         eax,[rsp+0A0h]
+    mov         [r10+rdi-2],eax
+    mov         eax,[rsp+48h]
+    mov         [rbx],eax
+    mov         eax,[rsp+98h]
+    mov         [rsi+rbx],eax
+    mov         eax,[rsp+58h]
+    mov         [rbx+rsi*2],eax
+    mov         eax, [rsp+0A8h]
+    mov         [r10+rbx],eax
+    mov         eax, [rsp+44h]
+    mov         [r12-2],eax
+    mov         eax,[rsp+94h]
+    mov         [rsi+r12-2],eax
+    mov         eax,[rsp+54h]
+    mov         [r12+rsi*2-2],eax
+    mov         eax, [rsp+0A4h]
+    mov         [r10+r12-2],eax
+    mov         eax,[rsp+4Ch]
+    mov         [rbp],eax
+    mov         eax,[rsp+9Ch]
+    mov         [rsi+rbp],eax
+    mov         eax, [rsp+5Ch]
+    mov         [rbp+rsi*2],eax
+    mov         eax,[rsp+0ACh]
+    mov         [r10+rbp],eax
+    lea         r11,[rsp+170h]
+    mov         rsp,r11
+    POP_XMM
+    pop         r12
+    pop         rdi
+    pop         rsi
+    pop         rbp
+    pop         rbx
+    ret
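
The SSSE3 routines in this hunk, and the Lt4 functions in the next one, vectorize the normative H.264 bS < 4 ("weak") edge filter: bytes are widened to words with punpcklbw/punpckhbw against a zero register, the clipped delta is built with the psllw/paddw/psraw sequence followed by pmaxsw/pminsw, and the result is gated through pcmpgtw/pand masks for the alpha, beta and tc thresholds. As a reading aid, a minimal scalar sketch of the per-sample arithmetic follows; it is illustrative only, not code from this tree, and the names are hypothetical.

#include <stdint.h>
#include <stdlib.h>

/* Clip3 from the H.264 spec. */
static int clip3(int lo, int hi, int v) {
    return v < lo ? lo : (v > hi ? hi : v);
}

/* Weak (bS < 4) filtering of one p0/q0 pair across an edge. */
static void weak_filter_sample(uint8_t *p0, uint8_t *q0, int p1, int q1,
                               int alpha, int beta, int tc) {
    if (tc > 0 &&                     /* tc == 0 would clip the delta to zero anyway */
        abs(*p0 - *q0) < alpha &&
        abs(p1 - *p0)  < beta  &&
        abs(q1 - *q0)  < beta) {
        /* ((q0 - p0)*4 + (p1 - q1) + 4) >> 3, clamped to [-tc, tc] */
        int d = clip3(-tc, tc, ((*q0 - *p0) * 4 + (p1 - q1) + 4) >> 3);
        *p0 = (uint8_t)clip3(0, 255, *p0 + d);
        *q0 = (uint8_t)clip3(0, 255, *q0 - d);
    }
}

The horizontal variant above additionally transposes the 4-byte column loads into rows with the punpckl/punpckh ladders before filtering, which is why it ends with the long run of dword stores back into the strided destination.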
 
 
 
@@ -1638,1591 +1638,1591 @@
 
 
 WELS_EXTERN   DeblockLumaLt4V_ssse3
-  push        rbp
-  mov         r11,r8  ; pTC
-  sub         rsp,1B0h
-  lea         rbp,[rsp+20h]
-  movd        xmm4,edx
-  movd        xmm2,ecx
-  mov         qword [rbp+180h],r12
-  mov         r10,rdi
-  movsxd      r12,esi
-  add         rsi,rsi
-  movsxd      rdx,esi
-  sub         r10,r12
-  movsx       r8d,byte [r11]
-  pxor        xmm3,xmm3
-  punpcklwd   xmm2,xmm2
-  movaps      [rbp+50h],xmm14
-  lea         rax,[r12+r12*2]
-  movdqa      xmm14,[rdx+rdi]
-  neg         rax
-  pshufd      xmm0,xmm2,0
-  movd        xmm2,r8d
-  movsx       rsi,byte [r11+1]
-  movsx       r8d,byte [r11+2]
-  movsx       r11d,byte [r11+3]
-  movaps      [rbp+70h],xmm12
-  movd        xmm1,esi
-  movaps      [rbp+80h],xmm11
-  movd        xmm12,r8d
-  movd        xmm11,r11d
-  movdqa      xmm5, [rax+rdi]
-  lea         rax,[r12+r12]
-  punpcklwd   xmm12,xmm12
-  neg         rax
-  punpcklwd   xmm11,xmm11
-  movaps      [rbp],xmm8
-  movdqa      xmm8, [r10]
-  punpcklwd   xmm2,xmm2
-  punpcklwd   xmm1,xmm1
-  punpcklqdq  xmm12,xmm12
-  punpcklqdq  xmm11,xmm11
-  punpcklqdq  xmm2,xmm2
-  punpcklqdq  xmm1,xmm1
-  shufps      xmm12,xmm11,88h
-  movdqa      xmm11,xmm8
-  movaps      [rbp+30h],xmm9
-  movdqa      xmm9,[rdi]
-  shufps      xmm2,xmm1,88h
-  movdqa      xmm1,xmm5
-  punpcklbw   xmm11,xmm3
-  movaps      [rbp+20h],xmm6
-  movaps      [rbp+60h],xmm13
-  movdqa      xmm13,xmm11
-  movaps      [rbp+90h],xmm10
-  movdqa      xmm10,xmm9
-  movdqa      xmm6,[rax+rdi]
-  punpcklbw   xmm1,xmm3
-  movaps      [rbp+0A0h],xmm12
-  psubw       xmm13,xmm1
-  movaps      [rbp+40h],xmm15
-  movdqa      xmm15,xmm14
-  movaps      [rbp+10h],xmm7
-  movdqa      xmm7,xmm6
-  punpcklbw   xmm10,xmm3
-  movdqa      xmm12,[r12+rdi]
-  punpcklbw   xmm7,xmm3
-  punpcklbw   xmm12,xmm3
-  punpcklbw   xmm15,xmm3
-  pabsw       xmm3,xmm13
-  movdqa      xmm13,xmm10
-  psubw       xmm13,xmm15
-  movdqa      [rbp+0F0h],xmm15
-  pabsw       xmm15,xmm13
-  movdqa      xmm13,xmm11
-  movdqa      [rbp+0B0h],xmm1
-  movdqa      xmm1,xmm0
-  pavgw       xmm13,xmm10
-  pcmpgtw     xmm1,xmm3
-  movdqa      [rbp+120h],xmm13
-  movaps      xmm13,xmm2
-  punpcklwd   xmm4,xmm4
-  movdqa      xmm3,xmm0
-  movdqa      [rbp+100h],xmm1
-  psubw       xmm13,xmm1
-  movdqa      xmm1,xmm10
-  pcmpgtw     xmm3,xmm15
-  pshufd      xmm4,xmm4,0
-  psubw       xmm1,xmm11
-  movdqa      [rbp+0D0h],xmm10
-  psubw       xmm13,xmm3
-  movdqa      [rbp+110h],xmm3
-  pabsw       xmm15,xmm1
-  movdqa      xmm3,xmm4
-  psubw       xmm10,xmm12
-  pcmpgtw     xmm3,xmm15
-  pabsw       xmm15,xmm10
-  movdqa      xmm10,xmm0
-  psllw       xmm1,2
-  movdqa      [rbp+0C0h],xmm11
-  psubw       xmm11,xmm7
-  pcmpgtw     xmm10,xmm15
-  pabsw       xmm11,xmm11
-  movdqa      xmm15,xmm0
-  pand        xmm3,xmm10
-  pcmpgtw     xmm15,xmm11
-  movaps      xmm11,xmm2
-  pxor        xmm10,xmm10
-  pand        xmm3,xmm15
-  pcmpgtw     xmm11,xmm10
-  pcmpeqw     xmm10,xmm2
-  por         xmm11,xmm10
-  pand        xmm3,xmm11
-  movdqa      xmm11,xmm7
-  psubw       xmm11,xmm12
-  pxor        xmm15,xmm15
-  paddw       xmm11,xmm1
-  psubw       xmm15,xmm13
-  movdqa      [rbp+0E0h],xmm12
-  paddw       xmm11,[FOUR_16B_SSE2]
-  pxor        xmm12,xmm12
-  psraw       xmm11,3
-  punpckhbw   xmm8,xmm12
-  pmaxsw      xmm15,xmm11
-  punpckhbw   xmm5,xmm12
-  movdqa      xmm11,xmm8
-  pminsw      xmm13,xmm15
-  psubw       xmm11,xmm5
-  punpckhbw   xmm9,xmm12
-  pand        xmm13,xmm3
-  movdqa      [rbp+130h],xmm13
-  pabsw       xmm13,xmm11
-  punpckhbw   xmm14,xmm12
-  movdqa      xmm11,xmm9
-  psubw       xmm11,xmm14
-  movdqa      xmm15,xmm0
-  movdqa      [rbp+140h],xmm14
-  pabsw       xmm14,xmm11
-  movdqa      xmm11,xmm8
-  pcmpgtw     xmm15,xmm14
-  movdqa      xmm1,[r12+rdi]
-  pavgw       xmm11,xmm9
-  movdqa      [rbp+170h],xmm11
-  movdqa      xmm10,xmm9
-  punpckhbw   xmm6,xmm12
-  psubw       xmm10,xmm8
-  punpckhbw   xmm1,xmm12
-  movdqa      xmm12,xmm0
-  movaps      xmm11,[rbp+0A0h]
-  pcmpgtw     xmm12,xmm13
-  movaps      xmm13,xmm11
-  psubw       xmm13,xmm12
-  movdqa      [rbp+160h],xmm15
-  psubw       xmm13,xmm15
-  movdqa      xmm15,xmm9
-  psubw       xmm15,xmm1
-  movdqa      [rbp+150h],xmm12
-  pabsw       xmm12,xmm10
-  pabsw       xmm14,xmm15
-  movdqa      xmm15,xmm8
-  pcmpgtw     xmm4,xmm12
-  movdqa      xmm12,xmm0
-  psubw       xmm15,xmm6
-  pcmpgtw     xmm12,xmm14
-  pabsw       xmm14,xmm15
-  psllw       xmm10,2
-  pcmpgtw     xmm0,xmm14
-  movdqa      xmm14,xmm6
-  psubw       xmm14,xmm1
-  pand        xmm4,xmm12
-  paddw       xmm14,xmm10
-  pand        xmm4,xmm0
-  paddw       xmm14,[FOUR_16B_SSE2]
-  pxor        xmm15,xmm15
-  movaps      xmm12,xmm11
-  psubw       xmm15,xmm13
-  pxor        xmm0,xmm0
-  psraw       xmm14,3
-  pcmpgtw     xmm12,xmm0
-  pcmpeqw     xmm0,xmm11
-  pmaxsw      xmm15,xmm14
-  por         xmm12,xmm0
-  movdqa      xmm0,[rbp+120h]
-  pminsw      xmm13,xmm15
-  movdqa      xmm15,[rbp+0B0h]
-  movdqa      xmm10,xmm7
-  pand        xmm4,xmm12
-  paddw       xmm15,xmm0
-  pxor        xmm12,xmm12
-  paddw       xmm10,xmm7
-  movdqa      xmm14,xmm12
-  psubw       xmm15,xmm10
-  psubw       xmm14,xmm2
-  psraw       xmm15,1
-  pmaxsw      xmm15,xmm14
-  movdqa      xmm10,xmm6
-  pminsw      xmm15,xmm2
-  paddw       xmm10,xmm6
-  pand        xmm15,xmm3
-  psubw       xmm12,xmm11
-  pand        xmm15,[rbp+100h]
-  pand        xmm13,xmm4
-  paddw       xmm7,xmm15
-  paddw       xmm8,xmm13
-  movdqa      xmm15,[rbp+170h]
-  psubw       xmm9,xmm13
-  paddw       xmm5,xmm15
-  psubw       xmm5,xmm10
-  psraw       xmm5,1
-  pmaxsw      xmm5,xmm12
-  pminsw      xmm5,xmm11
-  pand        xmm5,xmm4
-  pand        xmm5,[rbp+150h]
-  paddw       xmm6,xmm5
-  movdqa      xmm5,[rbp+0C0h]
-  packuswb    xmm7,xmm6
-  movdqa      xmm6,[rbp+130h]
-  paddw       xmm5,xmm6
-  packuswb    xmm5,xmm8
-  movdqa      xmm8,[rbp+0D0h]
-  psubw       xmm8,xmm6
-  movdqa      xmm6,[rbp+0F0h]
-  paddw       xmm6,xmm0
-  movdqa      xmm0,[rbp+0E0h]
-  packuswb    xmm8,xmm9
-  movdqa      xmm9,xmm0
-  paddw       xmm9,xmm0
-  psubw       xmm6,xmm9
-  psraw       xmm6,1
-  pmaxsw      xmm14,xmm6
-  pminsw      xmm2,xmm14
-  pand        xmm2,xmm3
-  pand        xmm2,[rbp+110h]
-  paddw       xmm0,xmm2
-  movdqa      xmm2,[rbp+140h]
-  paddw       xmm2,xmm15
-  movdqa      xmm15,xmm1
-  paddw       xmm15,xmm1
-  psubw       xmm2,xmm15
-  psraw       xmm2,1
-  pmaxsw      xmm12,xmm2
-  pminsw      xmm11,xmm12
-  pand        xmm11,xmm4
-  pand        xmm11,[rbp+160h]
-  paddw       xmm1,xmm11
-  movdqa      [rax+rdi],xmm7
-  movdqa      [r10],xmm5
-  packuswb    xmm0,xmm1
-  movdqa      [rdi],xmm8
-  movdqa      [r12+rdi],xmm0
-  mov         r12,qword [rbp+180h]
-  lea         rsp,[rbp+190h]
-  pop         rbp
-  ret
+    push        rbp
+    mov         r11,r8  ; pTC
+    sub         rsp,1B0h
+    lea         rbp,[rsp+20h]
+    movd        xmm4,edx
+    movd        xmm2,ecx
+    mov         qword [rbp+180h],r12
+    mov         r10,rdi
+    movsxd      r12,esi
+    add         rsi,rsi
+    movsxd      rdx,esi
+    sub         r10,r12
+    movsx       r8d,byte [r11]
+    pxor        xmm3,xmm3
+    punpcklwd   xmm2,xmm2
+    movaps      [rbp+50h],xmm14
+    lea         rax,[r12+r12*2]
+    movdqa      xmm14,[rdx+rdi]
+    neg         rax
+    pshufd      xmm0,xmm2,0
+    movd        xmm2,r8d
+    movsx       rsi,byte [r11+1]
+    movsx       r8d,byte [r11+2]
+    movsx       r11d,byte [r11+3]
+    movaps      [rbp+70h],xmm12
+    movd        xmm1,esi
+    movaps      [rbp+80h],xmm11
+    movd        xmm12,r8d
+    movd        xmm11,r11d
+    movdqa      xmm5, [rax+rdi]
+    lea         rax,[r12+r12]
+    punpcklwd   xmm12,xmm12
+    neg         rax
+    punpcklwd   xmm11,xmm11
+    movaps      [rbp],xmm8
+    movdqa      xmm8, [r10]
+    punpcklwd   xmm2,xmm2
+    punpcklwd   xmm1,xmm1
+    punpcklqdq  xmm12,xmm12
+    punpcklqdq  xmm11,xmm11
+    punpcklqdq  xmm2,xmm2
+    punpcklqdq  xmm1,xmm1
+    shufps      xmm12,xmm11,88h
+    movdqa      xmm11,xmm8
+    movaps      [rbp+30h],xmm9
+    movdqa      xmm9,[rdi]
+    shufps      xmm2,xmm1,88h
+    movdqa      xmm1,xmm5
+    punpcklbw   xmm11,xmm3
+    movaps      [rbp+20h],xmm6
+    movaps      [rbp+60h],xmm13
+    movdqa      xmm13,xmm11
+    movaps      [rbp+90h],xmm10
+    movdqa      xmm10,xmm9
+    movdqa      xmm6,[rax+rdi]
+    punpcklbw   xmm1,xmm3
+    movaps      [rbp+0A0h],xmm12
+    psubw       xmm13,xmm1
+    movaps      [rbp+40h],xmm15
+    movdqa      xmm15,xmm14
+    movaps      [rbp+10h],xmm7
+    movdqa      xmm7,xmm6
+    punpcklbw   xmm10,xmm3
+    movdqa      xmm12,[r12+rdi]
+    punpcklbw   xmm7,xmm3
+    punpcklbw   xmm12,xmm3
+    punpcklbw   xmm15,xmm3
+    pabsw       xmm3,xmm13
+    movdqa      xmm13,xmm10
+    psubw       xmm13,xmm15
+    movdqa      [rbp+0F0h],xmm15
+    pabsw       xmm15,xmm13
+    movdqa      xmm13,xmm11
+    movdqa      [rbp+0B0h],xmm1
+    movdqa      xmm1,xmm0
+    pavgw       xmm13,xmm10
+    pcmpgtw     xmm1,xmm3
+    movdqa      [rbp+120h],xmm13
+    movaps      xmm13,xmm2
+    punpcklwd   xmm4,xmm4
+    movdqa      xmm3,xmm0
+    movdqa      [rbp+100h],xmm1
+    psubw       xmm13,xmm1
+    movdqa      xmm1,xmm10
+    pcmpgtw     xmm3,xmm15
+    pshufd      xmm4,xmm4,0
+    psubw       xmm1,xmm11
+    movdqa      [rbp+0D0h],xmm10
+    psubw       xmm13,xmm3
+    movdqa      [rbp+110h],xmm3
+    pabsw       xmm15,xmm1
+    movdqa      xmm3,xmm4
+    psubw       xmm10,xmm12
+    pcmpgtw     xmm3,xmm15
+    pabsw       xmm15,xmm10
+    movdqa      xmm10,xmm0
+    psllw       xmm1,2
+    movdqa      [rbp+0C0h],xmm11
+    psubw       xmm11,xmm7
+    pcmpgtw     xmm10,xmm15
+    pabsw       xmm11,xmm11
+    movdqa      xmm15,xmm0
+    pand        xmm3,xmm10
+    pcmpgtw     xmm15,xmm11
+    movaps      xmm11,xmm2
+    pxor        xmm10,xmm10
+    pand        xmm3,xmm15
+    pcmpgtw     xmm11,xmm10
+    pcmpeqw     xmm10,xmm2
+    por         xmm11,xmm10
+    pand        xmm3,xmm11
+    movdqa      xmm11,xmm7
+    psubw       xmm11,xmm12
+    pxor        xmm15,xmm15
+    paddw       xmm11,xmm1
+    psubw       xmm15,xmm13
+    movdqa      [rbp+0E0h],xmm12
+    paddw       xmm11,[FOUR_16B_SSE2]
+    pxor        xmm12,xmm12
+    psraw       xmm11,3
+    punpckhbw   xmm8,xmm12
+    pmaxsw      xmm15,xmm11
+    punpckhbw   xmm5,xmm12
+    movdqa      xmm11,xmm8
+    pminsw      xmm13,xmm15
+    psubw       xmm11,xmm5
+    punpckhbw   xmm9,xmm12
+    pand        xmm13,xmm3
+    movdqa      [rbp+130h],xmm13
+    pabsw       xmm13,xmm11
+    punpckhbw   xmm14,xmm12
+    movdqa      xmm11,xmm9
+    psubw       xmm11,xmm14
+    movdqa      xmm15,xmm0
+    movdqa      [rbp+140h],xmm14
+    pabsw       xmm14,xmm11
+    movdqa      xmm11,xmm8
+    pcmpgtw     xmm15,xmm14
+    movdqa      xmm1,[r12+rdi]
+    pavgw       xmm11,xmm9
+    movdqa      [rbp+170h],xmm11
+    movdqa      xmm10,xmm9
+    punpckhbw   xmm6,xmm12
+    psubw       xmm10,xmm8
+    punpckhbw   xmm1,xmm12
+    movdqa      xmm12,xmm0
+    movaps      xmm11,[rbp+0A0h]
+    pcmpgtw     xmm12,xmm13
+    movaps      xmm13,xmm11
+    psubw       xmm13,xmm12
+    movdqa      [rbp+160h],xmm15
+    psubw       xmm13,xmm15
+    movdqa      xmm15,xmm9
+    psubw       xmm15,xmm1
+    movdqa      [rbp+150h],xmm12
+    pabsw       xmm12,xmm10
+    pabsw       xmm14,xmm15
+    movdqa      xmm15,xmm8
+    pcmpgtw     xmm4,xmm12
+    movdqa      xmm12,xmm0
+    psubw       xmm15,xmm6
+    pcmpgtw     xmm12,xmm14
+    pabsw       xmm14,xmm15
+    psllw       xmm10,2
+    pcmpgtw     xmm0,xmm14
+    movdqa      xmm14,xmm6
+    psubw       xmm14,xmm1
+    pand        xmm4,xmm12
+    paddw       xmm14,xmm10
+    pand        xmm4,xmm0
+    paddw       xmm14,[FOUR_16B_SSE2]
+    pxor        xmm15,xmm15
+    movaps      xmm12,xmm11
+    psubw       xmm15,xmm13
+    pxor        xmm0,xmm0
+    psraw       xmm14,3
+    pcmpgtw     xmm12,xmm0
+    pcmpeqw     xmm0,xmm11
+    pmaxsw      xmm15,xmm14
+    por         xmm12,xmm0
+    movdqa      xmm0,[rbp+120h]
+    pminsw      xmm13,xmm15
+    movdqa      xmm15,[rbp+0B0h]
+    movdqa      xmm10,xmm7
+    pand        xmm4,xmm12
+    paddw       xmm15,xmm0
+    pxor        xmm12,xmm12
+    paddw       xmm10,xmm7
+    movdqa      xmm14,xmm12
+    psubw       xmm15,xmm10
+    psubw       xmm14,xmm2
+    psraw       xmm15,1
+    pmaxsw      xmm15,xmm14
+    movdqa      xmm10,xmm6
+    pminsw      xmm15,xmm2
+    paddw       xmm10,xmm6
+    pand        xmm15,xmm3
+    psubw       xmm12,xmm11
+    pand        xmm15,[rbp+100h]
+    pand        xmm13,xmm4
+    paddw       xmm7,xmm15
+    paddw       xmm8,xmm13
+    movdqa      xmm15,[rbp+170h]
+    psubw       xmm9,xmm13
+    paddw       xmm5,xmm15
+    psubw       xmm5,xmm10
+    psraw       xmm5,1
+    pmaxsw      xmm5,xmm12
+    pminsw      xmm5,xmm11
+    pand        xmm5,xmm4
+    pand        xmm5,[rbp+150h]
+    paddw       xmm6,xmm5
+    movdqa      xmm5,[rbp+0C0h]
+    packuswb    xmm7,xmm6
+    movdqa      xmm6,[rbp+130h]
+    paddw       xmm5,xmm6
+    packuswb    xmm5,xmm8
+    movdqa      xmm8,[rbp+0D0h]
+    psubw       xmm8,xmm6
+    movdqa      xmm6,[rbp+0F0h]
+    paddw       xmm6,xmm0
+    movdqa      xmm0,[rbp+0E0h]
+    packuswb    xmm8,xmm9
+    movdqa      xmm9,xmm0
+    paddw       xmm9,xmm0
+    psubw       xmm6,xmm9
+    psraw       xmm6,1
+    pmaxsw      xmm14,xmm6
+    pminsw      xmm2,xmm14
+    pand        xmm2,xmm3
+    pand        xmm2,[rbp+110h]
+    paddw       xmm0,xmm2
+    movdqa      xmm2,[rbp+140h]
+    paddw       xmm2,xmm15
+    movdqa      xmm15,xmm1
+    paddw       xmm15,xmm1
+    psubw       xmm2,xmm15
+    psraw       xmm2,1
+    pmaxsw      xmm12,xmm2
+    pminsw      xmm11,xmm12
+    pand        xmm11,xmm4
+    pand        xmm11,[rbp+160h]
+    paddw       xmm1,xmm11
+    movdqa      [rax+rdi],xmm7
+    movdqa      [r10],xmm5
+    packuswb    xmm0,xmm1
+    movdqa      [rdi],xmm8
+    movdqa      [r12+rdi],xmm0
+    mov         r12,qword [rbp+180h]
+    lea         rsp,[rbp+190h]
+    pop         rbp
+    ret
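
DeblockLumaEq4V_ssse3 below implements the bS == 4 ("strong") luma filter: the psraw xmm10,2 / paddw sequence builds the (alpha >> 2) + 2 threshold, the long paddw/psllw chains ending in psraw 3 or psraw 2 are the weighted averages, and filtered versus original samples are chosen branchlessly with pand/pandn/por. A scalar sketch of the p-side arithmetic (the q side is symmetric); illustrative only, with hypothetical names.

#include <stdint.h>
#include <stdlib.h>

/* Strong (bS == 4) filtering of the p side of one luma edge sample.
 * Precondition: |p0-q0| < alpha, |p1-p0| < beta, |q1-q0| < beta. */
static void luma_strong_p_side(uint8_t *p2, uint8_t *p1, uint8_t *p0,
                               int p3, int q0, int q1, int alpha, int beta) {
    if (abs(*p2 - *p0) < beta && abs(*p0 - q0) < (alpha >> 2) + 2) {
        /* full smoothing of p0, p1, p2 */
        int np0 = (*p2 + 2 * *p1 + 2 * *p0 + 2 * q0 + q1 + 4) >> 3;
        int np1 = (*p2 + *p1 + *p0 + q0 + 2) >> 2;
        int np2 = (2 * p3 + 3 * *p2 + *p1 + *p0 + q0 + 4) >> 3;
        *p0 = (uint8_t)np0;
        *p1 = (uint8_t)np1;
        *p2 = (uint8_t)np2;
    } else {
        /* fallback: only p0 is touched */
        *p0 = (uint8_t)((2 * *p1 + *p0 + q1 + 2) >> 2);
    }
}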
 
 
 WELS_EXTERN DeblockLumaEq4V_ssse3
-  mov         rax,rsp
-  push        rbx
-  push        rbp
-  mov         r8,   rdx
-  mov         r9,   rcx
-  mov         rcx,  rdi
-  mov         rdx,  rsi
-  sub         rsp,1D8h
-  movaps      [rax-38h],xmm6
-  movaps      [rax-48h],xmm7
-  movaps      [rax-58h],xmm8
-  pxor        xmm1,xmm1
-  movsxd      r10,edx
-  mov         rbp,rcx
-  mov         r11d,r8d
-  mov         rdx,rcx
-  mov         rdi,rbp
-  mov         rbx,rbp
-  movdqa      xmm5,[rbp]
-  movaps      [rax-68h],xmm9
-  movaps      [rax-78h],xmm10
-  punpcklbw   xmm5,xmm1
-  movaps      [rax-88h],xmm11
-  movaps      [rax-98h],xmm12
-  movaps      [rax-0A8h],xmm13
-  movaps      [rax-0B8h],xmm14
-  movdqa      xmm14,[r10+rbp]
-  movaps      [rax-0C8h],xmm15
-  lea         eax,[r10*4]
-  movsxd      r8,eax
-  lea         eax,[r10+r10*2]
-  movsxd      rcx,eax
-  lea         eax,[r10+r10]
-  sub         rdx,r8
-  punpcklbw   xmm14,xmm1
-  movdqa      [rsp+90h],xmm5
-  movdqa      [rsp+30h],xmm14
-  movsxd      rsi,eax
-  movsx       eax,r11w
-  sub         rdi,rcx
-  sub         rbx,rsi
-  mov         r8,rbp
-  sub         r8,r10
-  movd        xmm0,eax
-  movsx       eax,r9w
-  movdqa      xmm12,[rdi]
-  movdqa      xmm6, [rsi+rbp]
-  movdqa      xmm13,[rbx]
-  punpcklwd   xmm0,xmm0
-  pshufd      xmm11,xmm0,0
-  punpcklbw   xmm13,xmm1
-  punpcklbw   xmm6,xmm1
-  movdqa      xmm8,[r8]
-  movd        xmm0,eax
-  movdqa      xmm10,xmm11
-  mov         eax,2
-  punpcklbw   xmm8,xmm1
-  punpcklbw   xmm12,xmm1
-  cwde
-  punpcklwd   xmm0,xmm0
-  psraw       xmm10,2
-  movdqa      xmm1,xmm8
-  movdqa      [rsp+0F0h],xmm13
-  movdqa      [rsp+0B0h],xmm8
-  pshufd      xmm7,xmm0,0
-  psubw       xmm1,xmm13
-  movdqa      xmm0,xmm5
-  movdqa      xmm4,xmm7
-  movdqa      xmm2,xmm7
-  psubw       xmm0,xmm8
-  pabsw       xmm3,xmm0
-  pabsw       xmm0,xmm1
-  movdqa      xmm1,xmm5
-  movdqa      [rsp+40h],xmm7
-  movdqa      [rsp+60h],xmm6
-  pcmpgtw     xmm4,xmm0
-  psubw       xmm1,xmm14
-  pabsw       xmm0,xmm1
-  pcmpgtw     xmm2,xmm0
-  pand        xmm4,xmm2
-  movdqa      xmm0,xmm11
-  pcmpgtw     xmm0,xmm3
-  pand        xmm4,xmm0
-  movd        xmm0,eax
-  movdqa      [rsp+20h],xmm4
-  punpcklwd   xmm0,xmm0
-  pshufd      xmm2,xmm0,0
-  paddw       xmm10,xmm2
-  movdqa      [rsp+0A0h],xmm2
-  movdqa      xmm15,xmm7
-  pxor        xmm4,xmm4
-  movdqa      xmm0,xmm8
-  psubw       xmm0,xmm12
-  mov         eax,4
-  pabsw       xmm0,xmm0
-  movdqa      xmm1,xmm10
-  cwde
-  pcmpgtw     xmm15,xmm0
-  pcmpgtw     xmm1,xmm3
-  movdqa      xmm3,xmm7
-  movdqa      xmm7,[rdx]
-  movdqa      xmm0,xmm5
-  psubw       xmm0,xmm6
-  pand        xmm15,xmm1
-  punpcklbw   xmm7,xmm4
-  movdqa      xmm9,xmm15
-  pabsw       xmm0,xmm0
-  psllw       xmm7,1
-  pandn       xmm9,xmm12
-  pcmpgtw     xmm3,xmm0
-  paddw       xmm7,xmm12
-  movd        xmm0,eax
-  pand        xmm3,xmm1
-  paddw       xmm7,xmm12
-  punpcklwd   xmm0,xmm0
-  paddw       xmm7,xmm12
-  pshufd      xmm1,xmm0,0
-  paddw       xmm7,xmm13
-  movdqa      xmm0,xmm3
-  pandn       xmm0,xmm6
-  paddw       xmm7,xmm8
-  movdqa      [rsp+70h],xmm1
-  paddw       xmm7,xmm5
-  movdqa      [rsp+120h],xmm0
-  movdqa      xmm0,[rcx+rbp]
-  punpcklbw   xmm0,xmm4
-  paddw       xmm7,xmm1
-  movdqa      xmm4,xmm15
-  psllw       xmm0,1
-  psraw       xmm7,3
-  paddw       xmm0,xmm6
-  pand        xmm7,xmm15
-  paddw       xmm0,xmm6
-  paddw       xmm0,xmm6
-  paddw       xmm0,xmm14
-  movdqa      xmm6,xmm15
-  paddw       xmm0,xmm5
-  pandn       xmm6,xmm13
-  paddw       xmm0,xmm8
-  paddw       xmm0,xmm1
-  psraw       xmm0,3
-  movdqa      xmm1,xmm12
-  paddw       xmm1,xmm13
-  pand        xmm0,xmm3
-  movdqa      [rsp+100h],xmm0
-  movdqa      xmm0,xmm8
-  paddw       xmm0,xmm5
-  paddw       xmm1,xmm0
-  movdqa      xmm0,xmm3
-  paddw       xmm1,xmm2
-  psraw       xmm1,2
-  pandn       xmm0,xmm14
-  pand        xmm4,xmm1
-  movdqa      [rsp+0E0h],xmm0
-  movdqa      xmm0,xmm5
-  paddw       xmm0,xmm8
-  movdqa      xmm1,[rsp+60h]
-  paddw       xmm1,xmm14
-  movdqa      xmm14,xmm3
-  paddw       xmm1,xmm0
-  movdqa      xmm0,xmm8
-  paddw       xmm0,[rsp+30h]
-  paddw       xmm1,xmm2
-  psraw       xmm1,2
-  pand        xmm14,xmm1
-  movdqa      xmm1,xmm13
-  paddw       xmm1,xmm13
-  paddw       xmm1,xmm0
-  paddw       xmm1,xmm2
-  psraw       xmm1,2
-  movdqa      xmm0,[rsp+30h]
-  movdqa      xmm2,xmm13
-  movdqa      xmm5,xmm15
-  paddw       xmm0,[rsp+70h]
-  pandn       xmm5,xmm1
-  paddw       xmm2,xmm8
-  movdqa      xmm8,[rsp+90h]
-  movdqa      xmm1,xmm12
-  paddw       xmm2,xmm8
-  psllw       xmm2,1
-  paddw       xmm2,xmm0
-  paddw       xmm1,xmm2
-  movdqa      xmm0,xmm8
-  movdqa      xmm8,xmm3
-  movdqa      xmm2,[rsp+30h]
-  paddw       xmm0,xmm13
-  psraw       xmm1,3
-  pand        xmm15,xmm1
-  movdqa      xmm1,xmm2
-  paddw       xmm1,xmm2
-  paddw       xmm2,[rsp+90h]
-  paddw       xmm2,[rsp+0B0h]
-  paddw       xmm1,xmm0
-  movdqa      xmm0,xmm13
-  movdqa      xmm13,[r8]
-  paddw       xmm0, [rsp+70h]
-  paddw       xmm1, [rsp+0A0h]
-  psllw       xmm2,1
-  paddw       xmm2,xmm0
-  psraw       xmm1,2
-  movdqa      xmm0, [rdi]
-  pandn       xmm8,xmm1
-  movdqa      xmm1, [rsp+60h]
-  paddw       xmm1,xmm2
-  movdqa      xmm2, [rbx]
-  psraw       xmm1,3
-  pand        xmm3,xmm1
-  movdqa      xmm1, [rbp]
-  movdqa      [rsp+0D0h],xmm3
-  pxor        xmm3,xmm3
-  punpckhbw   xmm0,xmm3
-  punpckhbw   xmm1,xmm3
-  punpckhbw   xmm13,xmm3
-  movdqa      [rsp+0C0h],xmm0
-  movdqa      xmm0,[r10+rbp]
-  movdqa      [rsp],xmm1
-  punpckhbw   xmm0,xmm3
-  punpckhbw   xmm2,xmm3
-  movdqa      [rsp+80h],xmm0
-  movdqa      xmm0,[rsi+rbp]
-  movdqa      [rsp+10h],xmm13
-  punpckhbw   xmm0,xmm3
-  movdqa      [rsp+50h],xmm0
-  movdqa      xmm0,xmm1
-  movdqa      xmm1,xmm13
-  psubw       xmm0,xmm13
-  psubw       xmm1,xmm2
-  pabsw       xmm3,xmm0
-  pabsw       xmm0,xmm1
-  movdqa      xmm1,[rsp]
-  movdqa      xmm13,[rsp+40h]
-  movdqa      [rsp+110h],xmm2
-  psubw       xmm1, [rsp+80h]
-  pcmpgtw     xmm13,xmm0
-  pcmpgtw     xmm11,xmm3
-  pabsw       xmm0,xmm1
-  pcmpgtw     xmm10,xmm3
-  movdqa      xmm1, [rsp+40h]
-  movdqa      xmm2,xmm1
-  movdqa      xmm3,xmm1
-  pcmpgtw     xmm2,xmm0
-  movdqa      xmm0, [rsp+10h]
-  pand        xmm13,xmm2
-  pand        xmm13,xmm11
-  movdqa      xmm11,[rsp+0C0h]
-  psubw       xmm0,xmm11
-  pabsw       xmm0,xmm0
-  pcmpgtw     xmm3,xmm0
-  pand        xmm3,xmm10
-  movdqa      xmm0,[rsp]
-  psubw       xmm0,[rsp+50h]
-  movdqa      xmm2,[rdx]
-  pabsw       xmm0,xmm0
-  por         xmm7,xmm9
-  movdqa      xmm9,[rsp+20h]
-  pcmpgtw     xmm1,xmm0
-  pand        xmm9,xmm7
-  movdqa      xmm7,[rsp+20h]
-  movdqa      xmm0,xmm7
-  pandn       xmm0,xmm12
-  movdqa      xmm12,[rsp+110h]
-  pand        xmm1,xmm10
-  movdqa      xmm10,[rsp+70h]
-  movdqa      [rsp+40h],xmm1
-  movdqa      xmm1,xmm13
-  por         xmm9,xmm0
-  pxor        xmm0,xmm0
-  por         xmm4,xmm6
-  movdqa      xmm6,xmm7
-  punpckhbw   xmm2,xmm0
-  por         xmm15,xmm5
-  movdqa      xmm5,[rsp+20h]
-  movdqa      xmm0,xmm3
-  psllw       xmm2,1
-  pandn       xmm0,xmm11
-  pand        xmm6,xmm4
-  movdqa      xmm4,[rsp]
-  paddw       xmm2,xmm11
-  pand        xmm5,xmm15
-  movdqa      xmm15,[rsp+20h]
-  paddw       xmm2,xmm11
-  paddw       xmm2,xmm11
-  paddw       xmm2,xmm12
-  paddw       xmm2,[rsp+10h]
-  paddw       xmm2,[rsp]
-  paddw       xmm2,xmm10
-  psraw       xmm2,3
-  pand        xmm2,xmm3
-  por         xmm2,xmm0
-  pand        xmm1,xmm2
-  movdqa      xmm0,xmm13
-  movdqa      xmm2,xmm11
-  pandn       xmm0,xmm11
-  paddw       xmm2,xmm12
-  por         xmm1,xmm0
-  packuswb    xmm9,xmm1
-  movdqa      xmm0,xmm7
-  movdqa      xmm7,[rsp+0A0h]
-  pandn       xmm0,[rsp+0F0h]
-  movdqa      xmm1,xmm3
-  por         xmm6,xmm0
-  movdqa      xmm0,[rsp+10h]
-  paddw       xmm0,xmm4
-  paddw       xmm2,xmm0
-  paddw       xmm2,xmm7
-  movdqa      xmm0,xmm3
-  pandn       xmm0,xmm12
-  psraw       xmm2,2
-  pand        xmm1,xmm2
-  por         xmm1,xmm0
-  movdqa      xmm2,xmm13
-  movdqa      xmm0,xmm13
-  pand        xmm2,xmm1
-  pandn       xmm0,xmm12
-  movdqa      xmm1,xmm12
-  paddw       xmm1,[rsp+10h]
-  por         xmm2,xmm0
-  movdqa      xmm0,xmm15
-  pandn       xmm0,[rsp+0B0h]
-  paddw       xmm1,xmm4
-  packuswb    xmm6,xmm2
-  movdqa      xmm2,xmm3
-  psllw       xmm1,1
-  por         xmm5,xmm0
-  movdqa      xmm0,[rsp+80h]
-  paddw       xmm0,xmm10
-  paddw       xmm1,xmm0
-  paddw       xmm11,xmm1
-  psraw       xmm11,3
-  movdqa      xmm1,xmm12
-  pand        xmm2,xmm11
-  paddw       xmm1,xmm12
-  movdqa      xmm11,[rsp+80h]
-  movdqa      xmm0, [rsp+10h]
-  por         xmm14,[rsp+0E0h]
-  paddw       xmm0,xmm11
-  movdqa      xmm4,xmm15
-  paddw       xmm1,xmm0
-  movdqa      xmm0,xmm13
-  paddw       xmm1,xmm7
-  psraw       xmm1,2
-  pandn       xmm3,xmm1
-  por         xmm2,xmm3
-  movdqa      xmm1,xmm13
-  movdqa      xmm3,[rsp+10h]
-  pandn       xmm0,xmm3
-  pand        xmm1,xmm2
-  movdqa      xmm2,xmm11
-  paddw       xmm2,[rsp]
-  por         xmm1,xmm0
-  movdqa      xmm0,[rsp+0D0h]
-  por         xmm0,xmm8
-  paddw       xmm2,xmm3
-  packuswb    xmm5,xmm1
-  movdqa      xmm8,[rsp+40h]
-  movdqa      xmm1,[rsp+50h]
-  movdqa      xmm3,xmm8
-  pand        xmm4,xmm0
-  psllw       xmm2,1
-  movdqa      xmm0,xmm15
-  pandn       xmm0,[rsp+90h]
-  por         xmm4,xmm0
-  movdqa      xmm0,xmm12
-  paddw       xmm0,xmm10
-  paddw       xmm2,xmm0
-  paddw       xmm1,xmm2
-  movdqa      xmm0,[rsp]
-  movdqa      xmm2,xmm11
-  paddw       xmm0,xmm12
-  movdqa      xmm12,[rsp]
-  paddw       xmm2,xmm11
-  paddw       xmm2,xmm0
-  psraw       xmm1,3
-  movdqa      xmm0,xmm8
-  pand        xmm3,xmm1
-  paddw       xmm2,xmm7
-  movdqa      xmm1,xmm13
-  psraw       xmm2,2
-  pandn       xmm0,xmm2
-  por         xmm3,xmm0
-  movdqa      xmm2,[rsp+50h]
-  movdqa      xmm0,xmm13
-  pandn       xmm0,xmm12
-  pand        xmm1,xmm3
-  paddw       xmm2,xmm11
-  movdqa      xmm3,xmm15
-  por         xmm1,xmm0
-  pand        xmm3,xmm14
-  movdqa      xmm14,[rsp+10h]
-  movdqa      xmm0,xmm15
-  pandn       xmm0,[rsp+30h]
-  packuswb    xmm4,xmm1
-  movdqa      xmm1,xmm8
-  por         xmm3,xmm0
-  movdqa      xmm0,xmm12
-  paddw       xmm0,xmm14
-  paddw       xmm2,xmm0
-  paddw       xmm2,xmm7
-  movdqa      xmm0,xmm8
-  pandn       xmm0,xmm11
-  psraw       xmm2,2
-  pand        xmm1,xmm2
-  por         xmm1,xmm0
-  movdqa      xmm2,xmm13
-  movdqa      xmm0,xmm13
-  pandn       xmm0,xmm11
-  pand        xmm2,xmm1
-  movdqa      xmm1,xmm15
-  por         xmm2,xmm0
-  packuswb    xmm3,xmm2
-  movdqa      xmm0,[rsp+100h]
-  por         xmm0,[rsp+120h]
-  pand        xmm1,xmm0
-  movdqa      xmm2,[rcx+rbp]
-  movdqa      xmm7,[rsp+50h]
-  pandn       xmm15,[rsp+60h]
-  lea         r11,[rsp+1D8h]
-  pxor        xmm0,xmm0
-  por         xmm1,xmm15
-  movaps      xmm15,[r11-0A8h]
-  movdqa      [rdi],xmm9
-  movaps      xmm9,[r11-48h]
-  punpckhbw   xmm2,xmm0
-  psllw       xmm2,1
-  paddw       xmm2,xmm7
-  paddw       xmm2,xmm7
-  movdqa      [rbx],xmm6
-  movaps      xmm6,[r11-18h]
-  paddw       xmm2,xmm7
-  paddw       xmm2,xmm11
-  movaps      xmm11,[r11-68h]
-  paddw       xmm2,xmm12
-  movaps      xmm12,[r11-78h]
-  paddw       xmm2,xmm14
-  paddw       xmm2,xmm10
-  psraw       xmm2,3
-  movaps      xmm10,[r11-58h]
-  movaps      xmm14,[r11-98h]
-  movdqa      xmm0,xmm13
-  pand        xmm2,xmm8
-  pandn       xmm8,xmm7
-  pandn       xmm13,xmm7
-  por         xmm2,xmm8
-  movaps      xmm7,[r11-28h]
-  movaps      xmm8,[r11-38h]
-  movdqa      [r8],xmm5
-  pand        xmm0,xmm2
-  por         xmm0,xmm13
-  packuswb    xmm1,xmm0
-  movaps      xmm13,[r11-88h]
-  movdqa      [rbp],xmm4
-  movdqa      [r10+rbp],xmm3
-  movdqa      [rsi+rbp],xmm1
-  mov         rsp,r11
-  pop         rbp
-  pop         rbx
-  ret
+    mov         rax,rsp
+    push        rbx
+    push        rbp
+    mov         r8,   rdx
+    mov         r9,   rcx
+    mov         rcx,  rdi
+    mov         rdx,  rsi
+    sub         rsp,1D8h
+    movaps      [rax-38h],xmm6
+    movaps      [rax-48h],xmm7
+    movaps      [rax-58h],xmm8
+    pxor        xmm1,xmm1
+    movsxd      r10,edx
+    mov         rbp,rcx
+    mov         r11d,r8d
+    mov         rdx,rcx
+    mov         rdi,rbp
+    mov         rbx,rbp
+    movdqa      xmm5,[rbp]
+    movaps      [rax-68h],xmm9
+    movaps      [rax-78h],xmm10
+    punpcklbw   xmm5,xmm1
+    movaps      [rax-88h],xmm11
+    movaps      [rax-98h],xmm12
+    movaps      [rax-0A8h],xmm13
+    movaps      [rax-0B8h],xmm14
+    movdqa      xmm14,[r10+rbp]
+    movaps      [rax-0C8h],xmm15
+    lea         eax,[r10*4]
+    movsxd      r8,eax
+    lea         eax,[r10+r10*2]
+    movsxd      rcx,eax
+    lea         eax,[r10+r10]
+    sub         rdx,r8
+    punpcklbw   xmm14,xmm1
+    movdqa      [rsp+90h],xmm5
+    movdqa      [rsp+30h],xmm14
+    movsxd      rsi,eax
+    movsx       eax,r11w
+    sub         rdi,rcx
+    sub         rbx,rsi
+    mov         r8,rbp
+    sub         r8,r10
+    movd        xmm0,eax
+    movsx       eax,r9w
+    movdqa      xmm12,[rdi]
+    movdqa      xmm6, [rsi+rbp]
+    movdqa      xmm13,[rbx]
+    punpcklwd   xmm0,xmm0
+    pshufd      xmm11,xmm0,0
+    punpcklbw   xmm13,xmm1
+    punpcklbw   xmm6,xmm1
+    movdqa      xmm8,[r8]
+    movd        xmm0,eax
+    movdqa      xmm10,xmm11
+    mov         eax,2
+    punpcklbw   xmm8,xmm1
+    punpcklbw   xmm12,xmm1
+    cwde
+    punpcklwd   xmm0,xmm0
+    psraw       xmm10,2
+    movdqa      xmm1,xmm8
+    movdqa      [rsp+0F0h],xmm13
+    movdqa      [rsp+0B0h],xmm8
+    pshufd      xmm7,xmm0,0
+    psubw       xmm1,xmm13
+    movdqa      xmm0,xmm5
+    movdqa      xmm4,xmm7
+    movdqa      xmm2,xmm7
+    psubw       xmm0,xmm8
+    pabsw       xmm3,xmm0
+    pabsw       xmm0,xmm1
+    movdqa      xmm1,xmm5
+    movdqa      [rsp+40h],xmm7
+    movdqa      [rsp+60h],xmm6
+    pcmpgtw     xmm4,xmm0
+    psubw       xmm1,xmm14
+    pabsw       xmm0,xmm1
+    pcmpgtw     xmm2,xmm0
+    pand        xmm4,xmm2
+    movdqa      xmm0,xmm11
+    pcmpgtw     xmm0,xmm3
+    pand        xmm4,xmm0
+    movd        xmm0,eax
+    movdqa      [rsp+20h],xmm4
+    punpcklwd   xmm0,xmm0
+    pshufd      xmm2,xmm0,0
+    paddw       xmm10,xmm2
+    movdqa      [rsp+0A0h],xmm2
+    movdqa      xmm15,xmm7
+    pxor        xmm4,xmm4
+    movdqa      xmm0,xmm8
+    psubw       xmm0,xmm12
+    mov         eax,4
+    pabsw       xmm0,xmm0
+    movdqa      xmm1,xmm10
+    cwde
+    pcmpgtw     xmm15,xmm0
+    pcmpgtw     xmm1,xmm3
+    movdqa      xmm3,xmm7
+    movdqa      xmm7,[rdx]
+    movdqa      xmm0,xmm5
+    psubw       xmm0,xmm6
+    pand        xmm15,xmm1
+    punpcklbw   xmm7,xmm4
+    movdqa      xmm9,xmm15
+    pabsw       xmm0,xmm0
+    psllw       xmm7,1
+    pandn       xmm9,xmm12
+    pcmpgtw     xmm3,xmm0
+    paddw       xmm7,xmm12
+    movd        xmm0,eax
+    pand        xmm3,xmm1
+    paddw       xmm7,xmm12
+    punpcklwd   xmm0,xmm0
+    paddw       xmm7,xmm12
+    pshufd      xmm1,xmm0,0
+    paddw       xmm7,xmm13
+    movdqa      xmm0,xmm3
+    pandn       xmm0,xmm6
+    paddw       xmm7,xmm8
+    movdqa      [rsp+70h],xmm1
+    paddw       xmm7,xmm5
+    movdqa      [rsp+120h],xmm0
+    movdqa      xmm0,[rcx+rbp]
+    punpcklbw   xmm0,xmm4
+    paddw       xmm7,xmm1
+    movdqa      xmm4,xmm15
+    psllw       xmm0,1
+    psraw       xmm7,3
+    paddw       xmm0,xmm6
+    pand        xmm7,xmm15
+    paddw       xmm0,xmm6
+    paddw       xmm0,xmm6
+    paddw       xmm0,xmm14
+    movdqa      xmm6,xmm15
+    paddw       xmm0,xmm5
+    pandn       xmm6,xmm13
+    paddw       xmm0,xmm8
+    paddw       xmm0,xmm1
+    psraw       xmm0,3
+    movdqa      xmm1,xmm12
+    paddw       xmm1,xmm13
+    pand        xmm0,xmm3
+    movdqa      [rsp+100h],xmm0
+    movdqa      xmm0,xmm8
+    paddw       xmm0,xmm5
+    paddw       xmm1,xmm0
+    movdqa      xmm0,xmm3
+    paddw       xmm1,xmm2
+    psraw       xmm1,2
+    pandn       xmm0,xmm14
+    pand        xmm4,xmm1
+    movdqa      [rsp+0E0h],xmm0
+    movdqa      xmm0,xmm5
+    paddw       xmm0,xmm8
+    movdqa      xmm1,[rsp+60h]
+    paddw       xmm1,xmm14
+    movdqa      xmm14,xmm3
+    paddw       xmm1,xmm0
+    movdqa      xmm0,xmm8
+    paddw       xmm0,[rsp+30h]
+    paddw       xmm1,xmm2
+    psraw       xmm1,2
+    pand        xmm14,xmm1
+    movdqa      xmm1,xmm13
+    paddw       xmm1,xmm13
+    paddw       xmm1,xmm0
+    paddw       xmm1,xmm2
+    psraw       xmm1,2
+    movdqa      xmm0,[rsp+30h]
+    movdqa      xmm2,xmm13
+    movdqa      xmm5,xmm15
+    paddw       xmm0,[rsp+70h]
+    pandn       xmm5,xmm1
+    paddw       xmm2,xmm8
+    movdqa      xmm8,[rsp+90h]
+    movdqa      xmm1,xmm12
+    paddw       xmm2,xmm8
+    psllw       xmm2,1
+    paddw       xmm2,xmm0
+    paddw       xmm1,xmm2
+    movdqa      xmm0,xmm8
+    movdqa      xmm8,xmm3
+    movdqa      xmm2,[rsp+30h]
+    paddw       xmm0,xmm13
+    psraw       xmm1,3
+    pand        xmm15,xmm1
+    movdqa      xmm1,xmm2
+    paddw       xmm1,xmm2
+    paddw       xmm2,[rsp+90h]
+    paddw       xmm2,[rsp+0B0h]
+    paddw       xmm1,xmm0
+    movdqa      xmm0,xmm13
+    movdqa      xmm13,[r8]
+    paddw       xmm0, [rsp+70h]
+    paddw       xmm1, [rsp+0A0h]
+    psllw       xmm2,1
+    paddw       xmm2,xmm0
+    psraw       xmm1,2
+    movdqa      xmm0, [rdi]
+    pandn       xmm8,xmm1
+    movdqa      xmm1, [rsp+60h]
+    paddw       xmm1,xmm2
+    movdqa      xmm2, [rbx]
+    psraw       xmm1,3
+    pand        xmm3,xmm1
+    movdqa      xmm1, [rbp]
+    movdqa      [rsp+0D0h],xmm3
+    pxor        xmm3,xmm3
+    punpckhbw   xmm0,xmm3
+    punpckhbw   xmm1,xmm3
+    punpckhbw   xmm13,xmm3
+    movdqa      [rsp+0C0h],xmm0
+    movdqa      xmm0,[r10+rbp]
+    movdqa      [rsp],xmm1
+    punpckhbw   xmm0,xmm3
+    punpckhbw   xmm2,xmm3
+    movdqa      [rsp+80h],xmm0
+    movdqa      xmm0,[rsi+rbp]
+    movdqa      [rsp+10h],xmm13
+    punpckhbw   xmm0,xmm3
+    movdqa      [rsp+50h],xmm0
+    movdqa      xmm0,xmm1
+    movdqa      xmm1,xmm13
+    psubw       xmm0,xmm13
+    psubw       xmm1,xmm2
+    pabsw       xmm3,xmm0
+    pabsw       xmm0,xmm1
+    movdqa      xmm1,[rsp]
+    movdqa      xmm13,[rsp+40h]
+    movdqa      [rsp+110h],xmm2
+    psubw       xmm1, [rsp+80h]
+    pcmpgtw     xmm13,xmm0
+    pcmpgtw     xmm11,xmm3
+    pabsw       xmm0,xmm1
+    pcmpgtw     xmm10,xmm3
+    movdqa      xmm1, [rsp+40h]
+    movdqa      xmm2,xmm1
+    movdqa      xmm3,xmm1
+    pcmpgtw     xmm2,xmm0
+    movdqa      xmm0, [rsp+10h]
+    pand        xmm13,xmm2
+    pand        xmm13,xmm11
+    movdqa      xmm11,[rsp+0C0h]
+    psubw       xmm0,xmm11
+    pabsw       xmm0,xmm0
+    pcmpgtw     xmm3,xmm0
+    pand        xmm3,xmm10
+    movdqa      xmm0,[rsp]
+    psubw       xmm0,[rsp+50h]
+    movdqa      xmm2,[rdx]
+    pabsw       xmm0,xmm0
+    por         xmm7,xmm9
+    movdqa      xmm9,[rsp+20h]
+    pcmpgtw     xmm1,xmm0
+    pand        xmm9,xmm7
+    movdqa      xmm7,[rsp+20h]
+    movdqa      xmm0,xmm7
+    pandn       xmm0,xmm12
+    movdqa      xmm12,[rsp+110h]
+    pand        xmm1,xmm10
+    movdqa      xmm10,[rsp+70h]
+    movdqa      [rsp+40h],xmm1
+    movdqa      xmm1,xmm13
+    por         xmm9,xmm0
+    pxor        xmm0,xmm0
+    por         xmm4,xmm6
+    movdqa      xmm6,xmm7
+    punpckhbw   xmm2,xmm0
+    por         xmm15,xmm5
+    movdqa      xmm5,[rsp+20h]
+    movdqa      xmm0,xmm3
+    psllw       xmm2,1
+    pandn       xmm0,xmm11
+    pand        xmm6,xmm4
+    movdqa      xmm4,[rsp]
+    paddw       xmm2,xmm11
+    pand        xmm5,xmm15
+    movdqa      xmm15,[rsp+20h]
+    paddw       xmm2,xmm11
+    paddw       xmm2,xmm11
+    paddw       xmm2,xmm12
+    paddw       xmm2,[rsp+10h]
+    paddw       xmm2,[rsp]
+    paddw       xmm2,xmm10
+    psraw       xmm2,3
+    pand        xmm2,xmm3
+    por         xmm2,xmm0
+    pand        xmm1,xmm2
+    movdqa      xmm0,xmm13
+    movdqa      xmm2,xmm11
+    pandn       xmm0,xmm11
+    paddw       xmm2,xmm12
+    por         xmm1,xmm0
+    packuswb    xmm9,xmm1
+    movdqa      xmm0,xmm7
+    movdqa      xmm7,[rsp+0A0h]
+    pandn       xmm0,[rsp+0F0h]
+    movdqa      xmm1,xmm3
+    por         xmm6,xmm0
+    movdqa      xmm0,[rsp+10h]
+    paddw       xmm0,xmm4
+    paddw       xmm2,xmm0
+    paddw       xmm2,xmm7
+    movdqa      xmm0,xmm3
+    pandn       xmm0,xmm12
+    psraw       xmm2,2
+    pand        xmm1,xmm2
+    por         xmm1,xmm0
+    movdqa      xmm2,xmm13
+    movdqa      xmm0,xmm13
+    pand        xmm2,xmm1
+    pandn       xmm0,xmm12
+    movdqa      xmm1,xmm12
+    paddw       xmm1,[rsp+10h]
+    por         xmm2,xmm0
+    movdqa      xmm0,xmm15
+    pandn       xmm0,[rsp+0B0h]
+    paddw       xmm1,xmm4
+    packuswb    xmm6,xmm2
+    movdqa      xmm2,xmm3
+    psllw       xmm1,1
+    por         xmm5,xmm0
+    movdqa      xmm0,[rsp+80h]
+    paddw       xmm0,xmm10
+    paddw       xmm1,xmm0
+    paddw       xmm11,xmm1
+    psraw       xmm11,3
+    movdqa      xmm1,xmm12
+    pand        xmm2,xmm11
+    paddw       xmm1,xmm12
+    movdqa      xmm11,[rsp+80h]
+    movdqa      xmm0, [rsp+10h]
+    por         xmm14,[rsp+0E0h]
+    paddw       xmm0,xmm11
+    movdqa      xmm4,xmm15
+    paddw       xmm1,xmm0
+    movdqa      xmm0,xmm13
+    paddw       xmm1,xmm7
+    psraw       xmm1,2
+    pandn       xmm3,xmm1
+    por         xmm2,xmm3
+    movdqa      xmm1,xmm13
+    movdqa      xmm3,[rsp+10h]
+    pandn       xmm0,xmm3
+    pand        xmm1,xmm2
+    movdqa      xmm2,xmm11
+    paddw       xmm2,[rsp]
+    por         xmm1,xmm0
+    movdqa      xmm0,[rsp+0D0h]
+    por         xmm0,xmm8
+    paddw       xmm2,xmm3
+    packuswb    xmm5,xmm1
+    movdqa      xmm8,[rsp+40h]
+    movdqa      xmm1,[rsp+50h]
+    movdqa      xmm3,xmm8
+    pand        xmm4,xmm0
+    psllw       xmm2,1
+    movdqa      xmm0,xmm15
+    pandn       xmm0,[rsp+90h]
+    por         xmm4,xmm0
+    movdqa      xmm0,xmm12
+    paddw       xmm0,xmm10
+    paddw       xmm2,xmm0
+    paddw       xmm1,xmm2
+    movdqa      xmm0,[rsp]
+    movdqa      xmm2,xmm11
+    paddw       xmm0,xmm12
+    movdqa      xmm12,[rsp]
+    paddw       xmm2,xmm11
+    paddw       xmm2,xmm0
+    psraw       xmm1,3
+    movdqa      xmm0,xmm8
+    pand        xmm3,xmm1
+    paddw       xmm2,xmm7
+    movdqa      xmm1,xmm13
+    psraw       xmm2,2
+    pandn       xmm0,xmm2
+    por         xmm3,xmm0
+    movdqa      xmm2,[rsp+50h]
+    movdqa      xmm0,xmm13
+    pandn       xmm0,xmm12
+    pand        xmm1,xmm3
+    paddw       xmm2,xmm11
+    movdqa      xmm3,xmm15
+    por         xmm1,xmm0
+    pand        xmm3,xmm14
+    movdqa      xmm14,[rsp+10h]
+    movdqa      xmm0,xmm15
+    pandn       xmm0,[rsp+30h]
+    packuswb    xmm4,xmm1
+    movdqa      xmm1,xmm8
+    por         xmm3,xmm0
+    movdqa      xmm0,xmm12
+    paddw       xmm0,xmm14
+    paddw       xmm2,xmm0
+    paddw       xmm2,xmm7
+    movdqa      xmm0,xmm8
+    pandn       xmm0,xmm11
+    psraw       xmm2,2
+    pand        xmm1,xmm2
+    por         xmm1,xmm0
+    movdqa      xmm2,xmm13
+    movdqa      xmm0,xmm13
+    pandn       xmm0,xmm11
+    pand        xmm2,xmm1
+    movdqa      xmm1,xmm15
+    por         xmm2,xmm0
+    packuswb    xmm3,xmm2
+    movdqa      xmm0,[rsp+100h]
+    por         xmm0,[rsp+120h]
+    pand        xmm1,xmm0
+    movdqa      xmm2,[rcx+rbp]
+    movdqa      xmm7,[rsp+50h]
+    pandn       xmm15,[rsp+60h]
+    lea         r11,[rsp+1D8h]
+    pxor        xmm0,xmm0
+    por         xmm1,xmm15
+    movaps      xmm15,[r11-0A8h]
+    movdqa      [rdi],xmm9
+    movaps      xmm9,[r11-48h]
+    punpckhbw   xmm2,xmm0
+    psllw       xmm2,1
+    paddw       xmm2,xmm7
+    paddw       xmm2,xmm7
+    movdqa      [rbx],xmm6
+    movaps      xmm6,[r11-18h]
+    paddw       xmm2,xmm7
+    paddw       xmm2,xmm11
+    movaps      xmm11,[r11-68h]
+    paddw       xmm2,xmm12
+    movaps      xmm12,[r11-78h]
+    paddw       xmm2,xmm14
+    paddw       xmm2,xmm10
+    psraw       xmm2,3
+    movaps      xmm10,[r11-58h]
+    movaps      xmm14,[r11-98h]
+    movdqa      xmm0,xmm13
+    pand        xmm2,xmm8
+    pandn       xmm8,xmm7
+    pandn       xmm13,xmm7
+    por         xmm2,xmm8
+    movaps      xmm7,[r11-28h]
+    movaps      xmm8,[r11-38h]
+    movdqa      [r8],xmm5
+    pand        xmm0,xmm2
+    por         xmm0,xmm13
+    packuswb    xmm1,xmm0
+    movaps      xmm13,[r11-88h]
+    movdqa      [rbp],xmm4
+    movdqa      [r10+rbp],xmm3
+    movdqa      [rsi+rbp],xmm1
+    mov         rsp,r11
+    pop         rbp
+    pop         rbx
+    ret
 
 WELS_EXTERN  DeblockChromaLt4V_ssse3
-  mov         rax,rsp
-  push        rbx
-  push        rbp
-  mov         r10,  rdx
-  mov         r11,  rcx
-  mov         rcx,  rdi
-  mov         rdx,  rsi
-  mov         rsi,  r10
-  mov         r10,  r9
-  mov         rbp,  r8
-  mov         r8,   rsi
-  mov         r9,   r11
-  sub         rsp,0C8h
-  pxor        xmm1,xmm1
-  mov         rbx,rcx
-  movsxd      r11,r8d
-  movsx       ecx,byte [r10]
-  movsx       r8d,byte [r10+2]
-  mov         rdi,rdx
-  movq        xmm2,[rbx]
-  movq        xmm9,[r11+rbx]
-  movsx       edx,byte [r10+1]
-  mov         word [rsp+2],cx
-  mov         word [rsp],cx
-  movsx       eax,byte [r10+3]
-  mov         word [rsp+6],dx
-  mov         word [rsp+4],dx
-  movdqa      xmm11,xmm1
-  mov         word [rsp+0Eh],ax
-  mov         word [rsp+0Ch],ax
-  lea         eax,[r11+r11]
-  movsxd      rcx,eax
-  mov         rax,rbx
-  mov         rdx,rdi
-  sub         rax,rcx
-  mov         word [rsp+0Ah],r8w
-  mov         word [rsp+8],r8w
-  movdqa      xmm6,[rsp]
-  movdqa      xmm7,xmm6
-  movq        xmm13, [rax]
-  mov         rax,rdi
-  sub         rax,rcx
-  mov         rcx,rbx
-  pcmpgtw     xmm7,xmm1
-  psubw       xmm11,xmm6
-  sub         rcx,r11
-  sub         rdx,r11
-  movq        xmm0,[rax]
-  movsx       eax,r9w
-  movq        xmm15,[rcx]
-  punpcklqdq  xmm13,xmm0
-  movq        xmm0, [rdx]
-  movdqa      xmm4,xmm13
-  punpcklqdq  xmm15,xmm0
-  movq        xmm0, [rdi]
-  punpcklbw   xmm4,xmm1
-  movdqa      xmm12,xmm15
-  punpcklqdq  xmm2,xmm0
-  movq        xmm0, [r11+rdi]
-  punpcklbw   xmm12,xmm1
-  movdqa      xmm14,xmm2
-  punpcklqdq  xmm9,xmm0
-  punpckhbw   xmm2,xmm1
-  punpcklbw   xmm14,xmm1
-  movd        xmm0,eax
-  mov         eax, ebp ; iBeta
-  punpckhbw   xmm13,xmm1
-  punpckhbw   xmm15,xmm1
-  movdqa      xmm3,xmm9
-  movdqa      [rsp+10h],xmm2
-  punpcklwd   xmm0,xmm0
-  punpckhbw   xmm9,xmm1
-  punpcklbw   xmm3,xmm1
-  movdqa      xmm1,xmm14
-  pshufd      xmm10,xmm0,0
-  movd        xmm0,eax
-  mov         eax,4
-  cwde
-  punpcklwd   xmm0,xmm0
-  pshufd      xmm8,xmm0,0
-  movd        xmm0,eax
-  punpcklwd   xmm0,xmm0
-  pshufd      xmm5,xmm0,0
-  psubw       xmm1,xmm12
-  movdqa      xmm2,xmm10
-  lea         r11,[rsp+0C8h]
-  psllw       xmm1,2
-  movdqa      xmm0,xmm4
-  psubw       xmm4,xmm12
-  psubw       xmm0,xmm3
-  psubw       xmm3,xmm14
-  paddw       xmm1,xmm0
-  paddw       xmm1,xmm5
-  movdqa      xmm0,xmm11
-  psraw       xmm1,3
-  pmaxsw      xmm0,xmm1
-  pminsw      xmm6,xmm0
-  movdqa      xmm1,xmm8
-  movdqa      xmm0,xmm12
-  psubw       xmm0,xmm14
-  pabsw       xmm0,xmm0
-  pcmpgtw     xmm2,xmm0
-  pabsw       xmm0,xmm4
-  pcmpgtw     xmm1,xmm0
-  pabsw       xmm0,xmm3
-  movdqa      xmm3,[rsp]
-  pand        xmm2,xmm1
-  movdqa      xmm1,xmm8
-  pcmpgtw     xmm1,xmm0
-  movdqa      xmm0,xmm13
-  pand        xmm2,xmm1
-  psubw       xmm0,xmm9
-  psubw       xmm13,xmm15
-  pand        xmm2,xmm7
-  pand        xmm6,xmm2
-  paddw       xmm12,xmm6
-  psubw       xmm14,xmm6
-  movdqa      xmm2,[rsp+10h]
-  movaps      xmm6,[r11-18h]
-  movdqa      xmm1,xmm2
-  psubw       xmm1,xmm15
-  psubw       xmm9,xmm2
-  psllw       xmm1,2
-  paddw       xmm1,xmm0
-  paddw       xmm1,xmm5
-  movdqa      xmm0,xmm15
-  psubw       xmm0,xmm2
-  psraw       xmm1,3
-  pmaxsw      xmm11,xmm1
-  pabsw       xmm0,xmm0
-  movdqa      xmm1,xmm8
-  pcmpgtw     xmm10,xmm0
-  pabsw       xmm0,xmm13
-  pminsw      xmm3,xmm11
-  movaps      xmm11,[r11-68h]
-  movaps      xmm13,[rsp+40h]
-  pcmpgtw     xmm1,xmm0
-  pabsw       xmm0,xmm9
-  movaps      xmm9, [r11-48h]
-  pand        xmm10,xmm1
-  pcmpgtw     xmm8,xmm0
-  pand        xmm10,xmm8
-  pand        xmm10,xmm7
-  movaps      xmm8,[r11-38h]
-  movaps      xmm7,[r11-28h]
-  pand        xmm3,xmm10
-  paddw       xmm15,xmm3
-  psubw       xmm2,xmm3
-  movaps      xmm10,[r11-58h]
-  packuswb    xmm12,xmm15
-  movaps      xmm15,[rsp+20h]
-  packuswb    xmm14,xmm2
-  movq        [rcx],xmm12
-  movq        [rbx],xmm14
-  psrldq      xmm12,8
-  psrldq      xmm14,8
-  movq        [rdx],xmm12
-  movaps      xmm12,[r11-78h]
-  movq        [rdi],xmm14
-  movaps      xmm14,[rsp+30h]
-  mov         rsp,r11
-  pop         rbp
-  pop         rbx
-  ret
+    mov         rax,rsp
+    push        rbx
+    push        rbp
+    mov         r10,  rdx
+    mov         r11,  rcx
+    mov         rcx,  rdi
+    mov         rdx,  rsi
+    mov         rsi,  r10
+    mov         r10,  r9
+    mov         rbp,  r8
+    mov         r8,   rsi
+    mov         r9,   r11
+    sub         rsp,0C8h
+    pxor        xmm1,xmm1
+    mov         rbx,rcx
+    movsxd      r11,r8d
+    movsx       ecx,byte [r10]
+    movsx       r8d,byte [r10+2]
+    mov         rdi,rdx
+    movq        xmm2,[rbx]
+    movq        xmm9,[r11+rbx]
+    movsx       edx,byte [r10+1]
+    mov         word [rsp+2],cx
+    mov         word [rsp],cx
+    movsx       eax,byte [r10+3]
+    mov         word [rsp+6],dx
+    mov         word [rsp+4],dx
+    movdqa      xmm11,xmm1
+    mov         word [rsp+0Eh],ax
+    mov         word [rsp+0Ch],ax
+    lea         eax,[r11+r11]
+    movsxd      rcx,eax
+    mov         rax,rbx
+    mov         rdx,rdi
+    sub         rax,rcx
+    mov         word [rsp+0Ah],r8w
+    mov         word [rsp+8],r8w
+    movdqa      xmm6,[rsp]
+    movdqa      xmm7,xmm6
+    movq        xmm13, [rax]
+    mov         rax,rdi
+    sub         rax,rcx
+    mov         rcx,rbx
+    pcmpgtw     xmm7,xmm1
+    psubw       xmm11,xmm6
+    sub         rcx,r11
+    sub         rdx,r11
+    movq        xmm0,[rax]
+    movsx       eax,r9w
+    movq        xmm15,[rcx]
+    punpcklqdq  xmm13,xmm0
+    movq        xmm0, [rdx]
+    movdqa      xmm4,xmm13
+    punpcklqdq  xmm15,xmm0
+    movq        xmm0, [rdi]
+    punpcklbw   xmm4,xmm1
+    movdqa      xmm12,xmm15
+    punpcklqdq  xmm2,xmm0
+    movq        xmm0, [r11+rdi]
+    punpcklbw   xmm12,xmm1
+    movdqa      xmm14,xmm2
+    punpcklqdq  xmm9,xmm0
+    punpckhbw   xmm2,xmm1
+    punpcklbw   xmm14,xmm1
+    movd        xmm0,eax
+    mov         eax, ebp ; iBeta
+    punpckhbw   xmm13,xmm1
+    punpckhbw   xmm15,xmm1
+    movdqa      xmm3,xmm9
+    movdqa      [rsp+10h],xmm2
+    punpcklwd   xmm0,xmm0
+    punpckhbw   xmm9,xmm1
+    punpcklbw   xmm3,xmm1
+    movdqa      xmm1,xmm14
+    pshufd      xmm10,xmm0,0
+    movd        xmm0,eax
+    mov         eax,4
+    cwde
+    punpcklwd   xmm0,xmm0
+    pshufd      xmm8,xmm0,0
+    movd        xmm0,eax
+    punpcklwd   xmm0,xmm0
+    pshufd      xmm5,xmm0,0
+    psubw       xmm1,xmm12
+    movdqa      xmm2,xmm10
+    lea         r11,[rsp+0C8h]
+    psllw       xmm1,2
+    movdqa      xmm0,xmm4
+    psubw       xmm4,xmm12
+    psubw       xmm0,xmm3
+    psubw       xmm3,xmm14
+    paddw       xmm1,xmm0
+    paddw       xmm1,xmm5
+    movdqa      xmm0,xmm11
+    psraw       xmm1,3
+    pmaxsw      xmm0,xmm1
+    pminsw      xmm6,xmm0
+    movdqa      xmm1,xmm8
+    movdqa      xmm0,xmm12
+    psubw       xmm0,xmm14
+    pabsw       xmm0,xmm0
+    pcmpgtw     xmm2,xmm0
+    pabsw       xmm0,xmm4
+    pcmpgtw     xmm1,xmm0
+    pabsw       xmm0,xmm3
+    movdqa      xmm3,[rsp]
+    pand        xmm2,xmm1
+    movdqa      xmm1,xmm8
+    pcmpgtw     xmm1,xmm0
+    movdqa      xmm0,xmm13
+    pand        xmm2,xmm1
+    psubw       xmm0,xmm9
+    psubw       xmm13,xmm15
+    pand        xmm2,xmm7
+    pand        xmm6,xmm2
+    paddw       xmm12,xmm6
+    psubw       xmm14,xmm6
+    movdqa      xmm2,[rsp+10h]
+    movaps      xmm6,[r11-18h]
+    movdqa      xmm1,xmm2
+    psubw       xmm1,xmm15
+    psubw       xmm9,xmm2
+    psllw       xmm1,2
+    paddw       xmm1,xmm0
+    paddw       xmm1,xmm5
+    movdqa      xmm0,xmm15
+    psubw       xmm0,xmm2
+    psraw       xmm1,3
+    pmaxsw      xmm11,xmm1
+    pabsw       xmm0,xmm0
+    movdqa      xmm1,xmm8
+    pcmpgtw     xmm10,xmm0
+    pabsw       xmm0,xmm13
+    pminsw      xmm3,xmm11
+    movaps      xmm11,[r11-68h]
+    movaps      xmm13,[rsp+40h]
+    pcmpgtw     xmm1,xmm0
+    pabsw       xmm0,xmm9
+    movaps      xmm9, [r11-48h]
+    pand        xmm10,xmm1
+    pcmpgtw     xmm8,xmm0
+    pand        xmm10,xmm8
+    pand        xmm10,xmm7
+    movaps      xmm8,[r11-38h]
+    movaps      xmm7,[r11-28h]
+    pand        xmm3,xmm10
+    paddw       xmm15,xmm3
+    psubw       xmm2,xmm3
+    movaps      xmm10,[r11-58h]
+    packuswb    xmm12,xmm15
+    movaps      xmm15,[rsp+20h]
+    packuswb    xmm14,xmm2
+    movq        [rcx],xmm12
+    movq        [rbx],xmm14
+    psrldq      xmm12,8
+    psrldq      xmm14,8
+    movq        [rdx],xmm12
+    movaps      xmm12,[r11-78h]
+    movq        [rdi],xmm14
+    movaps      xmm14,[rsp+30h]
+    mov         rsp,r11
+    pop         rbp
+    pop         rbx
+    ret
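
For chroma the strong filter has no inner condition: once the basic alpha/beta edge test passes, DeblockChromaEq4V_ssse3 below replaces p0 and q0 with fixed weighted averages (the paddw runs followed by psraw 2), again selecting filtered versus original bytes with pand/pandn/por. A scalar equivalent of one sample, for orientation only, with hypothetical names:

#include <stdint.h>

/* Strong (bS == 4) chroma filtering of one p0/q0 pair.
 * Precondition: |p0-q0| < alpha, |p1-p0| < beta, |q1-q0| < beta. */
static void chroma_strong_sample(uint8_t *p0, uint8_t *q0, int p1, int q1) {
    int np0 = (2 * p1 + *p0 + q1 + 2) >> 2;
    int nq0 = (2 * q1 + *q0 + p1 + 2) >> 2;
    *p0 = (uint8_t)np0;
    *q0 = (uint8_t)nq0;
}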
 
 WELS_EXTERN DeblockChromaEq4V_ssse3
-  mov         rax,rsp
-  push        rbx
-  push        rbp
+    mov         rax,rsp
+    push        rbx
+    push        rbp
 
-  mov         rbp, r8
-  mov         r8, rdx
-  mov         r9, rcx
-  mov         rcx, rdi
-  mov         rdx, rsi
+    mov         rbp, r8
+    mov         r8, rdx
+    mov         r9, rcx
+    mov         rcx, rdi
+    mov         rdx, rsi
 
-  sub         rsp,90h
-  pxor        xmm1,xmm1
-  mov         r11,rcx
-  mov         rbx,rdx
-  mov         r10d,r9d
-  movq        xmm13,[r11]
-  lea         eax,[r8+r8]
-  movsxd      r9,eax
-  mov         rax,rcx
-  sub         rax,r9
-  movq        xmm14,[rax]
-  mov         rax,rdx
-  sub         rax,r9
-  movq        xmm0,[rax]
-  movsxd      rax,r8d
-  sub         rcx,rax
-  sub         rdx,rax
-  movq        xmm12,[rax+r11]
-  movq        xmm10,[rcx]
-  punpcklqdq  xmm14,xmm0
-  movdqa      xmm8,xmm14
-  movq        xmm0,[rdx]
-  punpcklbw   xmm8,xmm1
-  punpckhbw   xmm14,xmm1
-  punpcklqdq  xmm10,xmm0
-  movq        xmm0,[rbx]
-  movdqa      xmm5,xmm10
-  punpcklqdq  xmm13,xmm0
-  movq        xmm0, [rax+rbx]
-  punpcklbw   xmm5,xmm1
-  movsx       eax,r10w
-  movdqa      xmm9,xmm13
-  punpcklqdq  xmm12,xmm0
-  punpcklbw   xmm9,xmm1
-  punpckhbw   xmm10,xmm1
-  movd        xmm0,eax
-  mov         eax, ebp   ; iBeta
-  punpckhbw   xmm13,xmm1
-  movdqa      xmm7,xmm12
-  punpcklwd   xmm0,xmm0
-  punpckhbw   xmm12,xmm1
-  pshufd      xmm11,xmm0,0
-  punpcklbw   xmm7,xmm1
-  movd        xmm0,eax
-  movdqa      xmm1,xmm8
-  psubw       xmm1,xmm5
-  punpcklwd   xmm0,xmm0
-  movdqa      xmm6,xmm11
-  pshufd      xmm3,xmm0,0
-  movdqa      xmm0,xmm5
-  psubw       xmm0,xmm9
-  movdqa      xmm2,xmm3
-  pabsw       xmm0,xmm0
-  pcmpgtw     xmm6,xmm0
-  pabsw       xmm0,xmm1
-  movdqa      xmm1,xmm3
-  pcmpgtw     xmm2,xmm0
-  pand        xmm6,xmm2
-  movdqa      xmm0,xmm7
-  movdqa      xmm2,xmm3
-  psubw       xmm0,xmm9
-  pabsw       xmm0,xmm0
-  pcmpgtw     xmm1,xmm0
-  pand        xmm6,xmm1
-  movdqa      xmm0,xmm10
-  movdqa      xmm1,xmm14
-  psubw       xmm0,xmm13
-  psubw       xmm1,xmm10
-  pabsw       xmm0,xmm0
-  pcmpgtw     xmm11,xmm0
-  pabsw       xmm0,xmm1
-  pcmpgtw     xmm2,xmm0
-  pand        xmm11,xmm2
-  movdqa      xmm0,xmm12
-  movdqa      xmm4,xmm6
-  movdqa      xmm1,xmm8
-  mov         eax,2
-  cwde
-  paddw       xmm1,xmm8
-  psubw       xmm0,xmm13
-  paddw       xmm1,xmm5
-  pabsw       xmm0,xmm0
-  movdqa      xmm2,xmm14
-  paddw       xmm1,xmm7
-  pcmpgtw     xmm3,xmm0
-  paddw       xmm2,xmm14
-  movd        xmm0,eax
-  pand        xmm11,xmm3
-  paddw       xmm7,xmm7
-  paddw       xmm2,xmm10
-  punpcklwd   xmm0,xmm0
-  paddw       xmm2,xmm12
-  paddw       xmm12,xmm12
-  pshufd      xmm3,xmm0,0
-  paddw       xmm7,xmm9
-  paddw       xmm12,xmm13
-  movdqa      xmm0,xmm6
-  paddw       xmm1,xmm3
-  pandn       xmm0,xmm5
-  paddw       xmm7,xmm8
-  psraw       xmm1,2
-  paddw       xmm12,xmm14
-  paddw       xmm7,xmm3
-  ;movaps      xmm14,[rsp]
-  pand        xmm4,xmm1
-  paddw       xmm12,xmm3
-  psraw       xmm7,2
-  movdqa      xmm1,xmm11
-  por         xmm4,xmm0
-  psraw       xmm12,2
-  paddw       xmm2,xmm3
-  movdqa      xmm0,xmm11
-  pandn       xmm0,xmm10
-  psraw       xmm2,2
-  pand        xmm1,xmm2
-  por         xmm1,xmm0
-  packuswb    xmm4,xmm1
-  movdqa      xmm0,xmm11
-  movdqa      xmm1,xmm6
-  pand        xmm1,xmm7
-  movq        [rcx],xmm4
-  pandn       xmm6,xmm9
-  pandn       xmm11,xmm13
-  pand        xmm0,xmm12
-  por         xmm1,xmm6
-  por         xmm0,xmm11
-  psrldq      xmm4,8
-  packuswb    xmm1,xmm0
-  movq        [r11],xmm1
-  psrldq      xmm1,8
-  movq        [rdx],xmm4
-  lea         r11,[rsp+90h]
-  movq        [rbx],xmm1
-  mov         rsp,r11
-  pop         rbp
-  pop         rbx
-  ret
+    sub         rsp,90h
+    pxor        xmm1,xmm1
+    mov         r11,rcx
+    mov         rbx,rdx
+    mov         r10d,r9d
+    movq        xmm13,[r11]
+    lea         eax,[r8+r8]
+    movsxd      r9,eax
+    mov         rax,rcx
+    sub         rax,r9
+    movq        xmm14,[rax]
+    mov         rax,rdx
+    sub         rax,r9
+    movq        xmm0,[rax]
+    movsxd      rax,r8d
+    sub         rcx,rax
+    sub         rdx,rax
+    movq        xmm12,[rax+r11]
+    movq        xmm10,[rcx]
+    punpcklqdq  xmm14,xmm0
+    movdqa      xmm8,xmm14
+    movq        xmm0,[rdx]
+    punpcklbw   xmm8,xmm1
+    punpckhbw   xmm14,xmm1
+    punpcklqdq  xmm10,xmm0
+    movq        xmm0,[rbx]
+    movdqa      xmm5,xmm10
+    punpcklqdq  xmm13,xmm0
+    movq        xmm0, [rax+rbx]
+    punpcklbw   xmm5,xmm1
+    movsx       eax,r10w
+    movdqa      xmm9,xmm13
+    punpcklqdq  xmm12,xmm0
+    punpcklbw   xmm9,xmm1
+    punpckhbw   xmm10,xmm1
+    movd        xmm0,eax
+    mov         eax, ebp   ; iBeta
+    punpckhbw   xmm13,xmm1
+    movdqa      xmm7,xmm12
+    punpcklwd   xmm0,xmm0
+    punpckhbw   xmm12,xmm1
+    pshufd      xmm11,xmm0,0
+    punpcklbw   xmm7,xmm1
+    movd        xmm0,eax
+    movdqa      xmm1,xmm8
+    psubw       xmm1,xmm5
+    punpcklwd   xmm0,xmm0
+    movdqa      xmm6,xmm11
+    pshufd      xmm3,xmm0,0
+    movdqa      xmm0,xmm5
+    psubw       xmm0,xmm9
+    movdqa      xmm2,xmm3
+    pabsw       xmm0,xmm0
+    pcmpgtw     xmm6,xmm0
+    pabsw       xmm0,xmm1
+    movdqa      xmm1,xmm3
+    pcmpgtw     xmm2,xmm0
+    pand        xmm6,xmm2
+    movdqa      xmm0,xmm7
+    movdqa      xmm2,xmm3
+    psubw       xmm0,xmm9
+    pabsw       xmm0,xmm0
+    pcmpgtw     xmm1,xmm0
+    pand        xmm6,xmm1
+    movdqa      xmm0,xmm10
+    movdqa      xmm1,xmm14
+    psubw       xmm0,xmm13
+    psubw       xmm1,xmm10
+    pabsw       xmm0,xmm0
+    pcmpgtw     xmm11,xmm0
+    pabsw       xmm0,xmm1
+    pcmpgtw     xmm2,xmm0
+    pand        xmm11,xmm2
+    movdqa      xmm0,xmm12
+    movdqa      xmm4,xmm6
+    movdqa      xmm1,xmm8
+    mov         eax,2
+    cwde
+    paddw       xmm1,xmm8
+    psubw       xmm0,xmm13
+    paddw       xmm1,xmm5
+    pabsw       xmm0,xmm0
+    movdqa      xmm2,xmm14
+    paddw       xmm1,xmm7
+    pcmpgtw     xmm3,xmm0
+    paddw       xmm2,xmm14
+    movd        xmm0,eax
+    pand        xmm11,xmm3
+    paddw       xmm7,xmm7
+    paddw       xmm2,xmm10
+    punpcklwd   xmm0,xmm0
+    paddw       xmm2,xmm12
+    paddw       xmm12,xmm12
+    pshufd      xmm3,xmm0,0
+    paddw       xmm7,xmm9
+    paddw       xmm12,xmm13
+    movdqa      xmm0,xmm6
+    paddw       xmm1,xmm3
+    pandn       xmm0,xmm5
+    paddw       xmm7,xmm8
+    psraw       xmm1,2
+    paddw       xmm12,xmm14
+    paddw       xmm7,xmm3
+    ;movaps      xmm14,[rsp]
+    pand        xmm4,xmm1
+    paddw       xmm12,xmm3
+    psraw       xmm7,2
+    movdqa      xmm1,xmm11
+    por         xmm4,xmm0
+    psraw       xmm12,2
+    paddw       xmm2,xmm3
+    movdqa      xmm0,xmm11
+    pandn       xmm0,xmm10
+    psraw       xmm2,2
+    pand        xmm1,xmm2
+    por         xmm1,xmm0
+    packuswb    xmm4,xmm1
+    movdqa      xmm0,xmm11
+    movdqa      xmm1,xmm6
+    pand        xmm1,xmm7
+    movq        [rcx],xmm4
+    pandn       xmm6,xmm9
+    pandn       xmm11,xmm13
+    pand        xmm0,xmm12
+    por         xmm1,xmm6
+    por         xmm0,xmm11
+    psrldq      xmm4,8
+    packuswb    xmm1,xmm0
+    movq        [r11],xmm1
+    psrldq      xmm1,8
+    movq        [rdx],xmm4
+    lea         r11,[rsp+90h]
+    movq        [rbx],xmm1
+    mov         rsp,r11
+    pop         rbp
+    pop         rbx
+    ret
 
 WELS_EXTERN DeblockChromaEq4H_ssse3
-  mov         rax,rsp
-  push        rbx
-  push        rbp
-  push        r12
+    mov         rax,rsp
+    push        rbx
+    push        rbp
+    push        r12
 
-  mov         rbp,   r8
-  mov         r8,    rdx
-  mov         r9,    rcx
-  mov         rcx,   rdi
-  mov         rdx,   rsi
-  mov         rdi,   rdx
+    mov         rbp,   r8
+    mov         r8,    rdx
+    mov         r9,    rcx
+    mov         rcx,   rdi
+    mov         rdx,   rsi
+    mov         rdi,   rdx
 
-  sub         rsp,140h
-  lea         eax,[r8*4]
-  movsxd      r10,eax
-  mov         eax,[rcx-2]
-  mov         [rsp+10h],eax
-  lea         rbx,[r10+rdx-2]
-  lea         r11,[r10+rcx-2]
+    sub         rsp,140h
+    lea         eax,[r8*4]
+    movsxd      r10,eax
+    mov         eax,[rcx-2]
+    mov         [rsp+10h],eax
+    lea         rbx,[r10+rdx-2]
+    lea         r11,[r10+rcx-2]
 
-  movdqa      xmm5,[rsp+10h]
-  movsxd      r10,r8d
-  mov         eax,[r10+rcx-2]
-  lea         rdx,[r10+r10*2]
-  mov         [rsp+20h],eax
-  mov         eax,[rcx+r10*2-2]
-  mov         [rsp+30h],eax
-  mov         eax,[rdx+rcx-2]
-  movdqa      xmm2,[rsp+20h]
-  mov         [rsp+40h],eax
-  mov         eax, [rdi-2]
-  movdqa      xmm4,[rsp+30h]
-  mov         [rsp+50h],eax
-  mov         eax,[r10+rdi-2]
-  movdqa      xmm3,[rsp+40h]
-  mov         [rsp+60h],eax
-  mov         eax,[rdi+r10*2-2]
-  punpckldq   xmm5,[rsp+50h]
-  mov         [rsp+70h],eax
-  mov         eax, [rdx+rdi-2]
-  punpckldq   xmm2, [rsp+60h]
-  mov          [rsp+80h],eax
-  mov         eax,[r11]
-  punpckldq   xmm4, [rsp+70h]
-  mov         [rsp+50h],eax
-  mov         eax,[rbx]
-  punpckldq   xmm3,[rsp+80h]
-  mov         [rsp+60h],eax
-  mov         eax,[r10+r11]
-  movdqa      xmm0, [rsp+50h]
-  punpckldq   xmm0, [rsp+60h]
-  punpcklqdq  xmm5,xmm0
-  movdqa      [rsp+50h],xmm0
-  mov         [rsp+50h],eax
-  mov         eax,[r10+rbx]
-  movdqa      xmm0,[rsp+50h]
-  movdqa      xmm1,xmm5
-  mov         [rsp+60h],eax
-  mov         eax,[r11+r10*2]
-  punpckldq   xmm0, [rsp+60h]
-  punpcklqdq  xmm2,xmm0
-  punpcklbw   xmm1,xmm2
-  punpckhbw   xmm5,xmm2
-  movdqa      [rsp+50h],xmm0
-  mov         [rsp+50h],eax
-  mov         eax,[rbx+r10*2]
-  movdqa      xmm0,[rsp+50h]
-  mov         [rsp+60h],eax
-  mov         eax, [rdx+r11]
-  movdqa      xmm15,xmm1
-  punpckldq   xmm0,[rsp+60h]
-  punpcklqdq  xmm4,xmm0
-  movdqa      [rsp+50h],xmm0
-  mov         [rsp+50h],eax
-  mov         eax, [rdx+rbx]
-  movdqa      xmm0,[rsp+50h]
-  mov         [rsp+60h],eax
-  punpckldq   xmm0, [rsp+60h]
-  punpcklqdq  xmm3,xmm0
-  movdqa      xmm0,xmm4
-  punpcklbw   xmm0,xmm3
-  punpckhbw   xmm4,xmm3
-  punpcklwd   xmm15,xmm0
-  punpckhwd   xmm1,xmm0
-  movdqa      xmm0,xmm5
-  movdqa      xmm12,xmm15
-  punpcklwd   xmm0,xmm4
-  punpckhwd   xmm5,xmm4
-  punpckldq   xmm12,xmm0
-  punpckhdq   xmm15,xmm0
-  movdqa      xmm0,xmm1
-  movdqa      xmm11,xmm12
-  punpckldq   xmm0,xmm5
-  punpckhdq   xmm1,xmm5
-  punpcklqdq  xmm11,xmm0
-  punpckhqdq  xmm12,xmm0
-  movsx       eax,r9w
-  movdqa      xmm14,xmm15
-  punpcklqdq  xmm14,xmm1
-  punpckhqdq  xmm15,xmm1
-  pxor        xmm1,xmm1
-  movd        xmm0,eax
-  movdqa      xmm4,xmm12
-  movdqa      xmm8,xmm11
-  mov         eax, ebp ; iBeta
-  punpcklwd   xmm0,xmm0
-  punpcklbw   xmm4,xmm1
-  punpckhbw   xmm12,xmm1
-  movdqa      xmm9,xmm14
-  movdqa      xmm7,xmm15
-  movdqa      xmm10,xmm15
-  pshufd      xmm13,xmm0,0
-  punpcklbw   xmm9,xmm1
-  punpckhbw   xmm14,xmm1
-  movdqa      xmm6,xmm13
-  movd        xmm0,eax
-  movdqa      [rsp],xmm11
-  mov         eax,2
-  cwde
-  punpckhbw   xmm11,xmm1
-  punpckhbw   xmm10,xmm1
-  punpcklbw   xmm7,xmm1
-  punpcklwd   xmm0,xmm0
-  punpcklbw   xmm8,xmm1
-  pshufd      xmm3,xmm0,0
-  movdqa      xmm1,xmm8
-  movdqa      xmm0,xmm4
-  psubw       xmm0,xmm9
-  psubw       xmm1,xmm4
-  movdqa      xmm2,xmm3
-  pabsw       xmm0,xmm0
-  pcmpgtw     xmm6,xmm0
-  pabsw       xmm0,xmm1
-  movdqa      xmm1,xmm3
-  pcmpgtw     xmm2,xmm0
-  pand        xmm6,xmm2
-  movdqa      xmm0,xmm7
-  movdqa      xmm2,xmm3
-  psubw       xmm0,xmm9
-  pabsw       xmm0,xmm0
-  pcmpgtw     xmm1,xmm0
-  pand        xmm6,xmm1
-  movdqa      xmm0,xmm12
-  movdqa      xmm1,xmm11
-  psubw       xmm0,xmm14
-  psubw       xmm1,xmm12
-  movdqa      xmm5,xmm6
-  pabsw       xmm0,xmm0
-  pcmpgtw     xmm13,xmm0
-  pabsw       xmm0,xmm1
-  movdqa      xmm1,xmm8
-  pcmpgtw     xmm2,xmm0
-  paddw       xmm1,xmm8
-  movdqa      xmm0,xmm10
-  pand        xmm13,xmm2
-  psubw       xmm0,xmm14
-  paddw       xmm1,xmm4
-  movdqa      xmm2,xmm11
-  pabsw       xmm0,xmm0
-  paddw       xmm2,xmm11
-  paddw       xmm1,xmm7
-  pcmpgtw     xmm3,xmm0
-  paddw       xmm2,xmm12
-  movd        xmm0,eax
-  pand        xmm13,xmm3
-  paddw       xmm2,xmm10
-  punpcklwd   xmm0,xmm0
-  pshufd      xmm3,xmm0,0
-  movdqa      xmm0,xmm6
-  paddw       xmm1,xmm3
-  pandn       xmm0,xmm4
-  paddw       xmm2,xmm3
-  psraw       xmm1,2
-  pand        xmm5,xmm1
-  por         xmm5,xmm0
-  paddw       xmm7,xmm7
-  paddw       xmm10,xmm10
-  psraw       xmm2,2
-  movdqa      xmm1,xmm13
-  movdqa      xmm0,xmm13
-  pandn       xmm0,xmm12
-  pand        xmm1,xmm2
-  paddw       xmm7,xmm9
-  por         xmm1,xmm0
-  paddw       xmm10,xmm14
-  paddw       xmm7,xmm8
-  movdqa      xmm0,xmm13
-  packuswb    xmm5,xmm1
-  paddw       xmm7,xmm3
-  paddw       xmm10,xmm11
-  movdqa      xmm1,xmm6
-  paddw       xmm10,xmm3
-  pandn       xmm6,xmm9
-  psraw       xmm7,2
-  pand        xmm1,xmm7
-  psraw       xmm10,2
-  pandn       xmm13,xmm14
-  pand        xmm0,xmm10
-  por         xmm1,xmm6
-  movdqa      xmm6,[rsp]
-  movdqa      xmm4,xmm6
-  por         xmm0,xmm13
-  punpcklbw   xmm4,xmm5
-  punpckhbw   xmm6,xmm5
-  movdqa      xmm3,xmm4
-  packuswb    xmm1,xmm0
-  movdqa      xmm0,xmm1
-  punpckhbw   xmm1,xmm15
-  punpcklbw   xmm0,xmm15
-  punpcklwd   xmm3,xmm0
-  punpckhwd   xmm4,xmm0
-  movdqa      xmm0,xmm6
-  movdqa      xmm2,xmm3
-  punpcklwd   xmm0,xmm1
-  punpckhwd   xmm6,xmm1
-  movdqa      xmm1,xmm4
-  punpckldq   xmm2,xmm0
-  punpckhdq   xmm3,xmm0
-  punpckldq   xmm1,xmm6
-  movdqa      xmm0,xmm2
-  punpcklqdq  xmm0,xmm1
-  punpckhdq   xmm4,xmm6
-  punpckhqdq  xmm2,xmm1
-  movdqa      [rsp+10h],xmm0
-  movdqa      [rsp+60h],xmm2
-  movdqa      xmm0,xmm3
-  mov         eax,[rsp+10h]
-  mov         [rcx-2],eax
-  mov         eax,[rsp+60h]
-  punpcklqdq  xmm0,xmm4
-  punpckhqdq  xmm3,xmm4
-  mov         [r10+rcx-2],eax
-  movdqa      [rsp+20h],xmm0
-  mov         eax, [rsp+20h]
-  movdqa      [rsp+70h],xmm3
-  mov         [rcx+r10*2-2],eax
-  mov         eax,[rsp+70h]
-  mov         [rdx+rcx-2],eax
-  mov         eax,[rsp+18h]
-  mov         [r11],eax
-  mov         eax,[rsp+68h]
-  mov         [r10+r11],eax
-  mov         eax,[rsp+28h]
-  mov         [r11+r10*2],eax
-  mov         eax,[rsp+78h]
-  mov         [rdx+r11],eax
-  mov         eax,[rsp+14h]
-  mov         [rdi-2],eax
-  mov         eax,[rsp+64h]
-  mov         [r10+rdi-2],eax
-  mov         eax,[rsp+24h]
-  mov         [rdi+r10*2-2],eax
-  mov         eax, [rsp+74h]
-  mov         [rdx+rdi-2],eax
-  mov         eax, [rsp+1Ch]
-  mov         [rbx],eax
-  mov         eax, [rsp+6Ch]
-  mov         [r10+rbx],eax
-  mov         eax,[rsp+2Ch]
-  mov         [rbx+r10*2],eax
-  mov         eax,[rsp+7Ch]
-  mov         [rdx+rbx],eax
-  lea         r11,[rsp+140h]
-  mov         rbx, [r11+28h]
-  mov         rsp,r11
-  pop         r12
-  pop         rbp
-  pop         rbx
-  ret
+    movdqa      xmm5,[rsp+10h]
+    movsxd      r10,r8d
+    mov         eax,[r10+rcx-2]
+    lea         rdx,[r10+r10*2]
+    mov         [rsp+20h],eax
+    mov         eax,[rcx+r10*2-2]
+    mov         [rsp+30h],eax
+    mov         eax,[rdx+rcx-2]
+    movdqa      xmm2,[rsp+20h]
+    mov         [rsp+40h],eax
+    mov         eax, [rdi-2]
+    movdqa      xmm4,[rsp+30h]
+    mov         [rsp+50h],eax
+    mov         eax,[r10+rdi-2]
+    movdqa      xmm3,[rsp+40h]
+    mov         [rsp+60h],eax
+    mov         eax,[rdi+r10*2-2]
+    punpckldq   xmm5,[rsp+50h]
+    mov         [rsp+70h],eax
+    mov         eax, [rdx+rdi-2]
+    punpckldq   xmm2, [rsp+60h]
+    mov          [rsp+80h],eax
+    mov         eax,[r11]
+    punpckldq   xmm4, [rsp+70h]
+    mov         [rsp+50h],eax
+    mov         eax,[rbx]
+    punpckldq   xmm3,[rsp+80h]
+    mov         [rsp+60h],eax
+    mov         eax,[r10+r11]
+    movdqa      xmm0, [rsp+50h]
+    punpckldq   xmm0, [rsp+60h]
+    punpcklqdq  xmm5,xmm0
+    movdqa      [rsp+50h],xmm0
+    mov         [rsp+50h],eax
+    mov         eax,[r10+rbx]
+    movdqa      xmm0,[rsp+50h]
+    movdqa      xmm1,xmm5
+    mov         [rsp+60h],eax
+    mov         eax,[r11+r10*2]
+    punpckldq   xmm0, [rsp+60h]
+    punpcklqdq  xmm2,xmm0
+    punpcklbw   xmm1,xmm2
+    punpckhbw   xmm5,xmm2
+    movdqa      [rsp+50h],xmm0
+    mov         [rsp+50h],eax
+    mov         eax,[rbx+r10*2]
+    movdqa      xmm0,[rsp+50h]
+    mov         [rsp+60h],eax
+    mov         eax, [rdx+r11]
+    movdqa      xmm15,xmm1
+    punpckldq   xmm0,[rsp+60h]
+    punpcklqdq  xmm4,xmm0
+    movdqa      [rsp+50h],xmm0
+    mov         [rsp+50h],eax
+    mov         eax, [rdx+rbx]
+    movdqa      xmm0,[rsp+50h]
+    mov         [rsp+60h],eax
+    punpckldq   xmm0, [rsp+60h]
+    punpcklqdq  xmm3,xmm0
+    movdqa      xmm0,xmm4
+    punpcklbw   xmm0,xmm3
+    punpckhbw   xmm4,xmm3
+    punpcklwd   xmm15,xmm0
+    punpckhwd   xmm1,xmm0
+    movdqa      xmm0,xmm5
+    movdqa      xmm12,xmm15
+    punpcklwd   xmm0,xmm4
+    punpckhwd   xmm5,xmm4
+    punpckldq   xmm12,xmm0
+    punpckhdq   xmm15,xmm0
+    movdqa      xmm0,xmm1
+    movdqa      xmm11,xmm12
+    punpckldq   xmm0,xmm5
+    punpckhdq   xmm1,xmm5
+    punpcklqdq  xmm11,xmm0
+    punpckhqdq  xmm12,xmm0
+    movsx       eax,r9w
+    movdqa      xmm14,xmm15
+    punpcklqdq  xmm14,xmm1
+    punpckhqdq  xmm15,xmm1
+    pxor        xmm1,xmm1
+    movd        xmm0,eax
+    movdqa      xmm4,xmm12
+    movdqa      xmm8,xmm11
+    mov         eax, ebp ; iBeta
+    punpcklwd   xmm0,xmm0
+    punpcklbw   xmm4,xmm1
+    punpckhbw   xmm12,xmm1
+    movdqa      xmm9,xmm14
+    movdqa      xmm7,xmm15
+    movdqa      xmm10,xmm15
+    pshufd      xmm13,xmm0,0
+    punpcklbw   xmm9,xmm1
+    punpckhbw   xmm14,xmm1
+    movdqa      xmm6,xmm13
+    movd        xmm0,eax
+    movdqa      [rsp],xmm11
+    mov         eax,2
+    cwde
+    punpckhbw   xmm11,xmm1
+    punpckhbw   xmm10,xmm1
+    punpcklbw   xmm7,xmm1
+    punpcklwd   xmm0,xmm0
+    punpcklbw   xmm8,xmm1
+    pshufd      xmm3,xmm0,0
+    movdqa      xmm1,xmm8
+    movdqa      xmm0,xmm4
+    psubw       xmm0,xmm9
+    psubw       xmm1,xmm4
+    movdqa      xmm2,xmm3
+    pabsw       xmm0,xmm0
+    pcmpgtw     xmm6,xmm0
+    pabsw       xmm0,xmm1
+    movdqa      xmm1,xmm3
+    pcmpgtw     xmm2,xmm0
+    pand        xmm6,xmm2
+    movdqa      xmm0,xmm7
+    movdqa      xmm2,xmm3
+    psubw       xmm0,xmm9
+    pabsw       xmm0,xmm0
+    pcmpgtw     xmm1,xmm0
+    pand        xmm6,xmm1
+    movdqa      xmm0,xmm12
+    movdqa      xmm1,xmm11
+    psubw       xmm0,xmm14
+    psubw       xmm1,xmm12
+    movdqa      xmm5,xmm6
+    pabsw       xmm0,xmm0
+    pcmpgtw     xmm13,xmm0
+    pabsw       xmm0,xmm1
+    movdqa      xmm1,xmm8
+    pcmpgtw     xmm2,xmm0
+    paddw       xmm1,xmm8
+    movdqa      xmm0,xmm10
+    pand        xmm13,xmm2
+    psubw       xmm0,xmm14
+    paddw       xmm1,xmm4
+    movdqa      xmm2,xmm11
+    pabsw       xmm0,xmm0
+    paddw       xmm2,xmm11
+    paddw       xmm1,xmm7
+    pcmpgtw     xmm3,xmm0
+    paddw       xmm2,xmm12
+    movd        xmm0,eax
+    pand        xmm13,xmm3
+    paddw       xmm2,xmm10
+    punpcklwd   xmm0,xmm0
+    pshufd      xmm3,xmm0,0
+    movdqa      xmm0,xmm6
+    paddw       xmm1,xmm3
+    pandn       xmm0,xmm4
+    paddw       xmm2,xmm3
+    psraw       xmm1,2
+    pand        xmm5,xmm1
+    por         xmm5,xmm0
+    paddw       xmm7,xmm7
+    paddw       xmm10,xmm10
+    psraw       xmm2,2
+    movdqa      xmm1,xmm13
+    movdqa      xmm0,xmm13
+    pandn       xmm0,xmm12
+    pand        xmm1,xmm2
+    paddw       xmm7,xmm9
+    por         xmm1,xmm0
+    paddw       xmm10,xmm14
+    paddw       xmm7,xmm8
+    movdqa      xmm0,xmm13
+    packuswb    xmm5,xmm1
+    paddw       xmm7,xmm3
+    paddw       xmm10,xmm11
+    movdqa      xmm1,xmm6
+    paddw       xmm10,xmm3
+    pandn       xmm6,xmm9
+    psraw       xmm7,2
+    pand        xmm1,xmm7
+    psraw       xmm10,2
+    pandn       xmm13,xmm14
+    pand        xmm0,xmm10
+    por         xmm1,xmm6
+    movdqa      xmm6,[rsp]
+    movdqa      xmm4,xmm6
+    por         xmm0,xmm13
+    punpcklbw   xmm4,xmm5
+    punpckhbw   xmm6,xmm5
+    movdqa      xmm3,xmm4
+    packuswb    xmm1,xmm0
+    movdqa      xmm0,xmm1
+    punpckhbw   xmm1,xmm15
+    punpcklbw   xmm0,xmm15
+    punpcklwd   xmm3,xmm0
+    punpckhwd   xmm4,xmm0
+    movdqa      xmm0,xmm6
+    movdqa      xmm2,xmm3
+    punpcklwd   xmm0,xmm1
+    punpckhwd   xmm6,xmm1
+    movdqa      xmm1,xmm4
+    punpckldq   xmm2,xmm0
+    punpckhdq   xmm3,xmm0
+    punpckldq   xmm1,xmm6
+    movdqa      xmm0,xmm2
+    punpcklqdq  xmm0,xmm1
+    punpckhdq   xmm4,xmm6
+    punpckhqdq  xmm2,xmm1
+    movdqa      [rsp+10h],xmm0
+    movdqa      [rsp+60h],xmm2
+    movdqa      xmm0,xmm3
+    mov         eax,[rsp+10h]
+    mov         [rcx-2],eax
+    mov         eax,[rsp+60h]
+    punpcklqdq  xmm0,xmm4
+    punpckhqdq  xmm3,xmm4
+    mov         [r10+rcx-2],eax
+    movdqa      [rsp+20h],xmm0
+    mov         eax, [rsp+20h]
+    movdqa      [rsp+70h],xmm3
+    mov         [rcx+r10*2-2],eax
+    mov         eax,[rsp+70h]
+    mov         [rdx+rcx-2],eax
+    mov         eax,[rsp+18h]
+    mov         [r11],eax
+    mov         eax,[rsp+68h]
+    mov         [r10+r11],eax
+    mov         eax,[rsp+28h]
+    mov         [r11+r10*2],eax
+    mov         eax,[rsp+78h]
+    mov         [rdx+r11],eax
+    mov         eax,[rsp+14h]
+    mov         [rdi-2],eax
+    mov         eax,[rsp+64h]
+    mov         [r10+rdi-2],eax
+    mov         eax,[rsp+24h]
+    mov         [rdi+r10*2-2],eax
+    mov         eax, [rsp+74h]
+    mov         [rdx+rdi-2],eax
+    mov         eax, [rsp+1Ch]
+    mov         [rbx],eax
+    mov         eax, [rsp+6Ch]
+    mov         [r10+rbx],eax
+    mov         eax,[rsp+2Ch]
+    mov         [rbx+r10*2],eax
+    mov         eax,[rsp+7Ch]
+    mov         [rdx+rbx],eax
+    lea         r11,[rsp+140h]
+    mov         rbx, [r11+28h]
+    mov         rsp,r11
+    pop         r12
+    pop         rbp
+    pop         rbx
+    ret
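
The x86-64 entry points in this file begin by copying the System V argument
registers into the registers the function body actually uses; for
DeblockChromaLt4H_ssse3 below this is the mov r13,r8 / mov r14,r9 / mov r8,rdx /
mov r9,rcx / mov rdx,rdi / mov rcx,rsi sequence, and the body later reads r13d
as iBeta and r14 as pTC. A minimal C-side sketch of the prototype this implies,
with the register mapping noted in comments (parameter names other than iBeta
and pTC are assumptions taken from the comment blocks of the related 32-bit
routines later in this file), is:

    #include <stdint.h>

    /* Assumed prototype; only "iBeta" and "pTC" are named in the assembly
     * ("mov eax, r13d ; iBeta", "mov rax, r14 ; pTC").  On entry the prologue
     * copies the System V argument registers as:
     *   rdi -> rdx,  rsi -> rcx,  rdx -> r8,  rcx -> r9,  r8 -> r13,  r9 -> r14
     */
    void DeblockChromaLt4H_ssse3(uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride,
                                 int32_t iAlpha, int32_t iBeta, int8_t* pTC);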
 
 
 WELS_EXTERN DeblockChromaLt4H_ssse3
-  mov         rax,rsp
-  push        rbx
-  push        rbp
-  push        r12
-  push        r13
-  push        r14
-  sub         rsp,170h
+    mov         rax,rsp
+    push        rbx
+    push        rbp
+    push        r12
+    push        r13
+    push        r14
+    sub         rsp,170h
 
-  mov         r13,   r8
-  mov         r14,   r9
-  mov         r8,    rdx
-  mov         r9,    rcx
-  mov         rdx,   rdi
-  mov         rcx,   rsi
+    mov         r13,   r8
+    mov         r14,   r9
+    mov         r8,    rdx
+    mov         r9,    rcx
+    mov         rdx,   rdi
+    mov         rcx,   rsi
 
-  movsxd      rsi,r8d
-  lea         eax,[r8*4]
-  mov         r11d,r9d
-  movsxd      r10,eax
-  mov         eax, [rcx-2]
-  mov         r12,rdx
-  mov         [rsp+40h],eax
-  mov         eax, [rsi+rcx-2]
-  lea         rbx,[r10+rcx-2]
-  movdqa      xmm5,[rsp+40h]
-  mov         [rsp+50h],eax
-  mov         eax, [rcx+rsi*2-2]
-  lea         rbp,[r10+rdx-2]
-  movdqa      xmm2, [rsp+50h]
-  mov         [rsp+60h],eax
-  lea         r10,[rsi+rsi*2]
-  mov         rdi,rcx
-  mov         eax,[r10+rcx-2]
-  movdqa      xmm4,[rsp+60h]
-  mov         [rsp+70h],eax
-  mov         eax,[rdx-2]
-  mov         [rsp+80h],eax
-  mov         eax, [rsi+rdx-2]
-  movdqa      xmm3,[rsp+70h]
-  mov         [rsp+90h],eax
-  mov         eax,[rdx+rsi*2-2]
-  punpckldq   xmm5,[rsp+80h]
-  mov         [rsp+0A0h],eax
-  mov         eax, [r10+rdx-2]
-  punpckldq   xmm2,[rsp+90h]
-  mov         [rsp+0B0h],eax
-  mov         eax, [rbx]
-  punpckldq   xmm4,[rsp+0A0h]
-  mov         [rsp+80h],eax
-  mov         eax,[rbp]
-  punpckldq   xmm3,[rsp+0B0h]
-  mov         [rsp+90h],eax
-  mov         eax,[rsi+rbx]
-  movdqa      xmm0,[rsp+80h]
-  punpckldq   xmm0,[rsp+90h]
-  punpcklqdq  xmm5,xmm0
-  movdqa      [rsp+80h],xmm0
-  mov         [rsp+80h],eax
-  mov         eax,[rsi+rbp]
-  movdqa      xmm0,[rsp+80h]
-  movdqa      xmm1,xmm5
-  mov         [rsp+90h],eax
-  mov         eax,[rbx+rsi*2]
-  punpckldq   xmm0,[rsp+90h]
-  punpcklqdq  xmm2,xmm0
-  punpcklbw   xmm1,xmm2
-  punpckhbw   xmm5,xmm2
-  movdqa      [rsp+80h],xmm0
-  mov         [rsp+80h],eax
-  mov         eax,[rbp+rsi*2]
-  movdqa      xmm0, [rsp+80h]
-  mov         [rsp+90h],eax
-  mov         eax,[r10+rbx]
-  movdqa      xmm7,xmm1
-  punpckldq   xmm0,[rsp+90h]
-  punpcklqdq  xmm4,xmm0
-  movdqa      [rsp+80h],xmm0
-  mov         [rsp+80h],eax
-  mov         eax, [r10+rbp]
-  movdqa      xmm0,[rsp+80h]
-  mov         [rsp+90h],eax
-  punpckldq   xmm0,[rsp+90h]
-  punpcklqdq  xmm3,xmm0
-  movdqa      xmm0,xmm4
-  punpcklbw   xmm0,xmm3
-  punpckhbw   xmm4,xmm3
-  punpcklwd   xmm7,xmm0
-  punpckhwd   xmm1,xmm0
-  movdqa      xmm0,xmm5
-  movdqa      xmm6,xmm7
-  punpcklwd   xmm0,xmm4
-  punpckhwd   xmm5,xmm4
-  punpckldq   xmm6,xmm0
-  punpckhdq   xmm7,xmm0
-  movdqa      xmm0,xmm1
-  punpckldq   xmm0,xmm5
-  mov         rax, r14    ; pTC
-  punpckhdq   xmm1,xmm5
-  movdqa      xmm9,xmm6
-  punpckhqdq  xmm6,xmm0
-  punpcklqdq  xmm9,xmm0
-  movdqa      xmm2,xmm7
-  movdqa      xmm13,xmm6
-  movdqa      xmm4,xmm9
-  movdqa      [rsp+10h],xmm9
-  punpcklqdq  xmm2,xmm1
-  punpckhqdq  xmm7,xmm1
-  pxor        xmm1,xmm1
-  movsx       ecx,byte [rax+3]
-  movsx       edx,byte [rax+2]
-  movsx       r8d,byte [rax+1]
-  movsx       r9d,byte [rax]
-  movdqa      xmm10,xmm1
-  movdqa      xmm15,xmm2
-  punpckhbw   xmm2,xmm1
-  punpckhbw   xmm6,xmm1
-  punpcklbw   xmm4,xmm1
-  movsx       eax,r11w
-  mov         word [rsp+0Eh],cx
-  mov         word [rsp+0Ch],cx
-  movdqa      xmm3,xmm7
-  movdqa      xmm8,xmm7
-  movdqa      [rsp+20h],xmm7
-  punpcklbw   xmm15,xmm1
-  punpcklbw   xmm13,xmm1
-  punpcklbw   xmm3,xmm1
-  mov         word [rsp+0Ah],dx
-  mov         word [rsp+8],dx
-  mov         word [rsp+6],r8w
-  movd        xmm0,eax
-  movdqa      [rsp+30h],xmm6
-  punpckhbw   xmm9,xmm1
-  punpckhbw   xmm8,xmm1
-  punpcklwd   xmm0,xmm0
-  mov         eax, r13d   ; iBeta
-  mov         word [rsp+4],r8w
-  mov         word [rsp+2],r9w
-  pshufd      xmm12,xmm0,0
-  mov         word [rsp],r9w
-  movd        xmm0,eax
-  mov         eax,4
-  cwde
-  movdqa      xmm14, [rsp]
-  movdqa      [rsp],xmm2
-  movdqa      xmm2,xmm12
-  punpcklwd   xmm0,xmm0
-  pshufd      xmm11,xmm0,0
-  psubw       xmm10,xmm14
-  movd        xmm0,eax
-  movdqa      xmm7,xmm14
-  movdqa      xmm6,xmm14
-  pcmpgtw     xmm7,xmm1
-  punpcklwd   xmm0,xmm0
-  pshufd      xmm5,xmm0,0
-  movdqa      xmm0,xmm4
-  movdqa      xmm1,xmm15
-  psubw       xmm4,xmm13
-  psubw       xmm0,xmm3
-  psubw       xmm1,xmm13
-  psubw       xmm3,xmm15
-  psllw       xmm1,2
-  paddw       xmm1,xmm0
-  paddw       xmm1,xmm5
-  movdqa      xmm0,xmm10
-  psraw       xmm1,3
-  pmaxsw      xmm0,xmm1
-  pminsw      xmm6,xmm0
-  movdqa      xmm1,xmm11
-  movdqa      xmm0,xmm13
-  psubw       xmm0,xmm15
-  pabsw       xmm0,xmm0
-  pcmpgtw     xmm2,xmm0
-  pabsw       xmm0,xmm4
-  pcmpgtw     xmm1,xmm0
-  pabsw       xmm0,xmm3
-  pand        xmm2,xmm1
-  movdqa      xmm1,xmm11
-  movdqa      xmm3,[rsp+30h]
-  pcmpgtw     xmm1,xmm0
-  movdqa      xmm0,xmm9
-  pand        xmm2,xmm1
-  psubw       xmm0,xmm8
-  psubw       xmm9,xmm3
-  pand        xmm2,xmm7
-  pand        xmm6,xmm2
-  psubw       xmm15,xmm6
-  paddw       xmm13,xmm6
-  movdqa      xmm2,[rsp]
-  movdqa      xmm1,xmm2
-  psubw       xmm1,xmm3
-  psubw       xmm8,xmm2
-  psllw       xmm1,2
-  paddw       xmm1,xmm0
-  paddw       xmm1,xmm5
-  movdqa      xmm0,xmm3
-  movdqa      xmm5,[rsp+10h]
-  psubw       xmm0,xmm2
-  psraw       xmm1,3
-  movdqa      xmm4,xmm5
-  pabsw       xmm0,xmm0
-  pmaxsw      xmm10,xmm1
-  movdqa      xmm1,xmm11
-  pcmpgtw     xmm12,xmm0
-  pabsw       xmm0,xmm9
-  pminsw      xmm14,xmm10
-  pcmpgtw     xmm1,xmm0
-  pabsw       xmm0,xmm8
-  pcmpgtw     xmm11,xmm0
-  pand        xmm12,xmm1
-  movdqa      xmm1,[rsp+20h]
-  pand        xmm12,xmm11
-  pand        xmm12,xmm7
-  pand        xmm14,xmm12
-  paddw       xmm3,xmm14
-  psubw       xmm2,xmm14
-  packuswb    xmm13,xmm3
-  packuswb    xmm15,xmm2
-  punpcklbw   xmm4,xmm13
-  punpckhbw   xmm5,xmm13
-  movdqa      xmm0,xmm15
-  punpcklbw   xmm0,xmm1
-  punpckhbw   xmm15,xmm1
-  movdqa      xmm3,xmm4
-  punpcklwd   xmm3,xmm0
-  punpckhwd   xmm4,xmm0
-  movdqa      xmm0,xmm5
-  movdqa      xmm2,xmm3
-  movdqa      xmm1,xmm4
-  punpcklwd   xmm0,xmm15
-  punpckhwd   xmm5,xmm15
-  punpckldq   xmm2,xmm0
-  punpckhdq   xmm3,xmm0
-  punpckldq   xmm1,xmm5
-  movdqa      xmm0,xmm2
-  punpcklqdq  xmm0,xmm1
-  punpckhdq   xmm4,xmm5
-  punpckhqdq  xmm2,xmm1
-  movdqa      [rsp+40h],xmm0
-  movdqa      xmm0,xmm3
-  movdqa      [rsp+90h],xmm2
-  mov         eax,[rsp+40h]
-  mov         [rdi-2],eax
-  mov         eax, [rsp+90h]
-  punpcklqdq  xmm0,xmm4
-  punpckhqdq  xmm3,xmm4
-  mov         [rsi+rdi-2],eax
-  movdqa      [rsp+50h],xmm0
-  mov         eax,[rsp+50h]
-  movdqa      [rsp+0A0h],xmm3
-  mov         [rdi+rsi*2-2],eax
-  mov         eax,[rsp+0A0h]
-  mov         [r10+rdi-2],eax
-  mov         eax,[rsp+48h]
-  mov         [rbx],eax
-  mov         eax,[rsp+98h]
-  mov         [rsi+rbx],eax
-  mov         eax,[rsp+58h]
-  mov         [rbx+rsi*2],eax
-  mov         eax, [rsp+0A8h]
-  mov         [r10+rbx],eax
-  mov         eax, [rsp+44h]
-  mov         [r12-2],eax
-  mov         eax,[rsp+94h]
-  mov         [rsi+r12-2],eax
-  mov         eax,[rsp+54h]
-  mov         [r12+rsi*2-2],eax
-  mov         eax, [rsp+0A4h]
-  mov         [r10+r12-2],eax
-  mov         eax,[rsp+4Ch]
-  mov         [rbp],eax
-  mov         eax,[rsp+9Ch]
-  mov         [rsi+rbp],eax
-  mov         eax, [rsp+5Ch]
-  mov         [rbp+rsi*2],eax
-  mov         eax,[rsp+0ACh]
-  mov         [r10+rbp],eax
-  lea         r11,[rsp+170h]
-  mov         rsp,r11
-  pop         r14
-  pop         r13
-  pop         r12
-  pop         rbp
-  pop         rbx
-  ret
+    movsxd      rsi,r8d
+    lea         eax,[r8*4]
+    mov         r11d,r9d
+    movsxd      r10,eax
+    mov         eax, [rcx-2]
+    mov         r12,rdx
+    mov         [rsp+40h],eax
+    mov         eax, [rsi+rcx-2]
+    lea         rbx,[r10+rcx-2]
+    movdqa      xmm5,[rsp+40h]
+    mov         [rsp+50h],eax
+    mov         eax, [rcx+rsi*2-2]
+    lea         rbp,[r10+rdx-2]
+    movdqa      xmm2, [rsp+50h]
+    mov         [rsp+60h],eax
+    lea         r10,[rsi+rsi*2]
+    mov         rdi,rcx
+    mov         eax,[r10+rcx-2]
+    movdqa      xmm4,[rsp+60h]
+    mov         [rsp+70h],eax
+    mov         eax,[rdx-2]
+    mov         [rsp+80h],eax
+    mov         eax, [rsi+rdx-2]
+    movdqa      xmm3,[rsp+70h]
+    mov         [rsp+90h],eax
+    mov         eax,[rdx+rsi*2-2]
+    punpckldq   xmm5,[rsp+80h]
+    mov         [rsp+0A0h],eax
+    mov         eax, [r10+rdx-2]
+    punpckldq   xmm2,[rsp+90h]
+    mov         [rsp+0B0h],eax
+    mov         eax, [rbx]
+    punpckldq   xmm4,[rsp+0A0h]
+    mov         [rsp+80h],eax
+    mov         eax,[rbp]
+    punpckldq   xmm3,[rsp+0B0h]
+    mov         [rsp+90h],eax
+    mov         eax,[rsi+rbx]
+    movdqa      xmm0,[rsp+80h]
+    punpckldq   xmm0,[rsp+90h]
+    punpcklqdq  xmm5,xmm0
+    movdqa      [rsp+80h],xmm0
+    mov         [rsp+80h],eax
+    mov         eax,[rsi+rbp]
+    movdqa      xmm0,[rsp+80h]
+    movdqa      xmm1,xmm5
+    mov         [rsp+90h],eax
+    mov         eax,[rbx+rsi*2]
+    punpckldq   xmm0,[rsp+90h]
+    punpcklqdq  xmm2,xmm0
+    punpcklbw   xmm1,xmm2
+    punpckhbw   xmm5,xmm2
+    movdqa      [rsp+80h],xmm0
+    mov         [rsp+80h],eax
+    mov         eax,[rbp+rsi*2]
+    movdqa      xmm0, [rsp+80h]
+    mov         [rsp+90h],eax
+    mov         eax,[r10+rbx]
+    movdqa      xmm7,xmm1
+    punpckldq   xmm0,[rsp+90h]
+    punpcklqdq  xmm4,xmm0
+    movdqa      [rsp+80h],xmm0
+    mov         [rsp+80h],eax
+    mov         eax, [r10+rbp]
+    movdqa      xmm0,[rsp+80h]
+    mov         [rsp+90h],eax
+    punpckldq   xmm0,[rsp+90h]
+    punpcklqdq  xmm3,xmm0
+    movdqa      xmm0,xmm4
+    punpcklbw   xmm0,xmm3
+    punpckhbw   xmm4,xmm3
+    punpcklwd   xmm7,xmm0
+    punpckhwd   xmm1,xmm0
+    movdqa      xmm0,xmm5
+    movdqa      xmm6,xmm7
+    punpcklwd   xmm0,xmm4
+    punpckhwd   xmm5,xmm4
+    punpckldq   xmm6,xmm0
+    punpckhdq   xmm7,xmm0
+    movdqa      xmm0,xmm1
+    punpckldq   xmm0,xmm5
+    mov         rax, r14    ; pTC
+    punpckhdq   xmm1,xmm5
+    movdqa      xmm9,xmm6
+    punpckhqdq  xmm6,xmm0
+    punpcklqdq  xmm9,xmm0
+    movdqa      xmm2,xmm7
+    movdqa      xmm13,xmm6
+    movdqa      xmm4,xmm9
+    movdqa      [rsp+10h],xmm9
+    punpcklqdq  xmm2,xmm1
+    punpckhqdq  xmm7,xmm1
+    pxor        xmm1,xmm1
+    movsx       ecx,byte [rax+3]
+    movsx       edx,byte [rax+2]
+    movsx       r8d,byte [rax+1]
+    movsx       r9d,byte [rax]
+    movdqa      xmm10,xmm1
+    movdqa      xmm15,xmm2
+    punpckhbw   xmm2,xmm1
+    punpckhbw   xmm6,xmm1
+    punpcklbw   xmm4,xmm1
+    movsx       eax,r11w
+    mov         word [rsp+0Eh],cx
+    mov         word [rsp+0Ch],cx
+    movdqa      xmm3,xmm7
+    movdqa      xmm8,xmm7
+    movdqa      [rsp+20h],xmm7
+    punpcklbw   xmm15,xmm1
+    punpcklbw   xmm13,xmm1
+    punpcklbw   xmm3,xmm1
+    mov         word [rsp+0Ah],dx
+    mov         word [rsp+8],dx
+    mov         word [rsp+6],r8w
+    movd        xmm0,eax
+    movdqa      [rsp+30h],xmm6
+    punpckhbw   xmm9,xmm1
+    punpckhbw   xmm8,xmm1
+    punpcklwd   xmm0,xmm0
+    mov         eax, r13d   ; iBeta
+    mov         word [rsp+4],r8w
+    mov         word [rsp+2],r9w
+    pshufd      xmm12,xmm0,0
+    mov         word [rsp],r9w
+    movd        xmm0,eax
+    mov         eax,4
+    cwde
+    movdqa      xmm14, [rsp]
+    movdqa      [rsp],xmm2
+    movdqa      xmm2,xmm12
+    punpcklwd   xmm0,xmm0
+    pshufd      xmm11,xmm0,0
+    psubw       xmm10,xmm14
+    movd        xmm0,eax
+    movdqa      xmm7,xmm14
+    movdqa      xmm6,xmm14
+    pcmpgtw     xmm7,xmm1
+    punpcklwd   xmm0,xmm0
+    pshufd      xmm5,xmm0,0
+    movdqa      xmm0,xmm4
+    movdqa      xmm1,xmm15
+    psubw       xmm4,xmm13
+    psubw       xmm0,xmm3
+    psubw       xmm1,xmm13
+    psubw       xmm3,xmm15
+    psllw       xmm1,2
+    paddw       xmm1,xmm0
+    paddw       xmm1,xmm5
+    movdqa      xmm0,xmm10
+    psraw       xmm1,3
+    pmaxsw      xmm0,xmm1
+    pminsw      xmm6,xmm0
+    movdqa      xmm1,xmm11
+    movdqa      xmm0,xmm13
+    psubw       xmm0,xmm15
+    pabsw       xmm0,xmm0
+    pcmpgtw     xmm2,xmm0
+    pabsw       xmm0,xmm4
+    pcmpgtw     xmm1,xmm0
+    pabsw       xmm0,xmm3
+    pand        xmm2,xmm1
+    movdqa      xmm1,xmm11
+    movdqa      xmm3,[rsp+30h]
+    pcmpgtw     xmm1,xmm0
+    movdqa      xmm0,xmm9
+    pand        xmm2,xmm1
+    psubw       xmm0,xmm8
+    psubw       xmm9,xmm3
+    pand        xmm2,xmm7
+    pand        xmm6,xmm2
+    psubw       xmm15,xmm6
+    paddw       xmm13,xmm6
+    movdqa      xmm2,[rsp]
+    movdqa      xmm1,xmm2
+    psubw       xmm1,xmm3
+    psubw       xmm8,xmm2
+    psllw       xmm1,2
+    paddw       xmm1,xmm0
+    paddw       xmm1,xmm5
+    movdqa      xmm0,xmm3
+    movdqa      xmm5,[rsp+10h]
+    psubw       xmm0,xmm2
+    psraw       xmm1,3
+    movdqa      xmm4,xmm5
+    pabsw       xmm0,xmm0
+    pmaxsw      xmm10,xmm1
+    movdqa      xmm1,xmm11
+    pcmpgtw     xmm12,xmm0
+    pabsw       xmm0,xmm9
+    pminsw      xmm14,xmm10
+    pcmpgtw     xmm1,xmm0
+    pabsw       xmm0,xmm8
+    pcmpgtw     xmm11,xmm0
+    pand        xmm12,xmm1
+    movdqa      xmm1,[rsp+20h]
+    pand        xmm12,xmm11
+    pand        xmm12,xmm7
+    pand        xmm14,xmm12
+    paddw       xmm3,xmm14
+    psubw       xmm2,xmm14
+    packuswb    xmm13,xmm3
+    packuswb    xmm15,xmm2
+    punpcklbw   xmm4,xmm13
+    punpckhbw   xmm5,xmm13
+    movdqa      xmm0,xmm15
+    punpcklbw   xmm0,xmm1
+    punpckhbw   xmm15,xmm1
+    movdqa      xmm3,xmm4
+    punpcklwd   xmm3,xmm0
+    punpckhwd   xmm4,xmm0
+    movdqa      xmm0,xmm5
+    movdqa      xmm2,xmm3
+    movdqa      xmm1,xmm4
+    punpcklwd   xmm0,xmm15
+    punpckhwd   xmm5,xmm15
+    punpckldq   xmm2,xmm0
+    punpckhdq   xmm3,xmm0
+    punpckldq   xmm1,xmm5
+    movdqa      xmm0,xmm2
+    punpcklqdq  xmm0,xmm1
+    punpckhdq   xmm4,xmm5
+    punpckhqdq  xmm2,xmm1
+    movdqa      [rsp+40h],xmm0
+    movdqa      xmm0,xmm3
+    movdqa      [rsp+90h],xmm2
+    mov         eax,[rsp+40h]
+    mov         [rdi-2],eax
+    mov         eax, [rsp+90h]
+    punpcklqdq  xmm0,xmm4
+    punpckhqdq  xmm3,xmm4
+    mov         [rsi+rdi-2],eax
+    movdqa      [rsp+50h],xmm0
+    mov         eax,[rsp+50h]
+    movdqa      [rsp+0A0h],xmm3
+    mov         [rdi+rsi*2-2],eax
+    mov         eax,[rsp+0A0h]
+    mov         [r10+rdi-2],eax
+    mov         eax,[rsp+48h]
+    mov         [rbx],eax
+    mov         eax,[rsp+98h]
+    mov         [rsi+rbx],eax
+    mov         eax,[rsp+58h]
+    mov         [rbx+rsi*2],eax
+    mov         eax, [rsp+0A8h]
+    mov         [r10+rbx],eax
+    mov         eax, [rsp+44h]
+    mov         [r12-2],eax
+    mov         eax,[rsp+94h]
+    mov         [rsi+r12-2],eax
+    mov         eax,[rsp+54h]
+    mov         [r12+rsi*2-2],eax
+    mov         eax, [rsp+0A4h]
+    mov         [r10+r12-2],eax
+    mov         eax,[rsp+4Ch]
+    mov         [rbp],eax
+    mov         eax,[rsp+9Ch]
+    mov         [rsi+rbp],eax
+    mov         eax, [rsp+5Ch]
+    mov         [rbp+rsi*2],eax
+    mov         eax,[rsp+0ACh]
+    mov         [r10+rbp],eax
+    lea         r11,[rsp+170h]
+    mov         rsp,r11
+    pop         r14
+    pop         r13
+    pop         r12
+    pop         rbp
+    pop         rbx
+    ret
 
 
 
@@ -3233,166 +3233,166 @@
 ;                             int32_t iAlpha, int32_t iBeta)
 ;********************************************************************************
 WELS_EXTERN   DeblockChromaEq4V_ssse3
-  push        ebp
-  mov         ebp,esp
-  and         esp,0FFFFFFF0h
-  sub         esp,68h
-  mov         edx,[ebp+10h]      ;  iStride
-  mov         eax,[ebp+8]        ;  pPixCb
-  mov         ecx,[ebp+0Ch]      ;  pPixCr
-  movq        xmm4,[ecx]
-  movq        xmm5,[edx+ecx]
-  push        esi
-  push        edi
-  lea         esi,[edx+edx]
-  mov         edi,eax
-  sub         edi,esi
-  movq        xmm1,[edi]
-  mov         edi,ecx
-  sub         edi,esi
-  movq        xmm2,[edi]
-  punpcklqdq  xmm1,xmm2
-  mov         esi,eax
-  sub         esi,edx
-  movq        xmm2,[esi]
-  mov         edi,ecx
-  sub         edi,edx
-  movq        xmm3,[edi]
-  punpcklqdq  xmm2,xmm3
-  movq        xmm3,[eax]
-  punpcklqdq  xmm3,xmm4
-  movq        xmm4,[edx+eax]
-  mov       edx, [ebp + 14h]
-  punpcklqdq  xmm4,xmm5
-  movd        xmm5,edx
-  mov       edx, [ebp + 18h]
-  pxor        xmm0,xmm0
-  movdqa      xmm6,xmm5
-  punpcklwd   xmm6,xmm5
-  pshufd      xmm5,xmm6,0
-  movd        xmm6,edx
-  movdqa      xmm7,xmm6
-  punpcklwd   xmm7,xmm6
-  pshufd      xmm6,xmm7,0
-  movdqa      xmm7,xmm1
-  punpckhbw   xmm1,xmm0
-  punpcklbw   xmm7,xmm0
-  movdqa      [esp+40h],xmm1
-  movdqa      [esp+60h],xmm7
-  movdqa      xmm7,xmm2
-  punpcklbw   xmm7,xmm0
-  movdqa      [esp+10h],xmm7
-  movdqa      xmm7,xmm3
-  punpcklbw   xmm7,xmm0
-  punpckhbw   xmm3,xmm0
-  movdqa      [esp+50h],xmm7
-  movdqa      xmm7,xmm4
-  punpckhbw   xmm4,xmm0
-  punpckhbw   xmm2,xmm0
-  punpcklbw   xmm7,xmm0
-  movdqa      [esp+30h],xmm3
-  movdqa      xmm3,[esp+10h]
-  movdqa      xmm1,xmm3
-  psubw       xmm1,[esp+50h]
-  pabsw       xmm1,xmm1
-  movdqa      [esp+20h],xmm4
-  movdqa      xmm0,xmm5
-  pcmpgtw     xmm0,xmm1
-  movdqa      xmm1,[esp+60h]
-  psubw       xmm1,xmm3
-  pabsw       xmm1,xmm1
-  movdqa      xmm4,xmm6
-  pcmpgtw     xmm4,xmm1
-  pand        xmm0,xmm4
-  movdqa      xmm1,xmm7
-  psubw       xmm1,[esp+50h]
-  pabsw       xmm1,xmm1
-  movdqa      xmm4,xmm6
-  pcmpgtw     xmm4,xmm1
-  movdqa      xmm1,xmm2
-  psubw       xmm1,[esp+30h]
-  pabsw       xmm1,xmm1
-  pcmpgtw     xmm5,xmm1
-  movdqa      xmm1,[esp+40h]
-  pand        xmm0,xmm4
-  psubw       xmm1,xmm2
-  pabsw       xmm1,xmm1
-  movdqa      xmm4,xmm6
-  pcmpgtw     xmm4,xmm1
-  movdqa      xmm1,[esp+20h]
-  psubw       xmm1,[esp+30h]
-  pand        xmm5,xmm4
-  pabsw       xmm1,xmm1
-  pcmpgtw     xmm6,xmm1
-  pand        xmm5,xmm6
-  mov         edx,2
-  movsx       edx,dx
-  movd        xmm1,edx
-  movdqa      xmm4,xmm1
-  punpcklwd   xmm4,xmm1
-  pshufd      xmm1,xmm4,0
-  movdqa      xmm4,[esp+60h]
-  movdqa      xmm6,xmm4
-  paddw       xmm6,xmm4
-  paddw       xmm6,xmm3
-  paddw       xmm6,xmm7
-  movdqa      [esp+10h],xmm1
-  paddw       xmm6,[esp+10h]
-  psraw       xmm6,2
-  movdqa      xmm4,xmm0
-  pandn       xmm4,xmm3
-  movdqa      xmm3,[esp+40h]
-  movdqa      xmm1,xmm0
-  pand        xmm1,xmm6
-  por         xmm1,xmm4
-  movdqa      xmm6,xmm3
-  paddw       xmm6,xmm3
-  movdqa      xmm3,[esp+10h]
-  paddw       xmm6,xmm2
-  paddw       xmm6,[esp+20h]
-  paddw       xmm6,xmm3
-  psraw       xmm6,2
-  movdqa      xmm4,xmm5
-  pand        xmm4,xmm6
-  movdqa      xmm6,xmm5
-  pandn       xmm6,xmm2
-  por         xmm4,xmm6
-  packuswb    xmm1,xmm4
-  movdqa      xmm4,[esp+50h]
-  movdqa      xmm6,xmm7
-  paddw       xmm6,xmm7
-  paddw       xmm6,xmm4
-  paddw       xmm6,[esp+60h]
-  paddw       xmm6,xmm3
-  psraw       xmm6,2
-  movdqa      xmm2,xmm0
-  pand        xmm2,xmm6
-  pandn       xmm0,xmm4
-  por         xmm2,xmm0
-  movdqa      xmm0,[esp+20h]
-  movdqa      xmm6,xmm0
-  paddw       xmm6,xmm0
-  movdqa      xmm0,[esp+30h]
-  paddw       xmm6,xmm0
-  paddw       xmm6,[esp+40h]
-  movdqa      xmm4,xmm5
-  paddw       xmm6,xmm3
-  movq        [esi],xmm1
-  psraw       xmm6,2
-  pand        xmm4,xmm6
-  pandn       xmm5,xmm0
-  por         xmm4,xmm5
-  packuswb    xmm2,xmm4
-  movq        [eax],xmm2
-  psrldq      xmm1,8
-  movq        [edi],xmm1
-  pop         edi
-  psrldq      xmm2,8
-  movq        [ecx],xmm2
-  pop         esi
-  mov         esp,ebp
-  pop         ebp
-  ret
+    push        ebp
+    mov         ebp,esp
+    and         esp,0FFFFFFF0h
+    sub         esp,68h
+    mov         edx,[ebp+10h]      ;  iStride
+    mov         eax,[ebp+8]        ;  pPixCb
+    mov         ecx,[ebp+0Ch]      ;  pPixCr
+    movq        xmm4,[ecx]
+    movq        xmm5,[edx+ecx]
+    push        esi
+    push        edi
+    lea         esi,[edx+edx]
+    mov         edi,eax
+    sub         edi,esi
+    movq        xmm1,[edi]
+    mov         edi,ecx
+    sub         edi,esi
+    movq        xmm2,[edi]
+    punpcklqdq  xmm1,xmm2
+    mov         esi,eax
+    sub         esi,edx
+    movq        xmm2,[esi]
+    mov         edi,ecx
+    sub         edi,edx
+    movq        xmm3,[edi]
+    punpcklqdq  xmm2,xmm3
+    movq        xmm3,[eax]
+    punpcklqdq  xmm3,xmm4
+    movq        xmm4,[edx+eax]
+    mov       edx, [ebp + 14h]
+    punpcklqdq  xmm4,xmm5
+    movd        xmm5,edx
+    mov       edx, [ebp + 18h]
+    pxor        xmm0,xmm0
+    movdqa      xmm6,xmm5
+    punpcklwd   xmm6,xmm5
+    pshufd      xmm5,xmm6,0
+    movd        xmm6,edx
+    movdqa      xmm7,xmm6
+    punpcklwd   xmm7,xmm6
+    pshufd      xmm6,xmm7,0
+    movdqa      xmm7,xmm1
+    punpckhbw   xmm1,xmm0
+    punpcklbw   xmm7,xmm0
+    movdqa      [esp+40h],xmm1
+    movdqa      [esp+60h],xmm7
+    movdqa      xmm7,xmm2
+    punpcklbw   xmm7,xmm0
+    movdqa      [esp+10h],xmm7
+    movdqa      xmm7,xmm3
+    punpcklbw   xmm7,xmm0
+    punpckhbw   xmm3,xmm0
+    movdqa      [esp+50h],xmm7
+    movdqa      xmm7,xmm4
+    punpckhbw   xmm4,xmm0
+    punpckhbw   xmm2,xmm0
+    punpcklbw   xmm7,xmm0
+    movdqa      [esp+30h],xmm3
+    movdqa      xmm3,[esp+10h]
+    movdqa      xmm1,xmm3
+    psubw       xmm1,[esp+50h]
+    pabsw       xmm1,xmm1
+    movdqa      [esp+20h],xmm4
+    movdqa      xmm0,xmm5
+    pcmpgtw     xmm0,xmm1
+    movdqa      xmm1,[esp+60h]
+    psubw       xmm1,xmm3
+    pabsw       xmm1,xmm1
+    movdqa      xmm4,xmm6
+    pcmpgtw     xmm4,xmm1
+    pand        xmm0,xmm4
+    movdqa      xmm1,xmm7
+    psubw       xmm1,[esp+50h]
+    pabsw       xmm1,xmm1
+    movdqa      xmm4,xmm6
+    pcmpgtw     xmm4,xmm1
+    movdqa      xmm1,xmm2
+    psubw       xmm1,[esp+30h]
+    pabsw       xmm1,xmm1
+    pcmpgtw     xmm5,xmm1
+    movdqa      xmm1,[esp+40h]
+    pand        xmm0,xmm4
+    psubw       xmm1,xmm2
+    pabsw       xmm1,xmm1
+    movdqa      xmm4,xmm6
+    pcmpgtw     xmm4,xmm1
+    movdqa      xmm1,[esp+20h]
+    psubw       xmm1,[esp+30h]
+    pand        xmm5,xmm4
+    pabsw       xmm1,xmm1
+    pcmpgtw     xmm6,xmm1
+    pand        xmm5,xmm6
+    mov         edx,2
+    movsx       edx,dx
+    movd        xmm1,edx
+    movdqa      xmm4,xmm1
+    punpcklwd   xmm4,xmm1
+    pshufd      xmm1,xmm4,0
+    movdqa      xmm4,[esp+60h]
+    movdqa      xmm6,xmm4
+    paddw       xmm6,xmm4
+    paddw       xmm6,xmm3
+    paddw       xmm6,xmm7
+    movdqa      [esp+10h],xmm1
+    paddw       xmm6,[esp+10h]
+    psraw       xmm6,2
+    movdqa      xmm4,xmm0
+    pandn       xmm4,xmm3
+    movdqa      xmm3,[esp+40h]
+    movdqa      xmm1,xmm0
+    pand        xmm1,xmm6
+    por         xmm1,xmm4
+    movdqa      xmm6,xmm3
+    paddw       xmm6,xmm3
+    movdqa      xmm3,[esp+10h]
+    paddw       xmm6,xmm2
+    paddw       xmm6,[esp+20h]
+    paddw       xmm6,xmm3
+    psraw       xmm6,2
+    movdqa      xmm4,xmm5
+    pand        xmm4,xmm6
+    movdqa      xmm6,xmm5
+    pandn       xmm6,xmm2
+    por         xmm4,xmm6
+    packuswb    xmm1,xmm4
+    movdqa      xmm4,[esp+50h]
+    movdqa      xmm6,xmm7
+    paddw       xmm6,xmm7
+    paddw       xmm6,xmm4
+    paddw       xmm6,[esp+60h]
+    paddw       xmm6,xmm3
+    psraw       xmm6,2
+    movdqa      xmm2,xmm0
+    pand        xmm2,xmm6
+    pandn       xmm0,xmm4
+    por         xmm2,xmm0
+    movdqa      xmm0,[esp+20h]
+    movdqa      xmm6,xmm0
+    paddw       xmm6,xmm0
+    movdqa      xmm0,[esp+30h]
+    paddw       xmm6,xmm0
+    paddw       xmm6,[esp+40h]
+    movdqa      xmm4,xmm5
+    paddw       xmm6,xmm3
+    movq        [esi],xmm1
+    psraw       xmm6,2
+    pand        xmm4,xmm6
+    pandn       xmm5,xmm0
+    por         xmm4,xmm5
+    packuswb    xmm2,xmm4
+    movq        [eax],xmm2
+    psrldq      xmm1,8
+    movq        [edi],xmm1
+    pop         edi
+    psrldq      xmm2,8
+    movq        [ecx],xmm2
+    pop         esi
+    mov         esp,ebp
+    pop         ebp
+    ret
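
In the 32-bit build the arguments arrive on the stack: the loads above use
[ebp+8] (pPixCb), [ebp+0Ch] (pPixCr), [ebp+10h] (iStride), and [ebp+14h] /
[ebp+18h] for the iAlpha / iBeta thresholds named in the comment block before
this routine. A minimal, self-contained C sketch of a call, assuming the symbol
is linked in under this name; the buffer geometry and the alpha/beta values are
illustrative only:

    #include <stdint.h>
    #include <string.h>

    void DeblockChromaEq4V_ssse3(uint8_t* pPixCb, uint8_t* pPixCr,
                                 int32_t iStride, int32_t iAlpha, int32_t iBeta);

    int main(void) {
        enum { STRIDE = 16, ROWS = 8 };
        static uint8_t cb[STRIDE * ROWS], cr[STRIDE * ROWS];
        memset(cb, 128, sizeof cb);   /* flat chroma planes, nothing to filter */
        memset(cr, 128, sizeof cr);
        /* The routine loads the two rows above and the two rows starting at the
         * passed pointer (pPix - 2*iStride .. pPix + iStride) and rewrites the
         * two rows adjacent to the edge, so point it at the first row below the
         * horizontal edge being filtered. */
        DeblockChromaEq4V_ssse3(cb + 4 * STRIDE, cr + 4 * STRIDE, STRIDE, 36, 14);
        return 0;
    }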
 
 ;******************************************************************************
 ; void DeblockChromaLt4V_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
@@ -3400,200 +3400,200 @@
 ;*******************************************************************************
 
 WELS_EXTERN  DeblockChromaLt4V_ssse3
-  push        ebp
-  mov         ebp,esp
-  and         esp,0FFFFFFF0h
-  sub         esp,0E4h
-  push        ebx
-  push        esi
-  mov         esi, [ebp+1Ch]      ;  pTC
-  movsx       ebx, byte [esi+2]
-  push        edi
-  movsx       di,byte [esi+3]
-  mov         word [esp+0Ch],bx
-  movsx       bx,byte  [esi+1]
-  movsx       esi,byte  [esi]
-  mov         word  [esp+0Eh],si
-  movzx       esi,di
-  movd        xmm1,esi
-  movzx       esi,di
-  movd        xmm2,esi
-  mov         si,word  [esp+0Ch]
-  mov         edx, [ebp + 10h]
-  mov         eax, [ebp + 08h]
-  movzx       edi,si
-  movzx       esi,si
-  mov         ecx, [ebp + 0Ch]
-  movd        xmm4,esi
-  movzx       esi,bx
-  movd        xmm5,esi
-  movd        xmm3,edi
-  movzx       esi,bx
-  movd        xmm6,esi
-  mov         si,word [esp+0Eh]
-  movzx       edi,si
-  movzx       esi,si
-  punpcklwd   xmm6,xmm2
-  pxor        xmm0,xmm0
-  movdqa      [esp+40h],xmm0
-  movd        xmm7,edi
-  movd        xmm0,esi
-  lea         esi,[edx+edx]
-  mov         edi,eax
-  sub         edi,esi
-  punpcklwd   xmm5,xmm1
-  movdqa      xmm1,[esp+40h]
-  punpcklwd   xmm0,xmm4
-  movq        xmm4,[edx+ecx]
-  punpcklwd   xmm7,xmm3
-  movq        xmm3,[eax]
-  punpcklwd   xmm0,xmm6
-  movq        xmm6,[edi]
-  punpcklwd   xmm7,xmm5
-  punpcklwd   xmm0,xmm7
-  mov         edi,ecx
-  sub         edi,esi
-  movdqa      xmm2,xmm1
-  psubw       xmm2,xmm0
-  movdqa      [esp+60h],xmm2
-  movq        xmm2, [edi]
-  punpcklqdq  xmm6,xmm2
-  mov         esi,eax
-  sub         esi,edx
-  movq        xmm7,[esi]
-  mov         edi,ecx
-  sub         edi,edx
-  movq        xmm2,[edi]
-  punpcklqdq  xmm7,xmm2
-  movq        xmm2,[ecx]
-  punpcklqdq  xmm3,xmm2
-  movq        xmm2,[edx+eax]
-  movsx       edx,word [ebp + 14h]
-  punpcklqdq  xmm2,xmm4
-  movdqa      [esp+0E0h],xmm2
-  movd        xmm2,edx
-  movsx       edx,word [ebp + 18h]
-  movdqa      xmm4,xmm2
-  punpcklwd   xmm4,xmm2
-  movd        xmm2,edx
-  movdqa      xmm5,xmm2
-  punpcklwd   xmm5,xmm2
-  pshufd      xmm2,xmm5,0
-  movdqa      [esp+50h],xmm2
-  movdqa      xmm2,xmm6
-  punpcklbw   xmm2,xmm1
-  movdqa      [esp+0D0h],xmm3
-  pshufd      xmm4,xmm4,0
-  movdqa      [esp+30h],xmm2
-  punpckhbw   xmm6,xmm1
-  movdqa      [esp+80h],xmm6
-  movdqa      xmm6,[esp+0D0h]
-  punpckhbw   xmm6,xmm1
-  movdqa      [esp+70h],xmm6
-  movdqa      xmm6, [esp+0E0h]
-  punpckhbw   xmm6,xmm1
-  movdqa     [esp+90h],xmm6
-  movdqa      xmm5, [esp+0E0h]
-  movdqa      xmm2,xmm7
-  punpckhbw   xmm7,xmm1
-  punpcklbw   xmm5,xmm1
-  movdqa       [esp+0A0h],xmm7
-  punpcklbw   xmm3,xmm1
-  mov         edx,4
-  punpcklbw   xmm2,xmm1
-  movsx       edx,dx
-  movd        xmm6,edx
-  movdqa      xmm7,xmm6
-  punpcklwd   xmm7,xmm6
-  pshufd      xmm6,xmm7,0
-  movdqa      xmm7,[esp+30h]
-  movdqa      [esp+20h],xmm6
-  psubw       xmm7,xmm5
-  movdqa      xmm6,xmm0
-  pcmpgtw     xmm6,xmm1
-  movdqa      xmm1,[esp+60h]
-  movdqa      [esp+40h],xmm6
-  movdqa      xmm6,xmm3
-  psubw       xmm6,xmm2
-  psllw       xmm6,2
-  paddw       xmm6,xmm7
-  paddw       xmm6, [esp+20h]
-  movdqa      xmm7, [esp+50h]
-  psraw       xmm6,3
-  pmaxsw      xmm1,xmm6
-  movdqa      [esp+10h],xmm0
-  movdqa      xmm6, [esp+10h]
-  pminsw      xmm6,xmm1
-  movdqa      [esp+10h],xmm6
-  movdqa      xmm1,xmm2
-  psubw       xmm1,xmm3
-  pabsw       xmm1,xmm1
-  movdqa      xmm6,xmm4
-  pcmpgtw     xmm6,xmm1
-  movdqa      xmm1, [esp+30h]
-  psubw       xmm1,xmm2
-  pabsw       xmm1,xmm1
-  pcmpgtw     xmm7,xmm1
-  movdqa      xmm1,[esp+50h]
-  pand        xmm6,xmm7
-  movdqa      xmm7,[esp+50h]
-  psubw       xmm5,xmm3
-  pabsw       xmm5,xmm5
-  pcmpgtw     xmm1,xmm5
-  movdqa      xmm5,[esp+80h]
-  psubw       xmm5,[esp+90h]
-  pand        xmm6,xmm1
-  pand        xmm6,[esp+40h]
-  movdqa      xmm1,[esp+10h]
-  pand        xmm1,xmm6
-  movdqa      xmm6,[esp+70h]
-  movdqa      [esp+30h],xmm1
-  movdqa      xmm1,[esp+0A0h]
-  psubw       xmm6,xmm1
-  psllw       xmm6,2
-  paddw       xmm6,xmm5
-  paddw       xmm6,[esp+20h]
-  movdqa      xmm5,[esp+60h]
-  psraw       xmm6,3
-  pmaxsw      xmm5,xmm6
-  pminsw      xmm0,xmm5
-  movdqa      xmm5,[esp+70h]
-  movdqa      xmm6,xmm1
-  psubw       xmm6,xmm5
-  pabsw       xmm6,xmm6
-  pcmpgtw     xmm4,xmm6
-  movdqa      xmm6,[esp+80h]
-  psubw       xmm6,xmm1
-  pabsw       xmm6,xmm6
-  pcmpgtw     xmm7,xmm6
-  movdqa      xmm6,[esp+90h]
-  pand        xmm4,xmm7
-  movdqa      xmm7,[esp+50h]
-  psubw       xmm6,xmm5
-  pabsw       xmm6,xmm6
-  pcmpgtw     xmm7,xmm6
-  pand        xmm4,xmm7
-  pand        xmm4,[esp+40h]
-  pand        xmm0,xmm4
-  movdqa      xmm4,[esp+30h]
-  paddw       xmm2,xmm4
-  paddw       xmm1,xmm0
-  packuswb    xmm2,xmm1
-  movq        [esi],xmm2
-  psubw       xmm3,xmm4
-  psubw       xmm5,xmm0
-  packuswb    xmm3,xmm5
-  movq        [eax],xmm3
-  psrldq      xmm2,8
-  movq        [edi],xmm2
-  pop         edi
-  pop         esi
-  psrldq      xmm3,8
-  movq        [ecx],xmm3
-  pop         ebx
-  mov         esp,ebp
-  pop         ebp
-  ret
+    push        ebp
+    mov         ebp,esp
+    and         esp,0FFFFFFF0h
+    sub         esp,0E4h
+    push        ebx
+    push        esi
+    mov         esi, [ebp+1Ch]      ;  pTC
+    movsx       ebx, byte [esi+2]
+    push        edi
+    movsx       di,byte [esi+3]
+    mov         word [esp+0Ch],bx
+    movsx       bx,byte  [esi+1]
+    movsx       esi,byte  [esi]
+    mov         word  [esp+0Eh],si
+    movzx       esi,di
+    movd        xmm1,esi
+    movzx       esi,di
+    movd        xmm2,esi
+    mov         si,word  [esp+0Ch]
+    mov         edx, [ebp + 10h]
+    mov         eax, [ebp + 08h]
+    movzx       edi,si
+    movzx       esi,si
+    mov         ecx, [ebp + 0Ch]
+    movd        xmm4,esi
+    movzx       esi,bx
+    movd        xmm5,esi
+    movd        xmm3,edi
+    movzx       esi,bx
+    movd        xmm6,esi
+    mov         si,word [esp+0Eh]
+    movzx       edi,si
+    movzx       esi,si
+    punpcklwd   xmm6,xmm2
+    pxor        xmm0,xmm0
+    movdqa      [esp+40h],xmm0
+    movd        xmm7,edi
+    movd        xmm0,esi
+    lea         esi,[edx+edx]
+    mov         edi,eax
+    sub         edi,esi
+    punpcklwd   xmm5,xmm1
+    movdqa      xmm1,[esp+40h]
+    punpcklwd   xmm0,xmm4
+    movq        xmm4,[edx+ecx]
+    punpcklwd   xmm7,xmm3
+    movq        xmm3,[eax]
+    punpcklwd   xmm0,xmm6
+    movq        xmm6,[edi]
+    punpcklwd   xmm7,xmm5
+    punpcklwd   xmm0,xmm7
+    mov         edi,ecx
+    sub         edi,esi
+    movdqa      xmm2,xmm1
+    psubw       xmm2,xmm0
+    movdqa      [esp+60h],xmm2
+    movq        xmm2, [edi]
+    punpcklqdq  xmm6,xmm2
+    mov         esi,eax
+    sub         esi,edx
+    movq        xmm7,[esi]
+    mov         edi,ecx
+    sub         edi,edx
+    movq        xmm2,[edi]
+    punpcklqdq  xmm7,xmm2
+    movq        xmm2,[ecx]
+    punpcklqdq  xmm3,xmm2
+    movq        xmm2,[edx+eax]
+    movsx       edx,word [ebp + 14h]
+    punpcklqdq  xmm2,xmm4
+    movdqa      [esp+0E0h],xmm2
+    movd        xmm2,edx
+    movsx       edx,word [ebp + 18h]
+    movdqa      xmm4,xmm2
+    punpcklwd   xmm4,xmm2
+    movd        xmm2,edx
+    movdqa      xmm5,xmm2
+    punpcklwd   xmm5,xmm2
+    pshufd      xmm2,xmm5,0
+    movdqa      [esp+50h],xmm2
+    movdqa      xmm2,xmm6
+    punpcklbw   xmm2,xmm1
+    movdqa      [esp+0D0h],xmm3
+    pshufd      xmm4,xmm4,0
+    movdqa      [esp+30h],xmm2
+    punpckhbw   xmm6,xmm1
+    movdqa      [esp+80h],xmm6
+    movdqa      xmm6,[esp+0D0h]
+    punpckhbw   xmm6,xmm1
+    movdqa      [esp+70h],xmm6
+    movdqa      xmm6, [esp+0E0h]
+    punpckhbw   xmm6,xmm1
+    movdqa     [esp+90h],xmm6
+    movdqa      xmm5, [esp+0E0h]
+    movdqa      xmm2,xmm7
+    punpckhbw   xmm7,xmm1
+    punpcklbw   xmm5,xmm1
+    movdqa       [esp+0A0h],xmm7
+    punpcklbw   xmm3,xmm1
+    mov         edx,4
+    punpcklbw   xmm2,xmm1
+    movsx       edx,dx
+    movd        xmm6,edx
+    movdqa      xmm7,xmm6
+    punpcklwd   xmm7,xmm6
+    pshufd      xmm6,xmm7,0
+    movdqa      xmm7,[esp+30h]
+    movdqa      [esp+20h],xmm6
+    psubw       xmm7,xmm5
+    movdqa      xmm6,xmm0
+    pcmpgtw     xmm6,xmm1
+    movdqa      xmm1,[esp+60h]
+    movdqa      [esp+40h],xmm6
+    movdqa      xmm6,xmm3
+    psubw       xmm6,xmm2
+    psllw       xmm6,2
+    paddw       xmm6,xmm7
+    paddw       xmm6, [esp+20h]
+    movdqa      xmm7, [esp+50h]
+    psraw       xmm6,3
+    pmaxsw      xmm1,xmm6
+    movdqa      [esp+10h],xmm0
+    movdqa      xmm6, [esp+10h]
+    pminsw      xmm6,xmm1
+    movdqa      [esp+10h],xmm6
+    movdqa      xmm1,xmm2
+    psubw       xmm1,xmm3
+    pabsw       xmm1,xmm1
+    movdqa      xmm6,xmm4
+    pcmpgtw     xmm6,xmm1
+    movdqa      xmm1, [esp+30h]
+    psubw       xmm1,xmm2
+    pabsw       xmm1,xmm1
+    pcmpgtw     xmm7,xmm1
+    movdqa      xmm1,[esp+50h]
+    pand        xmm6,xmm7
+    movdqa      xmm7,[esp+50h]
+    psubw       xmm5,xmm3
+    pabsw       xmm5,xmm5
+    pcmpgtw     xmm1,xmm5
+    movdqa      xmm5,[esp+80h]
+    psubw       xmm5,[esp+90h]
+    pand        xmm6,xmm1
+    pand        xmm6,[esp+40h]
+    movdqa      xmm1,[esp+10h]
+    pand        xmm1,xmm6
+    movdqa      xmm6,[esp+70h]
+    movdqa      [esp+30h],xmm1
+    movdqa      xmm1,[esp+0A0h]
+    psubw       xmm6,xmm1
+    psllw       xmm6,2
+    paddw       xmm6,xmm5
+    paddw       xmm6,[esp+20h]
+    movdqa      xmm5,[esp+60h]
+    psraw       xmm6,3
+    pmaxsw      xmm5,xmm6
+    pminsw      xmm0,xmm5
+    movdqa      xmm5,[esp+70h]
+    movdqa      xmm6,xmm1
+    psubw       xmm6,xmm5
+    pabsw       xmm6,xmm6
+    pcmpgtw     xmm4,xmm6
+    movdqa      xmm6,[esp+80h]
+    psubw       xmm6,xmm1
+    pabsw       xmm6,xmm6
+    pcmpgtw     xmm7,xmm6
+    movdqa      xmm6,[esp+90h]
+    pand        xmm4,xmm7
+    movdqa      xmm7,[esp+50h]
+    psubw       xmm6,xmm5
+    pabsw       xmm6,xmm6
+    pcmpgtw     xmm7,xmm6
+    pand        xmm4,xmm7
+    pand        xmm4,[esp+40h]
+    pand        xmm0,xmm4
+    movdqa      xmm4,[esp+30h]
+    paddw       xmm2,xmm4
+    paddw       xmm1,xmm0
+    packuswb    xmm2,xmm1
+    movq        [esi],xmm2
+    psubw       xmm3,xmm4
+    psubw       xmm5,xmm0
+    packuswb    xmm3,xmm5
+    movq        [eax],xmm3
+    psrldq      xmm2,8
+    movq        [edi],xmm2
+    pop         edi
+    pop         esi
+    psrldq      xmm3,8
+    movq        [ecx],xmm3
+    pop         ebx
+    mov         esp,ebp
+    pop         ebp
+    ret
 
 ;***************************************************************************
 ;  void DeblockChromaEq4H_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
@@ -3601,280 +3601,280 @@
 ;***************************************************************************
 
 WELS_EXTERN     DeblockChromaEq4H_ssse3
-  push        ebp
-  mov         ebp,esp
-  and         esp,0FFFFFFF0h
-  sub         esp,0C8h
-  mov         ecx,dword [ebp+8]
-  mov         edx,dword [ebp+0Ch]
-  mov         eax,dword [ebp+10h]
-  sub         ecx,2
-  sub         edx,2
-  push        esi
-  lea         esi,[eax+eax*2]
-  mov         dword [esp+18h],ecx
-  mov         dword [esp+4],edx
-  lea         ecx,[ecx+eax*4]
-  lea         edx,[edx+eax*4]
-  lea         eax,[esp+7Ch]
-  push        edi
-  mov         dword [esp+14h],esi
-  mov         dword [esp+18h],ecx
-  mov         dword [esp+0Ch],edx
-  mov         dword [esp+10h],eax
-  mov         esi,dword [esp+1Ch]
-  mov         ecx,dword [ebp+10h]
-  mov         edx,dword [esp+14h]
-  movd        xmm0,dword [esi]
-  movd        xmm1,dword [esi+ecx]
-  movd        xmm2,dword [esi+ecx*2]
-  movd        xmm3,dword [esi+edx]
-  mov         esi,dword  [esp+8]
-  movd        xmm4,dword [esi]
-  movd        xmm5,dword [esi+ecx]
-  movd        xmm6,dword [esi+ecx*2]
-  movd        xmm7,dword [esi+edx]
-  punpckldq   xmm0,xmm4
-  punpckldq   xmm1,xmm5
-  punpckldq   xmm2,xmm6
-  punpckldq   xmm3,xmm7
-  mov         esi,dword [esp+18h]
-  mov         edi,dword [esp+0Ch]
-  movd        xmm4,dword [esi]
-  movd        xmm5,dword [edi]
-  punpckldq   xmm4,xmm5
-  punpcklqdq  xmm0,xmm4
-  movd        xmm4,dword [esi+ecx]
-  movd        xmm5,dword [edi+ecx]
-  punpckldq   xmm4,xmm5
-  punpcklqdq  xmm1,xmm4
-  movd        xmm4,dword [esi+ecx*2]
-  movd        xmm5,dword [edi+ecx*2]
-  punpckldq   xmm4,xmm5
-  punpcklqdq  xmm2,xmm4
-  movd        xmm4,dword [esi+edx]
-  movd        xmm5,dword [edi+edx]
-  punpckldq   xmm4,xmm5
-  punpcklqdq  xmm3,xmm4
-  movdqa      xmm6,xmm0
-  punpcklbw   xmm0,xmm1
-  punpckhbw   xmm6,xmm1
-  movdqa      xmm7,xmm2
-  punpcklbw   xmm2,xmm3
-  punpckhbw   xmm7,xmm3
-  movdqa      xmm4,xmm0
-  movdqa      xmm5,xmm6
-  punpcklwd   xmm0,xmm2
-  punpckhwd   xmm4,xmm2
-  punpcklwd   xmm6,xmm7
-  punpckhwd   xmm5,xmm7
-  movdqa      xmm1,xmm0
-  movdqa      xmm2,xmm4
-  punpckldq   xmm0,xmm6
-  punpckhdq   xmm1,xmm6
-  punpckldq   xmm4,xmm5
-  punpckhdq   xmm2,xmm5
-  movdqa      xmm5,xmm0
-  movdqa      xmm6,xmm1
-  punpcklqdq  xmm0,xmm4
-  punpckhqdq  xmm5,xmm4
-  punpcklqdq  xmm1,xmm2
-  punpckhqdq  xmm6,xmm2
-  mov         edi,dword [esp+10h]
-  movdqa      [edi],xmm0
-  movdqa      [edi+10h],xmm5
-  movdqa      [edi+20h],xmm1
-  movdqa      [edi+30h],xmm6
-  movsx       ecx,word [ebp+14h]
-  movsx       edx,word [ebp+18h]
-  movdqa      xmm6,[esp+80h]
-  movdqa      xmm4,[esp+90h]
-  movdqa      xmm5,[esp+0A0h]
-  movdqa      xmm7,[esp+0B0h]
-  pxor        xmm0,xmm0
-  movd        xmm1,ecx
-  movdqa      xmm2,xmm1
-  punpcklwd   xmm2,xmm1
-  pshufd      xmm1,xmm2,0
-  movd        xmm2,edx
-  movdqa      xmm3,xmm2
-  punpcklwd   xmm3,xmm2
-  pshufd      xmm2,xmm3,0
-  movdqa      xmm3,xmm6
-  punpckhbw   xmm6,xmm0
-  movdqa      [esp+60h],xmm6
-  movdqa      xmm6,[esp+90h]
-  punpckhbw   xmm6,xmm0
-  movdqa      [esp+30h],xmm6
-  movdqa      xmm6,[esp+0A0h]
-  punpckhbw   xmm6,xmm0
-  movdqa      [esp+40h],xmm6
-  movdqa      xmm6,[esp+0B0h]
-  punpckhbw   xmm6,xmm0
-  movdqa      [esp+70h],xmm6
-  punpcklbw   xmm7,xmm0
-  punpcklbw   xmm4,xmm0
-  punpcklbw   xmm5,xmm0
-  punpcklbw   xmm3,xmm0
-  movdqa      [esp+50h],xmm7
-  movdqa      xmm6,xmm4
-  psubw       xmm6,xmm5
-  pabsw       xmm6,xmm6
-  movdqa      xmm0,xmm1
-  pcmpgtw     xmm0,xmm6
-  movdqa      xmm6,xmm3
-  psubw       xmm6,xmm4
-  pabsw       xmm6,xmm6
-  movdqa      xmm7,xmm2
-  pcmpgtw     xmm7,xmm6
-  movdqa      xmm6,[esp+50h]
-  psubw       xmm6,xmm5
-  pabsw       xmm6,xmm6
-  pand        xmm0,xmm7
-  movdqa      xmm7,xmm2
-  pcmpgtw     xmm7,xmm6
-  movdqa      xmm6,[esp+30h]
-  psubw       xmm6,[esp+40h]
-  pabsw       xmm6,xmm6
-  pcmpgtw     xmm1,xmm6
-  movdqa      xmm6,[esp+60h]
-  psubw       xmm6,[esp+30h]
-  pabsw       xmm6,xmm6
-  pand        xmm0,xmm7
-  movdqa      xmm7,xmm2
-  pcmpgtw     xmm7,xmm6
-  movdqa      xmm6,[esp+70h]
-  psubw       xmm6,[esp+40h]
-  pabsw       xmm6,xmm6
-  pand        xmm1,xmm7
-  pcmpgtw     xmm2,xmm6
-  pand        xmm1,xmm2
-  mov         eax,2
-  movsx       ecx,ax
-  movd        xmm2,ecx
-  movdqa      xmm6,xmm2
-  punpcklwd   xmm6,xmm2
-  pshufd      xmm2,xmm6,0
-  movdqa      [esp+20h],xmm2
-  movdqa      xmm2,xmm3
-  paddw       xmm2,xmm3
-  paddw       xmm2,xmm4
-  paddw       xmm2,[esp+50h]
-  paddw       xmm2,[esp+20h]
-  psraw       xmm2,2
-  movdqa      xmm6,xmm0
-  pand        xmm6,xmm2
-  movdqa      xmm2,xmm0
-  pandn       xmm2,xmm4
-  por         xmm6,xmm2
-  movdqa      xmm2,[esp+60h]
-  movdqa      xmm7,xmm2
-  paddw       xmm7,xmm2
-  paddw       xmm7,[esp+30h]
-  paddw       xmm7,[esp+70h]
-  paddw       xmm7,[esp+20h]
-  movdqa      xmm4,xmm1
-  movdqa      xmm2,xmm1
-  pandn       xmm2,[esp+30h]
-  psraw       xmm7,2
-  pand        xmm4,xmm7
-  por         xmm4,xmm2
-  movdqa      xmm2,[esp+50h]
-  packuswb    xmm6,xmm4
-  movdqa      [esp+90h],xmm6
-  movdqa      xmm6,xmm2
-  paddw       xmm6,xmm2
-  movdqa      xmm2,[esp+20h]
-  paddw       xmm6,xmm5
-  paddw       xmm6,xmm3
-  movdqa      xmm4,xmm0
-  pandn       xmm0,xmm5
-  paddw       xmm6,xmm2
-  psraw       xmm6,2
-  pand        xmm4,xmm6
-  por         xmm4,xmm0
-  movdqa      xmm0,[esp+70h]
-  movdqa      xmm5,xmm0
-  paddw       xmm5,xmm0
-  movdqa      xmm0,[esp+40h]
-  paddw       xmm5,xmm0
-  paddw       xmm5,[esp+60h]
-  movdqa      xmm3,xmm1
-  paddw       xmm5,xmm2
-  psraw       xmm5,2
-  pand        xmm3,xmm5
-  pandn       xmm1,xmm0
-  por         xmm3,xmm1
-  packuswb    xmm4,xmm3
-  movdqa      [esp+0A0h],xmm4
-  mov         esi,dword [esp+10h]
-  movdqa      xmm0,[esi]
-  movdqa      xmm1,[esi+10h]
-  movdqa      xmm2,[esi+20h]
-  movdqa      xmm3,[esi+30h]
-  movdqa      xmm6,xmm0
-  punpcklbw   xmm0,xmm1
-  punpckhbw   xmm6,xmm1
-  movdqa      xmm7,xmm2
-  punpcklbw   xmm2,xmm3
-  punpckhbw   xmm7,xmm3
-  movdqa      xmm4,xmm0
-  movdqa      xmm5,xmm6
-  punpcklwd   xmm0,xmm2
-  punpckhwd   xmm4,xmm2
-  punpcklwd   xmm6,xmm7
-  punpckhwd   xmm5,xmm7
-  movdqa      xmm1,xmm0
-  movdqa      xmm2,xmm4
-  punpckldq   xmm0,xmm6
-  punpckhdq   xmm1,xmm6
-  punpckldq   xmm4,xmm5
-  punpckhdq   xmm2,xmm5
-  movdqa      xmm5,xmm0
-  movdqa      xmm6,xmm1
-  punpcklqdq  xmm0,xmm4
-  punpckhqdq  xmm5,xmm4
-  punpcklqdq  xmm1,xmm2
-  punpckhqdq  xmm6,xmm2
-  mov         esi,dword [esp+1Ch]
-  mov         ecx,dword [ebp+10h]
-  mov         edx,dword [esp+14h]
-  mov         edi,dword [esp+8]
-  movd        dword [esi],xmm0
-  movd        dword [esi+ecx],xmm5
-  movd        dword [esi+ecx*2],xmm1
-  movd        dword [esi+edx],xmm6
-  psrldq      xmm0,4
-  psrldq      xmm5,4
-  psrldq      xmm1,4
-  psrldq      xmm6,4
-  mov         esi,dword [esp+18h]
-  movd        dword [edi],xmm0
-  movd        dword [edi+ecx],xmm5
-  movd        dword [edi+ecx*2],xmm1
-  movd        dword [edi+edx],xmm6
-  psrldq      xmm0,4
-  psrldq      xmm5,4
-  psrldq      xmm1,4
-  psrldq      xmm6,4
-  movd        dword [esi],xmm0
-  movd        dword [esi+ecx],xmm5
-  movd        dword [esi+ecx*2],xmm1
-  movd        dword [esi+edx],xmm6
-  psrldq      xmm0,4
-  psrldq      xmm5,4
-  psrldq      xmm1,4
-  psrldq      xmm6,4
-  mov         edi,dword [esp+0Ch]
-  movd        dword [edi],xmm0
-  movd        dword [edi+ecx],xmm5
-  movd        dword [edi+ecx*2],xmm1
-  movd        dword [edi+edx],xmm6
-  pop         edi
-  pop         esi
-  mov         esp,ebp
-  pop         ebp
-  ret
+    push        ebp
+    mov         ebp,esp
+    and         esp,0FFFFFFF0h
+    sub         esp,0C8h
+    mov         ecx,dword [ebp+8]
+    mov         edx,dword [ebp+0Ch]
+    mov         eax,dword [ebp+10h]
+    sub         ecx,2
+    sub         edx,2
+    push        esi
+    lea         esi,[eax+eax*2]
+    mov         dword [esp+18h],ecx
+    mov         dword [esp+4],edx
+    lea         ecx,[ecx+eax*4]
+    lea         edx,[edx+eax*4]
+    lea         eax,[esp+7Ch]
+    push        edi
+    mov         dword [esp+14h],esi
+    mov         dword [esp+18h],ecx
+    mov         dword [esp+0Ch],edx
+    mov         dword [esp+10h],eax
+    mov         esi,dword [esp+1Ch]
+    mov         ecx,dword [ebp+10h]
+    mov         edx,dword [esp+14h]
+    movd        xmm0,dword [esi]
+    movd        xmm1,dword [esi+ecx]
+    movd        xmm2,dword [esi+ecx*2]
+    movd        xmm3,dword [esi+edx]
+    mov         esi,dword  [esp+8]
+    movd        xmm4,dword [esi]
+    movd        xmm5,dword [esi+ecx]
+    movd        xmm6,dword [esi+ecx*2]
+    movd        xmm7,dword [esi+edx]
+    punpckldq   xmm0,xmm4
+    punpckldq   xmm1,xmm5
+    punpckldq   xmm2,xmm6
+    punpckldq   xmm3,xmm7
+    mov         esi,dword [esp+18h]
+    mov         edi,dword [esp+0Ch]
+    movd        xmm4,dword [esi]
+    movd        xmm5,dword [edi]
+    punpckldq   xmm4,xmm5
+    punpcklqdq  xmm0,xmm4
+    movd        xmm4,dword [esi+ecx]
+    movd        xmm5,dword [edi+ecx]
+    punpckldq   xmm4,xmm5
+    punpcklqdq  xmm1,xmm4
+    movd        xmm4,dword [esi+ecx*2]
+    movd        xmm5,dword [edi+ecx*2]
+    punpckldq   xmm4,xmm5
+    punpcklqdq  xmm2,xmm4
+    movd        xmm4,dword [esi+edx]
+    movd        xmm5,dword [edi+edx]
+    punpckldq   xmm4,xmm5
+    punpcklqdq  xmm3,xmm4
+    movdqa      xmm6,xmm0
+    punpcklbw   xmm0,xmm1
+    punpckhbw   xmm6,xmm1
+    movdqa      xmm7,xmm2
+    punpcklbw   xmm2,xmm3
+    punpckhbw   xmm7,xmm3
+    movdqa      xmm4,xmm0
+    movdqa      xmm5,xmm6
+    punpcklwd   xmm0,xmm2
+    punpckhwd   xmm4,xmm2
+    punpcklwd   xmm6,xmm7
+    punpckhwd   xmm5,xmm7
+    movdqa      xmm1,xmm0
+    movdqa      xmm2,xmm4
+    punpckldq   xmm0,xmm6
+    punpckhdq   xmm1,xmm6
+    punpckldq   xmm4,xmm5
+    punpckhdq   xmm2,xmm5
+    movdqa      xmm5,xmm0
+    movdqa      xmm6,xmm1
+    punpcklqdq  xmm0,xmm4
+    punpckhqdq  xmm5,xmm4
+    punpcklqdq  xmm1,xmm2
+    punpckhqdq  xmm6,xmm2
+    mov         edi,dword [esp+10h]
+    movdqa      [edi],xmm0
+    movdqa      [edi+10h],xmm5
+    movdqa      [edi+20h],xmm1
+    movdqa      [edi+30h],xmm6
+    movsx       ecx,word [ebp+14h]
+    movsx       edx,word [ebp+18h]
+    movdqa      xmm6,[esp+80h]
+    movdqa      xmm4,[esp+90h]
+    movdqa      xmm5,[esp+0A0h]
+    movdqa      xmm7,[esp+0B0h]
+    pxor        xmm0,xmm0
+    movd        xmm1,ecx
+    movdqa      xmm2,xmm1
+    punpcklwd   xmm2,xmm1
+    pshufd      xmm1,xmm2,0
+    movd        xmm2,edx
+    movdqa      xmm3,xmm2
+    punpcklwd   xmm3,xmm2
+    pshufd      xmm2,xmm3,0
+    movdqa      xmm3,xmm6
+    punpckhbw   xmm6,xmm0
+    movdqa      [esp+60h],xmm6
+    movdqa      xmm6,[esp+90h]
+    punpckhbw   xmm6,xmm0
+    movdqa      [esp+30h],xmm6
+    movdqa      xmm6,[esp+0A0h]
+    punpckhbw   xmm6,xmm0
+    movdqa      [esp+40h],xmm6
+    movdqa      xmm6,[esp+0B0h]
+    punpckhbw   xmm6,xmm0
+    movdqa      [esp+70h],xmm6
+    punpcklbw   xmm7,xmm0
+    punpcklbw   xmm4,xmm0
+    punpcklbw   xmm5,xmm0
+    punpcklbw   xmm3,xmm0
+    movdqa      [esp+50h],xmm7
+    movdqa      xmm6,xmm4
+    psubw       xmm6,xmm5
+    pabsw       xmm6,xmm6
+    movdqa      xmm0,xmm1
+    pcmpgtw     xmm0,xmm6
+    movdqa      xmm6,xmm3
+    psubw       xmm6,xmm4
+    pabsw       xmm6,xmm6
+    movdqa      xmm7,xmm2
+    pcmpgtw     xmm7,xmm6
+    movdqa      xmm6,[esp+50h]
+    psubw       xmm6,xmm5
+    pabsw       xmm6,xmm6
+    pand        xmm0,xmm7
+    movdqa      xmm7,xmm2
+    pcmpgtw     xmm7,xmm6
+    movdqa      xmm6,[esp+30h]
+    psubw       xmm6,[esp+40h]
+    pabsw       xmm6,xmm6
+    pcmpgtw     xmm1,xmm6
+    movdqa      xmm6,[esp+60h]
+    psubw       xmm6,[esp+30h]
+    pabsw       xmm6,xmm6
+    pand        xmm0,xmm7
+    movdqa      xmm7,xmm2
+    pcmpgtw     xmm7,xmm6
+    movdqa      xmm6,[esp+70h]
+    psubw       xmm6,[esp+40h]
+    pabsw       xmm6,xmm6
+    pand        xmm1,xmm7
+    pcmpgtw     xmm2,xmm6
+    pand        xmm1,xmm2
+    mov         eax,2
+    movsx       ecx,ax
+    movd        xmm2,ecx
+    movdqa      xmm6,xmm2
+    punpcklwd   xmm6,xmm2
+    pshufd      xmm2,xmm6,0
+    movdqa      [esp+20h],xmm2
+    movdqa      xmm2,xmm3
+    paddw       xmm2,xmm3
+    paddw       xmm2,xmm4
+    paddw       xmm2,[esp+50h]
+    paddw       xmm2,[esp+20h]
+    psraw       xmm2,2
+    movdqa      xmm6,xmm0
+    pand        xmm6,xmm2
+    movdqa      xmm2,xmm0
+    pandn       xmm2,xmm4
+    por         xmm6,xmm2
+    movdqa      xmm2,[esp+60h]
+    movdqa      xmm7,xmm2
+    paddw       xmm7,xmm2
+    paddw       xmm7,[esp+30h]
+    paddw       xmm7,[esp+70h]
+    paddw       xmm7,[esp+20h]
+    movdqa      xmm4,xmm1
+    movdqa      xmm2,xmm1
+    pandn       xmm2,[esp+30h]
+    psraw       xmm7,2
+    pand        xmm4,xmm7
+    por         xmm4,xmm2
+    movdqa      xmm2,[esp+50h]
+    packuswb    xmm6,xmm4
+    movdqa      [esp+90h],xmm6
+    movdqa      xmm6,xmm2
+    paddw       xmm6,xmm2
+    movdqa      xmm2,[esp+20h]
+    paddw       xmm6,xmm5
+    paddw       xmm6,xmm3
+    movdqa      xmm4,xmm0
+    pandn       xmm0,xmm5
+    paddw       xmm6,xmm2
+    psraw       xmm6,2
+    pand        xmm4,xmm6
+    por         xmm4,xmm0
+    movdqa      xmm0,[esp+70h]
+    movdqa      xmm5,xmm0
+    paddw       xmm5,xmm0
+    movdqa      xmm0,[esp+40h]
+    paddw       xmm5,xmm0
+    paddw       xmm5,[esp+60h]
+    movdqa      xmm3,xmm1
+    paddw       xmm5,xmm2
+    psraw       xmm5,2
+    pand        xmm3,xmm5
+    pandn       xmm1,xmm0
+    por         xmm3,xmm1
+    packuswb    xmm4,xmm3
+    movdqa      [esp+0A0h],xmm4
+    mov         esi,dword [esp+10h]
+    movdqa      xmm0,[esi]
+    movdqa      xmm1,[esi+10h]
+    movdqa      xmm2,[esi+20h]
+    movdqa      xmm3,[esi+30h]
+    movdqa      xmm6,xmm0
+    punpcklbw   xmm0,xmm1
+    punpckhbw   xmm6,xmm1
+    movdqa      xmm7,xmm2
+    punpcklbw   xmm2,xmm3
+    punpckhbw   xmm7,xmm3
+    movdqa      xmm4,xmm0
+    movdqa      xmm5,xmm6
+    punpcklwd   xmm0,xmm2
+    punpckhwd   xmm4,xmm2
+    punpcklwd   xmm6,xmm7
+    punpckhwd   xmm5,xmm7
+    movdqa      xmm1,xmm0
+    movdqa      xmm2,xmm4
+    punpckldq   xmm0,xmm6
+    punpckhdq   xmm1,xmm6
+    punpckldq   xmm4,xmm5
+    punpckhdq   xmm2,xmm5
+    movdqa      xmm5,xmm0
+    movdqa      xmm6,xmm1
+    punpcklqdq  xmm0,xmm4
+    punpckhqdq  xmm5,xmm4
+    punpcklqdq  xmm1,xmm2
+    punpckhqdq  xmm6,xmm2
+    mov         esi,dword [esp+1Ch]
+    mov         ecx,dword [ebp+10h]
+    mov         edx,dword [esp+14h]
+    mov         edi,dword [esp+8]
+    movd        dword [esi],xmm0
+    movd        dword [esi+ecx],xmm5
+    movd        dword [esi+ecx*2],xmm1
+    movd        dword [esi+edx],xmm6
+    psrldq      xmm0,4
+    psrldq      xmm5,4
+    psrldq      xmm1,4
+    psrldq      xmm6,4
+    mov         esi,dword [esp+18h]
+    movd        dword [edi],xmm0
+    movd        dword [edi+ecx],xmm5
+    movd        dword [edi+ecx*2],xmm1
+    movd        dword [edi+edx],xmm6
+    psrldq      xmm0,4
+    psrldq      xmm5,4
+    psrldq      xmm1,4
+    psrldq      xmm6,4
+    movd        dword [esi],xmm0
+    movd        dword [esi+ecx],xmm5
+    movd        dword [esi+ecx*2],xmm1
+    movd        dword [esi+edx],xmm6
+    psrldq      xmm0,4
+    psrldq      xmm5,4
+    psrldq      xmm1,4
+    psrldq      xmm6,4
+    mov         edi,dword [esp+0Ch]
+    movd        dword [edi],xmm0
+    movd        dword [edi+ecx],xmm5
+    movd        dword [edi+ecx*2],xmm1
+    movd        dword [edi+edx],xmm6
+    pop         edi
+    pop         esi
+    mov         esp,ebp
+    pop         ebp
+    ret
 
 ;*******************************************************************************
 ;    void DeblockChromaLt4H_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
@@ -3882,308 +3882,308 @@
 ;*******************************************************************************
 
 WELS_EXTERN  DeblockChromaLt4H_ssse3
-  push        ebp
-  mov         ebp,esp
-  and         esp,0FFFFFFF0h
-  sub         esp,108h
-  mov         ecx,dword [ebp+8]
-  mov         edx,dword [ebp+0Ch]
-  mov         eax,dword [ebp+10h]
-  sub         ecx,2
-  sub         edx,2
-  push        esi
-  lea         esi,[eax+eax*2]
-  mov         dword [esp+10h],ecx
-  mov         dword [esp+4],edx
-  lea         ecx,[ecx+eax*4]
-  lea         edx,[edx+eax*4]
-  lea         eax,[esp+6Ch]
-  push        edi
-  mov         dword [esp+0Ch],esi
-  mov         dword [esp+18h],ecx
-  mov         dword [esp+10h],edx
-  mov         dword [esp+1Ch],eax
-  mov         esi,dword [esp+14h]
-  mov         ecx,dword [ebp+10h]
-  mov         edx,dword [esp+0Ch]
-  movd        xmm0,dword [esi]
-  movd        xmm1,dword [esi+ecx]
-  movd        xmm2,dword [esi+ecx*2]
-  movd        xmm3,dword [esi+edx]
-  mov         esi,dword [esp+8]
-  movd        xmm4,dword [esi]
-  movd        xmm5,dword [esi+ecx]
-  movd        xmm6,dword [esi+ecx*2]
-  movd        xmm7,dword [esi+edx]
-  punpckldq   xmm0,xmm4
-  punpckldq   xmm1,xmm5
-  punpckldq   xmm2,xmm6
-  punpckldq   xmm3,xmm7
-  mov         esi,dword [esp+18h]
-  mov         edi,dword [esp+10h]
-  movd        xmm4,dword [esi]
-  movd        xmm5,dword [edi]
-  punpckldq   xmm4,xmm5
-  punpcklqdq  xmm0,xmm4
-  movd        xmm4,dword [esi+ecx]
-  movd        xmm5,dword [edi+ecx]
-  punpckldq   xmm4,xmm5
-  punpcklqdq  xmm1,xmm4
-  movd        xmm4,dword [esi+ecx*2]
-  movd        xmm5,dword [edi+ecx*2]
-  punpckldq   xmm4,xmm5
-  punpcklqdq  xmm2,xmm4
-  movd        xmm4,dword [esi+edx]
-  movd        xmm5,dword [edi+edx]
-  punpckldq   xmm4,xmm5
-  punpcklqdq  xmm3,xmm4
-  movdqa      xmm6,xmm0
-  punpcklbw   xmm0,xmm1
-  punpckhbw   xmm6,xmm1
-  movdqa      xmm7,xmm2
-  punpcklbw   xmm2,xmm3
-  punpckhbw   xmm7,xmm3
-  movdqa      xmm4,xmm0
-  movdqa      xmm5,xmm6
-  punpcklwd   xmm0,xmm2
-  punpckhwd   xmm4,xmm2
-  punpcklwd   xmm6,xmm7
-  punpckhwd   xmm5,xmm7
-  movdqa      xmm1,xmm0
-  movdqa      xmm2,xmm4
-  punpckldq   xmm0,xmm6
-  punpckhdq   xmm1,xmm6
-  punpckldq   xmm4,xmm5
-  punpckhdq   xmm2,xmm5
-  movdqa      xmm5,xmm0
-  movdqa      xmm6,xmm1
-  punpcklqdq  xmm0,xmm4
-  punpckhqdq  xmm5,xmm4
-  punpcklqdq  xmm1,xmm2
-  punpckhqdq  xmm6,xmm2
-  mov         edi,dword [esp+1Ch]
-  movdqa      [edi],xmm0
-  movdqa      [edi+10h],xmm5
-  movdqa      [edi+20h],xmm1
-  movdqa      [edi+30h],xmm6
-  mov         eax,dword [ebp+1Ch]
-  movsx       cx,byte [eax+3]
-  movsx       dx,byte [eax+2]
-  movsx       si,byte [eax+1]
-  movsx       ax,byte [eax]
-  movzx       edi,cx
-  movzx       ecx,cx
-  movd        xmm2,ecx
-  movzx       ecx,dx
-  movzx       edx,dx
-  movd        xmm3,ecx
-  movd        xmm4,edx
-  movzx       ecx,si
-  movzx       edx,si
-  movd        xmm5,ecx
-  pxor        xmm0,xmm0
-  movd        xmm6,edx
-  movzx       ecx,ax
-  movdqa      [esp+60h],xmm0
-  movzx       edx,ax
-  movsx       eax,word [ebp+14h]
-  punpcklwd   xmm6,xmm2
-  movd        xmm1,edi
-  movd        xmm7,ecx
-  movsx       ecx,word [ebp+18h]
-  movd        xmm0,edx
-  punpcklwd   xmm7,xmm3
-  punpcklwd   xmm5,xmm1
-  movdqa      xmm1,[esp+60h]
-  punpcklwd   xmm7,xmm5
-  movdqa      xmm5,[esp+0A0h]
-  punpcklwd   xmm0,xmm4
-  punpcklwd   xmm0,xmm6
-  movdqa      xmm6, [esp+70h]
-  punpcklwd   xmm0,xmm7
-  movdqa      xmm7,[esp+80h]
-  movdqa      xmm2,xmm1
-  psubw       xmm2,xmm0
-  movdqa      [esp+0D0h],xmm2
-  movd        xmm2,eax
-  movdqa      xmm3,xmm2
-  punpcklwd   xmm3,xmm2
-  pshufd      xmm4,xmm3,0
-  movd        xmm2,ecx
-  movdqa      xmm3,xmm2
-  punpcklwd   xmm3,xmm2
-  pshufd      xmm2,xmm3,0
-  movdqa      xmm3, [esp+90h]
-  movdqa      [esp+50h],xmm2
-  movdqa      xmm2,xmm6
-  punpcklbw   xmm2,xmm1
-  punpckhbw   xmm6,xmm1
-  movdqa      [esp+40h],xmm2
-  movdqa      [esp+0B0h],xmm6
-  movdqa      xmm6,[esp+90h]
-  movdqa      xmm2,xmm7
-  punpckhbw   xmm7,xmm1
-  punpckhbw   xmm6,xmm1
-  punpcklbw   xmm2,xmm1
-  punpcklbw   xmm3,xmm1
-  punpcklbw   xmm5,xmm1
-  movdqa      [esp+0F0h],xmm7
-  movdqa      [esp+0C0h],xmm6
-  movdqa      xmm6, [esp+0A0h]
-  punpckhbw   xmm6,xmm1
-  movdqa      [esp+0E0h],xmm6
-  mov         edx,4
-  movsx       eax,dx
-  movd        xmm6,eax
-  movdqa      xmm7,xmm6
-  punpcklwd   xmm7,xmm6
-  pshufd      xmm6,xmm7,0
-  movdqa      [esp+30h],xmm6
-  movdqa      xmm7, [esp+40h]
-  psubw       xmm7,xmm5
-  movdqa      xmm6,xmm0
-  pcmpgtw     xmm6,xmm1
-  movdqa      [esp+60h],xmm6
-  movdqa      xmm1, [esp+0D0h]
-  movdqa      xmm6,xmm3
-  psubw       xmm6,xmm2
-  psllw       xmm6,2
-  paddw       xmm6,xmm7
-  paddw       xmm6,[esp+30h]
-  psraw       xmm6,3
-  pmaxsw      xmm1,xmm6
-  movdqa      xmm7,[esp+50h]
-  movdqa      [esp+20h],xmm0
-  movdqa      xmm6, [esp+20h]
-  pminsw      xmm6,xmm1
-  movdqa      [esp+20h],xmm6
-  movdqa      xmm6,xmm4
-  movdqa      xmm1,xmm2
-  psubw       xmm1,xmm3
-  pabsw       xmm1,xmm1
-  pcmpgtw     xmm6,xmm1
-  movdqa      xmm1, [esp+40h]
-  psubw       xmm1,xmm2
-  pabsw       xmm1,xmm1
-  pcmpgtw     xmm7,xmm1
-  movdqa      xmm1, [esp+50h]
-  pand        xmm6,xmm7
-  movdqa      xmm7, [esp+50h]
-  psubw       xmm5,xmm3
-  pabsw       xmm5,xmm5
-  pcmpgtw     xmm1,xmm5
-  movdqa      xmm5, [esp+0B0h]
-  psubw       xmm5,[esp+0E0h]
-  pand        xmm6,xmm1
-  pand        xmm6, [esp+60h]
-  movdqa      xmm1, [esp+20h]
-  pand        xmm1,xmm6
-  movdqa      xmm6, [esp+0C0h]
-  movdqa      [esp+40h],xmm1
-  movdqa      xmm1, [esp+0F0h]
-  psubw       xmm6,xmm1
-  psllw       xmm6,2
-  paddw       xmm6,xmm5
-  paddw       xmm6, [esp+30h]
-  movdqa      xmm5, [esp+0D0h]
-  psraw       xmm6,3
-  pmaxsw      xmm5,xmm6
-  pminsw      xmm0,xmm5
-  movdqa      xmm5,[esp+0C0h]
-  movdqa      xmm6,xmm1
-  psubw       xmm6,xmm5
-  pabsw       xmm6,xmm6
-  pcmpgtw     xmm4,xmm6
-  movdqa      xmm6,[esp+0B0h]
-  psubw       xmm6,xmm1
-  pabsw       xmm6,xmm6
-  pcmpgtw     xmm7,xmm6
-  movdqa      xmm6, [esp+0E0h]
-  pand        xmm4,xmm7
-  movdqa      xmm7, [esp+50h]
-  psubw       xmm6,xmm5
-  pabsw       xmm6,xmm6
-  pcmpgtw     xmm7,xmm6
-  pand        xmm4,xmm7
-  pand        xmm4,[esp+60h]
-  pand        xmm0,xmm4
-  movdqa      xmm4, [esp+40h]
-  paddw       xmm2,xmm4
-  paddw       xmm1,xmm0
-  psubw       xmm3,xmm4
-  psubw       xmm5,xmm0
-  packuswb    xmm2,xmm1
-  packuswb    xmm3,xmm5
-  movdqa      [esp+80h],xmm2
-  movdqa      [esp+90h],xmm3
-  mov         esi,dword [esp+1Ch]
-  movdqa      xmm0, [esi]
-  movdqa      xmm1, [esi+10h]
-  movdqa      xmm2, [esi+20h]
-  movdqa      xmm3, [esi+30h]
-  movdqa      xmm6,xmm0
-  punpcklbw   xmm0,xmm1
-  punpckhbw   xmm6,xmm1
-  movdqa      xmm7,xmm2
-  punpcklbw   xmm2,xmm3
-  punpckhbw   xmm7,xmm3
-  movdqa      xmm4,xmm0
-  movdqa      xmm5,xmm6
-  punpcklwd   xmm0,xmm2
-  punpckhwd   xmm4,xmm2
-  punpcklwd   xmm6,xmm7
-  punpckhwd   xmm5,xmm7
-  movdqa      xmm1,xmm0
-  movdqa      xmm2,xmm4
-  punpckldq   xmm0,xmm6
-  punpckhdq   xmm1,xmm6
-  punpckldq   xmm4,xmm5
-  punpckhdq   xmm2,xmm5
-  movdqa      xmm5,xmm0
-  movdqa      xmm6,xmm1
-  punpcklqdq  xmm0,xmm4
-  punpckhqdq  xmm5,xmm4
-  punpcklqdq  xmm1,xmm2
-  punpckhqdq  xmm6,xmm2
-  mov         esi,dword [esp+14h]
-  mov         ecx,dword [ebp+10h]
-  mov         edx,dword [esp+0Ch]
-  mov         edi,dword [esp+8]
-  movd        dword [esi],xmm0
-  movd        dword [esi+ecx],xmm5
-  movd        dword [esi+ecx*2],xmm1
-  movd        dword [esi+edx],xmm6
-  psrldq      xmm0,4
-  psrldq      xmm5,4
-  psrldq      xmm1,4
-  psrldq      xmm6,4
-  mov         esi,dword [esp+18h]
-  movd        dword [edi],xmm0
-  movd        dword [edi+ecx],xmm5
-  movd        dword [edi+ecx*2],xmm1
-  movd        dword [edi+edx],xmm6
-  psrldq      xmm0,4
-  psrldq      xmm5,4
-  psrldq      xmm1,4
-  psrldq      xmm6,4
-  movd        dword [esi],xmm0
-  movd        dword [esi+ecx],xmm5
-  movd        dword [esi+ecx*2],xmm1
-  movd        dword [esi+edx],xmm6
-  psrldq      xmm0,4
-  psrldq      xmm5,4
-  psrldq      xmm1,4
-  psrldq      xmm6,4
-  mov         edi,dword [esp+10h]
-  movd        dword [edi],xmm0
-  movd        dword [edi+ecx],xmm5
-  movd        dword [edi+ecx*2],xmm1
-  movd        dword [edi+edx],xmm6
-  pop         edi
-  pop         esi
-  mov         esp,ebp
-  pop         ebp
-  ret
+    push        ebp
+    mov         ebp,esp
+    and         esp,0FFFFFFF0h
+    sub         esp,108h
+    mov         ecx,dword [ebp+8]
+    mov         edx,dword [ebp+0Ch]
+    mov         eax,dword [ebp+10h]
+    sub         ecx,2
+    sub         edx,2
+    push        esi
+    lea         esi,[eax+eax*2]
+    mov         dword [esp+10h],ecx
+    mov         dword [esp+4],edx
+    lea         ecx,[ecx+eax*4]
+    lea         edx,[edx+eax*4]
+    lea         eax,[esp+6Ch]
+    push        edi
+    mov         dword [esp+0Ch],esi
+    mov         dword [esp+18h],ecx
+    mov         dword [esp+10h],edx
+    mov         dword [esp+1Ch],eax
+    mov         esi,dword [esp+14h]
+    mov         ecx,dword [ebp+10h]
+    mov         edx,dword [esp+0Ch]
+    movd        xmm0,dword [esi]
+    movd        xmm1,dword [esi+ecx]
+    movd        xmm2,dword [esi+ecx*2]
+    movd        xmm3,dword [esi+edx]
+    mov         esi,dword [esp+8]
+    movd        xmm4,dword [esi]
+    movd        xmm5,dword [esi+ecx]
+    movd        xmm6,dword [esi+ecx*2]
+    movd        xmm7,dword [esi+edx]
+    punpckldq   xmm0,xmm4
+    punpckldq   xmm1,xmm5
+    punpckldq   xmm2,xmm6
+    punpckldq   xmm3,xmm7
+    mov         esi,dword [esp+18h]
+    mov         edi,dword [esp+10h]
+    movd        xmm4,dword [esi]
+    movd        xmm5,dword [edi]
+    punpckldq   xmm4,xmm5
+    punpcklqdq  xmm0,xmm4
+    movd        xmm4,dword [esi+ecx]
+    movd        xmm5,dword [edi+ecx]
+    punpckldq   xmm4,xmm5
+    punpcklqdq  xmm1,xmm4
+    movd        xmm4,dword [esi+ecx*2]
+    movd        xmm5,dword [edi+ecx*2]
+    punpckldq   xmm4,xmm5
+    punpcklqdq  xmm2,xmm4
+    movd        xmm4,dword [esi+edx]
+    movd        xmm5,dword [edi+edx]
+    punpckldq   xmm4,xmm5
+    punpcklqdq  xmm3,xmm4
+    movdqa      xmm6,xmm0
+    punpcklbw   xmm0,xmm1
+    punpckhbw   xmm6,xmm1
+    movdqa      xmm7,xmm2
+    punpcklbw   xmm2,xmm3
+    punpckhbw   xmm7,xmm3
+    movdqa      xmm4,xmm0
+    movdqa      xmm5,xmm6
+    punpcklwd   xmm0,xmm2
+    punpckhwd   xmm4,xmm2
+    punpcklwd   xmm6,xmm7
+    punpckhwd   xmm5,xmm7
+    movdqa      xmm1,xmm0
+    movdqa      xmm2,xmm4
+    punpckldq   xmm0,xmm6
+    punpckhdq   xmm1,xmm6
+    punpckldq   xmm4,xmm5
+    punpckhdq   xmm2,xmm5
+    movdqa      xmm5,xmm0
+    movdqa      xmm6,xmm1
+    punpcklqdq  xmm0,xmm4
+    punpckhqdq  xmm5,xmm4
+    punpcklqdq  xmm1,xmm2
+    punpckhqdq  xmm6,xmm2
+    mov         edi,dword [esp+1Ch]
+    movdqa      [edi],xmm0
+    movdqa      [edi+10h],xmm5
+    movdqa      [edi+20h],xmm1
+    movdqa      [edi+30h],xmm6
+    mov         eax,dword [ebp+1Ch]
+    movsx       cx,byte [eax+3]
+    movsx       dx,byte [eax+2]
+    movsx       si,byte [eax+1]
+    movsx       ax,byte [eax]
+    movzx       edi,cx
+    movzx       ecx,cx
+    movd        xmm2,ecx
+    movzx       ecx,dx
+    movzx       edx,dx
+    movd        xmm3,ecx
+    movd        xmm4,edx
+    movzx       ecx,si
+    movzx       edx,si
+    movd        xmm5,ecx
+    pxor        xmm0,xmm0
+    movd        xmm6,edx
+    movzx       ecx,ax
+    movdqa      [esp+60h],xmm0
+    movzx       edx,ax
+    movsx       eax,word [ebp+14h]
+    punpcklwd   xmm6,xmm2
+    movd        xmm1,edi
+    movd        xmm7,ecx
+    movsx       ecx,word [ebp+18h]
+    movd        xmm0,edx
+    punpcklwd   xmm7,xmm3
+    punpcklwd   xmm5,xmm1
+    movdqa      xmm1,[esp+60h]
+    punpcklwd   xmm7,xmm5
+    movdqa      xmm5,[esp+0A0h]
+    punpcklwd   xmm0,xmm4
+    punpcklwd   xmm0,xmm6
+    movdqa      xmm6, [esp+70h]
+    punpcklwd   xmm0,xmm7
+    movdqa      xmm7,[esp+80h]
+    movdqa      xmm2,xmm1
+    psubw       xmm2,xmm0
+    movdqa      [esp+0D0h],xmm2
+    movd        xmm2,eax
+    movdqa      xmm3,xmm2
+    punpcklwd   xmm3,xmm2
+    pshufd      xmm4,xmm3,0
+    movd        xmm2,ecx
+    movdqa      xmm3,xmm2
+    punpcklwd   xmm3,xmm2
+    pshufd      xmm2,xmm3,0
+    movdqa      xmm3, [esp+90h]
+    movdqa      [esp+50h],xmm2
+    movdqa      xmm2,xmm6
+    punpcklbw   xmm2,xmm1
+    punpckhbw   xmm6,xmm1
+    movdqa      [esp+40h],xmm2
+    movdqa      [esp+0B0h],xmm6
+    movdqa      xmm6,[esp+90h]
+    movdqa      xmm2,xmm7
+    punpckhbw   xmm7,xmm1
+    punpckhbw   xmm6,xmm1
+    punpcklbw   xmm2,xmm1
+    punpcklbw   xmm3,xmm1
+    punpcklbw   xmm5,xmm1
+    movdqa      [esp+0F0h],xmm7
+    movdqa      [esp+0C0h],xmm6
+    movdqa      xmm6, [esp+0A0h]
+    punpckhbw   xmm6,xmm1
+    movdqa      [esp+0E0h],xmm6
+    mov         edx,4
+    movsx       eax,dx
+    movd        xmm6,eax
+    movdqa      xmm7,xmm6
+    punpcklwd   xmm7,xmm6
+    pshufd      xmm6,xmm7,0
+    movdqa      [esp+30h],xmm6
+    movdqa      xmm7, [esp+40h]
+    psubw       xmm7,xmm5
+    movdqa      xmm6,xmm0
+    pcmpgtw     xmm6,xmm1
+    movdqa      [esp+60h],xmm6
+    movdqa      xmm1, [esp+0D0h]
+    movdqa      xmm6,xmm3
+    psubw       xmm6,xmm2
+    psllw       xmm6,2
+    paddw       xmm6,xmm7
+    paddw       xmm6,[esp+30h]
+    psraw       xmm6,3
+    pmaxsw      xmm1,xmm6
+    movdqa      xmm7,[esp+50h]
+    movdqa      [esp+20h],xmm0
+    movdqa      xmm6, [esp+20h]
+    pminsw      xmm6,xmm1
+    movdqa      [esp+20h],xmm6
+    movdqa      xmm6,xmm4
+    movdqa      xmm1,xmm2
+    psubw       xmm1,xmm3
+    pabsw       xmm1,xmm1
+    pcmpgtw     xmm6,xmm1
+    movdqa      xmm1, [esp+40h]
+    psubw       xmm1,xmm2
+    pabsw       xmm1,xmm1
+    pcmpgtw     xmm7,xmm1
+    movdqa      xmm1, [esp+50h]
+    pand        xmm6,xmm7
+    movdqa      xmm7, [esp+50h]
+    psubw       xmm5,xmm3
+    pabsw       xmm5,xmm5
+    pcmpgtw     xmm1,xmm5
+    movdqa      xmm5, [esp+0B0h]
+    psubw       xmm5,[esp+0E0h]
+    pand        xmm6,xmm1
+    pand        xmm6, [esp+60h]
+    movdqa      xmm1, [esp+20h]
+    pand        xmm1,xmm6
+    movdqa      xmm6, [esp+0C0h]
+    movdqa      [esp+40h],xmm1
+    movdqa      xmm1, [esp+0F0h]
+    psubw       xmm6,xmm1
+    psllw       xmm6,2
+    paddw       xmm6,xmm5
+    paddw       xmm6, [esp+30h]
+    movdqa      xmm5, [esp+0D0h]
+    psraw       xmm6,3
+    pmaxsw      xmm5,xmm6
+    pminsw      xmm0,xmm5
+    movdqa      xmm5,[esp+0C0h]
+    movdqa      xmm6,xmm1
+    psubw       xmm6,xmm5
+    pabsw       xmm6,xmm6
+    pcmpgtw     xmm4,xmm6
+    movdqa      xmm6,[esp+0B0h]
+    psubw       xmm6,xmm1
+    pabsw       xmm6,xmm6
+    pcmpgtw     xmm7,xmm6
+    movdqa      xmm6, [esp+0E0h]
+    pand        xmm4,xmm7
+    movdqa      xmm7, [esp+50h]
+    psubw       xmm6,xmm5
+    pabsw       xmm6,xmm6
+    pcmpgtw     xmm7,xmm6
+    pand        xmm4,xmm7
+    pand        xmm4,[esp+60h]
+    pand        xmm0,xmm4
+    movdqa      xmm4, [esp+40h]
+    paddw       xmm2,xmm4
+    paddw       xmm1,xmm0
+    psubw       xmm3,xmm4
+    psubw       xmm5,xmm0
+    packuswb    xmm2,xmm1
+    packuswb    xmm3,xmm5
+    movdqa      [esp+80h],xmm2
+    movdqa      [esp+90h],xmm3
+    mov         esi,dword [esp+1Ch]
+    movdqa      xmm0, [esi]
+    movdqa      xmm1, [esi+10h]
+    movdqa      xmm2, [esi+20h]
+    movdqa      xmm3, [esi+30h]
+    movdqa      xmm6,xmm0
+    punpcklbw   xmm0,xmm1
+    punpckhbw   xmm6,xmm1
+    movdqa      xmm7,xmm2
+    punpcklbw   xmm2,xmm3
+    punpckhbw   xmm7,xmm3
+    movdqa      xmm4,xmm0
+    movdqa      xmm5,xmm6
+    punpcklwd   xmm0,xmm2
+    punpckhwd   xmm4,xmm2
+    punpcklwd   xmm6,xmm7
+    punpckhwd   xmm5,xmm7
+    movdqa      xmm1,xmm0
+    movdqa      xmm2,xmm4
+    punpckldq   xmm0,xmm6
+    punpckhdq   xmm1,xmm6
+    punpckldq   xmm4,xmm5
+    punpckhdq   xmm2,xmm5
+    movdqa      xmm5,xmm0
+    movdqa      xmm6,xmm1
+    punpcklqdq  xmm0,xmm4
+    punpckhqdq  xmm5,xmm4
+    punpcklqdq  xmm1,xmm2
+    punpckhqdq  xmm6,xmm2
+    mov         esi,dword [esp+14h]
+    mov         ecx,dword [ebp+10h]
+    mov         edx,dword [esp+0Ch]
+    mov         edi,dword [esp+8]
+    movd        dword [esi],xmm0
+    movd        dword [esi+ecx],xmm5
+    movd        dword [esi+ecx*2],xmm1
+    movd        dword [esi+edx],xmm6
+    psrldq      xmm0,4
+    psrldq      xmm5,4
+    psrldq      xmm1,4
+    psrldq      xmm6,4
+    mov         esi,dword [esp+18h]
+    movd        dword [edi],xmm0
+    movd        dword [edi+ecx],xmm5
+    movd        dword [edi+ecx*2],xmm1
+    movd        dword [edi+edx],xmm6
+    psrldq      xmm0,4
+    psrldq      xmm5,4
+    psrldq      xmm1,4
+    psrldq      xmm6,4
+    movd        dword [esi],xmm0
+    movd        dword [esi+ecx],xmm5
+    movd        dword [esi+ecx*2],xmm1
+    movd        dword [esi+edx],xmm6
+    psrldq      xmm0,4
+    psrldq      xmm5,4
+    psrldq      xmm1,4
+    psrldq      xmm6,4
+    mov         edi,dword [esp+10h]
+    movd        dword [edi],xmm0
+    movd        dword [edi+ecx],xmm5
+    movd        dword [edi+ecx*2],xmm1
+    movd        dword [edi+edx],xmm6
+    pop         edi
+    pop         esi
+    mov         esp,ebp
+    pop         ebp
+    ret
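
For reference, the two horizontal chroma routines above are plain cdecl entry points: their prologues read every argument straight off the stack (pPixCb at [ebp+8], pPixCr at [ebp+0Ch], iStride at [ebp+10h], then 16-bit iAlpha/iBeta via movsx from [ebp+14h]/[ebp+18h]; the Lt4 variant additionally loads a pointer from [ebp+1Ch] and sign-extends four int8_t clipping values from it). The header comments in this hunk are truncated, so the C declarations below are only a sketch reconstructed from those stack offsets, not text copied from the source tree; the trailing parameter names are assumptions.

    #include <stdint.h>

    /* Assumed prototypes, inferred from the [ebp+...] loads in the code above. */
    void DeblockChromaEq4H_ssse3(uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride,
                                 int32_t iAlpha, int32_t iBeta);
    void DeblockChromaLt4H_ssse3(uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride,
                                 int32_t iAlpha, int32_t iBeta,
                                 int8_t* pTc /* four per-edge clipping values */);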
 
 
 
@@ -4194,385 +4194,385 @@
 
 
 WELS_EXTERN  DeblockLumaLt4V_ssse3
-    push	ebp
-	mov	ebp, esp
-	and	esp, -16				; fffffff0H
-	sub	esp, 420				; 000001a4H
-	mov	eax, dword [ebp+8]
-	mov	ecx, dword [ebp+12]
+    push    ebp
+    mov ebp, esp
+    and esp, -16                ; fffffff0H
+    sub esp, 420                ; 000001a4H
+    mov eax, dword [ebp+8]
+    mov ecx, dword [ebp+12]
 
-	pxor	xmm0, xmm0
-	push	ebx
-	mov	edx, dword [ebp+24]
-	movdqa	[esp+424-384], xmm0
-	push	esi
+    pxor    xmm0, xmm0
+    push    ebx
+    mov edx, dword [ebp+24]
+    movdqa  [esp+424-384], xmm0
+    push    esi
 
-	lea	esi, [ecx+ecx*2]
-	push	edi
-	mov	edi, eax
-	sub	edi, esi
-	movdqa	xmm0, [edi]
+    lea esi, [ecx+ecx*2]
+    push    edi
+    mov edi, eax
+    sub edi, esi
+    movdqa  xmm0, [edi]
 
-	lea	esi, [ecx+ecx]
-	movdqa	[esp+432-208], xmm0
-	mov	edi, eax
-	sub	edi, esi
-	movdqa	xmm0, [edi]
-	movdqa	[esp+448-208], xmm0
+    lea esi, [ecx+ecx]
+    movdqa  [esp+432-208], xmm0
+    mov edi, eax
+    sub edi, esi
+    movdqa  xmm0, [edi]
+    movdqa  [esp+448-208], xmm0
 
-	mov	ebx, eax
-	sub	ebx, ecx
-	movdqa	xmm0, [ebx]
-	movdqa	[esp+464-208], xmm0
+    mov ebx, eax
+    sub ebx, ecx
+    movdqa  xmm0, [ebx]
+    movdqa  [esp+464-208], xmm0
 
-	movdqa	xmm0, [eax]
+    movdqa  xmm0, [eax]
 
-	add	ecx, eax
-	movdqa	[esp+480-208], xmm0
-	movdqa	xmm0, [ecx]
-	mov	dword [esp+432-404], ecx
+    add ecx, eax
+    movdqa  [esp+480-208], xmm0
+    movdqa  xmm0, [ecx]
+    mov dword [esp+432-404], ecx
 
-	movsx	ecx, word [ebp+16]
-	movdqa	[esp+496-208], xmm0
-	movdqa	xmm0, [esi+eax]
+    movsx   ecx, word [ebp+16]
+    movdqa  [esp+496-208], xmm0
+    movdqa  xmm0, [esi+eax]
 
-	movsx	si, byte [edx]
-	movdqa	[esp+512-208], xmm0
-	movd	xmm0, ecx
-	movsx	ecx, word [ebp+20]
-	movdqa	xmm1, xmm0
-	punpcklwd xmm1, xmm0
-	pshufd	xmm0, xmm1, 0
-	movdqa	[esp+432-112], xmm0
-	movd	xmm0, ecx
-	movsx	cx, byte [edx+1]
-	movdqa	xmm1, xmm0
-	punpcklwd xmm1, xmm0
-	mov	dword [esp+432-408], ebx
-	movzx	ebx, cx
-	pshufd	xmm0, xmm1, 0
-	movd	xmm1, ebx
-	movzx	ebx, cx
-	movd	xmm2, ebx
-	movzx	ebx, cx
-	movzx	ecx, cx
-	movd	xmm4, ecx
-	movzx	ecx, si
-	movd	xmm5, ecx
-	movzx	ecx, si
-	movd	xmm6, ecx
-	movzx	ecx, si
-	movd	xmm7, ecx
-	movzx	ecx, si
-	movdqa	[esp+432-336], xmm0
-	movd	xmm0, ecx
+    movsx   si, byte [edx]
+    movdqa  [esp+512-208], xmm0
+    movd    xmm0, ecx
+    movsx   ecx, word [ebp+20]
+    movdqa  xmm1, xmm0
+    punpcklwd xmm1, xmm0
+    pshufd  xmm0, xmm1, 0
+    movdqa  [esp+432-112], xmm0
+    movd    xmm0, ecx
+    movsx   cx, byte [edx+1]
+    movdqa  xmm1, xmm0
+    punpcklwd xmm1, xmm0
+    mov dword [esp+432-408], ebx
+    movzx   ebx, cx
+    pshufd  xmm0, xmm1, 0
+    movd    xmm1, ebx
+    movzx   ebx, cx
+    movd    xmm2, ebx
+    movzx   ebx, cx
+    movzx   ecx, cx
+    movd    xmm4, ecx
+    movzx   ecx, si
+    movd    xmm5, ecx
+    movzx   ecx, si
+    movd    xmm6, ecx
+    movzx   ecx, si
+    movd    xmm7, ecx
+    movzx   ecx, si
+    movdqa  [esp+432-336], xmm0
+    movd    xmm0, ecx
 
-	movsx	cx, byte [edx+3]
-	movsx	dx, byte [edx+2]
-	movd	xmm3, ebx
-	punpcklwd xmm0, xmm4
-	movzx	esi, cx
-	punpcklwd xmm6, xmm2
-	punpcklwd xmm5, xmm1
-	punpcklwd xmm0, xmm6
-	punpcklwd xmm7, xmm3
-	punpcklwd xmm7, xmm5
-	punpcklwd xmm0, xmm7
-	movdqa	[esp+432-400], xmm0
-	movd	xmm0, esi
-	movzx	esi, cx
-	movd	xmm2, esi
-	movzx	esi, cx
-	movzx	ecx, cx
-	movd	xmm4, ecx
-	movzx	ecx, dx
-	movd	xmm3, esi
-	movd	xmm5, ecx
-	punpcklwd xmm5, xmm0
+    movsx   cx, byte [edx+3]
+    movsx   dx, byte [edx+2]
+    movd    xmm3, ebx
+    punpcklwd xmm0, xmm4
+    movzx   esi, cx
+    punpcklwd xmm6, xmm2
+    punpcklwd xmm5, xmm1
+    punpcklwd xmm0, xmm6
+    punpcklwd xmm7, xmm3
+    punpcklwd xmm7, xmm5
+    punpcklwd xmm0, xmm7
+    movdqa  [esp+432-400], xmm0
+    movd    xmm0, esi
+    movzx   esi, cx
+    movd    xmm2, esi
+    movzx   esi, cx
+    movzx   ecx, cx
+    movd    xmm4, ecx
+    movzx   ecx, dx
+    movd    xmm3, esi
+    movd    xmm5, ecx
+    punpcklwd xmm5, xmm0
 
-	movdqa	xmm0, [esp+432-384]
-	movzx	ecx, dx
-	movd	xmm6, ecx
-	movzx	ecx, dx
-	movzx	edx, dx
-	punpcklwd xmm6, xmm2
-	movd	xmm7, ecx
-	movd	xmm1, edx
+    movdqa  xmm0, [esp+432-384]
+    movzx   ecx, dx
+    movd    xmm6, ecx
+    movzx   ecx, dx
+    movzx   edx, dx
+    punpcklwd xmm6, xmm2
+    movd    xmm7, ecx
+    movd    xmm1, edx
 
-	movdqa	xmm2, [esp+448-208]
-	punpcklbw xmm2, xmm0
+    movdqa  xmm2, [esp+448-208]
+    punpcklbw xmm2, xmm0
 
-	mov	ecx, 4
-	movsx	edx, cx
-	punpcklwd xmm7, xmm3
-	punpcklwd xmm7, xmm5
-	movdqa	xmm5, [esp+496-208]
-	movdqa	xmm3, [esp+464-208]
-	punpcklbw xmm5, xmm0
-	movdqa	[esp+432-240], xmm5
-	movdqa	xmm5, [esp+512-208]
-	punpcklbw xmm5, xmm0
-	movdqa	[esp+432-352], xmm5
-	punpcklwd xmm1, xmm4
-	movdqa	xmm4, [esp+432-208]
-	punpcklwd xmm1, xmm6
-	movdqa	xmm6, [esp+480-208]
-	punpcklwd xmm1, xmm7
-	punpcklbw xmm6, xmm0
-	punpcklbw xmm3, xmm0
-	punpcklbw xmm4, xmm0
-	movdqa	xmm7, xmm3
-	psubw	xmm7, xmm4
-	pabsw	xmm7, xmm7
-	movdqa	[esp+432-272], xmm4
-	movdqa	xmm4, [esp+432-336]
-	movdqa	xmm5, xmm4
-	pcmpgtw	xmm5, xmm7
-	movdqa	[esp+432-288], xmm5
-	movdqa	xmm7, xmm6
-	psubw	xmm7, [esp+432-352]
-	pabsw	xmm7, xmm7
-	movdqa	xmm5, xmm4
-	pcmpgtw	xmm5, xmm7
-	movdqa	[esp+432-256], xmm5
-	movdqa	xmm5, xmm3
-	pavgw	xmm5, xmm6
-	movdqa	[esp+432-304], xmm5
-	movdqa	xmm5, [esp+432-400]
-	psubw	xmm5, [esp+432-288]
-	psubw	xmm5, [esp+432-256]
-	movdqa	[esp+432-224], xmm5
-	movdqa	xmm5, xmm6
-	psubw	xmm5, xmm3
-	movdqa	[esp+432-32], xmm6
-	psubw	xmm6, [esp+432-240]
-	movdqa	xmm7, xmm5
-	movdqa	[esp+432-384], xmm5
-	movdqa	xmm5, [esp+432-112]
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm5, xmm7
-	pabsw	xmm6, xmm6
-	movdqa	xmm7, xmm4
-	pcmpgtw	xmm7, xmm6
+    mov ecx, 4
+    movsx   edx, cx
+    punpcklwd xmm7, xmm3
+    punpcklwd xmm7, xmm5
+    movdqa  xmm5, [esp+496-208]
+    movdqa  xmm3, [esp+464-208]
+    punpcklbw xmm5, xmm0
+    movdqa  [esp+432-240], xmm5
+    movdqa  xmm5, [esp+512-208]
+    punpcklbw xmm5, xmm0
+    movdqa  [esp+432-352], xmm5
+    punpcklwd xmm1, xmm4
+    movdqa  xmm4, [esp+432-208]
+    punpcklwd xmm1, xmm6
+    movdqa  xmm6, [esp+480-208]
+    punpcklwd xmm1, xmm7
+    punpcklbw xmm6, xmm0
+    punpcklbw xmm3, xmm0
+    punpcklbw xmm4, xmm0
+    movdqa  xmm7, xmm3
+    psubw   xmm7, xmm4
+    pabsw   xmm7, xmm7
+    movdqa  [esp+432-272], xmm4
+    movdqa  xmm4, [esp+432-336]
+    movdqa  xmm5, xmm4
+    pcmpgtw xmm5, xmm7
+    movdqa  [esp+432-288], xmm5
+    movdqa  xmm7, xmm6
+    psubw   xmm7, [esp+432-352]
+    pabsw   xmm7, xmm7
+    movdqa  xmm5, xmm4
+    pcmpgtw xmm5, xmm7
+    movdqa  [esp+432-256], xmm5
+    movdqa  xmm5, xmm3
+    pavgw   xmm5, xmm6
+    movdqa  [esp+432-304], xmm5
+    movdqa  xmm5, [esp+432-400]
+    psubw   xmm5, [esp+432-288]
+    psubw   xmm5, [esp+432-256]
+    movdqa  [esp+432-224], xmm5
+    movdqa  xmm5, xmm6
+    psubw   xmm5, xmm3
+    movdqa  [esp+432-32], xmm6
+    psubw   xmm6, [esp+432-240]
+    movdqa  xmm7, xmm5
+    movdqa  [esp+432-384], xmm5
+    movdqa  xmm5, [esp+432-112]
+    pabsw   xmm7, xmm7
+    pcmpgtw xmm5, xmm7
+    pabsw   xmm6, xmm6
+    movdqa  xmm7, xmm4
+    pcmpgtw xmm7, xmm6
 
-	pand	xmm5, xmm7
-	movdqa	xmm6, xmm3
-	psubw	xmm6, xmm2
-	pabsw	xmm6, xmm6
-	movdqa	xmm7, xmm4
-	pcmpgtw	xmm7, xmm6
-	movdqa	xmm6, [esp+432-400]
-	pand	xmm5, xmm7
-	movdqa	xmm7, xmm6
-	pcmpeqw	xmm6, xmm0
-	pcmpgtw	xmm7, xmm0
-	por	xmm7, xmm6
-	pand	xmm5, xmm7
-	movdqa	[esp+432-320], xmm5
-	movd	xmm5, edx
-	movdqa	xmm6, xmm5
-	punpcklwd xmm6, xmm5
-	pshufd	xmm5, xmm6, 0
-	movdqa	[esp+432-336], xmm5
-	movdqa	xmm5, [esp+432-224]
-	movdqa	[esp+432-368], xmm5
-	movdqa	xmm6, xmm0
-	psubw	xmm6, xmm5
-	movdqa	xmm5, [esp+432-384]
-	psllw	xmm5, 2
-	movdqa	xmm7, xmm2
-	psubw	xmm7, [esp+432-240]
-	paddw	xmm7, xmm5
-	paddw	xmm7, [esp+432-336]
-	movdqa	xmm5, [esp+432-368]
-	psraw	xmm7, 3
-	pmaxsw	xmm6, xmm7
-	pminsw	xmm5, xmm6
+    pand    xmm5, xmm7
+    movdqa  xmm6, xmm3
+    psubw   xmm6, xmm2
+    pabsw   xmm6, xmm6
+    movdqa  xmm7, xmm4
+    pcmpgtw xmm7, xmm6
+    movdqa  xmm6, [esp+432-400]
+    pand    xmm5, xmm7
+    movdqa  xmm7, xmm6
+    pcmpeqw xmm6, xmm0
+    pcmpgtw xmm7, xmm0
+    por xmm7, xmm6
+    pand    xmm5, xmm7
+    movdqa  [esp+432-320], xmm5
+    movd    xmm5, edx
+    movdqa  xmm6, xmm5
+    punpcklwd xmm6, xmm5
+    pshufd  xmm5, xmm6, 0
+    movdqa  [esp+432-336], xmm5
+    movdqa  xmm5, [esp+432-224]
+    movdqa  [esp+432-368], xmm5
+    movdqa  xmm6, xmm0
+    psubw   xmm6, xmm5
+    movdqa  xmm5, [esp+432-384]
+    psllw   xmm5, 2
+    movdqa  xmm7, xmm2
+    psubw   xmm7, [esp+432-240]
+    paddw   xmm7, xmm5
+    paddw   xmm7, [esp+432-336]
+    movdqa  xmm5, [esp+432-368]
+    psraw   xmm7, 3
+    pmaxsw  xmm6, xmm7
+    pminsw  xmm5, xmm6
 
-	pand	xmm5, [esp+432-320]
-	movdqa	xmm6, [esp+432-400]
-	movdqa	[esp+432-64], xmm5
-	movdqa	[esp+432-384], xmm6
-	movdqa	xmm5, xmm0
-	psubw	xmm5, xmm6
-	movdqa	[esp+432-368], xmm5
-	movdqa	xmm6, xmm5
-	movdqa	xmm5, [esp+432-272]
-	paddw	xmm5, [esp+432-304]
-	movdqa	xmm7, xmm2
-	paddw	xmm7, xmm2
-	psubw	xmm5, xmm7
-	psraw	xmm5, 1
-	pmaxsw	xmm6, xmm5
-	movdqa	xmm5, [esp+432-384]
-	pminsw	xmm5, xmm6
+    pand    xmm5, [esp+432-320]
+    movdqa  xmm6, [esp+432-400]
+    movdqa  [esp+432-64], xmm5
+    movdqa  [esp+432-384], xmm6
+    movdqa  xmm5, xmm0
+    psubw   xmm5, xmm6
+    movdqa  [esp+432-368], xmm5
+    movdqa  xmm6, xmm5
+    movdqa  xmm5, [esp+432-272]
+    paddw   xmm5, [esp+432-304]
+    movdqa  xmm7, xmm2
+    paddw   xmm7, xmm2
+    psubw   xmm5, xmm7
+    psraw   xmm5, 1
+    pmaxsw  xmm6, xmm5
+    movdqa  xmm5, [esp+432-384]
+    pminsw  xmm5, xmm6
 
-	pand	xmm5, [esp+432-320]
-	pand	xmm5, [esp+432-288]
-	movdqa	xmm6, [esp+432-240]
-	movdqa	[esp+432-96], xmm5
-	movdqa	xmm5, [esp+432-352]
-	paddw	xmm5, [esp+432-304]
-	movdqa	xmm7, xmm6
-	paddw	xmm7, xmm6
-	movdqa	xmm6, [esp+432-368]
-	psubw	xmm5, xmm7
+    pand    xmm5, [esp+432-320]
+    pand    xmm5, [esp+432-288]
+    movdqa  xmm6, [esp+432-240]
+    movdqa  [esp+432-96], xmm5
+    movdqa  xmm5, [esp+432-352]
+    paddw   xmm5, [esp+432-304]
+    movdqa  xmm7, xmm6
+    paddw   xmm7, xmm6
+    movdqa  xmm6, [esp+432-368]
+    psubw   xmm5, xmm7
 
-	movdqa	xmm7, [esp+496-208]
-	psraw	xmm5, 1
-	pmaxsw	xmm6, xmm5
-	movdqa	xmm5, [esp+432-400]
-	pminsw	xmm5, xmm6
-	pand	xmm5, [esp+432-320]
-	pand	xmm5, [esp+432-256]
-	movdqa	xmm6, [esp+448-208]
-	punpckhbw xmm7, xmm0
-	movdqa	[esp+432-352], xmm7
+    movdqa  xmm7, [esp+496-208]
+    psraw   xmm5, 1
+    pmaxsw  xmm6, xmm5
+    movdqa  xmm5, [esp+432-400]
+    pminsw  xmm5, xmm6
+    pand    xmm5, [esp+432-320]
+    pand    xmm5, [esp+432-256]
+    movdqa  xmm6, [esp+448-208]
+    punpckhbw xmm7, xmm0
+    movdqa  [esp+432-352], xmm7
 
-	movdqa	xmm7, [esp+512-208]
-	punpckhbw xmm6, xmm0
-	movdqa	[esp+432-48], xmm5
-	movdqa	xmm5, [esp+432-208]
-	movdqa	[esp+432-368], xmm6
-	movdqa	xmm6, [esp+464-208]
-	punpckhbw xmm7, xmm0
-	punpckhbw xmm5, xmm0
-	movdqa	[esp+432-384], xmm7
-	punpckhbw xmm6, xmm0
-	movdqa	[esp+432-400], xmm6
+    movdqa  xmm7, [esp+512-208]
+    punpckhbw xmm6, xmm0
+    movdqa  [esp+432-48], xmm5
+    movdqa  xmm5, [esp+432-208]
+    movdqa  [esp+432-368], xmm6
+    movdqa  xmm6, [esp+464-208]
+    punpckhbw xmm7, xmm0
+    punpckhbw xmm5, xmm0
+    movdqa  [esp+432-384], xmm7
+    punpckhbw xmm6, xmm0
+    movdqa  [esp+432-400], xmm6
 
-	movdqa	xmm7, [esp+432-400]
-	movdqa	xmm6, [esp+480-208]
-	psubw	xmm7, xmm5
-	movdqa	[esp+432-16], xmm5
-	pabsw	xmm7, xmm7
-	punpckhbw xmm6, xmm0
-	movdqa	xmm5, xmm4
-	pcmpgtw	xmm5, xmm7
-	movdqa	[esp+432-288], xmm5
+    movdqa  xmm7, [esp+432-400]
+    movdqa  xmm6, [esp+480-208]
+    psubw   xmm7, xmm5
+    movdqa  [esp+432-16], xmm5
+    pabsw   xmm7, xmm7
+    punpckhbw xmm6, xmm0
+    movdqa  xmm5, xmm4
+    pcmpgtw xmm5, xmm7
+    movdqa  [esp+432-288], xmm5
 
-	movdqa	xmm7, xmm6
-	psubw	xmm7, [esp+432-384]
-	pabsw	xmm7, xmm7
-	movdqa	xmm5, xmm4
-	pcmpgtw	xmm5, xmm7
-	movdqa	[esp+432-256], xmm5
+    movdqa  xmm7, xmm6
+    psubw   xmm7, [esp+432-384]
+    pabsw   xmm7, xmm7
+    movdqa  xmm5, xmm4
+    pcmpgtw xmm5, xmm7
+    movdqa  [esp+432-256], xmm5
 
-	movdqa	xmm5, [esp+432-400]
-	movdqa	[esp+432-80], xmm6
-	pavgw	xmm5, xmm6
-	movdqa	[esp+432-304], xmm5
+    movdqa  xmm5, [esp+432-400]
+    movdqa  [esp+432-80], xmm6
+    pavgw   xmm5, xmm6
+    movdqa  [esp+432-304], xmm5
 
-	movdqa	xmm5, xmm1
-	psubw	xmm5, [esp+432-288]
-	psubw	xmm5, [esp+432-256]
-	movdqa	[esp+432-224], xmm5
-	movdqa	xmm5, xmm6
-	psubw	xmm5, [esp+432-400]
-	psubw	xmm6, [esp+432-352]
-	movdqa	[esp+432-272], xmm5
-	movdqa	xmm7, xmm5
-	movdqa	xmm5, [esp+432-112]
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm5, xmm7
-	movdqa	xmm7, xmm4
-	pabsw	xmm6, xmm6
-	pcmpgtw	xmm7, xmm6
-	movdqa	xmm6, [esp+432-368]
+    movdqa  xmm5, xmm1
+    psubw   xmm5, [esp+432-288]
+    psubw   xmm5, [esp+432-256]
+    movdqa  [esp+432-224], xmm5
+    movdqa  xmm5, xmm6
+    psubw   xmm5, [esp+432-400]
+    psubw   xmm6, [esp+432-352]
+    movdqa  [esp+432-272], xmm5
+    movdqa  xmm7, xmm5
+    movdqa  xmm5, [esp+432-112]
+    pabsw   xmm7, xmm7
+    pcmpgtw xmm5, xmm7
+    movdqa  xmm7, xmm4
+    pabsw   xmm6, xmm6
+    pcmpgtw xmm7, xmm6
+    movdqa  xmm6, [esp+432-368]
 
-	pand	xmm5, xmm7
-	movdqa	xmm7, [esp+432-400]
-	psubw	xmm7, xmm6
-	psubw	xmm6, [esp+432-352]
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm4, xmm7
-	pand	xmm5, xmm4
+    pand    xmm5, xmm7
+    movdqa  xmm7, [esp+432-400]
+    psubw   xmm7, xmm6
+    psubw   xmm6, [esp+432-352]
+    pabsw   xmm7, xmm7
+    pcmpgtw xmm4, xmm7
+    pand    xmm5, xmm4
 
-	paddw	xmm2, [esp+432-96]
-	movdqa	xmm4, xmm1
-	pcmpgtw	xmm4, xmm0
-	movdqa	xmm7, xmm1
-	pcmpeqw	xmm7, xmm0
-	por	xmm4, xmm7
-	pand	xmm5, xmm4
-	movdqa	xmm4, [esp+432-224]
-	movdqa	[esp+432-320], xmm5
-	movdqa	xmm5, [esp+432-272]
-	movdqa	xmm7, xmm0
-	psubw	xmm7, xmm4
-	psubw	xmm0, xmm1
-	psllw	xmm5, 2
-	paddw	xmm6, xmm5
-	paddw	xmm6, [esp+432-336]
-	movdqa	xmm5, [esp+432-368]
-	movdqa	[esp+432-336], xmm0
-	psraw	xmm6, 3
-	pmaxsw	xmm7, xmm6
-	pminsw	xmm4, xmm7
-	pand	xmm4, [esp+432-320]
-	movdqa	xmm6, xmm0
-	movdqa	xmm0, [esp+432-16]
-	paddw	xmm0, [esp+432-304]
-	movdqa	[esp+432-272], xmm4
-	movdqa	xmm4, [esp+432-368]
-	paddw	xmm4, xmm4
-	psubw	xmm0, xmm4
+    paddw   xmm2, [esp+432-96]
+    movdqa  xmm4, xmm1
+    pcmpgtw xmm4, xmm0
+    movdqa  xmm7, xmm1
+    pcmpeqw xmm7, xmm0
+    por xmm4, xmm7
+    pand    xmm5, xmm4
+    movdqa  xmm4, [esp+432-224]
+    movdqa  [esp+432-320], xmm5
+    movdqa  xmm5, [esp+432-272]
+    movdqa  xmm7, xmm0
+    psubw   xmm7, xmm4
+    psubw   xmm0, xmm1
+    psllw   xmm5, 2
+    paddw   xmm6, xmm5
+    paddw   xmm6, [esp+432-336]
+    movdqa  xmm5, [esp+432-368]
+    movdqa  [esp+432-336], xmm0
+    psraw   xmm6, 3
+    pmaxsw  xmm7, xmm6
+    pminsw  xmm4, xmm7
+    pand    xmm4, [esp+432-320]
+    movdqa  xmm6, xmm0
+    movdqa  xmm0, [esp+432-16]
+    paddw   xmm0, [esp+432-304]
+    movdqa  [esp+432-272], xmm4
+    movdqa  xmm4, [esp+432-368]
+    paddw   xmm4, xmm4
+    psubw   xmm0, xmm4
 
-	movdqa	xmm4, [esp+432-64]
-	psraw	xmm0, 1
-	pmaxsw	xmm6, xmm0
-	movdqa	xmm0, [esp+432-400]
-	movdqa	xmm7, xmm1
-	pminsw	xmm7, xmm6
-	movdqa	xmm6, [esp+432-320]
-	pand	xmm7, xmm6
-	pand	xmm7, [esp+432-288]
-	paddw	xmm5, xmm7
-	packuswb xmm2, xmm5
-	movdqa	xmm5, [esp+432-272]
-	paddw	xmm0, xmm5
-	paddw	xmm3, xmm4
-	packuswb xmm3, xmm0
+    movdqa  xmm4, [esp+432-64]
+    psraw   xmm0, 1
+    pmaxsw  xmm6, xmm0
+    movdqa  xmm0, [esp+432-400]
+    movdqa  xmm7, xmm1
+    pminsw  xmm7, xmm6
+    movdqa  xmm6, [esp+432-320]
+    pand    xmm7, xmm6
+    pand    xmm7, [esp+432-288]
+    paddw   xmm5, xmm7
+    packuswb xmm2, xmm5
+    movdqa  xmm5, [esp+432-272]
+    paddw   xmm0, xmm5
+    paddw   xmm3, xmm4
+    packuswb xmm3, xmm0
 
-	movdqa	xmm0, [esp+432-32]
-	psubw	xmm0, xmm4
-	movdqa	xmm4, [esp+432-80]
-	psubw	xmm4, xmm5
+    movdqa  xmm0, [esp+432-32]
+    psubw   xmm0, xmm4
+    movdqa  xmm4, [esp+432-80]
+    psubw   xmm4, xmm5
 
-	movdqa	xmm5, [esp+432-240]
-	paddw	xmm5, [esp+432-48]
-	packuswb xmm0, xmm4
-	movdqa	xmm4, [esp+432-384]
-	paddw	xmm4, [esp+432-304]
-	movdqa	[esp+480-208], xmm0
-	movdqa	xmm0, [esp+432-352]
-	movdqa	xmm7, xmm0
-	paddw	xmm0, xmm0
+    movdqa  xmm5, [esp+432-240]
+    paddw   xmm5, [esp+432-48]
+    packuswb xmm0, xmm4
+    movdqa  xmm4, [esp+432-384]
+    paddw   xmm4, [esp+432-304]
+    movdqa  [esp+480-208], xmm0
+    movdqa  xmm0, [esp+432-352]
+    movdqa  xmm7, xmm0
+    paddw   xmm0, xmm0
 
-	mov	ecx, dword [esp+432-408]
+    mov ecx, dword [esp+432-408]
 
-	mov	edx, dword [esp+432-404]
-	psubw	xmm4, xmm0
-	movdqa	xmm0, [esp+432-336]
-	movdqa	[edi], xmm2
-	psraw	xmm4, 1
-	pmaxsw	xmm0, xmm4
-	pminsw	xmm1, xmm0
-	movdqa	xmm0, [esp+480-208]
+    mov edx, dword [esp+432-404]
+    psubw   xmm4, xmm0
+    movdqa  xmm0, [esp+432-336]
+    movdqa  [edi], xmm2
+    psraw   xmm4, 1
+    pmaxsw  xmm0, xmm4
+    pminsw  xmm1, xmm0
+    movdqa  xmm0, [esp+480-208]
 
-	pop	edi
-	pand	xmm1, xmm6
-	pand	xmm1, [esp+428-256]
-	movdqa	[ecx], xmm3
-	paddw	xmm7, xmm1
-	pop	esi
-	packuswb xmm5, xmm7
-	movdqa	[eax], xmm0
-	movdqa	[edx], xmm5
-	pop	ebx
-	mov	esp, ebp
-	pop	ebp
-	ret
+    pop edi
+    pand    xmm1, xmm6
+    pand    xmm1, [esp+428-256]
+    movdqa  [ecx], xmm3
+    paddw   xmm7, xmm1
+    pop esi
+    packuswb xmm5, xmm7
+    movdqa  [eax], xmm0
+    movdqa  [edx], xmm5
+    pop ebx
+    mov esp, ebp
+    pop ebp
+    ret
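
DeblockLumaLt4V_ssse3 above takes a single luma pointer centred on a horizontal edge: the code addresses the six rows it filters as pPix-3*iStride through pPix+2*iStride (the row at pPix is q0, the three rows above it are p0..p2), reads iAlpha/iBeta as 16-bit words at [ebp+16]/[ebp+20], and pulls four int8_t tc clipping values through the pointer at [ebp+24]. The following C view of that interface is a hedged sketch inferred from those offsets rather than quoted from the project headers.

    #include <stdint.h>

    /* Assumed prototype; pPix points at the q0 row just below the edge being filtered,
       and pTc carries one clipping value per 4-pixel segment of the 16-pixel edge. */
    void DeblockLumaLt4V_ssse3(uint8_t* pPix, int32_t iStride,
                               int32_t iAlpha, int32_t iBeta,
                               int8_t* pTc);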
 
 
 ;*******************************************************************************
@@ -4583,542 +4583,542 @@
 
 WELS_EXTERN  DeblockLumaEq4V_ssse3
 
-	push	ebp
-	mov	ebp, esp
-	and	esp, -16				; fffffff0H
-	sub	esp, 628				; 00000274H
-	mov	eax, dword [ebp+8]
-	mov	ecx, dword [ebp+12]
-	push	ebx
-	push	esi
+    push    ebp
+    mov ebp, esp
+    and esp, -16                ; fffffff0H
+    sub esp, 628                ; 00000274H
+    mov eax, dword [ebp+8]
+    mov ecx, dword [ebp+12]
+    push    ebx
+    push    esi
 
-	lea	edx, [ecx*4]
-	pxor	xmm0, xmm0
-	movdqa	xmm2, xmm0
+    lea edx, [ecx*4]
+    pxor    xmm0, xmm0
+    movdqa  xmm2, xmm0
 
-	movdqa	xmm0, [ecx+eax]
-	mov	esi, eax
-	sub	esi, edx
-	movdqa	xmm3, [esi]
-	movdqa	xmm5, [eax]
-	push	edi
-	lea	edi, [ecx+ecx]
-	lea	ebx, [ecx+ecx*2]
-	mov	dword [esp+640-600], edi
-	mov	esi, eax
-	sub	esi, edi
-	movdqa	xmm1, [esi]
-	movdqa	 [esp+720-272], xmm0
-	mov	edi, eax
-	sub	edi, ecx
-	movdqa	xmm4, [edi]
-	add	ecx, eax
-	mov	dword [esp+640-596], ecx
+    movdqa  xmm0, [ecx+eax]
+    mov esi, eax
+    sub esi, edx
+    movdqa  xmm3, [esi]
+    movdqa  xmm5, [eax]
+    push    edi
+    lea edi, [ecx+ecx]
+    lea ebx, [ecx+ecx*2]
+    mov dword [esp+640-600], edi
+    mov esi, eax
+    sub esi, edi
+    movdqa  xmm1, [esi]
+    movdqa   [esp+720-272], xmm0
+    mov edi, eax
+    sub edi, ecx
+    movdqa  xmm4, [edi]
+    add ecx, eax
+    mov dword [esp+640-596], ecx
 
-	mov	ecx, dword [esp+640-600]
-	movdqa	xmm0, [ecx+eax]
-	movdqa	 [esp+736-272], xmm0
+    mov ecx, dword [esp+640-600]
+    movdqa  xmm0, [ecx+eax]
+    movdqa   [esp+736-272], xmm0
 
-	movdqa	xmm0, [eax+ebx]
-	mov	edx, eax
-	sub	edx, ebx
+    movdqa  xmm0, [eax+ebx]
+    mov edx, eax
+    sub edx, ebx
 
-	movsx	ebx, word [ebp+16]
-	movdqa	xmm6, [edx]
-	add	ecx, eax
-	movdqa	 [esp+752-272], xmm0
-	movd	xmm0, ebx
+    movsx   ebx, word [ebp+16]
+    movdqa  xmm6, [edx]
+    add ecx, eax
+    movdqa   [esp+752-272], xmm0
+    movd    xmm0, ebx
 
-	movsx	ebx, word [ebp+20]
-	movdqa	xmm7, xmm0
-	punpcklwd xmm7, xmm0
-	pshufd	xmm0, xmm7, 0
-	movdqa	 [esp+640-320], xmm0
-	movd	xmm0, ebx
-	movdqa	xmm7, xmm0
-	punpcklwd xmm7, xmm0
-	pshufd	xmm0, xmm7, 0
+    movsx   ebx, word [ebp+20]
+    movdqa  xmm7, xmm0
+    punpcklwd xmm7, xmm0
+    pshufd  xmm0, xmm7, 0
+    movdqa   [esp+640-320], xmm0
+    movd    xmm0, ebx
+    movdqa  xmm7, xmm0
+    punpcklwd xmm7, xmm0
+    pshufd  xmm0, xmm7, 0
 
-	movdqa	xmm7, [esp+736-272]
-	punpcklbw xmm7, xmm2
-	movdqa	 [esp+640-416], xmm7
-	movdqa	 [esp+640-512], xmm0
-	movdqa	xmm0, xmm1
-	movdqa	 [esp+672-272], xmm1
-	movdqa	xmm1, xmm4
-	movdqa	 [esp+704-272], xmm5
-	punpcklbw xmm5, xmm2
-	punpcklbw xmm1, xmm2
+    movdqa  xmm7, [esp+736-272]
+    punpcklbw xmm7, xmm2
+    movdqa   [esp+640-416], xmm7
+    movdqa   [esp+640-512], xmm0
+    movdqa  xmm0, xmm1
+    movdqa   [esp+672-272], xmm1
+    movdqa  xmm1, xmm4
+    movdqa   [esp+704-272], xmm5
+    punpcklbw xmm5, xmm2
+    punpcklbw xmm1, xmm2
 
-	movdqa	xmm7, xmm5
-	psubw	xmm7, xmm1
-	pabsw	xmm7, xmm7
-	movdqa	 [esp+640-560], xmm7
-	punpcklbw xmm0, xmm2
-	movdqa	 [esp+688-272], xmm4
-	movdqa	xmm4, [esp+720-272]
-	movdqa	 [esp+640-480], xmm0
+    movdqa  xmm7, xmm5
+    psubw   xmm7, xmm1
+    pabsw   xmm7, xmm7
+    movdqa   [esp+640-560], xmm7
+    punpcklbw xmm0, xmm2
+    movdqa   [esp+688-272], xmm4
+    movdqa  xmm4, [esp+720-272]
+    movdqa   [esp+640-480], xmm0
 
-	movdqa	xmm7, xmm1
-	psubw	xmm7, xmm0
+    movdqa  xmm7, xmm1
+    psubw   xmm7, xmm0
 
-	movdqa	xmm0, [esp+640-512]
-	pabsw	xmm7, xmm7
-	punpcklbw xmm4, xmm2
-	pcmpgtw	xmm0, xmm7
-	movdqa	 [esp+640-384], xmm4
-	movdqa	xmm7, xmm5
-	psubw	xmm7, xmm4
-	movdqa	xmm4, [esp+640-512]
-	movdqa	 [esp+656-272], xmm6
-	punpcklbw xmm6, xmm2
-	pabsw	xmm7, xmm7
-	movdqa	 [esp+640-48], xmm2
-	movdqa	 [esp+640-368], xmm6
-	movdqa	 [esp+640-144], xmm1
-	movdqa	 [esp+640-400], xmm5
-	pcmpgtw	xmm4, xmm7
-	pand	xmm0, xmm4
-	movdqa	xmm4, [esp+640-320]
-	pcmpgtw	xmm4, [esp+640-560]
-	pand	xmm0, xmm4
+    movdqa  xmm0, [esp+640-512]
+    pabsw   xmm7, xmm7
+    punpcklbw xmm4, xmm2
+    pcmpgtw xmm0, xmm7
+    movdqa   [esp+640-384], xmm4
+    movdqa  xmm7, xmm5
+    psubw   xmm7, xmm4
+    movdqa  xmm4, [esp+640-512]
+    movdqa   [esp+656-272], xmm6
+    punpcklbw xmm6, xmm2
+    pabsw   xmm7, xmm7
+    movdqa   [esp+640-48], xmm2
+    movdqa   [esp+640-368], xmm6
+    movdqa   [esp+640-144], xmm1
+    movdqa   [esp+640-400], xmm5
+    pcmpgtw xmm4, xmm7
+    pand    xmm0, xmm4
+    movdqa  xmm4, [esp+640-320]
+    pcmpgtw xmm4, [esp+640-560]
+    pand    xmm0, xmm4
 
-	mov	ebx, 2
-	movsx	ebx, bx
-	movd	xmm4, ebx
-	movdqa	xmm7, xmm4
-	punpcklwd xmm7, xmm4
-	movdqa	xmm4, [esp+640-320]
-	psraw	xmm4, 2
-	pshufd	xmm7, xmm7, 0
-	paddw	xmm4, xmm7
-	movdqa	 [esp+640-576], xmm4
-	pcmpgtw	xmm4, [esp+640-560]
-	movdqa	 [esp+640-560], xmm4
+    mov ebx, 2
+    movsx   ebx, bx
+    movd    xmm4, ebx
+    movdqa  xmm7, xmm4
+    punpcklwd xmm7, xmm4
+    movdqa  xmm4, [esp+640-320]
+    psraw   xmm4, 2
+    pshufd  xmm7, xmm7, 0
+    paddw   xmm4, xmm7
+    movdqa   [esp+640-576], xmm4
+    pcmpgtw xmm4, [esp+640-560]
+    movdqa   [esp+640-560], xmm4
 
-	movdqa	xmm4, [esp+640-512]
-	movdqa	 [esp+640-624], xmm7
-	movdqa	xmm7, xmm1
-	psubw	xmm7, xmm6
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm4, xmm7
+    movdqa  xmm4, [esp+640-512]
+    movdqa   [esp+640-624], xmm7
+    movdqa  xmm7, xmm1
+    psubw   xmm7, xmm6
+    pabsw   xmm7, xmm7
+    pcmpgtw xmm4, xmm7
 
-	pand	xmm4, [esp+640-560]
-	movdqa	 [esp+640-544], xmm4
-	movdqa	xmm4, [esp+640-512]
-	movdqa	xmm7, xmm5
-	psubw	xmm7, [esp+640-416]
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm4, xmm7
+    pand    xmm4, [esp+640-560]
+    movdqa   [esp+640-544], xmm4
+    movdqa  xmm4, [esp+640-512]
+    movdqa  xmm7, xmm5
+    psubw   xmm7, [esp+640-416]
+    pabsw   xmm7, xmm7
+    pcmpgtw xmm4, xmm7
 
-	pand	xmm4, [esp+640-560]
-	movdqa	 [esp+640-560], xmm4
+    pand    xmm4, [esp+640-560]
+    movdqa   [esp+640-560], xmm4
 
-	movdqa	xmm4, [esp+640-544]
-	pandn	xmm4, xmm6
-	movdqa	 [esp+640-16], xmm4
-	mov	ebx, 4
-	movsx	ebx, bx
-	movd	xmm4, ebx
-	movdqa	xmm7, xmm4
-	punpcklwd xmm7, xmm4
-	movdqa	xmm4, xmm3
-	punpcklbw xmm4, xmm2
-	psllw	xmm4, 1
-	paddw	xmm4, xmm6
-	paddw	xmm4, xmm6
-	paddw	xmm4, xmm6
-	paddw	xmm4, [esp+640-480]
+    movdqa  xmm4, [esp+640-544]
+    pandn   xmm4, xmm6
+    movdqa   [esp+640-16], xmm4
+    mov ebx, 4
+    movsx   ebx, bx
+    movd    xmm4, ebx
+    movdqa  xmm7, xmm4
+    punpcklwd xmm7, xmm4
+    movdqa  xmm4, xmm3
+    punpcklbw xmm4, xmm2
+    psllw   xmm4, 1
+    paddw   xmm4, xmm6
+    paddw   xmm4, xmm6
+    paddw   xmm4, xmm6
+    paddw   xmm4, [esp+640-480]
 
-	movdqa	xmm6, [esp+640-560]
-	pshufd	xmm7, xmm7, 0
-	paddw	xmm4, xmm1
-	movdqa	 [esp+640-592], xmm7
-	paddw	xmm4, xmm5
-	paddw	xmm4, xmm7
-	movdqa	xmm7, [esp+640-416]
-	pandn	xmm6, xmm7
-	movdqa	 [esp+640-80], xmm6
-	movdqa	xmm6, [esp+752-272]
-	punpcklbw xmm6, xmm2
-	psllw	xmm6, 1
-	paddw	xmm6, xmm7
-	paddw	xmm6, xmm7
-	paddw	xmm6, xmm7
-	paddw	xmm6, [esp+640-384]
+    movdqa  xmm6, [esp+640-560]
+    pshufd  xmm7, xmm7, 0
+    paddw   xmm4, xmm1
+    movdqa   [esp+640-592], xmm7
+    paddw   xmm4, xmm5
+    paddw   xmm4, xmm7
+    movdqa  xmm7, [esp+640-416]
+    pandn   xmm6, xmm7
+    movdqa   [esp+640-80], xmm6
+    movdqa  xmm6, [esp+752-272]
+    punpcklbw xmm6, xmm2
+    psllw   xmm6, 1
+    paddw   xmm6, xmm7
+    paddw   xmm6, xmm7
+    paddw   xmm6, xmm7
+    paddw   xmm6, [esp+640-384]
 
-	movdqa	xmm7, [esp+640-480]
-	paddw	xmm6, xmm5
-	paddw	xmm6, xmm1
-	paddw	xmm6, [esp+640-592]
-	psraw	xmm6, 3
-	pand	xmm6, [esp+640-560]
-	movdqa	 [esp+640-112], xmm6
-	movdqa	xmm6, [esp+640-544]
-	pandn	xmm6, xmm7
-	movdqa	 [esp+640-336], xmm6
-	movdqa	xmm6, [esp+640-544]
-	movdqa	 [esp+640-528], xmm6
-	movdqa	xmm6, [esp+640-368]
-	paddw	xmm6, xmm7
-	movdqa	xmm7, xmm1
-	psraw	xmm4, 3
-	pand	xmm4, [esp+640-544]
-	paddw	xmm7, xmm5
-	paddw	xmm6, xmm7
-	paddw	xmm6, [esp+640-624]
-	movdqa	xmm7, [esp+640-528]
+    movdqa  xmm7, [esp+640-480]
+    paddw   xmm6, xmm5
+    paddw   xmm6, xmm1
+    paddw   xmm6, [esp+640-592]
+    psraw   xmm6, 3
+    pand    xmm6, [esp+640-560]
+    movdqa   [esp+640-112], xmm6
+    movdqa  xmm6, [esp+640-544]
+    pandn   xmm6, xmm7
+    movdqa   [esp+640-336], xmm6
+    movdqa  xmm6, [esp+640-544]
+    movdqa   [esp+640-528], xmm6
+    movdqa  xmm6, [esp+640-368]
+    paddw   xmm6, xmm7
+    movdqa  xmm7, xmm1
+    psraw   xmm4, 3
+    pand    xmm4, [esp+640-544]
+    paddw   xmm7, xmm5
+    paddw   xmm6, xmm7
+    paddw   xmm6, [esp+640-624]
+    movdqa  xmm7, [esp+640-528]
 
-	paddw	xmm5, xmm1
-	psraw	xmm6, 2
-	pand	xmm7, xmm6
+    paddw   xmm5, xmm1
+    psraw   xmm6, 2
+    pand    xmm7, xmm6
 
-	movdqa	xmm6, [esp+640-384]
-	movdqa	 [esp+640-64], xmm7
-	movdqa	xmm7, [esp+640-560]
-	pandn	xmm7, xmm6
-	movdqa	 [esp+640-304], xmm7
-	movdqa	xmm7, [esp+640-560]
-	movdqa	 [esp+640-528], xmm7
-	movdqa	xmm7, [esp+640-416]
-	paddw	xmm7, xmm6
-	paddw	xmm7, xmm5
-	paddw	xmm7, [esp+640-624]
-	movdqa	xmm5, [esp+640-528]
-	psraw	xmm7, 2
-	pand	xmm5, xmm7
-	movdqa	 [esp+640-32], xmm5
+    movdqa  xmm6, [esp+640-384]
+    movdqa   [esp+640-64], xmm7
+    movdqa  xmm7, [esp+640-560]
+    pandn   xmm7, xmm6
+    movdqa   [esp+640-304], xmm7
+    movdqa  xmm7, [esp+640-560]
+    movdqa   [esp+640-528], xmm7
+    movdqa  xmm7, [esp+640-416]
+    paddw   xmm7, xmm6
+    paddw   xmm7, xmm5
+    paddw   xmm7, [esp+640-624]
+    movdqa  xmm5, [esp+640-528]
+    psraw   xmm7, 2
+    pand    xmm5, xmm7
+    movdqa   [esp+640-32], xmm5
 
-	movdqa	xmm5, [esp+640-544]
-	movdqa	 [esp+640-528], xmm5
-	movdqa	xmm5, [esp+640-480]
-	movdqa	xmm7, xmm5
-	paddw	xmm7, xmm5
-	movdqa	xmm5, xmm1
-	paddw	xmm5, xmm6
-	paddw	xmm6, [esp+640-592]
-	paddw	xmm7, xmm5
-	paddw	xmm7, [esp+640-624]
-	movdqa	xmm5, [esp+640-528]
-	psraw	xmm7, 2
-	pandn	xmm5, xmm7
-	movdqa	xmm7, [esp+640-480]
-	paddw	xmm7, xmm1
-	paddw	xmm7, [esp+640-400]
-	movdqa	xmm1, [esp+640-544]
-	movdqa	 [esp+640-352], xmm5
-	movdqa	xmm5, [esp+640-368]
-	psllw	xmm7, 1
-	paddw	xmm7, xmm6
-	paddw	xmm5, xmm7
+    movdqa  xmm5, [esp+640-544]
+    movdqa   [esp+640-528], xmm5
+    movdqa  xmm5, [esp+640-480]
+    movdqa  xmm7, xmm5
+    paddw   xmm7, xmm5
+    movdqa  xmm5, xmm1
+    paddw   xmm5, xmm6
+    paddw   xmm6, [esp+640-592]
+    paddw   xmm7, xmm5
+    paddw   xmm7, [esp+640-624]
+    movdqa  xmm5, [esp+640-528]
+    psraw   xmm7, 2
+    pandn   xmm5, xmm7
+    movdqa  xmm7, [esp+640-480]
+    paddw   xmm7, xmm1
+    paddw   xmm7, [esp+640-400]
+    movdqa  xmm1, [esp+640-544]
+    movdqa   [esp+640-352], xmm5
+    movdqa  xmm5, [esp+640-368]
+    psllw   xmm7, 1
+    paddw   xmm7, xmm6
+    paddw   xmm5, xmm7
 
-	movdqa	xmm7, [esp+640-400]
-	psraw	xmm5, 3
-	pand	xmm1, xmm5
-	movdqa	xmm5, [esp+640-480]
-	movdqa	 [esp+640-96], xmm1
-	movdqa	xmm1, [esp+640-560]
-	movdqa	 [esp+640-528], xmm1
-	movdqa	xmm1, [esp+640-384]
-	movdqa	xmm6, xmm1
-	paddw	xmm6, xmm1
-	paddw	xmm1, [esp+640-400]
-	paddw	xmm1, [esp+640-144]
-	paddw	xmm7, xmm5
-	paddw	xmm5, [esp+640-592]
-	paddw	xmm6, xmm7
-	paddw	xmm6, [esp+640-624]
-	movdqa	xmm7, [esp+640-528]
-	psraw	xmm6, 2
-	psllw	xmm1, 1
-	paddw	xmm1, xmm5
+    movdqa  xmm7, [esp+640-400]
+    psraw   xmm5, 3
+    pand    xmm1, xmm5
+    movdqa  xmm5, [esp+640-480]
+    movdqa   [esp+640-96], xmm1
+    movdqa  xmm1, [esp+640-560]
+    movdqa   [esp+640-528], xmm1
+    movdqa  xmm1, [esp+640-384]
+    movdqa  xmm6, xmm1
+    paddw   xmm6, xmm1
+    paddw   xmm1, [esp+640-400]
+    paddw   xmm1, [esp+640-144]
+    paddw   xmm7, xmm5
+    paddw   xmm5, [esp+640-592]
+    paddw   xmm6, xmm7
+    paddw   xmm6, [esp+640-624]
+    movdqa  xmm7, [esp+640-528]
+    psraw   xmm6, 2
+    psllw   xmm1, 1
+    paddw   xmm1, xmm5
 
-	movdqa	xmm5, [esp+656-272]
-	pandn	xmm7, xmm6
-	movdqa	xmm6, [esp+640-416]
-	paddw	xmm6, xmm1
-	movdqa	xmm1, [esp+640-560]
-	psraw	xmm6, 3
-	pand	xmm1, xmm6
+    movdqa  xmm5, [esp+656-272]
+    pandn   xmm7, xmm6
+    movdqa  xmm6, [esp+640-416]
+    paddw   xmm6, xmm1
+    movdqa  xmm1, [esp+640-560]
+    psraw   xmm6, 3
+    pand    xmm1, xmm6
 
-	movdqa	xmm6, [esp+704-272]
-	movdqa	 [esp+640-128], xmm1
-	movdqa	xmm1, [esp+672-272]
-	punpckhbw xmm1, xmm2
-	movdqa	 [esp+640-448], xmm1
-	movdqa	xmm1, [esp+688-272]
-	punpckhbw xmm1, xmm2
-	punpckhbw xmm6, xmm2
-	movdqa	 [esp+640-288], xmm7
-	punpckhbw xmm5, xmm2
-	movdqa	 [esp+640-496], xmm1
-	movdqa	 [esp+640-432], xmm6
+    movdqa  xmm6, [esp+704-272]
+    movdqa   [esp+640-128], xmm1
+    movdqa  xmm1, [esp+672-272]
+    punpckhbw xmm1, xmm2
+    movdqa   [esp+640-448], xmm1
+    movdqa  xmm1, [esp+688-272]
+    punpckhbw xmm1, xmm2
+    punpckhbw xmm6, xmm2
+    movdqa   [esp+640-288], xmm7
+    punpckhbw xmm5, xmm2
+    movdqa   [esp+640-496], xmm1
+    movdqa   [esp+640-432], xmm6
 
-	movdqa	xmm7, [esp+720-272]
-	punpckhbw xmm7, xmm2
-	movdqa	 [esp+640-464], xmm7
+    movdqa  xmm7, [esp+720-272]
+    punpckhbw xmm7, xmm2
+    movdqa   [esp+640-464], xmm7
 
-	movdqa	xmm7, [esp+736-272]
-	punpckhbw xmm7, xmm2
-	movdqa	 [esp+640-528], xmm7
+    movdqa  xmm7, [esp+736-272]
+    punpckhbw xmm7, xmm2
+    movdqa   [esp+640-528], xmm7
 
-	movdqa	xmm7, xmm6
+    movdqa  xmm7, xmm6
 
-	psubw	xmm6, [esp+640-464]
-	psubw	xmm7, xmm1
-	pabsw	xmm7, xmm7
-	movdqa	 [esp+640-560], xmm7
-	por	xmm4, [esp+640-16]
-	pabsw	xmm6, xmm6
-	movdqa	xmm7, xmm1
-	psubw	xmm7, [esp+640-448]
+    psubw   xmm6, [esp+640-464]
+    psubw   xmm7, xmm1
+    pabsw   xmm7, xmm7
+    movdqa   [esp+640-560], xmm7
+    por xmm4, [esp+640-16]
+    pabsw   xmm6, xmm6
+    movdqa  xmm7, xmm1
+    psubw   xmm7, [esp+640-448]
 
-	movdqa	xmm1, [esp+640-512]
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm1, xmm7
-	movdqa	xmm7, [esp+640-512]
-	pcmpgtw	xmm7, xmm6
-	movdqa	xmm6, [esp+640-320]
-	pand	xmm1, xmm7
-	movdqa	xmm7, [esp+640-560]
-	pcmpgtw	xmm6, xmm7
-	pand	xmm1, xmm6
+    movdqa  xmm1, [esp+640-512]
+    pabsw   xmm7, xmm7
+    pcmpgtw xmm1, xmm7
+    movdqa  xmm7, [esp+640-512]
+    pcmpgtw xmm7, xmm6
+    movdqa  xmm6, [esp+640-320]
+    pand    xmm1, xmm7
+    movdqa  xmm7, [esp+640-560]
+    pcmpgtw xmm6, xmm7
+    pand    xmm1, xmm6
 
-	movdqa	xmm6, [esp+640-576]
-	pcmpgtw	xmm6, xmm7
+    movdqa  xmm6, [esp+640-576]
+    pcmpgtw xmm6, xmm7
 
-	movdqa	xmm7, [esp+640-496]
-	punpckhbw xmm3, xmm2
-	movdqa	 [esp+640-560], xmm6
-	movdqa	xmm6, [esp+640-512]
-	psubw	xmm7, xmm5
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm6, xmm7
+    movdqa  xmm7, [esp+640-496]
+    punpckhbw xmm3, xmm2
+    movdqa   [esp+640-560], xmm6
+    movdqa  xmm6, [esp+640-512]
+    psubw   xmm7, xmm5
+    pabsw   xmm7, xmm7
+    pcmpgtw xmm6, xmm7
 
-	pand	xmm6, [esp+640-560]
-	movdqa	xmm7, [esp+640-432]
-	psubw	xmm7, [esp+640-528]
+    pand    xmm6, [esp+640-560]
+    movdqa  xmm7, [esp+640-432]
+    psubw   xmm7, [esp+640-528]
 
-	psllw	xmm3, 1
-	movdqa	 [esp+640-544], xmm6
-	movdqa	xmm6, [esp+640-512]
+    psllw   xmm3, 1
+    movdqa   [esp+640-544], xmm6
+    movdqa  xmm6, [esp+640-512]
 
-	movdqa	xmm2, [esp+640-544]
-	paddw	xmm3, xmm5
-	paddw	xmm3, xmm5
-	paddw	xmm3, xmm5
-	paddw	xmm3, [esp+640-448]
-	paddw	xmm3, [esp+640-496]
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm6, xmm7
-	pand	xmm6, [esp+640-560]
-	movdqa	 [esp+640-560], xmm6
+    movdqa  xmm2, [esp+640-544]
+    paddw   xmm3, xmm5
+    paddw   xmm3, xmm5
+    paddw   xmm3, xmm5
+    paddw   xmm3, [esp+640-448]
+    paddw   xmm3, [esp+640-496]
+    pabsw   xmm7, xmm7
+    pcmpgtw xmm6, xmm7
+    pand    xmm6, [esp+640-560]
+    movdqa   [esp+640-560], xmm6
 
-	movdqa	xmm6, xmm0
-	pand	xmm6, xmm4
-	movdqa	xmm4, xmm0
-	pandn	xmm4, [esp+640-368]
-	por	xmm6, xmm4
-	movdqa	xmm4, [esp+640-432]
-	paddw	xmm3, xmm4
-	paddw	xmm3, [esp+640-592]
-	psraw	xmm3, 3
-	pand	xmm3, xmm2
-	pandn	xmm2, xmm5
-	por	xmm3, xmm2
-	movdqa	xmm7, xmm1
-	pand	xmm7, xmm3
-	movdqa	xmm3, [esp+640-64]
-	por	xmm3, [esp+640-336]
-	movdqa	xmm2, xmm1
-	pandn	xmm2, xmm5
-	por	xmm7, xmm2
+    movdqa  xmm6, xmm0
+    pand    xmm6, xmm4
+    movdqa  xmm4, xmm0
+    pandn   xmm4, [esp+640-368]
+    por xmm6, xmm4
+    movdqa  xmm4, [esp+640-432]
+    paddw   xmm3, xmm4
+    paddw   xmm3, [esp+640-592]
+    psraw   xmm3, 3
+    pand    xmm3, xmm2
+    pandn   xmm2, xmm5
+    por xmm3, xmm2
+    movdqa  xmm7, xmm1
+    pand    xmm7, xmm3
+    movdqa  xmm3, [esp+640-64]
+    por xmm3, [esp+640-336]
+    movdqa  xmm2, xmm1
+    pandn   xmm2, xmm5
+    por xmm7, xmm2
 
-	movdqa	xmm2, xmm0
-	pand	xmm2, xmm3
-	movdqa	xmm3, xmm0
-	pandn	xmm3, [esp+640-480]
-	por	xmm2, xmm3
-	packuswb xmm6, xmm7
-	movdqa	 [esp+640-336], xmm2
-	movdqa	 [esp+656-272], xmm6
-	movdqa	xmm6, [esp+640-544]
-	movdqa	xmm2, xmm5
-	paddw	xmm2, [esp+640-448]
-	movdqa	xmm3, xmm1
-	movdqa	xmm7, [esp+640-496]
-	paddw	xmm7, xmm4
-	paddw	xmm2, xmm7
-	paddw	xmm2, [esp+640-624]
-	movdqa	xmm7, [esp+640-544]
-	psraw	xmm2, 2
-	pand	xmm6, xmm2
-	movdqa	xmm2, [esp+640-448]
-	pandn	xmm7, xmm2
-	por	xmm6, xmm7
-	pand	xmm3, xmm6
-	movdqa	xmm6, xmm1
-	pandn	xmm6, xmm2
-	paddw	xmm2, [esp+640-496]
-	paddw	xmm2, xmm4
-	por	xmm3, xmm6
-	movdqa	xmm6, [esp+640-336]
-	packuswb xmm6, xmm3
-	psllw	xmm2, 1
-	movdqa	 [esp+672-272], xmm6
-	movdqa	xmm6, [esp+640-96]
-	por	xmm6, [esp+640-352]
+    movdqa  xmm2, xmm0
+    pand    xmm2, xmm3
+    movdqa  xmm3, xmm0
+    pandn   xmm3, [esp+640-480]
+    por xmm2, xmm3
+    packuswb xmm6, xmm7
+    movdqa   [esp+640-336], xmm2
+    movdqa   [esp+656-272], xmm6
+    movdqa  xmm6, [esp+640-544]
+    movdqa  xmm2, xmm5
+    paddw   xmm2, [esp+640-448]
+    movdqa  xmm3, xmm1
+    movdqa  xmm7, [esp+640-496]
+    paddw   xmm7, xmm4
+    paddw   xmm2, xmm7
+    paddw   xmm2, [esp+640-624]
+    movdqa  xmm7, [esp+640-544]
+    psraw   xmm2, 2
+    pand    xmm6, xmm2
+    movdqa  xmm2, [esp+640-448]
+    pandn   xmm7, xmm2
+    por xmm6, xmm7
+    pand    xmm3, xmm6
+    movdqa  xmm6, xmm1
+    pandn   xmm6, xmm2
+    paddw   xmm2, [esp+640-496]
+    paddw   xmm2, xmm4
+    por xmm3, xmm6
+    movdqa  xmm6, [esp+640-336]
+    packuswb xmm6, xmm3
+    psllw   xmm2, 1
+    movdqa   [esp+672-272], xmm6
+    movdqa  xmm6, [esp+640-96]
+    por xmm6, [esp+640-352]
 
-	movdqa	xmm3, xmm0
-	pand	xmm3, xmm6
-	movdqa	xmm6, xmm0
-	pandn	xmm6, [esp+640-144]
-	por	xmm3, xmm6
-	movdqa	xmm6, [esp+640-544]
-	movdqa	 [esp+640-352], xmm3
-	movdqa	xmm3, [esp+640-464]
-	paddw	xmm3, [esp+640-592]
-	paddw	xmm2, xmm3
-	movdqa	xmm3, [esp+640-448]
-	paddw	xmm5, xmm2
-	movdqa	xmm2, [esp+640-496]
-	psraw	xmm5, 3
-	pand	xmm6, xmm5
-	movdqa	xmm5, [esp+640-464]
-	paddw	xmm2, xmm5
-	paddw	xmm5, [esp+640-432]
-	movdqa	xmm4, xmm3
-	paddw	xmm4, xmm3
-	paddw	xmm4, xmm2
-	paddw	xmm4, [esp+640-624]
-	movdqa	xmm2, [esp+640-544]
-	paddw	xmm3, [esp+640-592]
-	psraw	xmm4, 2
-	pandn	xmm2, xmm4
-	por	xmm6, xmm2
-	movdqa	xmm7, xmm1
-	pand	xmm7, xmm6
-	movdqa	xmm6, [esp+640-496]
-	movdqa	xmm2, xmm1
-	pandn	xmm2, xmm6
-	por	xmm7, xmm2
-	movdqa	xmm2, [esp+640-352]
-	packuswb xmm2, xmm7
-	movdqa	 [esp+688-272], xmm2
-	movdqa	xmm2, [esp+640-128]
-	por	xmm2, [esp+640-288]
+    movdqa  xmm3, xmm0
+    pand    xmm3, xmm6
+    movdqa  xmm6, xmm0
+    pandn   xmm6, [esp+640-144]
+    por xmm3, xmm6
+    movdqa  xmm6, [esp+640-544]
+    movdqa   [esp+640-352], xmm3
+    movdqa  xmm3, [esp+640-464]
+    paddw   xmm3, [esp+640-592]
+    paddw   xmm2, xmm3
+    movdqa  xmm3, [esp+640-448]
+    paddw   xmm5, xmm2
+    movdqa  xmm2, [esp+640-496]
+    psraw   xmm5, 3
+    pand    xmm6, xmm5
+    movdqa  xmm5, [esp+640-464]
+    paddw   xmm2, xmm5
+    paddw   xmm5, [esp+640-432]
+    movdqa  xmm4, xmm3
+    paddw   xmm4, xmm3
+    paddw   xmm4, xmm2
+    paddw   xmm4, [esp+640-624]
+    movdqa  xmm2, [esp+640-544]
+    paddw   xmm3, [esp+640-592]
+    psraw   xmm4, 2
+    pandn   xmm2, xmm4
+    por xmm6, xmm2
+    movdqa  xmm7, xmm1
+    pand    xmm7, xmm6
+    movdqa  xmm6, [esp+640-496]
+    movdqa  xmm2, xmm1
+    pandn   xmm2, xmm6
+    por xmm7, xmm2
+    movdqa  xmm2, [esp+640-352]
+    packuswb xmm2, xmm7
+    movdqa   [esp+688-272], xmm2
+    movdqa  xmm2, [esp+640-128]
+    por xmm2, [esp+640-288]
 
-	movdqa	xmm4, xmm0
-	pand	xmm4, xmm2
-	paddw	xmm5, xmm6
-	movdqa	xmm2, xmm0
-	pandn	xmm2, [esp+640-400]
-	por	xmm4, xmm2
-	movdqa	xmm2, [esp+640-528]
-	psllw	xmm5, 1
-	paddw	xmm5, xmm3
-	movdqa	xmm3, [esp+640-560]
-	paddw	xmm2, xmm5
-	psraw	xmm2, 3
-	movdqa	 [esp+640-288], xmm4
-	movdqa	xmm4, [esp+640-560]
-	pand	xmm4, xmm2
-	movdqa	xmm2, [esp+640-464]
-	movdqa	xmm5, xmm2
-	paddw	xmm5, xmm2
-	movdqa	xmm2, [esp+640-432]
-	paddw	xmm2, [esp+640-448]
-	movdqa	xmm7, xmm1
-	paddw	xmm5, xmm2
-	paddw	xmm5, [esp+640-624]
-	movdqa	xmm6, [esp+640-560]
-	psraw	xmm5, 2
-	pandn	xmm3, xmm5
-	por	xmm4, xmm3
-	movdqa	xmm3, [esp+640-32]
-	por	xmm3, [esp+640-304]
-	pand	xmm7, xmm4
-	movdqa	xmm4, [esp+640-432]
-	movdqa	xmm5, [esp+640-464]
-	movdqa	xmm2, xmm1
-	pandn	xmm2, xmm4
-	paddw	xmm4, [esp+640-496]
-	por	xmm7, xmm2
-	movdqa	xmm2, [esp+640-288]
-	packuswb xmm2, xmm7
-	movdqa	 [esp+704-272], xmm2
+    movdqa  xmm4, xmm0
+    pand    xmm4, xmm2
+    paddw   xmm5, xmm6
+    movdqa  xmm2, xmm0
+    pandn   xmm2, [esp+640-400]
+    por xmm4, xmm2
+    movdqa  xmm2, [esp+640-528]
+    psllw   xmm5, 1
+    paddw   xmm5, xmm3
+    movdqa  xmm3, [esp+640-560]
+    paddw   xmm2, xmm5
+    psraw   xmm2, 3
+    movdqa   [esp+640-288], xmm4
+    movdqa  xmm4, [esp+640-560]
+    pand    xmm4, xmm2
+    movdqa  xmm2, [esp+640-464]
+    movdqa  xmm5, xmm2
+    paddw   xmm5, xmm2
+    movdqa  xmm2, [esp+640-432]
+    paddw   xmm2, [esp+640-448]
+    movdqa  xmm7, xmm1
+    paddw   xmm5, xmm2
+    paddw   xmm5, [esp+640-624]
+    movdqa  xmm6, [esp+640-560]
+    psraw   xmm5, 2
+    pandn   xmm3, xmm5
+    por xmm4, xmm3
+    movdqa  xmm3, [esp+640-32]
+    por xmm3, [esp+640-304]
+    pand    xmm7, xmm4
+    movdqa  xmm4, [esp+640-432]
+    movdqa  xmm5, [esp+640-464]
+    movdqa  xmm2, xmm1
+    pandn   xmm2, xmm4
+    paddw   xmm4, [esp+640-496]
+    por xmm7, xmm2
+    movdqa  xmm2, [esp+640-288]
+    packuswb xmm2, xmm7
+    movdqa   [esp+704-272], xmm2
 
-	movdqa	xmm2, xmm0
-	pand	xmm2, xmm3
-	movdqa	xmm3, xmm0
-	pandn	xmm3, [esp+640-384]
-	por	xmm2, xmm3
-	movdqa	 [esp+640-304], xmm2
-	movdqa	xmm2, [esp+640-528]
-	movdqa	xmm3, xmm2
-	paddw	xmm3, [esp+640-464]
-	paddw	xmm3, xmm4
-	paddw	xmm3, [esp+640-624]
-	psraw	xmm3, 2
-	pand	xmm6, xmm3
-	movdqa	xmm3, [esp+640-560]
-	movdqa	xmm4, xmm3
-	pandn	xmm4, xmm5
-	por	xmm6, xmm4
-	movdqa	xmm7, xmm1
-	pand	xmm7, xmm6
-	movdqa	xmm6, [esp+640-304]
-	movdqa	xmm4, xmm1
-	pandn	xmm4, xmm5
-	por	xmm7, xmm4
+    movdqa  xmm2, xmm0
+    pand    xmm2, xmm3
+    movdqa  xmm3, xmm0
+    pandn   xmm3, [esp+640-384]
+    por xmm2, xmm3
+    movdqa   [esp+640-304], xmm2
+    movdqa  xmm2, [esp+640-528]
+    movdqa  xmm3, xmm2
+    paddw   xmm3, [esp+640-464]
+    paddw   xmm3, xmm4
+    paddw   xmm3, [esp+640-624]
+    psraw   xmm3, 2
+    pand    xmm6, xmm3
+    movdqa  xmm3, [esp+640-560]
+    movdqa  xmm4, xmm3
+    pandn   xmm4, xmm5
+    por xmm6, xmm4
+    movdqa  xmm7, xmm1
+    pand    xmm7, xmm6
+    movdqa  xmm6, [esp+640-304]
+    movdqa  xmm4, xmm1
+    pandn   xmm4, xmm5
+    por xmm7, xmm4
 
-	movdqa	xmm4, xmm0
-	pandn	xmm0, [esp+640-416]
-	packuswb xmm6, xmm7
-	movdqa	xmm7, [esp+640-112]
-	por	xmm7, [esp+640-80]
-	pand	xmm4, xmm7
-	por	xmm4, xmm0
-	movdqa	xmm0, [esp+752-272]
-	punpckhbw xmm0, [esp+640-48]
-	psllw	xmm0, 1
-	paddw	xmm0, xmm2
-	paddw	xmm0, xmm2
-	paddw	xmm0, xmm2
-	paddw	xmm0, xmm5
-	paddw	xmm0, [esp+640-432]
-	paddw	xmm0, [esp+640-496]
-	paddw	xmm0, [esp+640-592]
-	psraw	xmm0, 3
-	pand	xmm0, xmm3
-	movdqa	xmm7, xmm1
-	pandn	xmm3, xmm2
-	por	xmm0, xmm3
-	pand	xmm7, xmm0
+    movdqa  xmm4, xmm0
+    pandn   xmm0, [esp+640-416]
+    packuswb xmm6, xmm7
+    movdqa  xmm7, [esp+640-112]
+    por xmm7, [esp+640-80]
+    pand    xmm4, xmm7
+    por xmm4, xmm0
+    movdqa  xmm0, [esp+752-272]
+    punpckhbw xmm0, [esp+640-48]
+    psllw   xmm0, 1
+    paddw   xmm0, xmm2
+    paddw   xmm0, xmm2
+    paddw   xmm0, xmm2
+    paddw   xmm0, xmm5
+    paddw   xmm0, [esp+640-432]
+    paddw   xmm0, [esp+640-496]
+    paddw   xmm0, [esp+640-592]
+    psraw   xmm0, 3
+    pand    xmm0, xmm3
+    movdqa  xmm7, xmm1
+    pandn   xmm3, xmm2
+    por xmm0, xmm3
+    pand    xmm7, xmm0
 
-	movdqa	xmm0, [esp+656-272]
-	movdqa	 [edx], xmm0
+    movdqa  xmm0, [esp+656-272]
+    movdqa   [edx], xmm0
 
-	movdqa	xmm0, [esp+672-272]
+    movdqa  xmm0, [esp+672-272]
 
-	mov	edx, dword [esp+640-596]
-	movdqa	 [esi], xmm0
-	movdqa	xmm0, [esp+688-272]
-	movdqa	 [edi], xmm0
-	movdqa	xmm0, [esp+704-272]
+    mov edx, dword [esp+640-596]
+    movdqa   [esi], xmm0
+    movdqa  xmm0, [esp+688-272]
+    movdqa   [edi], xmm0
+    movdqa  xmm0, [esp+704-272]
 
-	pop	edi
-	pandn	xmm1, xmm2
-	movdqa	 [eax], xmm0
-	por	xmm7, xmm1
-	pop	esi
-	packuswb xmm4, xmm7
-	movdqa	 [edx], xmm6
-	movdqa	 [ecx], xmm4
-	pop	ebx
-	mov	esp, ebp
-	pop	ebp
-	ret
+    pop edi
+    pandn   xmm1, xmm2
+    movdqa   [eax], xmm0
+    por xmm7, xmm1
+    pop esi
+    packuswb xmm4, xmm7
+    movdqa   [edx], xmm6
+    movdqa   [ecx], xmm4
+    pop ebx
+    mov esp, ebp
+    pop ebp
+    ret
 
 %endif
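For reference, the SSE2 block that ends here leans on two idioms: a 16-bit constant is broadcast to all eight word lanes with movd + punpcklwd + pshufd 0, and a per-lane choice between filtered and original samples is built from the pand/pandn/por triple driven by a pcmpgtw mask. A minimal C sketch of both idioms in SSE2 intrinsics (helper names are illustrative):

    #include <emmintrin.h>  /* SSE2 intrinsics */

    /* movd + punpcklwd + pshufd 0: replicate a 16-bit constant across 8 word lanes */
    static inline __m128i broadcast_w(short v) {
        return _mm_set1_epi16(v);
    }

    /* pand/pandn/por: branchless per-lane select, result = (mask & a) | (~mask & b),
     * where mask comes from a compare such as _mm_cmpgt_epi16 (pcmpgtw) */
    static inline __m128i select_w(__m128i mask, __m128i a, __m128i b) {
        return _mm_or_si128(_mm_and_si128(mask, a), _mm_andnot_si128(mask, b));
    }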
 
--- a/codec/common/x86/expand_picture.asm
+++ b/codec/common/x86/expand_picture.asm
@@ -77,280 +77,280 @@
 ;cccc|ceeeeeeeeeeeeeeeed|dddd
 ;cccc|ceeeeeeeeeeeeeeeed|dddd
 
-%macro mov_line_8x4_mmx		3	; dst, stride, mm?
-	movq [%1], %3
-	movq [%1+%2], %3
-	lea %1, [%1+2*%2]
-	movq [%1], %3
-	movq [%1+%2], %3
-	lea %1, [%1+2*%2]
+%macro mov_line_8x4_mmx     3   ; dst, stride, mm?
+    movq [%1], %3
+    movq [%1+%2], %3
+    lea %1, [%1+2*%2]
+    movq [%1], %3
+    movq [%1+%2], %3
+    lea %1, [%1+2*%2]
 %endmacro
 
-%macro mov_line_end8x4_mmx		3	; dst, stride, mm?
-	movq [%1], %3
-	movq [%1+%2], %3
-	lea %1, [%1+2*%2]
-	movq [%1], %3
-	movq [%1+%2], %3
-	lea %1, [%1+%2]
+%macro mov_line_end8x4_mmx      3   ; dst, stride, mm?
+    movq [%1], %3
+    movq [%1+%2], %3
+    lea %1, [%1+2*%2]
+    movq [%1], %3
+    movq [%1+%2], %3
+    lea %1, [%1+%2]
 %endmacro
 
-%macro mov_line_16x4_sse2	4	; dst, stride, xmm?, u/a
-	movdq%4 [%1], %3 		; top(bottom)_0
-	movdq%4 [%1+%2], %3		; top(bottom)_1
-	lea %1, [%1+2*%2]
-	movdq%4 [%1], %3 		; top(bottom)_2
-	movdq%4 [%1+%2], %3		; top(bottom)_3
-	lea %1, [%1+2*%2]
+%macro mov_line_16x4_sse2   4   ; dst, stride, xmm?, u/a
+    movdq%4 [%1], %3        ; top(bottom)_0
+    movdq%4 [%1+%2], %3     ; top(bottom)_1
+    lea %1, [%1+2*%2]
+    movdq%4 [%1], %3        ; top(bottom)_2
+    movdq%4 [%1+%2], %3     ; top(bottom)_3
+    lea %1, [%1+2*%2]
 %endmacro
 
-%macro mov_line_end16x4_sse2	4	; dst, stride, xmm?, u/a
-	movdq%4 [%1], %3 		; top(bottom)_0
-	movdq%4 [%1+%2], %3		; top(bottom)_1
-	lea %1, [%1+2*%2]
-	movdq%4 [%1], %3 		; top(bottom)_2
-	movdq%4 [%1+%2], %3		; top(bottom)_3
-	lea %1, [%1+%2]
+%macro mov_line_end16x4_sse2    4   ; dst, stride, xmm?, u/a
+    movdq%4 [%1], %3        ; top(bottom)_0
+    movdq%4 [%1+%2], %3     ; top(bottom)_1
+    lea %1, [%1+2*%2]
+    movdq%4 [%1], %3        ; top(bottom)_2
+    movdq%4 [%1+%2], %3     ; top(bottom)_3
+    lea %1, [%1+%2]
 %endmacro
 
-%macro mov_line_32x4_sse2	3	; dst, stride, xmm?
-	movdqa [%1], %3 		; top(bottom)_0
-	movdqa [%1+16], %3 		; top(bottom)_0
-	movdqa [%1+%2], %3		; top(bottom)_1
-	movdqa [%1+%2+16], %3		; top(bottom)_1
-	lea %1, [%1+2*%2]
-	movdqa [%1], %3 		; top(bottom)_2
-	movdqa [%1+16], %3 		; top(bottom)_2
-	movdqa [%1+%2], %3		; top(bottom)_3
-	movdqa [%1+%2+16], %3		; top(bottom)_3
-	lea %1, [%1+2*%2]
+%macro mov_line_32x4_sse2   3   ; dst, stride, xmm?
+    movdqa [%1], %3         ; top(bottom)_0
+    movdqa [%1+16], %3      ; top(bottom)_0
+    movdqa [%1+%2], %3      ; top(bottom)_1
+    movdqa [%1+%2+16], %3       ; top(bottom)_1
+    lea %1, [%1+2*%2]
+    movdqa [%1], %3         ; top(bottom)_2
+    movdqa [%1+16], %3      ; top(bottom)_2
+    movdqa [%1+%2], %3      ; top(bottom)_3
+    movdqa [%1+%2+16], %3       ; top(bottom)_3
+    lea %1, [%1+2*%2]
 %endmacro
 
-%macro mov_line_end32x4_sse2	3	; dst, stride, xmm?
-	movdqa [%1], %3 		; top(bottom)_0
-	movdqa [%1+16], %3 		; top(bottom)_0
-	movdqa [%1+%2], %3		; top(bottom)_1
-	movdqa [%1+%2+16], %3		; top(bottom)_1
-	lea %1, [%1+2*%2]
-	movdqa [%1], %3 		; top(bottom)_2
-	movdqa [%1+16], %3 		; top(bottom)_2
-	movdqa [%1+%2], %3		; top(bottom)_3
-	movdqa [%1+%2+16], %3		; top(bottom)_3
-	lea %1, [%1+%2]
+%macro mov_line_end32x4_sse2    3   ; dst, stride, xmm?
+    movdqa [%1], %3         ; top(bottom)_0
+    movdqa [%1+16], %3      ; top(bottom)_0
+    movdqa [%1+%2], %3      ; top(bottom)_1
+    movdqa [%1+%2+16], %3       ; top(bottom)_1
+    lea %1, [%1+2*%2]
+    movdqa [%1], %3         ; top(bottom)_2
+    movdqa [%1+16], %3      ; top(bottom)_2
+    movdqa [%1+%2], %3      ; top(bottom)_3
+    movdqa [%1+%2+16], %3       ; top(bottom)_3
+    lea %1, [%1+%2]
 %endmacro
 
-%macro exp_top_bottom_sse2	1	; iPaddingSize [luma(32)/chroma(16)]
+%macro exp_top_bottom_sse2  1   ; iPaddingSize [luma(32)/chroma(16)]
     ;r2 [width/16(8)]
     ;r0 [pSrc +0], r5 [pSrc -width] r1[-stride], 32(16) ;top
     ;r3 [pSrc +(h-1)*stride], r4 [pSrc + (h+31)*stride],32(16); bottom
 
-%if %1 == 32		; for luma
-	sar r2, 04h 	; width / 16(8) pixels
+%if %1 == 32        ; for luma
+    sar r2, 04h     ; width / 16(8) pixels
 .top_bottom_loops:
-	; top
-	movdqa xmm0, [r0]		; first line of picture pData
-	mov_line_16x4_sse2 r5, r1, xmm0, a	; dst, stride, xmm?
-	mov_line_16x4_sse2 r5, r1, xmm0, a
-	mov_line_16x4_sse2 r5, r1, xmm0, a
-	mov_line_16x4_sse2 r5, r1, xmm0, a
-	mov_line_16x4_sse2 r5, r1, xmm0, a	; dst, stride, xmm?
-	mov_line_16x4_sse2 r5, r1, xmm0, a
-	mov_line_16x4_sse2 r5, r1, xmm0, a
-	mov_line_end16x4_sse2 r5, r1, xmm0, a
+    ; top
+    movdqa xmm0, [r0]       ; first line of picture pData
+    mov_line_16x4_sse2 r5, r1, xmm0, a  ; dst, stride, xmm?
+    mov_line_16x4_sse2 r5, r1, xmm0, a
+    mov_line_16x4_sse2 r5, r1, xmm0, a
+    mov_line_16x4_sse2 r5, r1, xmm0, a
+    mov_line_16x4_sse2 r5, r1, xmm0, a  ; dst, stride, xmm?
+    mov_line_16x4_sse2 r5, r1, xmm0, a
+    mov_line_16x4_sse2 r5, r1, xmm0, a
+    mov_line_end16x4_sse2 r5, r1, xmm0, a
 
-	; bottom
-	movdqa xmm1, [r3] 		; last line of picture pData
-	mov_line_16x4_sse2 r4, r1, xmm1, a	; dst, stride, xmm?
-	mov_line_16x4_sse2 r4, r1, xmm1, a
-	mov_line_16x4_sse2 r4, r1, xmm1, a
-	mov_line_16x4_sse2 r4, r1, xmm1, a
-	mov_line_16x4_sse2 r4, r1, xmm1, a	; dst, stride, xmm?
-	mov_line_16x4_sse2 r4, r1, xmm1, a
-	mov_line_16x4_sse2 r4, r1, xmm1, a
-	mov_line_end16x4_sse2 r4, r1, xmm1, a
+    ; bottom
+    movdqa xmm1, [r3]       ; last line of picture pData
+    mov_line_16x4_sse2 r4, r1, xmm1, a  ; dst, stride, xmm?
+    mov_line_16x4_sse2 r4, r1, xmm1, a
+    mov_line_16x4_sse2 r4, r1, xmm1, a
+    mov_line_16x4_sse2 r4, r1, xmm1, a
+    mov_line_16x4_sse2 r4, r1, xmm1, a  ; dst, stride, xmm?
+    mov_line_16x4_sse2 r4, r1, xmm1, a
+    mov_line_16x4_sse2 r4, r1, xmm1, a
+    mov_line_end16x4_sse2 r4, r1, xmm1, a
 
-	lea r0, [r0+16]		; top pSrc
-	lea r5, [r5+16]		; top dst
-	lea r3, [r3+16]		; bottom pSrc
-	lea r4, [r4+16]		; bottom dst
-	neg r1 			; positive/negative stride need for next loop?
+    lea r0, [r0+16]     ; top pSrc
+    lea r5, [r5+16]     ; top dst
+    lea r3, [r3+16]     ; bottom pSrc
+    lea r4, [r4+16]     ; bottom dst
+    neg r1          ; positive/negative stride need for next loop?
 
-	dec r2
-	jnz near .top_bottom_loops
-%elif %1 == 16	; for chroma ??
-	mov r6, r2
-	sar r2, 04h 	; (width / 16) pixels
+    dec r2
+    jnz near .top_bottom_loops
+%elif %1 == 16  ; for chroma ??
+    mov r6, r2
+    sar r2, 04h     ; (width / 16) pixels
 .top_bottom_loops:
-	; top
-	movdqa xmm0, [r0]		; first line of picture pData
-	mov_line_16x4_sse2 r5, r1, xmm0, a	; dst, stride, xmm?
-	mov_line_16x4_sse2 r5, r1, xmm0, a
-	mov_line_16x4_sse2 r5, r1, xmm0, a
-	mov_line_end16x4_sse2 r5, r1, xmm0, a
+    ; top
+    movdqa xmm0, [r0]       ; first line of picture pData
+    mov_line_16x4_sse2 r5, r1, xmm0, a  ; dst, stride, xmm?
+    mov_line_16x4_sse2 r5, r1, xmm0, a
+    mov_line_16x4_sse2 r5, r1, xmm0, a
+    mov_line_end16x4_sse2 r5, r1, xmm0, a
 
-	; bottom
-	movdqa xmm1, [r3] 		; last line of picture pData
-	mov_line_16x4_sse2 r4, r1, xmm1, a	; dst, stride, xmm?
-	mov_line_16x4_sse2 r4, r1, xmm1, a
-	mov_line_16x4_sse2 r4, r1, xmm1, a
-	mov_line_end16x4_sse2 r4, r1, xmm1, a
+    ; bottom
+    movdqa xmm1, [r3]       ; last line of picture pData
+    mov_line_16x4_sse2 r4, r1, xmm1, a  ; dst, stride, xmm?
+    mov_line_16x4_sse2 r4, r1, xmm1, a
+    mov_line_16x4_sse2 r4, r1, xmm1, a
+    mov_line_end16x4_sse2 r4, r1, xmm1, a
 
-	lea r0, [r0+16]		; top pSrc
-	lea r5, [r5+16]		; top dst
-	lea r3, [r3+16]		; bottom pSrc
-	lea r4, [r4+16]		; bottom dst
-	neg r1 			; positive/negative stride need for next loop?
+    lea r0, [r0+16]     ; top pSrc
+    lea r5, [r5+16]     ; top dst
+    lea r3, [r3+16]     ; bottom pSrc
+    lea r4, [r4+16]     ; bottom dst
+    neg r1          ; positive/negative stride need for next loop?
 
-	dec r2
-	jnz near .top_bottom_loops
+    dec r2
+    jnz near .top_bottom_loops
 
-	; for remaining 8 bytes
-	and r6, 0fh		; any 8 bytes left?
-	test r6, r6
-	jz near .to_be_continued	; no left to exit here
+    ; for remaining 8 bytes
+    and r6, 0fh     ; any 8 bytes left?
+    test r6, r6
+    jz near .to_be_continued    ; no left to exit here
 
-	; top
-	movq mm0, [r0]		; remained 8 byte
-	mov_line_8x4_mmx r5, r1, mm0	; dst, stride, mm?
-	mov_line_8x4_mmx r5, r1, mm0	; dst, stride, mm?
-	mov_line_8x4_mmx r5, r1, mm0	; dst, stride, mm?
-	mov_line_end8x4_mmx r5, r1, mm0	; dst, stride, mm?
-	; bottom
-	movq mm1, [r3]
-	mov_line_8x4_mmx r4, r1, mm1	; dst, stride, mm?
-	mov_line_8x4_mmx r4, r1, mm1	; dst, stride, mm?
-	mov_line_8x4_mmx r4, r1, mm1	; dst, stride, mm?
-	mov_line_end8x4_mmx r4, r1, mm1	; dst, stride, mm?
-	WELSEMMS
+    ; top
+    movq mm0, [r0]      ; remained 8 byte
+    mov_line_8x4_mmx r5, r1, mm0    ; dst, stride, mm?
+    mov_line_8x4_mmx r5, r1, mm0    ; dst, stride, mm?
+    mov_line_8x4_mmx r5, r1, mm0    ; dst, stride, mm?
+    mov_line_end8x4_mmx r5, r1, mm0 ; dst, stride, mm?
+    ; bottom
+    movq mm1, [r3]
+    mov_line_8x4_mmx r4, r1, mm1    ; dst, stride, mm?
+    mov_line_8x4_mmx r4, r1, mm1    ; dst, stride, mm?
+    mov_line_8x4_mmx r4, r1, mm1    ; dst, stride, mm?
+    mov_line_end8x4_mmx r4, r1, mm1 ; dst, stride, mm?
+    WELSEMMS
 
 .to_be_continued:
 %endif
 %endmacro
 
-%macro exp_left_right_sse2	2	; iPaddingSize [luma(32)/chroma(16)], u/a
+%macro exp_left_right_sse2  2   ; iPaddingSize [luma(32)/chroma(16)], u/a
     ;r6 [height]
     ;r0 [pSrc+0]  r5[pSrc-32] r1[stride]
     ;r3 [pSrc+(w-1)] r4[pSrc+w]
 
-%if %1 == 32		; for luma
+%if %1 == 32        ; for luma
 .left_right_loops:
-	; left
-	movzx r2d, byte [r0]		; pixel pData for left border
-	SSE2_Copy16Times	xmm0, r2d				; dst, tmp, pSrc [generic register name: a/b/c/d]
-	movdqa [r5], xmm0
-	movdqa [r5+16], xmm0
+    ; left
+    movzx r2d, byte [r0]        ; pixel pData for left border
+    SSE2_Copy16Times    xmm0, r2d               ; dst, tmp, pSrc [generic register name: a/b/c/d]
+    movdqa [r5], xmm0
+    movdqa [r5+16], xmm0
 
-	; right
-	movzx r2d, byte [r3]
-	SSE2_Copy16Times	xmm1, r2d				; dst, tmp, pSrc [generic register name: a/b/c/d]
-	movdqa [r4], xmm1
-	movdqa [r4+16], xmm1
+    ; right
+    movzx r2d, byte [r3]
+    SSE2_Copy16Times    xmm1, r2d               ; dst, tmp, pSrc [generic register name: a/b/c/d]
+    movdqa [r4], xmm1
+    movdqa [r4+16], xmm1
 
-	lea r0, [r0+r1]		; left pSrc
-	lea r5, [r5+r1]		; left dst
-	lea r3, [r3+r1]		; right pSrc
-	lea r4, [r4+r1]		; right dst
+    lea r0, [r0+r1]     ; left pSrc
+    lea r5, [r5+r1]     ; left dst
+    lea r3, [r3+r1]     ; right pSrc
+    lea r4, [r4+r1]     ; right dst
 
-	dec r6
-	jnz near .left_right_loops
-%elif %1 == 16	; for chroma ??
+    dec r6
+    jnz near .left_right_loops
+%elif %1 == 16  ; for chroma ??
 .left_right_loops:
-	; left
-	movzx r2d, byte [r0]		; pixel pData for left border
-	SSE2_Copy16Times	xmm0, r2d				; dst, tmp, pSrc [generic register name: a/b/c/d]
-	movdqa [r5], xmm0
+    ; left
+    movzx r2d, byte [r0]        ; pixel pData for left border
+    SSE2_Copy16Times    xmm0, r2d               ; dst, tmp, pSrc [generic register name: a/b/c/d]
+    movdqa [r5], xmm0
 
-	; right
-	movzx r2d, byte [r3]
-	SSE2_Copy16Times	xmm1, r2d				; dst, tmp, pSrc [generic register name: a/b/c/d]
-	movdq%2 [r4], xmm1								; might not be aligned 16 bytes in case chroma planes
+    ; right
+    movzx r2d, byte [r3]
+    SSE2_Copy16Times    xmm1, r2d               ; dst, tmp, pSrc [generic register name: a/b/c/d]
+    movdq%2 [r4], xmm1                              ; might not be aligned 16 bytes in case chroma planes
 
-	lea r0, [r0+r1]		; left pSrc
-	lea r5, [r5+r1]		; left dst
-	lea r3, [r3+r1]		; right pSrc
-	lea r4, [r4+r1]		; right dst
+    lea r0, [r0+r1]     ; left pSrc
+    lea r5, [r5+r1]     ; left dst
+    lea r3, [r3+r1]     ; right pSrc
+    lea r4, [r4+r1]     ; right dst
 
-	dec r6
-	jnz near .left_right_loops
+    dec r6
+    jnz near .left_right_loops
 %endif
 %endmacro
 
-%macro exp_cross_sse2	2	; iPaddingSize [luma(32)/chroma(16)], u/a
-	; top-left: (x)mm3, top-right: (x)mm4, bottom-left: (x)mm5, bottom-right: (x)mm6
-	; edi: TL, ebp: TR, eax: BL, ebx: BR, ecx, -stride
+%macro exp_cross_sse2   2   ; iPaddingSize [luma(32)/chroma(16)], u/a
+    ; top-left: (x)mm3, top-right: (x)mm4, bottom-left: (x)mm5, bottom-right: (x)mm6
+    ; edi: TL, ebp: TR, eax: BL, ebx: BR, ecx, -stride
     ;r3:TL ,r4:TR,r5:BL,r6:BR r1:-stride
-%if %1 == 32		; luma
-	; TL
-	mov_line_32x4_sse2	r3, r1, xmm3	; dst, stride, xmm?
-	mov_line_32x4_sse2	r3, r1, xmm3	; dst, stride, xmm?
-	mov_line_32x4_sse2	r3, r1, xmm3	; dst, stride, xmm?
-	mov_line_32x4_sse2	r3, r1, xmm3	; dst, stride, xmm?
-	mov_line_32x4_sse2	r3, r1, xmm3	; dst, stride, xmm?
-	mov_line_32x4_sse2	r3, r1, xmm3	; dst, stride, xmm?
-	mov_line_32x4_sse2	r3, r1, xmm3	; dst, stride, xmm?
-	mov_line_end32x4_sse2	r3, r1, xmm3	; dst, stride, xmm?
+%if %1 == 32        ; luma
+    ; TL
+    mov_line_32x4_sse2  r3, r1, xmm3    ; dst, stride, xmm?
+    mov_line_32x4_sse2  r3, r1, xmm3    ; dst, stride, xmm?
+    mov_line_32x4_sse2  r3, r1, xmm3    ; dst, stride, xmm?
+    mov_line_32x4_sse2  r3, r1, xmm3    ; dst, stride, xmm?
+    mov_line_32x4_sse2  r3, r1, xmm3    ; dst, stride, xmm?
+    mov_line_32x4_sse2  r3, r1, xmm3    ; dst, stride, xmm?
+    mov_line_32x4_sse2  r3, r1, xmm3    ; dst, stride, xmm?
+    mov_line_end32x4_sse2   r3, r1, xmm3    ; dst, stride, xmm?
 
-	; TR
-	mov_line_32x4_sse2	r4, r1, xmm4	; dst, stride, xmm?
-	mov_line_32x4_sse2	r4, r1, xmm4	; dst, stride, xmm?
-	mov_line_32x4_sse2	r4, r1, xmm4	; dst, stride, xmm?
-	mov_line_32x4_sse2	r4, r1, xmm4	; dst, stride, xmm?
-	mov_line_32x4_sse2	r4, r1, xmm4	; dst, stride, xmm?
-	mov_line_32x4_sse2	r4, r1, xmm4	; dst, stride, xmm?
-	mov_line_32x4_sse2	r4, r1, xmm4	; dst, stride, xmm?
-	mov_line_end32x4_sse2	r4, r1, xmm4	; dst, stride, xmm?
+    ; TR
+    mov_line_32x4_sse2  r4, r1, xmm4    ; dst, stride, xmm?
+    mov_line_32x4_sse2  r4, r1, xmm4    ; dst, stride, xmm?
+    mov_line_32x4_sse2  r4, r1, xmm4    ; dst, stride, xmm?
+    mov_line_32x4_sse2  r4, r1, xmm4    ; dst, stride, xmm?
+    mov_line_32x4_sse2  r4, r1, xmm4    ; dst, stride, xmm?
+    mov_line_32x4_sse2  r4, r1, xmm4    ; dst, stride, xmm?
+    mov_line_32x4_sse2  r4, r1, xmm4    ; dst, stride, xmm?
+    mov_line_end32x4_sse2   r4, r1, xmm4    ; dst, stride, xmm?
 
-	; BL
-	mov_line_32x4_sse2	r5, r1, xmm5	; dst, stride, xmm?
-	mov_line_32x4_sse2	r5, r1, xmm5	; dst, stride, xmm?
-	mov_line_32x4_sse2	r5, r1, xmm5	; dst, stride, xmm?
-	mov_line_32x4_sse2	r5, r1, xmm5	; dst, stride, xmm?
-	mov_line_32x4_sse2	r5, r1, xmm5	; dst, stride, xmm?
-	mov_line_32x4_sse2	r5, r1, xmm5	; dst, stride, xmm?
-	mov_line_32x4_sse2	r5, r1, xmm5	; dst, stride, xmm?
-	mov_line_end32x4_sse2	r5, r1, xmm5	; dst, stride, xmm?
+    ; BL
+    mov_line_32x4_sse2  r5, r1, xmm5    ; dst, stride, xmm?
+    mov_line_32x4_sse2  r5, r1, xmm5    ; dst, stride, xmm?
+    mov_line_32x4_sse2  r5, r1, xmm5    ; dst, stride, xmm?
+    mov_line_32x4_sse2  r5, r1, xmm5    ; dst, stride, xmm?
+    mov_line_32x4_sse2  r5, r1, xmm5    ; dst, stride, xmm?
+    mov_line_32x4_sse2  r5, r1, xmm5    ; dst, stride, xmm?
+    mov_line_32x4_sse2  r5, r1, xmm5    ; dst, stride, xmm?
+    mov_line_end32x4_sse2   r5, r1, xmm5    ; dst, stride, xmm?
 
-	; BR
-	mov_line_32x4_sse2	r6, r1, xmm6	; dst, stride, xmm?
-	mov_line_32x4_sse2	r6, r1, xmm6	; dst, stride, xmm?
-	mov_line_32x4_sse2	r6, r1, xmm6	; dst, stride, xmm?
-	mov_line_32x4_sse2	r6, r1, xmm6	; dst, stride, xmm?
-	mov_line_32x4_sse2	r6, r1, xmm6	; dst, stride, xmm?
-	mov_line_32x4_sse2	r6, r1, xmm6	; dst, stride, xmm?
-	mov_line_32x4_sse2	r6, r1, xmm6	; dst, stride, xmm?
-	mov_line_end32x4_sse2	r6, r1, xmm6	; dst, stride, xmm?
-%elif %1 == 16	; chroma
-	; TL
-	mov_line_16x4_sse2	r3, r1, xmm3, a	; dst, stride, xmm?
-	mov_line_16x4_sse2	r3, r1, xmm3, a	; dst, stride, xmm?
-	mov_line_16x4_sse2	r3, r1, xmm3, a	; dst, stride, xmm?
-	mov_line_end16x4_sse2	r3, r1, xmm3, a	; dst, stride, xmm?
+    ; BR
+    mov_line_32x4_sse2  r6, r1, xmm6    ; dst, stride, xmm?
+    mov_line_32x4_sse2  r6, r1, xmm6    ; dst, stride, xmm?
+    mov_line_32x4_sse2  r6, r1, xmm6    ; dst, stride, xmm?
+    mov_line_32x4_sse2  r6, r1, xmm6    ; dst, stride, xmm?
+    mov_line_32x4_sse2  r6, r1, xmm6    ; dst, stride, xmm?
+    mov_line_32x4_sse2  r6, r1, xmm6    ; dst, stride, xmm?
+    mov_line_32x4_sse2  r6, r1, xmm6    ; dst, stride, xmm?
+    mov_line_end32x4_sse2   r6, r1, xmm6    ; dst, stride, xmm?
+%elif %1 == 16  ; chroma
+    ; TL
+    mov_line_16x4_sse2  r3, r1, xmm3, a ; dst, stride, xmm?
+    mov_line_16x4_sse2  r3, r1, xmm3, a ; dst, stride, xmm?
+    mov_line_16x4_sse2  r3, r1, xmm3, a ; dst, stride, xmm?
+    mov_line_end16x4_sse2   r3, r1, xmm3, a ; dst, stride, xmm?
 
-	; TR
-	mov_line_16x4_sse2	r4, r1, xmm4, %2	; dst, stride, xmm?
-	mov_line_16x4_sse2	r4, r1, xmm4, %2	; dst, stride, xmm?
-	mov_line_16x4_sse2	r4, r1, xmm4, %2	; dst, stride, xmm?
-	mov_line_end16x4_sse2 r4, r1, xmm4, %2	; dst, stride, xmm?
+    ; TR
+    mov_line_16x4_sse2  r4, r1, xmm4, %2    ; dst, stride, xmm?
+    mov_line_16x4_sse2  r4, r1, xmm4, %2    ; dst, stride, xmm?
+    mov_line_16x4_sse2  r4, r1, xmm4, %2    ; dst, stride, xmm?
+    mov_line_end16x4_sse2 r4, r1, xmm4, %2  ; dst, stride, xmm?
 
-	; BL
-	mov_line_16x4_sse2	r5, r1, xmm5, a	; dst, stride, xmm?
-	mov_line_16x4_sse2	r5, r1, xmm5, a	; dst, stride, xmm?
-	mov_line_16x4_sse2	r5, r1, xmm5, a	; dst, stride, xmm?
-	mov_line_end16x4_sse2	r5, r1, xmm5, a	; dst, stride, xmm?
+    ; BL
+    mov_line_16x4_sse2  r5, r1, xmm5, a ; dst, stride, xmm?
+    mov_line_16x4_sse2  r5, r1, xmm5, a ; dst, stride, xmm?
+    mov_line_16x4_sse2  r5, r1, xmm5, a ; dst, stride, xmm?
+    mov_line_end16x4_sse2   r5, r1, xmm5, a ; dst, stride, xmm?
 
-	; BR
-	mov_line_16x4_sse2	r6, r1, xmm6, %2	; dst, stride, xmm?
-	mov_line_16x4_sse2	r6, r1, xmm6, %2	; dst, stride, xmm?
-	mov_line_16x4_sse2	r6, r1, xmm6, %2	; dst, stride, xmm?
-	mov_line_end16x4_sse2	r6, r1, xmm6, %2	; dst, stride, xmm?
+    ; BR
+    mov_line_16x4_sse2  r6, r1, xmm6, %2    ; dst, stride, xmm?
+    mov_line_16x4_sse2  r6, r1, xmm6, %2    ; dst, stride, xmm?
+    mov_line_16x4_sse2  r6, r1, xmm6, %2    ; dst, stride, xmm?
+    mov_line_end16x4_sse2   r6, r1, xmm6, %2    ; dst, stride, xmm?
 %endif
 %endmacro
 
 ;***********************************************************************----------------
-; void ExpandPictureLuma_sse2(	uint8_t *pDst,
-;									const int32_t iStride,
-;									const int32_t iWidth,
-;									const int32_t iHeight	);
+; void ExpandPictureLuma_sse2(  uint8_t *pDst,
+;                                   const int32_t iStride,
+;                                   const int32_t iWidth,
+;                                   const int32_t iHeight   );
 ;***********************************************************************----------------
 WELS_EXTERN ExpandPictureLuma_sse2
 
@@ -403,8 +403,8 @@
 
     exp_top_bottom_sse2 32
 
-	; for both left and right border
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+    ; for both left and right border
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
     pop r2
     pop r1
@@ -416,8 +416,8 @@
     lea r4,[r3+1]                           ;right border dst
 
     ;prepare for cross border data: top-rigth with xmm4
-     movzx r6d,byte [r3]                         ;top -rigth
-     SSE2_Copy16Times xmm4,r6d
+    movzx r6d,byte [r3]                         ;top -rigth
+    SSE2_Copy16Times xmm4,r6d
 
     neg r1   ;r1 = stride
 
@@ -438,8 +438,8 @@
     pop r1
     pop r0
 
-	; for cross border [top-left, top-right, bottom-left, bottom-right]
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+    ; for cross border [top-left, top-right, bottom-left, bottom-right]
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
     ; have done xmm3,..,xmm6 cross pData initialization above, perform pading as below, To be continued..
 
     neg r1  ;r1 = -stride
@@ -472,13 +472,13 @@
     %assign push_num 0
 
 
-	ret
+    ret
 
 ;***********************************************************************----------------
-; void ExpandPictureChromaAlign_sse2(	uint8_t *pDst,
-;										const int32_t iStride,
-;										const int32_t iWidth,
-;										const int32_t iHeight	);
+; void ExpandPictureChromaAlign_sse2(   uint8_t *pDst,
+;                                       const int32_t iStride,
+;                                       const int32_t iWidth,
+;                                       const int32_t iHeight   );
 ;***********************************************************************----------------
 WELS_EXTERN ExpandPictureChromaAlign_sse2
 
@@ -531,8 +531,8 @@
 
     exp_top_bottom_sse2 16
 
-	; for both left and right border
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+    ; for both left and right border
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
     pop r2
     pop r1
@@ -557,7 +557,7 @@
     push r0
     push r1
     push r2
-	push r6
+    push r6
     exp_left_right_sse2 16,a
 
     pop r6
@@ -565,8 +565,8 @@
     pop r1
     pop r0
 
-	; for cross border [top-left, top-right, bottom-left, bottom-right]
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+    ; for cross border [top-left, top-right, bottom-left, bottom-right]
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
     ; have done xmm3,..,xmm6 cross pData initialization above, perform pading as below, To be continued..
 
     neg r1  ;r1 = -stride
@@ -599,16 +599,16 @@
     %assign push_num 0
 
 
-	ret
+    ret
 
 ;***********************************************************************----------------
-; void ExpandPictureChromaUnalign_sse2(	uint8_t *pDst,
-;										const int32_t iStride,
-;										const int32_t iWidth,
-;										const int32_t iHeight	);
+; void ExpandPictureChromaUnalign_sse2( uint8_t *pDst,
+;                                       const int32_t iStride,
+;                                       const int32_t iWidth,
+;                                       const int32_t iHeight   );
 ;***********************************************************************----------------
 WELS_EXTERN ExpandPictureChromaUnalign_sse2
-	push r4
+    push r4
     push r5
     push r6
 
@@ -657,8 +657,8 @@
 
     exp_top_bottom_sse2 16
 
-	; for both left and right border
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+    ; for both left and right border
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
     pop r2
     pop r1
@@ -683,7 +683,7 @@
     push r0
     push r1
     push r2
-	push r6
+    push r6
     exp_left_right_sse2 16,u
 
     pop r6
@@ -691,8 +691,8 @@
     pop r1
     pop r0
 
-	; for cross border [top-left, top-right, bottom-left, bottom-right]
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+    ; for cross border [top-left, top-right, bottom-left, bottom-right]
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
     ; have done xmm3,..,xmm6 cross pData initialization above, perform pading as below, To be continued..
 
     neg r1  ;r1 = -stride
@@ -725,4 +725,4 @@
     %assign push_num 0
 
 
-	ret
+    ret
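For reference, the mov_line_* macros and the ExpandPicture* entry points above pad a decoded plane by replicating its border pixels outward, 32 pixels for luma and 16 for chroma, matching the cccc|c...d|dddd diagram near the top of the file. A minimal plain-C sketch of the same edge replication, assuming the buffer already reserves iPadding rows and columns around the plane (helper name is illustrative):

    #include <stdint.h>
    #include <string.h>

    /* Replicate a plane's edge pixels into the surrounding padding. */
    static void ExpandPlaneC(uint8_t *pPic, int32_t iStride,
                             int32_t iWidth, int32_t iHeight, int32_t iPadding) {
        /* left/right borders: repeat the first and last pixel of every row */
        for (int32_t y = 0; y < iHeight; y++) {
            uint8_t *row = pPic + y * iStride;
            memset(row - iPadding, row[0], iPadding);
            memset(row + iWidth, row[iWidth - 1], iPadding);
        }
        /* top/bottom borders (corners included): repeat the expanded first/last rows */
        uint8_t *top    = pPic - iPadding;
        uint8_t *bottom = pPic + (iHeight - 1) * iStride - iPadding;
        for (int32_t y = 1; y <= iPadding; y++) {
            memcpy(top    - y * iStride, top,    iWidth + 2 * iPadding);
            memcpy(bottom + y * iStride, bottom, iWidth + 2 * iPadding);
        }
    }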
--- a/codec/common/x86/mb_copy.asm
+++ b/codec/common/x86/mb_copy.asm
@@ -36,9 +36,9 @@
 ;*
 ;*  History
 ;*      15/09/2009 Created
-;*		12/28/2009 Modified with larger throughput
-;*		12/29/2011 Tuned WelsCopy16x16NotAligned_sse2, added UpdateMbMv_sse2 WelsCopy16x8NotAligned_sse2,
-;*				   WelsCopy16x8_mmx, WelsCopy8x16_mmx etc;
+;*      12/28/2009 Modified with larger throughput
+;*      12/29/2011 Tuned WelsCopy16x16NotAligned_sse2, added UpdateMbMv_sse2 WelsCopy16x8NotAligned_sse2,
+;*                 WelsCopy16x8_mmx, WelsCopy8x16_mmx etc;
 ;*
 ;*
 ;*********************************************************************************************/
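For reference, the WelsCopy* and McCopyWidthEq* routines in this file are all strided block copies; they differ only in block size, in whether MMX or SSE2 registers are used, and in the alignment assumed for the source. A scalar sketch of what each of them computes (function name is illustrative); WelsCopy16x16_sse2 below corresponds to CopyBlockC(Dst, iStrideD, Src, iStrideS, 16, 16):

    #include <stdint.h>
    #include <string.h>

    /* Copy an iWidth x iHeight block between two strided byte buffers. */
    static void CopyBlockC(uint8_t *pDst, int32_t iDstStride,
                           const uint8_t *pSrc, int32_t iSrcStride,
                           int32_t iWidth, int32_t iHeight) {
        for (int32_t y = 0; y < iHeight; y++)
            memcpy(pDst + y * iDstStride, pSrc + y * iSrcStride, iWidth);
    }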
@@ -56,174 +56,174 @@
 
 
 ;***********************************************************************
-; void WelsCopy16x16_sse2(	uint8_t* Dst,
-;							int32_t  iStrideD,
-;							uint8_t* Src,
-;							int32_t  iStrideS )
+; void WelsCopy16x16_sse2(  uint8_t* Dst,
+;                           int32_t  iStrideD,
+;                           uint8_t* Src,
+;                           int32_t  iStrideS )
 ;***********************************************************************
 WELS_EXTERN WelsCopy16x16_sse2
 
-	push r4
-	push r5
-	%assign  push_num 2
+    push r4
+    push r5
+    %assign  push_num 2
     LOAD_4_PARA
     PUSH_XMM 8
 
-	lea r4, [r1+2*r1]	;ebx, [eax+2*eax]	; x3
-	lea r5, [r3+2*r3]	;edx, [ecx+2*ecx]	; x3
+    lea r4, [r1+2*r1]   ;ebx, [eax+2*eax]   ; x3
+    lea r5, [r3+2*r3]   ;edx, [ecx+2*ecx]   ; x3
 
-	movdqa xmm0, [r2]
-	movdqa xmm1, [r2+r3]
-	movdqa xmm2, [r2+2*r3]
-	movdqa xmm3, [r2+r5]
-	lea r2, [r2+4*r3]
-	movdqa xmm4, [r2]
-	movdqa xmm5, [r2+r3]
-	movdqa xmm6, [r2+2*r3]
-	movdqa xmm7, [r2+r5]
-	lea r2, [r2+4*r3]
+    movdqa xmm0, [r2]
+    movdqa xmm1, [r2+r3]
+    movdqa xmm2, [r2+2*r3]
+    movdqa xmm3, [r2+r5]
+    lea r2, [r2+4*r3]
+    movdqa xmm4, [r2]
+    movdqa xmm5, [r2+r3]
+    movdqa xmm6, [r2+2*r3]
+    movdqa xmm7, [r2+r5]
+    lea r2, [r2+4*r3]
 
-	movdqa [r0], xmm0
-	movdqa [r0+r1], xmm1
-	movdqa [r0+2*r1], xmm2
-	movdqa [r0+r4], xmm3
-	lea r0, [r0+4*r1]
-	movdqa [r0], xmm4
-	movdqa [r0+r1], xmm5
-	movdqa [r0+2*r1], xmm6
-	movdqa [r0+r4], xmm7
-	lea r0, [r0+4*r1]
+    movdqa [r0], xmm0
+    movdqa [r0+r1], xmm1
+    movdqa [r0+2*r1], xmm2
+    movdqa [r0+r4], xmm3
+    lea r0, [r0+4*r1]
+    movdqa [r0], xmm4
+    movdqa [r0+r1], xmm5
+    movdqa [r0+2*r1], xmm6
+    movdqa [r0+r4], xmm7
+    lea r0, [r0+4*r1]
 
-	movdqa xmm0, [r2]
-	movdqa xmm1, [r2+r3]
-	movdqa xmm2, [r2+2*r3]
-	movdqa xmm3, [r2+r5]
-	lea r2, [r2+4*r3]
-	movdqa xmm4, [r2]
-	movdqa xmm5, [r2+r3]
-	movdqa xmm6, [r2+2*r3]
-	movdqa xmm7, [r2+r5]
+    movdqa xmm0, [r2]
+    movdqa xmm1, [r2+r3]
+    movdqa xmm2, [r2+2*r3]
+    movdqa xmm3, [r2+r5]
+    lea r2, [r2+4*r3]
+    movdqa xmm4, [r2]
+    movdqa xmm5, [r2+r3]
+    movdqa xmm6, [r2+2*r3]
+    movdqa xmm7, [r2+r5]
 
-	movdqa [r0], xmm0
-	movdqa [r0+r1], xmm1
-	movdqa [r0+2*r1], xmm2
-	movdqa [r0+r4], xmm3
-	lea r0, [r0+4*r1]
-	movdqa [r0], xmm4
-	movdqa [r0+r1], xmm5
-	movdqa [r0+2*r1], xmm6
-	movdqa [r0+r4], xmm7
-	POP_XMM
-	LOAD_4_PARA_POP
-	pop r5
-	pop r4
-	ret
+    movdqa [r0], xmm0
+    movdqa [r0+r1], xmm1
+    movdqa [r0+2*r1], xmm2
+    movdqa [r0+r4], xmm3
+    lea r0, [r0+4*r1]
+    movdqa [r0], xmm4
+    movdqa [r0+r1], xmm5
+    movdqa [r0+2*r1], xmm6
+    movdqa [r0+r4], xmm7
+    POP_XMM
+    LOAD_4_PARA_POP
+    pop r5
+    pop r4
+    ret
 
 ;***********************************************************************
-; void WelsCopy16x16NotAligned_sse2(	uint8_t* Dst,
-;							int32_t  iStrideD,
-;							uint8_t* Src,
-;							int32_t  iStrideS )
+; void WelsCopy16x16NotAligned_sse2(    uint8_t* Dst,
+;                           int32_t  iStrideD,
+;                           uint8_t* Src,
+;                           int32_t  iStrideS )
 ;***********************************************************************
 ; dst can be align with 16 bytes, but not sure about pSrc, 12/29/2011
 WELS_EXTERN WelsCopy16x16NotAligned_sse2
-	push r4
-	push r5
-	%assign  push_num 2
+    push r4
+    push r5
+    %assign  push_num 2
     LOAD_4_PARA
     PUSH_XMM 8
 
-	lea r4, [r1+2*r1]	;ebx, [eax+2*eax]	; x3
-	lea r5, [r3+2*r3]	;edx, [ecx+2*ecx]	; x3
+    lea r4, [r1+2*r1]   ;ebx, [eax+2*eax]   ; x3
+    lea r5, [r3+2*r3]   ;edx, [ecx+2*ecx]   ; x3
 
-	movdqu xmm0, [r2]
-	movdqu xmm1, [r2+r3]
-	movdqu xmm2, [r2+2*r3]
-	movdqu xmm3, [r2+r5]
-	lea r2, [r2+4*r3]
-	movdqu xmm4, [r2]
-	movdqu xmm5, [r2+r3]
-	movdqu xmm6, [r2+2*r3]
-	movdqu xmm7, [r2+r5]
-	lea r2, [r2+4*r3]
+    movdqu xmm0, [r2]
+    movdqu xmm1, [r2+r3]
+    movdqu xmm2, [r2+2*r3]
+    movdqu xmm3, [r2+r5]
+    lea r2, [r2+4*r3]
+    movdqu xmm4, [r2]
+    movdqu xmm5, [r2+r3]
+    movdqu xmm6, [r2+2*r3]
+    movdqu xmm7, [r2+r5]
+    lea r2, [r2+4*r3]
 
-	movdqa [r0], xmm0
-	movdqa [r0+r1], xmm1
-	movdqa [r0+2*r1], xmm2
-	movdqa [r0+r4], xmm3
-	lea r0, [r0+4*r1]
-	movdqa [r0], xmm4
-	movdqa [r0+r1], xmm5
-	movdqa [r0+2*r1], xmm6
-	movdqa [r0+r4], xmm7
-	lea r0, [r0+4*r1]
+    movdqa [r0], xmm0
+    movdqa [r0+r1], xmm1
+    movdqa [r0+2*r1], xmm2
+    movdqa [r0+r4], xmm3
+    lea r0, [r0+4*r1]
+    movdqa [r0], xmm4
+    movdqa [r0+r1], xmm5
+    movdqa [r0+2*r1], xmm6
+    movdqa [r0+r4], xmm7
+    lea r0, [r0+4*r1]
 
-	movdqu xmm0, [r2]
-	movdqu xmm1, [r2+r3]
-	movdqu xmm2, [r2+2*r3]
-	movdqu xmm3, [r2+r5]
-	lea r2, [r2+4*r3]
-	movdqu xmm4, [r2]
-	movdqu xmm5, [r2+r3]
-	movdqu xmm6, [r2+2*r3]
-	movdqu xmm7, [r2+r5]
+    movdqu xmm0, [r2]
+    movdqu xmm1, [r2+r3]
+    movdqu xmm2, [r2+2*r3]
+    movdqu xmm3, [r2+r5]
+    lea r2, [r2+4*r3]
+    movdqu xmm4, [r2]
+    movdqu xmm5, [r2+r3]
+    movdqu xmm6, [r2+2*r3]
+    movdqu xmm7, [r2+r5]
 
-	movdqa [r0], xmm0
-	movdqa [r0+r1], xmm1
-	movdqa [r0+2*r1], xmm2
-	movdqa [r0+r4], xmm3
-	lea r0, [r0+4*r1]
-	movdqa [r0], xmm4
-	movdqa [r0+r1], xmm5
-	movdqa [r0+2*r1], xmm6
-	movdqa [r0+r4], xmm7
-	POP_XMM
-	LOAD_4_PARA_POP
-	pop r5
-	pop r4
-	ret
+    movdqa [r0], xmm0
+    movdqa [r0+r1], xmm1
+    movdqa [r0+2*r1], xmm2
+    movdqa [r0+r4], xmm3
+    lea r0, [r0+4*r1]
+    movdqa [r0], xmm4
+    movdqa [r0+r1], xmm5
+    movdqa [r0+2*r1], xmm6
+    movdqa [r0+r4], xmm7
+    POP_XMM
+    LOAD_4_PARA_POP
+    pop r5
+    pop r4
+    ret
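For reference, the two 16x16 copies above differ only in their loads: WelsCopy16x16_sse2 uses movdqa and so requires a 16-byte-aligned source, while WelsCopy16x16NotAligned_sse2 switches to movdqu; both keep aligned movdqa stores because the destination is documented as aligned. A minimal intrinsics sketch of that split (helper name is illustrative):

    #include <emmintrin.h>
    #include <stdint.h>

    /* One 16-byte row: aligned or unaligned load, always an aligned store. */
    static inline void CopyRow16(uint8_t *pDst, const uint8_t *pSrc, int bSrcAligned) {
        __m128i v = bSrcAligned ? _mm_load_si128((const __m128i *)pSrc)
                                : _mm_loadu_si128((const __m128i *)pSrc);
        _mm_store_si128((__m128i *)pDst, v);
    }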
 
 ; , 12/29/2011
 ;***********************************************************************
 ; void WelsCopy16x8NotAligned_sse2(uint8_t* Dst,
-;							int32_t  iStrideD,
-;							uint8_t* Src,
-;							int32_t  iStrideS )
+;                           int32_t  iStrideD,
+;                           uint8_t* Src,
+;                           int32_t  iStrideS )
 ;***********************************************************************
 WELS_EXTERN WelsCopy16x8NotAligned_sse2
-	push r4
-	push r5
-	%assign  push_num 2
+    push r4
+    push r5
+    %assign  push_num 2
     LOAD_4_PARA
     PUSH_XMM 8
 
-	lea r4, [r1+2*r1]	;ebx, [eax+2*eax]	; x3
-	lea r5, [r3+2*r3]	;edx, [ecx+2*ecx]	; x3
+    lea r4, [r1+2*r1]   ;ebx, [eax+2*eax]   ; x3
+    lea r5, [r3+2*r3]   ;edx, [ecx+2*ecx]   ; x3
 
-	movdqu xmm0, [r2]
-	movdqu xmm1, [r2+r3]
-	movdqu xmm2, [r2+2*r3]
-	movdqu xmm3, [r2+r5]
-	lea r2, [r2+4*r3]
-	movdqu xmm4, [r2]
-	movdqu xmm5, [r2+r3]
-	movdqu xmm6, [r2+2*r3]
-	movdqu xmm7, [r2+r5]
+    movdqu xmm0, [r2]
+    movdqu xmm1, [r2+r3]
+    movdqu xmm2, [r2+2*r3]
+    movdqu xmm3, [r2+r5]
+    lea r2, [r2+4*r3]
+    movdqu xmm4, [r2]
+    movdqu xmm5, [r2+r3]
+    movdqu xmm6, [r2+2*r3]
+    movdqu xmm7, [r2+r5]
 
-	movdqa [r0], xmm0
-	movdqa [r0+r1], xmm1
-	movdqa [r0+2*r1], xmm2
-	movdqa [r0+r4], xmm3
-	lea r0, [r0+4*r1]
-	movdqa [r0], xmm4
-	movdqa [r0+r1], xmm5
-	movdqa [r0+2*r1], xmm6
-	movdqa [r0+r4], xmm7
-	POP_XMM
-	LOAD_4_PARA_POP
-	pop r5
-	pop r4
-	ret
+    movdqa [r0], xmm0
+    movdqa [r0+r1], xmm1
+    movdqa [r0+2*r1], xmm2
+    movdqa [r0+r4], xmm3
+    lea r0, [r0+4*r1]
+    movdqa [r0], xmm4
+    movdqa [r0+r1], xmm5
+    movdqa [r0+2*r1], xmm6
+    movdqa [r0+r4], xmm7
+    POP_XMM
+    LOAD_4_PARA_POP
+    pop r5
+    pop r4
+    ret
 
 
 ;***********************************************************************
@@ -233,62 +233,62 @@
 ;                       int32_t  iStrideS )
 ;***********************************************************************
 WELS_EXTERN WelsCopy8x16_mmx
-	%assign  push_num 0
+    %assign  push_num 0
     LOAD_4_PARA
 
-	movq mm0, [r2]
-	movq mm1, [r2+r3]
-	lea r2, [r2+2*r3]
-	movq mm2, [r2]
-	movq mm3, [r2+r3]
-	lea r2, [r2+2*r3]
-	movq mm4, [r2]
-	movq mm5, [r2+r3]
-	lea r2, [r2+2*r3]
-	movq mm6, [r2]
-	movq mm7, [r2+r3]
-	lea r2, [r2+2*r3]
+    movq mm0, [r2]
+    movq mm1, [r2+r3]
+    lea r2, [r2+2*r3]
+    movq mm2, [r2]
+    movq mm3, [r2+r3]
+    lea r2, [r2+2*r3]
+    movq mm4, [r2]
+    movq mm5, [r2+r3]
+    lea r2, [r2+2*r3]
+    movq mm6, [r2]
+    movq mm7, [r2+r3]
+    lea r2, [r2+2*r3]
 
-	movq [r0], mm0
-	movq [r0+r1], mm1
-	lea r0, [r0+2*r1]
-	movq [r0], mm2
-	movq [r0+r1], mm3
-	lea r0, [r0+2*r1]
-	movq [r0], mm4
-	movq [r0+r1], mm5
-	lea r0, [r0+2*r1]
-	movq [r0], mm6
-	movq [r0+r1], mm7
-	lea r0, [r0+2*r1]
+    movq [r0], mm0
+    movq [r0+r1], mm1
+    lea r0, [r0+2*r1]
+    movq [r0], mm2
+    movq [r0+r1], mm3
+    lea r0, [r0+2*r1]
+    movq [r0], mm4
+    movq [r0+r1], mm5
+    lea r0, [r0+2*r1]
+    movq [r0], mm6
+    movq [r0+r1], mm7
+    lea r0, [r0+2*r1]
 
-	movq mm0, [r2]
-	movq mm1, [r2+r3]
-	lea r2, [r2+2*r3]
-	movq mm2, [r2]
-	movq mm3, [r2+r3]
-	lea r2, [r2+2*r3]
-	movq mm4, [r2]
-	movq mm5, [r2+r3]
-	lea r2, [r2+2*r3]
-	movq mm6, [r2]
-	movq mm7, [r2+r3]
+    movq mm0, [r2]
+    movq mm1, [r2+r3]
+    lea r2, [r2+2*r3]
+    movq mm2, [r2]
+    movq mm3, [r2+r3]
+    lea r2, [r2+2*r3]
+    movq mm4, [r2]
+    movq mm5, [r2+r3]
+    lea r2, [r2+2*r3]
+    movq mm6, [r2]
+    movq mm7, [r2+r3]
 
-	movq [r0], mm0
-	movq [r0+r1], mm1
-	lea r0, [r0+2*r1]
-	movq [r0], mm2
-	movq [r0+r1], mm3
-	lea r0, [r0+2*r1]
-	movq [r0], mm4
-	movq [r0+r1], mm5
-	lea r0, [r0+2*r1]
-	movq [r0], mm6
-	movq [r0+r1], mm7
+    movq [r0], mm0
+    movq [r0+r1], mm1
+    lea r0, [r0+2*r1]
+    movq [r0], mm2
+    movq [r0+r1], mm3
+    lea r0, [r0+2*r1]
+    movq [r0], mm4
+    movq [r0+r1], mm5
+    lea r0, [r0+2*r1]
+    movq [r0], mm6
+    movq [r0+r1], mm7
 
-	WELSEMMS
-	LOAD_4_PARA_POP
-	ret
+    WELSEMMS
+    LOAD_4_PARA_POP
+    ret
 
 ;***********************************************************************
 ; void WelsCopy8x8_mmx(  uint8_t* Dst,
@@ -297,48 +297,48 @@
 ;                        int32_t  iStrideS )
 ;***********************************************************************
 WELS_EXTERN WelsCopy8x8_mmx
-	push r4
-	%assign  push_num 1
+    push r4
+    %assign  push_num 1
     LOAD_4_PARA
-	lea r4, [r3+2*r3]	;edx, [ebx+2*ebx]
+    lea r4, [r3+2*r3]   ;edx, [ebx+2*ebx]
 
-	; to prefetch next loop
-	prefetchnta [r2+2*r3]
-	prefetchnta [r2+r4]
-	movq mm0, [r2]
-	movq mm1, [r2+r3]
-	lea r2, [r2+2*r3]
-	; to prefetch next loop
-	prefetchnta [r2+2*r3]
-	prefetchnta [r2+r4]
-	movq mm2, [r2]
-	movq mm3, [r2+r3]
-	lea r2, [r2+2*r3]
-	; to prefetch next loop
-	prefetchnta [r2+2*r3]
-	prefetchnta [r2+r4]
-	movq mm4, [r2]
-	movq mm5, [r2+r3]
-	lea r2, [r2+2*r3]
-	movq mm6, [r2]
-	movq mm7, [r2+r3]
+    ; to prefetch next loop
+    prefetchnta [r2+2*r3]
+    prefetchnta [r2+r4]
+    movq mm0, [r2]
+    movq mm1, [r2+r3]
+    lea r2, [r2+2*r3]
+    ; to prefetch next loop
+    prefetchnta [r2+2*r3]
+    prefetchnta [r2+r4]
+    movq mm2, [r2]
+    movq mm3, [r2+r3]
+    lea r2, [r2+2*r3]
+    ; to prefetch next loop
+    prefetchnta [r2+2*r3]
+    prefetchnta [r2+r4]
+    movq mm4, [r2]
+    movq mm5, [r2+r3]
+    lea r2, [r2+2*r3]
+    movq mm6, [r2]
+    movq mm7, [r2+r3]
 
-	movq [r0], mm0
-	movq [r0+r1], mm1
-	lea r0, [r0+2*r1]
-	movq [r0], mm2
-	movq [r0+r1], mm3
-	lea r0, [r0+2*r1]
-	movq [r0], mm4
-	movq [r0+r1], mm5
-	lea r0, [r0+2*r1]
-	movq [r0], mm6
-	movq [r0+r1], mm7
+    movq [r0], mm0
+    movq [r0+r1], mm1
+    lea r0, [r0+2*r1]
+    movq [r0], mm2
+    movq [r0+r1], mm3
+    lea r0, [r0+2*r1]
+    movq [r0], mm4
+    movq [r0+r1], mm5
+    lea r0, [r0+2*r1]
+    movq [r0], mm6
+    movq [r0+r1], mm7
 
-	WELSEMMS
-	LOAD_4_PARA_POP
-	pop r4
-	ret
+    WELSEMMS
+    LOAD_4_PARA_POP
+    pop r4
+    ret
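For reference, WelsCopy8x8_mmx above issues prefetchnta hints for the next iteration's source rows while the current rows are being copied; the non-temporal hint keeps once-read data from displacing useful cache lines. A minimal intrinsics sketch of the same hint (helper name is illustrative):

    #include <xmmintrin.h>
    #include <stdint.h>

    /* Mirror of the "prefetchnta [r2+2*r3]" / "prefetchnta [r2+r4]" pair:
     * hint the next two source rows with minimal cache pollution. */
    static inline void PrefetchNextRows(const uint8_t *pSrc, int32_t iStride) {
        _mm_prefetch((const char *)(pSrc + 2 * iStride), _MM_HINT_NTA);
        _mm_prefetch((const char *)(pSrc + 3 * iStride), _MM_HINT_NTA);
    }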
 
 ; (dunhuang@cisco), 12/21/2011
 ;***********************************************************************
@@ -349,13 +349,13 @@
     %assign  push_num 0
     LOAD_2_PARA
 
-	movd xmm0, r1d	; _mv
-	pshufd xmm1, xmm0, $00
-	movdqa [r0     ], xmm1
-	movdqa [r0+0x10], xmm1
-	movdqa [r0+0x20], xmm1
-	movdqa [r0+0x30], xmm1
-	ret
+    movd xmm0, r1d  ; _mv
+    pshufd xmm1, xmm0, $00
+    movdqa [r0     ], xmm1
+    movdqa [r0+0x10], xmm1
+    movdqa [r0+0x20], xmm1
+    movdqa [r0+0x30], xmm1
+    ret
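For reference, the short routine above (UpdateMbMv_sse2, per the history note in the file header) broadcasts the 32-bit motion-vector pair passed in r1d across a 64-byte block using movd + pshufd $00 and four aligned stores. A minimal C sketch, assuming a 16-byte-aligned destination (function name is illustrative):

    #include <emmintrin.h>
    #include <stdint.h>

    /* Replicate one 32-bit value (an MV pair) across a 64-byte aligned block. */
    static void FillMv64(void *pDst, uint32_t uiMv) {
        __m128i v = _mm_set1_epi32((int32_t)uiMv);  /* movd + pshufd $00 */
        __m128i *p = (__m128i *)pDst;
        _mm_store_si128(p + 0, v);
        _mm_store_si128(p + 1, v);
        _mm_store_si128(p + 2, v);
        _mm_store_si128(p + 3, v);
    }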
 
 ;*******************************************************************************
 ; Macros and other preprocessor constants
@@ -381,14 +381,14 @@
     %assign  push_num 0
     LOAD_7_PARA
 
-	SIGN_EXTENSION	r1, r1d
-	SIGN_EXTENSION	r3, r3d
-	SIGN_EXTENSION	r5, r5d
-	SIGN_EXTENSION	r6, r6d
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
+    SIGN_EXTENSION  r5, r5d
+    SIGN_EXTENSION  r6, r6d
 
 ALIGN 4
 .height_loop:
-	movd        mm0, [r4]
+    movd        mm0, [r4]
     pavgb       mm0, [r2]
     movd        [r0], mm0
 
@@ -398,8 +398,8 @@
     lea         r4, [r4+r5]
     jne         .height_loop
 
-	WELSEMMS
-	LOAD_7_PARA_POP
+    WELSEMMS
+    LOAD_7_PARA_POP
     ret
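For reference, this routine and the wider variants that follow blend two strided source blocks into the destination with pavgb, a per-byte rounded average. A scalar sketch of the same operation (function name is illustrative):

    #include <stdint.h>

    /* Per-byte rounded mean of two strided blocks: (a + b + 1) >> 1. */
    static void PixelAvgC(uint8_t *pDst, int32_t iDstStride,
                          const uint8_t *pSrcA, int32_t iStrideA,
                          const uint8_t *pSrcB, int32_t iStrideB,
                          int32_t iWidth, int32_t iHeight) {
        for (int32_t y = 0; y < iHeight; y++)
            for (int32_t x = 0; x < iWidth; x++)
                pDst[y * iDstStride + x] = (uint8_t)
                    ((pSrcA[y * iStrideA + x] + pSrcB[y * iStrideB + x] + 1) >> 1);
    }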
 
 
@@ -413,29 +413,29 @@
     %assign  push_num 0
     LOAD_7_PARA
 
-	SIGN_EXTENSION	r1, r1d
-	SIGN_EXTENSION	r3, r3d
-	SIGN_EXTENSION	r5, r5d
-	SIGN_EXTENSION	r6, r6d
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
+    SIGN_EXTENSION  r5, r5d
+    SIGN_EXTENSION  r6, r6d
 
 ALIGN 4
 .height_loop:
-	movq        mm0, [r2]
+    movq        mm0, [r2]
     pavgb       mm0, [r4]
     movq        [r0], mm0
     movq        mm0, [r2+r3]
     pavgb       mm0, [r4+r5]
-    movq		[r0+r1], mm0
+    movq        [r0+r1], mm0
 
-    lea			r2,  [r2+2*r3]
-    lea			r4,  [r4+2*r5]
-    lea			r0,  [r0+2*r1]
+    lea         r2,  [r2+2*r3]
+    lea         r4,  [r4+2*r5]
+    lea         r0,  [r0+2*r1]
 
     sub         r6, 2
     jnz         .height_loop
 
-	WELSEMMS
-	LOAD_7_PARA_POP
+    WELSEMMS
+    LOAD_7_PARA_POP
     ret
 
 
@@ -450,46 +450,46 @@
 
     %assign  push_num 0
     LOAD_7_PARA
-	SIGN_EXTENSION	r1, r1d
-	SIGN_EXTENSION	r3, r3d
-	SIGN_EXTENSION	r5, r5d
-	SIGN_EXTENSION	r6, r6d
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
+    SIGN_EXTENSION  r5, r5d
+    SIGN_EXTENSION  r6, r6d
 ALIGN 4
 .height_loop:
-	movdqu      xmm0, [r2]
-	movdqu	    xmm1, [r4]
-	pavgb	    xmm0, xmm1
-	;pavgb       xmm0, [r4]
+    movdqu      xmm0, [r2]
+    movdqu      xmm1, [r4]
+    pavgb       xmm0, xmm1
+    ;pavgb       xmm0, [r4]
     movdqu      [r0], xmm0
 
-	movdqu      xmm0, [r2+r3]
-	movdqu      xmm1, [r4+r5]
-	pavgb	    xmm0, xmm1
+    movdqu      xmm0, [r2+r3]
+    movdqu      xmm1, [r4+r5]
+    pavgb       xmm0, xmm1
     movdqu      [r0+r1], xmm0
 
-	movdqu      xmm0, [r2+2*r3]
-	movdqu       xmm1, [r4+2*r5]
-	pavgb	    xmm0, xmm1
+    movdqu      xmm0, [r2+2*r3]
+    movdqu       xmm1, [r4+2*r5]
+    pavgb       xmm0, xmm1
     movdqu      [r0+2*r1], xmm0
 
     lea         r2, [r2+2*r3]
-    lea			r4, [r4+2*r5]
-    lea			r0, [r0+2*r1]
+    lea         r4, [r4+2*r5]
+    lea         r0, [r0+2*r1]
 
-	movdqu      xmm0, [r2+r3]
-	movdqu      xmm1, [r4+r5]
-	pavgb	    xmm0, xmm1
+    movdqu      xmm0, [r2+r3]
+    movdqu      xmm1, [r4+r5]
+    pavgb       xmm0, xmm1
     movdqu      [r0+r1], xmm0
 
     lea         r2, [r2+2*r3]
-    lea			r4, [r4+2*r5]
-    lea			r0, [r0+2*r1]
+    lea         r4, [r4+2*r5]
+    lea         r0, [r0+2*r1]
 
     sub         r6, 4
     jne         .height_loop
 
-	WELSEMMS
-	LOAD_7_PARA_POP
+    WELSEMMS
+    LOAD_7_PARA_POP
     ret
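
The PixelAvg routines above (widths 4, 8 and 16) blend two predictions with the byte-wise rounding average that pavgb provides: dst = (a + b + 1) >> 1. Below is a minimal scalar sketch of the same operation in C; the function name and the generic width parameter are illustrative only, and the argument order (destination first, then the two sources, then height) is inferred from the register usage in the assembly.

    #include <stdint.h>

    /* Rounding average of two predictions, as pavgb computes per byte. */
    static void PixelAvg_c (uint8_t* pDst, int32_t iDstStride,
                            const uint8_t* pSrcA, int32_t iStrideA,
                            const uint8_t* pSrcB, int32_t iStrideB,
                            int32_t iWidth, int32_t iHeight) {
      for (int32_t y = 0; y < iHeight; y++) {
        for (int32_t x = 0; x < iWidth; x++)
          pDst[x] = (uint8_t) ((pSrcA[x] + pSrcB[x] + 1) >> 1);
        pDst  += iDstStride;
        pSrcA += iStrideA;
        pSrcB += iStrideB;
      }
    }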
 
 ;*******************************************************************************
@@ -497,26 +497,26 @@
 ;                          uint8_t *pDst, int iDstStride, int iHeight )
 ;*******************************************************************************
 WELS_EXTERN McCopyWidthEq4_mmx
-    push	r5
+    push    r5
     %assign  push_num 1
     LOAD_5_PARA
 
-	SIGN_EXTENSION	r1, r1d
-	SIGN_EXTENSION	r3, r3d
-	SIGN_EXTENSION	r4, r4d
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
+    SIGN_EXTENSION  r4, r4d
 
 ALIGN 4
 .height_loop:
-	mov r5d, [r0]
-	mov [r2], r5d
+    mov r5d, [r0]
+    mov [r2], r5d
 
-	add r0, r1
-	add r2, r3
-	dec r4
-	jnz .height_loop
-	WELSEMMS
+    add r0, r1
+    add r2, r3
+    dec r4
+    jnz .height_loop
+    WELSEMMS
     LOAD_5_PARA_POP
-    pop	   r5
+    pop    r5
     ret
 
 ;*******************************************************************************
@@ -527,21 +527,21 @@
     %assign  push_num 0
     LOAD_5_PARA
 
-	SIGN_EXTENSION	r1, r1d
-	SIGN_EXTENSION	r3, r3d
-	SIGN_EXTENSION	r4, r4d
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
+    SIGN_EXTENSION  r4, r4d
 
 ALIGN 4
 .height_loop:
-	movq mm0, [r0]
-	movq [r2], mm0
-	add r0, r1
-	add r2, r3
-	dec r4
-	jnz .height_loop
+    movq mm0, [r0]
+    movq [r2], mm0
+    add r0, r1
+    add r2, r3
+    dec r4
+    jnz .height_loop
 
-	WELSEMMS
-	LOAD_5_PARA_POP
+    WELSEMMS
+    LOAD_5_PARA_POP
     ret
 
 
@@ -550,32 +550,32 @@
 ;*******************************************************************************
 ;read unaligned memory
 %macro SSE_READ_UNA 2
-	movq	%1, [%2]
-	movhps	%1,	[%2+8]
+    movq    %1, [%2]
+    movhps  %1, [%2+8]
 %endmacro
 
 ;write unaligned memory
 %macro SSE_WRITE_UNA 2
-	movq	[%1],	%2
-	movhps	[%1+8], %2
+    movq    [%1],   %2
+    movhps  [%1+8], %2
 %endmacro
 WELS_EXTERN McCopyWidthEq16_sse2
     %assign  push_num 0
     LOAD_5_PARA
-	SIGN_EXTENSION	r1, r1d
-	SIGN_EXTENSION	r3, r3d
-	SIGN_EXTENSION	r4, r4d
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
+    SIGN_EXTENSION  r4, r4d
 ALIGN 4
 .height_loop:
-    SSE_READ_UNA	xmm0, r0
-    SSE_READ_UNA	xmm1, r0+r1
-    SSE_WRITE_UNA	r2, xmm0
-    SSE_WRITE_UNA	r2+r3, xmm1
+    SSE_READ_UNA    xmm0, r0
+    SSE_READ_UNA    xmm1, r0+r1
+    SSE_WRITE_UNA   r2, xmm0
+    SSE_WRITE_UNA   r2+r3, xmm1
 
-	sub		r4,	2
+    sub     r4, 2
     lea     r0, [r0+r1*2]
     lea     r2, [r2+r3*2]
     jnz     .height_loop
 
-	LOAD_5_PARA_POP
+    LOAD_5_PARA_POP
     ret
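
The McCopyWidthEq4/8/16 routines above are plain block copies; the assembly variants differ only in the widest load/store they use for each width (a 32-bit mov, an MMX movq, or the unaligned 16-byte SSE_READ_UNA/SSE_WRITE_UNA pair). A width-generic scalar sketch, with an illustrative name:

    #include <stdint.h>
    #include <string.h>

    /* Row-by-row block copy equivalent to the fixed-width assembly versions. */
    static void McCopyWidth_c (const uint8_t* pSrc, int32_t iSrcStride,
                               uint8_t* pDst, int32_t iDstStride,
                               int32_t iWidth, int32_t iHeight) {
      while (iHeight-- > 0) {
        memcpy (pDst, pSrc, (size_t) iWidth);
        pSrc += iSrcStride;
        pDst += iDstStride;
      }
    }
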
--- a/codec/common/x86/mc_chroma.asm
+++ b/codec/common/x86/mc_chroma.asm
@@ -53,10 +53,10 @@
 
 ALIGN 16
 h264_d0x20_sse2:
-	dw 32,32,32,32,32,32,32,32
+    dw 32,32,32,32,32,32,32,32
 ALIGN 16
 h264_d0x20_mmx:
-	dw 32,32,32,32
+    dw 32,32,32,32
 
 
 ;=============================================================================
@@ -67,152 +67,152 @@
 
 ;*******************************************************************************
 ; void McChromaWidthEq4_mmx( const uint8_t *src,
-;							int32_t iSrcStride,
-;							uint8_t *pDst,
-;							int32_t iDstStride,
-;							const uint8_t *pABCD,
-;							int32_t iHeigh );
+;                           int32_t iSrcStride,
+;                           uint8_t *pDst,
+;                           int32_t iDstStride,
+;                           const uint8_t *pABCD,
+;                           int32_t iHeigh );
 ;*******************************************************************************
 WELS_EXTERN McChromaWidthEq4_mmx
-	%assign  push_num 0
-	LOAD_6_PARA
-	SIGN_EXTENSION	r1, r1d
-	SIGN_EXTENSION	r3, r3d
-	SIGN_EXTENSION	r5, r5d
+    %assign  push_num 0
+    LOAD_6_PARA
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
+    SIGN_EXTENSION  r5, r5d
 
-	movd mm3, [r4];	[eax]
-	WELS_Zero mm7
-	punpcklbw mm3, mm3
-	movq      mm4, mm3
-	punpcklwd mm3, mm3
-	punpckhwd mm4, mm4
+    movd mm3, [r4]; [eax]
+    WELS_Zero mm7
+    punpcklbw mm3, mm3
+    movq      mm4, mm3
+    punpcklwd mm3, mm3
+    punpckhwd mm4, mm4
 
-	movq	  mm5, mm3
-	punpcklbw mm3, mm7
-	punpckhbw mm5, mm7
+    movq      mm5, mm3
+    punpcklbw mm3, mm7
+    punpckhbw mm5, mm7
 
-	movq	  mm6, mm4
-	punpcklbw mm4, mm7
-	punpckhbw mm6, mm7
+    movq      mm6, mm4
+    punpcklbw mm4, mm7
+    punpckhbw mm6, mm7
 
-	lea r4, [r0 + r1] ;lea ebx, [esi + eax]
-	movd mm0, [r0]
-	movd mm1, [r0+1]
-	punpcklbw mm0, mm7
-	punpcklbw mm1, mm7
+    lea r4, [r0 + r1] ;lea ebx, [esi + eax]
+    movd mm0, [r0]
+    movd mm1, [r0+1]
+    punpcklbw mm0, mm7
+    punpcklbw mm1, mm7
 .xloop:
 
-	pmullw mm0, mm3
-	pmullw mm1, mm5
-	paddw  mm0, mm1
+    pmullw mm0, mm3
+    pmullw mm1, mm5
+    paddw  mm0, mm1
 
-	movd  mm1, [r4]
-	punpcklbw mm1, mm7
-	movq mm2, mm1
-	pmullw mm1, mm4
-	paddw mm0, mm1
+    movd  mm1, [r4]
+    punpcklbw mm1, mm7
+    movq mm2, mm1
+    pmullw mm1, mm4
+    paddw mm0, mm1
 
-	movd mm1, [r4+1]
-	punpcklbw mm1, mm7
-	movq mm7, mm1
-	pmullw mm1,mm6
-	paddw mm0, mm1
-	movq mm1,mm7
+    movd mm1, [r4+1]
+    punpcklbw mm1, mm7
+    movq mm7, mm1
+    pmullw mm1,mm6
+    paddw mm0, mm1
+    movq mm1,mm7
 
-	paddw mm0, [h264_d0x20_mmx]
-	psrlw mm0, 6
+    paddw mm0, [h264_d0x20_mmx]
+    psrlw mm0, 6
 
-	WELS_Zero mm7
-	packuswb mm0, mm7
-	movd [r2], mm0
+    WELS_Zero mm7
+    packuswb mm0, mm7
+    movd [r2], mm0
 
-	movq mm0, mm2
+    movq mm0, mm2
 
-	lea r2, [r2 + r3]
-	lea r4, [r4 + r1]
+    lea r2, [r2 + r3]
+    lea r4, [r4 + r1]
 
-	dec r5
-	jnz near .xloop
-	WELSEMMS
-	LOAD_6_PARA_POP
-	ret
+    dec r5
+    jnz near .xloop
+    WELSEMMS
+    LOAD_6_PARA_POP
+    ret
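
McChromaWidthEq4_mmx above (and the SSE2/SSSE3 variants that follow) evaluate the H.264 chroma fractional-sample interpolation as a 2x2 bilinear filter whose four byte weights arrive packed in pABCD; the rounding constant 32 and the final >> 6 correspond to the h264_d0x20_* tables at the top of the file. Below is a scalar sketch of the same computation with an illustrative function name. The weights are typically A=(8-dx)(8-dy), B=dx(8-dy), C=(8-dx)dy, D=dx*dy so that they sum to 64, but that packing is done by the caller, not here.

    #include <stdint.h>

    /* 2x2 bilinear chroma interpolation: four byte weights from pABCD
     * (summing to 64), rounding 32, scaled back with >> 6. */
    static void McChroma_c (const uint8_t* pSrc, int32_t iSrcStride,
                            uint8_t* pDst, int32_t iDstStride,
                            const uint8_t* pABCD,
                            int32_t iWidth, int32_t iHeight) {
      const int32_t iA = pABCD[0], iB = pABCD[1], iC = pABCD[2], iD = pABCD[3];
      for (int32_t y = 0; y < iHeight; y++) {
        for (int32_t x = 0; x < iWidth; x++)
          pDst[x] = (uint8_t) ((iA * pSrc[x] + iB * pSrc[x + 1] +
                                iC * pSrc[x + iSrcStride] +
                                iD * pSrc[x + iSrcStride + 1] + 32) >> 6);
        pSrc += iSrcStride;
        pDst += iDstStride;
      }
    }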
 
 
 ;*******************************************************************************
 ; void McChromaWidthEq8_sse2( const uint8_t *pSrc,
-;						int32_t iSrcStride,
-;						uint8_t *pDst,
-;						int32_t iDstStride,
-;						const uint8_t *pABCD,
-;						int32_t iheigh );
+;                       int32_t iSrcStride,
+;                       uint8_t *pDst,
+;                       int32_t iDstStride,
+;                       const uint8_t *pABCD,
+;                       int32_t iheigh );
 ;*******************************************************************************
 WELS_EXTERN McChromaWidthEq8_sse2
-	%assign  push_num 0
-	LOAD_6_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION	r1, r1d
-	SIGN_EXTENSION	r3, r3d
-	SIGN_EXTENSION	r5, r5d
+    %assign  push_num 0
+    LOAD_6_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
+    SIGN_EXTENSION  r5, r5d
 
-	movd xmm3, [r4]
-	WELS_Zero xmm7
-	punpcklbw  xmm3, xmm3
-	punpcklwd  xmm3, xmm3
+    movd xmm3, [r4]
+    WELS_Zero xmm7
+    punpcklbw  xmm3, xmm3
+    punpcklwd  xmm3, xmm3
 
-	movdqa	   xmm4, xmm3
-	punpckldq  xmm3, xmm3
-	punpckhdq  xmm4, xmm4
-	movdqa     xmm5, xmm3
-	movdqa	   xmm6, xmm4
+    movdqa     xmm4, xmm3
+    punpckldq  xmm3, xmm3
+    punpckhdq  xmm4, xmm4
+    movdqa     xmm5, xmm3
+    movdqa     xmm6, xmm4
 
-	punpcklbw  xmm3, xmm7
-	punpckhbw  xmm5, xmm7
-	punpcklbw  xmm4, xmm7
-	punpckhbw  xmm6, xmm7
+    punpcklbw  xmm3, xmm7
+    punpckhbw  xmm5, xmm7
+    punpcklbw  xmm4, xmm7
+    punpckhbw  xmm6, xmm7
 
-	lea r4, [r0 + r1] ;lea ebx, [esi + eax]
-	movq xmm0, [r0]
-	movq xmm1, [r0+1]
-	punpcklbw xmm0, xmm7
-	punpcklbw xmm1, xmm7
+    lea r4, [r0 + r1] ;lea ebx, [esi + eax]
+    movq xmm0, [r0]
+    movq xmm1, [r0+1]
+    punpcklbw xmm0, xmm7
+    punpcklbw xmm1, xmm7
 .xloop:
 
-	pmullw xmm0, xmm3
-	pmullw xmm1, xmm5
-	paddw  xmm0, xmm1
+    pmullw xmm0, xmm3
+    pmullw xmm1, xmm5
+    paddw  xmm0, xmm1
 
-	movq  xmm1, [r4]
-	punpcklbw xmm1, xmm7
-	movdqa xmm2, xmm1
-	pmullw xmm1, xmm4
-	paddw xmm0, xmm1
+    movq  xmm1, [r4]
+    punpcklbw xmm1, xmm7
+    movdqa xmm2, xmm1
+    pmullw xmm1, xmm4
+    paddw xmm0, xmm1
 
-	movq xmm1, [r4+1]
-	punpcklbw xmm1, xmm7
-	movdqa xmm7, xmm1
-	pmullw xmm1, xmm6
-	paddw xmm0, xmm1
-	movdqa xmm1,xmm7
+    movq xmm1, [r4+1]
+    punpcklbw xmm1, xmm7
+    movdqa xmm7, xmm1
+    pmullw xmm1, xmm6
+    paddw xmm0, xmm1
+    movdqa xmm1,xmm7
 
-	paddw xmm0, [h264_d0x20_sse2]
-	psrlw xmm0, 6
+    paddw xmm0, [h264_d0x20_sse2]
+    psrlw xmm0, 6
 
-	WELS_Zero xmm7
-	packuswb xmm0, xmm7
-	movq [r2], xmm0
+    WELS_Zero xmm7
+    packuswb xmm0, xmm7
+    movq [r2], xmm0
 
-	movdqa xmm0, xmm2
+    movdqa xmm0, xmm2
 
-	lea r2, [r2 + r3]
-	lea r4, [r4 + r1]
+    lea r2, [r2 + r3]
+    lea r4, [r4 + r1]
 
-	dec r5
-	jnz near .xloop
+    dec r5
+    jnz near .xloop
 
-	POP_XMM
-	LOAD_6_PARA_POP
+    POP_XMM
+    LOAD_6_PARA_POP
 
-	ret
+    ret
 
 
 
@@ -219,19 +219,19 @@
 
 ;***********************************************************************
 ; void McChromaWidthEq8_ssse3( const uint8_t *pSrc,
-;						 int32_t iSrcStride,
+;                        int32_t iSrcStride,
 ;                        uint8_t *pDst,
 ;                        int32_t iDstStride,
 ;                        const uint8_t *pABCD,
-;					     int32_t iHeigh);
+;                        int32_t iHeigh);
 ;***********************************************************************
 WELS_EXTERN McChromaWidthEq8_ssse3
-	%assign  push_num 0
-	LOAD_6_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION	r1, r1d
-	SIGN_EXTENSION	r3, r3d
-	SIGN_EXTENSION	r5, r5d
+    %assign  push_num 0
+    LOAD_6_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
+    SIGN_EXTENSION  r5, r5d
 
     pxor      xmm7, xmm7
     movd   xmm5, [r4]
@@ -243,27 +243,27 @@
 
     sub r2, r3 ;sub esi, edi
     sub r2, r3
-	movdqa xmm7, [h264_d0x20_sse2]
+    movdqa xmm7, [h264_d0x20_sse2]
 
-	movdqu xmm0, [r0]
-	movdqa xmm1, xmm0
-	psrldq xmm1, 1
-	punpcklbw xmm0, xmm1
+    movdqu xmm0, [r0]
+    movdqa xmm1, xmm0
+    psrldq xmm1, 1
+    punpcklbw xmm0, xmm1
 
 .hloop_chroma:
-	lea	r2, [r2+2*r3]
+    lea r2, [r2+2*r3]
 
-	movdqu xmm2, [r0+r1]
-	movdqa xmm3, xmm2
-	psrldq xmm3, 1
-	punpcklbw xmm2, xmm3
-	movdqa      xmm4, xmm2
+    movdqu xmm2, [r0+r1]
+    movdqa xmm3, xmm2
+    psrldq xmm3, 1
+    punpcklbw xmm2, xmm3
+    movdqa      xmm4, xmm2
 
     pmaddubsw  xmm0, xmm5
     pmaddubsw  xmm2, xmm6
     paddw      xmm0, xmm2
     paddw      xmm0, xmm7
-	psrlw      xmm0, 6
+    psrlw      xmm0, 6
     packuswb   xmm0, xmm0
     movq       [r2],xmm0
 
@@ -278,16 +278,16 @@
     pmaddubsw  xmm2, xmm6
     paddw      xmm4, xmm2
     paddw      xmm4, xmm7
-	psrlw      xmm4, 6
+    psrlw      xmm4, 6
     packuswb   xmm4, xmm4
     movq       [r2+r3],xmm4
 
-	sub r5, 2
-	jnz .hloop_chroma
+    sub r5, 2
+    jnz .hloop_chroma
 
-	POP_XMM
-	LOAD_6_PARA_POP
+    POP_XMM
+    LOAD_6_PARA_POP
 
-	ret
+    ret
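
The SSSE3 variant above reaches the same result with pmaddubsw: each source row is interleaved with its one-pixel-right neighbour (the movdqa/psrldq/punpcklbw sequence), so a single multiply-accumulate per register handles one weighted pixel pair, and the +32 / >> 6 rounding is unchanged (the h264_d0x20_sse2 constant loaded into xmm7).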
 
 
--- a/codec/common/x86/mc_luma.asm
+++ b/codec/common/x86/mc_luma.asm
@@ -52,13 +52,13 @@
 
 ALIGN 16
 h264_w0x10:
-	dw 16, 16, 16, 16
+    dw 16, 16, 16, 16
 ALIGN 16
 h264_w0x10_1:
-	dw 16, 16, 16, 16, 16, 16, 16, 16
+    dw 16, 16, 16, 16, 16, 16, 16, 16
 ALIGN 16
 h264_mc_hc_32:
-	dw 32, 32, 32, 32, 32, 32, 32, 32
+    dw 32, 32, 32, 32, 32, 32, 32, 32
 
 
 ;*******************************************************************************
@@ -72,55 +72,55 @@
 ;*******************************************************************************
 ; void McHorVer20WidthEq4_mmx( const uint8_t *pSrc,
 ;                       int iSrcStride,
-;						uint8_t *pDst,
-;						int iDstStride,
-;						int iHeight)
+;                       uint8_t *pDst,
+;                       int iDstStride,
+;                       int iHeight)
 ;*******************************************************************************
 WELS_EXTERN McHorVer20WidthEq4_mmx
     %assign  push_num 0
     LOAD_5_PARA
-	SIGN_EXTENSION	r1, r1d
-	SIGN_EXTENSION	r3, r3d
-	SIGN_EXTENSION	r4, r4d
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
+    SIGN_EXTENSION  r4, r4d
 
-	sub r0, 2
-	WELS_Zero mm7
-	movq mm6, [h264_w0x10]
+    sub r0, 2
+    WELS_Zero mm7
+    movq mm6, [h264_w0x10]
 .height_loop:
-	movd mm0, [r0]
-	punpcklbw mm0, mm7
-	movd mm1, [r0+5]
-	punpcklbw mm1, mm7
-	movd mm2, [r0+1]
-	punpcklbw mm2, mm7
-	movd mm3, [r0+4]
-	punpcklbw mm3, mm7
-	movd mm4, [r0+2]
-	punpcklbw mm4, mm7
-	movd mm5, [r0+3]
-	punpcklbw mm5, mm7
+    movd mm0, [r0]
+    punpcklbw mm0, mm7
+    movd mm1, [r0+5]
+    punpcklbw mm1, mm7
+    movd mm2, [r0+1]
+    punpcklbw mm2, mm7
+    movd mm3, [r0+4]
+    punpcklbw mm3, mm7
+    movd mm4, [r0+2]
+    punpcklbw mm4, mm7
+    movd mm5, [r0+3]
+    punpcklbw mm5, mm7
 
-	paddw mm2, mm3
-	paddw mm4, mm5
-	psllw mm4, 2
-	psubw mm4, mm2
-	paddw mm0, mm1
-	paddw mm0, mm4
-	psllw mm4, 2
-	paddw mm0, mm4
-	paddw mm0, mm6
-	psraw mm0, 5
-	packuswb mm0, mm7
-	movd [r2], mm0
+    paddw mm2, mm3
+    paddw mm4, mm5
+    psllw mm4, 2
+    psubw mm4, mm2
+    paddw mm0, mm1
+    paddw mm0, mm4
+    psllw mm4, 2
+    paddw mm0, mm4
+    paddw mm0, mm6
+    psraw mm0, 5
+    packuswb mm0, mm7
+    movd [r2], mm0
 
-	add r0, r1
-	add r2, r3
-	dec r4
-	jnz .height_loop
+    add r0, r1
+    add r2, r3
+    dec r4
+    jnz .height_loop
 
-	WELSEMMS
-	LOAD_5_PARA_POP
-	ret
+    WELSEMMS
+    LOAD_5_PARA_POP
+    ret
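
McHorVer20WidthEq4_mmx above (and the wider SSE2 versions below) apply the H.264 half-pel 6-tap luma filter (1, -5, 20, 20, -5, 1) horizontally. The multiply-free sequence in the loop builds it as follows: with outer = S[x-2]+S[x+3], inner = S[x-1]+S[x+2] and center = S[x]+S[x+1], it forms t = 4*center - inner, then adds t and 4*t to outer, i.e. outer + 20*center - 5*inner, before the +16 rounding and >> 5. Below is a scalar sketch with an illustrative name; pSrc is assumed to point at the output-aligned pixel, so the loop reads x-2 through x+3 just as the assembly does after its sub r0, 2.

    #include <stdint.h>

    /* Horizontal 6-tap half-pel filter: taps (1,-5,20,20,-5,1), round 16, >> 5. */
    static void McHorVer20_c (const uint8_t* pSrc, int32_t iSrcStride,
                              uint8_t* pDst, int32_t iDstStride,
                              int32_t iWidth, int32_t iHeight) {
      for (int32_t y = 0; y < iHeight; y++) {
        for (int32_t x = 0; x < iWidth; x++) {
          const int32_t iOuter  = pSrc[x - 2] + pSrc[x + 3];
          const int32_t iInner  = pSrc[x - 1] + pSrc[x + 2];
          const int32_t iCenter = pSrc[x]     + pSrc[x + 1];
          const int32_t iV = (iOuter - 5 * iInner + 20 * iCenter + 16) >> 5;
          pDst[x] = (uint8_t) (iV < 0 ? 0 : (iV > 255 ? 255 : iV));
        }
        pSrc += iSrcStride;
        pDst += iDstStride;
      }
    }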
 
 ;*******************************************************************************
 ; Macros and other preprocessor constants
@@ -128,26 +128,26 @@
 
 
 %macro SSE_LOAD_8P 3
-	movq %1, %3
-	punpcklbw %1, %2
+    movq %1, %3
+    punpcklbw %1, %2
 %endmacro
 
 %macro FILTER_HV_W8 9
-	paddw	%1, %6
-	movdqa	%8, %3
-	movdqa	%7, %2
-	paddw	%1, [h264_w0x10_1]
-	paddw	%8, %4
-	paddw	%7, %5
-	psllw	%8, 2
-	psubw	%8, %7
-	paddw	%1, %8
-	psllw	%8, 2
-	paddw	%1, %8
-	psraw   %1, 5
-	WELS_Zero %8
-	packuswb %1, %8
-	movq    %9, %1
+    paddw   %1, %6
+    movdqa  %8, %3
+    movdqa  %7, %2
+    paddw   %1, [h264_w0x10_1]
+    paddw   %8, %4
+    paddw   %7, %5
+    psllw   %8, 2
+    psubw   %8, %7
+    paddw   %1, %8
+    psllw   %8, 2
+    paddw   %1, %8
+    psraw   %1, 5
+    WELS_Zero %8
+    packuswb %1, %8
+    movq    %9, %1
 %endmacro
 
 ;*******************************************************************************
@@ -159,192 +159,192 @@
 ;***********************************************************************
 ; void McHorVer22Width8HorFirst_sse2(const int16_t *pSrc,
 ;                       int16_t iSrcStride,
-;						uint8_t *pDst,
-;						int32_t iDstStride
-;						int32_t iHeight
+;                       uint8_t *pDst,
+;                       int32_t iDstStride
+;                       int32_t iHeight
 ;                       )
 ;***********************************************************************
 WELS_EXTERN McHorVer22Width8HorFirst_sse2
-	%assign  push_num 0
+    %assign  push_num 0
     LOAD_5_PARA
     PUSH_XMM 8
-	SIGN_EXTENSION	r1, r1d
-	SIGN_EXTENSION	r3, r3d
-	SIGN_EXTENSION	r4, r4d
-	pxor xmm7, xmm7
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
+    SIGN_EXTENSION  r4, r4d
+    pxor xmm7, xmm7
 
-	sub r0, r1				;;;;;;;;need more 5 lines.
-	sub r0, r1
+    sub r0, r1              ;;;;;;;;need more 5 lines.
+    sub r0, r1
 
 .yloop_width_8:
-	movq xmm0, [r0]
-	punpcklbw xmm0, xmm7
-	movq xmm1, [r0+5]
-	punpcklbw xmm1, xmm7
-	movq xmm2, [r0+1]
-	punpcklbw xmm2, xmm7
-	movq xmm3, [r0+4]
-	punpcklbw xmm3, xmm7
-	movq xmm4, [r0+2]
-	punpcklbw xmm4, xmm7
-	movq xmm5, [r0+3]
-	punpcklbw xmm5, xmm7
+    movq xmm0, [r0]
+    punpcklbw xmm0, xmm7
+    movq xmm1, [r0+5]
+    punpcklbw xmm1, xmm7
+    movq xmm2, [r0+1]
+    punpcklbw xmm2, xmm7
+    movq xmm3, [r0+4]
+    punpcklbw xmm3, xmm7
+    movq xmm4, [r0+2]
+    punpcklbw xmm4, xmm7
+    movq xmm5, [r0+3]
+    punpcklbw xmm5, xmm7
 
-	paddw xmm2, xmm3
-	paddw xmm4, xmm5
-	psllw xmm4, 2
-	psubw xmm4, xmm2
-	paddw xmm0, xmm1
-	paddw xmm0, xmm4
-	psllw xmm4, 2
-	paddw xmm0, xmm4
-	movdqa [r2], xmm0
+    paddw xmm2, xmm3
+    paddw xmm4, xmm5
+    psllw xmm4, 2
+    psubw xmm4, xmm2
+    paddw xmm0, xmm1
+    paddw xmm0, xmm4
+    psllw xmm4, 2
+    paddw xmm0, xmm4
+    movdqa [r2], xmm0
 
-	add r0, r1
-	add r2, r3
-	dec r4
-	jnz .yloop_width_8
-	POP_XMM
-	LOAD_5_PARA_POP
-	ret
+    add r0, r1
+    add r2, r3
+    dec r4
+    jnz .yloop_width_8
+    POP_XMM
+    LOAD_5_PARA_POP
+    ret
 
 ;*******************************************************************************
 ; void McHorVer20WidthEq8_sse2(  const uint8_t *pSrc,
 ;                       int iSrcStride,
-;												uint8_t *pDst,
-;												int iDstStride,
-;												int iHeight,
+;                                               uint8_t *pDst,
+;                                               int iDstStride,
+;                                               int iHeight,
 ;                      );
 ;*******************************************************************************
 WELS_EXTERN McHorVer20WidthEq8_sse2
-	%assign  push_num 0
+    %assign  push_num 0
     LOAD_5_PARA
     PUSH_XMM 8
-	SIGN_EXTENSION	r1, r1d
-	SIGN_EXTENSION	r3, r3d
-	SIGN_EXTENSION	r4, r4d
-	lea r0, [r0-2]            ;pSrc -= 2;
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
+    SIGN_EXTENSION  r4, r4d
+    lea r0, [r0-2]            ;pSrc -= 2;
 
-	pxor xmm7, xmm7
-	movdqa xmm6, [h264_w0x10_1]
+    pxor xmm7, xmm7
+    movdqa xmm6, [h264_w0x10_1]
 .y_loop:
-	movq xmm0, [r0]
-	punpcklbw xmm0, xmm7
-	movq xmm1, [r0+5]
-	punpcklbw xmm1, xmm7
-	movq xmm2, [r0+1]
-	punpcklbw xmm2, xmm7
-	movq xmm3, [r0+4]
-	punpcklbw xmm3, xmm7
-	movq xmm4, [r0+2]
-	punpcklbw xmm4, xmm7
-	movq xmm5, [r0+3]
-	punpcklbw xmm5, xmm7
+    movq xmm0, [r0]
+    punpcklbw xmm0, xmm7
+    movq xmm1, [r0+5]
+    punpcklbw xmm1, xmm7
+    movq xmm2, [r0+1]
+    punpcklbw xmm2, xmm7
+    movq xmm3, [r0+4]
+    punpcklbw xmm3, xmm7
+    movq xmm4, [r0+2]
+    punpcklbw xmm4, xmm7
+    movq xmm5, [r0+3]
+    punpcklbw xmm5, xmm7
 
-	paddw xmm2, xmm3
-	paddw xmm4, xmm5
-	psllw xmm4, 2
-	psubw xmm4, xmm2
-	paddw xmm0, xmm1
-	paddw xmm0, xmm4
-	psllw xmm4, 2
-	paddw xmm0, xmm4
-	paddw xmm0, xmm6
-	psraw xmm0, 5
+    paddw xmm2, xmm3
+    paddw xmm4, xmm5
+    psllw xmm4, 2
+    psubw xmm4, xmm2
+    paddw xmm0, xmm1
+    paddw xmm0, xmm4
+    psllw xmm4, 2
+    paddw xmm0, xmm4
+    paddw xmm0, xmm6
+    psraw xmm0, 5
 
-	packuswb xmm0, xmm7
-	movq [r2], xmm0
+    packuswb xmm0, xmm7
+    movq [r2], xmm0
 
-	lea r2, [r2+r3]
-	lea r0, [r0+r1]
-	dec r4
-	jnz near .y_loop
+    lea r2, [r2+r3]
+    lea r0, [r0+r1]
+    dec r4
+    jnz near .y_loop
 
-	POP_XMM
-	LOAD_5_PARA_POP
-	ret
+    POP_XMM
+    LOAD_5_PARA_POP
+    ret
 
 ;*******************************************************************************
 ; void McHorVer20WidthEq16_sse2(  const uint8_t *pSrc,
 ;                       int iSrcStride,
-;												uint8_t *pDst,
-;												int iDstStride,
-;												int iHeight,
+;                                               uint8_t *pDst,
+;                                               int iDstStride,
+;                                               int iHeight,
 ;                      );
 ;*******************************************************************************
 WELS_EXTERN McHorVer20WidthEq16_sse2
-	%assign  push_num 0
+    %assign  push_num 0
     LOAD_5_PARA
     PUSH_XMM 8
-	SIGN_EXTENSION	r1, r1d
-	SIGN_EXTENSION	r3, r3d
-	SIGN_EXTENSION	r4, r4d
-	lea r0, [r0-2]            ;pSrc -= 2;
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
+    SIGN_EXTENSION  r4, r4d
+    lea r0, [r0-2]            ;pSrc -= 2;
 
-	pxor xmm7, xmm7
-	movdqa xmm6, [h264_w0x10_1]
+    pxor xmm7, xmm7
+    movdqa xmm6, [h264_w0x10_1]
 .y_loop:
 
-	movq xmm0, [r0]
-	punpcklbw xmm0, xmm7
-	movq xmm1, [r0+5]
-	punpcklbw xmm1, xmm7
-	movq xmm2, [r0+1]
-	punpcklbw xmm2, xmm7
-	movq xmm3, [r0+4]
-	punpcklbw xmm3, xmm7
-	movq xmm4, [r0+2]
-	punpcklbw xmm4, xmm7
-	movq xmm5, [r0+3]
-	punpcklbw xmm5, xmm7
+    movq xmm0, [r0]
+    punpcklbw xmm0, xmm7
+    movq xmm1, [r0+5]
+    punpcklbw xmm1, xmm7
+    movq xmm2, [r0+1]
+    punpcklbw xmm2, xmm7
+    movq xmm3, [r0+4]
+    punpcklbw xmm3, xmm7
+    movq xmm4, [r0+2]
+    punpcklbw xmm4, xmm7
+    movq xmm5, [r0+3]
+    punpcklbw xmm5, xmm7
 
-	paddw xmm2, xmm3
-	paddw xmm4, xmm5
-	psllw xmm4, 2
-	psubw xmm4, xmm2
-	paddw xmm0, xmm1
-	paddw xmm0, xmm4
-	psllw xmm4, 2
-	paddw xmm0, xmm4
-	paddw xmm0, xmm6
-	psraw xmm0, 5
-	packuswb xmm0, xmm7
-	movq [r2], xmm0
+    paddw xmm2, xmm3
+    paddw xmm4, xmm5
+    psllw xmm4, 2
+    psubw xmm4, xmm2
+    paddw xmm0, xmm1
+    paddw xmm0, xmm4
+    psllw xmm4, 2
+    paddw xmm0, xmm4
+    paddw xmm0, xmm6
+    psraw xmm0, 5
+    packuswb xmm0, xmm7
+    movq [r2], xmm0
 
-	movq xmm0, [r0+8]
-	punpcklbw xmm0, xmm7
-	movq xmm1, [r0+5+8]
-	punpcklbw xmm1, xmm7
-	movq xmm2, [r0+1+8]
-	punpcklbw xmm2, xmm7
-	movq xmm3, [r0+4+8]
-	punpcklbw xmm3, xmm7
-	movq xmm4, [r0+2+8]
-	punpcklbw xmm4, xmm7
-	movq xmm5, [r0+3+8]
-	punpcklbw xmm5, xmm7
+    movq xmm0, [r0+8]
+    punpcklbw xmm0, xmm7
+    movq xmm1, [r0+5+8]
+    punpcklbw xmm1, xmm7
+    movq xmm2, [r0+1+8]
+    punpcklbw xmm2, xmm7
+    movq xmm3, [r0+4+8]
+    punpcklbw xmm3, xmm7
+    movq xmm4, [r0+2+8]
+    punpcklbw xmm4, xmm7
+    movq xmm5, [r0+3+8]
+    punpcklbw xmm5, xmm7
 
-	paddw xmm2, xmm3
-	paddw xmm4, xmm5
-	psllw xmm4, 2
-	psubw xmm4, xmm2
-	paddw xmm0, xmm1
-	paddw xmm0, xmm4
-	psllw xmm4, 2
-	paddw xmm0, xmm4
-	paddw xmm0, xmm6
-	psraw xmm0, 5
-	packuswb xmm0, xmm7
-	movq [r2+8], xmm0
+    paddw xmm2, xmm3
+    paddw xmm4, xmm5
+    psllw xmm4, 2
+    psubw xmm4, xmm2
+    paddw xmm0, xmm1
+    paddw xmm0, xmm4
+    psllw xmm4, 2
+    paddw xmm0, xmm4
+    paddw xmm0, xmm6
+    psraw xmm0, 5
+    packuswb xmm0, xmm7
+    movq [r2+8], xmm0
 
-	lea r2, [r2+r3]
-	lea r0, [r0+r1]
-	dec r4
-	jnz near .y_loop
+    lea r2, [r2+r3]
+    lea r0, [r0+r1]
+    dec r4
+    jnz near .y_loop
 
-	POP_XMM
-	LOAD_5_PARA_POP
-	ret
+    POP_XMM
+    LOAD_5_PARA_POP
+    ret
 
 
 ;*******************************************************************************
@@ -355,81 +355,81 @@
 ;                       int iHeight )
 ;*******************************************************************************
 WELS_EXTERN McHorVer02WidthEq8_sse2
-	%assign  push_num 0
+    %assign  push_num 0
     LOAD_5_PARA
     PUSH_XMM 8
-	SIGN_EXTENSION	r1, r1d
-	SIGN_EXTENSION	r3, r3d
-	SIGN_EXTENSION	r4, r4d
-	sub r0, r1
-	sub r0, r1
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
+    SIGN_EXTENSION  r4, r4d
+    sub r0, r1
+    sub r0, r1
 
-	WELS_Zero xmm7
+    WELS_Zero xmm7
 
-	SSE_LOAD_8P xmm0, xmm7, [r0]
-	SSE_LOAD_8P xmm1, xmm7, [r0+r1]
-	lea r0, [r0+2*r1]
-	SSE_LOAD_8P xmm2, xmm7, [r0]
-	SSE_LOAD_8P xmm3, xmm7, [r0+r1]
-	lea r0, [r0+2*r1]
-	SSE_LOAD_8P xmm4, xmm7, [r0]
-	SSE_LOAD_8P xmm5, xmm7, [r0+r1]
+    SSE_LOAD_8P xmm0, xmm7, [r0]
+    SSE_LOAD_8P xmm1, xmm7, [r0+r1]
+    lea r0, [r0+2*r1]
+    SSE_LOAD_8P xmm2, xmm7, [r0]
+    SSE_LOAD_8P xmm3, xmm7, [r0+r1]
+    lea r0, [r0+2*r1]
+    SSE_LOAD_8P xmm4, xmm7, [r0]
+    SSE_LOAD_8P xmm5, xmm7, [r0+r1]
 
 .start:
-	FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
-	dec r4
-	jz near .xx_exit
+    FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+    dec r4
+    jz near .xx_exit
 
-	lea r0, [r0+2*r1]
-	SSE_LOAD_8P xmm6, xmm7, [r0]
-	FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
-	dec r4
-	jz near .xx_exit
+    lea r0, [r0+2*r1]
+    SSE_LOAD_8P xmm6, xmm7, [r0]
+    FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
+    dec r4
+    jz near .xx_exit
 
-	lea r2, [r2+2*r3]
-	SSE_LOAD_8P xmm7, xmm0, [r0+r1]
-	FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
-	dec r4
-	jz near .xx_exit
+    lea r2, [r2+2*r3]
+    SSE_LOAD_8P xmm7, xmm0, [r0+r1]
+    FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
+    dec r4
+    jz near .xx_exit
 
-	lea r0, [r0+2*r1]
-	SSE_LOAD_8P xmm0, xmm1, [r0]
-	FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
-	dec r4
-	jz near .xx_exit
+    lea r0, [r0+2*r1]
+    SSE_LOAD_8P xmm0, xmm1, [r0]
+    FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
+    dec r4
+    jz near .xx_exit
 
-	lea r2, [r2+2*r3]
-	SSE_LOAD_8P xmm1, xmm2, [r0+r1]
-	FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
-	dec r4
-	jz near .xx_exit
+    lea r2, [r2+2*r3]
+    SSE_LOAD_8P xmm1, xmm2, [r0+r1]
+    FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
+    dec r4
+    jz near .xx_exit
 
-	lea r0, [r0+2*r1]
-	SSE_LOAD_8P xmm2, xmm3, [r0]
-	FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
-	dec r4
-	jz near .xx_exit
+    lea r0, [r0+2*r1]
+    SSE_LOAD_8P xmm2, xmm3, [r0]
+    FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
+    dec r4
+    jz near .xx_exit
 
-	lea r2, [r2+2*r3]
-	SSE_LOAD_8P xmm3, xmm4, [r0+r1]
-	FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
-	dec r4
-	jz near .xx_exit
+    lea r2, [r2+2*r3]
+    SSE_LOAD_8P xmm3, xmm4, [r0+r1]
+    FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
+    dec r4
+    jz near .xx_exit
 
-	lea r0, [r0+2*r1]
-	SSE_LOAD_8P xmm4, xmm5, [r0]
-	FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
-	dec r4
-	jz near .xx_exit
+    lea r0, [r0+2*r1]
+    SSE_LOAD_8P xmm4, xmm5, [r0]
+    FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
+    dec r4
+    jz near .xx_exit
 
-	lea r2, [r2+2*r3]
-	SSE_LOAD_8P xmm5, xmm6, [r0+r1]
-	jmp near .start
+    lea r2, [r2+2*r3]
+    SSE_LOAD_8P xmm5, xmm6, [r0+r1]
+    jmp near .start
 
 .xx_exit:
-	POP_XMM
-	LOAD_5_PARA_POP
-	ret
+    POP_XMM
+    LOAD_5_PARA_POP
+    ret
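
McHorVer02WidthEq8_sse2 above is the vertical counterpart: the same six taps are applied down a column, and FILTER_HV_W8 keeps six source rows resident in xmm0..xmm5 and rotates them so each new output row costs only one SSE_LOAD_8P. A scalar sketch (illustrative name, same clipping as the horizontal case; pSrc points at the output-aligned row, matching the two sub r0, r1 adjustments above):

    #include <stdint.h>

    /* Vertical 6-tap half-pel filter: output row y reads source rows y-2 .. y+3. */
    static void McHorVer02_c (const uint8_t* pSrc, int32_t iSrcStride,
                              uint8_t* pDst, int32_t iDstStride,
                              int32_t iWidth, int32_t iHeight) {
      for (int32_t y = 0; y < iHeight; y++) {
        for (int32_t x = 0; x < iWidth; x++) {
          const int32_t iOuter  = pSrc[x - 2 * iSrcStride] + pSrc[x + 3 * iSrcStride];
          const int32_t iInner  = pSrc[x - iSrcStride]     + pSrc[x + 2 * iSrcStride];
          const int32_t iCenter = pSrc[x]                  + pSrc[x + iSrcStride];
          const int32_t iV = (iOuter - 5 * iInner + 20 * iCenter + 16) >> 5;
          pDst[x] = (uint8_t) (iV < 0 ? 0 : (iV > 255 ? 255 : iV));
        }
        pSrc += iSrcStride;
        pDst += iDstStride;
      }
    }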
 
 ;***********************************************************************
 ; Code
@@ -440,725 +440,725 @@
 
 
 ;***********************************************************************
-; void McHorVer02Height9Or17_sse2(	const uint8_t *pSrc,
+; void McHorVer02Height9Or17_sse2(  const uint8_t *pSrc,
 ;                       int32_t iSrcStride,
 ;                       uint8_t *pDst,
 ;                       int32_t iDstStride,
-;						int32_t iWidth,
+;                       int32_t iWidth,
 ;                       int32_t iHeight )
 ;***********************************************************************
 WELS_EXTERN McHorVer02Height9Or17_sse2
-	%assign  push_num 0
+    %assign  push_num 0
     LOAD_6_PARA
     PUSH_XMM 8
-	SIGN_EXTENSION	r1, r1d
-	SIGN_EXTENSION	r3, r3d
-	SIGN_EXTENSION	r4, r4d
-	SIGN_EXTENSION	r5, r5d
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
+    SIGN_EXTENSION  r4, r4d
+    SIGN_EXTENSION  r5, r5d
 
 %ifndef X86_32
-	push r12
-	push r13
-	push r14
-	mov  r12, r0
-	mov	 r13, r2
-	mov	 r14, r5
+    push r12
+    push r13
+    push r14
+    mov  r12, r0
+    mov  r13, r2
+    mov  r14, r5
 %endif
 
-	shr r4, 3
-	sub r0, r1
-	sub r0, r1
+    shr r4, 3
+    sub r0, r1
+    sub r0, r1
 
 .xloop:
-	WELS_Zero xmm7
-	SSE_LOAD_8P xmm0, xmm7, [r0]
-	SSE_LOAD_8P xmm1, xmm7, [r0+r1]
-	lea r0, [r0+2*r1]
-	SSE_LOAD_8P xmm2, xmm7, [r0]
-	SSE_LOAD_8P xmm3, xmm7, [r0+r1]
-	lea r0, [r0+2*r1]
-	SSE_LOAD_8P xmm4, xmm7, [r0]
-	SSE_LOAD_8P xmm5, xmm7, [r0+r1]
+    WELS_Zero xmm7
+    SSE_LOAD_8P xmm0, xmm7, [r0]
+    SSE_LOAD_8P xmm1, xmm7, [r0+r1]
+    lea r0, [r0+2*r1]
+    SSE_LOAD_8P xmm2, xmm7, [r0]
+    SSE_LOAD_8P xmm3, xmm7, [r0+r1]
+    lea r0, [r0+2*r1]
+    SSE_LOAD_8P xmm4, xmm7, [r0]
+    SSE_LOAD_8P xmm5, xmm7, [r0+r1]
 
-	FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
-	dec r5
-	lea r0, [r0+2*r1]
-	SSE_LOAD_8P xmm6, xmm7, [r0]
-	movdqa xmm0,xmm1
-	movdqa xmm1,xmm2
-	movdqa xmm2,xmm3
-	movdqa xmm3,xmm4
-	movdqa xmm4,xmm5
-	movdqa xmm5,xmm6
-	add r2, r3
-	sub r0, r1
+    FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+    dec r5
+    lea r0, [r0+2*r1]
+    SSE_LOAD_8P xmm6, xmm7, [r0]
+    movdqa xmm0,xmm1
+    movdqa xmm1,xmm2
+    movdqa xmm2,xmm3
+    movdqa xmm3,xmm4
+    movdqa xmm4,xmm5
+    movdqa xmm5,xmm6
+    add r2, r3
+    sub r0, r1
 
 .start:
-	FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
-	dec r5
-	jz near .x_loop_dec
+    FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+    dec r5
+    jz near .x_loop_dec
 
-	lea r0, [r0+2*r1]
-	SSE_LOAD_8P xmm6, xmm7, [r0]
-	FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
-	dec r5
-	jz near .x_loop_dec
+    lea r0, [r0+2*r1]
+    SSE_LOAD_8P xmm6, xmm7, [r0]
+    FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
+    dec r5
+    jz near .x_loop_dec
 
-	lea r2, [r2+2*r3]
-	SSE_LOAD_8P xmm7, xmm0, [r0+r1]
-	FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
-	dec r5
-	jz near .x_loop_dec
+    lea r2, [r2+2*r3]
+    SSE_LOAD_8P xmm7, xmm0, [r0+r1]
+    FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
+    dec r5
+    jz near .x_loop_dec
 
-	lea r0, [r0+2*r1]
-	SSE_LOAD_8P xmm0, xmm1, [r0]
-	FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
-	dec r5
-	jz near .x_loop_dec
+    lea r0, [r0+2*r1]
+    SSE_LOAD_8P xmm0, xmm1, [r0]
+    FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
+    dec r5
+    jz near .x_loop_dec
 
-	lea r2, [r2+2*r3]
-	SSE_LOAD_8P xmm1, xmm2, [r0+r1]
-	FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
-	dec r5
-	jz near .x_loop_dec
+    lea r2, [r2+2*r3]
+    SSE_LOAD_8P xmm1, xmm2, [r0+r1]
+    FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
+    dec r5
+    jz near .x_loop_dec
 
-	lea r0, [r0+2*r1]
-	SSE_LOAD_8P xmm2, xmm3, [r0]
-	FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
-	dec r5
-	jz near .x_loop_dec
+    lea r0, [r0+2*r1]
+    SSE_LOAD_8P xmm2, xmm3, [r0]
+    FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
+    dec r5
+    jz near .x_loop_dec
 
-	lea r2, [r2+2*r3]
-	SSE_LOAD_8P xmm3, xmm4, [r0+r1]
-	FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
-	dec r5
-	jz near .x_loop_dec
+    lea r2, [r2+2*r3]
+    SSE_LOAD_8P xmm3, xmm4, [r0+r1]
+    FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
+    dec r5
+    jz near .x_loop_dec
 
-	lea r0, [r0+2*r1]
-	SSE_LOAD_8P xmm4, xmm5, [r0]
-	FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
-	dec r5
-	jz near .x_loop_dec
+    lea r0, [r0+2*r1]
+    SSE_LOAD_8P xmm4, xmm5, [r0]
+    FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
+    dec r5
+    jz near .x_loop_dec
 
-	lea r2, [r2+2*r3]
-	SSE_LOAD_8P xmm5, xmm6, [r0+r1]
-	jmp near .start
+    lea r2, [r2+2*r3]
+    SSE_LOAD_8P xmm5, xmm6, [r0+r1]
+    jmp near .start
 
 .x_loop_dec:
-	dec r4
-	jz  near .xx_exit
+    dec r4
+    jz  near .xx_exit
 %ifdef X86_32
-	mov	r0, arg1
-	mov r2, arg3
-	mov r5, arg6
+    mov r0, arg1
+    mov r2, arg3
+    mov r5, arg6
 %else
-	mov r0, r12
-	mov r2, r13
-	mov r5, r14
+    mov r0, r12
+    mov r2, r13
+    mov r5, r14
 %endif
-	sub r0, r1
-	sub r0, r1
-	add r0, 8
-	add r2, 8
-	jmp near .xloop
+    sub r0, r1
+    sub r0, r1
+    add r0, 8
+    add r2, 8
+    jmp near .xloop
 
 .xx_exit:
 %ifndef X86_32
-	pop r14
-	pop r13
-	pop r12
+    pop r14
+    pop r13
+    pop r12
 %endif
-	POP_XMM
-	LOAD_6_PARA_POP
-	ret
+    POP_XMM
+    LOAD_6_PARA_POP
+    ret
 
 
 ;***********************************************************************
-; void McHorVer20Width9Or17_sse2(		const uint8_t *pSrc,
+; void McHorVer20Width9Or17_sse2(       const uint8_t *pSrc,
 ;                       int32_t iSrcStride,
-;						uint8_t *pDst,
-;						int32_t iDstStride,
-;						int32_t iWidth,
-;						int32_t iHeight
+;                       uint8_t *pDst,
+;                       int32_t iDstStride,
+;                       int32_t iWidth,
+;                       int32_t iHeight
 ;                      );
 ;***********************************************************************
 WELS_EXTERN McHorVer20Width9Or17_sse2
-	%assign  push_num 0
+    %assign  push_num 0
     LOAD_6_PARA
     PUSH_XMM 8
-	SIGN_EXTENSION	r1, r1d
-	SIGN_EXTENSION	r3, r3d
-	SIGN_EXTENSION	r4, r4d
-	SIGN_EXTENSION	r5, r5d
-	sub r0, 2
-	pxor xmm7, xmm7
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
+    SIGN_EXTENSION  r4, r4d
+    SIGN_EXTENSION  r5, r5d
+    sub r0, 2
+    pxor xmm7, xmm7
 
-	cmp r4, 9
-	jne near .width_17
+    cmp r4, 9
+    jne near .width_17
 
 .yloop_width_9:
-	movq xmm0, [r0]
-	punpcklbw xmm0, xmm7
-	movq xmm1, [r0+5]
-	punpcklbw xmm1, xmm7
-	movq xmm2, [r0+1]
-	punpcklbw xmm2, xmm7
-	movq xmm3, [r0+4]
-	punpcklbw xmm3, xmm7
-	movq xmm4, [r0+2]
-	punpcklbw xmm4, xmm7
-	movq xmm5, [r0+3]
-	punpcklbw xmm5, xmm7
+    movq xmm0, [r0]
+    punpcklbw xmm0, xmm7
+    movq xmm1, [r0+5]
+    punpcklbw xmm1, xmm7
+    movq xmm2, [r0+1]
+    punpcklbw xmm2, xmm7
+    movq xmm3, [r0+4]
+    punpcklbw xmm3, xmm7
+    movq xmm4, [r0+2]
+    punpcklbw xmm4, xmm7
+    movq xmm5, [r0+3]
+    punpcklbw xmm5, xmm7
 
-	movdqa xmm7, xmm2
-	paddw   xmm7, xmm3
-	movdqa xmm6, xmm4
-	paddw   xmm6, xmm5
-	psllw xmm6, 2
-	psubw xmm6, xmm7
-	paddw xmm0, xmm1
-	paddw xmm0, xmm6
-	psllw xmm6, 2
-	paddw xmm0, xmm6
-	paddw xmm0, [h264_w0x10_1]
-	psraw  xmm0, 5
-	packuswb xmm0, xmm0
-	movd [r2], xmm0
+    movdqa xmm7, xmm2
+    paddw   xmm7, xmm3
+    movdqa xmm6, xmm4
+    paddw   xmm6, xmm5
+    psllw xmm6, 2
+    psubw xmm6, xmm7
+    paddw xmm0, xmm1
+    paddw xmm0, xmm6
+    psllw xmm6, 2
+    paddw xmm0, xmm6
+    paddw xmm0, [h264_w0x10_1]
+    psraw  xmm0, 5
+    packuswb xmm0, xmm0
+    movd [r2], xmm0
 
-	pxor  xmm7, xmm7
-	movq xmm0, [r0+6]
-	punpcklbw xmm0, xmm7
+    pxor  xmm7, xmm7
+    movq xmm0, [r0+6]
+    punpcklbw xmm0, xmm7
 
-	paddw xmm4, xmm1
-	paddw xmm5, xmm3
-	psllw xmm5, 2
-	psubw xmm5, xmm4
-	paddw xmm2, xmm0
-	paddw xmm2, xmm5
-	psllw xmm5, 2
-	paddw xmm2, xmm5
-	paddw xmm2, [h264_w0x10_1]
-	psraw  xmm2, 5
-	packuswb xmm2, xmm2
-	movq [r2+1], xmm2
+    paddw xmm4, xmm1
+    paddw xmm5, xmm3
+    psllw xmm5, 2
+    psubw xmm5, xmm4
+    paddw xmm2, xmm0
+    paddw xmm2, xmm5
+    psllw xmm5, 2
+    paddw xmm2, xmm5
+    paddw xmm2, [h264_w0x10_1]
+    psraw  xmm2, 5
+    packuswb xmm2, xmm2
+    movq [r2+1], xmm2
 
-	add r0, r1
-	add r2, r3
-	dec r5
-	jnz .yloop_width_9
-	POP_XMM
-	LOAD_6_PARA_POP
-	ret
+    add r0, r1
+    add r2, r3
+    dec r5
+    jnz .yloop_width_9
+    POP_XMM
+    LOAD_6_PARA_POP
+    ret
 
 
 .width_17:
 .yloop_width_17:
-	movq xmm0, [r0]
-	punpcklbw xmm0, xmm7
-	movq xmm1, [r0+5]
-	punpcklbw xmm1, xmm7
-	movq xmm2, [r0+1]
-	punpcklbw xmm2, xmm7
-	movq xmm3, [r0+4]
-	punpcklbw xmm3, xmm7
-	movq xmm4, [r0+2]
-	punpcklbw xmm4, xmm7
-	movq xmm5, [r0+3]
-	punpcklbw xmm5, xmm7
+    movq xmm0, [r0]
+    punpcklbw xmm0, xmm7
+    movq xmm1, [r0+5]
+    punpcklbw xmm1, xmm7
+    movq xmm2, [r0+1]
+    punpcklbw xmm2, xmm7
+    movq xmm3, [r0+4]
+    punpcklbw xmm3, xmm7
+    movq xmm4, [r0+2]
+    punpcklbw xmm4, xmm7
+    movq xmm5, [r0+3]
+    punpcklbw xmm5, xmm7
 
-	paddw xmm2, xmm3
-	paddw xmm4, xmm5
-	psllw xmm4, 2
-	psubw xmm4, xmm2
-	paddw xmm0, xmm1
-	paddw xmm0, xmm4
-	psllw xmm4, 2
-	paddw xmm0, xmm4
-	paddw xmm0, [h264_w0x10_1]
-	psraw  xmm0, 5
-	packuswb xmm0, xmm0
-	movq [r2], xmm0
+    paddw xmm2, xmm3
+    paddw xmm4, xmm5
+    psllw xmm4, 2
+    psubw xmm4, xmm2
+    paddw xmm0, xmm1
+    paddw xmm0, xmm4
+    psllw xmm4, 2
+    paddw xmm0, xmm4
+    paddw xmm0, [h264_w0x10_1]
+    psraw  xmm0, 5
+    packuswb xmm0, xmm0
+    movq [r2], xmm0
 
-	movq xmm0, [r0+8]
-	punpcklbw xmm0, xmm7
-	movq xmm1, [r0+5+8]
-	punpcklbw xmm1, xmm7
-	movq xmm2, [r0+1+8]
-	punpcklbw xmm2, xmm7
-	movq xmm3, [r0+4+8]
-	punpcklbw xmm3, xmm7
-	movq xmm4, [r0+2+8]
-	punpcklbw xmm4, xmm7
-	movq xmm5, [r0+3+8]
-	punpcklbw xmm5, xmm7
+    movq xmm0, [r0+8]
+    punpcklbw xmm0, xmm7
+    movq xmm1, [r0+5+8]
+    punpcklbw xmm1, xmm7
+    movq xmm2, [r0+1+8]
+    punpcklbw xmm2, xmm7
+    movq xmm3, [r0+4+8]
+    punpcklbw xmm3, xmm7
+    movq xmm4, [r0+2+8]
+    punpcklbw xmm4, xmm7
+    movq xmm5, [r0+3+8]
+    punpcklbw xmm5, xmm7
 
-	movdqa xmm7, xmm2
-	paddw   xmm7, xmm3
-	movdqa xmm6, xmm4
-	paddw   xmm6, xmm5
-	psllw xmm6, 2
-	psubw xmm6, xmm7
-	paddw xmm0, xmm1
-	paddw xmm0, xmm6
-	psllw xmm6, 2
-	paddw xmm0, xmm6
-	paddw xmm0, [h264_w0x10_1]
-	psraw  xmm0, 5
-	packuswb xmm0, xmm0
-	movd [r2+8], xmm0
+    movdqa xmm7, xmm2
+    paddw   xmm7, xmm3
+    movdqa xmm6, xmm4
+    paddw   xmm6, xmm5
+    psllw xmm6, 2
+    psubw xmm6, xmm7
+    paddw xmm0, xmm1
+    paddw xmm0, xmm6
+    psllw xmm6, 2
+    paddw xmm0, xmm6
+    paddw xmm0, [h264_w0x10_1]
+    psraw  xmm0, 5
+    packuswb xmm0, xmm0
+    movd [r2+8], xmm0
 
 
-	pxor  xmm7, xmm7
-	movq xmm0, [r0+6+8]
-	punpcklbw xmm0, xmm7
+    pxor  xmm7, xmm7
+    movq xmm0, [r0+6+8]
+    punpcklbw xmm0, xmm7
 
-	paddw xmm4, xmm1
-	paddw xmm5, xmm3
-	psllw xmm5, 2
-	psubw xmm5, xmm4
-	paddw xmm2, xmm0
-	paddw xmm2, xmm5
-	psllw xmm5, 2
-	paddw xmm2, xmm5
-	paddw xmm2, [h264_w0x10_1]
-	psraw  xmm2, 5
-	packuswb xmm2, xmm2
-	movq [r2+9], xmm2
-	add r0, r1
-	add r2, r3
-	dec r5
-	jnz .yloop_width_17
-	POP_XMM
-	LOAD_6_PARA_POP
-	ret
+    paddw xmm4, xmm1
+    paddw xmm5, xmm3
+    psllw xmm5, 2
+    psubw xmm5, xmm4
+    paddw xmm2, xmm0
+    paddw xmm2, xmm5
+    psllw xmm5, 2
+    paddw xmm2, xmm5
+    paddw xmm2, [h264_w0x10_1]
+    psraw  xmm2, 5
+    packuswb xmm2, xmm2
+    movq [r2+9], xmm2
+    add r0, r1
+    add r2, r3
+    dec r5
+    jnz .yloop_width_17
+    POP_XMM
+    LOAD_6_PARA_POP
+    ret
 
 
 
 ;***********************************************************************
 ;void McHorVer22HorFirst_sse2
-;							(const uint8_t *pSrc,
-;							int32_t iSrcStride,
-;							uint8_t * pTap,
-;							int32_t iTapStride,
-;							int32_t iWidth,int32_t iHeight);
+;                           (const uint8_t *pSrc,
+;                           int32_t iSrcStride,
+;                           uint8_t * pTap,
+;                           int32_t iTapStride,
+;                           int32_t iWidth,int32_t iHeight);
 ;***********************************************************************
 WELS_EXTERN McHorVer22HorFirst_sse2
-	%assign  push_num 0
+    %assign  push_num 0
     LOAD_6_PARA
     PUSH_XMM 8
-	SIGN_EXTENSION	r1, r1d
-	SIGN_EXTENSION	r3, r3d
-	SIGN_EXTENSION	r4, r4d
-	SIGN_EXTENSION	r5, r5d
-	pxor xmm7, xmm7
-	sub r0, r1				;;;;;;;;need more 5 lines.
-	sub r0, r1
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
+    SIGN_EXTENSION  r4, r4d
+    SIGN_EXTENSION  r5, r5d
+    pxor xmm7, xmm7
+    sub r0, r1              ;;;;;;;;need more 5 lines.
+    sub r0, r1
 
-	cmp r4, 9
-	jne near .width_17
+    cmp r4, 9
+    jne near .width_17
 
 .yloop_width_9:
-	movq xmm0, [r0]
-	punpcklbw xmm0, xmm7
-	movq xmm1, [r0+5]
-	punpcklbw xmm1, xmm7
-	movq xmm2, [r0+1]
-	punpcklbw xmm2, xmm7
-	movq xmm3, [r0+4]
-	punpcklbw xmm3, xmm7
-	movq xmm4, [r0+2]
-	punpcklbw xmm4, xmm7
-	movq xmm5, [r0+3]
-	punpcklbw xmm5, xmm7
+    movq xmm0, [r0]
+    punpcklbw xmm0, xmm7
+    movq xmm1, [r0+5]
+    punpcklbw xmm1, xmm7
+    movq xmm2, [r0+1]
+    punpcklbw xmm2, xmm7
+    movq xmm3, [r0+4]
+    punpcklbw xmm3, xmm7
+    movq xmm4, [r0+2]
+    punpcklbw xmm4, xmm7
+    movq xmm5, [r0+3]
+    punpcklbw xmm5, xmm7
 
-	movdqa xmm7, xmm2
-	paddw   xmm7, xmm3
-	movdqa xmm6, xmm4
-	paddw   xmm6, xmm5
-	psllw xmm6, 2
-	psubw xmm6, xmm7
-	paddw xmm0, xmm1
-	paddw xmm0, xmm6
-	psllw xmm6, 2
-	paddw xmm0, xmm6
-	movd [r2], xmm0
+    movdqa xmm7, xmm2
+    paddw   xmm7, xmm3
+    movdqa xmm6, xmm4
+    paddw   xmm6, xmm5
+    psllw xmm6, 2
+    psubw xmm6, xmm7
+    paddw xmm0, xmm1
+    paddw xmm0, xmm6
+    psllw xmm6, 2
+    paddw xmm0, xmm6
+    movd [r2], xmm0
 
-	pxor  xmm7, xmm7
-	movq xmm0, [r0+6]
-	punpcklbw xmm0, xmm7
+    pxor  xmm7, xmm7
+    movq xmm0, [r0+6]
+    punpcklbw xmm0, xmm7
 
-	paddw xmm4, xmm1
-	paddw xmm5, xmm3
-	psllw xmm5, 2
-	psubw xmm5, xmm4
-	paddw xmm2, xmm0
-	paddw xmm2, xmm5
-	psllw xmm5, 2
-	paddw xmm2, xmm5
-	movq [r2+2], xmm2
-	movhps [r2+2+8], xmm2
+    paddw xmm4, xmm1
+    paddw xmm5, xmm3
+    psllw xmm5, 2
+    psubw xmm5, xmm4
+    paddw xmm2, xmm0
+    paddw xmm2, xmm5
+    psllw xmm5, 2
+    paddw xmm2, xmm5
+    movq [r2+2], xmm2
+    movhps [r2+2+8], xmm2
 
-	add r0, r1
-	add r2, r3
-	dec r5
-	jnz .yloop_width_9
-	POP_XMM
-	LOAD_6_PARA_POP
-	ret
+    add r0, r1
+    add r2, r3
+    dec r5
+    jnz .yloop_width_9
+    POP_XMM
+    LOAD_6_PARA_POP
+    ret
 
 
 .width_17:
 .yloop_width_17:
-	movq xmm0, [r0]
-	punpcklbw xmm0, xmm7
-	movq xmm1, [r0+5]
-	punpcklbw xmm1, xmm7
-	movq xmm2, [r0+1]
-	punpcklbw xmm2, xmm7
-	movq xmm3, [r0+4]
-	punpcklbw xmm3, xmm7
-	movq xmm4, [r0+2]
-	punpcklbw xmm4, xmm7
-	movq xmm5, [r0+3]
-	punpcklbw xmm5, xmm7
+    movq xmm0, [r0]
+    punpcklbw xmm0, xmm7
+    movq xmm1, [r0+5]
+    punpcklbw xmm1, xmm7
+    movq xmm2, [r0+1]
+    punpcklbw xmm2, xmm7
+    movq xmm3, [r0+4]
+    punpcklbw xmm3, xmm7
+    movq xmm4, [r0+2]
+    punpcklbw xmm4, xmm7
+    movq xmm5, [r0+3]
+    punpcklbw xmm5, xmm7
 
-	paddw xmm2, xmm3
-	paddw xmm4, xmm5
-	psllw xmm4, 2
-	psubw xmm4, xmm2
-	paddw xmm0, xmm1
-	paddw xmm0, xmm4
-	psllw xmm4, 2
-	paddw xmm0, xmm4
-	movdqa [r2], xmm0
+    paddw xmm2, xmm3
+    paddw xmm4, xmm5
+    psllw xmm4, 2
+    psubw xmm4, xmm2
+    paddw xmm0, xmm1
+    paddw xmm0, xmm4
+    psllw xmm4, 2
+    paddw xmm0, xmm4
+    movdqa [r2], xmm0
 
-	movq xmm0, [r0+8]
-	punpcklbw xmm0, xmm7
-	movq xmm1, [r0+5+8]
-	punpcklbw xmm1, xmm7
-	movq xmm2, [r0+1+8]
-	punpcklbw xmm2, xmm7
-	movq xmm3, [r0+4+8]
-	punpcklbw xmm3, xmm7
-	movq xmm4, [r0+2+8]
-	punpcklbw xmm4, xmm7
-	movq xmm5, [r0+3+8]
-	punpcklbw xmm5, xmm7
+    movq xmm0, [r0+8]
+    punpcklbw xmm0, xmm7
+    movq xmm1, [r0+5+8]
+    punpcklbw xmm1, xmm7
+    movq xmm2, [r0+1+8]
+    punpcklbw xmm2, xmm7
+    movq xmm3, [r0+4+8]
+    punpcklbw xmm3, xmm7
+    movq xmm4, [r0+2+8]
+    punpcklbw xmm4, xmm7
+    movq xmm5, [r0+3+8]
+    punpcklbw xmm5, xmm7
 
-	movdqa xmm7, xmm2
-	paddw   xmm7, xmm3
-	movdqa xmm6, xmm4
-	paddw   xmm6, xmm5
-	psllw xmm6, 2
-	psubw xmm6, xmm7
-	paddw xmm0, xmm1
-	paddw xmm0, xmm6
-	psllw xmm6, 2
-	paddw xmm0, xmm6
-	movd [r2+16], xmm0
+    movdqa xmm7, xmm2
+    paddw   xmm7, xmm3
+    movdqa xmm6, xmm4
+    paddw   xmm6, xmm5
+    psllw xmm6, 2
+    psubw xmm6, xmm7
+    paddw xmm0, xmm1
+    paddw xmm0, xmm6
+    psllw xmm6, 2
+    paddw xmm0, xmm6
+    movd [r2+16], xmm0
 
 
-	pxor  xmm7, xmm7
-	movq xmm0, [r0+6+8]
-	punpcklbw xmm0, xmm7
+    pxor  xmm7, xmm7
+    movq xmm0, [r0+6+8]
+    punpcklbw xmm0, xmm7
 
-	paddw xmm4, xmm1
-	paddw xmm5, xmm3
-	psllw xmm5, 2
-	psubw xmm5, xmm4
-	paddw xmm2, xmm0
-	paddw xmm2, xmm5
-	psllw xmm5, 2
-	paddw xmm2, xmm5
-	movq [r2+18], xmm2
-	movhps [r2+18+8], xmm2
+    paddw xmm4, xmm1
+    paddw xmm5, xmm3
+    psllw xmm5, 2
+    psubw xmm5, xmm4
+    paddw xmm2, xmm0
+    paddw xmm2, xmm5
+    psllw xmm5, 2
+    paddw xmm2, xmm5
+    movq [r2+18], xmm2
+    movhps [r2+18+8], xmm2
 
-	add r0, r1
-	add r2, r3
-	dec r5
-	jnz .yloop_width_17
-	POP_XMM
-	LOAD_6_PARA_POP
-	ret
+    add r0, r1
+    add r2, r3
+    dec r5
+    jnz .yloop_width_17
+    POP_XMM
+    LOAD_6_PARA_POP
+    ret
 
 
 %macro FILTER_VER 9
-	paddw  %1, %6
-	movdqa %7, %2
-	movdqa %8, %3
+    paddw  %1, %6
+    movdqa %7, %2
+    movdqa %8, %3
 
 
-	paddw %7, %5
-	paddw %8, %4
+    paddw %7, %5
+    paddw %8, %4
 
-	psubw  %1, %7
-	psraw   %1, 2
-	paddw  %1, %8
-	psubw  %1, %7
-	psraw   %1, 2
-	paddw  %8, %1
-	paddw  %8, [h264_mc_hc_32]
-	psraw   %8, 6
-	packuswb %8, %8
-	movq %9, %8
+    psubw  %1, %7
+    psraw   %1, 2
+    paddw  %1, %8
+    psubw  %1, %7
+    psraw   %1, 2
+    paddw  %8, %1
+    paddw  %8, [h264_mc_hc_32]
+    psraw   %8, 6
+    packuswb %8, %8
+    movq %9, %8
 %endmacro
 ;***********************************************************************
 ;void McHorVer22Width8VerLastAlign_sse2(
-;											const uint8_t *pTap,
-;											int32_t iTapStride,
-;											uint8_t * pDst,
-;											int32_t iDstStride,
-;											int32_t iWidth,
-;											int32_t iHeight);
+;                                           const uint8_t *pTap,
+;                                           int32_t iTapStride,
+;                                           uint8_t * pDst,
+;                                           int32_t iDstStride,
+;                                           int32_t iWidth,
+;                                           int32_t iHeight);
 ;***********************************************************************
 
 WELS_EXTERN McHorVer22Width8VerLastAlign_sse2
-	%assign  push_num 0
+    %assign  push_num 0
     LOAD_6_PARA
     PUSH_XMM 8
-	SIGN_EXTENSION	r1, r1d
-	SIGN_EXTENSION	r3, r3d
-	SIGN_EXTENSION	r4, r4d
-	SIGN_EXTENSION	r5, r5d
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
+    SIGN_EXTENSION  r4, r4d
+    SIGN_EXTENSION  r5, r5d
 %ifndef X86_32
-	push r12
-	push r13
-	push r14
-	mov  r12, r0
-	mov	 r13, r2
-	mov	 r14, r5
+    push r12
+    push r13
+    push r14
+    mov  r12, r0
+    mov  r13, r2
+    mov  r14, r5
 %endif
 
-	shr r4, 3
+    shr r4, 3
 
 .width_loop:
-	movdqa xmm0, [r0]
-	movdqa xmm1, [r0+r1]
-	lea r0, [r0+2*r1]
-	movdqa xmm2, [r0]
-	movdqa xmm3, [r0+r1]
-	lea r0, [r0+2*r1]
-	movdqa xmm4, [r0]
-	movdqa xmm5, [r0+r1]
+    movdqa xmm0, [r0]
+    movdqa xmm1, [r0+r1]
+    lea r0, [r0+2*r1]
+    movdqa xmm2, [r0]
+    movdqa xmm3, [r0+r1]
+    lea r0, [r0+2*r1]
+    movdqa xmm4, [r0]
+    movdqa xmm5, [r0+r1]
 
-	FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
-	dec r5
-	lea r0, [r0+2*r1]
-	movdqa xmm6, [r0]
+    FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+    dec r5
+    lea r0, [r0+2*r1]
+    movdqa xmm6, [r0]
 
-	movdqa xmm0, xmm1
-	movdqa xmm1, xmm2
-	movdqa xmm2, xmm3
-	movdqa xmm3, xmm4
-	movdqa xmm4, xmm5
-	movdqa xmm5, xmm6
+    movdqa xmm0, xmm1
+    movdqa xmm1, xmm2
+    movdqa xmm2, xmm3
+    movdqa xmm3, xmm4
+    movdqa xmm4, xmm5
+    movdqa xmm5, xmm6
 
-	add r2, r3
-	sub r0, r1
+    add r2, r3
+    sub r0, r1
 
 .start:
-	FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
-	dec r5
-	jz near .x_loop_dec
+    FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+    dec r5
+    jz near .x_loop_dec
 
-	lea r0, [r0+2*r1]
-	movdqa xmm6, [r0]
-	FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[r2+r3]
-	dec r5
-	jz near .x_loop_dec
+    lea r0, [r0+2*r1]
+    movdqa xmm6, [r0]
+    FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[r2+r3]
+    dec r5
+    jz near .x_loop_dec
 
-	lea r2, [r2+2*r3]
-	movdqa xmm7, [r0+r1]
-	FILTER_VER  xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
-	dec r5
-	jz near .x_loop_dec
+    lea r2, [r2+2*r3]
+    movdqa xmm7, [r0+r1]
+    FILTER_VER  xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
+    dec r5
+    jz near .x_loop_dec
 
-	lea r0, [r0+2*r1]
-	movdqa xmm0, [r0]
-	FILTER_VER  xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[r2+r3]
-	dec r5
-	jz near .x_loop_dec
+    lea r0, [r0+2*r1]
+    movdqa xmm0, [r0]
+    FILTER_VER  xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[r2+r3]
+    dec r5
+    jz near .x_loop_dec
 
-	lea r2, [r2+2*r3]
-	movdqa xmm1, [r0+r1]
-	FILTER_VER  xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[r2]
-	dec r5
-	jz near .x_loop_dec
+    lea r2, [r2+2*r3]
+    movdqa xmm1, [r0+r1]
+    FILTER_VER  xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[r2]
+    dec r5
+    jz near .x_loop_dec
 
-	lea r0, [r0+2*r1]
-	movdqa xmm2, [r0]
-	FILTER_VER  xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[r2+r3]
-	dec r5
-	jz near .x_loop_dec
+    lea r0, [r0+2*r1]
+    movdqa xmm2, [r0]
+    FILTER_VER  xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[r2+r3]
+    dec r5
+    jz near .x_loop_dec
 
-	lea r2, [r2+2*r3]
-	movdqa xmm3, [r0+r1]
-	FILTER_VER  xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[r2]
-	dec r5
-	jz near .x_loop_dec
+    lea r2, [r2+2*r3]
+    movdqa xmm3, [r0+r1]
+    FILTER_VER  xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[r2]
+    dec r5
+    jz near .x_loop_dec
 
-	lea r0, [r0+2*r1]
-	movdqa xmm4, [r0]
-	FILTER_VER  xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [r2+r3]
-	dec r5
-	jz near .x_loop_dec
+    lea r0, [r0+2*r1]
+    movdqa xmm4, [r0]
+    FILTER_VER  xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [r2+r3]
+    dec r5
+    jz near .x_loop_dec
 
-	lea r2, [r2+2*r3]
-	movdqa xmm5, [r0+r1]
-	jmp near .start
+    lea r2, [r2+2*r3]
+    movdqa xmm5, [r0+r1]
+    jmp near .start
 
 .x_loop_dec:
-	dec r4
-	jz near .exit
+    dec r4
+    jz near .exit
 %ifdef X86_32
-	mov	r0, arg1
-	mov r2, arg3
-	mov r5, arg6
+    mov r0, arg1
+    mov r2, arg3
+    mov r5, arg6
 %else
-	mov r0, r12
-	mov r2, r13
-	mov r5, r14
+    mov r0, r12
+    mov r2, r13
+    mov r5, r14
 %endif
-	add r0, 16
-	add r2, 8
-	jmp .width_loop
+    add r0, 16
+    add r2, 8
+    jmp .width_loop
 
 .exit:
 %ifndef X86_32
-	pop r14
-	pop r13
-	pop r12
+    pop r14
+    pop r13
+    pop r12
 %endif
-	POP_XMM
-	LOAD_6_PARA_POP
-	ret
+    POP_XMM
+    LOAD_6_PARA_POP
+    ret
 
 ;***********************************************************************
 ;void McHorVer22Width8VerLastUnAlign_sse2(
-;											const uint8_t *pTap,
-;											int32_t iTapStride,
-;											uint8_t * pDst,
-;											int32_t iDstStride,
-;											int32_t iWidth,
-;											int32_t iHeight);
+;                                           const uint8_t *pTap,
+;                                           int32_t iTapStride,
+;                                           uint8_t * pDst,
+;                                           int32_t iDstStride,
+;                                           int32_t iWidth,
+;                                           int32_t iHeight);
 ;***********************************************************************
 
 WELS_EXTERN McHorVer22Width8VerLastUnAlign_sse2
-	%assign  push_num 0
+    %assign  push_num 0
     LOAD_6_PARA
     PUSH_XMM 8
-	SIGN_EXTENSION	r1, r1d
-	SIGN_EXTENSION	r3, r3d
-	SIGN_EXTENSION	r4, r4d
-	SIGN_EXTENSION	r5, r5d
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
+    SIGN_EXTENSION  r4, r4d
+    SIGN_EXTENSION  r5, r5d
 %ifndef X86_32
-	push r12
-	push r13
-	push r14
-	mov  r12, r0
-	mov	 r13, r2
-	mov	 r14, r5
+    push r12
+    push r13
+    push r14
+    mov  r12, r0
+    mov  r13, r2
+    mov  r14, r5
 %endif
-	shr r4, 3
+    shr r4, 3
 
 .width_loop:
-	movdqu xmm0, [r0]
-	movdqu xmm1, [r0+r1]
-	lea r0, [r0+2*r1]
-	movdqu xmm2, [r0]
-	movdqu xmm3, [r0+r1]
-	lea r0, [r0+2*r1]
-	movdqu xmm4, [r0]
-	movdqu xmm5, [r0+r1]
+    movdqu xmm0, [r0]
+    movdqu xmm1, [r0+r1]
+    lea r0, [r0+2*r1]
+    movdqu xmm2, [r0]
+    movdqu xmm3, [r0+r1]
+    lea r0, [r0+2*r1]
+    movdqu xmm4, [r0]
+    movdqu xmm5, [r0+r1]
 
-	FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
-	dec r5
-	lea r0, [r0+2*r1]
-	movdqu xmm6, [r0]
+    FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+    dec r5
+    lea r0, [r0+2*r1]
+    movdqu xmm6, [r0]
 
-	movdqa xmm0, xmm1
-	movdqa xmm1, xmm2
-	movdqa xmm2, xmm3
-	movdqa xmm3, xmm4
-	movdqa xmm4, xmm5
-	movdqa xmm5, xmm6
+    movdqa xmm0, xmm1
+    movdqa xmm1, xmm2
+    movdqa xmm2, xmm3
+    movdqa xmm3, xmm4
+    movdqa xmm4, xmm5
+    movdqa xmm5, xmm6
 
-	add r2, r3
-	sub r0, r1
+    add r2, r3
+    sub r0, r1
 
 .start:
-	FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
-	dec r5
-	jz near .x_loop_dec
+    FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+    dec r5
+    jz near .x_loop_dec
 
-	lea r0, [r0+2*r1]
-	movdqu xmm6, [r0]
-	FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[r2+r3]
-	dec r5
-	jz near .x_loop_dec
+    lea r0, [r0+2*r1]
+    movdqu xmm6, [r0]
+    FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[r2+r3]
+    dec r5
+    jz near .x_loop_dec
 
-	lea r2, [r2+2*r3]
-	movdqu xmm7, [r0+r1]
-	FILTER_VER  xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
-	dec r5
-	jz near .x_loop_dec
+    lea r2, [r2+2*r3]
+    movdqu xmm7, [r0+r1]
+    FILTER_VER  xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
+    dec r5
+    jz near .x_loop_dec
 
-	lea r0, [r0+2*r1]
-	movdqu xmm0, [r0]
-	FILTER_VER  xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[r2+r3]
-	dec r5
-	jz near .x_loop_dec
+    lea r0, [r0+2*r1]
+    movdqu xmm0, [r0]
+    FILTER_VER  xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[r2+r3]
+    dec r5
+    jz near .x_loop_dec
 
-	lea r2, [r2+2*r3]
-	movdqu xmm1, [r0+r1]
-	FILTER_VER  xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[r2]
-	dec r5
-	jz near .x_loop_dec
+    lea r2, [r2+2*r3]
+    movdqu xmm1, [r0+r1]
+    FILTER_VER  xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[r2]
+    dec r5
+    jz near .x_loop_dec
 
-	lea r0, [r0+2*r1]
-	movdqu xmm2, [r0]
-	FILTER_VER  xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[r2+r3]
-	dec r5
-	jz near .x_loop_dec
+    lea r0, [r0+2*r1]
+    movdqu xmm2, [r0]
+    FILTER_VER  xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[r2+r3]
+    dec r5
+    jz near .x_loop_dec
 
-	lea r2, [r2+2*r3]
-	movdqu xmm3, [r0+r1]
-	FILTER_VER  xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[r2]
-	dec r5
-	jz near .x_loop_dec
+    lea r2, [r2+2*r3]
+    movdqu xmm3, [r0+r1]
+    FILTER_VER  xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[r2]
+    dec r5
+    jz near .x_loop_dec
 
-	lea r0, [r0+2*r1]
-	movdqu xmm4, [r0]
-	FILTER_VER  xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [r2+r3]
-	dec r5
-	jz near .x_loop_dec
+    lea r0, [r0+2*r1]
+    movdqu xmm4, [r0]
+    FILTER_VER  xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [r2+r3]
+    dec r5
+    jz near .x_loop_dec
 
-	lea r2, [r2+2*r3]
-	movdqu xmm5, [r0+r1]
-	jmp near .start
+    lea r2, [r2+2*r3]
+    movdqu xmm5, [r0+r1]
+    jmp near .start
 
 .x_loop_dec:
-	dec r4
-	jz near .exit
+    dec r4
+    jz near .exit
 %ifdef X86_32
-	mov	r0, arg1
-	mov r2, arg3
-	mov r5, arg6
+    mov r0, arg1
+    mov r2, arg3
+    mov r5, arg6
 %else
-	mov r0, r12
-	mov r2, r13
-	mov r5, r14
+    mov r0, r12
+    mov r2, r13
+    mov r5, r14
 %endif
-	add r0, 16
-	add r2, 8
-	jmp .width_loop
+    add r0, 16
+    add r2, 8
+    jmp .width_loop
 
 .exit:
 %ifndef X86_32
-	pop r14
-	pop r13
-	pop r12
+    pop r14
+    pop r13
+    pop r12
 %endif
-	POP_XMM
-	LOAD_6_PARA_POP
-	ret
+    POP_XMM
+    LOAD_6_PARA_POP
+    ret
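
The McHorVer22* routines above produce the center half-pel sample in two passes: the *HorFirst variants run the horizontal 6-tap filter without rounding and write 16-bit intermediates into pTap, and the *VerLastAlign/UnAlign variants run the vertical 6-tap filter over those intermediates. FILTER_VER evaluates ((a+f) - 5*(b+e) + 20*(c+d) + 512) >> 10 in stages (two arithmetic >> 2 steps, then +32 and >> 6) so everything stays inside 16-bit SSE2 lanes; the staged form is algebraically equivalent, though its intermediate truncation may round slightly differently than the full-precision expression. Below is a full-precision scalar sketch of the second pass with an illustrative name; the stride here is in int16_t elements, whereas the assembly addresses pTap in bytes, and pTap is assumed to point at the first of the six rows needed for output row 0 (the horizontal pass wrote the extra rows above and below).

    #include <stdint.h>

    /* Second (vertical) pass over the 16-bit taps of the horizontal pass:
     * taps (1,-5,20,20,-5,1), rounding 512, shift 10, clip to 8 bits. */
    static void McHorVer22VerLast_c (const int16_t* pTap, int32_t iTapStride,
                                     uint8_t* pDst, int32_t iDstStride,
                                     int32_t iWidth, int32_t iHeight) {
      for (int32_t y = 0; y < iHeight; y++) {
        for (int32_t x = 0; x < iWidth; x++) {
          const int32_t iOuter  = pTap[x]                  + pTap[x + 5 * iTapStride];
          const int32_t iInner  = pTap[x + iTapStride]     + pTap[x + 4 * iTapStride];
          const int32_t iCenter = pTap[x + 2 * iTapStride] + pTap[x + 3 * iTapStride];
          const int32_t iV = (iOuter - 5 * iInner + 20 * iCenter + 512) >> 10;
          pDst[x] = (uint8_t) (iV < 0 ? 0 : (iV > 255 ? 255 : iV));
        }
        pTap += iTapStride;
        pDst += iDstStride;
      }
    }
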
--- a/codec/common/x86/satd_sad.asm
+++ b/codec/common/x86/satd_sad.asm
@@ -77,77 +77,77 @@
 ;
 ;***********************************************************************
 %macro MMX_DW_1_2REG 2
-      pxor %1, %1
-      pcmpeqw %2, %2
-      psubw %1, %2
+    pxor %1, %1
+    pcmpeqw %2, %2
+    psubw %1, %2
 %endmacro
 
-%macro  SSE2_SumWHorizon1 2
-	movdqa      %2, %1
-	psrldq      %2, 8
-	paddusw     %1, %2
-	movdqa      %2, %1
-	psrldq      %2, 4
-	paddusw     %1, %2
-	movdqa      %2, %1
-	psrldq      %2, 2
-	paddusw     %1, %2
+%macro SSE2_SumWHorizon1 2
+    movdqa      %2, %1
+    psrldq      %2, 8
+    paddusw     %1, %2
+    movdqa      %2, %1
+    psrldq      %2, 4
+    paddusw     %1, %2
+    movdqa      %2, %1
+    psrldq      %2, 2
+    paddusw     %1, %2
 %endmacro
 
 %macro SSE2_HDMTwo4x4 5 ;in: xmm1,xmm2,xmm3,xmm4  pOut: xmm4,xmm2,xmm1,xmm3
-   SSE2_SumSub %1, %2, %5
-   SSE2_SumSub %3, %4, %5
-   SSE2_SumSub %2, %4, %5
-   SSE2_SumSub %1, %3, %5
+    SSE2_SumSub %1, %2, %5
+    SSE2_SumSub %3, %4, %5
+    SSE2_SumSub %2, %4, %5
+    SSE2_SumSub %1, %3, %5
 %endmacro
 
 %macro SSE2_SumAbs4 7
-	WELS_AbsW %1, %3
-	WELS_AbsW %2, %3
-	WELS_AbsW %4, %6
-	WELS_AbsW %5, %6
-	paddusw       %1, %2
-	paddusw       %4, %5
-	paddusw       %7, %1
-	paddusw       %7, %4
+    WELS_AbsW %1, %3
+    WELS_AbsW %2, %3
+    WELS_AbsW %4, %6
+    WELS_AbsW %5, %6
+    paddusw       %1, %2
+    paddusw       %4, %5
+    paddusw       %7, %1
+    paddusw       %7, %4
 %endmacro
 
-%macro  SSE2_SumWHorizon 3
-	movhlps		%2, %1			; x2 = xx xx xx xx d7 d6 d5 d4
-	paddw		%1, %2			; x1 = xx xx xx xx d37 d26 d15 d04
-	punpcklwd	%1, %3			; x1 =  d37  d26 d15 d04
-	movhlps		%2, %1			; x2 = xxxx xxxx d37 d26
-	paddd		%1, %2			; x1 = xxxx xxxx d1357 d0246
-	pshuflw		%2, %1, 0x4e	; x2 = xxxx xxxx d0246 d1357
-	paddd		%1, %2			; x1 = xxxx xxxx xxxx  d01234567
+%macro SSE2_SumWHorizon 3
+    movhlps     %2, %1          ; x2 = xx xx xx xx d7 d6 d5 d4
+    paddw       %1, %2          ; x1 = xx xx xx xx d37 d26 d15 d04
+    punpcklwd   %1, %3          ; x1 =  d37  d26 d15 d04
+    movhlps     %2, %1          ; x2 = xxxx xxxx d37 d26
+    paddd       %1, %2          ; x1 = xxxx xxxx d1357 d0246
+    pshuflw     %2, %1, 0x4e    ; x2 = xxxx xxxx d0246 d1357
+    paddd       %1, %2          ; x1 = xxxx xxxx xxxx  d01234567
 %endmacro
 
 %macro SSE2_GetSatd8x8 0
-	SSE2_LoadDiff8P    xmm0,xmm4,xmm7,[r0],[r2]
-	SSE2_LoadDiff8P    xmm1,xmm5,xmm7,[r0+r1],[r2+r3]
-	lea                 r0, [r0+2*r1]
-	lea                 r2, [r2+2*r3]
-	SSE2_LoadDiff8P    xmm2,xmm4,xmm7,[r0],[r2]
-	SSE2_LoadDiff8P    xmm3,xmm5,xmm7,[r0+r1],[r2+r3]
+    SSE2_LoadDiff8P    xmm0,xmm4,xmm7,[r0],[r2]
+    SSE2_LoadDiff8P    xmm1,xmm5,xmm7,[r0+r1],[r2+r3]
+    lea                 r0, [r0+2*r1]
+    lea                 r2, [r2+2*r3]
+    SSE2_LoadDiff8P    xmm2,xmm4,xmm7,[r0],[r2]
+    SSE2_LoadDiff8P    xmm3,xmm5,xmm7,[r0+r1],[r2+r3]
 
-	SSE2_HDMTwo4x4       xmm0,xmm1,xmm2,xmm3,xmm4
-	SSE2_TransTwo4x4W     xmm3,xmm1,xmm0,xmm2,xmm4
-	SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm4,xmm5
-	SSE2_SumAbs4         xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
+    SSE2_HDMTwo4x4       xmm0,xmm1,xmm2,xmm3,xmm4
+    SSE2_TransTwo4x4W     xmm3,xmm1,xmm0,xmm2,xmm4
+    SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm4,xmm5
+    SSE2_SumAbs4         xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
 
-	lea					r0,    [r0+2*r1]
-    lea					r2,    [r2+2*r3]
-	SSE2_LoadDiff8P    xmm0,xmm4,xmm7,[r0],[r2]
-	SSE2_LoadDiff8P    xmm1,xmm5,xmm7,[r0+r1],[r2+r3]
-	lea                 r0, [r0+2*r1]
-	lea                 r2, [r2+2*r3]
-	SSE2_LoadDiff8P    xmm2,xmm4,xmm7,[r0],[r2]
-	SSE2_LoadDiff8P    xmm3,xmm5,xmm7,[r0+r1],[r2+r3]
+    lea                 r0,    [r0+2*r1]
+    lea                 r2,    [r2+2*r3]
+    SSE2_LoadDiff8P    xmm0,xmm4,xmm7,[r0],[r2]
+    SSE2_LoadDiff8P    xmm1,xmm5,xmm7,[r0+r1],[r2+r3]
+    lea                 r0, [r0+2*r1]
+    lea                 r2, [r2+2*r3]
+    SSE2_LoadDiff8P    xmm2,xmm4,xmm7,[r0],[r2]
+    SSE2_LoadDiff8P    xmm3,xmm5,xmm7,[r0+r1],[r2+r3]
 
-	SSE2_HDMTwo4x4       xmm0,xmm1,xmm2,xmm3,xmm4
-	SSE2_TransTwo4x4W     xmm3,xmm1,xmm0,xmm2,xmm4
-	SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm4,xmm5
-	SSE2_SumAbs4         xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
+    SSE2_HDMTwo4x4       xmm0,xmm1,xmm2,xmm3,xmm4
+    SSE2_TransTwo4x4W     xmm3,xmm1,xmm0,xmm2,xmm4
+    SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm4,xmm5
+    SSE2_SumAbs4         xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
 %endmacro
 
 ;***********************************************************************
@@ -156,11 +156,11 @@
 ;
 ;***********************************************************************
 WELS_EXTERN WelsSampleSatd4x4_sse2
-	%assign  push_num 0
-	LOAD_4_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
+    %assign  push_num 0
+    LOAD_4_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
     movd      xmm0, [r0]
     movd      xmm1, [r0+r1]
     lea       r0 , [r0+2*r1]
@@ -199,14 +199,14 @@
     punpcklwd      xmm0, xmm4
     punpckhwd      xmm4, xmm2
 
-	SSE2_XSawp     dq,  xmm0, xmm4, xmm3
-	SSE2_XSawp     qdq, xmm0, xmm3, xmm5
+    SSE2_XSawp     dq,  xmm0, xmm4, xmm3
+    SSE2_XSawp     qdq, xmm0, xmm3, xmm5
 
     movdqa         xmm7, xmm0
     paddw          xmm0, xmm5
     psubw          xmm7, xmm5
 
-	SSE2_XSawp     qdq,  xmm0, xmm7, xmm1
+    SSE2_XSawp     qdq,  xmm0, xmm7, xmm1
 
     movdqa         xmm2, xmm0
     paddw          xmm0, xmm1
@@ -214,15 +214,15 @@
 
     WELS_AbsW  xmm0, xmm3
     paddusw        xmm6, xmm0
-	WELS_AbsW  xmm2, xmm4
+    WELS_AbsW  xmm2, xmm4
     paddusw        xmm6, xmm2
     SSE2_SumWHorizon1  xmm6, xmm4
-	movd           retrd,  xmm6
+    movd           retrd,  xmm6
     and            retrd,  0xffff
     shr            retrd,  1
-	POP_XMM
-	LOAD_4_PARA_POP
-	ret
+    POP_XMM
+    LOAD_4_PARA_POP
+    ret
 
  ;***********************************************************************
  ;
@@ -230,20 +230,20 @@
  ;
  ;***********************************************************************
 WELS_EXTERN WelsSampleSatd8x8_sse2
-	%assign  push_num 0
-	LOAD_4_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
-	pxor   xmm6,   xmm6
+    %assign  push_num 0
+    LOAD_4_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
+    pxor   xmm6,   xmm6
     pxor   xmm7,   xmm7
     SSE2_GetSatd8x8
     psrlw   xmm6,  1
-	SSE2_SumWHorizon   xmm6,xmm4,xmm7
-	movd    retrd,   xmm6
-	POP_XMM
-	LOAD_4_PARA_POP
-	ret
+    SSE2_SumWHorizon   xmm6,xmm4,xmm7
+    movd    retrd,   xmm6
+    POP_XMM
+    LOAD_4_PARA_POP
+    ret
 
  ;***********************************************************************
  ;
@@ -251,25 +251,25 @@
  ;
  ;***********************************************************************
 WELS_EXTERN WelsSampleSatd8x16_sse2
-	 %assign  push_num 0
-	 LOAD_4_PARA
-	 PUSH_XMM 8
-	 SIGN_EXTENSION r1, r1d
-	 SIGN_EXTENSION r3, r3d
-	 pxor   xmm6,   xmm6
-     pxor   xmm7,   xmm7
+    %assign  push_num 0
+    LOAD_4_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
+    pxor   xmm6,   xmm6
+    pxor   xmm7,   xmm7
 
-	 SSE2_GetSatd8x8
-     lea    r0,    [r0+2*r1]
-     lea    r2,    [r2+2*r3]
-	 SSE2_GetSatd8x8
+    SSE2_GetSatd8x8
+    lea    r0,    [r0+2*r1]
+    lea    r2,    [r2+2*r3]
+    SSE2_GetSatd8x8
 
-	 psrlw   xmm6,  1
-	 SSE2_SumWHorizon   xmm6,xmm4,xmm7
-	 movd    retrd,   xmm6
-	 POP_XMM
-	 LOAD_4_PARA_POP
-	 ret
+    psrlw   xmm6,  1
+    SSE2_SumWHorizon   xmm6,xmm4,xmm7
+    movd    retrd,   xmm6
+    POP_XMM
+    LOAD_4_PARA_POP
+    ret
 
 ;***********************************************************************
 ;
@@ -277,30 +277,30 @@
 ;
 ;***********************************************************************
 WELS_EXTERN WelsSampleSatd16x8_sse2
-	%assign  push_num 0
-	LOAD_4_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
-	push r0
-	push r2
-	pxor   xmm6,   xmm6
+    %assign  push_num 0
+    LOAD_4_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
+    push r0
+    push r2
+    pxor   xmm6,   xmm6
     pxor   xmm7,   xmm7
 
-	SSE2_GetSatd8x8
+    SSE2_GetSatd8x8
 
-	pop r2
-	pop r0
+    pop r2
+    pop r0
     add    r0,    8
     add    r2,    8
-	SSE2_GetSatd8x8
+    SSE2_GetSatd8x8
 
-	psrlw   xmm6,  1
-	SSE2_SumWHorizon   xmm6,xmm4,xmm7
-	movd    retrd,   xmm6
-	POP_XMM
-	LOAD_4_PARA_POP
-	ret
+    psrlw   xmm6,  1
+    SSE2_SumWHorizon   xmm6,xmm4,xmm7
+    movd    retrd,   xmm6
+    POP_XMM
+    LOAD_4_PARA_POP
+    ret
 
 ;***********************************************************************
 ;
@@ -308,38 +308,38 @@
 ;
 ;***********************************************************************
 WELS_EXTERN WelsSampleSatd16x16_sse2
-	%assign  push_num 0
-	LOAD_4_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
-	push r0
-	push r2
-	pxor   xmm6,   xmm6
+    %assign  push_num 0
+    LOAD_4_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
+    push r0
+    push r2
+    pxor   xmm6,   xmm6
     pxor   xmm7,   xmm7
 
-	SSE2_GetSatd8x8
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	SSE2_GetSatd8x8
+    SSE2_GetSatd8x8
+    lea    r0,    [r0+2*r1]
+    lea    r2,    [r2+2*r3]
+    SSE2_GetSatd8x8
 
-	pop r2
-	pop r0
-	add    r0,    8
-	add    r2,    8
+    pop r2
+    pop r0
+    add    r0,    8
+    add    r2,    8
 
-	SSE2_GetSatd8x8
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	SSE2_GetSatd8x8
+    SSE2_GetSatd8x8
+    lea    r0,    [r0+2*r1]
+    lea    r2,    [r2+2*r3]
+    SSE2_GetSatd8x8
 
  ; each column sum of SATD is necessarily even, so we don't lose any precision by shifting first.
     psrlw   xmm6,  1
-	SSE2_SumWHorizon   xmm6,xmm4,xmm7
-	movd    retrd,   xmm6
-	POP_XMM
-	LOAD_4_PARA_POP
-	ret
+    SSE2_SumWHorizon   xmm6,xmm4,xmm7
+    movd    retrd,   xmm6
+    POP_XMM
+    LOAD_4_PARA_POP
+    ret
 
 ;***********************************************************************
 ;
@@ -355,9 +355,9 @@
 
 
 %macro SSE_DB_1_2REG 2
-      pxor %1, %1
-      pcmpeqw %2, %2
-      psubb %1, %2
+    pxor %1, %1
+    pcmpeqw %2, %2
+    psubb %1, %2
 %endmacro
 
 ;***********************************************************************
@@ -369,668 +369,668 @@
 WELS_EXTERN WelsSampleSatdThree4x4_sse2
 
 %ifdef X86_32
-	push r3
-	push r4
-	push r5
-	push r6
-	%assign  push_num 4
+    push r3
+    push r4
+    push r5
+    push r6
+    %assign  push_num 4
 %else
-	%assign  push_num 0
+    %assign  push_num 0
 %endif
-	PUSH_XMM 8
+    PUSH_XMM 8
 
-	mov  r2, arg3
-	mov  r3, arg4
-	SIGN_EXTENSION r3, r3d
+    mov  r2, arg3
+    mov  r3, arg4
+    SIGN_EXTENSION r3, r3d
 
-	; load source 4x4 samples and Hadamard transform
-	movd      xmm0, [r2]
-	movd      xmm1, [r2+r3]
-	lea       r2 , [r2+2*r3]
-	movd      xmm2, [r2]
-	movd      xmm3, [r2+r3]
-	punpckldq xmm0, xmm2
-	punpckldq xmm1, xmm3
+    ; load source 4x4 samples and Hadamard transform
+    movd      xmm0, [r2]
+    movd      xmm1, [r2+r3]
+    lea       r2 , [r2+2*r3]
+    movd      xmm2, [r2]
+    movd      xmm3, [r2+r3]
+    punpckldq xmm0, xmm2
+    punpckldq xmm1, xmm3
 
-	pxor      xmm6, xmm6
-	punpcklbw xmm0, xmm6
-	punpcklbw xmm1, xmm6
+    pxor      xmm6, xmm6
+    punpcklbw xmm0, xmm6
+    punpcklbw xmm1, xmm6
 
-	movdqa    xmm2, xmm0
-	paddw     xmm0, xmm1
-	psubw     xmm2, xmm1
-	SSE2_XSawp  qdq, xmm0, xmm2, xmm3
+    movdqa    xmm2, xmm0
+    paddw     xmm0, xmm1
+    psubw     xmm2, xmm1
+    SSE2_XSawp  qdq, xmm0, xmm2, xmm3
 
-	movdqa    xmm4, xmm0
-	paddw     xmm0, xmm3
-	psubw     xmm4, xmm3
+    movdqa    xmm4, xmm0
+    paddw     xmm0, xmm3
+    psubw     xmm4, xmm3
 
-	movdqa    xmm2, xmm0
-	punpcklwd xmm0, xmm4
-	punpckhwd xmm4, xmm2
+    movdqa    xmm2, xmm0
+    punpcklwd xmm0, xmm4
+    punpckhwd xmm4, xmm2
 
-	SSE2_XSawp  dq,  xmm0, xmm4, xmm3
-	SSE2_XSawp  qdq, xmm0, xmm3, xmm5
+    SSE2_XSawp  dq,  xmm0, xmm4, xmm3
+    SSE2_XSawp  qdq, xmm0, xmm3, xmm5
 
-	movdqa    xmm7, xmm0
-	paddw     xmm0, xmm5
-	psubw     xmm7, xmm5
+    movdqa    xmm7, xmm0
+    paddw     xmm0, xmm5
+    psubw     xmm7, xmm5
 
-	SSE2_XSawp  qdq,  xmm0, xmm7, xmm1
+    SSE2_XSawp  qdq,  xmm0, xmm7, xmm1
 
-	; Hadamard transform results are saved in xmm0 and xmm2
-	movdqa    xmm2, xmm0
-	paddw     xmm0, xmm1
-	psubw     xmm2, xmm1
+    ; Hadamard transform results are saved in xmm0 and xmm2
+    movdqa    xmm2, xmm0
+    paddw     xmm0, xmm1
+    psubw     xmm2, xmm1
 
-	;load top boundary samples: [a b c d]
-	mov r0, arg1
-	mov r1, arg2
-	SIGN_EXTENSION r1, r1d
-	sub r0, r1
+    ;load top boundary samples: [a b c d]
+    mov r0, arg1
+    mov r1, arg2
+    SIGN_EXTENSION r1, r1d
+    sub r0, r1
 %ifdef UNIX64
-	push r4
-	push r5
+    push r4
+    push r5
 %endif
 
-	movzx     r2d,  byte [r0]
-	movzx     r3d,  byte [r0+1]
-	movzx     r4d,  byte [r0+2]
-	movzx     r5d,  byte [r0+3]
+    movzx     r2d,  byte [r0]
+    movzx     r3d,  byte [r0+1]
+    movzx     r4d,  byte [r0+2]
+    movzx     r5d,  byte [r0+3]
 
-	; get the transform results of top boundary samples: [a b c d]
-	add       r3d, r2d ; r3d = a + b
-	add       r5d, r4d ; r5d = c + d
-	add       r2d, r2d ; r2d = a + a
-	add       r4d, r4d ; r4d = c + c
-	sub       r2d, r3d ; r2d = a + a - a - b = a - b
-	sub       r4d, r5d ; r4d = c + c - c - d = c - d
-	add       r5d, r3d ; r5d = (a + b) + (c + d)
-	add       r3d, r3d
-	sub       r3d, r5d ; r3d = (a + b) - (c + d)
-	add       r4d, r2d ; r4d = (a - b) + (c - d)
-	add       r2d, r2d
-	sub       r2d, r4d ; r2d = (a - b) - (c - d) ; [r5d r3d r2d r4d]
+    ; get the transform results of top boundary samples: [a b c d]
+    add       r3d, r2d ; r3d = a + b
+    add       r5d, r4d ; r5d = c + d
+    add       r2d, r2d ; r2d = a + a
+    add       r4d, r4d ; r4d = c + c
+    sub       r2d, r3d ; r2d = a + a - a - b = a - b
+    sub       r4d, r5d ; r4d = c + c - c - d = c - d
+    add       r5d, r3d ; r5d = (a + b) + (c + d)
+    add       r3d, r3d
+    sub       r3d, r5d ; r3d = (a + b) - (c + d)
+    add       r4d, r2d ; r4d = (a - b) + (c - d)
+    add       r2d, r2d
+    sub       r2d, r4d ; r2d = (a - b) - (c - d) ; [r5d r3d r2d r4d]
 
-	movdqa    xmm6, xmm0
-	movdqa    xmm7, xmm2
-	movd      xmm5, r5d ; store the edi for DC mode
-	pxor      xmm3, xmm3
-	pxor      xmm4, xmm4
-	pinsrw    xmm3, r5d, 0
-	pinsrw    xmm3, r4d, 4
-	psllw     xmm3, 2
-	pinsrw    xmm4, r3d, 0
-	pinsrw    xmm4, r2d, 4
-	psllw     xmm4, 2
+    movdqa    xmm6, xmm0
+    movdqa    xmm7, xmm2
+    movd      xmm5, r5d ; store the edi for DC mode
+    pxor      xmm3, xmm3
+    pxor      xmm4, xmm4
+    pinsrw    xmm3, r5d, 0
+    pinsrw    xmm3, r4d, 4
+    psllw     xmm3, 2
+    pinsrw    xmm4, r3d, 0
+    pinsrw    xmm4, r2d, 4
+    psllw     xmm4, 2
 
-	; get the satd of H
-	psubw     xmm0, xmm3
-	psubw     xmm2, xmm4
+    ; get the satd of H
+    psubw     xmm0, xmm3
+    psubw     xmm2, xmm4
 
-	WELS_AbsW  xmm0, xmm1
-	WELS_AbsW  xmm2, xmm1
-	paddusw        xmm0, xmm2
-	SSE2_SumWHorizon1  xmm0, xmm1 ; satd of V is stored in xmm0
+    WELS_AbsW  xmm0, xmm1
+    WELS_AbsW  xmm2, xmm1
+    paddusw        xmm0, xmm2
+    SSE2_SumWHorizon1  xmm0, xmm1 ; satd of V is stored in xmm0
 
-	;load left boundary samples: [a b c d]'
-	add r0, r1
+    ;load left boundary samples: [a b c d]'
+    add r0, r1
 
-	movzx     r2d,  byte [r0-1]
-	movzx     r3d,  byte [r0+r1-1]
-	lea       r0 , [r0+2*r1]
-	movzx     r4d,  byte [r0-1]
-	movzx     r5d,  byte [r0+r1-1]
+    movzx     r2d,  byte [r0-1]
+    movzx     r3d,  byte [r0+r1-1]
+    lea       r0 , [r0+2*r1]
+    movzx     r4d,  byte [r0-1]
+    movzx     r5d,  byte [r0+r1-1]
 
-	; get the transform results of left boundary samples: [a b c d]'
-	add       r3d, r2d ; r3d = a + b
-	add       r5d, r4d ; r5d = c + d
-	add       r2d, r2d ; r2d = a + a
-	add       r4d, r4d ; r4d = c + c
-	sub       r2d, r3d ; r2d = a + a - a - b = a - b
-	sub       r4d, r5d ; r4d = c + c - c - d = c - d
-	add       r5d, r3d ; r5d = (a + b) + (c + d)
-	add       r3d, r3d
-	sub       r3d, r5d ; r3d = (a + b) - (c + d)
-	add       r4d, r2d ; r4d = (a - b) + (c - d)
-	add       r2d, r2d
-	sub       r2d, r4d ; r2d = (a - b) - (c - d) ; [r5d r3d r2d r4d]
+    ; get the transform results of left boundary samples: [a b c d]'
+    add       r3d, r2d ; r3d = a + b
+    add       r5d, r4d ; r5d = c + d
+    add       r2d, r2d ; r2d = a + a
+    add       r4d, r4d ; r4d = c + c
+    sub       r2d, r3d ; r2d = a + a - a - b = a - b
+    sub       r4d, r5d ; r4d = c + c - c - d = c - d
+    add       r5d, r3d ; r5d = (a + b) + (c + d)
+    add       r3d, r3d
+    sub       r3d, r5d ; r3d = (a + b) - (c + d)
+    add       r4d, r2d ; r4d = (a - b) + (c - d)
+    add       r2d, r2d
+    sub       r2d, r4d ; r2d = (a - b) - (c - d) ; [r5d r3d r2d r4d]
 
-	; store the transform results in xmm3
-	movd      xmm3, r5d
-	pinsrw    xmm3, r3d, 1
-	pinsrw    xmm3, r2d, 2
-	pinsrw    xmm3, r4d, 3
-	psllw     xmm3, 2
+    ; store the transform results in xmm3
+    movd      xmm3, r5d
+    pinsrw    xmm3, r3d, 1
+    pinsrw    xmm3, r2d, 2
+    pinsrw    xmm3, r4d, 3
+    psllw     xmm3, 2
 
-	; get the satd of V
-	movdqa    xmm2, xmm6
-	movdqa    xmm4, xmm7
-	psubw     xmm2, xmm3
-	WELS_AbsW  xmm2, xmm1
-	WELS_AbsW  xmm4, xmm1
-	paddusw        xmm2, xmm4
-	SSE2_SumWHorizon1  xmm2, xmm1 ; satd of H is stored in xmm2
+    ; get the satd of V
+    movdqa    xmm2, xmm6
+    movdqa    xmm4, xmm7
+    psubw     xmm2, xmm3
+    WELS_AbsW  xmm2, xmm1
+    WELS_AbsW  xmm4, xmm1
+    paddusw        xmm2, xmm4
+    SSE2_SumWHorizon1  xmm2, xmm1 ; satd of H is stored in xmm2
 
-	; DC result is stored in xmm1
-	add       r5d, 4
-	movd      xmm1, r5d
-	paddw     xmm1, xmm5
-	psrlw     xmm1, 3
-	movdqa    xmm5, xmm1
-	psllw     xmm1, 4
+    ; DC result is stored in xmm1
+    add       r5d, 4
+    movd      xmm1, r5d
+    paddw     xmm1, xmm5
+    psrlw     xmm1, 3
+    movdqa    xmm5, xmm1
+    psllw     xmm1, 4
 
-	; get the satd of DC
-	psubw          xmm6, xmm1
-	WELS_AbsW  xmm6, xmm1
-	WELS_AbsW  xmm7, xmm1
-	paddusw        xmm6, xmm7
-	SSE2_SumWHorizon1  xmm6, xmm1 ; satd of DC is stored in xmm6
+    ; get the satd of DC
+    psubw          xmm6, xmm1
+    WELS_AbsW  xmm6, xmm1
+    WELS_AbsW  xmm7, xmm1
+    paddusw        xmm6, xmm7
+    SSE2_SumWHorizon1  xmm6, xmm1 ; satd of DC is stored in xmm6
 %ifdef UNIX64
-	pop r5
-	pop r4
+    pop r5
+    pop r4
 %endif
-	; comparing order: DC H V
+    ; comparing order: DC H V
 
-	mov  r4, arg5
-	movd      r2d, xmm6
-	movd      r3d, xmm2
-	movd      r6d, xmm0
+    mov  r4, arg5
+    movd      r2d, xmm6
+    movd      r3d, xmm2
+    movd      r6d, xmm0
 
-	and       r2d, 0xffff
-	shr       r2d, 1
-	and       r3d, 0xffff
-	shr       r3d, 1
-	and       r6d, 0xffff
-	shr       r6d, 1
-	add       r2d, dword arg7
-	add       r3d, dword arg8
-	add       r6d, dword arg9
-	cmp       r2w, r3w
-	jg near   not_dc
-	cmp       r2w, r6w
-	jg near   not_dc_h
+    and       r2d, 0xffff
+    shr       r2d, 1
+    and       r3d, 0xffff
+    shr       r3d, 1
+    and       r6d, 0xffff
+    shr       r6d, 1
+    add       r2d, dword arg7
+    add       r3d, dword arg8
+    add       r6d, dword arg9
+    cmp       r2w, r3w
+    jg near   not_dc
+    cmp       r2w, r6w
+    jg near   not_dc_h
 
-	; for DC mode
-	movd      r3d, xmm5
-	imul      r3d, 0x01010101
-	movd	  xmm5, r3d
-	pshufd    xmm5, xmm5, 0
-	movdqa    [r4], xmm5
-	mov r5, arg6
-	mov       dword [r5], 0x02
-	mov retrd, r2d
-	POP_XMM
+    ; for DC mode
+    movd      r3d, xmm5
+    imul      r3d, 0x01010101
+    movd      xmm5, r3d
+    pshufd    xmm5, xmm5, 0
+    movdqa    [r4], xmm5
+    mov r5, arg6
+    mov       dword [r5], 0x02
+    mov retrd, r2d
+    POP_XMM
 %ifdef X86_32
-	pop r6
-	pop r5
-	pop r4
-	pop r3
+    pop r6
+    pop r5
+    pop r4
+    pop r3
 %endif
-	ret
+    ret
 
 not_dc:
-	cmp       r3w, r6w
-	jg near   not_dc_h
+    cmp       r3w, r6w
+    jg near   not_dc_h
 
-	; for H mode
-	SSE_DB_1_2REG  xmm6, xmm7
-	sub        r0, r1
-	sub        r0, r1
-	movzx      r6d,  byte [r0-1]
-	movd       xmm0, r6d
-	pmuludq    xmm0, xmm6
+    ; for H mode
+    SSE_DB_1_2REG  xmm6, xmm7
+    sub        r0, r1
+    sub        r0, r1
+    movzx      r6d,  byte [r0-1]
+    movd       xmm0, r6d
+    pmuludq    xmm0, xmm6
 
-	movzx     r6d,  byte [r0+r1-1]
-	movd      xmm1, r6d
-	pmuludq   xmm1, xmm6
-	punpckldq xmm0, xmm1
+    movzx     r6d,  byte [r0+r1-1]
+    movd      xmm1, r6d
+    pmuludq   xmm1, xmm6
+    punpckldq xmm0, xmm1
 
-	lea       r0,	[r0+r1*2]
-	movzx	  r6d,	byte [r0-1]
-	movd	  xmm2,	r6d
-	pmuludq   xmm2, xmm6
+    lea       r0,   [r0+r1*2]
+    movzx     r6d,  byte [r0-1]
+    movd      xmm2, r6d
+    pmuludq   xmm2, xmm6
 
-	movzx	  r6d,	byte [r0+r1-1]
-	movd	  xmm3,	r6d
-	pmuludq   xmm3, xmm6
-	punpckldq  xmm2, xmm3
-	punpcklqdq xmm0, xmm2
+    movzx     r6d,  byte [r0+r1-1]
+    movd      xmm3, r6d
+    pmuludq   xmm3, xmm6
+    punpckldq  xmm2, xmm3
+    punpcklqdq xmm0, xmm2
 
-	movdqa	  [r4],xmm0
+    movdqa    [r4],xmm0
 
-	mov       retrd, r3d
-	mov r5, arg6
-	mov       dword [r5], 0x01
-	POP_XMM
+    mov       retrd, r3d
+    mov r5, arg6
+    mov       dword [r5], 0x01
+    POP_XMM
 %ifdef X86_32
-	pop r6
-	pop r5
-	pop r4
-	pop r3
+    pop r6
+    pop r5
+    pop r4
+    pop r3
 %endif
-	ret
+    ret
 not_dc_h:
-	sub        r0, r1
-	sub        r0, r1
-	sub        r0, r1
-	movd	  xmm0,	[r0]
-	pshufd	  xmm0,	xmm0, 0
-	movdqa	  [r4],xmm0
-	mov       retrd, r6d
-	mov r5, arg6
-	mov       dword [r5], 0x00
-	POP_XMM
+    sub        r0, r1
+    sub        r0, r1
+    sub        r0, r1
+    movd      xmm0, [r0]
+    pshufd    xmm0, xmm0, 0
+    movdqa    [r4],xmm0
+    mov       retrd, r6d
+    mov r5, arg6
+    mov       dword [r5], 0x00
+    POP_XMM
 %ifdef X86_32
-	pop r6
-	pop r5
-	pop r4
-	pop r3
+    pop r6
+    pop r5
+    pop r4
+    pop r3
 %endif
-	ret
+    ret
 
 
 %macro SSE41_I16x16Get8WSumSub 3 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3
-	pmaddubsw    %1, xmm5
-	movdqa       %2, %1
-	pmaddwd      %1, xmm7
-	pmaddwd      %2, xmm6
-	movdqa       %3, %1
-	punpckldq    %1, %2
-	punpckhdq    %2, %3
-	movdqa       %3, %1
-	punpcklqdq   %1, %2
-	punpckhqdq   %3, %2
-	paddd        xmm4, %1 ;for dc
-	paddd        xmm4, %3 ;for dc
-	packssdw     %1, %3
-	psllw        %1, 2
+    pmaddubsw    %1, xmm5
+    movdqa       %2, %1
+    pmaddwd      %1, xmm7
+    pmaddwd      %2, xmm6
+    movdqa       %3, %1
+    punpckldq    %1, %2
+    punpckhdq    %2, %3
+    movdqa       %3, %1
+    punpcklqdq   %1, %2
+    punpckhqdq   %3, %2
+    paddd        xmm4, %1 ;for dc
+    paddd        xmm4, %3 ;for dc
+    packssdw     %1, %3
+    psllw        %1, 2
 %endmacro
 %macro SSE41_ChromaGet8WSumSub 4 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3 : %4 tempsse2
-	pmaddubsw    %1, xmm5
-	movdqa       %2, %1
-	pmaddwd      %1, xmm7
-	pmaddwd      %2, xmm6
-	movdqa       %3, %1
-	punpckldq    %1, %2
-	punpckhdq    %2, %3
-	movdqa       %3, %1
-	punpcklqdq   %1, %2
-	punpckhqdq   %3, %2
+    pmaddubsw    %1, xmm5
+    movdqa       %2, %1
+    pmaddwd      %1, xmm7
+    pmaddwd      %2, xmm6
+    movdqa       %3, %1
+    punpckldq    %1, %2
+    punpckhdq    %2, %3
+    movdqa       %3, %1
+    punpcklqdq   %1, %2
+    punpckhqdq   %3, %2
 ;    paddd        xmm4, %1 ;for dc
-;	 paddd        xmm4, %3 ;for dc
-	movdqa       %4, %1
-	punpcklqdq   %4, %3
-	packssdw     %1, %3
-	psllw        %1, 2
+;    paddd        xmm4, %3 ;for dc
+    movdqa       %4, %1
+    punpcklqdq   %4, %3
+    packssdw     %1, %3
+    psllw        %1, 2
 %endmacro
 
 %macro SSE41_GetX38x4SatdDec 0
-	pxor        xmm7,   xmm7
-	movq        xmm0,   [r2]
-	movq        xmm1,   [r2+r3]
-	lea         r2,    [r2+2*r3]
-	movq        xmm2,   [r2]
-	movq        xmm3,   [r2+r3]
-	lea         r2,    [r2+2*r3]
-	punpcklbw   xmm0,   xmm7
-	punpcklbw   xmm1,   xmm7
-	punpcklbw   xmm2,   xmm7
-	punpcklbw   xmm3,   xmm7
-	SSE2_HDMTwo4x4       xmm0,xmm1,xmm2,xmm3,xmm7
-	SSE2_TransTwo4x4W     xmm3,xmm1,xmm0,xmm2,xmm7
-	SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm7,xmm0 ;pOut xmm7,xmm1,xmm3,xmm2
-	;doesn't need another transpose
+    pxor        xmm7,   xmm7
+    movq        xmm0,   [r2]
+    movq        xmm1,   [r2+r3]
+    lea         r2,    [r2+2*r3]
+    movq        xmm2,   [r2]
+    movq        xmm3,   [r2+r3]
+    lea         r2,    [r2+2*r3]
+    punpcklbw   xmm0,   xmm7
+    punpcklbw   xmm1,   xmm7
+    punpcklbw   xmm2,   xmm7
+    punpcklbw   xmm3,   xmm7
+    SSE2_HDMTwo4x4       xmm0,xmm1,xmm2,xmm3,xmm7
+    SSE2_TransTwo4x4W     xmm3,xmm1,xmm0,xmm2,xmm7
+    SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm7,xmm0 ;pOut xmm7,xmm1,xmm3,xmm2
+    ;doesn't need another transpose
 %endmacro
 
 %macro SSE41_GetX38x4SatdV 2
-	pxor        xmm0,   xmm0
-	pinsrw      xmm0,   word[r6+%2],   0
-	pinsrw      xmm0,   word[r6+%2+8], 4
-	psubsw      xmm0,   xmm7
-	pabsw       xmm0,   xmm0
-	paddw       xmm4,   xmm0
-	pxor        xmm0,   xmm0
-	pinsrw      xmm0,   word[r6+%2+2],  0
-	pinsrw      xmm0,   word[r6+%2+10], 4
-	psubsw      xmm0,   xmm1
-	pabsw       xmm0,   xmm0
-	paddw       xmm4,   xmm0
-	pxor        xmm0,   xmm0
-	pinsrw      xmm0,   word[r6+%2+4],  0
-	pinsrw      xmm0,   word[r6+%2+12], 4
-	psubsw      xmm0,   xmm3
-	pabsw       xmm0,   xmm0
-	paddw       xmm4,   xmm0
-	pxor        xmm0,   xmm0
-	pinsrw      xmm0,   word[r6+%2+6],  0
-	pinsrw      xmm0,   word[r6+%2+14], 4
-	psubsw      xmm0,   xmm2
-	pabsw       xmm0,   xmm0
-	paddw       xmm4,   xmm0
+    pxor        xmm0,   xmm0
+    pinsrw      xmm0,   word[r6+%2],   0
+    pinsrw      xmm0,   word[r6+%2+8], 4
+    psubsw      xmm0,   xmm7
+    pabsw       xmm0,   xmm0
+    paddw       xmm4,   xmm0
+    pxor        xmm0,   xmm0
+    pinsrw      xmm0,   word[r6+%2+2],  0
+    pinsrw      xmm0,   word[r6+%2+10], 4
+    psubsw      xmm0,   xmm1
+    pabsw       xmm0,   xmm0
+    paddw       xmm4,   xmm0
+    pxor        xmm0,   xmm0
+    pinsrw      xmm0,   word[r6+%2+4],  0
+    pinsrw      xmm0,   word[r6+%2+12], 4
+    psubsw      xmm0,   xmm3
+    pabsw       xmm0,   xmm0
+    paddw       xmm4,   xmm0
+    pxor        xmm0,   xmm0
+    pinsrw      xmm0,   word[r6+%2+6],  0
+    pinsrw      xmm0,   word[r6+%2+14], 4
+    psubsw      xmm0,   xmm2
+    pabsw       xmm0,   xmm0
+    paddw       xmm4,   xmm0
 %endmacro
 %macro SSE41_GetX38x4SatdH  3
-	movq        xmm0,   [r6+%3+8*%1]
-	punpcklqdq  xmm0,   xmm0
-	psubsw      xmm0,   xmm7
-	pabsw       xmm0,   xmm0
-	paddw       xmm5,   xmm0
-	pabsw       xmm1,   xmm1
-	pabsw       xmm2,   xmm2
-	pabsw       xmm3,   xmm3
-	paddw       xmm2,   xmm1;for DC
-	paddw       xmm2,   xmm3;for DC
-	paddw       xmm5,   xmm2
+    movq        xmm0,   [r6+%3+8*%1]
+    punpcklqdq  xmm0,   xmm0
+    psubsw      xmm0,   xmm7
+    pabsw       xmm0,   xmm0
+    paddw       xmm5,   xmm0
+    pabsw       xmm1,   xmm1
+    pabsw       xmm2,   xmm2
+    pabsw       xmm3,   xmm3
+    paddw       xmm2,   xmm1;for DC
+    paddw       xmm2,   xmm3;for DC
+    paddw       xmm5,   xmm2
 %endmacro
 %macro SSE41_I16X16GetX38x4SatdDC 0
-	pxor        xmm0,   xmm0
-	movq2dq     xmm0,   mm4
-	punpcklqdq  xmm0,   xmm0
-	psubsw      xmm0,   xmm7
-	pabsw       xmm0,   xmm0
-	paddw       xmm6,   xmm0
-	paddw       xmm6,   xmm2
+    pxor        xmm0,   xmm0
+    movq2dq     xmm0,   mm4
+    punpcklqdq  xmm0,   xmm0
+    psubsw      xmm0,   xmm7
+    pabsw       xmm0,   xmm0
+    paddw       xmm6,   xmm0
+    paddw       xmm6,   xmm2
 %endmacro
 %macro SSE41_ChromaGetX38x4SatdDC 1
-	shl         %1,     4
-	movdqa      xmm0,   [r6+32+%1]
-	psubsw      xmm0,   xmm7
-	pabsw       xmm0,   xmm0
-	paddw       xmm6,   xmm0
-	paddw       xmm6,   xmm2
+    shl         %1,     4
+    movdqa      xmm0,   [r6+32+%1]
+    psubsw      xmm0,   xmm7
+    pabsw       xmm0,   xmm0
+    paddw       xmm6,   xmm0
+    paddw       xmm6,   xmm2
 %endmacro
 %macro SSE41_I16x16GetX38x4Satd 2
-	SSE41_GetX38x4SatdDec
-	SSE41_GetX38x4SatdV   %1, %2
-	SSE41_GetX38x4SatdH   %1, %2, 32
-	SSE41_I16X16GetX38x4SatdDC
+    SSE41_GetX38x4SatdDec
+    SSE41_GetX38x4SatdV   %1, %2
+    SSE41_GetX38x4SatdH   %1, %2, 32
+    SSE41_I16X16GetX38x4SatdDC
 %endmacro
 %macro SSE41_ChromaGetX38x4Satd 2
-	SSE41_GetX38x4SatdDec
-	SSE41_GetX38x4SatdV   %1, %2
-	SSE41_GetX38x4SatdH   %1, %2, 16
-	SSE41_ChromaGetX38x4SatdDC %1
+    SSE41_GetX38x4SatdDec
+    SSE41_GetX38x4SatdV   %1, %2
+    SSE41_GetX38x4SatdH   %1, %2, 16
+    SSE41_ChromaGetX38x4SatdDC %1
 %endmacro
 %macro SSE41_HSum8W 3
-	pmaddwd     %1, %2
-	movhlps     %3, %1
-	paddd       %1, %3
-	pshuflw     %3, %1,0Eh
-	paddd       %1, %3
+    pmaddwd     %1, %2
+    movhlps     %3, %1
+    paddd       %1, %3
+    pshuflw     %3, %1,0Eh
+    paddd       %1, %3
 %endmacro
 
 WELS_EXTERN WelsIntra16x16Combined3Satd_sse41
-	%assign  push_num 0
-	LOAD_7_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
-	SIGN_EXTENSION r5, r5d
+    %assign  push_num 0
+    LOAD_7_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
+    SIGN_EXTENSION r5, r5d
 
 %ifndef X86_32
-	push r12
-	mov  r12, r2
+    push r12
+    mov  r12, r2
 %endif
 
-	pxor        xmm4,   xmm4
-	movdqa      xmm5,   [HSumSubDB1]
-	movdqa      xmm6,   [HSumSubDW1]
-	movdqa      xmm7,   [PDW1]
-	sub         r0,    r1
-	movdqu		xmm0,   [r0]
-	movhlps		xmm1,   xmm0
-	punpcklqdq  xmm0,   xmm0
-	punpcklqdq  xmm1,   xmm1
-	SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
-	SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
-	movdqa      [r6],  xmm0 ;V
-	movdqa      [r6+16], xmm1
-	add         r0,    r1
-	pinsrb      xmm0,   byte[r0-1], 0
-	pinsrb      xmm0,   byte[r0+r1-1], 1
-	lea         r0,    [r0+2*r1]
-	pinsrb      xmm0,   byte[r0-1],     2
-	pinsrb      xmm0,   byte[r0+r1-1], 3
-	lea         r0,    [r0+2*r1]
-	pinsrb      xmm0,   byte[r0-1],     4
-	pinsrb      xmm0,   byte[r0+r1-1], 5
-	lea         r0,    [r0+2*r1]
-	pinsrb      xmm0,   byte[r0-1],     6
-	pinsrb      xmm0,   byte[r0+r1-1], 7
-	lea         r0,    [r0+2*r1]
-	pinsrb      xmm0,   byte[r0-1],     8
-	pinsrb      xmm0,   byte[r0+r1-1], 9
-	lea         r0,    [r0+2*r1]
-	pinsrb      xmm0,   byte[r0-1],     10
-	pinsrb      xmm0,   byte[r0+r1-1], 11
-	lea         r0,    [r0+2*r1]
-	pinsrb      xmm0,   byte[r0-1],     12
-	pinsrb      xmm0,   byte[r0+r1-1], 13
-	lea         r0,    [r0+2*r1]
-	pinsrb      xmm0,   byte[r0-1],     14
-	pinsrb      xmm0,   byte[r0+r1-1], 15
-	movhlps		xmm1,   xmm0
-	punpcklqdq  xmm0,   xmm0
-	punpcklqdq  xmm1,   xmm1
-	SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
-	SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
-	movdqa      [r6+32], xmm0 ;H
-	movdqa      [r6+48], xmm1
-	movd        r0d,    xmm4 ;dc
-	add         r0d,    16   ;(sum+16)
-	shr         r0d,    5    ;((sum+16)>>5)
-	shl         r0d,    4    ;
-	movd        mm4,    r0d  ; mm4 copy DC
-	pxor        xmm4,   xmm4 ;V
-	pxor        xmm5,   xmm5 ;H
-	pxor        xmm6,   xmm6 ;DC
+    pxor        xmm4,   xmm4
+    movdqa      xmm5,   [HSumSubDB1]
+    movdqa      xmm6,   [HSumSubDW1]
+    movdqa      xmm7,   [PDW1]
+    sub         r0,    r1
+    movdqu      xmm0,   [r0]
+    movhlps     xmm1,   xmm0
+    punpcklqdq  xmm0,   xmm0
+    punpcklqdq  xmm1,   xmm1
+    SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
+    SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
+    movdqa      [r6],  xmm0 ;V
+    movdqa      [r6+16], xmm1
+    add         r0,    r1
+    pinsrb      xmm0,   byte[r0-1], 0
+    pinsrb      xmm0,   byte[r0+r1-1], 1
+    lea         r0,    [r0+2*r1]
+    pinsrb      xmm0,   byte[r0-1],     2
+    pinsrb      xmm0,   byte[r0+r1-1], 3
+    lea         r0,    [r0+2*r1]
+    pinsrb      xmm0,   byte[r0-1],     4
+    pinsrb      xmm0,   byte[r0+r1-1], 5
+    lea         r0,    [r0+2*r1]
+    pinsrb      xmm0,   byte[r0-1],     6
+    pinsrb      xmm0,   byte[r0+r1-1], 7
+    lea         r0,    [r0+2*r1]
+    pinsrb      xmm0,   byte[r0-1],     8
+    pinsrb      xmm0,   byte[r0+r1-1], 9
+    lea         r0,    [r0+2*r1]
+    pinsrb      xmm0,   byte[r0-1],     10
+    pinsrb      xmm0,   byte[r0+r1-1], 11
+    lea         r0,    [r0+2*r1]
+    pinsrb      xmm0,   byte[r0-1],     12
+    pinsrb      xmm0,   byte[r0+r1-1], 13
+    lea         r0,    [r0+2*r1]
+    pinsrb      xmm0,   byte[r0-1],     14
+    pinsrb      xmm0,   byte[r0+r1-1], 15
+    movhlps     xmm1,   xmm0
+    punpcklqdq  xmm0,   xmm0
+    punpcklqdq  xmm1,   xmm1
+    SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
+    SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
+    movdqa      [r6+32], xmm0 ;H
+    movdqa      [r6+48], xmm1
+    movd        r0d,    xmm4 ;dc
+    add         r0d,    16   ;(sum+16)
+    shr         r0d,    5    ;((sum+16)>>5)
+    shl         r0d,    4    ;
+    movd        mm4,    r0d  ; mm4 copy DC
+    pxor        xmm4,   xmm4 ;V
+    pxor        xmm5,   xmm5 ;H
+    pxor        xmm6,   xmm6 ;DC
 %ifdef UNIX64
-	push r4
+    push r4
 %endif
-	mov         r0,    0
-	mov         r4,    0
+    mov         r0,    0
+    mov         r4,    0
 
 .loop16x16_get_satd:
 .loopStart1:
-	SSE41_I16x16GetX38x4Satd r0, r4
-	inc          r0
-	cmp         r0, 4
-	jl          .loopStart1
-	cmp         r4, 16
-	je          .loop16x16_get_satd_end
+    SSE41_I16x16GetX38x4Satd r0, r4
+    inc          r0
+    cmp         r0, 4
+    jl          .loopStart1
+    cmp         r4, 16
+    je          .loop16x16_get_satd_end
 %ifdef X86_32
-	mov r2, arg3
+    mov r2, arg3
 %else
-	mov r2, r12
+    mov r2, r12
 %endif
-	add         r2, 8
-	mov         r0, 0
-	add         r4, 16
-	jmp         .loop16x16_get_satd
+    add         r2, 8
+    mov         r0, 0
+    add         r4, 16
+    jmp         .loop16x16_get_satd
  .loop16x16_get_satd_end:
-	MMX_DW_1_2REG    xmm0, xmm1
-	psrlw       xmm4, 1 ;/2
-	psrlw       xmm5, 1 ;/2
-	psrlw       xmm6, 1 ;/2
-	SSE41_HSum8W     xmm4, xmm0, xmm1
-	SSE41_HSum8W     xmm5, xmm0, xmm1
-	SSE41_HSum8W     xmm6, xmm0, xmm1
+    MMX_DW_1_2REG    xmm0, xmm1
+    psrlw       xmm4, 1 ;/2
+    psrlw       xmm5, 1 ;/2
+    psrlw       xmm6, 1 ;/2
+    SSE41_HSum8W     xmm4, xmm0, xmm1
+    SSE41_HSum8W     xmm5, xmm0, xmm1
+    SSE41_HSum8W     xmm6, xmm0, xmm1
 
 %ifdef UNIX64
-	pop r4
+    pop r4
 %endif
-	; comparing order: DC H V
-	movd      r3d, xmm6 ;DC
-	movd      r1d, xmm5 ;H
-	movd      r0d, xmm4 ;V
+    ; comparing order: DC H V
+    movd      r3d, xmm6 ;DC
+    movd      r1d, xmm5 ;H
+    movd      r0d, xmm4 ;V
 %ifndef X86_32
-	pop r12
+    pop r12
 %endif
-	shl       r5d, 1
-	add       r1d, r5d
-	add       r3d, r5d
-	mov       r4, arg5
-	cmp       r3d, r1d
-	jge near   not_dc_16x16
-	cmp        r3d, r0d
-	jge near   not_dc_h_16x16
+    shl       r5d, 1
+    add       r1d, r5d
+    add       r3d, r5d
+    mov       r4, arg5
+    cmp       r3d, r1d
+    jge near   not_dc_16x16
+    cmp        r3d, r0d
+    jge near   not_dc_h_16x16
 
-	; for DC mode
-	mov       dword[r4], 2;I16_PRED_DC
-	mov       retrd, r3d
-	jmp near return_satd_intra_16x16_x3
+    ; for DC mode
+    mov       dword[r4], 2;I16_PRED_DC
+    mov       retrd, r3d
+    jmp near return_satd_intra_16x16_x3
 not_dc_16x16:
-	; for H mode
-	cmp       r1d, r0d
-	jge near   not_dc_h_16x16
-	mov       dword[r4], 1;I16_PRED_H
-	mov       retrd, r1d
-	jmp near return_satd_intra_16x16_x3
+    ; for H mode
+    cmp       r1d, r0d
+    jge near   not_dc_h_16x16
+    mov       dword[r4], 1;I16_PRED_H
+    mov       retrd, r1d
+    jmp near return_satd_intra_16x16_x3
 not_dc_h_16x16:
-	; for V mode
-	mov       dword[r4], 0;I16_PRED_V
-	mov       retrd, r0d
+    ; for V mode
+    mov       dword[r4], 0;I16_PRED_V
+    mov       retrd, r0d
 return_satd_intra_16x16_x3:
-	WELSEMMS
-	POP_XMM
-	LOAD_7_PARA_POP
+    WELSEMMS
+    POP_XMM
+    LOAD_7_PARA_POP
 ret
 
 %macro SSE41_ChromaGetX38x8Satd 0
-	movdqa      xmm5,   [HSumSubDB1]
-	movdqa      xmm6,   [HSumSubDW1]
-	movdqa      xmm7,   [PDW1]
-	sub         r0,    r1
-	movq		xmm0,   [r0]
-	punpcklqdq  xmm0,   xmm0
-	SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm4
-	movdqa      [r6],  xmm0 ;V
-	add         r0,    r1
-	pinsrb      xmm0,   byte[r0-1], 0
-	pinsrb      xmm0,   byte[r0+r1-1], 1
-	lea         r0,    [r0+2*r1]
-	pinsrb      xmm0,   byte[r0-1],     2
-	pinsrb      xmm0,   byte[r0+r1-1], 3
-	lea         r0,    [r0+2*r1]
-	pinsrb      xmm0,   byte[r0-1],     4
-	pinsrb      xmm0,   byte[r0+r1-1], 5
-	lea         r0,    [r0+2*r1]
-	pinsrb      xmm0,   byte[r0-1],     6
-	pinsrb      xmm0,   byte[r0+r1-1], 7
-	punpcklqdq  xmm0,   xmm0
-	SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm1
-	movdqa      [r6+16], xmm0 ;H
+    movdqa      xmm5,   [HSumSubDB1]
+    movdqa      xmm6,   [HSumSubDW1]
+    movdqa      xmm7,   [PDW1]
+    sub         r0,    r1
+    movq        xmm0,   [r0]
+    punpcklqdq  xmm0,   xmm0
+    SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm4
+    movdqa      [r6],  xmm0 ;V
+    add         r0,    r1
+    pinsrb      xmm0,   byte[r0-1], 0
+    pinsrb      xmm0,   byte[r0+r1-1], 1
+    lea         r0,    [r0+2*r1]
+    pinsrb      xmm0,   byte[r0-1],     2
+    pinsrb      xmm0,   byte[r0+r1-1], 3
+    lea         r0,    [r0+2*r1]
+    pinsrb      xmm0,   byte[r0-1],     4
+    pinsrb      xmm0,   byte[r0+r1-1], 5
+    lea         r0,    [r0+2*r1]
+    pinsrb      xmm0,   byte[r0-1],     6
+    pinsrb      xmm0,   byte[r0+r1-1], 7
+    punpcklqdq  xmm0,   xmm0
+    SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm1
+    movdqa      [r6+16], xmm0 ;H
 ;(sum+2)>>2
-	movdqa      xmm6,   [PDQ2]
-	movdqa      xmm5,   xmm4
-	punpckhqdq  xmm5,   xmm1
-	paddd       xmm5,   xmm6
-	psrld       xmm5,   2
+    movdqa      xmm6,   [PDQ2]
+    movdqa      xmm5,   xmm4
+    punpckhqdq  xmm5,   xmm1
+    paddd       xmm5,   xmm6
+    psrld       xmm5,   2
 ;(sum1+sum2+4)>>3
-	paddd       xmm6,   xmm6
-	paddd       xmm4,   xmm1
-	paddd       xmm4,   xmm6
-	psrld       xmm4,   3
+    paddd       xmm6,   xmm6
+    paddd       xmm4,   xmm1
+    paddd       xmm4,   xmm6
+    psrld       xmm4,   3
 ;satd *16
-	pslld       xmm5,   4
-	pslld       xmm4,   4
+    pslld       xmm5,   4
+    pslld       xmm4,   4
 ;temp satd
-	movdqa      xmm6,   xmm4
-	punpcklqdq  xmm4,   xmm5
-	psllq       xmm4,   32
-	psrlq       xmm4,   32
-	movdqa      [r6+32], xmm4
-	punpckhqdq  xmm5,   xmm6
-	psllq       xmm5,   32
-	psrlq       xmm5,   32
-	movdqa      [r6+48], xmm5
+    movdqa      xmm6,   xmm4
+    punpcklqdq  xmm4,   xmm5
+    psllq       xmm4,   32
+    psrlq       xmm4,   32
+    movdqa      [r6+32], xmm4
+    punpckhqdq  xmm5,   xmm6
+    psllq       xmm5,   32
+    psrlq       xmm5,   32
+    movdqa      [r6+48], xmm5
 
-	pxor        xmm4,   xmm4 ;V
-	pxor        xmm5,   xmm5 ;H
-	pxor        xmm6,   xmm6 ;DC
-	mov         r0,    0
-	SSE41_ChromaGetX38x4Satd r0, 0
-	inc             r0
-	SSE41_ChromaGetX38x4Satd r0, 0
+    pxor        xmm4,   xmm4 ;V
+    pxor        xmm5,   xmm5 ;H
+    pxor        xmm6,   xmm6 ;DC
+    mov         r0,    0
+    SSE41_ChromaGetX38x4Satd r0, 0
+    inc             r0
+    SSE41_ChromaGetX38x4Satd r0, 0
 %endmacro
 
 %macro SSEReg2MMX 3
-	movdq2q     %2, %1
-	movhlps     %1, %1
-	movdq2q     %3, %1
+    movdq2q     %2, %1
+    movhlps     %1, %1
+    movdq2q     %3, %1
 %endmacro
 %macro MMXReg2SSE 4
-	movq2dq     %1, %3
-	movq2dq     %2, %4
-	punpcklqdq  %1, %2
+    movq2dq     %1, %3
+    movq2dq     %2, %4
+    punpcklqdq  %1, %2
 %endmacro
 ;for reduce the code size of WelsIntraChroma8x8Combined3Satd_sse41
 
 WELS_EXTERN WelsIntraChroma8x8Combined3Satd_sse41
-	%assign  push_num 0
-	LOAD_7_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
-	SIGN_EXTENSION r5, r5d
+    %assign  push_num 0
+    LOAD_7_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
+    SIGN_EXTENSION r5, r5d
 loop_chroma_satdx3:
-	SSE41_ChromaGetX38x8Satd
-	SSEReg2MMX  xmm4, mm0,mm1
-	SSEReg2MMX  xmm5, mm2,mm3
-	SSEReg2MMX  xmm6, mm5,mm6
-	mov r0,     arg8
-	mov r2,     arg9
+    SSE41_ChromaGetX38x8Satd
+    SSEReg2MMX  xmm4, mm0,mm1
+    SSEReg2MMX  xmm5, mm2,mm3
+    SSEReg2MMX  xmm6, mm5,mm6
+    mov r0,     arg8
+    mov r2,     arg9
 
-	SSE41_ChromaGetX38x8Satd
+    SSE41_ChromaGetX38x8Satd
 
-	MMXReg2SSE  xmm0, xmm3, mm0, mm1
-	MMXReg2SSE  xmm1, xmm3, mm2, mm3
-	MMXReg2SSE  xmm2, xmm3, mm5, mm6
+    MMXReg2SSE  xmm0, xmm3, mm0, mm1
+    MMXReg2SSE  xmm1, xmm3, mm2, mm3
+    MMXReg2SSE  xmm2, xmm3, mm5, mm6
 
-	paddw       xmm4, xmm0
-	paddw       xmm5, xmm1
-	paddw       xmm6, xmm2
+    paddw       xmm4, xmm0
+    paddw       xmm5, xmm1
+    paddw       xmm6, xmm2
 
-	MMX_DW_1_2REG    xmm0, xmm1
-	psrlw       xmm4, 1 ;/2
-	psrlw       xmm5, 1 ;/2
-	psrlw       xmm6, 1 ;/2
-	SSE41_HSum8W     xmm4, xmm0, xmm1
-	SSE41_HSum8W     xmm5, xmm0, xmm1
-	SSE41_HSum8W     xmm6, xmm0, xmm1
-	; comparing order: DC H V
-	movd      r3d, xmm6 ;DC
-	movd      r1d, xmm5 ;H
-	movd      r0d, xmm4 ;V
+    MMX_DW_1_2REG    xmm0, xmm1
+    psrlw       xmm4, 1 ;/2
+    psrlw       xmm5, 1 ;/2
+    psrlw       xmm6, 1 ;/2
+    SSE41_HSum8W     xmm4, xmm0, xmm1
+    SSE41_HSum8W     xmm5, xmm0, xmm1
+    SSE41_HSum8W     xmm6, xmm0, xmm1
+    ; comparing order: DC H V
+    movd      r3d, xmm6 ;DC
+    movd      r1d, xmm5 ;H
+    movd      r0d, xmm4 ;V
 
 
-	shl       r5d, 1
-	add       r1d, r5d
-	add       r0d, r5d
-	cmp       r3d, r1d
-	jge near   not_dc_8x8
-	cmp        r3d, r0d
-	jge near   not_dc_h_8x8
+    shl       r5d, 1
+    add       r1d, r5d
+    add       r0d, r5d
+    cmp       r3d, r1d
+    jge near   not_dc_8x8
+    cmp        r3d, r0d
+    jge near   not_dc_h_8x8
 
-	; for DC mode
-	mov       dword[r4], 0;I8_PRED_DC
-	mov       retrd, r3d
-	jmp near return_satd_intra_8x8_x3
+    ; for DC mode
+    mov       dword[r4], 0;I8_PRED_DC
+    mov       retrd, r3d
+    jmp near return_satd_intra_8x8_x3
 not_dc_8x8:
-	; for H mode
-	cmp       r1d, r0d
-	jge near   not_dc_h_8x8
-	mov       dword[r4], 1;I8_PRED_H
-	mov       retrd, r1d
-	jmp near return_satd_intra_8x8_x3
+    ; for H mode
+    cmp       r1d, r0d
+    jge near   not_dc_h_8x8
+    mov       dword[r4], 1;I8_PRED_H
+    mov       retrd, r1d
+    jmp near return_satd_intra_8x8_x3
 not_dc_h_8x8:
-	; for V mode
-	mov       dword[r4], 2;I8_PRED_V
-	mov       retrd, r0d
+    ; for V mode
+    mov       dword[r4], 2;I8_PRED_V
+    mov       retrd, r0d
 return_satd_intra_8x8_x3:
-	WELSEMMS
-	POP_XMM
-	LOAD_7_PARA_POP
+    WELSEMMS
+    POP_XMM
+    LOAD_7_PARA_POP
 ret
 
 
@@ -1040,22 +1040,22 @@
 ;
 ;***********************************************************************
 %macro SSSE3_Get16BSadHVDC 2
-  movd        xmm6,%1
-  pshufb      xmm6,xmm1
-  movdqa      %1,  xmm6
-  movdqa      xmm0,%2
-  psadbw      xmm0,xmm7
-  paddw       xmm4,xmm0
-  movdqa      xmm0,%2
-  psadbw      xmm0,xmm5
-  paddw       xmm2,xmm0
-  psadbw      xmm6,%2
-  paddw       xmm3,xmm6
+    movd        xmm6,%1
+    pshufb      xmm6,xmm1
+    movdqa      %1,  xmm6
+    movdqa      xmm0,%2
+    psadbw      xmm0,xmm7
+    paddw       xmm4,xmm0
+    movdqa      xmm0,%2
+    psadbw      xmm0,xmm5
+    paddw       xmm2,xmm0
+    psadbw      xmm6,%2
+    paddw       xmm3,xmm6
 %endmacro
 %macro WelsAddDCValue 4
-  movzx   %2, byte %1
-  mov    %3, %2
-  add     %4, %2
+    movzx   %2, byte %1
+    mov    %3, %2
+    add     %4, %2
 %endmacro
 
 ;***********************************************************************
@@ -1064,138 +1064,138 @@
 ;
 ;***********************************************************************
 WELS_EXTERN WelsIntra16x16Combined3Sad_ssse3
-	%assign  push_num 0
-	LOAD_7_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
-	SIGN_EXTENSION r5, r5d
+    %assign  push_num 0
+    LOAD_7_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
+    SIGN_EXTENSION r5, r5d
 
-	push  r5
-	push  r4
-	push  r3
+    push  r5
+    push  r4
+    push  r3
 
-	sub    r0,    r1
-	movdqa      xmm5,[r0]
-	pxor        xmm0,xmm0
-	psadbw      xmm0,xmm5
-	movhlps     xmm1,xmm0
-	paddw       xmm0,xmm1
-	movd        r5d, xmm0
+    sub    r0,    r1
+    movdqa      xmm5,[r0]
+    pxor        xmm0,xmm0
+    psadbw      xmm0,xmm5
+    movhlps     xmm1,xmm0
+    paddw       xmm0,xmm1
+    movd        r5d, xmm0
 
-	add         r0,r1
-	lea         r3,[r1+2*r1]    ;ebx r3
-	WelsAddDCValue [r0-1     ], r4d, [r6   ], r5d    ; esi r4d, eax r5d
-	WelsAddDCValue [r0-1+r1  ], r4d, [r6+16], r5d
-	WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
-	WelsAddDCValue [r0-1+r3  ], r4d, [r6+48], r5d
-	lea         r0, [r0+4*r1]
-	add         r6, 64
-	WelsAddDCValue [r0-1     ], r4d, [r6   ], r5d
-	WelsAddDCValue [r0-1+r1  ], r4d, [r6+16], r5d
-	WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
-	WelsAddDCValue [r0-1+r3  ], r4d, [r6+48], r5d
-	lea         r0, [r0+4*r1]
-	add         r6, 64
-	WelsAddDCValue [r0-1     ], r4d, [r6   ], r5d
-	WelsAddDCValue [r0-1+r1  ], r4d, [r6+16], r5d
-	WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
-	WelsAddDCValue [r0-1+r3  ], r4d, [r6+48], r5d
-	lea         r0, [r0+4*r1]
-	add         r6, 64
-	WelsAddDCValue [r0-1     ], r4d, [r6   ], r5d
-	WelsAddDCValue [r0-1+r1  ], r4d, [r6+16], r5d
-	WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
-	WelsAddDCValue [r0-1+r3  ], r4d, [r6+48], r5d
-	sub         r6, 192
-	add         r5d,10h
-	shr         r5d,5
-	movd        xmm7,r5d
-	pxor        xmm1,xmm1
-	pshufb      xmm7,xmm1
-	pxor        xmm4,xmm4
-	pxor        xmm3,xmm3
-	pxor        xmm2,xmm2
-	;sad begin
-	pop   r3
-	lea         r4, [r3+2*r3] ;esi r4
-	SSSE3_Get16BSadHVDC [r6], [r2]
-	SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
-	SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
-	SSSE3_Get16BSadHVDC [r6+48], [r2+r4]
-	add         r6, 64
-	lea         r2, [r2+4*r3]
-	SSSE3_Get16BSadHVDC [r6], [r2]
-	SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
-	SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
-	SSSE3_Get16BSadHVDC [r6+48], [r2+r4]
-	add         r6, 64
-	lea         r2, [r2+4*r3]
-	SSSE3_Get16BSadHVDC [r6], [r2]
-	SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
-	SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
-	SSSE3_Get16BSadHVDC [r6+48], [r2+r4]
-	add         r6, 64
-	lea         r2, [r2+4*r3]
-	SSSE3_Get16BSadHVDC [r6], [r2]
-	SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
-	SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
-	SSSE3_Get16BSadHVDC [r6+48], [r2+r4]
+    add         r0,r1
+    lea         r3,[r1+2*r1]    ;ebx r3
+    WelsAddDCValue [r0-1     ], r4d, [r6   ], r5d    ; esi r4d, eax r5d
+    WelsAddDCValue [r0-1+r1  ], r4d, [r6+16], r5d
+    WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
+    WelsAddDCValue [r0-1+r3  ], r4d, [r6+48], r5d
+    lea         r0, [r0+4*r1]
+    add         r6, 64
+    WelsAddDCValue [r0-1     ], r4d, [r6   ], r5d
+    WelsAddDCValue [r0-1+r1  ], r4d, [r6+16], r5d
+    WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
+    WelsAddDCValue [r0-1+r3  ], r4d, [r6+48], r5d
+    lea         r0, [r0+4*r1]
+    add         r6, 64
+    WelsAddDCValue [r0-1     ], r4d, [r6   ], r5d
+    WelsAddDCValue [r0-1+r1  ], r4d, [r6+16], r5d
+    WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
+    WelsAddDCValue [r0-1+r3  ], r4d, [r6+48], r5d
+    lea         r0, [r0+4*r1]
+    add         r6, 64
+    WelsAddDCValue [r0-1     ], r4d, [r6   ], r5d
+    WelsAddDCValue [r0-1+r1  ], r4d, [r6+16], r5d
+    WelsAddDCValue [r0-1+r1*2], r4d, [r6+32], r5d
+    WelsAddDCValue [r0-1+r3  ], r4d, [r6+48], r5d
+    sub         r6, 192
+    add         r5d,10h
+    shr         r5d,5
+    movd        xmm7,r5d
+    pxor        xmm1,xmm1
+    pshufb      xmm7,xmm1
+    pxor        xmm4,xmm4
+    pxor        xmm3,xmm3
+    pxor        xmm2,xmm2
+    ;sad begin
+    pop   r3
+    lea         r4, [r3+2*r3] ;esi r4
+    SSSE3_Get16BSadHVDC [r6], [r2]
+    SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
+    SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
+    SSSE3_Get16BSadHVDC [r6+48], [r2+r4]
+    add         r6, 64
+    lea         r2, [r2+4*r3]
+    SSSE3_Get16BSadHVDC [r6], [r2]
+    SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
+    SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
+    SSSE3_Get16BSadHVDC [r6+48], [r2+r4]
+    add         r6, 64
+    lea         r2, [r2+4*r3]
+    SSSE3_Get16BSadHVDC [r6], [r2]
+    SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
+    SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
+    SSSE3_Get16BSadHVDC [r6+48], [r2+r4]
+    add         r6, 64
+    lea         r2, [r2+4*r3]
+    SSSE3_Get16BSadHVDC [r6], [r2]
+    SSSE3_Get16BSadHVDC [r6+16], [r2+r3]
+    SSSE3_Get16BSadHVDC [r6+32], [r2+2*r3]
+    SSSE3_Get16BSadHVDC [r6+48], [r2+r4]
 
-	pop r4
-	pop r5
-	pslldq      xmm3,4
-	por         xmm3,xmm2
-	movhlps     xmm1,xmm3
-	paddw       xmm3,xmm1
-	movhlps     xmm0,xmm4
-	paddw       xmm4,xmm0
-	; comparing order: DC H V
-	movd        r1d, xmm4 ;DC   ;ebx r1d
-	movd        r0d, xmm3 ;V    ;ecx r0d
-	psrldq      xmm3, 4
-	movd        r2d, xmm3 ;H    ;esi r2d
+    pop r4
+    pop r5
+    pslldq      xmm3,4
+    por         xmm3,xmm2
+    movhlps     xmm1,xmm3
+    paddw       xmm3,xmm1
+    movhlps     xmm0,xmm4
+    paddw       xmm4,xmm0
+    ; comparing order: DC H V
+    movd        r1d, xmm4 ;DC   ;ebx r1d
+    movd        r0d, xmm3 ;V    ;ecx r0d
+    psrldq      xmm3, 4
+    movd        r2d, xmm3 ;H    ;esi r2d
 
-	;mov         eax, [esp+36] ;lamda ;eax r5
-	shl         r5d, 1
-	add         r2d, r5d
-	add         r1d, r5d
-	;mov         edx, [esp+32]  ;edx r4
-	cmp         r1d, r2d
-	jge near   not_dc_16x16_sad
-	cmp        r1d, r0d
-	jge near   not_dc_h_16x16_sad
-	; for DC mode
-	mov       dword[r4], 2;I16_PRED_DC
-	mov       retrd, r1d
-	sub        r6, 192
+    ;mov         eax, [esp+36] ;lamda ;eax r5
+    shl         r5d, 1
+    add         r2d, r5d
+    add         r1d, r5d
+    ;mov         edx, [esp+32]  ;edx r4
+    cmp         r1d, r2d
+    jge near   not_dc_16x16_sad
+    cmp        r1d, r0d
+    jge near   not_dc_h_16x16_sad
+    ; for DC mode
+    mov       dword[r4], 2;I16_PRED_DC
+    mov       retrd, r1d
+    sub        r6, 192
 %assign x 0
 %rep 16
-	movdqa    [r6+16*x], xmm7
+    movdqa    [r6+16*x], xmm7
 %assign x x+1
 %endrep
-	jmp near return_sad_intra_16x16_x3
+    jmp near return_sad_intra_16x16_x3
 not_dc_16x16_sad:
-	; for H mode
-	cmp       r2d, r0d
-	jge near   not_dc_h_16x16_sad
-	mov       dword[r4], 1;I16_PRED_H
-	mov       retrd, r2d
-	jmp near return_sad_intra_16x16_x3
+    ; for H mode
+    cmp       r2d, r0d
+    jge near   not_dc_h_16x16_sad
+    mov       dword[r4], 1;I16_PRED_H
+    mov       retrd, r2d
+    jmp near return_sad_intra_16x16_x3
 not_dc_h_16x16_sad:
-	; for V mode
-	mov       dword[r4], 0;I16_PRED_V
-	mov       retrd, r0d
-	sub       r6, 192
+    ; for V mode
+    mov       dword[r4], 0;I16_PRED_V
+    mov       retrd, r0d
+    sub       r6, 192
 %assign x 0
 %rep 16
-	movdqa    [r6+16*x], xmm5
+    movdqa    [r6+16*x], xmm5
 %assign x x+1
 %endrep
 return_sad_intra_16x16_x3:
-	POP_XMM
-	LOAD_7_PARA_POP
-	ret
+    POP_XMM
+    LOAD_7_PARA_POP
+    ret
 
 ;***********************************************************************
 ;
@@ -1210,63 +1210,63 @@
 
 ;SSE4.1
 %macro SSE41_GetSatd8x4 0
-	movq             xmm0, [r0]
-	punpcklqdq       xmm0, xmm0
-	pmaddubsw        xmm0, xmm7
-	movq             xmm1, [r0+r1]
-	punpcklqdq       xmm1, xmm1
-	pmaddubsw        xmm1, xmm7
-	movq             xmm2, [r2]
-	punpcklqdq       xmm2, xmm2
-	pmaddubsw        xmm2, xmm7
-	movq             xmm3, [r2+r3]
-	punpcklqdq       xmm3, xmm3
-	pmaddubsw        xmm3, xmm7
-	psubsw           xmm0, xmm2
-	psubsw           xmm1, xmm3
-	movq             xmm2, [r0+2*r1]
-	punpcklqdq       xmm2, xmm2
-	pmaddubsw        xmm2, xmm7
-	movq             xmm3, [r0+r4]
-	punpcklqdq       xmm3, xmm3
-	pmaddubsw        xmm3, xmm7
-	movq             xmm4, [r2+2*r3]
-	punpcklqdq       xmm4, xmm4
-	pmaddubsw        xmm4, xmm7
-	movq             xmm5, [r2+r5]
-	punpcklqdq       xmm5, xmm5
-	pmaddubsw        xmm5, xmm7
-	psubsw           xmm2, xmm4
-	psubsw           xmm3, xmm5
-	SSE2_HDMTwo4x4   xmm0, xmm1, xmm2, xmm3, xmm4
-	pabsw            xmm0, xmm0
-	pabsw            xmm2, xmm2
-	pabsw            xmm1, xmm1
-	pabsw            xmm3, xmm3
-	movdqa           xmm4, xmm3
-	pblendw          xmm3, xmm1, 0xAA
-	pslld            xmm1, 16
-	psrld            xmm4, 16
-	por              xmm1, xmm4
-	pmaxuw           xmm1, xmm3
-	paddw            xmm6, xmm1
-	movdqa           xmm4, xmm0
-	pblendw          xmm0, xmm2, 0xAA
-	pslld            xmm2, 16
-	psrld            xmm4, 16
-	por              xmm2, xmm4
-	pmaxuw           xmm0, xmm2
-	paddw            xmm6, xmm0
+    movq             xmm0, [r0]
+    punpcklqdq       xmm0, xmm0
+    pmaddubsw        xmm0, xmm7
+    movq             xmm1, [r0+r1]
+    punpcklqdq       xmm1, xmm1
+    pmaddubsw        xmm1, xmm7
+    movq             xmm2, [r2]
+    punpcklqdq       xmm2, xmm2
+    pmaddubsw        xmm2, xmm7
+    movq             xmm3, [r2+r3]
+    punpcklqdq       xmm3, xmm3
+    pmaddubsw        xmm3, xmm7
+    psubsw           xmm0, xmm2
+    psubsw           xmm1, xmm3
+    movq             xmm2, [r0+2*r1]
+    punpcklqdq       xmm2, xmm2
+    pmaddubsw        xmm2, xmm7
+    movq             xmm3, [r0+r4]
+    punpcklqdq       xmm3, xmm3
+    pmaddubsw        xmm3, xmm7
+    movq             xmm4, [r2+2*r3]
+    punpcklqdq       xmm4, xmm4
+    pmaddubsw        xmm4, xmm7
+    movq             xmm5, [r2+r5]
+    punpcklqdq       xmm5, xmm5
+    pmaddubsw        xmm5, xmm7
+    psubsw           xmm2, xmm4
+    psubsw           xmm3, xmm5
+    SSE2_HDMTwo4x4   xmm0, xmm1, xmm2, xmm3, xmm4
+    pabsw            xmm0, xmm0
+    pabsw            xmm2, xmm2
+    pabsw            xmm1, xmm1
+    pabsw            xmm3, xmm3
+    movdqa           xmm4, xmm3
+    pblendw          xmm3, xmm1, 0xAA
+    pslld            xmm1, 16
+    psrld            xmm4, 16
+    por              xmm1, xmm4
+    pmaxuw           xmm1, xmm3
+    paddw            xmm6, xmm1
+    movdqa           xmm4, xmm0
+    pblendw          xmm0, xmm2, 0xAA
+    pslld            xmm2, 16
+    psrld            xmm4, 16
+    por              xmm2, xmm4
+    pmaxuw           xmm0, xmm2
+    paddw            xmm6, xmm0
 %endmacro
 
 %macro SSSE3_SumWHorizon 4 ;eax, srcSSE, tempSSE, tempSSE
-	MMX_DW_1_2REG    %3, %4
-	pmaddwd     %2, %3
-	movhlps     %4, %2
-	paddd       %2, %4
-	pshuflw     %4, %2,0Eh
-	paddd       %2, %4
-	movd		%1, %2
+    MMX_DW_1_2REG    %3, %4
+    pmaddwd     %2, %3
+    movhlps     %4, %2
+    paddd       %2, %4
+    pshuflw     %4, %2,0Eh
+    paddd       %2, %4
+    movd        %1, %2
 %endmacro
 ;***********************************************************************
 ;
@@ -1274,53 +1274,53 @@
 ;
 ;***********************************************************************
 WELS_EXTERN WelsSampleSatd4x4_sse41
-	%assign  push_num 0
-	LOAD_4_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
-	movdqa      xmm4,[HSwapSumSubDB1]
-	movd        xmm2,[r2]
-	movd        xmm5,[r2+r3]
-	shufps      xmm2,xmm5,0
-	movd        xmm3,[r2+r3*2]
-	lea         r2, [r3*2+r2]
-	movd        xmm5,[r2+r3]
-	shufps      xmm3,xmm5,0
-	movd        xmm0,[r0]
-	movd        xmm5,[r0+r1]
-	shufps      xmm0,xmm5,0
-	movd        xmm1,[r0+r1*2]
-	lea         r0, [r1*2+r0]
-	movd        xmm5,[r0+r1]
-	shufps      xmm1,xmm5,0
-	pmaddubsw   xmm0,xmm4
-	pmaddubsw   xmm1,xmm4
-	pmaddubsw   xmm2,xmm4
-	pmaddubsw   xmm3,xmm4
-	psubw       xmm0,xmm2
-	psubw       xmm1,xmm3
-	movdqa      xmm2,xmm0
-	paddw       xmm0,xmm1
-	psubw       xmm1,xmm2
-	movdqa      xmm2,xmm0
-	punpcklqdq  xmm0,xmm1
-	punpckhqdq  xmm2,xmm1
-	movdqa      xmm1,xmm0
-	paddw       xmm0,xmm2
-	psubw       xmm2,xmm1
-	movdqa      xmm1,xmm0
-	pblendw     xmm0,xmm2,0AAh
-	pslld       xmm2,16
-	psrld       xmm1,16
-	por         xmm2,xmm1
-	pabsw       xmm0,xmm0
-	pabsw       xmm2,xmm2
-	pmaxsw      xmm0,xmm2
-	SSSE3_SumWHorizon retrd, xmm0, xmm5, xmm7
-	POP_XMM
-	LOAD_4_PARA_POP
-	ret
+    %assign  push_num 0
+    LOAD_4_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
+    movdqa      xmm4,[HSwapSumSubDB1]
+    movd        xmm2,[r2]
+    movd        xmm5,[r2+r3]
+    shufps      xmm2,xmm5,0
+    movd        xmm3,[r2+r3*2]
+    lea         r2, [r3*2+r2]
+    movd        xmm5,[r2+r3]
+    shufps      xmm3,xmm5,0
+    movd        xmm0,[r0]
+    movd        xmm5,[r0+r1]
+    shufps      xmm0,xmm5,0
+    movd        xmm1,[r0+r1*2]
+    lea         r0, [r1*2+r0]
+    movd        xmm5,[r0+r1]
+    shufps      xmm1,xmm5,0
+    pmaddubsw   xmm0,xmm4
+    pmaddubsw   xmm1,xmm4
+    pmaddubsw   xmm2,xmm4
+    pmaddubsw   xmm3,xmm4
+    psubw       xmm0,xmm2
+    psubw       xmm1,xmm3
+    movdqa      xmm2,xmm0
+    paddw       xmm0,xmm1
+    psubw       xmm1,xmm2
+    movdqa      xmm2,xmm0
+    punpcklqdq  xmm0,xmm1
+    punpckhqdq  xmm2,xmm1
+    movdqa      xmm1,xmm0
+    paddw       xmm0,xmm2
+    psubw       xmm2,xmm1
+    movdqa      xmm1,xmm0
+    pblendw     xmm0,xmm2,0AAh
+    pslld       xmm2,16
+    psrld       xmm1,16
+    por         xmm2,xmm1
+    pabsw       xmm0,xmm0
+    pabsw       xmm2,xmm2
+    pmaxsw      xmm0,xmm2
+    SSSE3_SumWHorizon retrd, xmm0, xmm5, xmm7
+    POP_XMM
+    LOAD_4_PARA_POP
+    ret
 
 ;***********************************************************************
 ;
@@ -1329,30 +1329,30 @@
 ;***********************************************************************
 WELS_EXTERN WelsSampleSatd8x8_sse41
 %ifdef X86_32
-	push  r4
-	push  r5
+    push  r4
+    push  r5
 %endif
-	%assign  push_num 2
-	LOAD_4_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
-	movdqa      xmm7, [HSumSubDB1]
-	lea         r4,  [r1+r1*2]
-	lea         r5,  [r3+r3*2]
-	pxor		xmm6, xmm6
-	SSE41_GetSatd8x4
-	lea			r0,	 [r0+4*r1]
-	lea			r2,  [r2+4*r3]
-	SSE41_GetSatd8x4
-	SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
-	POP_XMM
-	LOAD_4_PARA_POP
+    %assign  push_num 2
+    LOAD_4_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
+    movdqa      xmm7, [HSumSubDB1]
+    lea         r4,  [r1+r1*2]
+    lea         r5,  [r3+r3*2]
+    pxor        xmm6, xmm6
+    SSE41_GetSatd8x4
+    lea         r0,  [r0+4*r1]
+    lea         r2,  [r2+4*r3]
+    SSE41_GetSatd8x4
+    SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
+    POP_XMM
+    LOAD_4_PARA_POP
 %ifdef X86_32
-	pop  r5
-	pop  r4
+    pop  r5
+    pop  r4
 %endif
-	ret
+    ret
 
 ;***********************************************************************
 ;
@@ -1361,36 +1361,36 @@
 ;***********************************************************************
 WELS_EXTERN WelsSampleSatd8x16_sse41
 %ifdef X86_32
-	push  r4
-	push  r5
-	push  r6
+    push  r4
+    push  r5
+    push  r6
 %endif
-	%assign  push_num 3
-	LOAD_4_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
-	movdqa      xmm7, [HSumSubDB1]
-	lea         r4,  [r1+r1*2]
-	lea         r5,  [r3+r3*2]
-	pxor        xmm6, xmm6
-	mov         r6,    0
+    %assign  push_num 3
+    LOAD_4_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
+    movdqa      xmm7, [HSumSubDB1]
+    lea         r4,  [r1+r1*2]
+    lea         r5,  [r3+r3*2]
+    pxor        xmm6, xmm6
+    mov         r6,    0
 loop_get_satd_8x16:
-	SSE41_GetSatd8x4
-	lea			r0,  [r0+4*r1]
-	lea			r2,  [r2+4*r3]
-	inc         r6
-	cmp         r6,  4
-	jl          loop_get_satd_8x16
-	SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
-	POP_XMM
-	LOAD_4_PARA_POP
+    SSE41_GetSatd8x4
+    lea         r0,  [r0+4*r1]
+    lea         r2,  [r2+4*r3]
+    inc         r6
+    cmp         r6,  4
+    jl          loop_get_satd_8x16
+    SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
+    POP_XMM
+    LOAD_4_PARA_POP
 %ifdef X86_32
-	pop  r6
-	pop  r5
-	pop  r4
+    pop  r6
+    pop  r5
+    pop  r4
 %endif
-	ret
+    ret
 
 ;***********************************************************************
 ;
@@ -1399,42 +1399,42 @@
 ;***********************************************************************
 WELS_EXTERN WelsSampleSatd16x8_sse41
 %ifdef X86_32
-	push  r4
-	push  r5
+    push  r4
+    push  r5
 %endif
-	%assign  push_num 2
-	LOAD_4_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
-	push  r0
-	push  r2
+    %assign  push_num 2
+    LOAD_4_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
+    push  r0
+    push  r2
 
-	movdqa      xmm7, [HSumSubDB1]
-	lea         r4,  [r1+r1*2]
-	lea         r5,  [r3+r3*2]
-	pxor		xmm6,   xmm6
-	SSE41_GetSatd8x4
-	lea			r0,  [r0+4*r1]
-	lea			r2,  [r2+4*r3]
-	SSE41_GetSatd8x4
+    movdqa      xmm7, [HSumSubDB1]
+    lea         r4,  [r1+r1*2]
+    lea         r5,  [r3+r3*2]
+    pxor        xmm6,   xmm6
+    SSE41_GetSatd8x4
+    lea         r0,  [r0+4*r1]
+    lea         r2,  [r2+4*r3]
+    SSE41_GetSatd8x4
 
-	pop  r2
-	pop  r0
-	add			r0,    8
-	add			r2,    8
-	SSE41_GetSatd8x4
-	lea			r0,  [r0+4*r1]
-	lea			r2,  [r2+4*r3]
-	SSE41_GetSatd8x4
-	SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
-	POP_XMM
-	LOAD_4_PARA_POP
+    pop  r2
+    pop  r0
+    add         r0,    8
+    add         r2,    8
+    SSE41_GetSatd8x4
+    lea         r0,  [r0+4*r1]
+    lea         r2,  [r2+4*r3]
+    SSE41_GetSatd8x4
+    SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
+    POP_XMM
+    LOAD_4_PARA_POP
 %ifdef X86_32
-	pop  r5
-	pop  r4
+    pop  r5
+    pop  r4
 %endif
-	ret
+    ret
 
 ;***********************************************************************
 ;
@@ -1444,53 +1444,53 @@
 
 WELS_EXTERN WelsSampleSatd16x16_sse41
 %ifdef X86_32
-	push  r4
-	push  r5
-	push  r6
+    push  r4
+    push  r5
+    push  r6
 %endif
-	%assign  push_num 3
-	LOAD_4_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
+    %assign  push_num 3
+    LOAD_4_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
 
-	push  r0
-	push  r2
+    push  r0
+    push  r2
 
-	movdqa      xmm7, [HSumSubDB1]
-	lea         r4,  [r1+r1*2]
-	lea         r5,  [r3+r3*2]
-	pxor		xmm6,   xmm6
-	mov         r6,    0
+    movdqa      xmm7, [HSumSubDB1]
+    lea         r4,  [r1+r1*2]
+    lea         r5,  [r3+r3*2]
+    pxor        xmm6,   xmm6
+    mov         r6,    0
 loop_get_satd_16x16_left:
-	SSE41_GetSatd8x4
-	lea			r0,  [r0+4*r1]
-	lea			r2,  [r2+4*r3]
-	inc         r6
-	cmp         r6,  4
-	jl          loop_get_satd_16x16_left
+    SSE41_GetSatd8x4
+    lea         r0,  [r0+4*r1]
+    lea         r2,  [r2+4*r3]
+    inc         r6
+    cmp         r6,  4
+    jl          loop_get_satd_16x16_left
 
-	pop  r2
-	pop  r0
-	add			r0,    8
-	add			r2,    8
-	mov         r6,    0
+    pop  r2
+    pop  r0
+    add         r0,    8
+    add         r2,    8
+    mov         r6,    0
 loop_get_satd_16x16_right:
-	SSE41_GetSatd8x4
-	lea			r0,  [r0+4*r1]
-	lea			r2,  [r2+4*r3]
-	inc         r6
-	cmp         r6,  4
-	jl          loop_get_satd_16x16_right
-	SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
-	POP_XMM
-	LOAD_4_PARA_POP
+    SSE41_GetSatd8x4
+    lea         r0,  [r0+4*r1]
+    lea         r2,  [r2+4*r3]
+    inc         r6
+    cmp         r6,  4
+    jl          loop_get_satd_16x16_right
+    SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
+    POP_XMM
+    LOAD_4_PARA_POP
 %ifdef X86_32
-	pop  r6
-	pop  r5
-	pop  r4
+    pop  r6
+    pop  r5
+    pop  r4
 %endif
-	ret
+    ret
 
 ;***********************************************************************
 ;
@@ -1505,55 +1505,55 @@
 ;***********************************************************************
 
 %macro SSE2_GetSad2x16 0
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqu xmm1,   [r2]
-	MOVDQ  xmm2,   [r0];[eax] must aligned 16
-	psadbw xmm1,   xmm2
-	paddw  xmm0,   xmm1
-	movdqu xmm1,   [r2+r3]
-	MOVDQ  xmm2,   [r0+r1]
-	psadbw xmm1,   xmm2
-	paddw  xmm0,   xmm1
+    lea    r0,    [r0+2*r1]
+    lea    r2,    [r2+2*r3]
+    movdqu xmm1,   [r2]
+    MOVDQ  xmm2,   [r0];[eax] must aligned 16
+    psadbw xmm1,   xmm2
+    paddw  xmm0,   xmm1
+    movdqu xmm1,   [r2+r3]
+    MOVDQ  xmm2,   [r0+r1]
+    psadbw xmm1,   xmm2
+    paddw  xmm0,   xmm1
 %endmacro
 
 
 %macro SSE2_GetSad4x16 0
-	movdqu xmm0,   [r2]
-	MOVDQ  xmm2,   [r0]
-	psadbw xmm0,   xmm2
-	paddw  xmm7,   xmm0
-	movdqu xmm1,   [r2+r3]
-	MOVDQ  xmm2,   [r0+r1]
-	psadbw xmm1,   xmm2
-	paddw  xmm7,   xmm1
-	movdqu xmm1,   [r2+2*r3]
-	MOVDQ  xmm2,   [r0+2*r1];[eax] must aligned 16
-	psadbw xmm1,   xmm2
-	paddw  xmm7,   xmm1
-	movdqu xmm1,   [r2+r5]
-	MOVDQ  xmm2,   [r0+r4]
-	psadbw xmm1,   xmm2
-	paddw  xmm7,   xmm1
+    movdqu xmm0,   [r2]
+    MOVDQ  xmm2,   [r0]
+    psadbw xmm0,   xmm2
+    paddw  xmm7,   xmm0
+    movdqu xmm1,   [r2+r3]
+    MOVDQ  xmm2,   [r0+r1]
+    psadbw xmm1,   xmm2
+    paddw  xmm7,   xmm1
+    movdqu xmm1,   [r2+2*r3]
+    MOVDQ  xmm2,   [r0+2*r1];[eax] must aligned 16
+    psadbw xmm1,   xmm2
+    paddw  xmm7,   xmm1
+    movdqu xmm1,   [r2+r5]
+    MOVDQ  xmm2,   [r0+r4]
+    psadbw xmm1,   xmm2
+    paddw  xmm7,   xmm1
 %endmacro
 
 
 %macro SSE2_GetSad8x4 0
-	movq   xmm0,   [r0]
-	movq   xmm1,   [r0+r1]
-	lea    r0,     [r0+2*r1]
-	movhps xmm0,   [r0]
-	movhps xmm1,   [r0+r1]
+    movq   xmm0,   [r0]
+    movq   xmm1,   [r0+r1]
+    lea    r0,     [r0+2*r1]
+    movhps xmm0,   [r0]
+    movhps xmm1,   [r0+r1]
 
-	movq   xmm2,   [r2]
-	movq   xmm3,   [r2+r3]
-	lea    r2,     [r2+2*r3]
-	movhps xmm2,   [r2]
-	movhps xmm3,   [r2+r3]
-	psadbw xmm0,   xmm2
-	psadbw xmm1,   xmm3
-	paddw  xmm6,   xmm0
-	paddw  xmm6,   xmm1
+    movq   xmm2,   [r2]
+    movq   xmm3,   [r2+r3]
+    lea    r2,     [r2+2*r3]
+    movhps xmm2,   [r2]
+    movhps xmm3,   [r2+r3]
+    psadbw xmm0,   xmm2
+    psadbw xmm1,   xmm3
+    paddw  xmm6,   xmm0
+    paddw  xmm6,   xmm1
 %endmacro
 
 ;***********************************************************************
@@ -1565,39 +1565,39 @@
 ;***********************************************************************
 WELS_EXTERN WelsSampleSad16x16_sse2
 %ifdef X86_32
-	push  r4
-	push  r5
+    push  r4
+    push  r5
 %endif
 
-	%assign  push_num 2
-	LOAD_4_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
-	lea r4, [3*r1]
-	lea r5, [3*r3]
+    %assign  push_num 2
+    LOAD_4_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
+    lea r4, [3*r1]
+    lea r5, [3*r3]
 
-	pxor   xmm7,   xmm7
-	SSE2_GetSad4x16
-	lea	   r0,  [r0+4*r1]
-	lea	   r2,  [r2+4*r3]
-	SSE2_GetSad4x16
-	lea	   r0,  [r0+4*r1]
-	lea	   r2,  [r2+4*r3]
-	SSE2_GetSad4x16
-	lea	   r0,  [r0+4*r1]
-	lea	   r2,  [r2+4*r3]
-	SSE2_GetSad4x16
-	movhlps xmm0, xmm7
-	paddw xmm0, xmm7
-	movd retrd, xmm0
-	POP_XMM
-	LOAD_4_PARA_POP
+    pxor   xmm7,   xmm7
+    SSE2_GetSad4x16
+    lea    r0,  [r0+4*r1]
+    lea    r2,  [r2+4*r3]
+    SSE2_GetSad4x16
+    lea    r0,  [r0+4*r1]
+    lea    r2,  [r2+4*r3]
+    SSE2_GetSad4x16
+    lea    r0,  [r0+4*r1]
+    lea    r2,  [r2+4*r3]
+    SSE2_GetSad4x16
+    movhlps xmm0, xmm7
+    paddw xmm0, xmm7
+    movd retrd, xmm0
+    POP_XMM
+    LOAD_4_PARA_POP
 %ifdef X86_32
-	pop  r5
-	pop  r4
+    pop  r5
+    pop  r4
 %endif
-	ret
+    ret
 
 ;***********************************************************************
 ;
@@ -1607,55 +1607,55 @@
 ;
 ;***********************************************************************
 WELS_EXTERN WelsSampleSad16x8_sse2
-	%assign  push_num 0
-	LOAD_4_PARA
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
-	movdqu xmm0,   [r2]
-	MOVDQ  xmm2,   [r0]
-	psadbw xmm0,   xmm2
-	movdqu xmm1,   [r2+r3]
-	MOVDQ  xmm2,   [r0+r1]
-	psadbw xmm1,   xmm2
-	paddw  xmm0,   xmm1
+    %assign  push_num 0
+    LOAD_4_PARA
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
+    movdqu xmm0,   [r2]
+    MOVDQ  xmm2,   [r0]
+    psadbw xmm0,   xmm2
+    movdqu xmm1,   [r2+r3]
+    MOVDQ  xmm2,   [r0+r1]
+    psadbw xmm1,   xmm2
+    paddw  xmm0,   xmm1
 
-	SSE2_GetSad2x16
-	SSE2_GetSad2x16
-	SSE2_GetSad2x16
+    SSE2_GetSad2x16
+    SSE2_GetSad2x16
+    SSE2_GetSad2x16
 
-	movhlps     xmm1, xmm0
-	paddw       xmm0, xmm1
-	movd        retrd,  xmm0
-	LOAD_4_PARA_POP
-	ret
+    movhlps     xmm1, xmm0
+    paddw       xmm0, xmm1
+    movd        retrd,  xmm0
+    LOAD_4_PARA_POP
+    ret
 
 
 
 WELS_EXTERN WelsSampleSad8x16_sse2
-	%assign  push_num 0
-	LOAD_4_PARA
-	PUSH_XMM 7
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
+    %assign  push_num 0
+    LOAD_4_PARA
+    PUSH_XMM 7
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
     pxor   xmm6,   xmm6
 
-	SSE2_GetSad8x4
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
     SSE2_GetSad8x4
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	SSE2_GetSad8x4
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
+    lea    r0,    [r0+2*r1]
+    lea    r2,    [r2+2*r3]
     SSE2_GetSad8x4
+    lea    r0,    [r0+2*r1]
+    lea    r2,    [r2+2*r3]
+    SSE2_GetSad8x4
+    lea    r0,    [r0+2*r1]
+    lea    r2,    [r2+2*r3]
+    SSE2_GetSad8x4
 
     movhlps    xmm0, xmm6
-	paddw      xmm0, xmm6
-	movd       retrd,  xmm0
-	POP_XMM
-	LOAD_4_PARA_POP
-	ret
+    paddw      xmm0, xmm6
+    movd       retrd,  xmm0
+    POP_XMM
+    LOAD_4_PARA_POP
+    ret
 
 
 %macro CACHE_SPLIT_CHECK 3 ; address, width, cacheline
@@ -1664,22 +1664,22 @@
 %endmacro
 
 WELS_EXTERN WelsSampleSad8x8_sse21
-	%assign  push_num 0
-	mov		r2,  arg3
-	push	r2
-	CACHE_SPLIT_CHECK r2, 8, 64
-	jle    near   .pixel_sad_8x8_nsplit
-	pop		r2
+    %assign  push_num 0
+    mov     r2,  arg3
+    push    r2
+    CACHE_SPLIT_CHECK r2, 8, 64
+    jle    near   .pixel_sad_8x8_nsplit
+    pop     r2
 %ifdef X86_32
-	push	r3
-	push	r4
-	push	r5
+    push    r3
+    push    r4
+    push    r5
 %endif
-	%assign  push_num 3
-	PUSH_XMM 8
-	mov		r0,  arg1
-	mov		r1,  arg2
-	SIGN_EXTENSION r1, r1d
+    %assign  push_num 3
+    PUSH_XMM 8
+    mov     r0,  arg1
+    mov     r1,  arg2
+    SIGN_EXTENSION r1, r1d
     pxor   xmm7,   xmm7
 
     ;ecx r2, edx r4, edi r5
@@ -1694,109 +1694,109 @@
     shl    r4,    3
     movd   xmm5,   r5d
     movd   xmm6,   r4d
-	mov    r5,    8
-	add    r5,    r2
+    mov    r5,    8
+    add    r5,    r2
     mov    r3,    arg4
-	SIGN_EXTENSION r3, r3d
+    SIGN_EXTENSION r3, r3d
     movq   xmm0,   [r0]
-	movhps xmm0,   [r0+r1]
+    movhps xmm0,   [r0+r1]
 
-	movq   xmm1,   [r2]
-	movq   xmm2,   [r5]
-	movhps xmm1,   [r2+r3]
-	movhps xmm2,   [r5+r3]
-	psrlq  xmm1,   xmm5
-	psllq  xmm2,   xmm6
-	por    xmm1,   xmm2
+    movq   xmm1,   [r2]
+    movq   xmm2,   [r5]
+    movhps xmm1,   [r2+r3]
+    movhps xmm2,   [r5+r3]
+    psrlq  xmm1,   xmm5
+    psllq  xmm2,   xmm6
+    por    xmm1,   xmm2
 
-	psadbw xmm0,   xmm1
-	paddw  xmm7,   xmm0
+    psadbw xmm0,   xmm1
+    paddw  xmm7,   xmm0
 
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	lea    r5,    [r5+2*r3]
+    lea    r0,    [r0+2*r1]
+    lea    r2,    [r2+2*r3]
+    lea    r5,    [r5+2*r3]
 
     movq   xmm0,   [r0]
-	movhps xmm0,   [r0+r1]
+    movhps xmm0,   [r0+r1]
 
-	movq   xmm1,   [r2]
-	movq   xmm2,   [r5]
-	movhps xmm1,   [r2+r3]
-	movhps xmm2,   [r5+r3]
-	psrlq  xmm1,   xmm5
-	psllq  xmm2,   xmm6
-	por    xmm1,   xmm2
+    movq   xmm1,   [r2]
+    movq   xmm2,   [r5]
+    movhps xmm1,   [r2+r3]
+    movhps xmm2,   [r5+r3]
+    psrlq  xmm1,   xmm5
+    psllq  xmm2,   xmm6
+    por    xmm1,   xmm2
 
-	psadbw xmm0,   xmm1
-	paddw  xmm7,   xmm0
+    psadbw xmm0,   xmm1
+    paddw  xmm7,   xmm0
 
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	lea    r5,    [r5+2*r3]
+    lea    r0,    [r0+2*r1]
+    lea    r2,    [r2+2*r3]
+    lea    r5,    [r5+2*r3]
 
     movq   xmm0,   [r0]
-	movhps xmm0,   [r0+r1]
+    movhps xmm0,   [r0+r1]
 
-	movq   xmm1,   [r2]
-	movq   xmm2,   [r5]
-	movhps xmm1,   [r2+r3]
-	movhps xmm2,   [r5+r3]
-	psrlq  xmm1,   xmm5
-	psllq  xmm2,   xmm6
-	por    xmm1,   xmm2
+    movq   xmm1,   [r2]
+    movq   xmm2,   [r5]
+    movhps xmm1,   [r2+r3]
+    movhps xmm2,   [r5+r3]
+    psrlq  xmm1,   xmm5
+    psllq  xmm2,   xmm6
+    por    xmm1,   xmm2
 
-	psadbw xmm0,   xmm1
-	paddw  xmm7,   xmm0
+    psadbw xmm0,   xmm1
+    paddw  xmm7,   xmm0
 
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	lea    r5,    [r5+2*r3]
+    lea    r0,    [r0+2*r1]
+    lea    r2,    [r2+2*r3]
+    lea    r5,    [r5+2*r3]
 
     movq   xmm0,   [r0]
-	movhps xmm0,   [r0+r1]
+    movhps xmm0,   [r0+r1]
 
-	movq   xmm1,   [r2]
-	movq   xmm2,   [r5]
-	movhps xmm1,   [r2+r3]
-	movhps xmm2,   [r5+r3]
-	psrlq  xmm1,   xmm5
-	psllq  xmm2,   xmm6
-	por    xmm1,   xmm2
+    movq   xmm1,   [r2]
+    movq   xmm2,   [r5]
+    movhps xmm1,   [r2+r3]
+    movhps xmm2,   [r5+r3]
+    psrlq  xmm1,   xmm5
+    psllq  xmm2,   xmm6
+    por    xmm1,   xmm2
 
-	psadbw xmm0,   xmm1
-	paddw  xmm7,   xmm0
+    psadbw xmm0,   xmm1
+    paddw  xmm7,   xmm0
 
     movhlps    xmm0, xmm7
-	paddw      xmm0, xmm7
-	movd       retrd,  xmm0
-	POP_XMM
+    paddw      xmm0, xmm7
+    movd       retrd,  xmm0
+    POP_XMM
 %ifdef X86_32
-	pop	 r5
-	pop	 r4
-	pop	 r3
+    pop  r5
+    pop  r4
+    pop  r3
 %endif
-	jmp        .return
+    jmp        .return
 
 .pixel_sad_8x8_nsplit:
 
-	pop r2
-	%assign  push_num 0
-	LOAD_4_PARA
-	PUSH_XMM 7
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
-	pxor   xmm6,   xmm6
-	SSE2_GetSad8x4
+    pop r2
+    %assign  push_num 0
+    LOAD_4_PARA
+    PUSH_XMM 7
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
+    pxor   xmm6,   xmm6
+    SSE2_GetSad8x4
     lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
+    lea    r2,    [r2+2*r3]
     SSE2_GetSad8x4
     movhlps    xmm0, xmm6
-	paddw      xmm0, xmm6
-	movd       retrd,  xmm0
-	POP_XMM
-	LOAD_4_PARA_POP
+    paddw      xmm0, xmm6
+    movd       retrd,  xmm0
+    POP_XMM
+    LOAD_4_PARA_POP
 .return:
-	ret
+    ret
 
 
 ;***********************************************************************
@@ -1814,624 +1814,624 @@
 
 
 %macro SSE2_Get4LW16Sad 5 ;s-1l, s, s+1l, d, address
-	psadbw %1,   %4
-	paddw  xmm5, %1
-	psadbw %4,   %3
-	paddw  xmm4, %4
-	movdqu %4,   [%5-1]
-	psadbw %4,   %2
-	paddw  xmm6, %4
-	movdqu %4,   [%5+1]
-	psadbw %4,   %2
-	paddw  xmm7, %4
+    psadbw %1,   %4
+    paddw  xmm5, %1
+    psadbw %4,   %3
+    paddw  xmm4, %4
+    movdqu %4,   [%5-1]
+    psadbw %4,   %2
+    paddw  xmm6, %4
+    movdqu %4,   [%5+1]
+    psadbw %4,   %2
+    paddw  xmm7, %4
 %endmacro
 WELS_EXTERN WelsSampleSadFour16x16_sse2
-	%assign  push_num 0
-	LOAD_5_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
-	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
-	pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
-	pxor   xmm6,   xmm6    ;sad pRefMb-1
-	pxor   xmm7,   xmm7    ;sad pRefMb+1
-	movdqa xmm0,   [r0]
-	sub    r2,    r3
-	movdqu xmm3,   [r2]
-	psadbw xmm3,   xmm0
-	paddw  xmm4,   xmm3
+    %assign  push_num 0
+    LOAD_5_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
+    pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
+    pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
+    pxor   xmm6,   xmm6    ;sad pRefMb-1
+    pxor   xmm7,   xmm7    ;sad pRefMb+1
+    movdqa xmm0,   [r0]
+    sub    r2,    r3
+    movdqu xmm3,   [r2]
+    psadbw xmm3,   xmm0
+    paddw  xmm4,   xmm3
 
-	movdqa xmm1,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	psadbw xmm3,   xmm1
-	paddw  xmm4,   xmm3
+    movdqa xmm1,   [r0+r1]
+    movdqu xmm3,   [r2+r3]
+    psadbw xmm3,   xmm1
+    paddw  xmm4,   xmm3
 
-	movdqu xmm2,   [r2+r3-1]
-	psadbw xmm2,   xmm0
-	paddw  xmm6,   xmm2
+    movdqu xmm2,   [r2+r3-1]
+    psadbw xmm2,   xmm0
+    paddw  xmm6,   xmm2
 
-	movdqu xmm3,   [r2+r3+1]
-	psadbw xmm3,   xmm0
-	paddw  xmm7,   xmm3
+    movdqu xmm3,   [r2+r3+1]
+    psadbw xmm3,   xmm0
+    paddw  xmm7,   xmm3
 
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqa xmm2,   [r0]
-	movdqu xmm3,   [r2]
-	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
-	movdqa xmm0,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqa xmm1,   [r0]
-	movdqu xmm3,   [r2]
-	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
-	movdqa xmm2,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqa xmm0,   [r0]
-	movdqu xmm3,   [r2]
-	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
-	movdqa xmm1,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqa xmm2,   [r0]
-	movdqu xmm3,   [r2]
-	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
-	movdqa xmm0,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqa xmm1,   [r0]
-	movdqu xmm3,   [r2]
-	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
-	movdqa xmm2,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqa xmm0,   [r0]
-	movdqu xmm3,   [r2]
-	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
-	movdqa xmm1,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqa xmm2,   [r0]
-	movdqu xmm3,   [r2]
-	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
-	movdqa xmm0,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
-	lea    r2,    [r2+2*r3]
-	movdqu xmm3,   [r2]
-	psadbw xmm2,   xmm3
-	paddw xmm5,   xmm2
+    lea    r0,    [r0+2*r1]
+    lea    r2,    [r2+2*r3]
+    movdqa xmm2,   [r0]
+    movdqu xmm3,   [r2]
+    SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
+    movdqa xmm0,   [r0+r1]
+    movdqu xmm3,   [r2+r3]
+    SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
+    lea    r0,    [r0+2*r1]
+    lea    r2,    [r2+2*r3]
+    movdqa xmm1,   [r0]
+    movdqu xmm3,   [r2]
+    SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
+    movdqa xmm2,   [r0+r1]
+    movdqu xmm3,   [r2+r3]
+    SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
+    lea    r0,    [r0+2*r1]
+    lea    r2,    [r2+2*r3]
+    movdqa xmm0,   [r0]
+    movdqu xmm3,   [r2]
+    SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
+    movdqa xmm1,   [r0+r1]
+    movdqu xmm3,   [r2+r3]
+    SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
+    lea    r0,    [r0+2*r1]
+    lea    r2,    [r2+2*r3]
+    movdqa xmm2,   [r0]
+    movdqu xmm3,   [r2]
+    SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
+    movdqa xmm0,   [r0+r1]
+    movdqu xmm3,   [r2+r3]
+    SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
+    lea    r0,    [r0+2*r1]
+    lea    r2,    [r2+2*r3]
+    movdqa xmm1,   [r0]
+    movdqu xmm3,   [r2]
+    SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
+    movdqa xmm2,   [r0+r1]
+    movdqu xmm3,   [r2+r3]
+    SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
+    lea    r0,    [r0+2*r1]
+    lea    r2,    [r2+2*r3]
+    movdqa xmm0,   [r0]
+    movdqu xmm3,   [r2]
+    SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
+    movdqa xmm1,   [r0+r1]
+    movdqu xmm3,   [r2+r3]
+    SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
+    lea    r0,    [r0+2*r1]
+    lea    r2,    [r2+2*r3]
+    movdqa xmm2,   [r0]
+    movdqu xmm3,   [r2]
+    SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
+    movdqa xmm0,   [r0+r1]
+    movdqu xmm3,   [r2+r3]
+    SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
+    lea    r2,    [r2+2*r3]
+    movdqu xmm3,   [r2]
+    psadbw xmm2,   xmm3
+    paddw xmm5,   xmm2
 
-	movdqu xmm2,   [r2-1]
-	psadbw xmm2,   xmm0
-	paddw xmm6,   xmm2
+    movdqu xmm2,   [r2-1]
+    psadbw xmm2,   xmm0
+    paddw xmm6,   xmm2
 
-	movdqu xmm3,   [r2+1]
-	psadbw xmm3,   xmm0
-	paddw xmm7,   xmm3
+    movdqu xmm3,   [r2+1]
+    psadbw xmm3,   xmm0
+    paddw xmm7,   xmm3
 
-	movdqu xmm3,   [r2+r3]
-	psadbw xmm0,   xmm3
-	paddw xmm5,   xmm0
+    movdqu xmm3,   [r2+r3]
+    psadbw xmm0,   xmm3
+    paddw xmm5,   xmm0
 
-	movhlps    xmm0, xmm4
-	paddw      xmm4, xmm0
-	movhlps    xmm0, xmm5
-	paddw      xmm5, xmm0
-	movhlps    xmm0, xmm6
-	paddw      xmm6, xmm0
-	movhlps    xmm0, xmm7
-	paddw      xmm7, xmm0
-	punpckldq  xmm4, xmm5
-	punpckldq  xmm6, xmm7
-	punpcklqdq xmm4, xmm6
-	movdqa     [r4],xmm4
-	POP_XMM
-	LOAD_5_PARA_POP
-	ret
+    movhlps    xmm0, xmm4
+    paddw      xmm4, xmm0
+    movhlps    xmm0, xmm5
+    paddw      xmm5, xmm0
+    movhlps    xmm0, xmm6
+    paddw      xmm6, xmm0
+    movhlps    xmm0, xmm7
+    paddw      xmm7, xmm0
+    punpckldq  xmm4, xmm5
+    punpckldq  xmm6, xmm7
+    punpcklqdq xmm4, xmm6
+    movdqa     [r4],xmm4
+    POP_XMM
+    LOAD_5_PARA_POP
+    ret
 
 
 WELS_EXTERN WelsSampleSadFour16x8_sse2
-	%assign  push_num 0
-	LOAD_5_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
-	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
-	pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
-	pxor   xmm6,   xmm6    ;sad pRefMb-1
-	pxor   xmm7,   xmm7    ;sad pRefMb+1
-	movdqa xmm0,   [r0]
-	sub    r2,    r3
-	movdqu xmm3,   [r2]
-	psadbw xmm3,   xmm0
-	paddw xmm4,   xmm3
+    %assign  push_num 0
+    LOAD_5_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
+    pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
+    pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
+    pxor   xmm6,   xmm6    ;sad pRefMb-1
+    pxor   xmm7,   xmm7    ;sad pRefMb+1
+    movdqa xmm0,   [r0]
+    sub    r2,    r3
+    movdqu xmm3,   [r2]
+    psadbw xmm3,   xmm0
+    paddw xmm4,   xmm3
 
-	movdqa xmm1,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	psadbw xmm3,   xmm1
-	paddw xmm4,   xmm3
+    movdqa xmm1,   [r0+r1]
+    movdqu xmm3,   [r2+r3]
+    psadbw xmm3,   xmm1
+    paddw xmm4,   xmm3
 
-	movdqu xmm2,   [r2+r3-1]
-	psadbw xmm2,   xmm0
-	paddw xmm6,   xmm2
+    movdqu xmm2,   [r2+r3-1]
+    psadbw xmm2,   xmm0
+    paddw xmm6,   xmm2
 
-	movdqu xmm3,   [r2+r3+1]
-	psadbw xmm3,   xmm0
-	paddw xmm7,   xmm3
+    movdqu xmm3,   [r2+r3+1]
+    psadbw xmm3,   xmm0
+    paddw xmm7,   xmm3
 
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqa xmm2,   [r0]
-	movdqu xmm3,   [r2]
-	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
-	movdqa xmm0,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqa xmm1,   [r0]
-	movdqu xmm3,   [r2]
-	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
-	movdqa xmm2,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqa xmm0,   [r0]
-	movdqu xmm3,   [r2]
-	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
-	movdqa xmm1,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
-	lea    r2,    [r2+2*r3]
-	movdqu xmm3,   [r2]
-	psadbw xmm0,   xmm3
-	paddw xmm5,   xmm0
+    lea    r0,    [r0+2*r1]
+    lea    r2,    [r2+2*r3]
+    movdqa xmm2,   [r0]
+    movdqu xmm3,   [r2]
+    SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
+    movdqa xmm0,   [r0+r1]
+    movdqu xmm3,   [r2+r3]
+    SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
+    lea    r0,    [r0+2*r1]
+    lea    r2,    [r2+2*r3]
+    movdqa xmm1,   [r0]
+    movdqu xmm3,   [r2]
+    SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
+    movdqa xmm2,   [r0+r1]
+    movdqu xmm3,   [r2+r3]
+    SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
+    lea    r0,    [r0+2*r1]
+    lea    r2,    [r2+2*r3]
+    movdqa xmm0,   [r0]
+    movdqu xmm3,   [r2]
+    SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
+    movdqa xmm1,   [r0+r1]
+    movdqu xmm3,   [r2+r3]
+    SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
+    lea    r2,    [r2+2*r3]
+    movdqu xmm3,   [r2]
+    psadbw xmm0,   xmm3
+    paddw xmm5,   xmm0
 
-	movdqu xmm0,   [r2-1]
-	psadbw xmm0,   xmm1
-	paddw xmm6,   xmm0
+    movdqu xmm0,   [r2-1]
+    psadbw xmm0,   xmm1
+    paddw xmm6,   xmm0
 
-	movdqu xmm3,   [r2+1]
-	psadbw xmm3,   xmm1
-	paddw xmm7,   xmm3
+    movdqu xmm3,   [r2+1]
+    psadbw xmm3,   xmm1
+    paddw xmm7,   xmm3
 
-	movdqu xmm3,   [r2+r3]
-	psadbw xmm1,   xmm3
-	paddw xmm5,   xmm1
+    movdqu xmm3,   [r2+r3]
+    psadbw xmm1,   xmm3
+    paddw xmm5,   xmm1
 
-	movhlps    xmm0, xmm4
-	paddw      xmm4, xmm0
-	movhlps    xmm0, xmm5
-	paddw      xmm5, xmm0
-	movhlps    xmm0, xmm6
-	paddw      xmm6, xmm0
-	movhlps    xmm0, xmm7
-	paddw      xmm7, xmm0
-	punpckldq  xmm4, xmm5
-	punpckldq  xmm6, xmm7
-	punpcklqdq xmm4, xmm6
-	movdqa     [r4],xmm4
-	POP_XMM
-	LOAD_5_PARA_POP
-	ret
+    movhlps    xmm0, xmm4
+    paddw      xmm4, xmm0
+    movhlps    xmm0, xmm5
+    paddw      xmm5, xmm0
+    movhlps    xmm0, xmm6
+    paddw      xmm6, xmm0
+    movhlps    xmm0, xmm7
+    paddw      xmm7, xmm0
+    punpckldq  xmm4, xmm5
+    punpckldq  xmm6, xmm7
+    punpcklqdq xmm4, xmm6
+    movdqa     [r4],xmm4
+    POP_XMM
+    LOAD_5_PARA_POP
+    ret
 
 WELS_EXTERN WelsSampleSadFour8x16_sse2
-	%assign  push_num 0
-	LOAD_5_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
-	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
-	pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
-	pxor   xmm6,   xmm6    ;sad pRefMb-1
-	pxor   xmm7,   xmm7    ;sad pRefMb+1
-	movq   xmm0,   [r0]
-	movhps xmm0,   [r0+r1]
-	sub    r2,    r3
-	movq   xmm3,   [r2]
-	movhps xmm3,   [r2+r3]
-	psadbw xmm3,   xmm0
-	paddw  xmm4,   xmm3
+    %assign  push_num 0
+    LOAD_5_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
+    pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
+    pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
+    pxor   xmm6,   xmm6    ;sad pRefMb-1
+    pxor   xmm7,   xmm7    ;sad pRefMb+1
+    movq   xmm0,   [r0]
+    movhps xmm0,   [r0+r1]
+    sub    r2,    r3
+    movq   xmm3,   [r2]
+    movhps xmm3,   [r2+r3]
+    psadbw xmm3,   xmm0
+    paddw  xmm4,   xmm3
 
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
+    movq   xmm1,  [r2+r3-1]
+    movq   xmm3,  [r2+r3+1]
 
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
+    lea    r0,    [r0+2*r1]
+    lea    r2,    [r2+2*r3]
+    movhps xmm1,  [r2-1]
+    movhps xmm3,  [r2+1]
+    psadbw xmm1,  xmm0
+    paddw  xmm6,  xmm1
+    psadbw xmm3,  xmm0
+    paddw  xmm7,  xmm3
 
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
+    movq   xmm3,  [r2]
+    movhps xmm3,  [r2+r3]
+    psadbw xmm0,  xmm3
+    paddw  xmm5,  xmm0
 
-	movq   xmm0,  [r0]
-	movhps xmm0,  [r0+r1]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
+    movq   xmm0,  [r0]
+    movhps xmm0,  [r0+r1]
+    psadbw xmm3,  xmm0
+    paddw  xmm4,  xmm3
 
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
+    movq   xmm1,  [r2+r3-1]
+    movq   xmm3,  [r2+r3+1]
 
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
+    lea    r0,    [r0+2*r1]
+    lea    r2,    [r2+2*r3]
+    movhps xmm1,  [r2-1]
+    movhps xmm3,  [r2+1]
 
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
+    psadbw xmm1,  xmm0
+    paddw  xmm6,  xmm1
+    psadbw xmm3,  xmm0
+    paddw  xmm7,  xmm3
 
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
+    movq   xmm3,  [r2]
+    movhps xmm3,  [r2+r3]
+    psadbw xmm0,  xmm3
+    paddw  xmm5,  xmm0
 
-	movq   xmm0,  [r0]
-	movhps xmm0,  [r0+r1]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
+    movq   xmm0,  [r0]
+    movhps xmm0,  [r0+r1]
+    psadbw xmm3,  xmm0
+    paddw  xmm4,  xmm3
 
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
+    movq   xmm1,  [r2+r3-1]
+    movq   xmm3,  [r2+r3+1]
 
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
+    lea    r0,    [r0+2*r1]
+    lea    r2,    [r2+2*r3]
+    movhps xmm1,  [r2-1]
+    movhps xmm3,  [r2+1]
 
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
+    psadbw xmm1,  xmm0
+    paddw  xmm6,  xmm1
+    psadbw xmm3,  xmm0
+    paddw  xmm7,  xmm3
 
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
+    movq   xmm3,  [r2]
+    movhps xmm3,  [r2+r3]
+    psadbw xmm0,  xmm3
+    paddw  xmm5,  xmm0
 
-	movq   xmm0,  [r0]
-	movhps xmm0,  [r0+r1]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
+    movq   xmm0,  [r0]
+    movhps xmm0,  [r0+r1]
+    psadbw xmm3,  xmm0
+    paddw  xmm4,  xmm3
 
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
+    movq   xmm1,  [r2+r3-1]
+    movq   xmm3,  [r2+r3+1]
 
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
+    lea    r0,    [r0+2*r1]
+    lea    r2,    [r2+2*r3]
+    movhps xmm1,  [r2-1]
+    movhps xmm3,  [r2+1]
 
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
+    psadbw xmm1,  xmm0
+    paddw  xmm6,  xmm1
+    psadbw xmm3,  xmm0
+    paddw  xmm7,  xmm3
 
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
+    movq   xmm3,  [r2]
+    movhps xmm3,  [r2+r3]
+    psadbw xmm0,  xmm3
+    paddw  xmm5,  xmm0
 
-	movq   xmm0,  [r0]
-	movhps xmm0,  [r0+r1]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
+    movq   xmm0,  [r0]
+    movhps xmm0,  [r0+r1]
+    psadbw xmm3,  xmm0
+    paddw  xmm4,  xmm3
 
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
+    movq   xmm1,  [r2+r3-1]
+    movq   xmm3,  [r2+r3+1]
 
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
+    lea    r0,    [r0+2*r1]
+    lea    r2,    [r2+2*r3]
+    movhps xmm1,  [r2-1]
+    movhps xmm3,  [r2+1]
 
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
+    psadbw xmm1,  xmm0
+    paddw  xmm6,  xmm1
+    psadbw xmm3,  xmm0
+    paddw  xmm7,  xmm3
 
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
+    movq   xmm3,  [r2]
+    movhps xmm3,  [r2+r3]
+    psadbw xmm0,  xmm3
+    paddw  xmm5,  xmm0
 
-	movq   xmm0,  [r0]
-	movhps xmm0,  [r0+r1]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
+    movq   xmm0,  [r0]
+    movhps xmm0,  [r0+r1]
+    psadbw xmm3,  xmm0
+    paddw  xmm4,  xmm3
 
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
+    movq   xmm1,  [r2+r3-1]
+    movq   xmm3,  [r2+r3+1]
 
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
+    lea    r0,    [r0+2*r1]
+    lea    r2,    [r2+2*r3]
+    movhps xmm1,  [r2-1]
+    movhps xmm3,  [r2+1]
 
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
+    psadbw xmm1,  xmm0
+    paddw  xmm6,  xmm1
+    psadbw xmm3,  xmm0
+    paddw  xmm7,  xmm3
 
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
+    movq   xmm3,  [r2]
+    movhps xmm3,  [r2+r3]
+    psadbw xmm0,  xmm3
+    paddw  xmm5,  xmm0
 
-	movq   xmm0,  [r0]
-	movhps xmm0,  [r0+r1]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
+    movq   xmm0,  [r0]
+    movhps xmm0,  [r0+r1]
+    psadbw xmm3,  xmm0
+    paddw  xmm4,  xmm3
 
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
+    movq   xmm1,  [r2+r3-1]
+    movq   xmm3,  [r2+r3+1]
 
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
+    lea    r0,    [r0+2*r1]
+    lea    r2,    [r2+2*r3]
+    movhps xmm1,  [r2-1]
+    movhps xmm3,  [r2+1]
 
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
+    psadbw xmm1,  xmm0
+    paddw  xmm6,  xmm1
+    psadbw xmm3,  xmm0
+    paddw  xmm7,  xmm3
 
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
+    movq   xmm3,  [r2]
+    movhps xmm3,  [r2+r3]
+    psadbw xmm0,  xmm3
+    paddw  xmm5,  xmm0
 
-	movq   xmm0,  [r0]
-	movhps xmm0,  [r0+r1]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
+    movq   xmm0,  [r0]
+    movhps xmm0,  [r0+r1]
+    psadbw xmm3,  xmm0
+    paddw  xmm4,  xmm3
 
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
+    movq   xmm1,  [r2+r3-1]
+    movq   xmm3,  [r2+r3+1]
 
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
+    lea    r0,    [r0+2*r1]
+    lea    r2,    [r2+2*r3]
+    movhps xmm1,  [r2-1]
+    movhps xmm3,  [r2+1]
 
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
+    psadbw xmm1,  xmm0
+    paddw  xmm6,  xmm1
+    psadbw xmm3,  xmm0
+    paddw  xmm7,  xmm3
 
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
+    movq   xmm3,  [r2]
+    movhps xmm3,  [r2+r3]
+    psadbw xmm0,  xmm3
+    paddw  xmm5,  xmm0
 
-	movhlps    xmm0, xmm4
-	paddw      xmm4, xmm0
-	movhlps    xmm0, xmm5
-	paddw      xmm5, xmm0
-	movhlps    xmm0, xmm6
-	paddw      xmm6, xmm0
-	movhlps    xmm0, xmm7
-	paddw      xmm7, xmm0
-	punpckldq  xmm4, xmm5
-	punpckldq  xmm6, xmm7
-	punpcklqdq xmm4, xmm6
-	movdqa     [r4],xmm4
-	POP_XMM
-	LOAD_5_PARA_POP
-	ret
+    movhlps    xmm0, xmm4
+    paddw      xmm4, xmm0
+    movhlps    xmm0, xmm5
+    paddw      xmm5, xmm0
+    movhlps    xmm0, xmm6
+    paddw      xmm6, xmm0
+    movhlps    xmm0, xmm7
+    paddw      xmm7, xmm0
+    punpckldq  xmm4, xmm5
+    punpckldq  xmm6, xmm7
+    punpcklqdq xmm4, xmm6
+    movdqa     [r4],xmm4
+    POP_XMM
+    LOAD_5_PARA_POP
+    ret
 
 
 WELS_EXTERN WelsSampleSadFour8x8_sse2
-	%assign  push_num 0
-	LOAD_5_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
-	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
-	pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
-	pxor   xmm6,   xmm6    ;sad pRefMb-1
-	pxor   xmm7,   xmm7    ;sad pRefMb+1
-	movq   xmm0,   [r0]
-	movhps xmm0,   [r0+r1]
-	sub    r2,    r3
-	movq   xmm3,   [r2]
-	movhps xmm3,   [r2+r3]
-	psadbw xmm3,   xmm0
-	paddw  xmm4,   xmm3
+    %assign  push_num 0
+    LOAD_5_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
+    pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
+    pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
+    pxor   xmm6,   xmm6    ;sad pRefMb-1
+    pxor   xmm7,   xmm7    ;sad pRefMb+1
+    movq   xmm0,   [r0]
+    movhps xmm0,   [r0+r1]
+    sub    r2,    r3
+    movq   xmm3,   [r2]
+    movhps xmm3,   [r2+r3]
+    psadbw xmm3,   xmm0
+    paddw  xmm4,   xmm3
 
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
+    movq   xmm1,  [r2+r3-1]
+    movq   xmm3,  [r2+r3+1]
 
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
+    lea    r0,    [r0+2*r1]
+    lea    r2,    [r2+2*r3]
+    movhps xmm1,  [r2-1]
+    movhps xmm3,  [r2+1]
+    psadbw xmm1,  xmm0
+    paddw  xmm6,  xmm1
+    psadbw xmm3,  xmm0
+    paddw  xmm7,  xmm3
 
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
+    movq   xmm3,  [r2]
+    movhps xmm3,  [r2+r3]
+    psadbw xmm0,  xmm3
+    paddw  xmm5,  xmm0
 
-	movq   xmm0,  [r0]
-	movhps xmm0,  [r0+r1]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
+    movq   xmm0,  [r0]
+    movhps xmm0,  [r0+r1]
+    psadbw xmm3,  xmm0
+    paddw  xmm4,  xmm3
 
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
+    movq   xmm1,  [r2+r3-1]
+    movq   xmm3,  [r2+r3+1]
 
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
+    lea    r0,    [r0+2*r1]
+    lea    r2,    [r2+2*r3]
+    movhps xmm1,  [r2-1]
+    movhps xmm3,  [r2+1]
 
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
+    psadbw xmm1,  xmm0
+    paddw  xmm6,  xmm1
+    psadbw xmm3,  xmm0
+    paddw  xmm7,  xmm3
 
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
+    movq   xmm3,  [r2]
+    movhps xmm3,  [r2+r3]
+    psadbw xmm0,  xmm3
+    paddw  xmm5,  xmm0
 
-	movq   xmm0,  [r0]
-	movhps xmm0,  [r0+r1]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
+    movq   xmm0,  [r0]
+    movhps xmm0,  [r0+r1]
+    psadbw xmm3,  xmm0
+    paddw  xmm4,  xmm3
 
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
+    movq   xmm1,  [r2+r3-1]
+    movq   xmm3,  [r2+r3+1]
 
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
+    lea    r0,    [r0+2*r1]
+    lea    r2,    [r2+2*r3]
+    movhps xmm1,  [r2-1]
+    movhps xmm3,  [r2+1]
 
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
+    psadbw xmm1,  xmm0
+    paddw  xmm6,  xmm1
+    psadbw xmm3,  xmm0
+    paddw  xmm7,  xmm3
 
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
+    movq   xmm3,  [r2]
+    movhps xmm3,  [r2+r3]
+    psadbw xmm0,  xmm3
+    paddw  xmm5,  xmm0
 
-	movq   xmm0,  [r0]
-	movhps xmm0,  [r0+r1]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
+    movq   xmm0,  [r0]
+    movhps xmm0,  [r0+r1]
+    psadbw xmm3,  xmm0
+    paddw  xmm4,  xmm3
 
 
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
+    movq   xmm1,  [r2+r3-1]
+    movq   xmm3,  [r2+r3+1]
 
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
+    lea    r0,    [r0+2*r1]
+    lea    r2,    [r2+2*r3]
+    movhps xmm1,  [r2-1]
+    movhps xmm3,  [r2+1]
 
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
+    psadbw xmm1,  xmm0
+    paddw  xmm6,  xmm1
+    psadbw xmm3,  xmm0
+    paddw  xmm7,  xmm3
 
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
+    movq   xmm3,  [r2]
+    movhps xmm3,  [r2+r3]
+    psadbw xmm0,  xmm3
+    paddw  xmm5,  xmm0
 
-	movhlps    xmm0, xmm4
-	paddw      xmm4, xmm0
-	movhlps    xmm0, xmm5
-	paddw      xmm5, xmm0
-	movhlps    xmm0, xmm6
-	paddw      xmm6, xmm0
-	movhlps    xmm0, xmm7
-	paddw      xmm7, xmm0
-	punpckldq  xmm4, xmm5
-	punpckldq  xmm6, xmm7
-	punpcklqdq xmm4, xmm6
-	movdqa     [r4],xmm4
-	POP_XMM
-	LOAD_5_PARA_POP
-	ret
+    movhlps    xmm0, xmm4
+    paddw      xmm4, xmm0
+    movhlps    xmm0, xmm5
+    paddw      xmm5, xmm0
+    movhlps    xmm0, xmm6
+    paddw      xmm6, xmm0
+    movhlps    xmm0, xmm7
+    paddw      xmm7, xmm0
+    punpckldq  xmm4, xmm5
+    punpckldq  xmm6, xmm7
+    punpcklqdq xmm4, xmm6
+    movdqa     [r4],xmm4
+    POP_XMM
+    LOAD_5_PARA_POP
+    ret
 
 WELS_EXTERN WelsSampleSadFour4x4_sse2
-	%assign  push_num 0
-	LOAD_5_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
-	movd   xmm0,   [r0]
-	movd   xmm1,   [r0+r1]
-	lea        r0,    [r0+2*r1]
-	movd       xmm2,   [r0]
-	movd       xmm3,   [r0+r1]
-	punpckldq  xmm0, xmm1
-	punpckldq  xmm2, xmm3
-	punpcklqdq xmm0, xmm2
-	sub        r2,  r3
-	movd       xmm1, [r2]
-	movd       xmm2, [r2+r3]
-	punpckldq  xmm1, xmm2
-	movd       xmm2, [r2+r3-1]
-	movd       xmm3, [r2+r3+1]
+    %assign  push_num 0
+    LOAD_5_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
+    movd   xmm0,   [r0]
+    movd   xmm1,   [r0+r1]
+    lea        r0,    [r0+2*r1]
+    movd       xmm2,   [r0]
+    movd       xmm3,   [r0+r1]
+    punpckldq  xmm0, xmm1
+    punpckldq  xmm2, xmm3
+    punpcklqdq xmm0, xmm2
+    sub        r2,  r3
+    movd       xmm1, [r2]
+    movd       xmm2, [r2+r3]
+    punpckldq  xmm1, xmm2
+    movd       xmm2, [r2+r3-1]
+    movd       xmm3, [r2+r3+1]
 
-	lea        r2,  [r2+2*r3]
+    lea        r2,  [r2+2*r3]
 
-	movd       xmm4, [r2]
-	movd       xmm5, [r2-1]
-	punpckldq  xmm2, xmm5
-	movd       xmm5, [r2+1]
-	punpckldq  xmm3, xmm5
+    movd       xmm4, [r2]
+    movd       xmm5, [r2-1]
+    punpckldq  xmm2, xmm5
+    movd       xmm5, [r2+1]
+    punpckldq  xmm3, xmm5
 
-	movd       xmm5, [r2+r3]
-	punpckldq  xmm4, xmm5
+    movd       xmm5, [r2+r3]
+    punpckldq  xmm4, xmm5
 
-	punpcklqdq xmm1, xmm4 ;-L
+    punpcklqdq xmm1, xmm4 ;-L
 
-	movd       xmm5, [r2+r3-1]
-	movd       xmm6, [r2+r3+1]
+    movd       xmm5, [r2+r3-1]
+    movd       xmm6, [r2+r3+1]
 
-	lea        r2,  [r2+2*r3]
-	movd       xmm7, [r2-1]
-	punpckldq  xmm5, xmm7
-	punpcklqdq xmm2, xmm5 ;-1
-	movd       xmm7, [r2+1]
-	punpckldq  xmm6, xmm7
-	punpcklqdq xmm3, xmm6 ;+1
-	movd       xmm6, [r2]
-	movd       xmm7, [r2+r3]
-	punpckldq  xmm6, xmm7
-	punpcklqdq xmm4, xmm6 ;+L
-	psadbw     xmm1, xmm0
-	psadbw     xmm2, xmm0
-	psadbw     xmm3, xmm0
-	psadbw     xmm4, xmm0
+    lea        r2,  [r2+2*r3]
+    movd       xmm7, [r2-1]
+    punpckldq  xmm5, xmm7
+    punpcklqdq xmm2, xmm5 ;-1
+    movd       xmm7, [r2+1]
+    punpckldq  xmm6, xmm7
+    punpcklqdq xmm3, xmm6 ;+1
+    movd       xmm6, [r2]
+    movd       xmm7, [r2+r3]
+    punpckldq  xmm6, xmm7
+    punpcklqdq xmm4, xmm6 ;+L
+    psadbw     xmm1, xmm0
+    psadbw     xmm2, xmm0
+    psadbw     xmm3, xmm0
+    psadbw     xmm4, xmm0
 
-	movhlps    xmm0, xmm1
-	paddw      xmm1, xmm0
-	movhlps    xmm0, xmm2
-	paddw      xmm2, xmm0
-	movhlps    xmm0, xmm3
-	paddw      xmm3, xmm0
-	movhlps    xmm0, xmm4
-	paddw      xmm4, xmm0
-	punpckldq  xmm1, xmm4
-	punpckldq  xmm2, xmm3
-	punpcklqdq xmm1, xmm2
-	movdqa     [r4],xmm1
-	POP_XMM
-	LOAD_5_PARA_POP
-	ret
+    movhlps    xmm0, xmm1
+    paddw      xmm1, xmm0
+    movhlps    xmm0, xmm2
+    paddw      xmm2, xmm0
+    movhlps    xmm0, xmm3
+    paddw      xmm3, xmm0
+    movhlps    xmm0, xmm4
+    paddw      xmm4, xmm0
+    punpckldq  xmm1, xmm4
+    punpckldq  xmm2, xmm3
+    punpcklqdq xmm1, xmm2
+    movdqa     [r4],xmm1
+    POP_XMM
+    LOAD_5_PARA_POP
+    ret
 
 ;***********************************************************************
 ;
@@ -2444,33 +2444,33 @@
 ;***********************************************************************
 WELS_EXTERN WelsSampleSad4x4_mmx
     %assign  push_num 0
-	LOAD_4_PARA
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
-	movd	  mm0, [r0]
-	movd	  mm1, [r0+r1]
-	punpckldq mm0, mm1
+    LOAD_4_PARA
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
+    movd      mm0, [r0]
+    movd      mm1, [r0+r1]
+    punpckldq mm0, mm1
 
-	movd      mm3, [r2]
-	movd      mm4, [r2+r3]
-	punpckldq mm3, mm4
-	psadbw    mm0, mm3
+    movd      mm3, [r2]
+    movd      mm4, [r2+r3]
+    punpckldq mm3, mm4
+    psadbw    mm0, mm3
 
-	lea       r0, [r0+2*r1]
-	lea       r2, [r2+2*r3]
+    lea       r0, [r0+2*r1]
+    lea       r2, [r2+2*r3]
 
-	movd      mm1, [r0]
-	movd      mm2, [r0+r1]
-	punpckldq mm1, mm2
+    movd      mm1, [r0]
+    movd      mm2, [r0+r1]
+    punpckldq mm1, mm2
 
-	movd      mm3, [r2]
-	movd      mm4, [r2+r3]
-	punpckldq mm3, mm4
-	psadbw    mm1, mm3
-	paddw     mm0, mm1
+    movd      mm3, [r2]
+    movd      mm4, [r2+r3]
+    punpckldq mm3, mm4
+    psadbw    mm1, mm3
+    paddw     mm0, mm1
 
     movd      retrd, mm0
 
-	WELSEMMS
+    WELSEMMS
     LOAD_4_PARA_POP
     ret
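
The WelsSampleSad* and WelsSampleSadFour* kernels above are whitespace-only reformats of the SSE2/MMX SAD (sum of absolute differences) routines; the SadFour variants additionally score the four reference positions offset by one line or one pixel. As an illustrative reference only (this sketch is not part of the patch, and SampleSadC is a hypothetical name, not a symbol in the tree), the scalar computation that the psadbw-based code vectorizes is:

    #include <stdint.h>
    #include <stdlib.h>

    /* Scalar equivalent of a kWidth x kHeight SAD between a source block
     * and a reference block, each addressed with its own stride.         */
    static int32_t SampleSadC (const uint8_t* pSrc, int32_t iSrcStride,
                               const uint8_t* pRef, int32_t iRefStride,
                               int32_t kWidth, int32_t kHeight) {
      int32_t iSad = 0;
      for (int32_t y = 0; y < kHeight; ++y) {
        for (int32_t x = 0; x < kWidth; ++x)
          iSad += abs ((int32_t) pSrc[x] - (int32_t) pRef[x]);
        pSrc += iSrcStride;
        pRef += iRefStride;
      }
      return iSad;
    }
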
--- a/codec/common/x86/vaa.asm
+++ b/codec/common/x86/vaa.asm
@@ -29,16 +29,16 @@
 ;*     POSSIBILITY OF SUCH DAMAGE.
 ;*
 ;*
-;*	vaa.asm
+;*  vaa.asm
 ;*
-;*	Abstract
+;*  Abstract
 ;*      sse2 for pVaa routines
 ;*
 ;*  History
-;*      04/14/2010	Created
-;*		06/07/2010	Added AnalysisVaaInfoIntra_sse2(ssse3)
-;*		06/10/2010	Tune rc_sad_frame_sse2 and got about 40% improvement
-;*		08/11/2010	Added abs_difference_mbrow_sse2 & sum_sqrsum_mbrow_sse2
+;*      04/14/2010  Created
+;*      06/07/2010  Added AnalysisVaaInfoIntra_sse2(ssse3)
+;*      06/10/2010  Tune rc_sad_frame_sse2 and got about 40% improvement
+;*      08/11/2010  Added abs_difference_mbrow_sse2 & sum_sqrsum_mbrow_sse2
 ;*
 ;*************************************************************************/
 %include "asm_inc.asm"
@@ -49,87 +49,87 @@
 ;***********************************************************************
 
 ; by comparing it outperforms than phaddw(SSSE3) sets
-%macro SUM_WORD_8x2_SSE2	2	; dst(pSrc), tmp
-	; @sum_8x2 begin
-	pshufd %2, %1, 04Eh	; 01001110 B
-	paddw %1, %2
-	pshuflw %2, %1, 04Eh	; 01001110 B
-	paddw %1, %2
-	pshuflw %2, %1, 0B1h	; 10110001 B
-	paddw %1, %2
-	; end of @sum_8x2
-%endmacro	; END of SUM_WORD_8x2_SSE2
+%macro SUM_WORD_8x2_SSE2    2   ; dst(pSrc), tmp
+    ; @sum_8x2 begin
+    pshufd %2, %1, 04Eh ; 01001110 B
+    paddw %1, %2
+    pshuflw %2, %1, 04Eh    ; 01001110 B
+    paddw %1, %2
+    pshuflw %2, %1, 0B1h    ; 10110001 B
+    paddw %1, %2
+    ; end of @sum_8x2
+%endmacro   ; END of SUM_WORD_8x2_SSE2
 
 
 %macro VAA_AVG_BLOCK_SSE2 6 ; dst, t0, t1, t2, t3, t4
-	movdqa %1, [r0    ]	; line 0
-	movdqa %2, [r0+r1]	; line 1
-	movdqa %3, %1
-	punpcklbw %1, xmm7
-	punpckhbw %3, xmm7
-	movdqa %4, %2
-	punpcklbw %4, xmm7
-	punpckhbw %2, xmm7
-	paddw %1, %4
-	paddw %2, %3
-	movdqa %3, [r0+r2]	; line 2
-	movdqa %4, [r0+r3]	; line 3
-	movdqa %5, %3
-	punpcklbw %3, xmm7
-	punpckhbw %5, xmm7
-	movdqa %6, %4
-	punpcklbw %6, xmm7
-	punpckhbw %4, xmm7
-	paddw %3, %6
-	paddw %4, %5
-	paddw %1, %3	; block 0, 1
-	paddw %2, %4	; block 2, 3
-	pshufd %3, %1, 0B1h
-	pshufd %4, %2, 0B1h
-	paddw %1, %3
-	paddw %2, %4
-	movdqa %3, %1
-	movdqa %4, %2
-	pshuflw %5, %1, 0B1h
-	pshufhw %6, %3, 0B1h
-	paddw %1, %5
-	paddw %3, %6
-	pshuflw %5, %2, 0B1h
-	pshufhw %6, %4, 0B1h
-	paddw %2, %5
-	paddw %4, %6
-	punpcklwd %1, %2
-	punpckhwd %3, %4
-	punpcklwd %1, %3
-	psraw %1, $04
+    movdqa %1, [r0    ] ; line 0
+    movdqa %2, [r0+r1]  ; line 1
+    movdqa %3, %1
+    punpcklbw %1, xmm7
+    punpckhbw %3, xmm7
+    movdqa %4, %2
+    punpcklbw %4, xmm7
+    punpckhbw %2, xmm7
+    paddw %1, %4
+    paddw %2, %3
+    movdqa %3, [r0+r2]  ; line 2
+    movdqa %4, [r0+r3]  ; line 3
+    movdqa %5, %3
+    punpcklbw %3, xmm7
+    punpckhbw %5, xmm7
+    movdqa %6, %4
+    punpcklbw %6, xmm7
+    punpckhbw %4, xmm7
+    paddw %3, %6
+    paddw %4, %5
+    paddw %1, %3    ; block 0, 1
+    paddw %2, %4    ; block 2, 3
+    pshufd %3, %1, 0B1h
+    pshufd %4, %2, 0B1h
+    paddw %1, %3
+    paddw %2, %4
+    movdqa %3, %1
+    movdqa %4, %2
+    pshuflw %5, %1, 0B1h
+    pshufhw %6, %3, 0B1h
+    paddw %1, %5
+    paddw %3, %6
+    pshuflw %5, %2, 0B1h
+    pshufhw %6, %4, 0B1h
+    paddw %2, %5
+    paddw %4, %6
+    punpcklwd %1, %2
+    punpckhwd %3, %4
+    punpcklwd %1, %3
+    psraw %1, $04
 %endmacro
 
 %macro VAA_AVG_BLOCK_SSSE3 6 ; dst, t0, t1, t2, t3, t4
-	movdqa %1, [r0    ]	; line 0
-	movdqa %2, [r0+r1]	; line 1
-	movdqa %3, %1
-	punpcklbw %1, xmm7
-	punpckhbw %3, xmm7
-	movdqa %4, %2
-	punpcklbw %4, xmm7
-	punpckhbw %2, xmm7
-	paddw %1, %4
-	paddw %2, %3
-	movdqa %3, [r0+r2]	; line 2
-	movdqa %4, [r0+r3]	; line 3
-	movdqa %5, %3
-	punpcklbw %3, xmm7
-	punpckhbw %5, xmm7
-	movdqa %6, %4
-	punpcklbw %6, xmm7
-	punpckhbw %4, xmm7
-	paddw %3, %6
-	paddw %4, %5
-	paddw %1, %3	; block 0, 1
-	paddw %2, %4	; block 2, 3
-	phaddw %1, %2	; block[0]: 0-15, 16-31; block[1]: 32-47, 48-63; ..
-	phaddw %1, xmm7	; block[0]: 0-15; block[1]: 16-31; block[2]: 32-47; block[3]: 48-63; ....
-	psraw %1, $04
+    movdqa %1, [r0    ] ; line 0
+    movdqa %2, [r0+r1]  ; line 1
+    movdqa %3, %1
+    punpcklbw %1, xmm7
+    punpckhbw %3, xmm7
+    movdqa %4, %2
+    punpcklbw %4, xmm7
+    punpckhbw %2, xmm7
+    paddw %1, %4
+    paddw %2, %3
+    movdqa %3, [r0+r2]  ; line 2
+    movdqa %4, [r0+r3]  ; line 3
+    movdqa %5, %3
+    punpcklbw %3, xmm7
+    punpckhbw %5, xmm7
+    movdqa %6, %4
+    punpcklbw %6, xmm7
+    punpckhbw %4, xmm7
+    paddw %3, %6
+    paddw %4, %5
+    paddw %1, %3    ; block 0, 1
+    paddw %2, %4    ; block 2, 3
+    phaddw %1, %2   ; block[0]: 0-15, 16-31; block[1]: 32-47, 48-63; ..
+    phaddw %1, xmm7 ; block[0]: 0-15; block[1]: 16-31; block[2]: 32-47; block[3]: 48-63; ....
+    psraw %1, $04
 %endmacro
 
 
@@ -143,7 +143,7 @@
 ; , 6/7/2010
 
 ;***********************************************************************
-;	int32_t AnalysisVaaInfoIntra_sse2(	uint8_t *pDataY, const int32_t iLineSize );
+;   int32_t AnalysisVaaInfoIntra_sse2(  uint8_t *pDataY, const int32_t iLineSize );
 ;***********************************************************************
 WELS_EXTERN AnalysisVaaInfoIntra_sse2
 
@@ -174,71 +174,71 @@
     mov r4,r2
     sal r4,$01   ;r4 = 4*iLineSize
 
-	pxor xmm7, xmm7
+    pxor xmm7, xmm7
 
-	; loops
-	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [r7], xmm0
+    ; loops
+    VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+    movq [r7], xmm0
 
-	lea r0, [r0+r4]
-	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [r7+8], xmm0
+    lea r0, [r0+r4]
+    VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+    movq [r7+8], xmm0
 
-	lea r0, [r0+r4]
-	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [r7+16], xmm0
+    lea r0, [r0+r4]
+    VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+    movq [r7+16], xmm0
 
-	lea r0, [r0+r4]
-	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [r7+24], xmm0
+    lea r0, [r0+r4]
+    VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+    movq [r7+24], xmm0
 
-	movdqa xmm0, [r7]		; block 0~7
-	movdqa xmm1, [r7+16]	; block 8~15
-	movdqa xmm2, xmm0
-	paddw xmm0, xmm1
-	SUM_WORD_8x2_SSE2 xmm0, xmm3
+    movdqa xmm0, [r7]       ; block 0~7
+    movdqa xmm1, [r7+16]    ; block 8~15
+    movdqa xmm2, xmm0
+    paddw xmm0, xmm1
+    SUM_WORD_8x2_SSE2 xmm0, xmm3
 
-	pmullw xmm1, xmm1
-	pmullw xmm2, xmm2
-	movdqa xmm3, xmm1
-	movdqa xmm4, xmm2
-	punpcklwd xmm1, xmm7
-	punpckhwd xmm3, xmm7
-	punpcklwd xmm2, xmm7
-	punpckhwd xmm4, xmm7
-	paddd xmm1, xmm2
-	paddd xmm3, xmm4
-	paddd xmm1, xmm3
-	pshufd xmm2, xmm1, 01Bh
-	paddd xmm1, xmm2
-	pshufd xmm2, xmm1, 0B1h
-	paddd xmm1, xmm2
+    pmullw xmm1, xmm1
+    pmullw xmm2, xmm2
+    movdqa xmm3, xmm1
+    movdqa xmm4, xmm2
+    punpcklwd xmm1, xmm7
+    punpckhwd xmm3, xmm7
+    punpcklwd xmm2, xmm7
+    punpckhwd xmm4, xmm7
+    paddd xmm1, xmm2
+    paddd xmm3, xmm4
+    paddd xmm1, xmm3
+    pshufd xmm2, xmm1, 01Bh
+    paddd xmm1, xmm2
+    pshufd xmm2, xmm1, 0B1h
+    paddd xmm1, xmm2
 
 
 
-	movd r2d, xmm0
-	and r2, 0ffffh		; effective low work truncated
-	mov r3, r2
-	imul r2, r3
-	sar r2, $04
-	movd retrd, xmm1
-	sub retrd, r2d
+    movd r2d, xmm0
+    and r2, 0ffffh      ; effective low work truncated
+    mov r3, r2
+    imul r2, r3
+    sar r2, $04
+    movd retrd, xmm1
+    sub retrd, r2d
 
-	add r7,32
-	add r7,r5
+    add r7,32
+    add r7,r5
 
 %ifdef X86_32
-	pop r6
-	pop r5
-	pop r4
-	pop r3
+    pop r6
+    pop r5
+    pop r4
+    pop r3
 %endif
-	POP_XMM
+    POP_XMM
 
-	ret
+    ret
 
 ;***********************************************************************
-;	int32_t AnalysisVaaInfoIntra_ssse3(	uint8_t *pDataY, const int32_t iLineSize );
+;   int32_t AnalysisVaaInfoIntra_ssse3( uint8_t *pDataY, const int32_t iLineSize );
 ;***********************************************************************
 WELS_EXTERN AnalysisVaaInfoIntra_ssse3
 
@@ -269,47 +269,47 @@
     mov r4,r2
     sal r4,$01   ;r4 = 4*iLineSize
 
-	pxor xmm7, xmm7
+    pxor xmm7, xmm7
 
-	; loops
-	VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+    ; loops
+    VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
     movq [r7],xmm0
 
-	lea r0,[r0+r4]
-	VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
+    lea r0,[r0+r4]
+    VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
     movq [r7+8],xmm1
 
 
-	lea r0,[r0+r4]
-	VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+    lea r0,[r0+r4]
+    VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
     movq [r7+16],xmm0
 
-	lea r0,[r0+r4]
-	VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
+    lea r0,[r0+r4]
+    VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
     movq [r7+24],xmm1
 
 
-	movdqa xmm0,[r7]
-	movdqa xmm1,[r7+16]
-	movdqa xmm2, xmm0
-	paddw xmm0, xmm1
-	SUM_WORD_8x2_SSE2 xmm0, xmm3	; better performance than that of phaddw sets
+    movdqa xmm0,[r7]
+    movdqa xmm1,[r7+16]
+    movdqa xmm2, xmm0
+    paddw xmm0, xmm1
+    SUM_WORD_8x2_SSE2 xmm0, xmm3    ; better performance than that of phaddw sets
 
-	pmullw xmm1, xmm1
-	pmullw xmm2, xmm2
-	movdqa xmm3, xmm1
-	movdqa xmm4, xmm2
-	punpcklwd xmm1, xmm7
-	punpckhwd xmm3, xmm7
-	punpcklwd xmm2, xmm7
-	punpckhwd xmm4, xmm7
-	paddd xmm1, xmm2
-	paddd xmm3, xmm4
-	paddd xmm1, xmm3
-	pshufd xmm2, xmm1, 01Bh
-	paddd xmm1, xmm2
-	pshufd xmm2, xmm1, 0B1h
-	paddd xmm1, xmm2
+    pmullw xmm1, xmm1
+    pmullw xmm2, xmm2
+    movdqa xmm3, xmm1
+    movdqa xmm4, xmm2
+    punpcklwd xmm1, xmm7
+    punpckhwd xmm3, xmm7
+    punpcklwd xmm2, xmm7
+    punpckhwd xmm4, xmm7
+    paddd xmm1, xmm2
+    paddd xmm3, xmm4
+    paddd xmm1, xmm3
+    pshufd xmm2, xmm1, 01Bh
+    paddd xmm1, xmm2
+    pshufd xmm2, xmm1, 0B1h
+    paddd xmm1, xmm2
 
 
     movd r2d, xmm0
@@ -318,94 +318,94 @@
     imul r2, r3
     sar r2, $04
     movd retrd, xmm1
-	sub retrd, r2d
+    sub retrd, r2d
 
-	add r7,32
-	add r7,r5
+    add r7,32
+    add r7,r5
 %ifdef X86_32
-	pop r6
-	pop r5
-	pop r4
-	pop r3
+    pop r6
+    pop r5
+    pop r4
+    pop r3
 %endif
-	POP_XMM
+    POP_XMM
 
-	ret
+    ret
 
 ;***********************************************************************
-;	uint8_t MdInterAnalysisVaaInfo_sse41( int32_t *pSad8x8 )
+;   uint8_t MdInterAnalysisVaaInfo_sse41( int32_t *pSad8x8 )
 ;***********************************************************************
 WELS_EXTERN MdInterAnalysisVaaInfo_sse41
-	%assign push_num 0
-	LOAD_1_PARA
-	movdqa xmm0,[r0]
-	pshufd xmm1, xmm0, 01Bh
-	paddd xmm1, xmm0
-	pshufd xmm2, xmm1, 0B1h
-	paddd xmm1, xmm2
-	psrad xmm1, 02h		; iAverageSad
-	movdqa xmm2, xmm1
-	psrad xmm2, 06h
-	movdqa xmm3, xmm0	; iSadBlock
-	psrad xmm3, 06h
-	psubd xmm3, xmm2
-	pmulld xmm3, xmm3	; [comment]: pmulld from SSE4.1 instruction sets
-	pshufd xmm4, xmm3, 01Bh
-	paddd xmm4, xmm3
-	pshufd xmm3, xmm4, 0B1h
-	paddd xmm3, xmm4
-	movd r0d, xmm3
-	cmp r0d, 20	; INTER_VARIANCE_SAD_THRESHOLD
+    %assign push_num 0
+    LOAD_1_PARA
+    movdqa xmm0,[r0]
+    pshufd xmm1, xmm0, 01Bh
+    paddd xmm1, xmm0
+    pshufd xmm2, xmm1, 0B1h
+    paddd xmm1, xmm2
+    psrad xmm1, 02h     ; iAverageSad
+    movdqa xmm2, xmm1
+    psrad xmm2, 06h
+    movdqa xmm3, xmm0   ; iSadBlock
+    psrad xmm3, 06h
+    psubd xmm3, xmm2
+    pmulld xmm3, xmm3   ; [comment]: pmulld from SSE4.1 instruction sets
+    pshufd xmm4, xmm3, 01Bh
+    paddd xmm4, xmm3
+    pshufd xmm3, xmm4, 0B1h
+    paddd xmm3, xmm4
+    movd r0d, xmm3
+    cmp r0d, 20 ; INTER_VARIANCE_SAD_THRESHOLD
 
-	jb near .threshold_exit
-	pshufd xmm0, xmm0, 01Bh
-	pcmpgtd xmm0, xmm1	; iSadBlock > iAverageSad
-	movmskps retrd, xmm0
-	ret
+    jb near .threshold_exit
+    pshufd xmm0, xmm0, 01Bh
+    pcmpgtd xmm0, xmm1  ; iSadBlock > iAverageSad
+    movmskps retrd, xmm0
+    ret
 .threshold_exit:
-	mov retrd, 15
-	ret
+    mov retrd, 15
+    ret
 
 ;***********************************************************************
-;	uint8_t MdInterAnalysisVaaInfo_sse2( int32_t *pSad8x8 )
+;   uint8_t MdInterAnalysisVaaInfo_sse2( int32_t *pSad8x8 )
 ;***********************************************************************
 WELS_EXTERN MdInterAnalysisVaaInfo_sse2
-	%assign push_num 0
-	LOAD_1_PARA
-	movdqa xmm0, [r0]
-	pshufd xmm1, xmm0, 01Bh
-	paddd xmm1, xmm0
-	pshufd xmm2, xmm1, 0B1h
-	paddd xmm1, xmm2
-	psrad xmm1, 02h		; iAverageSad
-	movdqa xmm2, xmm1
-	psrad xmm2, 06h
-	movdqa xmm3, xmm0	; iSadBlock
-	psrad xmm3, 06h
-	psubd xmm3, xmm2
+    %assign push_num 0
+    LOAD_1_PARA
+    movdqa xmm0, [r0]
+    pshufd xmm1, xmm0, 01Bh
+    paddd xmm1, xmm0
+    pshufd xmm2, xmm1, 0B1h
+    paddd xmm1, xmm2
+    psrad xmm1, 02h     ; iAverageSad
+    movdqa xmm2, xmm1
+    psrad xmm2, 06h
+    movdqa xmm3, xmm0   ; iSadBlock
+    psrad xmm3, 06h
+    psubd xmm3, xmm2
 
-	; to replace pmulld functionality as below
-	movdqa xmm2, xmm3
-	pmuludq xmm2, xmm3
-	pshufd xmm4, xmm3, 0B1h
-	pmuludq xmm4, xmm4
-	movdqa xmm5, xmm2
-	punpckldq xmm5, xmm4
-	punpckhdq xmm2, xmm4
-	punpcklqdq xmm5, xmm2
+    ; to replace pmulld functionality as below
+    movdqa xmm2, xmm3
+    pmuludq xmm2, xmm3
+    pshufd xmm4, xmm3, 0B1h
+    pmuludq xmm4, xmm4
+    movdqa xmm5, xmm2
+    punpckldq xmm5, xmm4
+    punpckhdq xmm2, xmm4
+    punpcklqdq xmm5, xmm2
 
-	pshufd xmm4, xmm5, 01Bh
-	paddd xmm4, xmm5
-	pshufd xmm5, xmm4, 0B1h
-	paddd xmm5, xmm4
+    pshufd xmm4, xmm5, 01Bh
+    paddd xmm4, xmm5
+    pshufd xmm5, xmm4, 0B1h
+    paddd xmm5, xmm4
 
-	movd r0d, xmm5
-	cmp r0d, 20	; INTER_VARIANCE_SAD_THRESHOLD
-	jb near .threshold_exit
-	pshufd xmm0, xmm0, 01Bh
-	pcmpgtd xmm0, xmm1	; iSadBlock > iAverageSad
-	movmskps retrd, xmm0
-	ret
+    movd r0d, xmm5
+    cmp r0d, 20 ; INTER_VARIANCE_SAD_THRESHOLD
+    jb near .threshold_exit
+    pshufd xmm0, xmm0, 01Bh
+    pcmpgtd xmm0, xmm1  ; iSadBlock > iAverageSad
+    movmskps retrd, xmm0
+    ret
 .threshold_exit:
-	mov retrd, 15
-	ret
+    mov retrd, 15
+    ret
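
The SSE4.1 and SSE2 variants above compute the same quantity; roughly, in scalar C (a sketch under the assumption that the names match the assembly comments — iAverageSad, the per-block SAD deviation, INTER_VARIANCE_SAD_THRESHOLD = 20; the MdInterAnalysisVaaInfo_c name is invented here and the bit order of the returned mask is simplified relative to the shuffled SIMD lanes):

#include <stdint.h>

static uint8_t MdInterAnalysisVaaInfo_c (const int32_t* pSad8x8) {
    int32_t iAverageSad = (pSad8x8[0] + pSad8x8[1] + pSad8x8[2] + pSad8x8[3]) >> 2;
    int32_t iVariance   = 0;
    for (int i = 0; i < 4; ++i) {
        int32_t iDiff = (pSad8x8[i] >> 6) - (iAverageSad >> 6);
        iVariance += iDiff * iDiff;                 /* pmulld / pmuludq path  */
    }
    if (iVariance < 20)                             /* INTER_VARIANCE_SAD_THRESHOLD */
        return 15;                                  /* .threshold_exit: all blocks flagged */
    uint8_t uiMask = 0;
    for (int i = 0; i < 4; ++i)                     /* one bit per 8x8 block whose SAD */
        if (pSad8x8[i] > iAverageSad)               /* exceeds the average (pcmpgtd + movmskps) */
            uiMask |= (uint8_t) (1 << i);
    return uiMask;
}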
--- a/codec/decoder/core/arm/block_add_neon.S
+++ b/codec/decoder/core/arm/block_add_neon.S
@@ -35,68 +35,68 @@
 #include "arm_arch_common_macro.S"
 #ifdef __APPLE__
 
-.macro	ROW_TRANSFORM_1_STEP
-//	{	//	input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
-    vaddl.s16		$4, $0, $2			//int32 e[i][0] = src[0] + src[2];
-    vsubl.s16		$5, $0, $2			//int32 e[i][1] = src[0] - src[2];
-    vshr.s16		$8, $1, #1
-    vshr.s16		$9, $3, #1
-    vsubl.s16		$6, $8, $3			//int32 e[i][2] = (src[1]>>1)-src[3];
-    vaddl.s16		$7, $1, $9			//int32 e[i][3] = src[1] + (src[3]>>1);
-//	}
+.macro ROW_TRANSFORM_1_STEP
+//  {   //  input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
+    vaddl.s16       $4, $0, $2          //int32 e[i][0] = src[0] + src[2];
+    vsubl.s16       $5, $0, $2          //int32 e[i][1] = src[0] - src[2];
+    vshr.s16        $8, $1, #1
+    vshr.s16        $9, $3, #1
+    vsubl.s16       $6, $8, $3          //int32 e[i][2] = (src[1]>>1)-src[3];
+    vaddl.s16       $7, $1, $9          //int32 e[i][3] = src[1] + (src[3]>>1);
+//  }
 .endm
 
-.macro	TRANSFORM_4BYTES	// both row & col transform used
-//	{	//	output: f_q[0]~[3], input: e_q[0]~[3];
-    vadd.s32		$0, $4, $7			//int16 f[i][0] = e[i][0] + e[i][3];
-    vadd.s32		$1, $5, $6			//int16 f[i][1] = e[i][1] + e[i][2];
-    vsub.s32		$2, $5, $6			//int16 f[i][2] = e[i][1] - e[i][2];
-    vsub.s32		$3, $4, $7			//int16 f[i][3] = e[i][0] - e[i][3];
-//	}
+.macro TRANSFORM_4BYTES // both row & col transform used
+//  {   //  output: f_q[0]~[3], input: e_q[0]~[3];
+    vadd.s32        $0, $4, $7          //int16 f[i][0] = e[i][0] + e[i][3];
+    vadd.s32        $1, $5, $6          //int16 f[i][1] = e[i][1] + e[i][2];
+    vsub.s32        $2, $5, $6          //int16 f[i][2] = e[i][1] - e[i][2];
+    vsub.s32        $3, $4, $7          //int16 f[i][3] = e[i][0] - e[i][3];
+//  }
 .endm
 
-.macro	COL_TRANSFORM_1_STEP
-//	{	//	input: src_q[0]~[3], output: e_q[0]~[3];
-    vadd.s32		$4, $0, $2			//int32 e[0][j] = f[0][j] + f[2][j];
-    vsub.s32		$5, $0, $2			//int32 e[1][j] = f[0][j] - f[2][j];
-    vshr.s32		$6, $1, #1
-    vshr.s32		$7, $3, #1
-    vsub.s32		$6, $6, $3			//int32 e[2][j] = (f[1][j]>>1) - f[3][j];
-    vadd.s32		$7, $1, $7			//int32 e[3][j] = f[1][j] + (f[3][j]>>1);
-//	}
+.macro COL_TRANSFORM_1_STEP
+//  {   //  input: src_q[0]~[3], output: e_q[0]~[3];
+    vadd.s32        $4, $0, $2          //int32 e[0][j] = f[0][j] + f[2][j];
+    vsub.s32        $5, $0, $2          //int32 e[1][j] = f[0][j] - f[2][j];
+    vshr.s32        $6, $1, #1
+    vshr.s32        $7, $3, #1
+    vsub.s32        $6, $6, $3          //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
+    vadd.s32        $7, $1, $7          //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
+//  }
 .endm
 
 #else
 
-.macro	ROW_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
-//	{	//	input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
-    vaddl.s16		\arg4, \arg0, \arg2			//int32 e[i][0] = src[0] + src[2];
-    vsubl.s16		\arg5, \arg0, \arg2			//int32 e[i][1] = src[0] - src[2];
-    vshr.s16		\arg8, \arg1, #1
-    vshr.s16		\arg9, \arg3, #1
-    vsubl.s16		\arg6, \arg8, \arg3			//int32 e[i][2] = (src[1]>>1)-src[3];
-    vaddl.s16		\arg7, \arg1, \arg9			//int32 e[i][3] = src[1] + (src[3]>>1);
-//	}
+.macro ROW_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
+//  {   //  input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
+    vaddl.s16       \arg4, \arg0, \arg2         //int32 e[i][0] = src[0] + src[2];
+    vsubl.s16       \arg5, \arg0, \arg2         //int32 e[i][1] = src[0] - src[2];
+    vshr.s16        \arg8, \arg1, #1
+    vshr.s16        \arg9, \arg3, #1
+    vsubl.s16       \arg6, \arg8, \arg3         //int32 e[i][2] = (src[1]>>1)-src[3];
+    vaddl.s16       \arg7, \arg1, \arg9         //int32 e[i][3] = src[1] + (src[3]>>1);
+//  }
 .endm
 
-.macro	TRANSFORM_4BYTES  arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 // both row & col transform used
-//	{	//	output: f_q[0]~[3], input: e_q[0]~[3];
-    vadd.s32		\arg0, \arg4, \arg7			//int16 f[i][0] = e[i][0] + e[i][3];
-    vadd.s32		\arg1, \arg5, \arg6			//int16 f[i][1] = e[i][1] + e[i][2];
-    vsub.s32		\arg2, \arg5, \arg6			//int16 f[i][2] = e[i][1] - e[i][2];
-    vsub.s32		\arg3, \arg4, \arg7			//int16 f[i][3] = e[i][0] - e[i][3];
-//	}
+.macro TRANSFORM_4BYTES  arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 // both row & col transform used
+//  {   //  output: f_q[0]~[3], input: e_q[0]~[3];
+    vadd.s32        \arg0, \arg4, \arg7         //int16 f[i][0] = e[i][0] + e[i][3];
+    vadd.s32        \arg1, \arg5, \arg6         //int16 f[i][1] = e[i][1] + e[i][2];
+    vsub.s32        \arg2, \arg5, \arg6         //int16 f[i][2] = e[i][1] - e[i][2];
+    vsub.s32        \arg3, \arg4, \arg7         //int16 f[i][3] = e[i][0] - e[i][3];
+//  }
 .endm
 
-.macro	COL_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
-//	{	//	input: src_q[0]~[3], output: e_q[0]~[3];
-    vadd.s32		\arg4, \arg0, \arg2			//int32 e[0][j] = f[0][j] + f[2][j];
-    vsub.s32		\arg5, \arg0, \arg2			//int32 e[1][j] = f[0][j] - f[2][j];
-    vshr.s32		\arg6, \arg1, #1
-    vshr.s32		\arg7, \arg3, #1
-    vsub.s32		\arg6, \arg6, \arg3			//int32 e[2][j] = (f[1][j]>>1) - f[3][j];
-    vadd.s32		\arg7, \arg1, \arg7			//int32 e[3][j] = f[1][j] + (f[3][j]>>1);
-//	}
+.macro COL_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
+//  {   //  input: src_q[0]~[3], output: e_q[0]~[3];
+    vadd.s32        \arg4, \arg0, \arg2         //int32 e[0][j] = f[0][j] + f[2][j];
+    vsub.s32        \arg5, \arg0, \arg2         //int32 e[1][j] = f[0][j] - f[2][j];
+    vshr.s32        \arg6, \arg1, #1
+    vshr.s32        \arg7, \arg3, #1
+    vsub.s32        \arg6, \arg6, \arg3         //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
+    vadd.s32        \arg7, \arg1, \arg7         //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
+//  }
 .endm
 #endif
 // r0    int16_t* block,
@@ -103,61 +103,61 @@
 // r1    int8_t* non_zero_count,
 WELS_ASM_FUNC_BEGIN SetNonZeroCount_neon
 
-	vld1.64	{d0-d2}, [r1]
+    vld1.64 {d0-d2}, [r1]
 
-	vceq.s8	q0, q0, #0
-	vceq.s8	d2, d2, #0
-	vmvn	q0, q0
-	vmvn	d2, d2
-	vabs.s8	q0, q0
-	vabs.s8	d2, d2
+    vceq.s8 q0, q0, #0
+    vceq.s8 d2, d2, #0
+    vmvn    q0, q0
+    vmvn    d2, d2
+    vabs.s8 q0, q0
+    vabs.s8 d2, d2
 
-	vst1.64	{d0-d2}, [r1]
+    vst1.64 {d0-d2}, [r1]
 WELS_ASM_FUNC_END
 
 
-//	uint8_t *pred, const int32_t stride, int16_t *rs
+//  uint8_t *pred, const int32_t stride, int16_t *rs
 WELS_ASM_FUNC_BEGIN IdctResAddPred_neon
 
-	vld4.s16		{d0, d1, d2, d3}, [r2]		// cost 3 cycles!
+    vld4.s16        {d0, d1, d2, d3}, [r2]      // cost 3 cycles!
 
-	ROW_TRANSFORM_1_STEP		d0, d1, d2, d3, q8, q9, q10, q11, d4, d5
+    ROW_TRANSFORM_1_STEP        d0, d1, d2, d3, q8, q9, q10, q11, d4, d5
 
-	TRANSFORM_4BYTES		q0, q1, q2, q3, q8, q9, q10, q11
+    TRANSFORM_4BYTES        q0, q1, q2, q3, q8, q9, q10, q11
 
-	// transform element 32bits
-	vtrn.s32		q0, q1				//[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
-	vtrn.s32		q2, q3				//[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
-	vswp			d1, d4				//[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
-	vswp			d3, d6				//[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
+    // transform element 32bits
+    vtrn.s32        q0, q1              //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
+    vtrn.s32        q2, q3              //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
+    vswp            d1, d4              //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
+    vswp            d3, d6              //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
 
-	COL_TRANSFORM_1_STEP		q0, q1, q2, q3, q8, q9, q10, q11
+    COL_TRANSFORM_1_STEP        q0, q1, q2, q3, q8, q9, q10, q11
 
-	TRANSFORM_4BYTES		q0, q1, q2, q3, q8, q9, q10, q11
+    TRANSFORM_4BYTES        q0, q1, q2, q3, q8, q9, q10, q11
 
-	//after clip_table[MAX_NEG_CROP] into [0, 255]
-	mov			r2, r0
-	vld1.32		{d20[0]},[r0],r1
-	vld1.32		{d20[1]},[r0],r1
-	vld1.32		{d22[0]},[r0],r1
-	vld1.32		{d22[1]},[r0]
+    //after clip_table[MAX_NEG_CROP] into [0, 255]
+    mov         r2, r0
+    vld1.32     {d20[0]},[r0],r1
+    vld1.32     {d20[1]},[r0],r1
+    vld1.32     {d22[0]},[r0],r1
+    vld1.32     {d22[1]},[r0]
 
-	vrshrn.s32		d16, q0, #6
-	vrshrn.s32		d17, q1, #6
-	vrshrn.s32		d18, q2, #6
-	vrshrn.s32		d19, q3, #6
+    vrshrn.s32      d16, q0, #6
+    vrshrn.s32      d17, q1, #6
+    vrshrn.s32      d18, q2, #6
+    vrshrn.s32      d19, q3, #6
 
-	vmovl.u8		q0,d20
-	vmovl.u8		q1,d22
-	vadd.s16		q0,q8
-	vadd.s16		q1,q9
+    vmovl.u8        q0,d20
+    vmovl.u8        q1,d22
+    vadd.s16        q0,q8
+    vadd.s16        q1,q9
 
-	vqmovun.s16		d20,q0
-	vqmovun.s16		d22,q1
+    vqmovun.s16     d20,q0
+    vqmovun.s16     d22,q1
 
-	vst1.32		{d20[0]},[r2],r1
-	vst1.32		{d20[1]},[r2],r1
-	vst1.32		{d22[0]},[r2],r1
-	vst1.32		{d22[1]},[r2]
+    vst1.32     {d20[0]},[r2],r1
+    vst1.32     {d20[1]},[r2],r1
+    vst1.32     {d22[0]},[r2],r1
+    vst1.32     {d22[1]},[r2]
 WELS_ASM_FUNC_END
 #endif
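
The ROW_TRANSFORM_1_STEP / TRANSFORM_4BYTES / COL_TRANSFORM_1_STEP macros spell out the standard H.264 4x4 inverse transform in their comments. Below is a plain-C sketch of what IdctResAddPred_neon produces; the IdctResAddPred_c name is illustrative, assuming the usual (x + 32) >> 6 rounding and clip to [0, 255] that the vrshrn/vqmovun pair performs.

#include <stdint.h>

static uint8_t Clip255 (int32_t v) { return (uint8_t) (v < 0 ? 0 : (v > 255 ? 255 : v)); }

static void IdctResAddPred_c (uint8_t* pred, int32_t stride, const int16_t* rs) {
    int32_t e[4], f[4][4], g[4];
    for (int i = 0; i < 4; ++i) {                   /* row transform */
        const int16_t* s = rs + (i << 2);
        e[0] = s[0] + s[2];
        e[1] = s[0] - s[2];
        e[2] = (s[1] >> 1) - s[3];
        e[3] = s[1] + (s[3] >> 1);
        f[i][0] = e[0] + e[3];
        f[i][1] = e[1] + e[2];
        f[i][2] = e[1] - e[2];
        f[i][3] = e[0] - e[3];
    }
    for (int j = 0; j < 4; ++j) {                   /* column transform */
        e[0] = f[0][j] + f[2][j];
        e[1] = f[0][j] - f[2][j];
        e[2] = (f[1][j] >> 1) - f[3][j];
        e[3] = f[1][j] + (f[3][j] >> 1);
        g[0] = e[0] + e[3];
        g[1] = e[1] + e[2];
        g[2] = e[1] - e[2];
        g[3] = e[0] - e[3];
        for (int i = 0; i < 4; ++i)                 /* round, add prediction, clip */
            pred[i * stride + j] = Clip255 (pred[i * stride + j] + ((g[i] + 32) >> 6));
    }
}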
--- a/codec/decoder/core/arm/intra_pred_neon.S
+++ b/codec/decoder/core/arm/intra_pred_neon.S
@@ -38,45 +38,45 @@
 #ifdef __APPLE__
 //Global macro
 .macro GET_8BYTE_DATA
-	vld1.8 {$0[0]}, [$1], $2
-	vld1.8 {$0[1]}, [$1], $2
-	vld1.8 {$0[2]}, [$1], $2
-	vld1.8 {$0[3]}, [$1], $2
-	vld1.8 {$0[4]}, [$1], $2
-	vld1.8 {$0[5]}, [$1], $2
-	vld1.8 {$0[6]}, [$1], $2
-	vld1.8 {$0[7]}, [$1], $2
+    vld1.8 {$0[0]}, [$1], $2
+    vld1.8 {$0[1]}, [$1], $2
+    vld1.8 {$0[2]}, [$1], $2
+    vld1.8 {$0[3]}, [$1], $2
+    vld1.8 {$0[4]}, [$1], $2
+    vld1.8 {$0[5]}, [$1], $2
+    vld1.8 {$0[6]}, [$1], $2
+    vld1.8 {$0[7]}, [$1], $2
 .endmacro
 #else
 //Global macro
 .macro GET_8BYTE_DATA arg0, arg1, arg2
-	vld1.8 {\arg0[0]}, [\arg1], \arg2
-	vld1.8 {\arg0[1]}, [\arg1], \arg2
-	vld1.8 {\arg0[2]}, [\arg1], \arg2
-	vld1.8 {\arg0[3]}, [\arg1], \arg2
-	vld1.8 {\arg0[4]}, [\arg1], \arg2
-	vld1.8 {\arg0[5]}, [\arg1], \arg2
-	vld1.8 {\arg0[6]}, [\arg1], \arg2
-	vld1.8 {\arg0[7]}, [\arg1], \arg2
+    vld1.8 {\arg0[0]}, [\arg1], \arg2
+    vld1.8 {\arg0[1]}, [\arg1], \arg2
+    vld1.8 {\arg0[2]}, [\arg1], \arg2
+    vld1.8 {\arg0[3]}, [\arg1], \arg2
+    vld1.8 {\arg0[4]}, [\arg1], \arg2
+    vld1.8 {\arg0[5]}, [\arg1], \arg2
+    vld1.8 {\arg0[6]}, [\arg1], \arg2
+    vld1.8 {\arg0[7]}, [\arg1], \arg2
 .endm
 #endif
 
 
 WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredV_neon
-	//Get the top line data to 'q0'
-	sub  r2, r0, r1
-	vldm r2, {d0, d1}
+    //Get the top line data to 'q0'
+    sub  r2, r0, r1
+    vldm r2, {d0, d1}
 
-	mov  r2, r0
-	mov  r3, #4
-	//Set the top line to the each line of MB(16*16)
+    mov  r2, r0
+    mov  r3, #4
+    //Set the top line to the each line of MB(16*16)
 loop_0_get_i16x16_luma_pred_v:
-	vst1.8 {d0,d1}, [r2], r1
-	vst1.8 {d0,d1}, [r2], r1
-	vst1.8 {d0,d1}, [r2], r1
-	vst1.8 {d0,d1}, [r2], r1
-	subs  r3, #1
-	bne  loop_0_get_i16x16_luma_pred_v
+    vst1.8 {d0,d1}, [r2], r1
+    vst1.8 {d0,d1}, [r2], r1
+    vst1.8 {d0,d1}, [r2], r1
+    vst1.8 {d0,d1}, [r2], r1
+    subs  r3, #1
+    bne  loop_0_get_i16x16_luma_pred_v
 
 WELS_ASM_FUNC_END
 
@@ -83,59 +83,59 @@
 
 
 WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredH_neon
-	sub  r2, r0, #1
-	mov  r3, #4
+    sub  r2, r0, #1
+    mov  r3, #4
 loop_0_get_i16x16_luma_pred_h:
-	//Get one byte data from left side
-	vld1.8 {d0[],d1[]}, [r2], r1
-	vld1.8 {d2[],d3[]}, [r2], r1
-	vld1.8 {d4[],d5[]}, [r2], r1
-	vld1.8 {d6[],d7[]}, [r2], r1
+    //Get one byte data from left side
+    vld1.8 {d0[],d1[]}, [r2], r1
+    vld1.8 {d2[],d3[]}, [r2], r1
+    vld1.8 {d4[],d5[]}, [r2], r1
+    vld1.8 {d6[],d7[]}, [r2], r1
 
-	//Set the line of MB using the left side byte data
-	vst1.8 {d0,d1}, [r0], r1
-	vst1.8 {d2,d3}, [r0], r1
-	vst1.8 {d4,d5}, [r0], r1
-	vst1.8 {d6,d7}, [r0], r1
+    //Set the line of MB using the left side byte data
+    vst1.8 {d0,d1}, [r0], r1
+    vst1.8 {d2,d3}, [r0], r1
+    vst1.8 {d4,d5}, [r0], r1
+    vst1.8 {d6,d7}, [r0], r1
 
-	subs  r3, #1
-	bne  loop_0_get_i16x16_luma_pred_h
+    subs  r3, #1
+    bne  loop_0_get_i16x16_luma_pred_h
 
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredDc_neon
-	//stmdb sp!, { r2-r5, lr}
-	//Get the left vertical line data
-	sub r2, r0, #1
-	GET_8BYTE_DATA d0, r2, r1
-	GET_8BYTE_DATA d1, r2, r1
+    //stmdb sp!, { r2-r5, lr}
+    //Get the left vertical line data
+    sub r2, r0, #1
+    GET_8BYTE_DATA d0, r2, r1
+    GET_8BYTE_DATA d1, r2, r1
 
-	//Get the top horizontal line data
-	sub  r2, r0, r1
-	vldm r2, {d2, d3}
+    //Get the top horizontal line data
+    sub  r2, r0, r1
+    vldm r2, {d2, d3}
 
-	//Calculate the sum of top horizontal line data and vertical line data
-	vpaddl.u8 q0, q0
-	vpaddl.u8 q1, q1
-	vadd.u16  q0, q0, q1
-	vadd.u16  d0, d0, d1
-	vpaddl.u16 d0, d0
-	vpaddl.u32 d0, d0
+    //Calculate the sum of top horizontal line data and vertical line data
+    vpaddl.u8 q0, q0
+    vpaddl.u8 q1, q1
+    vadd.u16  q0, q0, q1
+    vadd.u16  d0, d0, d1
+    vpaddl.u16 d0, d0
+    vpaddl.u32 d0, d0
 
-	//Calculate the mean value
-	vrshr.u16  d0, d0, #5
-	vdup.8     q0, d0[0]
+    //Calculate the mean value
+    vrshr.u16  d0, d0, #5
+    vdup.8     q0, d0[0]
 
-	//Set the mean value to the all of member of MB
-	mov  r2, #4
+    //Set the mean value to the all of member of MB
+    mov  r2, #4
 loop_0_get_i16x16_luma_pred_dc_both:
-	vst1.8 {d0,d1}, [r0], r1
-	vst1.8 {d0,d1}, [r0], r1
-	vst1.8 {d0,d1}, [r0], r1
-	vst1.8 {d0,d1}, [r0], r1
-	subs  r2, #1
-	bne  loop_0_get_i16x16_luma_pred_dc_both
+    vst1.8 {d0,d1}, [r0], r1
+    vst1.8 {d0,d1}, [r0], r1
+    vst1.8 {d0,d1}, [r0], r1
+    vst1.8 {d0,d1}, [r0], r1
+    subs  r2, #1
+    bne  loop_0_get_i16x16_luma_pred_dc_both
 
 WELS_ASM_FUNC_END
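
WelsDecoderI16x16LumaPredDc_neon averages the 16 top and 16 left neighbours with (sum + 16) >> 5 rounding (vrshr.u16 #5) and floods the macroblock with the result. A scalar sketch, assuming both neighbours are available as this entry point expects:

#include <stdint.h>

static void I16x16LumaPredDc_c (uint8_t* pred, int32_t stride) {
    uint32_t sum = 0;
    for (int i = 0; i < 16; ++i)
        sum += pred[-1 + i * stride]      /* left column */
             + pred[-stride + i];         /* top row     */
    uint8_t dc = (uint8_t) ((sum + 16) >> 5);
    for (int i = 0; i < 16; ++i)
        for (int j = 0; j < 16; ++j)
            pred[i * stride + j] = dc;
}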
 
@@ -149,106 +149,106 @@
 
 
 WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredPlane_neon
-	//stmdb sp!, { r2-r5, lr}
+    //stmdb sp!, { r2-r5, lr}
 
-	//Load the table {(8,7,6,5,4,3,2,1) * 5}
-	adr r2, CONST0_GET_I16X16_LUMA_PRED_PLANE
-	vldr    d0, [r2]
+    //Load the table {(8,7,6,5,4,3,2,1) * 5}
+    adr r2, CONST0_GET_I16X16_LUMA_PRED_PLANE
+    vldr    d0, [r2]
 
-	//Pack the top[-1] ~ top[6] to d1
-	sub       r2,  r0, r1
-	sub       r3,  r2, #1
-	vld1.8    d1, [r3]
+    //Pack the top[-1] ~ top[6] to d1
+    sub       r2,  r0, r1
+    sub       r3,  r2, #1
+    vld1.8    d1, [r3]
 
-	//Pack the top[8] ~ top[15] to d2
-	add       r3, #9
-	vld1.8    d2, [r3]
+    //Pack the top[8] ~ top[15] to d2
+    add       r3, #9
+    vld1.8    d2, [r3]
 
-	//Save the top[15] to d6 for next step
-	vdup.u8   d6,   d2[7]
+    //Save the top[15] to d6 for next step
+    vdup.u8   d6,   d2[7]
 
-	//Get and pack left[-1] ~ left[6] to d4
-	sub       r3,  r2, #1
-	GET_8BYTE_DATA d4, r3, r1
+    //Get and pack left[-1] ~ left[6] to d4
+    sub       r3,  r2, #1
+    GET_8BYTE_DATA d4, r3, r1
 
-	//Get and pack left[8] ~ left[15] to d3
-	add       r3,  r1
-	GET_8BYTE_DATA d3, r3, r1
+    //Get and pack left[8] ~ left[15] to d3
+    add       r3,  r1
+    GET_8BYTE_DATA d3, r3, r1
 
-	//Save the left[15] to d7 for next step
-	vdup.u8   d7,   d3[7]
+    //Save the left[15] to d7 for next step
+    vdup.u8   d7,   d3[7]
 
-	//revert the sequence of d2,d3
-	vrev64.8   q1, q1
+    //revert the sequence of d2,d3
+    vrev64.8   q1, q1
 
-	vsubl.u8   q2, d3, d4 //q2={left[8]-left[6],left[9]-left[5],left[10]-left[4], ...}
-	vsubl.u8   q1, d2, d1 //q1={top[8]-top[6],top[9]-top[5],top[10]-top[4], ...}
+    vsubl.u8   q2, d3, d4 //q2={left[8]-left[6],left[9]-left[5],left[10]-left[4], ...}
+    vsubl.u8   q1, d2, d1 //q1={top[8]-top[6],top[9]-top[5],top[10]-top[4], ...}
 
 
-	vmovl.u8   q0, d0
-	vmul.s16   q1, q0, q1 //q1 = q1*{(8,7,6,5,4,3,2,1) * 5}
-	vmul.s16   q2, q0, q2 //q2 = q2*{(8,7,6,5,4,3,2,1) * 5}
+    vmovl.u8   q0, d0
+    vmul.s16   q1, q0, q1 //q1 = q1*{(8,7,6,5,4,3,2,1) * 5}
+    vmul.s16   q2, q0, q2 //q2 = q2*{(8,7,6,5,4,3,2,1) * 5}
 
-	//Calculate the sum of items of q1, q2
-	vpadd.s16  d0, d2, d3
-	vpadd.s16  d1, d4, d5
-	vpaddl.s16 q0, q0
-	vpaddl.s32 q0, q0
+    //Calculate the sum of items of q1, q2
+    vpadd.s16  d0, d2, d3
+    vpadd.s16  d1, d4, d5
+    vpaddl.s16 q0, q0
+    vpaddl.s32 q0, q0
 
-	//Get the value of 'b', 'c' and extend to q1, q2.
-	vrshr.s64  q0, #6
-	vdup.s16   q1, d0[0]
-	vdup.s16   q2, d1[0]
+    //Get the value of 'b', 'c' and extend to q1, q2.
+    vrshr.s64  q0, #6
+    vdup.s16   q1, d0[0]
+    vdup.s16   q2, d1[0]
 
-	//Load the table {-7,-6,-5,-4,-3,-2,-1,0} to d0
-	adr r2, CONST1_GET_I16X16_LUMA_PRED_PLANE
-	vld1.32   {d0}, [r2]
+    //Load the table {-7,-6,-5,-4,-3,-2,-1,0} to d0
+    adr r2, CONST1_GET_I16X16_LUMA_PRED_PLANE
+    vld1.32   {d0}, [r2]
 
-	//Get the value of 'a' and save to q3
-	vaddl.u8  q3, d6, d7
-	vshl.u16  q3, #4
+    //Get the value of 'a' and save to q3
+    vaddl.u8  q3, d6, d7
+    vshl.u16  q3, #4
 
-	//calculate a+'b'*{-7,-6,-5,-4,-3,-2,-1,0} + c*{-7}
-	vmovl.s8  q0, d0
-	vmla.s16  q3, q0, q1
-	vmla.s16  q3, q2, d0[0]
+    //calculate a+'b'*{-7,-6,-5,-4,-3,-2,-1,0} + c*{-7}
+    vmovl.s8  q0, d0
+    vmla.s16  q3, q0, q1
+    vmla.s16  q3, q2, d0[0]
 
-	//Calculate a+'b'*{1,2,3,4,5,6,7,8} + c*{-7}
-	vshl.s16  q8, q1, #3
-	vadd.s16  q8, q3
+    //Calculate a+'b'*{1,2,3,4,5,6,7,8} + c*{-7}
+    vshl.s16  q8, q1, #3
+    vadd.s16  q8, q3
 
-	//right shift 5 bits and rounding
-	vqrshrun.s16 d0, q3, #5
-	vqrshrun.s16 d1, q8, #5
+    //right shift 5 bits and rounding
+    vqrshrun.s16 d0, q3, #5
+    vqrshrun.s16 d1, q8, #5
 
-	//Set the line of MB
-	vst1.u32  {d0,d1}, [r0], r1
+    //Set the line of MB
+    vst1.u32  {d0,d1}, [r0], r1
 
 
-	//Do the same processing for setting other lines
-	mov  r2, #15
+    //Do the same processing for setting other lines
+    mov  r2, #15
 loop_0_get_i16x16_luma_pred_plane:
-	vadd.s16  q3, q2
-	vadd.s16  q8, q2
-	vqrshrun.s16 d0, q3, #5
-	vqrshrun.s16 d1, q8, #5
-	vst1.u32  {d0,d1}, [r0], r1
-	subs  r2, #1
-	bne  loop_0_get_i16x16_luma_pred_plane
+    vadd.s16  q3, q2
+    vadd.s16  q8, q2
+    vqrshrun.s16 d0, q3, #5
+    vqrshrun.s16 d1, q8, #5
+    vst1.u32  {d0,d1}, [r0], r1
+    subs  r2, #1
+    bne  loop_0_get_i16x16_luma_pred_plane
 
 WELS_ASM_FUNC_END
 
 WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredV_neon
-	//stmdb sp!, { r2-r5, lr}
-	//Load the top row (4 bytes)
-	sub  r2, r0, r1
-	ldr  r2, [r2]
+    //stmdb sp!, { r2-r5, lr}
+    //Load the top row (4 bytes)
+    sub  r2, r0, r1
+    ldr  r2, [r2]
 
-	//Set the luma MB using top line
-	str  r2, [r0], r1
-	str  r2, [r0], r1
-	str  r2, [r0], r1
-	str  r2, [r0]
+    //Set the luma MB using top line
+    str  r2, [r0], r1
+    str  r2, [r0], r1
+    str  r2, [r0], r1
+    str  r2, [r0]
 
 WELS_ASM_FUNC_END
 
@@ -255,97 +255,97 @@
 
 
 WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredH_neon
-	//stmdb sp!, { r2-r5, lr}
-	//Load the left column (4 bytes)
-	sub  r2, r0, #1
-	vld1.8 {d0[]}, [r2], r1
-	vld1.8 {d1[]}, [r2], r1
-	vld1.8 {d2[]}, [r2], r1
-	vld1.8 {d3[]}, [r2]
+    //stmdb sp!, { r2-r5, lr}
+    //Load the left column (4 bytes)
+    sub  r2, r0, #1
+    vld1.8 {d0[]}, [r2], r1
+    vld1.8 {d1[]}, [r2], r1
+    vld1.8 {d2[]}, [r2], r1
+    vld1.8 {d3[]}, [r2]
 
-	//Set the luma MB using the left side byte
-	vst1.32 {d0[0]}, [r0], r1
-	vst1.32 {d1[0]}, [r0], r1
-	vst1.32 {d2[0]}, [r0], r1
-	vst1.32 {d3[0]}, [r0]
+    //Set the luma MB using the left side byte
+    vst1.32 {d0[0]}, [r0], r1
+    vst1.32 {d1[0]}, [r0], r1
+    vst1.32 {d2[0]}, [r0], r1
+    vst1.32 {d3[0]}, [r0]
 
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredDDL_neon
-	//stmdb sp!, { r2-r5, lr}
-	//Load the top row data(8 bytes)
-	sub    r2,  r0, r1
-	vld1.32  {d0}, [r2]
+    //stmdb sp!, { r2-r5, lr}
+    //Load the top row data(8 bytes)
+    sub    r2,  r0, r1
+    vld1.32  {d0}, [r2]
 
-	//For "t7 + (t7<<1)"
-	vdup.8   d1,  d0[7]
+    //For "t7 + (t7<<1)"
+    vdup.8   d1,  d0[7]
 
-	//calculate "t0+t1,t1+t2,t2+t3...t6+t7,t7+t7"
-	vext.8   d1,  d0, d1, #1
-	vaddl.u8 q1,  d1, d0
+    //calculate "t0+t1,t1+t2,t2+t3...t6+t7,t7+t7"
+    vext.8   d1,  d0, d1, #1
+    vaddl.u8 q1,  d1, d0
 
-	//calculate "x,t0+t1+t1+t2,t1+t2+t2+t3,...t5+t6+t6+t7,t6+t7+t7+t7"
-	vext.8   q2,  q1, q1, #14
-	vadd.u16 q0,  q1, q2
+    //calculate "x,t0+t1+t1+t2,t1+t2+t2+t3,...t5+t6+t6+t7,t6+t7+t7+t7"
+    vext.8   q2,  q1, q1, #14
+    vadd.u16 q0,  q1, q2
 
-	//right shift 2 bits and rounding
-	vqrshrn.u16  d0,  q0, #2
+    //right shift 2 bits and rounding
+    vqrshrn.u16  d0,  q0, #2
 
-	//Save "ddl0, ddl1, ddl2, ddl3"
-	vext.8   d1, d0, d0, #1
-	vst1.32  d1[0], [r0], r1
+    //Save "ddl0, ddl1, ddl2, ddl3"
+    vext.8   d1, d0, d0, #1
+    vst1.32  d1[0], [r0], r1
 
-	//Save "ddl1, ddl2, ddl3, ddl4"
-	vext.8   d1, d0, d0, #2
-	vst1.32  d1[0], [r0], r1
+    //Save "ddl1, ddl2, ddl3, ddl4"
+    vext.8   d1, d0, d0, #2
+    vst1.32  d1[0], [r0], r1
 
-	//Save "ddl2, ddl3, ddl4, ddl5"
-	vext.8   d1, d0, d0, #3
-	vst1.32  d1[0], [r0], r1
+    //Save "ddl2, ddl3, ddl4, ddl5"
+    vext.8   d1, d0, d0, #3
+    vst1.32  d1[0], [r0], r1
 
-	//Save "ddl3, ddl4, ddl5, ddl6"
-	vst1.32  d0[1], [r0]
+    //Save "ddl3, ddl4, ddl5, ddl6"
+    vst1.32  d0[1], [r0]
 
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredDDR_neon
-	//stmdb sp!, { r2-r5, lr}
-	//Load the top row (4 bytes)
-	sub    r2,  r0, r1
-	vld1.32  {d0[1]}, [r2]
+    //stmdb sp!, { r2-r5, lr}
+    //Load the top row (4 bytes)
+    sub    r2,  r0, r1
+    vld1.32  {d0[1]}, [r2]
 
-	//Load the left column (5 bytes)
-	sub    r2,  #1
-	vld1.8 {d0[3]}, [r2], r1
-	vld1.8 {d0[2]}, [r2], r1
-	vld1.8 {d0[1]}, [r2], r1
-	vld1.8 {d0[0]}, [r2], r1
-	vld1.8 {d1[7]}, [r2] //For packing the right sequence to do SIMD processing
+    //Load the left column (5 bytes)
+    sub    r2,  #1
+    vld1.8 {d0[3]}, [r2], r1
+    vld1.8 {d0[2]}, [r2], r1
+    vld1.8 {d0[1]}, [r2], r1
+    vld1.8 {d0[0]}, [r2], r1
+    vld1.8 {d1[7]}, [r2] //For packing the right sequence to do SIMD processing
 
 
-	vext.8   d2, d1, d0, #7   //d0:{L2,L1,L0,LT,T0,T1,T2,T3}
-	                          //d2:{L3,L2,L1,L0,LT,T0,T1,T2}
+    vext.8   d2, d1, d0, #7   //d0:{L2,L1,L0,LT,T0,T1,T2,T3}
+                              //d2:{L3,L2,L1,L0,LT,T0,T1,T2}
 
-	//q2:{L2+L3,L1+L2,L0+L1...T1+T2,T2+T3}
-	vaddl.u8 q2, d2, d0
+    //q2:{L2+L3,L1+L2,L0+L1...T1+T2,T2+T3}
+    vaddl.u8 q2, d2, d0
 
-	//q1:{TL0+LT0,LT0+T01,...L12+L23}
-	vext.8   q3, q3, q2, #14
-	vadd.u16 q1, q2, q3
+    //q1:{TL0+LT0,LT0+T01,...L12+L23}
+    vext.8   q3, q3, q2, #14
+    vadd.u16 q1, q2, q3
 
-	//right shift 2 bits and rounding
-	vqrshrn.u16 d0, q1, #2
+    //right shift 2 bits and rounding
+    vqrshrn.u16 d0, q1, #2
 
-	//Adjust the data sequence for setting luma MB of 'pred'
-	vst1.32   d0[1], [r0], r1
-	vext.8    d0, d0, d0, #7
-	vst1.32   d0[1], [r0], r1
-	vext.8    d0, d0, d0, #7
-	vst1.32   d0[1], [r0], r1
-	vext.8    d0, d0, d0, #7
-	vst1.32   d0[1], [r0]
+    //Adjust the data sequence for setting luma MB of 'pred'
+    vst1.32   d0[1], [r0], r1
+    vext.8    d0, d0, d0, #7
+    vst1.32   d0[1], [r0], r1
+    vext.8    d0, d0, d0, #7
+    vst1.32   d0[1], [r0], r1
+    vext.8    d0, d0, d0, #7
+    vst1.32   d0[1], [r0]
 
 WELS_ASM_FUNC_END
 
@@ -352,31 +352,31 @@
 
 
 WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredVL_neon
-	//stmdb sp!, { r2-r5, lr}
-	//Load the top row (8 bytes)
-	sub    r2,  r0, r1
-	vld1.32  {d0}, [r2]
+    //stmdb sp!, { r2-r5, lr}
+    //Load the top row (8 bytes)
+    sub    r2,  r0, r1
+    vld1.32  {d0}, [r2]
 
 
-	vext.8   d1,  d0, d0, #1
-	vaddl.u8 q1,  d1, d0     //q1:{t0+t1,t1+t2,t2+t3...t5+t6,x,x}
+    vext.8   d1,  d0, d0, #1
+    vaddl.u8 q1,  d1, d0     //q1:{t0+t1,t1+t2,t2+t3...t5+t6,x,x}
 
-	vext.8   q2,  q1, q1, #2
-	vadd.u16 q2,  q1, q2     //q2:{t0+t1+t1+t2,t1+t2+t2+t3,...t4+t5+t5+t6,x,x}
+    vext.8   q2,  q1, q1, #2
+    vadd.u16 q2,  q1, q2     //q2:{t0+t1+t1+t2,t1+t2+t2+t3,...t4+t5+t5+t6,x,x}
 
-	//calculate the "vl0,vl1,vl2,vl3,vl4"
-	vqrshrn.u16  d0,  q1, #1
+    //calculate the "vl0,vl1,vl2,vl3,vl4"
+    vqrshrn.u16  d0,  q1, #1
 
-	//calculate the "vl5,vl6,vl7,vl8,vl9"
-	vqrshrn.u16  d1,  q2, #2
+    //calculate the "vl5,vl6,vl7,vl8,vl9"
+    vqrshrn.u16  d1,  q2, #2
 
-	//Adjust the data sequence for setting the luma MB
-	vst1.32  d0[0], [r0], r1
-	vst1.32  d1[0], [r0], r1
-	vext.8   d0,  d0, d0, #1
-	vext.8   d1,  d1, d1, #1
-	vst1.32  d0[0], [r0], r1
-	vst1.32  d1[0], [r0]
+    //Adjust the data sequence for setting the luma MB
+    vst1.32  d0[0], [r0], r1
+    vst1.32  d1[0], [r0], r1
+    vext.8   d0,  d0, d0, #1
+    vext.8   d1,  d1, d1, #1
+    vst1.32  d0[0], [r0], r1
+    vst1.32  d1[0], [r0]
 
 WELS_ASM_FUNC_END
 
@@ -383,152 +383,152 @@
 
 
 WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredVR_neon
-	//stmdb sp!, { r2-r5, lr}
-	//Load the top row (4 bytes)
-	sub       r2,  r0, r1
-	vld1.32   {d0[1]}, [r2]
+    //stmdb sp!, { r2-r5, lr}
+    //Load the top row (4 bytes)
+    sub       r2,  r0, r1
+    vld1.32   {d0[1]}, [r2]
 
-	//Load the left column (4 bytes)
-	sub       r2,  #1
-	vld1.8    {d0[3]}, [r2], r1
-	vld1.8    {d0[2]}, [r2], r1
-	vld1.8    {d0[1]}, [r2], r1
-	vld1.8    {d0[0]}, [r2]
+    //Load the left column (4 bytes)
+    sub       r2,  #1
+    vld1.8    {d0[3]}, [r2], r1
+    vld1.8    {d0[2]}, [r2], r1
+    vld1.8    {d0[1]}, [r2], r1
+    vld1.8    {d0[0]}, [r2]
 
 
-	vext.8    d1, d0, d0, #7
-	vaddl.u8  q1, d0, d1      //q1:{X,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2,T2+T3}
+    vext.8    d1, d0, d0, #7
+    vaddl.u8  q1, d0, d1      //q1:{X,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2,T2+T3}
 
-	vext.u8   q2, q1, q1, #14
-	vadd.u16  q2, q2, q1      //q2:{X,L2+L1+L1+L0,L1+L0+L0+LT,...T1+T2+T2+T3}
+    vext.u8   q2, q1, q1, #14
+    vadd.u16  q2, q2, q1      //q2:{X,L2+L1+L1+L0,L1+L0+L0+LT,...T1+T2+T2+T3}
 
-	//Calculate the vr0 ~ vr9
-	vqrshrn.u16 d1, q2, #2
-	vqrshrn.u16 d0, q1, #1
+    //Calculate the vr0 ~ vr9
+    vqrshrn.u16 d1, q2, #2
+    vqrshrn.u16 d0, q1, #1
 
-	//Adjust the data sequence for setting the luma MB
-	vst1.32  d0[1], [r0], r1
-	vst1.32  d1[1], [r0], r1
-	add    r2, r0, r1
-	vst1.8   d1[3], [r0]!
-	vst1.16  d0[2], [r0]!
-	vst1.8   d0[6], [r0]!
-	vst1.8   d1[2], [r2]!
-	vst1.16  d1[2], [r2]!
-	vst1.8   d1[6], [r2]
+    //Adjust the data sequence for setting the luma MB
+    vst1.32  d0[1], [r0], r1
+    vst1.32  d1[1], [r0], r1
+    add    r2, r0, r1
+    vst1.8   d1[3], [r0]!
+    vst1.16  d0[2], [r0]!
+    vst1.8   d0[6], [r0]!
+    vst1.8   d1[2], [r2]!
+    vst1.16  d1[2], [r2]!
+    vst1.8   d1[6], [r2]
 WELS_ASM_FUNC_END
 
 
 
 WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredHU_neon
-	//stmdb sp!, { r2-r5, lr}
-	//Load the left column data
-	sub       r2,  r0, #1
-	mov       r3,  #3
-	mul       r3,  r1
-	add       r3,  r2
-	vld1.8    {d0[]},  [r3]
-	vld1.8    {d0[4]}, [r2], r1
-	vld1.8    {d0[5]}, [r2], r1
-	vld1.8    {d0[6]}, [r2], r1 //d0:{L3,L3,L3,L3,L0,L1,L2,L3}
+    //stmdb sp!, { r2-r5, lr}
+    //Load the left column data
+    sub       r2,  r0, #1
+    mov       r3,  #3
+    mul       r3,  r1
+    add       r3,  r2
+    vld1.8    {d0[]},  [r3]
+    vld1.8    {d0[4]}, [r2], r1
+    vld1.8    {d0[5]}, [r2], r1
+    vld1.8    {d0[6]}, [r2], r1 //d0:{L3,L3,L3,L3,L0,L1,L2,L3}
 
-	vext.8    d1, d0, d0, #1
-	vaddl.u8  q2, d0, d1        //q2:{L3+L3,L3+L3,L3+L3,L3+L0,L0+L1,L1+L2,L2+L3,L3+L3}
+    vext.8    d1, d0, d0, #1
+    vaddl.u8  q2, d0, d1        //q2:{L3+L3,L3+L3,L3+L3,L3+L0,L0+L1,L1+L2,L2+L3,L3+L3}
 
-	vext.u8   d2, d5, d4, #2
-	vadd.u16  d3, d2, d5        //d3:{L0+L1+L1+L2,L1+L2+L2+L3,L2+L3+L3+L3,L3+L3+L3+L3}
+    vext.u8   d2, d5, d4, #2
+    vadd.u16  d3, d2, d5        //d3:{L0+L1+L1+L2,L1+L2+L2+L3,L2+L3+L3+L3,L3+L3+L3+L3}
 
-	//Calculate the hu0 ~ hu5
-	vqrshrn.u16 d2, q2, #1
-	vqrshrn.u16 d1, q1, #2
+    //Calculate the hu0 ~ hu5
+    vqrshrn.u16 d2, q2, #1
+    vqrshrn.u16 d1, q1, #2
 
-	//Adjust the data sequence for setting the luma MB
-	vzip.8   d2, d1
-	vst1.32  d1[0], [r0], r1
-	vext.8   d2, d1, d1, #2
-	vst1.32  d2[0], [r0], r1
-	vst1.32  d1[1], [r0], r1
-	vst1.32  d0[0], [r0]
+    //Adjust the data sequence for setting the luma MB
+    vzip.8   d2, d1
+    vst1.32  d1[0], [r0], r1
+    vext.8   d2, d1, d1, #2
+    vst1.32  d2[0], [r0], r1
+    vst1.32  d1[1], [r0], r1
+    vst1.32  d0[0], [r0]
 
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredHD_neon
-	//stmdb sp!, { r2-r5, lr}
-	//Load the data
-	sub       r2,  r0, r1
-	sub       r2,  #1
-	vld1.32   {d0[1]}, [r2], r1
-	vld1.8    {d0[3]}, [r2], r1
-	vld1.8    {d0[2]}, [r2], r1
-	vld1.8    {d0[1]}, [r2], r1
-	vld1.8    {d0[0]}, [r2]	    //d0:{L3,L2,L1,L0,LT,T0,T1,T2}
+    //stmdb sp!, { r2-r5, lr}
+    //Load the data
+    sub       r2,  r0, r1
+    sub       r2,  #1
+    vld1.32   {d0[1]}, [r2], r1
+    vld1.8    {d0[3]}, [r2], r1
+    vld1.8    {d0[2]}, [r2], r1
+    vld1.8    {d0[1]}, [r2], r1
+    vld1.8    {d0[0]}, [r2]     //d0:{L3,L2,L1,L0,LT,T0,T1,T2}
 
 
-	vext.8    d1, d0, d0, #7
-	vaddl.u8  q1, d0, d1        //q1:{x,L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2}
+    vext.8    d1, d0, d0, #7
+    vaddl.u8  q1, d0, d1        //q1:{x,L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2}
 
-	vext.u8   q2, q1, q1, #14   //q2:{x,x, L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1}
-	vadd.u16  q3, q2, q1        //q3:{x,x,L3+L2+L2+L1,L2+L1+L1+L0,L1+L0+L0+LT,L0+LT+LT+T0,LT+T0+T0+T1,T0+T1+T1+T2}
+    vext.u8   q2, q1, q1, #14   //q2:{x,x, L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1}
+    vadd.u16  q3, q2, q1        //q3:{x,x,L3+L2+L2+L1,L2+L1+L1+L0,L1+L0+L0+LT,L0+LT+LT+T0,LT+T0+T0+T1,T0+T1+T1+T2}
 
-	//Calculate the hd0~hd9
-	vqrshrn.u16 d1, q3, #2
-	vqrshrn.u16 d0, q2, #1
+    //Calculate the hd0~hd9
+    vqrshrn.u16 d1, q3, #2
+    vqrshrn.u16 d0, q2, #1
 
-	//Adjust the data sequence for setting the luma MB
-	vmov      d3, d1
-	vtrn.8    d0, d1
-	vext.u8   d2, d1, d1, #6
-	vst2.16  {d2[3], d3[3]}, [r0], r1
-	vst2.16  {d0[2], d1[2]}, [r0], r1
-	vmov     d3, d0
-	vst2.16  {d2[2], d3[2]}, [r0], r1
-	vst2.16  {d0[1], d1[1]}, [r0]
+    //Adjust the data sequence for setting the luma MB
+    vmov      d3, d1
+    vtrn.8    d0, d1
+    vext.u8   d2, d1, d1, #6
+    vst2.16  {d2[3], d3[3]}, [r0], r1
+    vst2.16  {d0[2], d1[2]}, [r0], r1
+    vmov     d3, d0
+    vst2.16  {d2[2], d3[2]}, [r0], r1
+    vst2.16  {d0[1], d1[1]}, [r0]
 
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredV_neon
-	//stmdb sp!, { r2-r5, lr}
-	//Get the top row (8 byte)
-	sub  r2, r0, r1
-	vldr d0, [r2]
+    //stmdb sp!, { r2-r5, lr}
+    //Get the top row (8 byte)
+    sub  r2, r0, r1
+    vldr d0, [r2]
 
-	//Set the chroma MB using top row data
-	vst1.8 {d0}, [r0], r1
-	vst1.8 {d0}, [r0], r1
-	vst1.8 {d0}, [r0], r1
-	vst1.8 {d0}, [r0], r1
-	vst1.8 {d0}, [r0], r1
-	vst1.8 {d0}, [r0], r1
-	vst1.8 {d0}, [r0], r1
-	vst1.8 {d0}, [r0]
+    //Set the chroma MB using top row data
+    vst1.8 {d0}, [r0], r1
+    vst1.8 {d0}, [r0], r1
+    vst1.8 {d0}, [r0], r1
+    vst1.8 {d0}, [r0], r1
+    vst1.8 {d0}, [r0], r1
+    vst1.8 {d0}, [r0], r1
+    vst1.8 {d0}, [r0], r1
+    vst1.8 {d0}, [r0]
 
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredH_neon
-	//stmdb sp!, { r2-r5, lr}
-	////Get the left column (8 byte)
-	sub  r2, r0, #1
-	vld1.8 {d0[]}, [r2], r1
-	vld1.8 {d1[]}, [r2], r1
-	vld1.8 {d2[]}, [r2], r1
-	vld1.8 {d3[]}, [r2], r1
-	vld1.8 {d4[]}, [r2], r1
-	vld1.8 {d5[]}, [r2], r1
-	vld1.8 {d6[]}, [r2], r1
-	vld1.8 {d7[]}, [r2]
+    //stmdb sp!, { r2-r5, lr}
+    ////Get the left column (8 byte)
+    sub  r2, r0, #1
+    vld1.8 {d0[]}, [r2], r1
+    vld1.8 {d1[]}, [r2], r1
+    vld1.8 {d2[]}, [r2], r1
+    vld1.8 {d3[]}, [r2], r1
+    vld1.8 {d4[]}, [r2], r1
+    vld1.8 {d5[]}, [r2], r1
+    vld1.8 {d6[]}, [r2], r1
+    vld1.8 {d7[]}, [r2]
 
-	//Set the chroma MB using left column data
-	vst1.8 {d0}, [r0], r1
-	vst1.8 {d1}, [r0], r1
-	vst1.8 {d2}, [r0], r1
-	vst1.8 {d3}, [r0], r1
-	vst1.8 {d4}, [r0], r1
-	vst1.8 {d5}, [r0], r1
-	vst1.8 {d6}, [r0], r1
-	vst1.8 {d7}, [r0]
+    //Set the chroma MB using left column data
+    vst1.8 {d0}, [r0], r1
+    vst1.8 {d1}, [r0], r1
+    vst1.8 {d2}, [r0], r1
+    vst1.8 {d3}, [r0], r1
+    vst1.8 {d4}, [r0], r1
+    vst1.8 {d5}, [r0], r1
+    vst1.8 {d6}, [r0], r1
+    vst1.8 {d7}, [r0]
 
 WELS_ASM_FUNC_END
 
@@ -576,73 +576,73 @@
 CONST1_GET_I_CHROMA_PRED_PLANE: .long 0xfffefffd, 0x0000ffff,0x00020001,0x00040003
 
 WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredPlane_neon
-	//stmdb sp!, { r2-r5, lr}
-	//Load the top row data
-	sub  r2, r0, #1
-	sub  r2, r1
-	vld1.32 {d1[0]}, [r2]
-	add  r2, #5
-	vld1.32 {d0[0]}, [r2]
+    //stmdb sp!, { r2-r5, lr}
+    //Load the top row data
+    sub  r2, r0, #1
+    sub  r2, r1
+    vld1.32 {d1[0]}, [r2]
+    add  r2, #5
+    vld1.32 {d0[0]}, [r2]
 
-	//Load the left column data
-	sub  r2, #5
-	vld1.8 {d1[4]}, [r2], r1
-	vld1.8 {d1[5]}, [r2], r1
-	vld1.8 {d1[6]}, [r2], r1
-	vld1.8 {d1[7]}, [r2], r1 //d1:{LT,T0,T1,T2,LT,L0,L1,L2}
-	add  r2, r1
-	vld1.8 {d0[4]}, [r2], r1
-	vld1.8 {d0[5]}, [r2], r1
-	vld1.8 {d0[6]}, [r2], r1
-	vld1.8 {d0[7]}, [r2]     //d0:{T4,T5,T6,T7,L4,L5,L6.L7}
+    //Load the left column data
+    sub  r2, #5
+    vld1.8 {d1[4]}, [r2], r1
+    vld1.8 {d1[5]}, [r2], r1
+    vld1.8 {d1[6]}, [r2], r1
+    vld1.8 {d1[7]}, [r2], r1 //d1:{LT,T0,T1,T2,LT,L0,L1,L2}
+    add  r2, r1
+    vld1.8 {d0[4]}, [r2], r1
+    vld1.8 {d0[5]}, [r2], r1
+    vld1.8 {d0[6]}, [r2], r1
+    vld1.8 {d0[7]}, [r2]     //d0:{T4,T5,T6,T7,L4,L5,L6.L7}
 
 
-	//Save T7 to d3 for next step
-	vdup.u8   d3,   d0[3]
-	//Save L7 to d4 for next step
-	vdup.u8   d4,   d0[7]
+    //Save T7 to d3 for next step
+    vdup.u8   d3,   d0[3]
+    //Save L7 to d4 for next step
+    vdup.u8   d4,   d0[7]
 
-	//Calculate the value of 'a' and save to q2
-	vaddl.u8  q2, d3, d4
-	vshl.u16  q2, #4
+    //Calculate the value of 'a' and save to q2
+    vaddl.u8  q2, d3, d4
+    vshl.u16  q2, #4
 
-	//Load the table {{1,2,3,4,1,2,3,4}*17}
-	adr r2, CONST0_GET_I_CHROMA_PRED_PLANE
-	vld1.32   {d2}, [r2]
+    //Load the table {{1,2,3,4,1,2,3,4}*17}
+    adr r2, CONST0_GET_I_CHROMA_PRED_PLANE
+    vld1.32   {d2}, [r2]
 
-	//Calculate the 'b','c', and save to q0
-	vrev32.8  d1, d1
-	vsubl.u8  q0, d0, d1
-	vmovl.u8   q1, d2
-	vmul.s16   q0, q1
-	vpaddl.s16 q0, q0
-	vpaddl.s32 q0, q0
-	vrshr.s64  q0, #5
+    //Calculate the 'b','c', and save to q0
+    vrev32.8  d1, d1
+    vsubl.u8  q0, d0, d1
+    vmovl.u8   q1, d2
+    vmul.s16   q0, q1
+    vpaddl.s16 q0, q0
+    vpaddl.s32 q0, q0
+    vrshr.s64  q0, #5
 
-	//Load the table {-3,-2,-1,0,1,2,3,4} to q3
-	adr r2, CONST1_GET_I_CHROMA_PRED_PLANE
-	vld1.32   {d6, d7}, [r2]
+    //Load the table {-3,-2,-1,0,1,2,3,4} to q3
+    adr r2, CONST1_GET_I_CHROMA_PRED_PLANE
+    vld1.32   {d6, d7}, [r2]
 
-	//Duplicate the 'b','c' to q0, q1 for SIMD instruction
-	vdup.s16   q1, d1[0]
-	vdup.s16   q0, d0[0]
+    //Duplicate the 'b','c' to q0, q1 for SIMD instruction
+    vdup.s16   q1, d1[0]
+    vdup.s16   q0, d0[0]
 
-	//Calculate the "(a + b * (j - 3) + c * (- 3) + 16) >> 5;"
-	vmla.s16   q2, q0, q3
-	vmla.s16   q2, q1, d6[0]
-	vqrshrun.s16 d0, q2, #5
+    //Calculate the "(a + b * (j - 3) + c * (- 3) + 16) >> 5;"
+    vmla.s16   q2, q0, q3
+    vmla.s16   q2, q1, d6[0]
+    vqrshrun.s16 d0, q2, #5
 
-	//Set a line of chroma MB
-	vst1.u32  {d0}, [r0], r1
+    //Set a line of chroma MB
+    vst1.u32  {d0}, [r0], r1
 
-	//Do the same processing for each line.
-	mov  r2, #7
+    //Do the same processing for each line.
+    mov  r2, #7
 loop_0_get_i_chroma_pred_plane:
-	vadd.s16   q2, q1
-	vqrshrun.s16 d0, q2, #5
-	vst1.u32  {d0}, [r0], r1
-	subs  r2, #1
-	bne  loop_0_get_i_chroma_pred_plane
+    vadd.s16   q2, q1
+    vqrshrun.s16 d0, q2, #5
+    vst1.u32  {d0}, [r0], r1
+    subs  r2, #1
+    bne  loop_0_get_i_chroma_pred_plane
 
 WELS_ASM_FUNC_END
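
The chroma plane predictor above follows the formulas in its comments: b = (17 * H + 16) >> 5, c = (17 * V + 16) >> 5, a = (left[7 * stride] + top[7]) << 4, and each sample is (a + b * (j - 3) + c * (i - 3) + 16) >> 5 clipped to [0, 255]. A scalar sketch (the helper name is illustrative; the pointer layout is assumed from the loads above):

#include <stdint.h>

static uint8_t Clip255 (int32_t v) { return (uint8_t) (v < 0 ? 0 : (v > 255 ? 255 : v)); }

static void IChromaPredPlane_c (uint8_t* pred, int32_t stride) {
    const uint8_t* top  = pred - stride;   /* top[-1] is the corner sample      */
    const uint8_t* left = pred - 1;        /* left[i*stride] walks the column   */
    int32_t H = 0, V = 0;
    for (int i = 1; i <= 4; ++i) {
        H += i * (top[3 + i]             - top[3 - i]);
        V += i * (left[(3 + i) * stride] - left[(3 - i) * stride]);
    }
    int32_t a = (left[7 * stride] + top[7]) << 4;
    int32_t b = (17 * H + 16) >> 5;
    int32_t c = (17 * V + 16) >> 5;
    for (int i = 0; i < 8; ++i)
        for (int j = 0; j < 8; ++j)
            pred[i * stride + j] = Clip255 ((a + b * (j - 3) + c * (i - 3) + 16) >> 5);
}

The 16x16 luma plane routine earlier in this file is the same pattern with (5 * H + 32) >> 6 and (5 * V + 32) >> 6 scaling, offsets of -7 instead of -3, and a 16-wide block.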
 
--- a/codec/decoder/core/x86/dct.asm
+++ b/codec/decoder/core/x86/dct.asm
@@ -54,7 +54,7 @@
 %endmacro
 
 %macro MMX_SumSub 3
-	movq    %3, %2
+    movq    %3, %2
     psubw   %2, %1
     paddw   %1, %3
 %endmacro
@@ -62,8 +62,8 @@
 %macro MMX_IDCT 6
     MMX_SumSub      %4, %5, %6
     MMX_SumSubDiv2  %3, %2, %1
-    MMX_SumSub		%1, %4, %6
-	MMX_SumSub		%3, %5, %6
+    MMX_SumSub      %1, %4, %6
+    MMX_SumSub      %3, %5, %6
 %endmacro
 
 
@@ -96,13 +96,13 @@
     movq    mm2, [r2+16]
     movq    mm3, [r2+24]
 
-	MMX_Trans4x4W        mm0, mm1, mm2, mm3, mm4
-	MMX_IDCT			mm1, mm2, mm3, mm4, mm0, mm6
+    MMX_Trans4x4W        mm0, mm1, mm2, mm3, mm4
+    MMX_IDCT            mm1, mm2, mm3, mm4, mm0, mm6
     MMX_Trans4x4W        mm1, mm3, mm0, mm4, mm2
-	MMX_IDCT			mm3, mm0, mm4, mm2, mm1, mm6
+    MMX_IDCT            mm3, mm0, mm4, mm2, mm1, mm6
 
-    WELS_Zero			mm7
-    WELS_DW32			mm6
+    WELS_Zero           mm7
+    WELS_DW32           mm6
 
     MMX_StoreDiff4P    mm3, mm0, mm6, mm7, [r0]
     MMX_StoreDiff4P    mm4, mm0, mm6, mm7, [r0+r1]
@@ -111,5 +111,5 @@
     MMX_StoreDiff4P    mm2, mm0, mm6, mm7, [r0+r1]
 
 
-	emms
+    emms
     ret
--- a/codec/decoder/core/x86/intra_pred.asm
+++ b/codec/decoder/core/x86/intra_pred.asm
@@ -36,10 +36,10 @@
 ;*
 ;*  History
 ;*      18/09/2009 Created
-;*		19/11/2010 Added
-;*					WelsDecoderI16x16LumaPredDcTop_sse2, WelsDecoderI16x16LumaPredDcNA_sse2,
-;*					WelsDecoderIChromaPredDcLeft_mmx, WelsDecoderIChromaPredDcTop_sse2
-;*					and WelsDecoderIChromaPredDcNA_mmx
+;*      19/11/2010 Added
+;*                  WelsDecoderI16x16LumaPredDcTop_sse2, WelsDecoderI16x16LumaPredDcNA_sse2,
+;*                  WelsDecoderIChromaPredDcLeft_mmx, WelsDecoderIChromaPredDcTop_sse2
+;*                  and WelsDecoderIChromaPredDcNA_mmx
 ;*
 ;*
 ;*************************************************************************/
@@ -50,11 +50,6 @@
 ;*******************************************************************************
 
 SECTION .rodata align=16
-%if 1
-	%define WELSEMMS	emms
-%else
-	%define WELSEMMS
-%endif
 
 align 16
 sse2_plane_inc_minus dw -7, -6, -5, -4, -3, -2, -1, 0
@@ -70,7 +65,7 @@
 sse2_plane_mul_b_c dw -3, -2, -1, 0, 1, 2, 3, 4
 
 align 16
-mmx_01bytes:		times 16	db 1
+mmx_01bytes:        times 16    db 1
 
 align 16
 mmx_0x02: dw 0x02, 0x00, 0x00, 0x00
@@ -86,86 +81,86 @@
 ;xmm0, xmm1, xmm2, eax, ecx
 ;lower 64 bits of xmm0 save the result
 %macro SSE2_PRED_H_4X4_TWO_LINE 5
-    movd		%1,	[%4-1]
-	movdqa		%3,	%1
-	punpcklbw	%1,	%3
-	movdqa		%3,	%1
-	punpcklbw	%1,	%3
+    movd        %1, [%4-1]
+    movdqa      %3, %1
+    punpcklbw   %1, %3
+    movdqa      %3, %1
+    punpcklbw   %1, %3
 
-	;add			%4,	%5
-	movd		%2,	[%4+%5-1]
-	movdqa		%3,	%2
-	punpcklbw	%2,	%3
-	movdqa		%3,	%2
-	punpcklbw	%2,	%3
-	punpckldq	%1,	%2
+    ;add            %4, %5
+    movd        %2, [%4+%5-1]
+    movdqa      %3, %2
+    punpcklbw   %2, %3
+    movdqa      %3, %2
+    punpcklbw   %2, %3
+    punpckldq   %1, %2
 %endmacro
 
 
-%macro	LOAD_COLUMN 6
-		movd	%1,	[%5]
-		movd	%2,	[%5+%6]
-		punpcklbw %1,	%2
-		lea		%5,	[%5+2*%6]
-		movd	%3,	[%5]
-		movd	%2,	[%5+%6]
-		punpcklbw %3,	%2
-		punpcklwd %1,	%3
-		lea		%5,	[%5+2*%6]
-		movd	%4,	[%5]
-		movd	%2,	[%5+%6]
-		punpcklbw %4,	%2
-		lea		%5,	[%5+2*%6]
-		movd	%3,	[%5]
-		movd	%2,	[%5+%6]
-		lea		%5,	[%5+2*%6]
-		punpcklbw %3,	%2
-		punpcklwd %4,	%3
-		punpckhdq %1,	%4
+%macro LOAD_COLUMN 6
+    movd    %1, [%5]
+    movd    %2, [%5+%6]
+    punpcklbw %1,   %2
+    lea     %5, [%5+2*%6]
+    movd    %3, [%5]
+    movd    %2, [%5+%6]
+    punpcklbw %3,   %2
+    punpcklwd %1,   %3
+    lea     %5, [%5+2*%6]
+    movd    %4, [%5]
+    movd    %2, [%5+%6]
+    punpcklbw %4,   %2
+    lea     %5, [%5+2*%6]
+    movd    %3, [%5]
+    movd    %2, [%5+%6]
+    lea     %5, [%5+2*%6]
+    punpcklbw %3,   %2
+    punpcklwd %4,   %3
+    punpckhdq %1,   %4
 %endmacro
 
-%macro  SUMW_HORIZON 3
-	movhlps		%2, %1			; x2 = xx xx xx xx d7 d6 d5 d4
-	paddw		%1, %2			; x1 = xx xx xx xx d37 d26 d15 d04
-	punpcklwd	%1, %3			; x1 =  d37  d26 d15 d04
-	movhlps		%2, %1			; x2 = xxxx xxxx d37 d26
-	paddd		%1, %2			; x1 = xxxx xxxx d1357 d0246
-	pshuflw		%2, %1, 0x4e	; x2 = xxxx xxxx d0246 d1357
-	paddd		%1, %2			; x1 = xxxx xxxx xxxx  d01234567
+%macro SUMW_HORIZON 3
+    movhlps     %2, %1          ; x2 = xx xx xx xx d7 d6 d5 d4
+    paddw       %1, %2          ; x1 = xx xx xx xx d37 d26 d15 d04
+    punpcklwd   %1, %3          ; x1 =  d37  d26 d15 d04
+    movhlps     %2, %1          ; x2 = xxxx xxxx d37 d26
+    paddd       %1, %2          ; x1 = xxxx xxxx d1357 d0246
+    pshuflw     %2, %1, 0x4e    ; x2 = xxxx xxxx d0246 d1357
+    paddd       %1, %2          ; x1 = xxxx xxxx xxxx  d01234567
 %endmacro
 
-%macro  COPY_16_TIMES 2
-		movdqa		%2,	[%1-16]
-		psrldq		%2,	15
-		pmuludq		%2,	[mmx_01bytes]
-		pshufd		%2,	%2, 0
+%macro COPY_16_TIMES 2
+    movdqa      %2, [%1-16]
+    psrldq      %2, 15
+    pmuludq     %2, [mmx_01bytes]
+    pshufd      %2, %2, 0
 %endmacro
 
-%macro  COPY_16_TIMESS 3
-		movdqa		%2,	[%1+%3-16]
-		psrldq		%2,	15
-		pmuludq		%2,	[mmx_01bytes]
-		pshufd		%2,	%2, 0
+%macro COPY_16_TIMESS 3
+    movdqa      %2, [%1+%3-16]
+    psrldq      %2, 15
+    pmuludq     %2, [mmx_01bytes]
+    pshufd      %2, %2, 0
 %endmacro
 
-%macro	LOAD_COLUMN_C 6
-		movd	%1,	[%5]
-		movd	%2,	[%5+%6]
-		punpcklbw %1,%2
-		lea		%5,	[%5+2*%6]
-		movd	%3,	[%5]
-		movd	%2,	[%5+%6]
-		punpcklbw %3,	%2
-		punpckhwd %1,	%3
-		lea		%5,	[%5+2*%6]
+%macro LOAD_COLUMN_C 6
+    movd    %1, [%5]
+    movd    %2, [%5+%6]
+    punpcklbw %1,%2
+    lea     %5, [%5+2*%6]
+    movd    %3, [%5]
+    movd    %2, [%5+%6]
+    punpcklbw %3,   %2
+    punpckhwd %1,   %3
+    lea     %5, [%5+2*%6]
 %endmacro
 
 %macro LOAD_2_LEFT_AND_ADD 0
-        lea         r0, [r0+2*r1]
-        movzx		r3, byte [r0-0x01]
-        add			r2, r3
-        movzx		r3, byte [r0+r1-0x01]
-        add			r2, r3
+    lea         r0, [r0+2*r1]
+    movzx       r3, byte [r0-0x01]
+    add         r2, r3
+    movzx       r3, byte [r0+r1-0x01]
+    add         r2, r3
 %endmacro
 
 ;*******************************************************************************
@@ -178,131 +173,131 @@
 ;*******************************************************************************
 ;   void WelsDecoderI4x4LumaPredH_sse2(uint8_t *pPred, const int32_t kiStride)
 ;
-;	pPred must align to 16
+;   pPred must align to 16
 ;*******************************************************************************
 WELS_EXTERN WelsDecoderI4x4LumaPredH_sse2
-	%assign push_num 0
-	LOAD_2_PARA
-	SIGN_EXTENSION r1, r1d
+    %assign push_num 0
+    LOAD_2_PARA
+    SIGN_EXTENSION r1, r1d
 
-	movzx		r2,	byte [r0-1]
-	movd		xmm0,	r2d
-	pmuludq		xmm0,	[mmx_01bytes]
+    movzx       r2, byte [r0-1]
+    movd        xmm0,   r2d
+    pmuludq     xmm0,   [mmx_01bytes]
 
-	movzx		r2,	byte [r0+r1-1]
-	movd		xmm1,	r2d
-	pmuludq		xmm1,	[mmx_01bytes]
+    movzx       r2, byte [r0+r1-1]
+    movd        xmm1,   r2d
+    pmuludq     xmm1,   [mmx_01bytes]
 
-	lea			r0,	[r0+r1]
-	movzx		r2,	byte [r0+r1-1]
-	movd		xmm2,	r2d
-	pmuludq		xmm2,	[mmx_01bytes]
+    lea         r0, [r0+r1]
+    movzx       r2, byte [r0+r1-1]
+    movd        xmm2,   r2d
+    pmuludq     xmm2,   [mmx_01bytes]
 
-	movzx		r2,	byte [r0+2*r1-1]
-	movd		xmm3,	r2d
-	pmuludq		xmm3,	[mmx_01bytes]
+    movzx       r2, byte [r0+2*r1-1]
+    movd        xmm3,   r2d
+    pmuludq     xmm3,   [mmx_01bytes]
 
-	sub         r0,    r1
-	movd        [r0], xmm0
-	movd        [r0+r1], xmm1
-	lea         r0, [r0+2*r1]
-	movd        [r0], xmm2
-	movd        [r0+r1], xmm3
+    sub         r0,    r1
+    movd        [r0], xmm0
+    movd        [r0+r1], xmm1
+    lea         r0, [r0+2*r1]
+    movd        [r0], xmm2
+    movd        [r0+r1], xmm3
 
-	ret
+    ret
 
 ;*******************************************************************************
 ; void WelsDecoderI16x16LumaPredPlane_sse2(uint8_t *pPred, const int32_t kiStride);
 ;*******************************************************************************
 WELS_EXTERN WelsDecoderI16x16LumaPredPlane_sse2
-		push r3
-		push r4
-		%assign push_num 2
-		LOAD_2_PARA
-		PUSH_XMM 8
-		SIGN_EXTENSION r1, r1d
-		mov r4, r0 ; save r0 in r4
-		sub		r0,	1
-		sub		r0,	r1
+    push r3
+    push r4
+    %assign push_num 2
+    LOAD_2_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r1, r1d
+    mov r4, r0 ; save r0 in r4
+    sub     r0, 1
+    sub     r0, r1
 
-		;for H
-		pxor	xmm7,	xmm7
-		movq	xmm0,	[r0]
-		movdqa	xmm5,	[sse2_plane_dec]
-		punpcklbw xmm0,	xmm7
-		pmullw	xmm0,	xmm5
-		movq	xmm1,	[r0 + 9]
-		movdqa	xmm6,	[sse2_plane_inc]
-		punpcklbw xmm1,	xmm7
-		pmullw	xmm1,	xmm6
-		psubw	xmm1,	xmm0
+    ;for H
+    pxor    xmm7,   xmm7
+    movq    xmm0,   [r0]
+    movdqa  xmm5,   [sse2_plane_dec]
+    punpcklbw xmm0, xmm7
+    pmullw  xmm0,   xmm5
+    movq    xmm1,   [r0 + 9]
+    movdqa  xmm6,   [sse2_plane_inc]
+    punpcklbw xmm1, xmm7
+    pmullw  xmm1,   xmm6
+    psubw   xmm1,   xmm0
 
-		SUMW_HORIZON	xmm1,xmm0,xmm2
-		movd    r2d,	xmm1		; H += (i + 1) * (top[8 + i] - top[6 - i]);
-		movsx	r2,	r2w
-		imul	r2,	5
-		add		r2,	32
-		sar		r2,	6			; b = (5 * H + 32) >> 6;
-		SSE2_Copy8Times	xmm1, r2d	; xmm1 = b,b,b,b,b,b,b,b
+    SUMW_HORIZON    xmm1,xmm0,xmm2
+    movd    r2d,    xmm1        ; H += (i + 1) * (top[8 + i] - top[6 - i]);
+    movsx   r2, r2w
+    imul    r2, 5
+    add     r2, 32
+    sar     r2, 6           ; b = (5 * H + 32) >> 6;
+    SSE2_Copy8Times xmm1, r2d   ; xmm1 = b,b,b,b,b,b,b,b
 
-		movzx	r3,	BYTE [r0+16]
-		sub	r0, 3
-		LOAD_COLUMN		xmm0, xmm2, xmm3, xmm4, r0, r1
+    movzx   r3, BYTE [r0+16]
+    sub r0, 3
+    LOAD_COLUMN     xmm0, xmm2, xmm3, xmm4, r0, r1
 
-		add		r0,	3
-		movzx	r2,	BYTE [r0+8*r1]
-		add		r3,	r2
-		shl		r3,	4			;	a = (left[15*kiStride] + top[15]) << 4;
+    add     r0, 3
+    movzx   r2, BYTE [r0+8*r1]
+    add     r3, r2
+    shl     r3, 4           ;   a = (left[15*kiStride] + top[15]) << 4;
 
-		sub	r0, 3
-		add		r0,	r1
-		LOAD_COLUMN		xmm7, xmm2, xmm3, xmm4, r0, r1
-		pxor	xmm4,	xmm4
-		punpckhbw xmm0,	xmm4
-		pmullw	xmm0,	xmm5
-		punpckhbw xmm7,	xmm4
-		pmullw	xmm7,	xmm6
-		psubw	xmm7,	xmm0
+    sub r0, 3
+    add     r0, r1
+    LOAD_COLUMN     xmm7, xmm2, xmm3, xmm4, r0, r1
+    pxor    xmm4,   xmm4
+    punpckhbw xmm0, xmm4
+    pmullw  xmm0,   xmm5
+    punpckhbw xmm7, xmm4
+    pmullw  xmm7,   xmm6
+    psubw   xmm7,   xmm0
 
-		SUMW_HORIZON   xmm7,xmm0,xmm2
-		movd    r2d,   xmm7			; V
-		movsx	r2,	r2w
+    SUMW_HORIZON   xmm7,xmm0,xmm2
+    movd    r2d,   xmm7         ; V
+    movsx   r2, r2w
 
-		imul	r2,	5
-		add		r2,	32
-		sar		r2,	6				; c = (5 * V + 32) >> 6;
-		SSE2_Copy8Times	xmm4, r2d		; xmm4 = c,c,c,c,c,c,c,c
+    imul    r2, 5
+    add     r2, 32
+    sar     r2, 6               ; c = (5 * V + 32) >> 6;
+    SSE2_Copy8Times xmm4, r2d       ; xmm4 = c,c,c,c,c,c,c,c
 
-		mov r0, r4
-		add		r3,	16
-		imul	r2,	-7
-		add		r3,	r2		; s = a + 16 + (-7)*c
-		SSE2_Copy8Times	xmm0, r3d		; xmm0 = s,s,s,s,s,s,s,s
+    mov r0, r4
+    add     r3, 16
+    imul    r2, -7
+    add     r3, r2      ; s = a + 16 + (-7)*c
+    SSE2_Copy8Times xmm0, r3d       ; xmm0 = s,s,s,s,s,s,s,s
 
-		xor		r2,	r2
-		movdqa	xmm5,	[sse2_plane_inc_minus]
+    xor     r2, r2
+    movdqa  xmm5,   [sse2_plane_inc_minus]
 
 get_i16x16_luma_pred_plane_sse2_1:
-		movdqa	xmm2,	xmm1
-		pmullw	xmm2,	xmm5
-		paddw	xmm2,	xmm0
-		psraw	xmm2,	5
-		movdqa	xmm3,	xmm1
-		pmullw	xmm3,	xmm6
-		paddw	xmm3,	xmm0
-		psraw	xmm3,	5
-		packuswb xmm2,	xmm3
-		movdqa	[r0],	xmm2
-		paddw	xmm0,	xmm4
-		add		r0,	r1
-		inc		r2
-		cmp		r2,	16
-		jnz get_i16x16_luma_pred_plane_sse2_1
+    movdqa  xmm2,   xmm1
+    pmullw  xmm2,   xmm5
+    paddw   xmm2,   xmm0
+    psraw   xmm2,   5
+    movdqa  xmm3,   xmm1
+    pmullw  xmm3,   xmm6
+    paddw   xmm3,   xmm0
+    psraw   xmm3,   5
+    packuswb xmm2,  xmm3
+    movdqa  [r0],   xmm2
+    paddw   xmm0,   xmm4
+    add     r0, r1
+    inc     r2
+    cmp     r2, 16
+    jnz get_i16x16_luma_pred_plane_sse2_1
 
-		POP_XMM
-		pop r4
-		pop r3
-		ret
+    POP_XMM
+    pop r4
+    pop r3
+    ret
 
 
 
@@ -311,31 +306,31 @@
 ;*******************************************************************************
 
 %macro SSE2_PRED_H_16X16_TWO_LINE_DEC 2
-    lea     %1,	[%1+%2*2]
+    lea     %1, [%1+%2*2]
 
-    COPY_16_TIMES %1,	xmm0
-    movdqa  [%1],	xmm0
-    COPY_16_TIMESS %1,	xmm0,	%2
-    movdqa  [%1+%2],	xmm0
+    COPY_16_TIMES %1,   xmm0
+    movdqa  [%1],   xmm0
+    COPY_16_TIMESS %1,  xmm0,   %2
+    movdqa  [%1+%2],    xmm0
 %endmacro
 
 WELS_EXTERN WelsDecoderI16x16LumaPredH_sse2
-	%assign push_num 0
-	LOAD_2_PARA
-	SIGN_EXTENSION r1, r1d
+    %assign push_num 0
+    LOAD_2_PARA
+    SIGN_EXTENSION r1, r1d
 
-    COPY_16_TIMES r0,	xmm0
-    movdqa  [r0],		xmm0
-    COPY_16_TIMESS r0,	xmm0,	r1
-    movdqa  [r0+r1],	xmm0
+    COPY_16_TIMES r0,   xmm0
+    movdqa  [r0],       xmm0
+    COPY_16_TIMESS r0,  xmm0,   r1
+    movdqa  [r0+r1],    xmm0
 
-	SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
-	SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
-	SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
-	SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
-	SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
-	SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
-	SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
+    SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
+    SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
+    SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
+    SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
+    SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
+    SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
+    SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1
 
     ret
 
@@ -343,9 +338,9 @@
 ; void WelsDecoderI16x16LumaPredV_sse2(uint8_t *pPred, const int32_t kiStride);
 ;*******************************************************************************
 WELS_EXTERN WelsDecoderI16x16LumaPredV_sse2
-	%assign push_num 0
-	LOAD_2_PARA
-	SIGN_EXTENSION r1, r1d
+    %assign push_num 0
+    LOAD_2_PARA
+    SIGN_EXTENSION r1, r1d
 
     sub     r0, r1
     movdqa  xmm0, [r0]
@@ -381,252 +376,252 @@
 ; void WelsDecoderIChromaPredPlane_sse2(uint8_t *pPred, const int32_t kiStride);
 ;*******************************************************************************
 WELS_EXTERN WelsDecoderIChromaPredPlane_sse2
-		push r3
-		push r4
-		%assign push_num 2
-		LOAD_2_PARA
-		PUSH_XMM 8
-		SIGN_EXTENSION r1, r1d
-		mov r4, r0
-		sub		r0,	1
-		sub		r0,	r1
+    push r3
+    push r4
+    %assign push_num 2
+    LOAD_2_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r1, r1d
+    mov r4, r0
+    sub     r0, 1
+    sub     r0, r1
 
-		pxor	mm7,	mm7
-		movq	mm0,	[r0]
-		movq	mm5,	[sse2_plane_dec_c]
-		punpcklbw mm0,	mm7
-		pmullw	mm0,	mm5
-		movq	mm1,	[r0 + 5]
-		movq	mm6,	[sse2_plane_inc_c]
-		punpcklbw mm1,	mm7
-		pmullw	mm1,	mm6
-		psubw	mm1,	mm0
+    pxor    mm7,    mm7
+    movq    mm0,    [r0]
+    movq    mm5,    [sse2_plane_dec_c]
+    punpcklbw mm0,  mm7
+    pmullw  mm0,    mm5
+    movq    mm1,    [r0 + 5]
+    movq    mm6,    [sse2_plane_inc_c]
+    punpcklbw mm1,  mm7
+    pmullw  mm1,    mm6
+    psubw   mm1,    mm0
 
-		movq2dq xmm1,   mm1
-		pxor    xmm2,   xmm2
-		SUMW_HORIZON	xmm1,xmm0,xmm2
-		movd    r2d,	xmm1
-		movsx	r2,	r2w
-		imul	r2,	17
-		add		r2,	16
-		sar		r2,	5			; b = (17 * H + 16) >> 5;
-		SSE2_Copy8Times	xmm1, r2d	; mm1 = b,b,b,b,b,b,b,b
+    movq2dq xmm1,   mm1
+    pxor    xmm2,   xmm2
+    SUMW_HORIZON    xmm1,xmm0,xmm2
+    movd    r2d,    xmm1
+    movsx   r2, r2w
+    imul    r2, 17
+    add     r2, 16
+    sar     r2, 5           ; b = (17 * H + 16) >> 5;
+    SSE2_Copy8Times xmm1, r2d   ; mm1 = b,b,b,b,b,b,b,b
 
-		movzx	r3,	BYTE [r0+8]
-		sub	r0, 3
-		LOAD_COLUMN_C	mm0, mm2, mm3, mm4, r0, r1
+    movzx   r3, BYTE [r0+8]
+    sub r0, 3
+    LOAD_COLUMN_C   mm0, mm2, mm3, mm4, r0, r1
 
-		add		r0,	3
-		movzx	r2,	BYTE [r0+4*r1]
-		add		r3,	r2
-		shl		r3,	4			; a = (left[7*kiStride] + top[7]) << 4;
+    add     r0, 3
+    movzx   r2, BYTE [r0+4*r1]
+    add     r3, r2
+    shl     r3, 4           ; a = (left[7*kiStride] + top[7]) << 4;
 
-		sub	r0, 3
-		add		r0,	r1
-		LOAD_COLUMN_C	mm7, mm2, mm3, mm4, r0, r1
-		pxor	mm4,	mm4
-		punpckhbw mm0,	mm4
-		pmullw	mm0,	mm5
-		punpckhbw mm7,	mm4
-		pmullw	mm7,	mm6
-		psubw	mm7,	mm0
+    sub r0, 3
+    add     r0, r1
+    LOAD_COLUMN_C   mm7, mm2, mm3, mm4, r0, r1
+    pxor    mm4,    mm4
+    punpckhbw mm0,  mm4
+    pmullw  mm0,    mm5
+    punpckhbw mm7,  mm4
+    pmullw  mm7,    mm6
+    psubw   mm7,    mm0
 
-		movq2dq xmm7,   mm7
-		pxor    xmm2,   xmm2
-		SUMW_HORIZON	xmm7,xmm0,xmm2
-		movd    r2d,    xmm7			; V
-		movsx	r2,	r2w
+    movq2dq xmm7,   mm7
+    pxor    xmm2,   xmm2
+    SUMW_HORIZON    xmm7,xmm0,xmm2
+    movd    r2d,    xmm7            ; V
+    movsx   r2, r2w
 
-		imul	r2,	17
-		add		r2,	16
-		sar		r2,	5				; c = (17 * V + 16) >> 5;
-		SSE2_Copy8Times	xmm4, r2d		; mm4 = c,c,c,c,c,c,c,c
+    imul    r2, 17
+    add     r2, 16
+    sar     r2, 5               ; c = (17 * V + 16) >> 5;
+    SSE2_Copy8Times xmm4, r2d       ; mm4 = c,c,c,c,c,c,c,c
 
-		mov 	r0, r4
-		add		r3,	16
-		imul	r2,	-3
-		add		r3,	r2				; s = a + 16 + (-3)*c
-		SSE2_Copy8Times	xmm0, r3d		; xmm0 = s,s,s,s,s,s,s,s
+    mov     r0, r4
+    add     r3, 16
+    imul    r2, -3
+    add     r3, r2              ; s = a + 16 + (-3)*c
+    SSE2_Copy8Times xmm0, r3d       ; xmm0 = s,s,s,s,s,s,s,s
 
-		xor		r2,	r2
-		movdqa	xmm5,	[sse2_plane_mul_b_c]
+    xor     r2, r2
+    movdqa  xmm5,   [sse2_plane_mul_b_c]
 
 get_i_chroma_pred_plane_sse2_1:
-		movdqa	xmm2,	xmm1
-		pmullw	xmm2,	xmm5
-		paddw	xmm2,	xmm0
-		psraw	xmm2,	5
-		packuswb xmm2,	xmm2
-		movq	[r0],	xmm2
-		paddw	xmm0,	xmm4
-		add		r0,	r1
-		inc		r2
-		cmp		r2,	8
-		jnz get_i_chroma_pred_plane_sse2_1
+    movdqa  xmm2,   xmm1
+    pmullw  xmm2,   xmm5
+    paddw   xmm2,   xmm0
+    psraw   xmm2,   5
+    packuswb xmm2,  xmm2
+    movq    [r0],   xmm2
+    paddw   xmm0,   xmm4
+    add     r0, r1
+    inc     r2
+    cmp     r2, 8
+    jnz get_i_chroma_pred_plane_sse2_1
 
-		POP_XMM
-		pop r4
-		pop r3
-		WELSEMMS
-		ret
+    POP_XMM
+    pop r4
+    pop r3
+    WELSEMMS
+    ret
 
 ;*******************************************************************************
-;	0 |1 |2 |3 |4 |
-;	6 |7 |8 |9 |10|
-;	11|12|13|14|15|
-;	16|17|18|19|20|
-;	21|22|23|24|25|
-;	7 is the start pixel of current 4x4 block
-;	pPred[7] = ([6]+[0]*2+[1]+2)/4
+;   0 |1 |2 |3 |4 |
+;   6 |7 |8 |9 |10|
+;   11|12|13|14|15|
+;   16|17|18|19|20|
+;   21|22|23|24|25|
+;   7 is the start pixel of current 4x4 block
+;   pPred[7] = ([6]+[0]*2+[1]+2)/4
 ;
 ;   void WelsDecoderI4x4LumaPredDDR_mmx(uint8_t *pPred, const int32_t kiStride)
 ;
 ;*******************************************************************************
 WELS_EXTERN WelsDecoderI4x4LumaPredDDR_mmx
-	%assign push_num 0
-	LOAD_2_PARA
-	SIGN_EXTENSION r1, r1d
-	mov r2, r0
+    %assign push_num 0
+    LOAD_2_PARA
+    SIGN_EXTENSION r1, r1d
+    mov r2, r0
 
-	movq        mm1,[r2+r1-8]		;get value of 11,decreasing 8 is trying to improve the performance of movq mm1[8] = 11
-	movq        mm2,[r2-8]			;get value of 6 mm2[8] = 6
-	sub		r2, r1			;mov eax to above line of current block(postion of 1)
-	punpckhbw   mm2,[r2-8]			;mm2[8](high 8th byte of mm2) = [0](value of 0), mm2[7]= [6]
-	movd        mm3,[r2]			;get value 1, mm3[1] = [1],mm3[2]=[2],mm3[3]=[3]
-	punpckhwd   mm1,mm2				;mm1[8]=[0],mm1[7]=[6],mm1[6]=[11]
-	psllq       mm3,18h				;mm3[5]=[1]
-	psrlq       mm1,28h				;mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
-	por         mm3,mm1				;mm3[6]=[3],mm3[5]=[2],mm3[4]=[1],mm3[3]=[0],mm3[2]=[6],mm3[1]=[11]
-	movq        mm1,mm3				;mm1[6]=[3],mm1[5]=[2],mm1[4]=[1],mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
-	lea 		r2,[r2+r1*2-8h]		;set eax point to 12
-	movq        mm4,[r2+r1]		;get value of 16, mm4[8]=[16]
-	psllq       mm3,8				;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=0
-	psrlq       mm4,38h				;mm4[1]=[16]
-	por         mm3,mm4				;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=[16]
-	movq        mm2,mm3				;mm2[7]=[3],mm2[6]=[2],mm2[5]=[1],mm2[4]=[0],mm2[3]=[6],mm2[2]=[11],mm2[1]=[16]
-	movq        mm4,[r2+r1*2]		;mm4[8]=[21]
-	psllq       mm3,8				;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=0
-	psrlq       mm4,38h				;mm4[1]=[21]
-	por         mm3,mm4				;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=[21]
-	movq        mm4,mm3				;mm4[8]=[3],mm4[7]=[2],mm4[6]=[1],mm4[5]=[0],mm4[4]=[6],mm4[3]=[11],mm4[2]=[16],mm4[1]=[21]
-	pavgb       mm3,mm1				;mm3=([11]+[21]+1)/2
-	pxor        mm1,mm4				;find odd value in the lowest bit of each byte
-	pand        mm1,[mmx_01bytes]	;set the odd bit
-	psubusb     mm3,mm1				;decrease 1 from odd bytes
-	pavgb       mm2,mm3				;mm2=(([11]+[21]+1)/2+1+[16])/2
+    movq        mm1,[r2+r1-8]       ;get value of 11,decreasing 8 is trying to improve the performance of movq mm1[8] = 11
+    movq        mm2,[r2-8]          ;get value of 6 mm2[8] = 6
+    sub     r2, r1          ;move r2 to the line above the current block (position of 1)
+    punpckhbw   mm2,[r2-8]          ;mm2[8](high 8th byte of mm2) = [0](value of 0), mm2[7]= [6]
+    movd        mm3,[r2]            ;get value 1, mm3[1] = [1],mm3[2]=[2],mm3[3]=[3]
+    punpckhwd   mm1,mm2             ;mm1[8]=[0],mm1[7]=[6],mm1[6]=[11]
+    psllq       mm3,18h             ;mm3[5]=[1]
+    psrlq       mm1,28h             ;mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
+    por         mm3,mm1             ;mm3[6]=[3],mm3[5]=[2],mm3[4]=[1],mm3[3]=[0],mm3[2]=[6],mm3[1]=[11]
+    movq        mm1,mm3             ;mm1[6]=[3],mm1[5]=[2],mm1[4]=[1],mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
+    lea         r2,[r2+r1*2-8h]     ;set r2 to point to 12
+    movq        mm4,[r2+r1]     ;get value of 16, mm4[8]=[16]
+    psllq       mm3,8               ;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=0
+    psrlq       mm4,38h             ;mm4[1]=[16]
+    por         mm3,mm4             ;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=[16]
+    movq        mm2,mm3             ;mm2[7]=[3],mm2[6]=[2],mm2[5]=[1],mm2[4]=[0],mm2[3]=[6],mm2[2]=[11],mm2[1]=[16]
+    movq        mm4,[r2+r1*2]       ;mm4[8]=[21]
+    psllq       mm3,8               ;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=0
+    psrlq       mm4,38h             ;mm4[1]=[21]
+    por         mm3,mm4             ;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=[21]
+    movq        mm4,mm3             ;mm4[8]=[3],mm4[7]=[2],mm4[6]=[1],mm4[5]=[0],mm4[4]=[6],mm4[3]=[11],mm4[2]=[16],mm4[1]=[21]
+    pavgb       mm3,mm1             ;mm3=([11]+[21]+1)/2
+    pxor        mm1,mm4             ;find odd value in the lowest bit of each byte
+    pand        mm1,[mmx_01bytes]   ;set the odd bit
+    psubusb     mm3,mm1             ;decrease 1 from odd bytes
+    pavgb       mm2,mm3             ;mm2=(([11]+[21]+1)/2+1+[16])/2
 
-	lea         r0,[r0+r1]
-	movd        [r0+2*r1],mm2
-	sub         r0,r1
-	psrlq       mm2,8
-	movd        [r0+2*r1],mm2
-	psrlq       mm2,8
-	movd        [r0+r1],mm2
-	psrlq       mm2,8
-	movd        [r0],mm2
-	WELSEMMS
-	ret
+    lea         r0,[r0+r1]
+    movd        [r0+2*r1],mm2
+    sub         r0,r1
+    psrlq       mm2,8
+    movd        [r0+2*r1],mm2
+    psrlq       mm2,8
+    movd        [r0+r1],mm2
+    psrlq       mm2,8
+    movd        [r0],mm2
+    WELSEMMS
+    ret
 
 
 ;*******************************************************************************
-;	void WelsDecoderIChromaPredH_mmx(uint8_t *pPred, const int32_t kiStride)
+;   void WelsDecoderIChromaPredH_mmx(uint8_t *pPred, const int32_t kiStride)
 ;   copy 8 pixel of 8 line from left
 ;*******************************************************************************
 %macro MMX_PRED_H_8X8_ONE_LINE 4
-	movq		%1,		[%3-8]
-	psrlq		%1,		38h
+    movq        %1,     [%3-8]
+    psrlq       %1,     38h
 
-	pmullw		%1,		[mmx_01bytes]
-	pshufw		%1,		%1,	0
-	movq		[%4],	%1
+    pmullw      %1,     [mmx_01bytes]
+    pshufw      %1,     %1, 0
+    movq        [%4],   %1
 %endmacro
 
 %macro MMX_PRED_H_8X8_ONE_LINEE 4
-	movq		%1,		[%3+r1-8]
-	psrlq		%1,		38h
+    movq        %1,     [%3+r1-8]
+    psrlq       %1,     38h
 
-	pmullw		%1,		[mmx_01bytes]
-	pshufw		%1,		%1,	0
-	movq		[%4],	%1
+    pmullw      %1,     [mmx_01bytes]
+    pshufw      %1,     %1, 0
+    movq        [%4],   %1
 %endmacro
 
 WELS_EXTERN WelsDecoderIChromaPredH_mmx
-	%assign push_num 0
-	LOAD_2_PARA
-	SIGN_EXTENSION r1, r1d
-	mov r2, r0
+    %assign push_num 0
+    LOAD_2_PARA
+    SIGN_EXTENSION r1, r1d
+    mov r2, r0
 
-	movq		mm0,	[r2-8]
-	psrlq		mm0,	38h
+    movq        mm0,    [r2-8]
+    psrlq       mm0,    38h
 
-	pmullw		mm0,		[mmx_01bytes]
-	pshufw		mm0,	mm0,	0
-	movq		[r0],	mm0
+    pmullw      mm0,        [mmx_01bytes]
+    pshufw      mm0,    mm0,    0
+    movq        [r0],   mm0
 
-	MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1
+    MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1
 
-	lea			r2, [r2+r1*2]
-	MMX_PRED_H_8X8_ONE_LINE	mm0, mm1, r2, r0+2*r1
+    lea         r2, [r2+r1*2]
+    MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r2, r0+2*r1
 
-	lea         r0, [r0+2*r1]
-	MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1
+    lea         r0, [r0+2*r1]
+    MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1
 
-	lea			r2, [r2+r1*2]
-	MMX_PRED_H_8X8_ONE_LINE	mm0, mm1, r2, r0+2*r1
+    lea         r2, [r2+r1*2]
+    MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r2, r0+2*r1
 
-	lea         r0, [r0+2*r1]
-	MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1
+    lea         r0, [r0+2*r1]
+    MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1
 
-	lea			r2, [r2+r1*2]
-	MMX_PRED_H_8X8_ONE_LINE	mm0, mm1, r2, r0+2*r1
+    lea         r2, [r2+r1*2]
+    MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r2, r0+2*r1
 
-    	lea         r0, [r0+2*r1]
-	MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1
+    lea         r0, [r0+2*r1]
+    MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1
 
-	WELSEMMS
-	ret
+    WELSEMMS
+    ret
 
 
 ;*******************************************************************************
-;	void WelsDecoderIChromaPredV_mmx(uint8_t *pPred, const int32_t kiStride)
+;   void WelsDecoderIChromaPredV_mmx(uint8_t *pPred, const int32_t kiStride)
 ;   copy 8 pixels from top 8 pixels
 ;*******************************************************************************
 WELS_EXTERN WelsDecoderIChromaPredV_mmx
-	%assign push_num 0
-	LOAD_2_PARA
-	SIGN_EXTENSION r1, r1d
+    %assign push_num 0
+    LOAD_2_PARA
+    SIGN_EXTENSION r1, r1d
 
-	sub			r0,		r1
-	movq		mm0,		[r0]
+    sub         r0,     r1
+    movq        mm0,        [r0]
 
-	movq		[r0+r1],		mm0
-	movq		[r0+2*r1],	mm0
-	lea         r0, [r0+2*r1]
-	movq		[r0+r1],      mm0
-	movq		[r0+2*r1],    mm0
-	lea         r0, [r0+2*r1]
-	movq		[r0+r1],      mm0
-	movq		[r0+2*r1],    mm0
-	lea         r0, [r0+2*r1]
-	movq		[r0+r1],      mm0
-	movq		[r0+2*r1],    mm0
+    movq        [r0+r1],        mm0
+    movq        [r0+2*r1],  mm0
+    lea         r0, [r0+2*r1]
+    movq        [r0+r1],      mm0
+    movq        [r0+2*r1],    mm0
+    lea         r0, [r0+2*r1]
+    movq        [r0+r1],      mm0
+    movq        [r0+2*r1],    mm0
+    lea         r0, [r0+2*r1]
+    movq        [r0+r1],      mm0
+    movq        [r0+2*r1],    mm0
 
-	WELSEMMS
-	ret
+    WELSEMMS
+    ret
 
 
 ;*******************************************************************************
-;	lt|t0|t1|t2|t3|
-;	l0|
-;	l1|
-;	l2|
-;	l3|
-;	t3 will never been used
+;   lt|t0|t1|t2|t3|
+;   l0|
+;   l1|
+;   l2|
+;   l3|
+;   t3 will never be used
 ;   destination:
-;	|a |b |c |d |
-;	|e |f |a |b |
-;	|g |h |e |f |
-;	|i |j |g |h |
+;   |a |b |c |d |
+;   |e |f |a |b |
+;   |g |h |e |f |
+;   |i |j |g |h |
 
 ;   a = (1 + lt + l0)>>1
 ;   e = (1 + l0 + l1)>>1
@@ -645,73 +640,73 @@
 ;   void WelsDecoderI4x4LumaPredHD_mmx(uint8_t *pPred, const int32_t kiStride)
 ;*******************************************************************************
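Only a and e are restated in the comment above; the remaining letters follow the standard H.264 horizontal-down pattern (2-tap averages down the left edge, 3-tap filters elsewhere). A minimal scalar sketch is given below; the helper name and argument layout are illustrative assumptions, not the project's C fallback.

#include <stdint.h>

/* l[] = {l0,l1,l2,l3}, t[] = {t0,t1,t2}; lt is the corner pixel. */
static void I4x4LumaPredHD_ref_c (uint8_t* pPred, int32_t kiStride,
                                  uint8_t lt, const uint8_t l[4], const uint8_t t[3]) {
    const uint8_t a = (lt   + l[0] + 1) >> 1;
    const uint8_t b = (l[0] + 2 * lt   + t[0] + 2) >> 2;
    const uint8_t c = (lt   + 2 * t[0] + t[1] + 2) >> 2;
    const uint8_t d = (t[0] + 2 * t[1] + t[2] + 2) >> 2;
    const uint8_t e = (l[0] + l[1] + 1) >> 1;
    const uint8_t f = (lt   + 2 * l[0] + l[1] + 2) >> 2;
    const uint8_t g = (l[1] + l[2] + 1) >> 1;
    const uint8_t h = (l[0] + 2 * l[1] + l[2] + 2) >> 2;
    const uint8_t i = (l[2] + l[3] + 1) >> 1;
    const uint8_t j = (l[1] + 2 * l[2] + l[3] + 2) >> 2;
    const uint8_t rows[4][4] = { {a, b, c, d}, {e, f, a, b}, {g, h, e, f}, {i, j, g, h} };
    for (int y = 0; y < 4; ++y)
        for (int x = 0; x < 4; ++x)
            pPred[y * kiStride + x] = rows[y][x];
}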
 WELS_EXTERN WelsDecoderI4x4LumaPredHD_mmx
-	%assign push_num 0
-	LOAD_2_PARA
-	SIGN_EXTENSION r1, r1d
-	mov r2, r0
-	sub         r2, r1
-	movd        mm0, [r2-1]            ; mm0 = [xx xx xx xx t2 t1 t0 lt]
-	psllq       mm0, 20h                ; mm0 = [t2 t1 t0 lt xx xx xx xx]
+    %assign push_num 0
+    LOAD_2_PARA
+    SIGN_EXTENSION r1, r1d
+    mov r2, r0
+    sub         r2, r1
+    movd        mm0, [r2-1]            ; mm0 = [xx xx xx xx t2 t1 t0 lt]
+    psllq       mm0, 20h                ; mm0 = [t2 t1 t0 lt xx xx xx xx]
 
-	movd        mm1, [r2+2*r1-4]
-	punpcklbw   mm1, [r2+r1-4]        ; mm1[7] = l0, mm1[6] = l1
-	lea         r2, [r2+2*r1]
-	movd        mm2, [r2+2*r1-4]
-	punpcklbw   mm2, [r2+r1-4]        ; mm2[7] = l2, mm2[6] = l3
-	punpckhwd   mm2, mm1                ; mm2 = [l0 l1 l2 l3 xx xx xx xx]
-	psrlq       mm2, 20h
-	pxor        mm0, mm2                ; mm0 = [t2 t1 t0 lt l0 l1 l2 l3]
+    movd        mm1, [r2+2*r1-4]
+    punpcklbw   mm1, [r2+r1-4]        ; mm1[7] = l0, mm1[6] = l1
+    lea         r2, [r2+2*r1]
+    movd        mm2, [r2+2*r1-4]
+    punpcklbw   mm2, [r2+r1-4]        ; mm2[7] = l2, mm2[6] = l3
+    punpckhwd   mm2, mm1                ; mm2 = [l0 l1 l2 l3 xx xx xx xx]
+    psrlq       mm2, 20h
+    pxor        mm0, mm2                ; mm0 = [t2 t1 t0 lt l0 l1 l2 l3]
 
-	movq        mm1, mm0
-	psrlq       mm1, 10h                ; mm1 = [xx xx t2 t1 t0 lt l0 l1]
-	movq        mm2, mm0
-	psrlq       mm2, 8h                 ; mm2 = [xx t2 t1 t0 lt l0 l1 l2]
-	movq        mm3, mm2
-	movq        mm4, mm1
-	pavgb       mm1, mm0
+    movq        mm1, mm0
+    psrlq       mm1, 10h                ; mm1 = [xx xx t2 t1 t0 lt l0 l1]
+    movq        mm2, mm0
+    psrlq       mm2, 8h                 ; mm2 = [xx t2 t1 t0 lt l0 l1 l2]
+    movq        mm3, mm2
+    movq        mm4, mm1
+    pavgb       mm1, mm0
 
-	pxor        mm4, mm0				; find odd value in the lowest bit of each byte
-	pand        mm4, [mmx_01bytes]	    ; set the odd bit
-	psubusb     mm1, mm4				; decrease 1 from odd bytes
+    pxor        mm4, mm0                ; find odd value in the lowest bit of each byte
+    pand        mm4, [mmx_01bytes]      ; set the odd bit
+    psubusb     mm1, mm4                ; decrease 1 from odd bytes
 
-	pavgb       mm2, mm1                ; mm2 = [xx xx d  c  b  f  h  j]
+    pavgb       mm2, mm1                ; mm2 = [xx xx d  c  b  f  h  j]
 
-	movq        mm4, mm0
-	pavgb       mm3, mm4                ; mm3 = [xx xx xx xx a  e  g  i]
-	punpcklbw   mm3, mm2                ; mm3 = [b  a  f  e  h  g  j  i]
+    movq        mm4, mm0
+    pavgb       mm3, mm4                ; mm3 = [xx xx xx xx a  e  g  i]
+    punpcklbw   mm3, mm2                ; mm3 = [b  a  f  e  h  g  j  i]
 
-	psrlq       mm2, 20h
-	psllq       mm2, 30h                ; mm2 = [d  c  0  0  0  0  0  0]
-	movq        mm4, mm3
-	psrlq       mm4, 10h                ; mm4 = [0  0  b  a  f  e  h  j]
-	pxor        mm2, mm4                ; mm2 = [d  c  b  a  xx xx xx xx]
-	psrlq       mm2, 20h                ; mm2 = [xx xx xx xx  d  c  b  a]
+    psrlq       mm2, 20h
+    psllq       mm2, 30h                ; mm2 = [d  c  0  0  0  0  0  0]
+    movq        mm4, mm3
+    psrlq       mm4, 10h                ; mm4 = [0  0  b  a  f  e  h  j]
+    pxor        mm2, mm4                ; mm2 = [d  c  b  a  xx xx xx xx]
+    psrlq       mm2, 20h                ; mm2 = [xx xx xx xx  d  c  b  a]
 
-	movd        [r0], mm2
-	lea         r0, [r0+r1]
-	movd        [r0+2*r1], mm3
-	sub         r0, r1
-	psrlq       mm3, 10h
-	movd        [r0+2*r1], mm3
-	psrlq       mm3, 10h
-	movd        [r0+r1], mm3
-	WELSEMMS
-	ret
+    movd        [r0], mm2
+    lea         r0, [r0+r1]
+    movd        [r0+2*r1], mm3
+    sub         r0, r1
+    psrlq       mm3, 10h
+    movd        [r0+2*r1], mm3
+    psrlq       mm3, 10h
+    movd        [r0+r1], mm3
+    WELSEMMS
+    ret
 
 
 
 ;*******************************************************************************
-;	lt|t0|t1|t2|t3|
-;	l0|
-;	l1|
-;	l2|
-;	l3|
-;	t3 will never been used
+;   lt|t0|t1|t2|t3|
+;   l0|
+;   l1|
+;   l2|
+;   l3|
+;   t3 will never be used
 ;   destination:
-;	|a |b |c |d |
-;	|c |d |e |f |
-;	|e |f |g |g |
-;	|g |g |g |g |
+;   |a |b |c |d |
+;   |c |d |e |f |
+;   |e |f |g |g |
+;   |g |g |g |g |
 
 ;   a = (1 + l0 + l1)>>1
 ;   c = (1 + l1 + l2)>>1
@@ -727,74 +722,74 @@
 ;   void WelsDecoderI4x4LumaPredHU_mmx(uint8_t *pPred, const int32_t kiStride)
 ;*******************************************************************************
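The horizontal-up mode only reads the left column; a and c above are 2-tap averages, b, d and f are the matching 3-tap filters, and g simply repeats l3. A scalar sketch under those standard definitions follows (the helper name and parameter layout are illustrative):

#include <stdint.h>

static void I4x4LumaPredHU_ref_c (uint8_t* pPred, int32_t kiStride, const uint8_t l[4]) {
    const uint8_t a = (l[0] + l[1] + 1) >> 1;
    const uint8_t b = (l[0] + 2 * l[1] + l[2] + 2) >> 2;
    const uint8_t c = (l[1] + l[2] + 1) >> 1;
    const uint8_t d = (l[1] + 2 * l[2] + l[3] + 2) >> 2;
    const uint8_t e = (l[2] + l[3] + 1) >> 1;
    const uint8_t f = (l[2] + 3 * l[3] + 2) >> 2;
    const uint8_t g = l[3];
    const uint8_t rows[4][4] = { {a, b, c, d}, {c, d, e, f}, {e, f, g, g}, {g, g, g, g} };
    for (int y = 0; y < 4; ++y)
        for (int x = 0; x < 4; ++x)
            pPred[y * kiStride + x] = rows[y][x];
}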
 WELS_EXTERN WelsDecoderI4x4LumaPredHU_mmx
-	%assign push_num 0
-	LOAD_2_PARA
-	SIGN_EXTENSION r1, r1d
-	mov r2, r0
+    %assign push_num 0
+    LOAD_2_PARA
+    SIGN_EXTENSION r1, r1d
+    mov r2, r0
 
-	movd        mm0, [r2-4]            ; mm0[3] = l0
-	punpcklbw   mm0, [r2+r1-4]        ; mm0[7] = l1, mm0[6] = l0
-	lea         r2, [r2+2*r1]
-	movd        mm2, [r2-4]            ; mm2[3] = l2
-	movd        mm4, [r2+r1-4]        ; mm4[3] = l3
-	punpcklbw   mm2, mm4
-	punpckhwd   mm0, mm2                ; mm0 = [l3 l2 l1 l0 xx xx xx xx]
+    movd        mm0, [r2-4]            ; mm0[3] = l0
+    punpcklbw   mm0, [r2+r1-4]        ; mm0[7] = l1, mm0[6] = l0
+    lea         r2, [r2+2*r1]
+    movd        mm2, [r2-4]            ; mm2[3] = l2
+    movd        mm4, [r2+r1-4]        ; mm4[3] = l3
+    punpcklbw   mm2, mm4
+    punpckhwd   mm0, mm2                ; mm0 = [l3 l2 l1 l0 xx xx xx xx]
 
-	psrlq       mm4, 18h
-	psllq       mm4, 38h                ; mm4 = [l3 xx xx xx xx xx xx xx]
-	psrlq       mm0, 8h
-	pxor        mm0, mm4                ; mm0 = [l3 l3 l2 l1 l0 xx xx xx]
+    psrlq       mm4, 18h
+    psllq       mm4, 38h                ; mm4 = [l3 xx xx xx xx xx xx xx]
+    psrlq       mm0, 8h
+    pxor        mm0, mm4                ; mm0 = [l3 l3 l2 l1 l0 xx xx xx]
 
-	movq        mm1, mm0
-	psllq       mm1, 8h                 ; mm1 = [l3 l2 l1 l0 xx xx xx xx]
-	movq        mm3, mm1                ; mm3 = [l3 l2 l1 l0 xx xx xx xx]
-	pavgb       mm1, mm0                ; mm1 = [g  e  c  a  xx xx xx xx]
+    movq        mm1, mm0
+    psllq       mm1, 8h                 ; mm1 = [l3 l2 l1 l0 xx xx xx xx]
+    movq        mm3, mm1                ; mm3 = [l3 l2 l1 l0 xx xx xx xx]
+    pavgb       mm1, mm0                ; mm1 = [g  e  c  a  xx xx xx xx]
 
-	movq        mm2, mm0
-	psllq       mm2, 10h                ; mm2 = [l2 l1 l0 xx xx xx xx xx]
-	movq        mm5, mm2
-	pavgb       mm2, mm0
+    movq        mm2, mm0
+    psllq       mm2, 10h                ; mm2 = [l2 l1 l0 xx xx xx xx xx]
+    movq        mm5, mm2
+    pavgb       mm2, mm0
 
-	pxor        mm5, mm0				; find odd value in the lowest bit of each byte
-	pand        mm5, [mmx_01bytes]	    ; set the odd bit
-	psubusb     mm2, mm5				; decrease 1 from odd bytes
+    pxor        mm5, mm0                ; find odd value in the lowest bit of each byte
+    pand        mm5, [mmx_01bytes]      ; set the odd bit
+    psubusb     mm2, mm5                ; decrease 1 from odd bytes
 
-	pavgb       mm2, mm3                ; mm2 = [f  d  b  xx xx xx xx xx]
+    pavgb       mm2, mm3                ; mm2 = [f  d  b  xx xx xx xx xx]
 
-	psrlq       mm2, 8h
-	pxor        mm2, mm4                ; mm2 = [g  f  d  b  xx xx xx xx]
+    psrlq       mm2, 8h
+    pxor        mm2, mm4                ; mm2 = [g  f  d  b  xx xx xx xx]
 
-	punpckhbw   mm1, mm2                ; mm1 = [g  g  f  e  d  c  b  a]
-	punpckhbw   mm4, mm4                ; mm4 = [g  g  xx xx xx xx xx xx]
-	punpckhbw   mm4, mm4                ; mm4 = [g  g  g  g  xx xx xx xx]
+    punpckhbw   mm1, mm2                ; mm1 = [g  g  f  e  d  c  b  a]
+    punpckhbw   mm4, mm4                ; mm4 = [g  g  xx xx xx xx xx xx]
+    punpckhbw   mm4, mm4                ; mm4 = [g  g  g  g  xx xx xx xx]
 
-	psrlq       mm4, 20h
-	lea         r0, [r0+r1]
-	movd        [r0+2*r1], mm4
+    psrlq       mm4, 20h
+    lea         r0, [r0+r1]
+    movd        [r0+2*r1], mm4
 
-	sub         r0, r1
-	movd        [r0], mm1
-	psrlq       mm1, 10h
-	movd        [r0+r1], mm1
-	psrlq       mm1, 10h
-	movd        [r0+2*r1], mm1
-	WELSEMMS
-	ret
+    sub         r0, r1
+    movd        [r0], mm1
+    psrlq       mm1, 10h
+    movd        [r0+r1], mm1
+    psrlq       mm1, 10h
+    movd        [r0+2*r1], mm1
+    WELSEMMS
+    ret
 
 
 
 ;*******************************************************************************
-;	lt|t0|t1|t2|t3|
-;	l0|
-;	l1|
-;	l2|
-;	l3|
-;	l3 will never been used
+;   lt|t0|t1|t2|t3|
+;   l0|
+;   l1|
+;   l2|
+;   l3|
+;   l3 will never be used
 ;   destination:
-;	|a |b |c |d |
-;	|e |f |g |h |
-;	|i |a |b |c |
-;	|j |e |f |g |
+;   |a |b |c |d |
+;   |e |f |g |h |
+;   |i |a |b |c |
+;   |j |e |f |g |
 
 ;   a = (1 + lt + t0)>>1
 ;   b = (1 + t0 + t1)>>1
@@ -812,77 +807,77 @@
 ;   void WelsDecoderI4x4LumaPredVR_mmx(uint8_t *pPred, const int32_t kiStride)
 ;*******************************************************************************
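The vertical-right letters split into 2-tap averages along the top row (a..d), 3-tap filters one row down (e..h), and two left-edge values (i, j). The scalar sketch below spells out the standard formulas; it is illustrative only, not the project's C fallback, and the helper name and argument layout are assumptions.

#include <stdint.h>

/* l[] = {l0,l1,l2}, t[] = {t0,t1,t2,t3}; lt is the corner pixel. */
static void I4x4LumaPredVR_ref_c (uint8_t* pPred, int32_t kiStride,
                                  uint8_t lt, const uint8_t l[3], const uint8_t t[4]) {
    const uint8_t a = (lt   + t[0] + 1) >> 1;
    const uint8_t b = (t[0] + t[1] + 1) >> 1;
    const uint8_t c = (t[1] + t[2] + 1) >> 1;
    const uint8_t d = (t[2] + t[3] + 1) >> 1;
    const uint8_t e = (l[0] + 2 * lt   + t[0] + 2) >> 2;
    const uint8_t f = (lt   + 2 * t[0] + t[1] + 2) >> 2;
    const uint8_t g = (t[0] + 2 * t[1] + t[2] + 2) >> 2;
    const uint8_t h = (t[1] + 2 * t[2] + t[3] + 2) >> 2;
    const uint8_t i = (lt   + 2 * l[0] + l[1] + 2) >> 2;
    const uint8_t j = (l[0] + 2 * l[1] + l[2] + 2) >> 2;
    const uint8_t rows[4][4] = { {a, b, c, d}, {e, f, g, h}, {i, a, b, c}, {j, e, f, g} };
    for (int y = 0; y < 4; ++y)
        for (int x = 0; x < 4; ++x)
            pPred[y * kiStride + x] = rows[y][x];
}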
 WELS_EXTERN WelsDecoderI4x4LumaPredVR_mmx
-	%assign push_num 0
-	LOAD_2_PARA
-	SIGN_EXTENSION r1, r1d
-	mov r2, r0
-	sub         r2, r1
-	movq        mm0, [r2-1]            ; mm0 = [xx xx xx t3 t2 t1 t0 lt]
-	psllq       mm0, 18h                ; mm0 = [t3 t2 t1 t0 lt xx xx xx]
+    %assign push_num 0
+    LOAD_2_PARA
+    SIGN_EXTENSION r1, r1d
+    mov r2, r0
+    sub         r2, r1
+    movq        mm0, [r2-1]            ; mm0 = [xx xx xx t3 t2 t1 t0 lt]
+    psllq       mm0, 18h                ; mm0 = [t3 t2 t1 t0 lt xx xx xx]
 
-	movd        mm1, [r2+2*r1-4]
-	punpcklbw   mm1, [r2+r1-4]        ; mm1[7] = l0, mm1[6] = l1
-	lea         r2, [r2+2*r1]
-	movq        mm2, [r2+r1-8]        ; mm2[7] = l2
-	punpckhwd   mm2, mm1                ; mm2 = [l0 l1 l2 xx xx xx xx xx]
-	psrlq       mm2, 28h
-	pxor        mm0, mm2                ; mm0 = [t3 t2 t1 t0 lt l0 l1 l2]
+    movd        mm1, [r2+2*r1-4]
+    punpcklbw   mm1, [r2+r1-4]        ; mm1[7] = l0, mm1[6] = l1
+    lea         r2, [r2+2*r1]
+    movq        mm2, [r2+r1-8]        ; mm2[7] = l2
+    punpckhwd   mm2, mm1                ; mm2 = [l0 l1 l2 xx xx xx xx xx]
+    psrlq       mm2, 28h
+    pxor        mm0, mm2                ; mm0 = [t3 t2 t1 t0 lt l0 l1 l2]
 
-	movq        mm1, mm0
-	psllq       mm1, 8h                 ; mm1 = [t2 t1 t0 lt l0 l1 l2 xx]
-	pavgb       mm1, mm0                ; mm1 = [d  c  b  a  xx xx xx xx]
+    movq        mm1, mm0
+    psllq       mm1, 8h                 ; mm1 = [t2 t1 t0 lt l0 l1 l2 xx]
+    pavgb       mm1, mm0                ; mm1 = [d  c  b  a  xx xx xx xx]
 
-	movq        mm2, mm0
-	psllq       mm2, 10h                ; mm2 = [t1 t0 lt l0 l1 l2 xx xx]
-	movq        mm3, mm2
-	pavgb       mm2, mm0
+    movq        mm2, mm0
+    psllq       mm2, 10h                ; mm2 = [t1 t0 lt l0 l1 l2 xx xx]
+    movq        mm3, mm2
+    pavgb       mm2, mm0
 
-	pxor        mm3, mm0				; find odd value in the lowest bit of each byte
-	pand        mm3, [mmx_01bytes]	    ; set the odd bit
-	psubusb     mm2, mm3				; decrease 1 from odd bytes
+    pxor        mm3, mm0                ; find odd value in the lowest bit of each byte
+    pand        mm3, [mmx_01bytes]      ; set the odd bit
+    psubusb     mm2, mm3                ; decrease 1 from odd bytes
 
-	movq        mm3, mm0
-	psllq       mm3, 8h                 ; mm3 = [t2 t1 t0 lt l0 l1 l2 xx]
-	pavgb       mm3, mm2                ; mm3 = [h  g  f  e  i  j  xx xx]
-	movq        mm2, mm3
+    movq        mm3, mm0
+    psllq       mm3, 8h                 ; mm3 = [t2 t1 t0 lt l0 l1 l2 xx]
+    pavgb       mm3, mm2                ; mm3 = [h  g  f  e  i  j  xx xx]
+    movq        mm2, mm3
 
-	psrlq       mm1, 20h                ; mm1 = [xx xx xx xx d  c  b  a]
-	movd        [r0], mm1
+    psrlq       mm1, 20h                ; mm1 = [xx xx xx xx d  c  b  a]
+    movd        [r0], mm1
 
-	psrlq       mm2, 20h                ; mm2 = [xx xx xx xx h  g  f  e]
-	movd        [r0+r1], mm2
+    psrlq       mm2, 20h                ; mm2 = [xx xx xx xx h  g  f  e]
+    movd        [r0+r1], mm2
 
-	movq        mm4, mm3
-	psllq       mm4, 20h
-	psrlq       mm4, 38h                ; mm4 = [xx xx xx xx xx xx xx i]
+    movq        mm4, mm3
+    psllq       mm4, 20h
+    psrlq       mm4, 38h                ; mm4 = [xx xx xx xx xx xx xx i]
 
-	movq        mm5, mm3
-	psllq       mm5, 28h
-	psrlq       mm5, 38h                ; mm5 = [xx xx xx xx xx xx xx j]
+    movq        mm5, mm3
+    psllq       mm5, 28h
+    psrlq       mm5, 38h                ; mm5 = [xx xx xx xx xx xx xx j]
 
-	psllq       mm1, 8h
-	pxor        mm4, mm1                ; mm4 = [xx xx xx xx c  b  a  i]
-	movd        [r0+2*r1], mm4
+    psllq       mm1, 8h
+    pxor        mm4, mm1                ; mm4 = [xx xx xx xx c  b  a  i]
+    movd        [r0+2*r1], mm4
 
-	psllq       mm2, 8h
-	pxor        mm5, mm2                ; mm5 = [xx xx xx xx g  f  e  j]
-	lea         r0, [r0+2*r1]
-	movd        [r0+r1], mm5
-	WELSEMMS
-	ret
+    psllq       mm2, 8h
+    pxor        mm5, mm2                ; mm5 = [xx xx xx xx g  f  e  j]
+    lea         r0, [r0+2*r1]
+    movd        [r0+r1], mm5
+    WELSEMMS
+    ret
 
 ;*******************************************************************************
-;	lt|t0|t1|t2|t3|t4|t5|t6|t7
-;	l0|
-;	l1|
-;	l2|
-;	l3|
-;	lt,t0,t1,t2,t3 will never been used
+;   lt|t0|t1|t2|t3|t4|t5|t6|t7
+;   l0|
+;   l1|
+;   l2|
+;   l3|
+;   lt, l0, l1, l2, l3 will never be used
 ;   destination:
-;	|a |b |c |d |
-;	|b |c |d |e |
-;	|c |d |e |f |
-;	|d |e |f |g |
+;   |a |b |c |d |
+;   |b |c |d |e |
+;   |c |d |e |f |
+;   |d |e |f |g |
 
 ;   a = (2 + t0 + t2 + (t1<<1))>>2
 ;   b = (2 + t1 + t3 + (t2<<1))>>2
@@ -898,56 +893,56 @@
 ;   void WelsDecoderI4x4LumaPredDDL_mmx(uint8_t *pPred, const int32_t kiStride)
 ;*******************************************************************************
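Every output of the diagonal-down-left mode is the same 3-tap filter over consecutive top pixels, with t7 duplicated at the right edge; that is what the psllq/psrlq/pavgb sequence below builds. A scalar sketch (illustrative helper name, t[] = {t0..t7}):

#include <stdint.h>

static void I4x4LumaPredDDL_ref_c (uint8_t* pPred, int32_t kiStride, const uint8_t t[8]) {
    uint8_t d[7];
    for (int k = 0; k < 7; ++k) {
        const uint8_t next2 = (k + 2 < 8) ? t[k + 2] : t[7];   /* repeat t7 past the edge */
        d[k] = (uint8_t) ((t[k] + 2 * t[k + 1] + next2 + 2) >> 2);
    }
    for (int y = 0; y < 4; ++y)
        for (int x = 0; x < 4; ++x)
            pPred[y * kiStride + x] = d[x + y];
}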
 WELS_EXTERN WelsDecoderI4x4LumaPredDDL_mmx
-	%assign push_num 0
-	LOAD_2_PARA
-	SIGN_EXTENSION r1, r1d
-	mov r2, r0
-	sub         r2, r1
-	movq        mm0, [r2]              ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
-	movq        mm1, mm0
-	movq        mm2, mm0
+    %assign push_num 0
+    LOAD_2_PARA
+    SIGN_EXTENSION r1, r1d
+    mov r2, r0
+    sub         r2, r1
+    movq        mm0, [r2]              ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
+    movq        mm1, mm0
+    movq        mm2, mm0
 
-	movq        mm3, mm0
-	psrlq       mm3, 38h
-	psllq       mm3, 38h                ; mm3 = [t7 xx xx xx xx xx xx xx]
+    movq        mm3, mm0
+    psrlq       mm3, 38h
+    psllq       mm3, 38h                ; mm3 = [t7 xx xx xx xx xx xx xx]
 
-	psllq       mm1, 8h                 ; mm1 = [t6 t5 t4 t3 t2 t1 t0 xx]
-	psrlq       mm2, 8h
-	pxor        mm2, mm3                ; mm2 = [t7 t7 t6 t5 t4 t3 t2 t1]
+    psllq       mm1, 8h                 ; mm1 = [t6 t5 t4 t3 t2 t1 t0 xx]
+    psrlq       mm2, 8h
+    pxor        mm2, mm3                ; mm2 = [t7 t7 t6 t5 t4 t3 t2 t1]
 
-	movq        mm3, mm1
-	pavgb       mm1, mm2
-	pxor        mm3, mm2				; find odd value in the lowest bit of each byte
-	pand        mm3, [mmx_01bytes]	    ; set the odd bit
-	psubusb     mm1, mm3				; decrease 1 from odd bytes
+    movq        mm3, mm1
+    pavgb       mm1, mm2
+    pxor        mm3, mm2                ; find odd value in the lowest bit of each byte
+    pand        mm3, [mmx_01bytes]      ; set the odd bit
+    psubusb     mm1, mm3                ; decrease 1 from odd bytes
 
-	pavgb       mm0, mm1                ; mm0 = [g f e d c b a xx]
+    pavgb       mm0, mm1                ; mm0 = [g f e d c b a xx]
 
-	psrlq       mm0, 8h
-	movd        [r0], mm0
-	psrlq       mm0, 8h
-	movd        [r0+r1], mm0
-	psrlq       mm0, 8h
-	movd        [r0+2*r1], mm0
-	psrlq       mm0, 8h
-	lea         r0, [r0+2*r1]
-	movd        [r0+r1], mm0
-	WELSEMMS
-	ret
+    psrlq       mm0, 8h
+    movd        [r0], mm0
+    psrlq       mm0, 8h
+    movd        [r0+r1], mm0
+    psrlq       mm0, 8h
+    movd        [r0+2*r1], mm0
+    psrlq       mm0, 8h
+    lea         r0, [r0+2*r1]
+    movd        [r0+r1], mm0
+    WELSEMMS
+    ret
 
 
 ;*******************************************************************************
-;	lt|t0|t1|t2|t3|t4|t5|t6|t7
-;	l0|
-;	l1|
-;	l2|
-;	l3|
-;	lt,t0,t1,t2,t3 will never been used
+;   lt|t0|t1|t2|t3|t4|t5|t6|t7
+;   l0|
+;   l1|
+;   l2|
+;   l3|
+;   lt, l0, l1, l2, l3 will never be used
 ;   destination:
-;	|a |b |c |d |
-;	|e |f |g |h |
-;	|b |c |d |i |
-;	|f |g |h |j |
+;   |a |b |c |d |
+;   |e |f |g |h |
+;   |b |c |d |i |
+;   |f |g |h |j |
 
 ;   a = (1 + t0 + t1)>>1
 ;   b = (1 + t1 + t2)>>1
@@ -966,40 +961,40 @@
 ;   void WelsDecoderI4x4LumaPredVL_mmx(uint8_t *pPred, const int32_t kiStride)
 ;*******************************************************************************
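Vertical-left alternates 2-tap averages on even rows with 3-tap filters on odd rows, each row starting half a pixel further right; only t0..t6 are read. A scalar sketch with an illustrative helper name:

#include <stdint.h>

static void I4x4LumaPredVL_ref_c (uint8_t* pPred, int32_t kiStride, const uint8_t t[7]) {
    for (int y = 0; y < 4; ++y)
        for (int x = 0; x < 4; ++x) {
            const int k = x + (y >> 1);                           /* start index into the top row */
            pPred[y * kiStride + x] = (uint8_t) ((y & 1)
                ? (t[k] + 2 * t[k + 1] + t[k + 2] + 2) >> 2       /* odd rows: 3-tap filter  */
                : (t[k] + t[k + 1] + 1) >> 1);                    /* even rows: 2-tap average */
        }
}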
 WELS_EXTERN WelsDecoderI4x4LumaPredVL_mmx
-	%assign push_num 0
-	LOAD_2_PARA
-	SIGN_EXTENSION r1, r1d
-	mov r2, r0
+    %assign push_num 0
+    LOAD_2_PARA
+    SIGN_EXTENSION r1, r1d
+    mov r2, r0
 
-	sub         r2, r1
-	movq        mm0, [r2]              ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
-	movq        mm1, mm0
-	movq        mm2, mm0
+    sub         r2, r1
+    movq        mm0, [r2]              ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
+    movq        mm1, mm0
+    movq        mm2, mm0
 
-	psrlq       mm1, 8h                 ; mm1 = [xx t7 t6 t5 t4 t3 t2 t1]
-	psrlq       mm2, 10h                ; mm2 = [xx xx t7 t6 t5 t4 t3 t2]
+    psrlq       mm1, 8h                 ; mm1 = [xx t7 t6 t5 t4 t3 t2 t1]
+    psrlq       mm2, 10h                ; mm2 = [xx xx t7 t6 t5 t4 t3 t2]
 
-	movq        mm3, mm1
-	pavgb       mm3, mm0                ; mm3 = [xx xx xx i  d  c  b  a]
+    movq        mm3, mm1
+    pavgb       mm3, mm0                ; mm3 = [xx xx xx i  d  c  b  a]
 
-	movq        mm4, mm2
-	pavgb       mm2, mm0
-	pxor        mm4, mm0				; find odd value in the lowest bit of each byte
-	pand        mm4, [mmx_01bytes]	    ; set the odd bit
-	psubusb     mm2, mm4				; decrease 1 from odd bytes
+    movq        mm4, mm2
+    pavgb       mm2, mm0
+    pxor        mm4, mm0                ; find odd value in the lowest bit of each byte
+    pand        mm4, [mmx_01bytes]      ; set the odd bit
+    psubusb     mm2, mm4                ; decrease 1 from odd bytes
 
-	pavgb       mm2, mm1                ; mm2 = [xx xx xx j  h  g  f  e]
+    pavgb       mm2, mm1                ; mm2 = [xx xx xx j  h  g  f  e]
 
-	movd        [r0], mm3
-	psrlq       mm3, 8h
-	movd        [r0+2*r1], mm3
+    movd        [r0], mm3
+    psrlq       mm3, 8h
+    movd        [r0+2*r1], mm3
 
-	movd        [r0+r1], mm2
-	psrlq       mm2, 8h
-	lea         r0, [r0+2*r1]
-	movd        [r0+r1], mm2
-	WELSEMMS
-	ret
+    movd        [r0+r1], mm2
+    psrlq       mm2, 8h
+    lea         r0, [r0+2*r1]
+    movd        [r0+r1], mm2
+    WELSEMMS
+    ret
 
 ;*******************************************************************************
 ;
@@ -1006,93 +1001,93 @@
 ;   void WelsDecoderIChromaPredDc_sse2(uint8_t *pPred, const int32_t kiStride)
 ;*******************************************************************************
 WELS_EXTERN WelsDecoderIChromaPredDc_sse2
-	push 	r3
-	push 	r4
-	%assign push_num 2
-	LOAD_2_PARA
-	SIGN_EXTENSION r1, r1d
-	mov r4, r0
+    push    r3
+    push    r4
+    %assign push_num 2
+    LOAD_2_PARA
+    SIGN_EXTENSION r1, r1d
+    mov r4, r0
 
-	sub         r0, r1
-	movq        mm0, [r0]
+    sub         r0, r1
+    movq        mm0, [r0]
 
-	movzx		r2, byte [r0+r1-0x01] ; l1
-	lea         r0, [r0+2*r1]
-	movzx		r3, byte [r0-0x01]     ; l2
-	add			r2, r3
-	movzx		r3, byte [r0+r1-0x01] ; l3
-	add			r2, r3
-	lea         r0, [r0+2*r1]
-	movzx		r3, byte [r0-0x01]     ; l4
-	add			r2, r3
-	movd        mm1, r2d                 ; mm1 = l1+l2+l3+l4
+    movzx       r2, byte [r0+r1-0x01] ; l1
+    lea         r0, [r0+2*r1]
+    movzx       r3, byte [r0-0x01]     ; l2
+    add         r2, r3
+    movzx       r3, byte [r0+r1-0x01] ; l3
+    add         r2, r3
+    lea         r0, [r0+2*r1]
+    movzx       r3, byte [r0-0x01]     ; l4
+    add         r2, r3
+    movd        mm1, r2d                 ; mm1 = l1+l2+l3+l4
 
-	movzx		r2, byte [r0+r1-0x01] ; l5
-	lea         r0, [r0+2*r1]
-	movzx		r3, byte [r0-0x01]     ; l6
-	add			r2, r3
-	movzx		r3, byte [r0+r1-0x01] ; l7
-	add			r2, r3
-	lea         r0, [r0+2*r1]
-	movzx		r3, byte [r0-0x01]     ; l8
-	add			r2, r3
-	movd        mm2, r2d                 ; mm2 = l5+l6+l7+l8
+    movzx       r2, byte [r0+r1-0x01] ; l5
+    lea         r0, [r0+2*r1]
+    movzx       r3, byte [r0-0x01]     ; l6
+    add         r2, r3
+    movzx       r3, byte [r0+r1-0x01] ; l7
+    add         r2, r3
+    lea         r0, [r0+2*r1]
+    movzx       r3, byte [r0-0x01]     ; l8
+    add         r2, r3
+    movd        mm2, r2d                 ; mm2 = l5+l6+l7+l8
 
-	movq        mm3, mm0
-	psrlq       mm0, 0x20
-	psllq       mm3, 0x20
-	psrlq       mm3, 0x20
-	pxor		mm4, mm4
-	psadbw		mm0, mm4
-	psadbw		mm3, mm4                 ; sum1 = mm3+mm1, sum2 = mm0, sum3 = mm2
+    movq        mm3, mm0
+    psrlq       mm0, 0x20
+    psllq       mm3, 0x20
+    psrlq       mm3, 0x20
+    pxor        mm4, mm4
+    psadbw      mm0, mm4
+    psadbw      mm3, mm4                 ; sum1 = mm3+mm1, sum2 = mm0, sum3 = mm2
 
-	paddq       mm3, mm1
-	movq        mm1, mm2
-	paddq       mm1, mm0;                ; sum1 = mm3, sum2 = mm0, sum3 = mm2, sum4 = mm1
+    paddq       mm3, mm1
+    movq        mm1, mm2
+    paddq       mm1, mm0;                ; sum1 = mm3, sum2 = mm0, sum3 = mm2, sum4 = mm1
 
-	movq        mm4, [mmx_0x02]
+    movq        mm4, [mmx_0x02]
 
-	paddq       mm0, mm4
-	psrlq       mm0, 0x02
+    paddq       mm0, mm4
+    psrlq       mm0, 0x02
 
-	paddq       mm2, mm4
-	psrlq       mm2, 0x02
+    paddq       mm2, mm4
+    psrlq       mm2, 0x02
 
-	paddq       mm3, mm4
-	paddq       mm3, mm4
-	psrlq       mm3, 0x03
+    paddq       mm3, mm4
+    paddq       mm3, mm4
+    psrlq       mm3, 0x03
 
-	paddq       mm1, mm4
-	paddq       mm1, mm4
-	psrlq       mm1, 0x03
+    paddq       mm1, mm4
+    paddq       mm1, mm4
+    psrlq       mm1, 0x03
 
-	pmuludq     mm0, [mmx_01bytes]
-	pmuludq     mm3, [mmx_01bytes]
-	psllq       mm0, 0x20
-	pxor        mm0, mm3                 ; mm0 = m_up
+    pmuludq     mm0, [mmx_01bytes]
+    pmuludq     mm3, [mmx_01bytes]
+    psllq       mm0, 0x20
+    pxor        mm0, mm3                 ; mm0 = m_up
 
-	pmuludq     mm2, [mmx_01bytes]
-	pmuludq     mm1, [mmx_01bytes]
-	psllq       mm1, 0x20
-	pxor        mm1, mm2                 ; mm2 = m_down
+    pmuludq     mm2, [mmx_01bytes]
+    pmuludq     mm1, [mmx_01bytes]
+    psllq       mm1, 0x20
+    pxor        mm1, mm2                 ; mm2 = m_down
 
-	movq        [r4],       mm0
-	movq        [r4+r1],   mm0
-	movq        [r4+2*r1], mm0
-	lea         r4, [r4+2*r1]
-	movq        [r4+r1],   mm0
+    movq        [r4],       mm0
+    movq        [r4+r1],   mm0
+    movq        [r4+2*r1], mm0
+    lea         r4, [r4+2*r1]
+    movq        [r4+r1],   mm0
 
-	movq        [r4+2*r1], mm1
-	lea         r4, [r4+2*r1]
-	movq        [r4+r1],   mm1
-	movq        [r4+2*r1], mm1
-	lea         r4, [r4+2*r1]
-	movq        [r4+r1],   mm1
+    movq        [r4+2*r1], mm1
+    lea         r4, [r4+2*r1]
+    movq        [r4+r1],   mm1
+    movq        [r4+2*r1], mm1
+    lea         r4, [r4+2*r1]
+    movq        [r4+r1],   mm1
 
-	pop r4
-	pop r3
-	WELSEMMS
-	ret
+    pop r4
+    pop r3
+    WELSEMMS
+    ret
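In scalar terms the routine above computes four per-quadrant DC values: the top-left and bottom-right 4x4 sub-blocks average eight neighbours each, the other two only four. A rough C equivalent, with an illustrative helper name and argument layout (not the project's fallback):

#include <stdint.h>

static void IChromaPredDc_ref_c (uint8_t* pPred, int32_t kiStride,
                                 const uint8_t top[8], const uint8_t left[8]) {
    int st0 = 0, st1 = 0, sl0 = 0, sl1 = 0;
    for (int i = 0; i < 4; ++i) {
        st0 += top[i];  st1 += top[4 + i];
        sl0 += left[i]; sl1 += left[4 + i];
    }
    const uint8_t dc[2][2] = {
        { (uint8_t) ((st0 + sl0 + 4) >> 3), (uint8_t) ((st1 + 2) >> 2) },        /* upper half */
        { (uint8_t) ((sl1 + 2) >> 2),       (uint8_t) ((st1 + sl1 + 4) >> 3) },  /* lower half */
    };
    for (int y = 0; y < 8; ++y)
        for (int x = 0; x < 8; ++x)
            pPred[y * kiStride + x] = dc[y >> 2][x >> 2];
}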
 
 
 
@@ -1101,75 +1096,75 @@
 ;   void WelsDecoderI16x16LumaPredDc_sse2(uint8_t *pPred, const int32_t kiStride)
 ;*******************************************************************************
 WELS_EXTERN WelsDecoderI16x16LumaPredDc_sse2
-	push 	r3
-	push 	r4
-	%assign push_num 2
-	LOAD_2_PARA
-	SIGN_EXTENSION r1, r1d
-	mov r4, r0
-	sub         r0, r1
-	movdqa      xmm0, [r0]             ; read one row
-	pxor		xmm1, xmm1
-	psadbw		xmm0, xmm1
-	movdqa      xmm1, xmm0
-	psrldq      xmm1, 0x08
-	pslldq      xmm0, 0x08
-	psrldq      xmm0, 0x08
-	paddw       xmm0, xmm1
+    push    r3
+    push    r4
+    %assign push_num 2
+    LOAD_2_PARA
+    SIGN_EXTENSION r1, r1d
+    mov r4, r0
+    sub         r0, r1
+    movdqa      xmm0, [r0]             ; read one row
+    pxor        xmm1, xmm1
+    psadbw      xmm0, xmm1
+    movdqa      xmm1, xmm0
+    psrldq      xmm1, 0x08
+    pslldq      xmm0, 0x08
+    psrldq      xmm0, 0x08
+    paddw       xmm0, xmm1
 
-	movzx		r2, byte [r0+r1-0x01]
-	movzx		r3, byte [r0+2*r1-0x01]
-	add		r2, r3
-	lea    		r0, [r0+r1]
-	LOAD_2_LEFT_AND_ADD
-	LOAD_2_LEFT_AND_ADD
-	LOAD_2_LEFT_AND_ADD
-	LOAD_2_LEFT_AND_ADD
-	LOAD_2_LEFT_AND_ADD
-	LOAD_2_LEFT_AND_ADD
-	LOAD_2_LEFT_AND_ADD
-	add         r2, 0x10
-	movd        xmm1, r2d
-	paddw       xmm0, xmm1
-	psrld       xmm0, 0x05
-	pmuludq     xmm0, [mmx_01bytes]
-	pshufd      xmm0, xmm0, 0
+    movzx       r2, byte [r0+r1-0x01]
+    movzx       r3, byte [r0+2*r1-0x01]
+    add     r2, r3
+    lea         r0, [r0+r1]
+    LOAD_2_LEFT_AND_ADD
+    LOAD_2_LEFT_AND_ADD
+    LOAD_2_LEFT_AND_ADD
+    LOAD_2_LEFT_AND_ADD
+    LOAD_2_LEFT_AND_ADD
+    LOAD_2_LEFT_AND_ADD
+    LOAD_2_LEFT_AND_ADD
+    add         r2, 0x10
+    movd        xmm1, r2d
+    paddw       xmm0, xmm1
+    psrld       xmm0, 0x05
+    pmuludq     xmm0, [mmx_01bytes]
+    pshufd      xmm0, xmm0, 0
 
-	movdqa      [r4],       xmm0
-	movdqa      [r4+r1],   xmm0
-	movdqa      [r4+2*r1], xmm0
-	lea         r4,         [r4+2*r1]
+    movdqa      [r4],       xmm0
+    movdqa      [r4+r1],   xmm0
+    movdqa      [r4+2*r1], xmm0
+    lea         r4,         [r4+2*r1]
 
-	movdqa      [r4+r1],   xmm0
-	movdqa      [r4+2*r1], xmm0
-	lea         r4,         [r4+2*r1]
+    movdqa      [r4+r1],   xmm0
+    movdqa      [r4+2*r1], xmm0
+    lea         r4,         [r4+2*r1]
 
-	movdqa      [r4+r1],   xmm0
-	movdqa      [r4+2*r1], xmm0
-	lea         r4,         [r4+2*r1]
+    movdqa      [r4+r1],   xmm0
+    movdqa      [r4+2*r1], xmm0
+    lea         r4,         [r4+2*r1]
 
-	movdqa      [r4+r1],   xmm0
-	movdqa      [r4+2*r1], xmm0
-	lea         r4,         [r4+2*r1]
+    movdqa      [r4+r1],   xmm0
+    movdqa      [r4+2*r1], xmm0
+    lea         r4,         [r4+2*r1]
 
-	movdqa      [r4+r1],   xmm0
-	movdqa      [r4+2*r1], xmm0
-	lea         r4,         [r4+2*r1]
+    movdqa      [r4+r1],   xmm0
+    movdqa      [r4+2*r1], xmm0
+    lea         r4,         [r4+2*r1]
 
-	movdqa      [r4+r1],   xmm0
-	movdqa      [r4+2*r1], xmm0
-	lea         r4,         [r4+2*r1]
+    movdqa      [r4+r1],   xmm0
+    movdqa      [r4+2*r1], xmm0
+    lea         r4,         [r4+2*r1]
 
-	movdqa      [r4+r1],   xmm0
-	movdqa      [r4+2*r1], xmm0
-	lea         r4,         [r4+2*r1]
+    movdqa      [r4+r1],   xmm0
+    movdqa      [r4+2*r1], xmm0
+    lea         r4,         [r4+2*r1]
 
-	movdqa      [r4+r1],   xmm0
+    movdqa      [r4+r1],   xmm0
 
-	pop r4
-	pop r3
+    pop r4
+    pop r3
 
-	ret
+    ret
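The psadbw over the top row plus the LOAD_2_LEFT_AND_ADD chain above simply accumulate the 32 neighbouring pixels; adding 16 and shifting by 5 gives the DC value that fills the block. A scalar sketch (names are illustrative):

#include <stdint.h>

static void I16x16LumaPredDc_ref_c (uint8_t* pPred, int32_t kiStride,
                                    const uint8_t top[16], const uint8_t left[16]) {
    int sum = 16;                        /* rounding term, the "add r2, 0x10" above */
    for (int i = 0; i < 16; ++i) sum += top[i] + left[i];
    const uint8_t dc = (uint8_t) (sum >> 5);
    for (int y = 0; y < 16; ++y)
        for (int x = 0; x < 16; ++x)
            pPred[y * kiStride + x] = dc;
}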
 
 ;*******************************************************************************
 ; for intra prediction as follows, 11/19/2010
@@ -1176,239 +1171,239 @@
 ;*******************************************************************************
 
 ;*******************************************************************************
-;	void WelsDecoderI16x16LumaPredDcTop_sse2(uint8_t *pPred, const int32_t kiStride)
+;   void WelsDecoderI16x16LumaPredDcTop_sse2(uint8_t *pPred, const int32_t kiStride)
 ;*******************************************************************************
 WELS_EXTERN WelsDecoderI16x16LumaPredDcTop_sse2
-	%assign push_num 0
-	LOAD_2_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION r1, r1d
-	mov r2, r0
-	sub r2, r1
-	movdqa xmm0, [r2]		; pPred-kiStride, top line
-	pxor xmm7, xmm7
-	psadbw xmm0, xmm7
-	movdqa xmm1, xmm0
-	psrldq xmm1, 8
-	paddw  xmm0, xmm1
-	xor r2, r2
-	movd r2d, xmm0
-	;movdqa xmm1, xmm0
-	;punpcklbw xmm0, xmm7
-	;punpckhbw xmm1, xmm7
+    %assign push_num 0
+    LOAD_2_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r1, r1d
+    mov r2, r0
+    sub r2, r1
+    movdqa xmm0, [r2]       ; pPred-kiStride, top line
+    pxor xmm7, xmm7
+    psadbw xmm0, xmm7
+    movdqa xmm1, xmm0
+    psrldq xmm1, 8
+    paddw  xmm0, xmm1
+    xor r2, r2
+    movd r2d, xmm0
+    ;movdqa xmm1, xmm0
+    ;punpcklbw xmm0, xmm7
+    ;punpckhbw xmm1, xmm7
 
-	;paddw xmm0, xmm1			; (ub.max(ff) << 4) will not excceed of uw, so can perform it in unit of unsigned word scope
-	;pshufd xmm1, xmm0, 04eh		; 01001110, w3w2w1w0,w7w6w5w4
-	;paddw xmm0, xmm1			; w3+7 w2+6 w1+5 w0+4 w3+7 w2+6 w1+5 w0+4
-	;pshufd xmm1, xmm0, 0b1h		; 10110001, w1+5 w0+4 w3+7 w2+6 w1+5 w0+4 w3+7 w2+6
-	;paddw xmm0, xmm1			; w_o w_e w_o w_e w_o w_e w_o w_e (w_o=1+3+5+7, w_e=0+2+4+6)
-	;pshuflw xmm1, xmm0, 0b1h	; 10110001
-	;paddw xmm0, xmm1			; sum in word unit (x8)
-	;xor r3, r3
-	;movd r3d, xmm0
-	;and edx, 0ffffh
+    ;paddw xmm0, xmm1           ; (ub.max(ff) << 4) will not exceed uw, so the sum can be kept in unsigned words
+    ;pshufd xmm1, xmm0, 04eh        ; 01001110, w3w2w1w0,w7w6w5w4
+    ;paddw xmm0, xmm1           ; w3+7 w2+6 w1+5 w0+4 w3+7 w2+6 w1+5 w0+4
+    ;pshufd xmm1, xmm0, 0b1h        ; 10110001, w1+5 w0+4 w3+7 w2+6 w1+5 w0+4 w3+7 w2+6
+    ;paddw xmm0, xmm1           ; w_o w_e w_o w_e w_o w_e w_o w_e (w_o=1+3+5+7, w_e=0+2+4+6)
+    ;pshuflw xmm1, xmm0, 0b1h   ; 10110001
+    ;paddw xmm0, xmm1           ; sum in word unit (x8)
+    ;xor r3, r3
+    ;movd r3d, xmm0
+    ;and edx, 0ffffh
 
-	add r2, 8
-	sar r2, 4
-	SSE2_Copy16Times xmm1, r2d
-	;mov dh, dl
-	;mov r2, edx
-	;shl r2, 010h
-	;or edx, r2
-	;movd xmm1, edx
-	;pshufd xmm0, xmm1, 00h
-	;movdqa xmm1, xmm0
-	movdqa xmm0, xmm1
-	lea r2, [2*r1+r1]		; 3*kiStride
+    add r2, 8
+    sar r2, 4
+    SSE2_Copy16Times xmm1, r2d
+    ;mov dh, dl
+    ;mov r2, edx
+    ;shl r2, 010h
+    ;or edx, r2
+    ;movd xmm1, edx
+    ;pshufd xmm0, xmm1, 00h
+    ;movdqa xmm1, xmm0
+    movdqa xmm0, xmm1
+    lea r2, [2*r1+r1]       ; 3*kiStride
 
-	movdqa [r0], xmm0
-	movdqa [r0+r1], xmm1
-	movdqa [r0+2*r1], xmm0
-	movdqa [r0+r2], xmm1
+    movdqa [r0], xmm0
+    movdqa [r0+r1], xmm1
+    movdqa [r0+2*r1], xmm0
+    movdqa [r0+r2], xmm1
 
-	lea r0, [r0+4*r1]
-	movdqa [r0], xmm0
-	movdqa [r0+r1], xmm1
-	movdqa [r0+2*r1], xmm0
-	movdqa [r0+r2], xmm1
+    lea r0, [r0+4*r1]
+    movdqa [r0], xmm0
+    movdqa [r0+r1], xmm1
+    movdqa [r0+2*r1], xmm0
+    movdqa [r0+r2], xmm1
 
-	lea r0, [r0+4*r1]
-	movdqa [r0], xmm0
-	movdqa [r0+r1], xmm1
-	movdqa [r0+2*r1], xmm0
-	movdqa [r0+r2], xmm1
+    lea r0, [r0+4*r1]
+    movdqa [r0], xmm0
+    movdqa [r0+r1], xmm1
+    movdqa [r0+2*r1], xmm0
+    movdqa [r0+r2], xmm1
 
-	lea r0, [r0+4*r1]
-	movdqa [r0], xmm0
-	movdqa [r0+r1], xmm1
-	movdqa [r0+2*r1], xmm0
-	movdqa [r0+r2], xmm1
+    lea r0, [r0+4*r1]
+    movdqa [r0], xmm0
+    movdqa [r0+r1], xmm1
+    movdqa [r0+2*r1], xmm0
+    movdqa [r0+r2], xmm1
 
-	POP_XMM
-	ret
+    POP_XMM
+    ret
 
 ;*******************************************************************************
-;	void WelsDecoderI16x16LumaPredDcNA_sse2(uint8_t *pPred, const int32_t kiStride)
+;   void WelsDecoderI16x16LumaPredDcNA_sse2(uint8_t *pPred, const int32_t kiStride)
 ;*******************************************************************************
 WELS_EXTERN WelsDecoderI16x16LumaPredDcNA_sse2
-	%assign push_num 0
-	LOAD_2_PARA
-	SIGN_EXTENSION r1, r1d
-	lea r2, [2*r1+r1]		; 3*kiStride
+    %assign push_num 0
+    LOAD_2_PARA
+    SIGN_EXTENSION r1, r1d
+    lea r2, [2*r1+r1]       ; 3*kiStride
 
-	movdqa xmm0, [sse2_dc_0x80]
-	movdqa xmm1, xmm0
-	movdqa [r0], xmm0
-	movdqa [r0+r1], xmm1
-	movdqa [r0+2*r1], xmm0
-	movdqa [r0+r2], xmm1
-	lea r0, [r0+4*r1]
-	movdqa [r0], xmm0
-	movdqa [r0+r1], xmm1
-	movdqa [r0+2*r1], xmm0
-	movdqa [r0+r2], xmm1
-	lea r0, [r0+4*r1]
-	movdqa [r0], xmm0
-	movdqa [r0+r1], xmm1
-	movdqa [r0+2*r1], xmm0
-	movdqa [r0+r2], xmm1
-	lea r0, [r0+4*r1]
-	movdqa [r0], xmm0
-	movdqa [r0+r1], xmm1
-	movdqa [r0+2*r1], xmm0
-	movdqa [r0+r2], xmm1
+    movdqa xmm0, [sse2_dc_0x80]
+    movdqa xmm1, xmm0
+    movdqa [r0], xmm0
+    movdqa [r0+r1], xmm1
+    movdqa [r0+2*r1], xmm0
+    movdqa [r0+r2], xmm1
+    lea r0, [r0+4*r1]
+    movdqa [r0], xmm0
+    movdqa [r0+r1], xmm1
+    movdqa [r0+2*r1], xmm0
+    movdqa [r0+r2], xmm1
+    lea r0, [r0+4*r1]
+    movdqa [r0], xmm0
+    movdqa [r0+r1], xmm1
+    movdqa [r0+2*r1], xmm0
+    movdqa [r0+r2], xmm1
+    lea r0, [r0+4*r1]
+    movdqa [r0], xmm0
+    movdqa [r0+r1], xmm1
+    movdqa [r0+2*r1], xmm0
+    movdqa [r0+r2], xmm1
 
-	ret
+    ret
 
 ;*******************************************************************************
-;	void WelsDecoderIChromaPredDcLeft_mmx(uint8_t *pPred, const int32_t kiStride)
+;   void WelsDecoderIChromaPredDcLeft_mmx(uint8_t *pPred, const int32_t kiStride)
 ;*******************************************************************************
 WELS_EXTERN WelsDecoderIChromaPredDcLeft_mmx
-	push r3
-	push r4
-	%assign push_num 2
-	LOAD_2_PARA
-	SIGN_EXTENSION r1, r1d
-	mov r4, r0
-	; for left
-	dec r0
-	xor r2, r2
-	xor r3, r3
-	movzx r2, byte [r0]
-	movzx r3, byte [r0+r1]
-	add r2, r3
-	lea r0, [r0+2*r1]
-	movzx r3, byte [r0]
-	add r2, r3
-	movzx r3, byte [r0+r1]
-	add r2, r3
-	add r2, 02h
-	sar r2, 02h
-	;SSE2_Copy16Times mm0, r2d
-	mov r3, r2
-	sal r3, 8
-	or r2, r3
-	movd mm1, r2d
-	pshufw mm0, mm1, 00h
-	;mov bh, bl
-	;movd mm1, ebx
-	;pshufw mm0, mm1, 00h	; up64
-	movq mm1, mm0
-	xor r2, r2
-	lea r0, [r0+2*r1]
-	movzx r2, byte [r0]
-	movzx r3, byte [r0+r1]
-	add r2, r3
-	lea r0, [r0+2*r1]
-	movzx r3, byte [r0]
-	add r2, r3
-	movzx r3, byte [r0+r1]
-	add r2, r3
-	add r2, 02h
-	sar r2, 02h
-	mov r3, r2
-	sal r3, 8
-	or r2, r3
-	movd mm3, r2d
-	pshufw mm2, mm3, 00h
-	;mov bh, bl
-	;movd mm3, ebx
-	;pshufw mm2, mm3, 00h	; down64
-	;SSE2_Copy16Times mm2, r2d
-	movq mm3, mm2
-	lea r2, [2*r1+r1]
-	movq [r4], mm0
-	movq [r4+r1], mm1
-	movq [r4+2*r1], mm0
-	movq [r4+r2], mm1
-	lea r4, [r4+4*r1]
-	movq [r4], mm2
-	movq [r4+r1], mm3
-	movq [r4+2*r1], mm2
-	movq [r4+r2], mm3
-	pop r4
-	pop r3
-	emms
-	ret
+    push r3
+    push r4
+    %assign push_num 2
+    LOAD_2_PARA
+    SIGN_EXTENSION r1, r1d
+    mov r4, r0
+    ; for left
+    dec r0
+    xor r2, r2
+    xor r3, r3
+    movzx r2, byte [r0]
+    movzx r3, byte [r0+r1]
+    add r2, r3
+    lea r0, [r0+2*r1]
+    movzx r3, byte [r0]
+    add r2, r3
+    movzx r3, byte [r0+r1]
+    add r2, r3
+    add r2, 02h
+    sar r2, 02h
+    ;SSE2_Copy16Times mm0, r2d
+    mov r3, r2
+    sal r3, 8
+    or r2, r3
+    movd mm1, r2d
+    pshufw mm0, mm1, 00h
+    ;mov bh, bl
+    ;movd mm1, ebx
+    ;pshufw mm0, mm1, 00h   ; up64
+    movq mm1, mm0
+    xor r2, r2
+    lea r0, [r0+2*r1]
+    movzx r2, byte [r0]
+    movzx r3, byte [r0+r1]
+    add r2, r3
+    lea r0, [r0+2*r1]
+    movzx r3, byte [r0]
+    add r2, r3
+    movzx r3, byte [r0+r1]
+    add r2, r3
+    add r2, 02h
+    sar r2, 02h
+    mov r3, r2
+    sal r3, 8
+    or r2, r3
+    movd mm3, r2d
+    pshufw mm2, mm3, 00h
+    ;mov bh, bl
+    ;movd mm3, ebx
+    ;pshufw mm2, mm3, 00h   ; down64
+    ;SSE2_Copy16Times mm2, r2d
+    movq mm3, mm2
+    lea r2, [2*r1+r1]
+    movq [r4], mm0
+    movq [r4+r1], mm1
+    movq [r4+2*r1], mm0
+    movq [r4+r2], mm1
+    lea r4, [r4+4*r1]
+    movq [r4], mm2
+    movq [r4+r1], mm3
+    movq [r4+2*r1], mm2
+    movq [r4+r2], mm3
+    pop r4
+    pop r3
+    emms
+    ret
 
 ;*******************************************************************************
-;	void WelsDecoderIChromaPredDcTop_sse2(uint8_t *pPred, const int32_t kiStride)
+;   void WelsDecoderIChromaPredDcTop_sse2(uint8_t *pPred, const int32_t kiStride)
 ;*******************************************************************************
 WELS_EXTERN WelsDecoderIChromaPredDcTop_sse2
-	%assign push_num 0
-	LOAD_2_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION r1, r1d
-	mov r2, r0
-	sub r2, r1
-	movq xmm0, [r2]		; top: 8x1 pixels
-	pxor xmm7, xmm7
-	punpcklbw xmm0, xmm7		; ext 8x2 words
-	pshufd xmm1, xmm0, 0B1h		; 10110001 B, w5 w4 w7 w6 w1 w0 w3 w2
-	paddw xmm0, xmm1			; w5+7 w4+6 w5+7 w4+6 w1+3 w0+2 w1+3 w0+2
-	movdqa xmm1, xmm0
-	pshuflw xmm2, xmm0, 0B1h	; 10110001 B, .. w0+2 w1+3 w0+2 w1+3
-	pshufhw xmm3, xmm1, 0B1h	; 10110001 B, w4+6 w5+7 w4+6 w5+7 ..
-	paddw xmm0, xmm2			; .. w0+..+3 w0+..+3 w0+..+3 w0+..+3
-	paddw xmm1, xmm3			; w4+..+7 w4+..+7 w4+..+7 w4+..+7 ..
-	punpckhqdq xmm1, xmm7
-	punpcklqdq xmm0, xmm1		; sum1 sum1 sum1 sum1 sum0 sum0 sum0 sum0
-	movdqa xmm6, [sse2_wd_0x02]
-	paddw xmm0, xmm6
-	psraw xmm0, 02h
-	packuswb xmm0, xmm7
-	lea r2, [2*r1+r1]
-	movq [r0], xmm0
-	movq [r0+r1], xmm0
-	movq [r0+2*r1], xmm0
-	movq [r0+r2], xmm0
-	lea r0, [r0+4*r1]
-	movq [r0], xmm0
-	movq [r0+r1], xmm0
-	movq [r0+2*r1], xmm0
-	movq [r0+r2], xmm0
-	POP_XMM
-	ret
+    %assign push_num 0
+    LOAD_2_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r1, r1d
+    mov r2, r0
+    sub r2, r1
+    movq xmm0, [r2]     ; top: 8x1 pixels
+    pxor xmm7, xmm7
+    punpcklbw xmm0, xmm7        ; ext 8x2 words
+    pshufd xmm1, xmm0, 0B1h     ; 10110001 B, w5 w4 w7 w6 w1 w0 w3 w2
+    paddw xmm0, xmm1            ; w5+7 w4+6 w5+7 w4+6 w1+3 w0+2 w1+3 w0+2
+    movdqa xmm1, xmm0
+    pshuflw xmm2, xmm0, 0B1h    ; 10110001 B, .. w0+2 w1+3 w0+2 w1+3
+    pshufhw xmm3, xmm1, 0B1h    ; 10110001 B, w4+6 w5+7 w4+6 w5+7 ..
+    paddw xmm0, xmm2            ; .. w0+..+3 w0+..+3 w0+..+3 w0+..+3
+    paddw xmm1, xmm3            ; w4+..+7 w4+..+7 w4+..+7 w4+..+7 ..
+    punpckhqdq xmm1, xmm7
+    punpcklqdq xmm0, xmm1       ; sum1 sum1 sum1 sum1 sum0 sum0 sum0 sum0
+    movdqa xmm6, [sse2_wd_0x02]
+    paddw xmm0, xmm6
+    psraw xmm0, 02h
+    packuswb xmm0, xmm7
+    lea r2, [2*r1+r1]
+    movq [r0], xmm0
+    movq [r0+r1], xmm0
+    movq [r0+2*r1], xmm0
+    movq [r0+r2], xmm0
+    lea r0, [r0+4*r1]
+    movq [r0], xmm0
+    movq [r0+r1], xmm0
+    movq [r0+2*r1], xmm0
+    movq [r0+r2], xmm0
+    POP_XMM
+    ret
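With only the top row available, each horizontal half of the 8x8 block takes the rounded mean of its own four top neighbours, which is what the word shuffles and psraw 2 above compute. Roughly, in C (illustrative helper name):

#include <stdint.h>

static void IChromaPredDcTop_ref_c (uint8_t* pPred, int32_t kiStride, const uint8_t top[8]) {
    int sum0 = 0, sum1 = 0;
    for (int i = 0; i < 4; ++i) { sum0 += top[i]; sum1 += top[4 + i]; }
    const uint8_t dc0 = (uint8_t) ((sum0 + 2) >> 2);
    const uint8_t dc1 = (uint8_t) ((sum1 + 2) >> 2);
    for (int y = 0; y < 8; ++y)
        for (int x = 0; x < 8; ++x)
            pPred[y * kiStride + x] = (x < 4) ? dc0 : dc1;
}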
 
 ;*******************************************************************************
-;	void WelsDecoderIChromaPredDcNA_mmx(uint8_t *pPred, const int32_t kiStride)
+;   void WelsDecoderIChromaPredDcNA_mmx(uint8_t *pPred, const int32_t kiStride)
 ;*******************************************************************************
 WELS_EXTERN WelsDecoderIChromaPredDcNA_mmx
-	%assign push_num 0
-	LOAD_2_PARA
-	SIGN_EXTENSION r1, r1d
-	lea r2, [2*r1+r1]
-	movq mm0, [sse2_dc_0x80]
-	movq mm1, mm0
-	movq [r0], mm0
-	movq [r0+r1], mm1
-	movq [r0+2*r1], mm0
-	movq [r0+r2], mm1
-	lea r0, [r0+4*r1]
-	movq [r0], mm0
-	movq [r0+r1], mm1
-	movq [r0+2*r1], mm0
-	movq [r0+r2], mm1
-	emms
-	ret
+    %assign push_num 0
+    LOAD_2_PARA
+    SIGN_EXTENSION r1, r1d
+    lea r2, [2*r1+r1]
+    movq mm0, [sse2_dc_0x80]
+    movq mm1, mm0
+    movq [r0], mm0
+    movq [r0+r1], mm1
+    movq [r0+2*r1], mm0
+    movq [r0+r2], mm1
+    lea r0, [r0+4*r1]
+    movq [r0], mm0
+    movq [r0+r1], mm1
+    movq [r0+2*r1], mm0
+    movq [r0+r2], mm1
+    emms
+    ret
 
--- a/codec/encoder/core/arm/intra_pred_neon.S
+++ b/codec/encoder/core/arm/intra_pred_neon.S
@@ -38,107 +38,107 @@
 #ifdef __APPLE__
 //Global macro
 .macro GET_8BYTE_DATA
-	vld1.8 {$0[0]}, [$1], $2
-	vld1.8 {$0[1]}, [$1], $2
-	vld1.8 {$0[2]}, [$1], $2
-	vld1.8 {$0[3]}, [$1], $2
-	vld1.8 {$0[4]}, [$1], $2
-	vld1.8 {$0[5]}, [$1], $2
-	vld1.8 {$0[6]}, [$1], $2
-	vld1.8 {$0[7]}, [$1], $2
+    vld1.8 {$0[0]}, [$1], $2
+    vld1.8 {$0[1]}, [$1], $2
+    vld1.8 {$0[2]}, [$1], $2
+    vld1.8 {$0[3]}, [$1], $2
+    vld1.8 {$0[4]}, [$1], $2
+    vld1.8 {$0[5]}, [$1], $2
+    vld1.8 {$0[6]}, [$1], $2
+    vld1.8 {$0[7]}, [$1], $2
 .endm
 #else
 //Global macro
 .macro GET_8BYTE_DATA arg0, arg1, arg2
-	vld1.8 {\arg0[0]}, [\arg1], \arg2
-	vld1.8 {\arg0[1]}, [\arg1], \arg2
-	vld1.8 {\arg0[2]}, [\arg1], \arg2
-	vld1.8 {\arg0[3]}, [\arg1], \arg2
-	vld1.8 {\arg0[4]}, [\arg1], \arg2
-	vld1.8 {\arg0[5]}, [\arg1], \arg2
-	vld1.8 {\arg0[6]}, [\arg1], \arg2
-	vld1.8 {\arg0[7]}, [\arg1], \arg2
+    vld1.8 {\arg0[0]}, [\arg1], \arg2
+    vld1.8 {\arg0[1]}, [\arg1], \arg2
+    vld1.8 {\arg0[2]}, [\arg1], \arg2
+    vld1.8 {\arg0[3]}, [\arg1], \arg2
+    vld1.8 {\arg0[4]}, [\arg1], \arg2
+    vld1.8 {\arg0[5]}, [\arg1], \arg2
+    vld1.8 {\arg0[6]}, [\arg1], \arg2
+    vld1.8 {\arg0[7]}, [\arg1], \arg2
 .endm
 #endif
 
 
 WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredV_neon
-	//Get the top line data to 'q0'
-	sub  r3, r1, r2
-	vldm r3, {d0, d1}
+    //Get the top line data to 'q0'
+    sub  r3, r1, r2
+    vldm r3, {d0, d1}
 
-	//mov  r2, #16
-	mov  r3, #4
-	//Set the top line to the each line of MB(16*16)
+    //mov  r2, #16
+    mov  r3, #4
+    //Copy the top line into each line of the MB (16*16)
 loop_0_get_i16x16_luma_pred_v:
-	vst1.8 {d0,d1}, [r0]!
-	vst1.8 {d0,d1}, [r0]!
-	vst1.8 {d0,d1}, [r0]!
-	vst1.8 {d0,d1}, [r0]!
-	subs  r3, #1
-	bne  loop_0_get_i16x16_luma_pred_v
+    vst1.8 {d0,d1}, [r0]!
+    vst1.8 {d0,d1}, [r0]!
+    vst1.8 {d0,d1}, [r0]!
+    vst1.8 {d0,d1}, [r0]!
+    subs  r3, #1
+    bne  loop_0_get_i16x16_luma_pred_v
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredH_neon
     //stmdb sp!, {r4, lr}
-	sub  r1, r1, #1
-	mov  r3, #4
+    sub  r1, r1, #1
+    mov  r3, #4
 loop_0_get_i16x16_luma_pred_h:
-	//Get one byte data from left side
-	vld1.8 {d0[],d1[]}, [r1], r2
-	vld1.8 {d2[],d3[]}, [r1], r2
-	vld1.8 {d4[],d5[]}, [r1], r2
-	vld1.8 {d6[],d7[]}, [r1], r2
+    //Get one byte data from left side
+    vld1.8 {d0[],d1[]}, [r1], r2
+    vld1.8 {d2[],d3[]}, [r1], r2
+    vld1.8 {d4[],d5[]}, [r1], r2
+    vld1.8 {d6[],d7[]}, [r1], r2
 
-	//Set the line of MB using the left side byte data
-	vst1.8 {d0,d1}, [r0]!
-	//add r0, #16
-	vst1.8 {d2,d3}, [r0]!
-	//add r0, #16
-	vst1.8 {d4,d5}, [r0]!
-	//add r0, #16
-	vst1.8 {d6,d7}, [r0]!
-	//add r0, #16
+    //Set the line of MB using the left side byte data
+    vst1.8 {d0,d1}, [r0]!
+    //add r0, #16
+    vst1.8 {d2,d3}, [r0]!
+    //add r0, #16
+    vst1.8 {d4,d5}, [r0]!
+    //add r0, #16
+    vst1.8 {d6,d7}, [r0]!
+    //add r0, #16
 
-	subs  r3, #1
-	bne  loop_0_get_i16x16_luma_pred_h
+    subs  r3, #1
+    bne  loop_0_get_i16x16_luma_pred_h
 
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredDc_neon
-	//stmdb sp!, { r2-r5, lr}
-	//Get the left vertical line data
-	sub r3, r1, #1
-	GET_8BYTE_DATA d0, r3, r2
-	GET_8BYTE_DATA d1, r3, r2
+    //stmdb sp!, { r2-r5, lr}
+    //Get the left vertical line data
+    sub r3, r1, #1
+    GET_8BYTE_DATA d0, r3, r2
+    GET_8BYTE_DATA d1, r3, r2
 
-	//Get the top horizontal line data
-	sub  r3, r1, r2
-	vldm r3, {d2, d3}
+    //Get the top horizontal line data
+    sub  r3, r1, r2
+    vldm r3, {d2, d3}
 
-	//Calculate the sum of top horizontal line data and vertical line data
-	vpaddl.u8 q0, q0
-	vpaddl.u8 q1, q1
-	vadd.u16  q0, q0, q1
-	vadd.u16  d0, d0, d1
-	vpaddl.u16 d0, d0
-	vpaddl.u32 d0, d0
+    //Calculate the sum of top horizontal line data and vertical line data
+    vpaddl.u8 q0, q0
+    vpaddl.u8 q1, q1
+    vadd.u16  q0, q0, q1
+    vadd.u16  d0, d0, d1
+    vpaddl.u16 d0, d0
+    vpaddl.u32 d0, d0
 
-	//Calculate the mean value
-	vrshr.u16  d0, d0, #5
-	vdup.8     q0, d0[0]
+    //Calculate the mean value
+    vrshr.u16  d0, d0, #5
+    vdup.8     q0, d0[0]
 
-	//Set the mean value to the all of member of MB
-	mov  r3, #4
+    //Fill the whole MB with the mean value
+    mov  r3, #4
 loop_0_get_i16x16_luma_pred_dc_both:
-	vst1.8 {d0,d1}, [r0]!
-	vst1.8 {d0,d1}, [r0]!
-	vst1.8 {d0,d1}, [r0]!
-	vst1.8 {d0,d1}, [r0]!
-	subs  r3, #1
-	bne  loop_0_get_i16x16_luma_pred_dc_both
+    vst1.8 {d0,d1}, [r0]!
+    vst1.8 {d0,d1}, [r0]!
+    vst1.8 {d0,d1}, [r0]!
+    vst1.8 {d0,d1}, [r0]!
+    subs  r3, #1
+    bne  loop_0_get_i16x16_luma_pred_dc_both
 
 WELS_ASM_FUNC_END
 
@@ -151,383 +151,383 @@
 
 
 WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredPlane_neon
-	//stmdb sp!, { r4, lr}
+    //stmdb sp!, { r4, lr}
 
-	//Load the table {(8,7,6,5,4,3,2,1) * 5}
-	adr r3, CONST0_GET_I16X16_LUMA_PRED_PLANE
-	vldr    d0, [r3]
+    //Load the table {(8,7,6,5,4,3,2,1) * 5}
+    adr r3, CONST0_GET_I16X16_LUMA_PRED_PLANE
+    vldr    d0, [r3]
 
-	//Pack the top[-1] ~ top[6] to d1
-	sub       r3,  r1, r2
-	sub       r1,  r3, #1
-	vld1.8    d1, [r1]
+    //Pack the top[-1] ~ top[6] to d1
+    sub       r3,  r1, r2
+    sub       r1,  r3, #1
+    vld1.8    d1, [r1]
 
-	//Pack the top[8] ~ top[15] to d2
-	add       r1, #9
-	vld1.8    d2, [r1]
+    //Pack the top[8] ~ top[15] to d2
+    add       r1, #9
+    vld1.8    d2, [r1]
 
-	//Save the top[15] to d6 for next step
-	vdup.u8   d6,   d2[7]
+    //Save the top[15] to d6 for next step
+    vdup.u8   d6,   d2[7]
 
-	//Get and pack left[-1] ~ left[6] to d4
-	sub       r1,  r3, #1
-	GET_8BYTE_DATA d4, r1, r2
+    //Get and pack left[-1] ~ left[6] to d4
+    sub       r1,  r3, #1
+    GET_8BYTE_DATA d4, r1, r2
 
-	//Get and pack left[8] ~ left[15] to d3
-	add       r1,  r2
-	GET_8BYTE_DATA d3, r1, r2
+    //Get and pack left[8] ~ left[15] to d3
+    add       r1,  r2
+    GET_8BYTE_DATA d3, r1, r2
 
-	//Save the left[15] to d7 for next step
-	vdup.u8   d7,   d3[7]
+    //Save the left[15] to d7 for next step
+    vdup.u8   d7,   d3[7]
 
-	//revert the sequence of d2,d3
-	vrev64.8   q1, q1
+    //reverse the byte order of d2,d3
+    vrev64.8   q1, q1
 
-	vsubl.u8   q2, d3, d4 //q2={left[8]-left[6],left[9]-left[5],left[10]-left[4], ...}
-	vsubl.u8   q1, d2, d1 //q1={top[8]-top[6],top[9]-top[5],top[10]-top[4], ...}
+    vsubl.u8   q2, d3, d4 //q2={left[8]-left[6],left[9]-left[5],left[10]-left[4], ...}
+    vsubl.u8   q1, d2, d1 //q1={top[8]-top[6],top[9]-top[5],top[10]-top[4], ...}
 
 
-	vmovl.u8   q0, d0
-	vmul.s16   q1, q0, q1 //q1 = q1*{(8,7,6,5,4,3,2,1) * 5}
-	vmul.s16   q2, q0, q2 //q2 = q2*{(8,7,6,5,4,3,2,1) * 5}
+    vmovl.u8   q0, d0
+    vmul.s16   q1, q0, q1 //q1 = q1*{(8,7,6,5,4,3,2,1) * 5}
+    vmul.s16   q2, q0, q2 //q2 = q2*{(8,7,6,5,4,3,2,1) * 5}
 
-	//Calculate the sum of items of q1, q2
-	vpadd.s16  d0, d2, d3
-	vpadd.s16  d1, d4, d5
-	vpaddl.s16 q0, q0
-	vpaddl.s32 q0, q0
+    //Calculate the sum of items of q1, q2
+    vpadd.s16  d0, d2, d3
+    vpadd.s16  d1, d4, d5
+    vpaddl.s16 q0, q0
+    vpaddl.s32 q0, q0
 
-	//Get the value of 'b', 'c' and extend to q1, q2.
-	vrshr.s64  q0, #6
-	vdup.s16   q1, d0[0]
-	vdup.s16   q2, d1[0]
+    //Get the value of 'b', 'c' and extend to q1, q2.
+    vrshr.s64  q0, #6
+    vdup.s16   q1, d0[0]
+    vdup.s16   q2, d1[0]
 
-	//Load the table {-7,-6,-5,-4,-3,-2,-1,0} to d0
-	adr r3, CONST1_GET_I16X16_LUMA_PRED_PLANE
-	vld1.32   {d0}, [r3]
+    //Load the table {-7,-6,-5,-4,-3,-2,-1,0} to d0
+    adr r3, CONST1_GET_I16X16_LUMA_PRED_PLANE
+    vld1.32   {d0}, [r3]
 
-	//Get the value of 'a' and save to q3
-	vaddl.u8  q3, d6, d7
-	vshl.u16  q3, #4
+    //Get the value of 'a' and save to q3
+    vaddl.u8  q3, d6, d7
+    vshl.u16  q3, #4
 
-	//calculate a+'b'*{-7,-6,-5,-4,-3,-2,-1,0} + c*{-7}
-	vmovl.s8  q0, d0
-	vmla.s16  q3, q0, q1
-	vmla.s16  q3, q2, d0[0]
+    //calculate a+'b'*{-7,-6,-5,-4,-3,-2,-1,0} + c*{-7}
+    vmovl.s8  q0, d0
+    vmla.s16  q3, q0, q1
+    vmla.s16  q3, q2, d0[0]
 
-	//Calculate a+'b'*{1,2,3,4,5,6,7,8} + c*{-7}
-	vshl.s16  q8, q1, #3
-	vadd.s16  q8, q3
+    //Calculate a+'b'*{1,2,3,4,5,6,7,8} + c*{-7}
+    vshl.s16  q8, q1, #3
+    vadd.s16  q8, q3
 
-	//right shift 5 bits and rounding
-	vqrshrun.s16 d0, q3, #5
-	vqrshrun.s16 d1, q8, #5
+    //right shift 5 bits and rounding
+    vqrshrun.s16 d0, q3, #5
+    vqrshrun.s16 d1, q8, #5
 
-	//Set the line of MB
-	vst1.u32  {d0,d1}, [r0]!
+    //Set the line of MB
+    vst1.u32  {d0,d1}, [r0]!
 
 
-	//Do the same processing for setting other lines
-	mov  r3, #15
+    //Do the same processing for setting other lines
+    mov  r3, #15
 loop_0_get_i16x16_luma_pred_plane:
-	vadd.s16  q3, q2
-	vadd.s16  q8, q2
-	vqrshrun.s16 d0, q3, #5
-	vqrshrun.s16 d1, q8, #5
-	vst1.u32  {d0,d1}, [r0]!
-	subs  r3, #1
-	bne  loop_0_get_i16x16_luma_pred_plane
+    vadd.s16  q3, q2
+    vadd.s16  q8, q2
+    vqrshrun.s16 d0, q3, #5
+    vqrshrun.s16 d1, q8, #5
+    vst1.u32  {d0,d1}, [r0]!
+    subs  r3, #1
+    bne  loop_0_get_i16x16_luma_pred_plane
 
 WELS_ASM_FUNC_END
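The constants built in the function above (the {8,7,...,1}*5 table, the rounded >>6 shifts, the (top[15]+left[15])<<4 term) are the standard 16x16 plane-mode parameters. A scalar sketch of the values the NEON loop produces is given below; it is illustrative only (the real routine writes the 16x16 block contiguously, and the helper name and clip function are assumptions):

#include <stdint.h>

static uint8_t Clip255 (int v) { return (uint8_t) (v < 0 ? 0 : (v > 255 ? 255 : v)); }

/* top[]/left[] hold the 16 reconstructed neighbours, lt the corner pixel. */
static void I16x16LumaPredPlane_ref_c (uint8_t pPred[16 * 16],
                                       uint8_t lt, const uint8_t top[16], const uint8_t left[16]) {
    int H = 0, V = 0;
    for (int i = 1; i <= 8; ++i) {
        H += i * (top[7 + i]  - (i == 8 ? lt : top[7 - i]));
        V += i * (left[7 + i] - (i == 8 ? lt : left[7 - i]));
    }
    const int b = (5 * H + 32) >> 6;            /* the 'b' built from the {8..1}*5 table   */
    const int c = (5 * V + 32) >> 6;            /* the 'c' built the same way from the left */
    const int a = 16 * (top[15] + left[15]);    /* the 'a' = (top[15]+left[15]) << 4        */
    for (int y = 0; y < 16; ++y)
        for (int x = 0; x < 16; ++x)
            pPred[y * 16 + x] = Clip255 ((a + b * (x - 7) + c * (y - 7) + 16) >> 5);
}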
 
 
 WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredV_neon
-	//stmdb sp!, { r2-r5, lr}
-	//Load the top row (4 bytes)
-	sub  r3, r1, r2
-	ldr  r3, [r3]
+    //stmdb sp!, { r2-r5, lr}
+    //Load the top row (4 bytes)
+    sub  r3, r1, r2
+    ldr  r3, [r3]
 
-	//Set the luma MB using top line
-	str  r3, [r0], #4
-	str  r3, [r0], #4
-	str  r3, [r0], #4
-	str  r3, [r0]
+    //Set the luma MB using top line
+    str  r3, [r0], #4
+    str  r3, [r0], #4
+    str  r3, [r0], #4
+    str  r3, [r0]
 
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredH_neon
-	//stmdb sp!, { r2-r5, lr}
-	//Load the left column (4 bytes)
-	sub  r3, r1, #1
-	vld1.8 {d0[]}, [r3], r2
-	vld1.8 {d1[]}, [r3], r2
-	vld1.8 {d2[]}, [r3], r2
-	vld1.8 {d3[]}, [r3]
+    //stmdb sp!, { r2-r5, lr}
+    //Load the left column (4 bytes)
+    sub  r3, r1, #1
+    vld1.8 {d0[]}, [r3], r2
+    vld1.8 {d1[]}, [r3], r2
+    vld1.8 {d2[]}, [r3], r2
+    vld1.8 {d3[]}, [r3]
 
-	//Set the luma MB using the left side byte
-	vst1.32 {d0[0]}, [r0]!
-	vst1.32 {d1[0]}, [r0]!
-	vst1.32 {d2[0]}, [r0]!
-	vst1.32 {d3[0]}, [r0]
+    //Set the luma MB using the left side byte
+    vst1.32 {d0[0]}, [r0]!
+    vst1.32 {d1[0]}, [r0]!
+    vst1.32 {d2[0]}, [r0]!
+    vst1.32 {d3[0]}, [r0]
 
 WELS_ASM_FUNC_END
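
The two 4x4 predictors above are the simplest cases: vertical copies the four pixels above the block into every row, horizontal fills each row with the pixel to its left, and both write the prediction as a packed 4x4 block. Equivalent scalar C (pointer and stride naming is illustrative):

#include <stdint.h>
#include <string.h>

/* rec points into the reconstructed picture at the block origin,
 * stride is the picture stride; pred is a packed 4x4 block. */
static void I4x4PredV (uint8_t* pred, const uint8_t* rec, int stride) {
    for (int y = 0; y < 4; y++)
        memcpy (pred + 4 * y, rec - stride, 4);         /* row above, repeated */
}

static void I4x4PredH (uint8_t* pred, const uint8_t* rec, int stride) {
    for (int y = 0; y < 4; y++)
        memset (pred + 4 * y, rec[y * stride - 1], 4);  /* left pixel, repeated */
}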
 
 
 WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredDDL_neon
-	//stmdb sp!, { r2-r5, lr}
-	//Load the top row data(8 bytes)
-	sub    r3,  r1, r2
-	vld1.32  {d0}, [r3]
+    //stmdb sp!, { r2-r5, lr}
+    //Load the top row data(8 bytes)
+    sub    r3,  r1, r2
+    vld1.32  {d0}, [r3]
 
-	//For "t7 + (t7<<1)"
-	vdup.8   d1,  d0[7]
+    //For "t7 + (t7<<1)"
+    vdup.8   d1,  d0[7]
 
-	//calculate "t0+t1,t1+t2,t2+t3...t6+t7,t7+t7"
-	vext.8   d1,  d0, d1, #1
-	vaddl.u8 q1,  d1, d0
+    //calculate "t0+t1,t1+t2,t2+t3...t6+t7,t7+t7"
+    vext.8   d1,  d0, d1, #1
+    vaddl.u8 q1,  d1, d0
 
-	//calculate "x,t0+t1+t1+t2,t1+t2+t2+t3,...t5+t6+t6+t7,t6+t7+t7+t7"
-	vext.8   q2,  q1, q1, #14
-	vadd.u16 q0,  q1, q2
+    //calculate "x,t0+t1+t1+t2,t1+t2+t2+t3,...t5+t6+t6+t7,t6+t7+t7+t7"
+    vext.8   q2,  q1, q1, #14
+    vadd.u16 q0,  q1, q2
 
-	//right shift 2 bits and rounding
-	vqrshrn.u16  d0,  q0, #2
+    //right shift 2 bits and rounding
+    vqrshrn.u16  d0,  q0, #2
 
-	//Save "ddl0, ddl1, ddl2, ddl3"
-	vext.8   d1, d0, d0, #1
-	vst1.32  d1[0], [r0]!
+    //Save "ddl0, ddl1, ddl2, ddl3"
+    vext.8   d1, d0, d0, #1
+    vst1.32  d1[0], [r0]!
 
-	//Save "ddl1, ddl2, ddl3, ddl4"
-	vext.8   d1, d0, d0, #2
-	vst1.32  d1[0], [r0]!
+    //Save "ddl1, ddl2, ddl3, ddl4"
+    vext.8   d1, d0, d0, #2
+    vst1.32  d1[0], [r0]!
 
-	//Save "ddl2, ddl3, ddl4, ddl5"
-	vext.8   d1, d0, d0, #3
-	vst1.32  d1[0], [r0]!
+    //Save "ddl2, ddl3, ddl4, ddl5"
+    vext.8   d1, d0, d0, #3
+    vst1.32  d1[0], [r0]!
 
-	//Save "ddl3, ddl4, ddl5, ddl6"
-	vst1.32  d0[1], [r0]
+    //Save "ddl3, ddl4, ddl5, ddl6"
+    vst1.32  d0[1], [r0]
 
 WELS_ASM_FUNC_END
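
The down-left predictor above low-pass filters the eight pixels above and above-right (t0..t7, with t7 reused past the end) into ddl[k] = (t[k] + 2*t[k+1] + t[k+2] + 2) >> 2 and fills pred[y][x] = ddl[x+y]. A scalar C sketch (names are illustrative):

#include <stdint.h>

/* t[0..7]: the eight pixels above and above-right of the 4x4 block,
 * pred: packed 4x4 output. */
static void I4x4PredDDL (uint8_t* pred, const uint8_t* t) {
    uint8_t ddl[7];
    for (int k = 0; k < 7; k++) {
        int t2 = (k + 2 < 8) ? t[k + 2] : t[7];   /* t[8] clamps to t[7] */
        ddl[k] = (uint8_t) ((t[k] + 2 * t[k + 1] + t2 + 2) >> 2);
    }
    for (int y = 0; y < 4; y++)
        for (int x = 0; x < 4; x++)
            pred[4 * y + x] = ddl[x + y];
}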
 
 
 WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredDDR_neon
-	//stmdb sp!, { r2-r5, lr}
-	//Load the top row (4 bytes)
-	sub    r3,  r1, r2
-	vld1.32  {d0[1]}, [r3]
+    //stmdb sp!, { r2-r5, lr}
+    //Load the top row (4 bytes)
+    sub    r3,  r1, r2
+    vld1.32  {d0[1]}, [r3]
 
-	//Load the left column (5 bytes)
-	sub    r3,  #1
-	vld1.8 {d0[3]}, [r3], r2
-	vld1.8 {d0[2]}, [r3], r2
-	vld1.8 {d0[1]}, [r3], r2
-	vld1.8 {d0[0]}, [r3], r2
-	vld1.8 {d1[7]}, [r3] //For packing the right sequence to do SIMD processing
+    //Load the left column (5 bytes)
+    sub    r3,  #1
+    vld1.8 {d0[3]}, [r3], r2
+    vld1.8 {d0[2]}, [r3], r2
+    vld1.8 {d0[1]}, [r3], r2
+    vld1.8 {d0[0]}, [r3], r2
+    vld1.8 {d1[7]}, [r3] //For packing the right sequence to do SIMD processing
 
 
-	vext.8   d2, d1, d0, #7   //d0:{L2,L1,L0,LT,T0,T1,T2,T3}
-	                          //d2:{L3,L2,L1,L0,LT,T0,T1,T2}
+    vext.8   d2, d1, d0, #7   //d0:{L2,L1,L0,LT,T0,T1,T2,T3}
+                              //d2:{L3,L2,L1,L0,LT,T0,T1,T2}
 
-	//q2:{L2+L3,L1+L2,L0+L1...T1+T2,T2+T3}
-	vaddl.u8 q2, d2, d0
+    //q2:{L2+L3,L1+L2,L0+L1...T1+T2,T2+T3}
+    vaddl.u8 q2, d2, d0
 
-	//q1:{TL0+LT0,LT0+T01,...L12+L23}
-	vext.8   q3, q3, q2, #14
-	vadd.u16 q1, q2, q3
+    //q1:{TL0+LT0,LT0+T01,...L12+L23}
+    vext.8   q3, q3, q2, #14
+    vadd.u16 q1, q2, q3
 
-	//right shift 2 bits and rounding
-	vqrshrn.u16 d0, q1, #2
+    //right shift 2 bits and rounding
+    vqrshrn.u16 d0, q1, #2
 
-	//Adjust the data sequence for setting luma MB of 'pred'
-	vst1.32   d0[1], [r0]!
-	vext.8    d0, d0, d0, #7
-	vst1.32   d0[1], [r0]!
-	vext.8    d0, d0, d0, #7
-	vst1.32   d0[1], [r0]!
-	vext.8    d0, d0, d0, #7
-	vst1.32   d0[1], [r0]
+    //Adjust the data sequence for setting luma MB of 'pred'
+    vst1.32   d0[1], [r0]!
+    vext.8    d0, d0, d0, #7
+    vst1.32   d0[1], [r0]!
+    vext.8    d0, d0, d0, #7
+    vst1.32   d0[1], [r0]!
+    vext.8    d0, d0, d0, #7
+    vst1.32   d0[1], [r0]
 
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredVL_neon
-	//stmdb sp!, { r2-r5, lr}
-	//Load the top row (8 bytes)
-	sub    r3,  r1, r2
-	vld1.32  {d0}, [r3]
+    //stmdb sp!, { r2-r5, lr}
+    //Load the top row (8 bytes)
+    sub    r3,  r1, r2
+    vld1.32  {d0}, [r3]
 
 
-	vext.8   d1,  d0, d0, #1
-	vaddl.u8 q1,  d1, d0     //q1:{t0+t1,t1+t2,t2+t3...t5+t6,x,x}
+    vext.8   d1,  d0, d0, #1
+    vaddl.u8 q1,  d1, d0     //q1:{t0+t1,t1+t2,t2+t3...t5+t6,x,x}
 
-	vext.8   q2,  q1, q1, #2
-	vadd.u16 q2,  q1, q2     //q2:{t0+t1+t1+t2,t1+t2+t2+t3,...t4+t5+t5+t6,x,x}
+    vext.8   q2,  q1, q1, #2
+    vadd.u16 q2,  q1, q2     //q2:{t0+t1+t1+t2,t1+t2+t2+t3,...t4+t5+t5+t6,x,x}
 
-	//calculate the "vl0,vl1,vl2,vl3,vl4"
-	vqrshrn.u16  d0,  q1, #1
+    //calculate the "vl0,vl1,vl2,vl3,vl4"
+    vqrshrn.u16  d0,  q1, #1
 
-	//calculate the "vl5,vl6,vl7,vl8,vl9"
-	vqrshrn.u16  d1,  q2, #2
+    //calculate the "vl5,vl6,vl7,vl8,vl9"
+    vqrshrn.u16  d1,  q2, #2
 
-	//Adjust the data sequence for setting the luma MB
-	vst1.32  d0[0], [r0]!
-	vst1.32  d1[0], [r0]!
-	vext.8   d0,  d0, d0, #1
-	vext.8   d1,  d1, d1, #1
-	vst1.32  d0[0], [r0]!
-	vst1.32  d1[0], [r0]
+    //Adjust the data sequence for setting the luma MB
+    vst1.32  d0[0], [r0]!
+    vst1.32  d1[0], [r0]!
+    vext.8   d0,  d0, d0, #1
+    vext.8   d1,  d1, d1, #1
+    vst1.32  d0[0], [r0]!
+    vst1.32  d1[0], [r0]
 
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredVR_neon
-	//stmdb sp!, { r2-r5, lr}
-	//Load the top row (4 bytes)
-	sub       r3,  r1, r2
-	vld1.32   {d0[1]}, [r3]
+    //stmdb sp!, { r2-r5, lr}
+    //Load the top row (4 bytes)
+    sub       r3,  r1, r2
+    vld1.32   {d0[1]}, [r3]
 
-	//Load the left column (4 bytes)
-	sub       r3,  #1
-	vld1.8    {d0[3]}, [r3], r2
-	vld1.8    {d0[2]}, [r3], r2
-	vld1.8    {d0[1]}, [r3], r2
-	vld1.8    {d0[0]}, [r3]
+    //Load the left column (4 bytes)
+    sub       r3,  #1
+    vld1.8    {d0[3]}, [r3], r2
+    vld1.8    {d0[2]}, [r3], r2
+    vld1.8    {d0[1]}, [r3], r2
+    vld1.8    {d0[0]}, [r3]
 
 
-	vext.8    d1, d0, d0, #7
-	vaddl.u8  q1, d0, d1      //q1:{X,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2,T2+T3}
+    vext.8    d1, d0, d0, #7
+    vaddl.u8  q1, d0, d1      //q1:{X,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2,T2+T3}
 
-	vext.u8   q2, q1, q1, #14
-	vadd.u16  q2, q2, q1      //q2:{X,L2+L1+L1+L0,L1+L0+L0+LT,...T1+T2+T2+T3}
+    vext.u8   q2, q1, q1, #14
+    vadd.u16  q2, q2, q1      //q2:{X,L2+L1+L1+L0,L1+L0+L0+LT,...T1+T2+T2+T3}
 
-	//Calculate the vr0 ~ vr9
-	vqrshrn.u16 d1, q2, #2
-	vqrshrn.u16 d0, q1, #1
+    //Calculate the vr0 ~ vr9
+    vqrshrn.u16 d1, q2, #2
+    vqrshrn.u16 d0, q1, #1
 
-	//Adjust the data sequence for setting the luma MB
-	vst1.32  d0[1], [r0]!
-	vst1.32  d1[1], [r0]!
-	//add    r2, r0, r1
-	vst1.8   d1[3], [r0]!
-	vst1.16  d0[2], [r0]!
-	vst1.8   d0[6], [r0]!
-	vst1.8   d1[2], [r0]!
-	vst1.16  d1[2], [r0]!
-	vst1.8   d1[6], [r0]
+    //Adjust the data sequence for setting the luma MB
+    vst1.32  d0[1], [r0]!
+    vst1.32  d1[1], [r0]!
+    //add    r2, r0, r1
+    vst1.8   d1[3], [r0]!
+    vst1.16  d0[2], [r0]!
+    vst1.8   d0[6], [r0]!
+    vst1.8   d1[2], [r0]!
+    vst1.16  d1[2], [r0]!
+    vst1.8   d1[6], [r0]
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredHU_neon
-	//stmdb sp!, { r4, lr}
-	//Load the left column data
-	sub       r3,  r1, #1
-	mov       r1,  #3
-	mul       r1,  r2
-	add       r1,  r3
-	vld1.8    {d0[]},  [r1]
-	vld1.8    {d0[4]}, [r3], r2
-	vld1.8    {d0[5]}, [r3], r2
-	vld1.8    {d0[6]}, [r3], r2 //d0:{L3,L3,L3,L3,L0,L1,L2,L3}
+    //stmdb sp!, { r4, lr}
+    //Load the left column data
+    sub       r3,  r1, #1
+    mov       r1,  #3
+    mul       r1,  r2
+    add       r1,  r3
+    vld1.8    {d0[]},  [r1]
+    vld1.8    {d0[4]}, [r3], r2
+    vld1.8    {d0[5]}, [r3], r2
+    vld1.8    {d0[6]}, [r3], r2 //d0:{L3,L3,L3,L3,L0,L1,L2,L3}
 
-	vext.8    d1, d0, d0, #1
-	vaddl.u8  q2, d0, d1        //q2:{L3+L3,L3+L3,L3+L3,L3+L0,L0+L1,L1+L2,L2+L3,L3+L3}
+    vext.8    d1, d0, d0, #1
+    vaddl.u8  q2, d0, d1        //q2:{L3+L3,L3+L3,L3+L3,L3+L0,L0+L1,L1+L2,L2+L3,L3+L3}
 
-	vext.u8   d2, d5, d4, #2
-	vadd.u16  d3, d2, d5        //d3:{L0+L1+L1+L2,L1+L2+L2+L3,L2+L3+L3+L3,L3+L3+L3+L3}
+    vext.u8   d2, d5, d4, #2
+    vadd.u16  d3, d2, d5        //d3:{L0+L1+L1+L2,L1+L2+L2+L3,L2+L3+L3+L3,L3+L3+L3+L3}
 
-	//Calculate the hu0 ~ hu5
-	vqrshrn.u16 d2, q2, #1
-	vqrshrn.u16 d1, q1, #2
+    //Calculate the hu0 ~ hu5
+    vqrshrn.u16 d2, q2, #1
+    vqrshrn.u16 d1, q1, #2
 
-	//Adjust the data sequence for setting the luma MB
-	vzip.8   d2, d1
-	vst1.32  d1[0], [r0]!
-	vext.8   d2, d1, d1, #2
-	vst1.32  d2[0], [r0]!
-	vst1.32  d1[1], [r0]!
-	vst1.32  d0[0], [r0]
+    //Adjust the data sequence for setting the luma MB
+    vzip.8   d2, d1
+    vst1.32  d1[0], [r0]!
+    vext.8   d2, d1, d1, #2
+    vst1.32  d2[0], [r0]!
+    vst1.32  d1[1], [r0]!
+    vst1.32  d0[0], [r0]
 
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredHD_neon
-	//stmdb sp!, { r2-r5, lr}
-	//Load the data
-	sub       r3,  r1, r2
-	sub       r3,  #1
-	vld1.32   {d0[1]}, [r3], r2
-	vld1.8    {d0[3]}, [r3], r2
-	vld1.8    {d0[2]}, [r3], r2
-	vld1.8    {d0[1]}, [r3], r2
-	vld1.8    {d0[0]}, [r3]	    //d0:{L3,L2,L1,L0,LT,T0,T1,T2}
+    //stmdb sp!, { r2-r5, lr}
+    //Load the data
+    sub       r3,  r1, r2
+    sub       r3,  #1
+    vld1.32   {d0[1]}, [r3], r2
+    vld1.8    {d0[3]}, [r3], r2
+    vld1.8    {d0[2]}, [r3], r2
+    vld1.8    {d0[1]}, [r3], r2
+    vld1.8    {d0[0]}, [r3]     //d0:{L3,L2,L1,L0,LT,T0,T1,T2}
 
 
-	vext.8    d1, d0, d0, #7
-	vaddl.u8  q1, d0, d1        //q1:{x,L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2}
+    vext.8    d1, d0, d0, #7
+    vaddl.u8  q1, d0, d1        //q1:{x,L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2}
 
-	vext.u8   q2, q1, q1, #14   //q2:{x,x, L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1}
-	vadd.u16  q3, q2, q1        //q3:{x,x,L3+L2+L2+L1,L2+L1+L1+L0,L1+L0+L0+LT,L0+LT+LT+T0,LT+T0+T0+T1,T0+T1+T1+T2}
+    vext.u8   q2, q1, q1, #14   //q2:{x,x, L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1}
+    vadd.u16  q3, q2, q1        //q3:{x,x,L3+L2+L2+L1,L2+L1+L1+L0,L1+L0+L0+LT,L0+LT+LT+T0,LT+T0+T0+T1,T0+T1+T1+T2}
 
-	//Calculate the hd0~hd9
-	vqrshrn.u16 d1, q3, #2
-	vqrshrn.u16 d0, q2, #1
+    //Calculate the hd0~hd9
+    vqrshrn.u16 d1, q3, #2
+    vqrshrn.u16 d0, q2, #1
 
-	//Adjust the data sequence for setting the luma MB
-	vmov      d3, d1
-	vtrn.8    d0, d1
-	vext.u8   d2, d1, d1, #6
-	vst2.16  {d2[3], d3[3]}, [r0]!
-	vst2.16  {d0[2], d1[2]}, [r0]!
-	vmov     d3, d0
-	vst2.16  {d2[2], d3[2]}, [r0]!
-	vst2.16  {d0[1], d1[1]}, [r0]
+    //Adjust the data sequence for setting the luma MB
+    vmov      d3, d1
+    vtrn.8    d0, d1
+    vext.u8   d2, d1, d1, #6
+    vst2.16  {d2[3], d3[3]}, [r0]!
+    vst2.16  {d0[2], d1[2]}, [r0]!
+    vmov     d3, d0
+    vst2.16  {d2[2], d3[2]}, [r0]!
+    vst2.16  {d0[1], d1[1]}, [r0]
 
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN WelsIChromaPredV_neon
-	//stmdb sp!, { r2-r5, lr}
-	//Get the top row (8 byte)
-	sub  r3, r1, r2
-	vldr d0, [r3]
+    //stmdb sp!, { r2-r5, lr}
+    //Get the top row (8 bytes)
+    sub  r3, r1, r2
+    vldr d0, [r3]
 
-	//Set the chroma MB using top row data
-	vst1.8 {d0}, [r0]!
-	vst1.8 {d0}, [r0]!
-	vst1.8 {d0}, [r0]!
-	vst1.8 {d0}, [r0]!
-	vst1.8 {d0}, [r0]!
-	vst1.8 {d0}, [r0]!
-	vst1.8 {d0}, [r0]!
-	vst1.8 {d0}, [r0]
+    //Set the chroma MB using top row data
+    vst1.8 {d0}, [r0]!
+    vst1.8 {d0}, [r0]!
+    vst1.8 {d0}, [r0]!
+    vst1.8 {d0}, [r0]!
+    vst1.8 {d0}, [r0]!
+    vst1.8 {d0}, [r0]!
+    vst1.8 {d0}, [r0]!
+    vst1.8 {d0}, [r0]
 
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN WelsIChromaPredH_neon
-	//stmdb sp!, { r2-r5, lr}
-	////Get the left column (8 byte)
-	sub  r3, r1, #1
-	vld1.8 {d0[]}, [r3], r2
-	vld1.8 {d1[]}, [r3], r2
-	vld1.8 {d2[]}, [r3], r2
-	vld1.8 {d3[]}, [r3], r2
-	vld1.8 {d4[]}, [r3], r2
-	vld1.8 {d5[]}, [r3], r2
-	vld1.8 {d6[]}, [r3], r2
-	vld1.8 {d7[]}, [r3]
+    //stmdb sp!, { r2-r5, lr}
+    //Get the left column (8 bytes)
+    sub  r3, r1, #1
+    vld1.8 {d0[]}, [r3], r2
+    vld1.8 {d1[]}, [r3], r2
+    vld1.8 {d2[]}, [r3], r2
+    vld1.8 {d3[]}, [r3], r2
+    vld1.8 {d4[]}, [r3], r2
+    vld1.8 {d5[]}, [r3], r2
+    vld1.8 {d6[]}, [r3], r2
+    vld1.8 {d7[]}, [r3]
 
-	//Set the chroma MB using left column data
-	vst1.8 {d0}, [r0]!
-	vst1.8 {d1}, [r0]!
-	vst1.8 {d2}, [r0]!
-	vst1.8 {d3}, [r0]!
-	vst1.8 {d4}, [r0]!
-	vst1.8 {d5}, [r0]!
-	vst1.8 {d6}, [r0]!
-	vst1.8 {d7}, [r0]
+    //Set the chroma MB using left column data
+    vst1.8 {d0}, [r0]!
+    vst1.8 {d1}, [r0]!
+    vst1.8 {d2}, [r0]!
+    vst1.8 {d3}, [r0]!
+    vst1.8 {d4}, [r0]!
+    vst1.8 {d5}, [r0]!
+    vst1.8 {d6}, [r0]!
+    vst1.8 {d7}, [r0]
 
 WELS_ASM_FUNC_END
 
@@ -575,73 +575,73 @@
 CONST1_GET_I_CHROMA_PRED_PLANE: .long 0xfffefffd, 0x0000ffff,0x00020001,0x00040003
 
 WELS_ASM_FUNC_BEGIN WelsIChromaPredPlane_neon
-	//stmdb sp!, { r2-r5, lr}
-	//Load the top row data
-	sub  r3, r1, #1
-	sub  r3, r2
-	vld1.32 {d1[0]}, [r3]
-	add  r3, #5
-	vld1.32 {d0[0]}, [r3]
+    //stmdb sp!, { r2-r5, lr}
+    //Load the top row data
+    sub  r3, r1, #1
+    sub  r3, r2
+    vld1.32 {d1[0]}, [r3]
+    add  r3, #5
+    vld1.32 {d0[0]}, [r3]
 
-	//Load the left column data
-	sub  r3, #5
-	vld1.8 {d1[4]}, [r3], r2
-	vld1.8 {d1[5]}, [r3], r2
-	vld1.8 {d1[6]}, [r3], r2
-	vld1.8 {d1[7]}, [r3], r2 //d1:{LT,T0,T1,T2,LT,L0,L1,L2}
-	add  r3, r2
-	vld1.8 {d0[4]}, [r3], r2
-	vld1.8 {d0[5]}, [r3], r2
-	vld1.8 {d0[6]}, [r3], r2
-	vld1.8 {d0[7]}, [r3]     //d0:{T4,T5,T6,T7,L4,L5,L6.L7}
+    //Load the left column data
+    sub  r3, #5
+    vld1.8 {d1[4]}, [r3], r2
+    vld1.8 {d1[5]}, [r3], r2
+    vld1.8 {d1[6]}, [r3], r2
+    vld1.8 {d1[7]}, [r3], r2 //d1:{LT,T0,T1,T2,LT,L0,L1,L2}
+    add  r3, r2
+    vld1.8 {d0[4]}, [r3], r2
+    vld1.8 {d0[5]}, [r3], r2
+    vld1.8 {d0[6]}, [r3], r2
+    vld1.8 {d0[7]}, [r3]     //d0:{T4,T5,T6,T7,L4,L5,L6,L7}
 
 
-	//Save T7 to d3 for next step
-	vdup.u8   d3,   d0[3]
-	//Save L7 to d4 for next step
-	vdup.u8   d4,   d0[7]
+    //Save T7 to d3 for next step
+    vdup.u8   d3,   d0[3]
+    //Save L7 to d4 for next step
+    vdup.u8   d4,   d0[7]
 
-	//Calculate the value of 'a' and save to q2
-	vaddl.u8  q2, d3, d4
-	vshl.u16  q2, #4
+    //Calculate the value of 'a' and save to q2
+    vaddl.u8  q2, d3, d4
+    vshl.u16  q2, #4
 
-	//Load the table {{1,2,3,4,1,2,3,4}*17}
-	adr r3, CONST0_GET_I_CHROMA_PRED_PLANE
-	vld1.32   {d2}, [r3]
+    //Load the table {{1,2,3,4,1,2,3,4}*17}
+    adr r3, CONST0_GET_I_CHROMA_PRED_PLANE
+    vld1.32   {d2}, [r3]
 
-	//Calculate the 'b','c', and save to q0
-	vrev32.8  d1, d1
-	vsubl.u8  q0, d0, d1
-	vmovl.u8   q1, d2
-	vmul.s16   q0, q1
-	vpaddl.s16 q0, q0
-	vpaddl.s32 q0, q0
-	vrshr.s64  q0, #5
+    //Calculate the 'b','c', and save to q0
+    vrev32.8  d1, d1
+    vsubl.u8  q0, d0, d1
+    vmovl.u8   q1, d2
+    vmul.s16   q0, q1
+    vpaddl.s16 q0, q0
+    vpaddl.s32 q0, q0
+    vrshr.s64  q0, #5
 
-	//Load the table {-3,-2,-1,0,1,2,3,4} to q3
-	adr r3, CONST1_GET_I_CHROMA_PRED_PLANE
-	vld1.32   {d6, d7}, [r3]
+    //Load the table {-3,-2,-1,0,1,2,3,4} to q3
+    adr r3, CONST1_GET_I_CHROMA_PRED_PLANE
+    vld1.32   {d6, d7}, [r3]
 
-	//Duplicate the 'b','c' to q0, q1 for SIMD instruction
-	vdup.s16   q1, d1[0]
-	vdup.s16   q0, d0[0]
+    //Duplicate the 'b','c' to q0, q1 for SIMD instruction
+    vdup.s16   q1, d1[0]
+    vdup.s16   q0, d0[0]
 
-	//Calculate the "(a + b * (j - 3) + c * (- 3) + 16) >> 5;"
-	vmla.s16   q2, q0, q3
-	vmla.s16   q2, q1, d6[0]
-	vqrshrun.s16 d0, q2, #5
+    //Calculate the "(a + b * (j - 3) + c * (- 3) + 16) >> 5;"
+    vmla.s16   q2, q0, q3
+    vmla.s16   q2, q1, d6[0]
+    vqrshrun.s16 d0, q2, #5
 
-	//Set a line of chroma MB
-	vst1.u32  {d0}, [r0]!
+    //Set a line of chroma MB
+    vst1.u32  {d0}, [r0]!
 
-	//Do the same processing for each line.
-	mov  r3, #7
+    //Do the same processing for each line.
+    mov  r3, #7
 loop_0_get_i_chroma_pred_plane:
-	vadd.s16   q2, q1
-	vqrshrun.s16 d0, q2, #5
-	vst1.u32  {d0}, [r0]!
-	subs  r3, #1
-	bne  loop_0_get_i_chroma_pred_plane
+    vadd.s16   q2, q1
+    vqrshrun.s16 d0, q2, #5
+    vst1.u32  {d0}, [r0]!
+    subs  r3, #1
+    bne  loop_0_get_i_chroma_pred_plane
 
 WELS_ASM_FUNC_END
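
The chroma plane predictor above is the 8x8 counterpart of the luma plane mode; as its comment states, each pixel of the first row is (a + b*(j-3) + c*(-3) + 16) >> 5 and every following row adds c, with b and c obtained as (17*H + 16) >> 5 and (17*V + 16) >> 5 (the factor 17 is folded into the {1,2,3,4}*17 table). A scalar C sketch (helper and parameter names are illustrative):

#include <stdint.h>

static uint8_t ClipU8 (int v) {
    return (uint8_t) (v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* top/left: the 8 reconstructed pixels above / to the left of the chroma
 * block, tl: the corner pixel, pred: packed 8x8 output. */
static void IChromaPlanePredRef (uint8_t* pred, const uint8_t* top,
                                 const uint8_t* left, uint8_t tl) {
    int h = 0, v = 0;
    for (int i = 1; i <= 4; i++) {
        h += i * (top[3 + i]  - (i == 4 ? tl : top[3 - i]));
        v += i * (left[3 + i] - (i == 4 ? tl : left[3 - i]));
    }
    int a = 16 * (top[7] + left[7]);
    int b = (17 * h + 16) >> 5;
    int c = (17 * v + 16) >> 5;
    for (int y = 0; y < 8; y++)
        for (int x = 0; x < 8; x++)
            pred[y * 8 + x] = ClipU8 ((a + b * (x - 3) + c * (y - 3) + 16) >> 5);
}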
 
--- a/codec/encoder/core/arm/intra_pred_sad_3_opt_neon.S
+++ b/codec/encoder/core/arm/intra_pred_sad_3_opt_neon.S
@@ -38,59 +38,59 @@
 #ifdef __APPLE__
  //The data sequence will be used
 .macro GET_8BYTE_DATA_L0
-	vld1.8 {$0[0]}, [$1], $2
-	vld1.8 {$0[1]}, [$1], $2
-	vld1.8 {$0[2]}, [$1], $2
-	vld1.8 {$0[3]}, [$1], $2
-	vld1.8 {$0[4]}, [$1], $2
-	vld1.8 {$0[5]}, [$1], $2
-	vld1.8 {$0[6]}, [$1], $2
-	vld1.8 {$0[7]}, [$1], $2
+    vld1.8 {$0[0]}, [$1], $2
+    vld1.8 {$0[1]}, [$1], $2
+    vld1.8 {$0[2]}, [$1], $2
+    vld1.8 {$0[3]}, [$1], $2
+    vld1.8 {$0[4]}, [$1], $2
+    vld1.8 {$0[5]}, [$1], $2
+    vld1.8 {$0[6]}, [$1], $2
+    vld1.8 {$0[7]}, [$1], $2
 .endm
 
 
 .macro HDM_TRANSFORM_4X4_L0
 
-	//Do the vertical transform
-	vaddl.u8 q0, $0, $1 //{0,4,8,12,1,5,9,13}
-	vsubl.u8 q1, $0, $1 //{2,6,10,14,3,7,11,15}
-	vswp  d1, d2
-	vadd.s16 q2, q0, q1 //{0,1,2,3,4,5,6,7}
-	vsub.s16 q1, q0, q1 //{12,13,14,15,8,9,10,11}
+    //Do the vertical transform
+    vaddl.u8 q0, $0, $1 //{0,4,8,12,1,5,9,13}
+    vsubl.u8 q1, $0, $1 //{2,6,10,14,3,7,11,15}
+    vswp  d1, d2
+    vadd.s16 q2, q0, q1 //{0,1,2,3,4,5,6,7}
+    vsub.s16 q1, q0, q1 //{12,13,14,15,8,9,10,11}
 
-	//Do the horizontal transform
-	vtrn.32 q2, q1
-	vadd.s16 q0, q2, q1
-	vsub.s16 q1, q2, q1
+    //Do the horizontal transform
+    vtrn.32 q2, q1
+    vadd.s16 q0, q2, q1
+    vsub.s16 q1, q2, q1
 
-	vtrn.16 q0, q1
-	vadd.s16 q2, q0, q1
-	vsub.s16 q1, q0, q1
+    vtrn.16 q0, q1
+    vadd.s16 q2, q0, q1
+    vsub.s16 q1, q0, q1
 
-	vmov.s16 d0, d4
-	vmov.s16 d1, d2
+    vmov.s16 d0, d4
+    vmov.s16 d1, d2
 
-	vabs.s16 d3, d3
+    vabs.s16 d3, d3
 
-	//16x16_v
-	vtrn.32 d0, d1 //{0,1,3,2}
-	vaba.s16 $5, d0, $2 //16x16_v
-	vaba.s16 $5, d1, $8
-	vaba.s16 $5, d5, $8
-	vadd.u16 $5, d3
+    //16x16_v
+    vtrn.32 d0, d1 //{0,1,3,2}
+    vaba.s16 $5, d0, $2 //16x16_v
+    vaba.s16 $5, d1, $8
+    vaba.s16 $5, d5, $8
+    vadd.u16 $5, d3
 
-	//16x16_h
-	vtrn.16 d4, d5 //{0,4,12,8}
-	vaba.s16 $6, d4, $3 //16x16_h
-	vabs.s16 d2, d2
-	vabs.s16 d5, d5
-	vadd.u16 d2, d3
-	vadd.u16 d2, d5
-	vadd.u16 $6, d2
+    //16x16_h
+    vtrn.16 d4, d5 //{0,4,12,8}
+    vaba.s16 $6, d4, $3 //16x16_h
+    vabs.s16 d2, d2
+    vabs.s16 d5, d5
+    vadd.u16 d2, d3
+    vadd.u16 d2, d5
+    vadd.u16 $6, d2
 
-	//16x16_dc_both
-	vaba.s16 $7, d4, $4 //16x16_dc_both
-	vadd.u16 $7, d2
+    //16x16_dc_both
+    vaba.s16 $7, d4, $4 //16x16_dc_both
+    vadd.u16 $7, d2
 
 .endm
 
@@ -97,58 +97,58 @@
 #else
  //The data sequence will be used
 .macro GET_8BYTE_DATA_L0 arg0, arg1, arg2
-	vld1.8 {\arg0[0]}, [\arg1], \arg2
-	vld1.8 {\arg0[1]}, [\arg1], \arg2
-	vld1.8 {\arg0[2]}, [\arg1], \arg2
-	vld1.8 {\arg0[3]}, [\arg1], \arg2
-	vld1.8 {\arg0[4]}, [\arg1], \arg2
-	vld1.8 {\arg0[5]}, [\arg1], \arg2
-	vld1.8 {\arg0[6]}, [\arg1], \arg2
-	vld1.8 {\arg0[7]}, [\arg1], \arg2
+    vld1.8 {\arg0[0]}, [\arg1], \arg2
+    vld1.8 {\arg0[1]}, [\arg1], \arg2
+    vld1.8 {\arg0[2]}, [\arg1], \arg2
+    vld1.8 {\arg0[3]}, [\arg1], \arg2
+    vld1.8 {\arg0[4]}, [\arg1], \arg2
+    vld1.8 {\arg0[5]}, [\arg1], \arg2
+    vld1.8 {\arg0[6]}, [\arg1], \arg2
+    vld1.8 {\arg0[7]}, [\arg1], \arg2
 .endm
 
 .macro HDM_TRANSFORM_4X4_L0 arg0, arg1, arg2,arg3, arg4, arg5, arg6, arg7, arg8
 
-	//Do the vertical transform
-	vaddl.u8 q0, \arg0, \arg1 //{0,4,8,12,1,5,9,13}
-	vsubl.u8 q1, \arg0, \arg1 //{2,6,10,14,3,7,11,15}
-	vswp  d1, d2
-	vadd.s16 q2, q0, q1 //{0,1,2,3,4,5,6,7}
-	vsub.s16 q1, q0, q1 //{12,13,14,15,8,9,10,11}
+    //Do the vertical transform
+    vaddl.u8 q0, \arg0, \arg1 //{0,4,8,12,1,5,9,13}
+    vsubl.u8 q1, \arg0, \arg1 //{2,6,10,14,3,7,11,15}
+    vswp  d1, d2
+    vadd.s16 q2, q0, q1 //{0,1,2,3,4,5,6,7}
+    vsub.s16 q1, q0, q1 //{12,13,14,15,8,9,10,11}
 
-	//Do the horizontal transform
-	vtrn.32 q2, q1
-	vadd.s16 q0, q2, q1
-	vsub.s16 q1, q2, q1
+    //Do the horizontal transform
+    vtrn.32 q2, q1
+    vadd.s16 q0, q2, q1
+    vsub.s16 q1, q2, q1
 
-	vtrn.16 q0, q1
-	vadd.s16 q2, q0, q1
-	vsub.s16 q1, q0, q1
+    vtrn.16 q0, q1
+    vadd.s16 q2, q0, q1
+    vsub.s16 q1, q0, q1
 
-	vmov.s16 d0, d4
-	vmov.s16 d1, d2
+    vmov.s16 d0, d4
+    vmov.s16 d1, d2
 
-	vabs.s16 d3, d3
+    vabs.s16 d3, d3
 
-	//16x16_v
-	vtrn.32 d0, d1 //{0,1,3,2}
-	vaba.s16 \arg5, d0, \arg2 //16x16_v
-	vaba.s16 \arg5, d1, \arg8
-	vaba.s16 \arg5, d5, \arg8
-	vadd.u16 \arg5, d3
+    //16x16_v
+    vtrn.32 d0, d1 //{0,1,3,2}
+    vaba.s16 \arg5, d0, \arg2 //16x16_v
+    vaba.s16 \arg5, d1, \arg8
+    vaba.s16 \arg5, d5, \arg8
+    vadd.u16 \arg5, d3
 
-	//16x16_h
-	vtrn.16 d4, d5 //{0,4,12,8}
-	vaba.s16 \arg6, d4, \arg3 //16x16_h
-	vabs.s16 d2, d2
-	vabs.s16 d5, d5
-	vadd.u16 d2, d3
-	vadd.u16 d2, d5
-	vadd.u16 \arg6, d2
+    //16x16_h
+    vtrn.16 d4, d5 //{0,4,12,8}
+    vaba.s16 \arg6, d4, \arg3 //16x16_h
+    vabs.s16 d2, d2
+    vabs.s16 d5, d5
+    vadd.u16 d2, d3
+    vadd.u16 d2, d5
+    vadd.u16 \arg6, d2
 
-	//16x16_dc_both
-	vaba.s16 \arg7, d4, \arg4 //16x16_dc_both
-	vadd.u16 \arg7, d2
+    //16x16_dc_both
+    vaba.s16 \arg7, d4, \arg4 //16x16_dc_both
+    vadd.u16 \arg7, d2
 .endm
 #endif
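
GET_8BYTE_DATA_L0 gathers one byte per row into a D register, and HDM_TRANSFORM_4X4_L0 Hadamard-transforms 4x4 source blocks and accumulates absolute transform-domain differences into per-mode cost registers (vertical, horizontal, DC). As background, a plain scalar 4x4 Hadamard SATD between a source and a prediction block looks like the sketch below; it omits the macro's register-level packing, per-mode bookkeeping and final halving (the vrshr #1 seen in the callers), and the names are illustrative:

#include <stdint.h>
#include <stdlib.h>

static int Satd4x4Ref (const uint8_t* src, int src_stride,
                       const uint8_t* pred, int pred_stride) {
    int d[16], m[16], satd = 0;
    for (int i = 0; i < 16; i++)
        d[i] = src[(i >> 2) * src_stride + (i & 3)] -
               pred[(i >> 2) * pred_stride + (i & 3)];
    for (int i = 0; i < 4; i++) {          /* vertical 4-point Hadamard */
        int s01 = d[i] + d[4 + i],      d01 = d[i] - d[4 + i];
        int s23 = d[8 + i] + d[12 + i], d23 = d[8 + i] - d[12 + i];
        m[i]      = s01 + s23;  m[4 + i]  = s01 - s23;
        m[8 + i]  = d01 + d23;  m[12 + i] = d01 - d23;
    }
    for (int i = 0; i < 4; i++) {          /* horizontal pass + |.| sum */
        int s01 = m[4 * i] + m[4 * i + 1],     d01 = m[4 * i] - m[4 * i + 1];
        int s23 = m[4 * i + 2] + m[4 * i + 3], d23 = m[4 * i + 2] - m[4 * i + 3];
        satd += abs (s01 + s23) + abs (s01 - s23)
              + abs (d01 + d23) + abs (d01 - d23);
    }
    return satd;
}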
 
@@ -156,63 +156,63 @@
     stmdb sp!, {r4-r7, lr}
     vpush {q4-q7}
 
-	//Get the top line data to 'q15'(16 bytes)
-	sub  r7, r0, r1
+    //Get the top line data to 'q15'(16 bytes)
+    sub  r7, r0, r1
     vld1.8 {q15}, [r7]
 
-	//Get the left colume data to 'q14' (16 bytes)
-	sub  r7, r0, #1
-	GET_8BYTE_DATA_L0 d28, r7, r1
-	GET_8BYTE_DATA_L0 d29, r7, r1
+    //Get the left column data to 'q14' (16 bytes)
+    sub  r7, r0, #1
+    GET_8BYTE_DATA_L0 d28, r7, r1
+    GET_8BYTE_DATA_L0 d29, r7, r1
 
-	//Calculate the mean value and save to 'q13->d27(reserve the d26)' (2 bytes)
-	//Calculate the 16x16_dc_both mode SATD
-	vaddl.u8 q0, d30, d31
-	vaddl.u8 q1, d28, d29
-	vadd.u16 q0, q1
-	vadd.u16 d0, d1
-	vpaddl.u16 d0, d0
-	vpaddl.u32 d0, d0
+    //Calculate the mean value and save to 'q13->d27(reserve the d26)' (2 bytes)
+    //Calculate the 16x16_dc_both mode SATD
+    vaddl.u8 q0, d30, d31
+    vaddl.u8 q1, d28, d29
+    vadd.u16 q0, q1
+    vadd.u16 d0, d1
+    vpaddl.u16 d0, d0
+    vpaddl.u32 d0, d0
 
-	//Calculate the mean value
-	vrshr.u16  d0, #5
-	vshl.u16   d27, d0, #4
+    //Calculate the mean value
+    vrshr.u16  d0, #5
+    vshl.u16   d27, d0, #4
 
 
-	//Calculate the 16x16_v mode SATD and save to "q11, 12"
-	vshll.u8 q0, d30, #2
-	vshll.u8 q1, d31, #2
-	vtrn.32  q0, q1
-	vadd.s16 q2, q0, q1
-	vsub.s16 q1, q0, q1
-	vtrn.16  q2, q1
-	vadd.s16 q12, q2, q1
-	vsub.s16 q11, q2, q1
-	vtrn.32  q12, q11 //{0,1,3,2, 4,5,7,6} q12
-	                  //{8,9,11,10, 12,13,15,14} q11
+    //Calculate the 16x16_v mode SATD and save to "q11, 12"
+    vshll.u8 q0, d30, #2
+    vshll.u8 q1, d31, #2
+    vtrn.32  q0, q1
+    vadd.s16 q2, q0, q1
+    vsub.s16 q1, q0, q1
+    vtrn.16  q2, q1
+    vadd.s16 q12, q2, q1
+    vsub.s16 q11, q2, q1
+    vtrn.32  q12, q11 //{0,1,3,2, 4,5,7,6} q12
+                      //{8,9,11,10, 12,13,15,14} q11
     //Calculate the 16x16_h mode SATD and save to "q9, q10"
-	vshll.u8 q0, d28, #2
-	vshll.u8 q1, d29, #2
-	vtrn.32  q0, q1
-	vadd.s16 q2, q0, q1
-	vsub.s16 q1, q0, q1
-	vtrn.16  q2, q1
-	vadd.s16 q10, q2, q1
-	vsub.s16 q9,  q2, q1
-	vtrn.32  q10, q9  //{0,1,3,2, 4,5,7,6} q10
-	                  //{8,9,11,10, 12,13,15,14} q9
+    vshll.u8 q0, d28, #2
+    vshll.u8 q1, d29, #2
+    vtrn.32  q0, q1
+    vadd.s16 q2, q0, q1
+    vsub.s16 q1, q0, q1
+    vtrn.16  q2, q1
+    vadd.s16 q10, q2, q1
+    vsub.s16 q9,  q2, q1
+    vtrn.32  q10, q9  //{0,1,3,2, 4,5,7,6} q10
+                      //{8,9,11,10, 12,13,15,14} q9
 
-	vmov.i32 d17, #0//Save the SATD of DC_BOTH
-	vmov.i32 d16, #0//Save the SATD of H
-	vmov.i32 d15, #0//Save the SATD of V
-	vmov.i32 d14, #0//For zero D register
-	//Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
-	vld1.32  {q3}, [r2], r3
-	vld1.32  {q4}, [r2], r3
-	vld1.32  {q5}, [r2], r3
-	vld1.32  {q6}, [r2], r3
-	vtrn.32  q3, q4
-	vtrn.32  q5, q6
+    vmov.i32 d17, #0//Save the SATD of DC_BOTH
+    vmov.i32 d16, #0//Save the SATD of H
+    vmov.i32 d15, #0//Save the SATD of V
+    vmov.i32 d14, #0//For zero D register
+    //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
+    vld1.32  {q3}, [r2], r3
+    vld1.32  {q4}, [r2], r3
+    vld1.32  {q5}, [r2], r3
+    vld1.32  {q6}, [r2], r3
+    vtrn.32  q3, q4
+    vtrn.32  q5, q6
 
     HDM_TRANSFORM_4X4_L0 d6, d10, d24, d20, d27, d15, d16, d17, d14
     HDM_TRANSFORM_4X4_L0 d7, d11, d22, d20, d27, d15, d16, d17, d14
@@ -219,13 +219,13 @@
     HDM_TRANSFORM_4X4_L0 d8, d12, d25, d20, d27, d15, d16, d17, d14
     HDM_TRANSFORM_4X4_L0 d9, d13, d23, d20, d27, d15, d16, d17, d14
 
-	//Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
-	vld1.32  {q3}, [r2], r3
-	vld1.32  {q4}, [r2], r3
-	vld1.32  {q5}, [r2], r3
-	vld1.32  {q6}, [r2], r3
-	vtrn.32  q3, q4
-	vtrn.32  q5, q6
+    //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
+    vld1.32  {q3}, [r2], r3
+    vld1.32  {q4}, [r2], r3
+    vld1.32  {q5}, [r2], r3
+    vld1.32  {q6}, [r2], r3
+    vtrn.32  q3, q4
+    vtrn.32  q5, q6
 
     HDM_TRANSFORM_4X4_L0 d6, d10, d24, d21, d27, d15, d16, d17, d14
     HDM_TRANSFORM_4X4_L0 d7, d11, d22, d21, d27, d15, d16, d17, d14
@@ -232,13 +232,13 @@
     HDM_TRANSFORM_4X4_L0 d8, d12, d25, d21, d27, d15, d16, d17, d14
     HDM_TRANSFORM_4X4_L0 d9, d13, d23, d21, d27, d15, d16, d17, d14
 
-	//Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
-	vld1.32  {q3}, [r2], r3
-	vld1.32  {q4}, [r2], r3
-	vld1.32  {q5}, [r2], r3
-	vld1.32  {q6}, [r2], r3
-	vtrn.32  q3, q4
-	vtrn.32  q5, q6
+    //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
+    vld1.32  {q3}, [r2], r3
+    vld1.32  {q4}, [r2], r3
+    vld1.32  {q5}, [r2], r3
+    vld1.32  {q6}, [r2], r3
+    vtrn.32  q3, q4
+    vtrn.32  q5, q6
 
     HDM_TRANSFORM_4X4_L0 d6, d10, d24, d18, d27, d15, d16, d17, d14
     HDM_TRANSFORM_4X4_L0 d7, d11, d22, d18, d27, d15, d16, d17, d14
@@ -245,13 +245,13 @@
     HDM_TRANSFORM_4X4_L0 d8, d12, d25, d18, d27, d15, d16, d17, d14
     HDM_TRANSFORM_4X4_L0 d9, d13, d23, d18, d27, d15, d16, d17, d14
 
-	//Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
-	vld1.32  {q3}, [r2], r3
-	vld1.32  {q4}, [r2], r3
-	vld1.32  {q5}, [r2], r3
-	vld1.32  {q6}, [r2], r3
-	vtrn.32  q3, q4
-	vtrn.32  q5, q6
+    //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
+    vld1.32  {q3}, [r2], r3
+    vld1.32  {q4}, [r2], r3
+    vld1.32  {q5}, [r2], r3
+    vld1.32  {q6}, [r2], r3
+    vtrn.32  q3, q4
+    vtrn.32  q5, q6
 
     HDM_TRANSFORM_4X4_L0 d6, d10, d24, d19, d27, d15, d16, d17, d14
     HDM_TRANSFORM_4X4_L0 d7, d11, d22, d19, d27, d15, d16, d17, d14
@@ -258,29 +258,29 @@
     HDM_TRANSFORM_4X4_L0 d8, d12, d25, d19, d27, d15, d16, d17, d14
     HDM_TRANSFORM_4X4_L0 d9, d13, d23, d19, d27, d15, d16, d17, d14
 
-	//Get the data from stack
-	ldr r5, [sp, #84] //the addr of Best_mode
-	ldr r6, [sp, #88] //the value of i_lambda
+    //Get the data from stack
+    ldr r5, [sp, #84] //the addr of Best_mode
+    ldr r6, [sp, #88] //the value of i_lambda
 
-	//vadd.u16   d24, d25
-	vrshr.u16  d15, #1
-	vpaddl.u16 d15, d15
-	vpaddl.u32 d15, d15
-	vmov.u32   r0, d15[0]
+    //vadd.u16   d24, d25
+    vrshr.u16  d15, #1
+    vpaddl.u16 d15, d15
+    vpaddl.u32 d15, d15
+    vmov.u32   r0, d15[0]
 
-	//vadd.u16   d22, d23
-	vrshr.u16  d16, #1
-	vpaddl.u16 d16, d16
-	vpaddl.u32 d16, d16
-	vmov.u32   r1, d16[0]
-	add  r1, r1, r6, lsl #1
+    //vadd.u16   d22, d23
+    vrshr.u16  d16, #1
+    vpaddl.u16 d16, d16
+    vpaddl.u32 d16, d16
+    vmov.u32   r1, d16[0]
+    add  r1, r1, r6, lsl #1
 
-	//vadd.u16   d20, d21
-	vrshr.u16  d17, #1
-	vpaddl.u16 d17, d17
-	vpaddl.u32 d17, d17
-	vmov.u32   r2, d17[0]
-	add  r2, r2, r6, lsl #1
+    //vadd.u16   d20, d21
+    vrshr.u16  d17, #1
+    vpaddl.u16 d17, d17
+    vpaddl.u32 d17, d17
+    vmov.u32   r2, d17[0]
+    add  r2, r2, r6, lsl #1
 
     mov r4, #0
     cmp r1, r0
@@ -300,77 +300,77 @@
 WELS_ASM_FUNC_BEGIN WelsIntra16x16Combined3Sad_neon
     stmdb sp!, {r4-r7, lr}
 
-	//Get the top line data to 'q15'(16 bytes)
-	sub  r4, r0, r1
+    //Get the top line data to 'q15'(16 bytes)
+    sub  r4, r0, r1
     vld1.8 {q15}, [r4]
 
-	//Get the left colume data to 'q14' (16 bytes)
-	sub  r4, r0, #1
-	GET_8BYTE_DATA_L0 d28, r4, r1
-	GET_8BYTE_DATA_L0 d29, r4, r1
+    //Get the left column data to 'q14' (16 bytes)
+    sub  r4, r0, #1
+    GET_8BYTE_DATA_L0 d28, r4, r1
+    GET_8BYTE_DATA_L0 d29, r4, r1
 
-	//Calculate the mean value and save to 'q13' (8 bytes)
-	//Calculate the 16x16_dc_both mode SATD
-	vaddl.u8 q0, d30, d31
-	vaddl.u8 q1, d28, d29
-	vadd.u16 q0, q1
-	vadd.u16 d0, d1
-	vpaddl.u16 d0, d0
-	vpaddl.u32 d0, d0
+    //Calculate the mean value and save to 'q13' (8 bytes)
+    //Calculate the 16x16_dc_both mode SATD
+    vaddl.u8 q0, d30, d31
+    vaddl.u8 q1, d28, d29
+    vadd.u16 q0, q1
+    vadd.u16 d0, d1
+    vpaddl.u16 d0, d0
+    vpaddl.u32 d0, d0
 
-	//Calculate the mean value
-	vrshr.u16  d0, d0, #5
-	vdup.8     q13, d0[0]
+    //Calculate the mean value
+    vrshr.u16  d0, d0, #5
+    vdup.8     q13, d0[0]
 
-	sub  r4, r0, #1
+    sub  r4, r0, #1
 
-	vmov.i32 q12, #0//Save the SATD of DC_BOTH
-	vmov.i32 q11, #0//Save the SATD of H
-	vmov.i32 q10, #0//Save the SATD of V
+    vmov.i32 q12, #0//Save the SATD of DC_BOTH
+    vmov.i32 q11, #0//Save the SATD of H
+    vmov.i32 q10, #0//Save the SATD of V
 
-	mov lr, #16
+    mov lr, #16
 sad_intra_16x16_x3_opt_loop0:
     //Get the left column data to 'd0' (16 bytes)
-	vld1.8 {d0[]}, [r4], r1
+    vld1.8 {d0[]}, [r4], r1
 
-	//Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes
-	vld1.8  {q1}, [r2], r3
+    //Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes
+    vld1.8  {q1}, [r2], r3
 
-	subs lr, #1
-	//Do the SAD for top colume
-	vabal.u8  q12, d30, d2
-	vabal.u8  q12, d31, d3
+    subs lr, #1
+    //Do the SAD for the top row
+    vabal.u8  q12, d30, d2
+    vabal.u8  q12, d31, d3
 
-	//Do the SAD for left colume
-	vabal.u8  q11, d0, d2
-	vabal.u8  q11, d0, d3
+    //Do the SAD for the left column
+    vabal.u8  q11, d0, d2
+    vabal.u8  q11, d0, d3
 
-	//Do the SAD for mean value
-	vabal.u8  q10, d26, d2
-	vabal.u8  q10, d26, d3
+    //Do the SAD for mean value
+    vabal.u8  q10, d26, d2
+    vabal.u8  q10, d26, d3
 
-	bne sad_intra_16x16_x3_opt_loop0
+    bne sad_intra_16x16_x3_opt_loop0
 
-	//Get the data from stack
-	ldr r5, [sp, #20] //the addr of Best_mode
-	ldr r6, [sp, #24] //the value of i_lambda
+    //Get the data from stack
+    ldr r5, [sp, #20] //the addr of Best_mode
+    ldr r6, [sp, #24] //the value of i_lambda
 
-	vadd.u16   d24, d25
-	vpaddl.u16 d24, d24
-	vpaddl.u32 d24, d24
-	vmov.u32   r0, d24[0]
+    vadd.u16   d24, d25
+    vpaddl.u16 d24, d24
+    vpaddl.u32 d24, d24
+    vmov.u32   r0, d24[0]
 
-	vadd.u16   d22, d23
-	vpaddl.u16 d22, d22
-	vpaddl.u32 d22, d22
-	vmov.u32   r1, d22[0]
-	add  r1, r1, r6, lsl #1
+    vadd.u16   d22, d23
+    vpaddl.u16 d22, d22
+    vpaddl.u32 d22, d22
+    vmov.u32   r1, d22[0]
+    add  r1, r1, r6, lsl #1
 
-	vadd.u16   d20, d21
-	vpaddl.u16 d20, d20
-	vpaddl.u32 d20, d20
-	vmov.u32   r2, d20[0]
-	add  r2, r2, r6, lsl #1
+    vadd.u16   d20, d21
+    vpaddl.u16 d20, d20
+    vpaddl.u32 d20, d20
+    vmov.u32   r2, d20[0]
+    add  r2, r2, r6, lsl #1
 
     mov r4, #0
     cmp r1, r0
@@ -382,7 +382,7 @@
 
     str r4, [r5]
 
-	ldmia sp!, {r4-r7, lr}
+    ldmia sp!, {r4-r7, lr}
 WELS_ASM_FUNC_END
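
The SAD variant above accumulates three costs in parallel, |enc - top row| for vertical, |enc - left column| for horizontal and |enc - mean| for DC, then biases the H and DC costs by 2*lambda before picking the minimum and writing the winning mode index. A scalar sketch of that decision (the mode numbering, tie-breaking and exact return value are inferred here, so treat them as illustrative):

#include <stdint.h>
#include <stdlib.h>

static int Intra16x16Sad3Ref (const uint8_t* rec, int rec_stride,
                              const uint8_t* enc, int enc_stride,
                              int lambda, int* best_mode) {
    const uint8_t* top = rec - rec_stride;
    int sum = 0;
    for (int x = 0; x < 16; x++) sum += top[x];
    for (int y = 0; y < 16; y++) sum += rec[y * rec_stride - 1];
    int dc = (sum + 16) >> 5;                  /* rounded mean of 32 pixels */

    int sad_v = 0, sad_h = 0, sad_dc = 0;
    for (int y = 0; y < 16; y++) {
        int left = rec[y * rec_stride - 1];
        for (int x = 0; x < 16; x++) {
            int s = enc[y * enc_stride + x];
            sad_v  += abs (s - top[x]);
            sad_h  += abs (s - left);
            sad_dc += abs (s - dc);
        }
    }
    sad_h  += 2 * lambda;
    sad_dc += 2 * lambda;

    int best = sad_v; *best_mode = 0;          /* 0: V, 1: H, 2: DC */
    if (sad_h  < best) { best = sad_h;  *best_mode = 1; }
    if (sad_dc < best) { best = sad_dc; *best_mode = 2; }
    return best;
}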
 
 
@@ -389,24 +389,24 @@
 WELS_ASM_FUNC_BEGIN WelsIntra8x8Combined3Sad_neon
     stmdb sp!, {r4-r7, lr}
 
-	//Get the data from stack
-	ldr r4, [sp, #32] //p_dec_cr
-	ldr r5, [sp, #36] //p_enc_cr
+    //Get the data from stack
+    ldr r4, [sp, #32] //p_dec_cr
+    ldr r5, [sp, #36] //p_enc_cr
 
-	//Get the left colume data to 'd28(cb), d30(cr)' (16 bytes)
-	sub  r6, r0, #1
-	GET_8BYTE_DATA_L0 d28, r6, r1
-	sub  r6, r4, #1
-	GET_8BYTE_DATA_L0 d30, r6, r1
+    //Get the left column data to 'd28(cb), d30(cr)' (16 bytes)
+    sub  r6, r0, #1
+    GET_8BYTE_DATA_L0 d28, r6, r1
+    sub  r6, r4, #1
+    GET_8BYTE_DATA_L0 d30, r6, r1
 
-	//Get the top line data to 'd29(cb), d31(cr)'(16 bytes)
-	sub  r6, r0, r1
+    //Get the top line data to 'd29(cb), d31(cr)'(16 bytes)
+    sub  r6, r0, r1
     vld1.8 {d29}, [r6]
-	sub  r6, r4, r1
+    sub  r6, r4, r1
     vld1.8 {d31}, [r6]
 
-	//Calculate the sum of left column and top row
-	vmov.i32   q0, q14
+    //Calculate the sum of left column and top row
+    vmov.i32   q0, q14
     vpaddl.u8  q0, q0
     vpaddl.u16 q0, q0
     vadd.u32   d2, d0, d1 //'m1' save to d2
@@ -416,13 +416,13 @@
     //duplicate the 'mx' to a vector line
     vdup.8     d27, d2[0]
     vdup.8     d26, d1[4]
-	vtrn.32    d27, d26
+    vtrn.32    d27, d26
 
     vdup.8     d26, d0[4]
     vdup.8     d25, d2[4]
     vtrn.32    d26, d25   //Save to "d27, d26"
 
-	vmov.i32   q0, q15
+    vmov.i32   q0, q15
     vpaddl.u8  q0, q0
     vpaddl.u16 q0, q0
     vadd.u32   d2, d0, d1 //'m1' save to d2
@@ -432,94 +432,94 @@
     //duplicate the 'mx' to a vector line
     vdup.8     d25, d2[0]
     vdup.8     d24, d1[4]
-	vtrn.32    d25, d24
+    vtrn.32    d25, d24
 
     vdup.8     d24, d0[4]
     vdup.8     d23, d2[4]
-	vtrn.32    d24, d23   //Save to "d25, d24"
+    vtrn.32    d24, d23   //Save to "d25, d24"
 
-	vmov.i32 q11, #0//Save the SATD of DC_BOTH
-	vmov.i32 q10, #0//Save the SATD of H
-	vmov.i32 q9 , #0//Save the SATD of V
-	sub  r6, r0, #1
-	sub  r7, r4, #1
-	mov lr, #4
+    vmov.i32 q11, #0//Save the SATD of DC_BOTH
+    vmov.i32 q10, #0//Save the SATD of H
+    vmov.i32 q9 , #0//Save the SATD of V
+    sub  r6, r0, #1
+    sub  r7, r4, #1
+    mov lr, #4
 sad_intra_8x8_x3_opt_loop0:
 
-	//Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes
-	vld1.8  {d0}, [r2], r3
-	vld1.8  {d1}, [r5], r3
+    //Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes
+    vld1.8  {d0}, [r2], r3
+    vld1.8  {d1}, [r5], r3
 
     //Get the left column data to 'd0' (16 bytes)
-	vld1.8 {d2[]}, [r6], r1
-	vld1.8 {d3[]}, [r7], r1
+    vld1.8 {d2[]}, [r6], r1
+    vld1.8 {d3[]}, [r7], r1
 
-	subs lr, #1
+    subs lr, #1
 
 
-	//Do the SAD for top colume
-	vabal.u8  q11, d29, d0
-	vabal.u8  q11, d31, d1
+    //Do the SAD for the top row
+    vabal.u8  q11, d29, d0
+    vabal.u8  q11, d31, d1
 
-	//Do the SAD for left colume
-	vabal.u8  q10, d2, d0
-	vabal.u8  q10, d3, d1
+    //Do the SAD for the left column
+    vabal.u8  q10, d2, d0
+    vabal.u8  q10, d3, d1
 
-	//Do the SAD for mean value
-	vabal.u8  q9, d27, d0
-	vabal.u8  q9, d25, d1
+    //Do the SAD for mean value
+    vabal.u8  q9, d27, d0
+    vabal.u8  q9, d25, d1
 
 
-	bne sad_intra_8x8_x3_opt_loop0
+    bne sad_intra_8x8_x3_opt_loop0
 
-	mov lr, #4
+    mov lr, #4
 sad_intra_8x8_x3_opt_loop1:
 
-	//Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes
-	vld1.8  {d0}, [r2], r3
-	vld1.8  {d1}, [r5], r3
+    //Load the p_enc data and save to "q1 ~ q2"--- 16X4 bytes
+    vld1.8  {d0}, [r2], r3
+    vld1.8  {d1}, [r5], r3
 
     //Get the left column data to 'd0' (16 bytes)
-	vld1.8 {d2[]}, [r6], r1
-	vld1.8 {d3[]}, [r7], r1
+    vld1.8 {d2[]}, [r6], r1
+    vld1.8 {d3[]}, [r7], r1
 
-	subs lr, #1
+    subs lr, #1
 
 
-	//Do the SAD for top colume
-	vabal.u8  q11, d29, d0
-	vabal.u8  q11, d31, d1
+    //Do the SAD for the top row
+    vabal.u8  q11, d29, d0
+    vabal.u8  q11, d31, d1
 
-	//Do the SAD for left colume
-	vabal.u8  q10, d2, d0
-	vabal.u8  q10, d3, d1
+    //Do the SAD for the left column
+    vabal.u8  q10, d2, d0
+    vabal.u8  q10, d3, d1
 
-	//Do the SAD for mean value
-	vabal.u8  q9, d26, d0
-	vabal.u8  q9, d24, d1
+    //Do the SAD for mean value
+    vabal.u8  q9, d26, d0
+    vabal.u8  q9, d24, d1
 
 
-	bne sad_intra_8x8_x3_opt_loop1
-	//Get the data from stack
-	ldr r5, [sp, #20] //the addr of Best_mode
-	ldr r6, [sp, #24] //the value of i_lambda
+    bne sad_intra_8x8_x3_opt_loop1
+    //Get the data from stack
+    ldr r5, [sp, #20] //the addr of Best_mode
+    ldr r6, [sp, #24] //the value of i_lambda
 
-	vadd.u16   d22, d23
-	vpaddl.u16 d22, d22
-	vpaddl.u32 d22, d22
-	vmov.u32   r0, d22[0]
-	add  r0, r0, r6, lsl #1
+    vadd.u16   d22, d23
+    vpaddl.u16 d22, d22
+    vpaddl.u32 d22, d22
+    vmov.u32   r0, d22[0]
+    add  r0, r0, r6, lsl #1
 
-	vadd.u16   d20, d21
-	vpaddl.u16 d20, d20
-	vpaddl.u32 d20, d20
-	vmov.u32   r1, d20[0]
-	add  r1, r1, r6, lsl #1
+    vadd.u16   d20, d21
+    vpaddl.u16 d20, d20
+    vpaddl.u32 d20, d20
+    vmov.u32   r1, d20[0]
+    add  r1, r1, r6, lsl #1
 
-	vadd.u16   d18, d19
-	vpaddl.u16 d18, d18
-	vpaddl.u32 d18, d18
-	vmov.u32   r2, d18[0]
+    vadd.u16   d18, d19
+    vpaddl.u16 d18, d18
+    vpaddl.u32 d18, d18
+    vmov.u32   r2, d18[0]
 
     mov r4, #2
     cmp r1, r0
@@ -531,7 +531,7 @@
 
     str r4, [r5]
 
-	ldmia sp!, {r4-r7, lr}
+    ldmia sp!, {r4-r7, lr}
 WELS_ASM_FUNC_END
 
 
@@ -539,47 +539,47 @@
     stmdb sp!, {r4-r7, lr}
     vpush {q4-q7}
 
-	//Get the data from stack
-	ldr r4, [sp, #96] //p_dec_cr
-	ldr r5, [sp, #100] //p_enc_cr
+    //Get the data from stack
+    ldr r4, [sp, #96] //p_dec_cr
+    ldr r5, [sp, #100] //p_enc_cr
 
-	//Get the top line data to 'd29(cb), d31(cr)'(16 bytes)
-	sub  r6, r0, r1
+    //Get the top line data to 'd29(cb), d31(cr)'(16 bytes)
+    sub  r6, r0, r1
     vld1.8 {d29}, [r6]
-	sub  r6, r4, r1
+    sub  r6, r4, r1
     vld1.8 {d31}, [r6]
 
-	//Get the left colume data to 'd28(cb), d30(cr)' (16 bytes)
-	sub  r6, r0, #1
-	GET_8BYTE_DATA_L0 d28, r6, r1
-	sub  r6, r4, #1
-	GET_8BYTE_DATA_L0 d30, r6, r1
+    //Get the left column data to 'd28(cb), d30(cr)' (16 bytes)
+    sub  r6, r0, #1
+    GET_8BYTE_DATA_L0 d28, r6, r1
+    sub  r6, r4, #1
+    GET_8BYTE_DATA_L0 d30, r6, r1
 
-	//Calculate the 16x16_v mode SATD and save to "q12, 13"
-	vshll.u8 q0, d29, #2
-	vshll.u8 q1, d31, #2
-	vtrn.32  q0, q1
-	vadd.s16 q2, q0, q1
-	vsub.s16 q1, q0, q1
-	vtrn.16  q2, q1
-	vadd.s16 q13, q2, q1
-	vsub.s16 q12, q2, q1
-	vtrn.32  q13, q12 //{0,1,3,2, 4,5,7,6} q13
-	                  //{8,9,11,10, 12,13,15,14} q12
+    //Calculate the 16x16_v mode SATD and save to "q12, 13"
+    vshll.u8 q0, d29, #2
+    vshll.u8 q1, d31, #2
+    vtrn.32  q0, q1
+    vadd.s16 q2, q0, q1
+    vsub.s16 q1, q0, q1
+    vtrn.16  q2, q1
+    vadd.s16 q13, q2, q1
+    vsub.s16 q12, q2, q1
+    vtrn.32  q13, q12 //{0,1,3,2, 4,5,7,6} q13
+                      //{8,9,11,10, 12,13,15,14} q12
     //Calculate the 16x16_h mode SATD and save to "q10, q11"
-	vshll.u8 q0, d28, #2
-	vshll.u8 q1, d30, #2
-	vtrn.32  q0, q1
-	vadd.s16 q2, q0, q1
-	vsub.s16 q1, q0, q1
-	vtrn.16  q2, q1
-	vadd.s16 q11, q2, q1
-	vsub.s16 q10,  q2, q1
-	vtrn.32  q11, q10  //{0,1,3,2, 4,5,7,6} q11
-	                   //{8,9,11,10, 12,13,15,14} q10
+    vshll.u8 q0, d28, #2
+    vshll.u8 q1, d30, #2
+    vtrn.32  q0, q1
+    vadd.s16 q2, q0, q1
+    vsub.s16 q1, q0, q1
+    vtrn.16  q2, q1
+    vadd.s16 q11, q2, q1
+    vsub.s16 q10,  q2, q1
+    vtrn.32  q11, q10  //{0,1,3,2, 4,5,7,6} q11
+                       //{8,9,11,10, 12,13,15,14} q10
 
-	//Calculate the sum of left column and top row
-	//vmov.i32   q0, q14
+    //Calculate the sum of left column and top row
+    //vmov.i32   q0, q14
     vpaddl.u8  q0, q14
     vpaddl.u16 q0, q0
     vadd.u32   d2, d0, d1
@@ -588,77 +588,77 @@
     vpaddl.u16 q2, q2
     vadd.u32   d3, d4, d5
 
-	vtrn.32    q0, q2
-	vrshr.u32  q1, #3
-	vrshr.u32  q2, #2
-	vshll.u32  q9, d4, #4 // {2cb, 2cr} q9
-	vshll.u32  q8, d5, #4 // {1cb, 1cr} q8
-	vshll.u32  q7, d2, #4 // {0cb, 3cb} q7
-	vshll.u32  q6, d3, #4 // {0cr, 3cr} q6
+    vtrn.32    q0, q2
+    vrshr.u32  q1, #3
+    vrshr.u32  q2, #2
+    vshll.u32  q9, d4, #4 // {2cb, 2cr} q9
+    vshll.u32  q8, d5, #4 // {1cb, 1cr} q8
+    vshll.u32  q7, d2, #4 // {0cb, 3cb} q7
+    vshll.u32  q6, d3, #4 // {0cr, 3cr} q6
 
 
     vmov.i32 d28, #0//Save the SATD of DC_BOTH
-	vmov.i32 d10, #0//Save the SATD of H
-	vmov.i32 d11, #0//Save the SATD of V
-	vmov.i32 d30, #0//For zero D register
-	//Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
-	vld1.32  {d6}, [r2], r3
-	vld1.32  {d7}, [r2], r3
-	vld1.32  {d8}, [r2], r3
-	vld1.32  {d9}, [r2], r3
-	vtrn.32  d6, d7
-	vtrn.32  d8, d9
+    vmov.i32 d10, #0//Save the SATD of H
+    vmov.i32 d11, #0//Save the SATD of V
+    vmov.i32 d30, #0//For zero D register
+    //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
+    vld1.32  {d6}, [r2], r3
+    vld1.32  {d7}, [r2], r3
+    vld1.32  {d8}, [r2], r3
+    vld1.32  {d9}, [r2], r3
+    vtrn.32  d6, d7
+    vtrn.32  d8, d9
     HDM_TRANSFORM_4X4_L0 d6, d8, d26, d22, d14, d11, d10, d28, d30
     HDM_TRANSFORM_4X4_L0 d7, d9, d27, d22, d16, d11, d10, d28, d30
 
-	vld1.32  {d6}, [r5], r3
-	vld1.32  {d7}, [r5], r3
-	vld1.32  {d8}, [r5], r3
-	vld1.32  {d9}, [r5], r3
-	vtrn.32  d6, d7
-	vtrn.32  d8, d9
+    vld1.32  {d6}, [r5], r3
+    vld1.32  {d7}, [r5], r3
+    vld1.32  {d8}, [r5], r3
+    vld1.32  {d9}, [r5], r3
+    vtrn.32  d6, d7
+    vtrn.32  d8, d9
     HDM_TRANSFORM_4X4_L0 d6, d8, d24, d20, d12, d11, d10, d28, d30
     HDM_TRANSFORM_4X4_L0 d7, d9, d25, d20, d17, d11, d10, d28, d30
 
-	//Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
-	vld1.32  {d6}, [r2], r3
-	vld1.32  {d7}, [r2], r3
-	vld1.32  {d8}, [r2], r3
-	vld1.32  {d9}, [r2], r3
-	vtrn.32  d6, d7
-	vtrn.32  d8, d9
+    //Load the p_enc data and save to "q3 ~ q6"--- 16X4 bytes
+    vld1.32  {d6}, [r2], r3
+    vld1.32  {d7}, [r2], r3
+    vld1.32  {d8}, [r2], r3
+    vld1.32  {d9}, [r2], r3
+    vtrn.32  d6, d7
+    vtrn.32  d8, d9
     HDM_TRANSFORM_4X4_L0 d6, d8, d26, d23, d18, d11, d10, d28, d30
     HDM_TRANSFORM_4X4_L0 d7, d9, d27, d23, d15, d11, d10, d28, d30
 
-	vld1.32  {d6}, [r5], r3
-	vld1.32  {d7}, [r5], r3
-	vld1.32  {d8}, [r5], r3
-	vld1.32  {d9}, [r5], r3
-	vtrn.32  d6, d7
-	vtrn.32  d8, d9
+    vld1.32  {d6}, [r5], r3
+    vld1.32  {d7}, [r5], r3
+    vld1.32  {d8}, [r5], r3
+    vld1.32  {d9}, [r5], r3
+    vtrn.32  d6, d7
+    vtrn.32  d8, d9
     HDM_TRANSFORM_4X4_L0 d6, d8, d24, d21, d19, d11, d10, d28, d30
     HDM_TRANSFORM_4X4_L0 d7, d9, d25, d21, d13, d11, d10, d28, d30
 
-	//Get the data from stack
-	ldr r5, [sp, #84] //the addr of Best_mode
-	ldr r6, [sp, #88] //the value of i_lambda
+    //Get the data from stack
+    ldr r5, [sp, #84] //the addr of Best_mode
+    ldr r6, [sp, #88] //the value of i_lambda
 
-	vrshr.u16  d11, #1
-	vpaddl.u16 d11, d11
-	vpaddl.u32 d11, d11
-	vmov.u32   lr, d11[0]
-	add  lr, lr, r6, lsl #1
+    vrshr.u16  d11, #1
+    vpaddl.u16 d11, d11
+    vpaddl.u32 d11, d11
+    vmov.u32   lr, d11[0]
+    add  lr, lr, r6, lsl #1
 
-	vrshr.u16  d10, #1
-	vpaddl.u16 d10, d10
-	vpaddl.u32 d10, d10
-	vmov.u32   r3, d10[0]
-	add  r3, r3, r6, lsl #1
+    vrshr.u16  d10, #1
+    vpaddl.u16 d10, d10
+    vpaddl.u32 d10, d10
+    vmov.u32   r3, d10[0]
+    add  r3, r3, r6, lsl #1
 
-	vrshr.u16  d28, #1
-	vpaddl.u16 d28, d28
-	vpaddl.u32 d28, d28
-	vmov.u32   r2, d28[0]
+    vrshr.u16  d28, #1
+    vpaddl.u16 d28, d28
+    vpaddl.u32 d28, d28
+    vmov.u32   r2, d28[0]
 
     mov r6, #2
     cmp r3, lr
@@ -671,8 +671,8 @@
     str r6, [r5]
     mov r0, lr
 
-	vpop {q4-q7}
-	ldmia sp!, {r4-r7, lr}
+    vpop {q4-q7}
+    ldmia sp!, {r4-r7, lr}
 WELS_ASM_FUNC_END
 
 
@@ -680,118 +680,118 @@
     stmdb sp!, {r4-r7, lr}
 
     //Get the top line data to 'd31[0~3]'(4 bytes)
-	sub  r7, r0, r1
+    sub  r7, r0, r1
     vld1.32 {d31[0]}, [r7]
 
-	//Get the left colume data to 'd31[4~7]' (4 bytes)
-	sub  r7, r0, #1
+    //Get the left column data to 'd31[4~7]' (4 bytes)
+    sub  r7, r0, #1
     vld1.8 {d31[4]}, [r7], r1
     vld1.8 {d31[5]}, [r7], r1
     vld1.8 {d31[6]}, [r7], r1
     vld1.8 {d31[7]}, [r7], r1
 
-	//Calculate the mean value and save to 'd30' (2 bytes)
-	vpaddl.u8 d0, d31
-	vpaddl.u16 d0, d0
-	vpaddl.u32 d0, d0
-	//Calculate the mean value
-	vrshr.u16  d0, #3
-	vshl.u16   d30, d0, #4
+    //Calculate the mean value and save to 'd30' (2 bytes)
+    vpaddl.u8 d0, d31
+    vpaddl.u16 d0, d0
+    vpaddl.u32 d0, d0
+    //Calculate the mean value
+    vrshr.u16  d0, #3
+    vshl.u16   d30, d0, #4
 
-	//Calculate the 16x16_v mode SATD and save to "d29"
+    //Calculate the 16x16_v mode SATD and save to "d29"
     //Calculate the 16x16_h mode SATD and save to "d28"
-	vshll.u8 q0, d31, #2
-	vtrn.32  d0, d1
-	vadd.s16 d2, d0, d1
-	vsub.s16 d1, d0, d1
-	vtrn.16  d2, d1
-	vadd.s16 d29, d2, d1
-	vsub.s16 d28, d2, d1
-	vtrn.32  d29, d28 //{0,1,3,2 top} d29
-	                  //{0,1,3,2 left} d28
+    vshll.u8 q0, d31, #2
+    vtrn.32  d0, d1
+    vadd.s16 d2, d0, d1
+    vsub.s16 d1, d0, d1
+    vtrn.16  d2, d1
+    vadd.s16 d29, d2, d1
+    vsub.s16 d28, d2, d1
+    vtrn.32  d29, d28 //{0,1,3,2 top} d29
+                      //{0,1,3,2 left} d28
 
     vmov.i32 d27, #0//Save the SATD of DC_BOTH
-	vmov.i32 d26, #0//Save the SATD of H
-	vmov.i32 d25, #0//Save the SATD of V
-	vmov.i32 d24, #0//For zero D register
+    vmov.i32 d26, #0//Save the SATD of H
+    vmov.i32 d25, #0//Save the SATD of V
+    vmov.i32 d24, #0//For zero D register
 
-	//Load the p_enc data and save to "d22,d23"--- 4X4 bytes
-	vld1.32  {d23[0]}, [r2], r3
-	vld1.32  {d23[1]}, [r2], r3
-	vld1.32  {d22[0]}, [r2], r3
-	vld1.32  {d22[1]}, [r2], r3
+    //Load the p_enc data and save to "d22,d23"--- 4X4 bytes
+    vld1.32  {d23[0]}, [r2], r3
+    vld1.32  {d23[1]}, [r2], r3
+    vld1.32  {d22[0]}, [r2], r3
+    vld1.32  {d22[1]}, [r2], r3
 
     HDM_TRANSFORM_4X4_L0 d23, d22, d29, d28, d30, d25, d26, d27, d24
 
-	//Get the data from stack
-	ldr r5, [sp, #28] //the value of lambda2
-	ldr r6, [sp, #32] //the value of lambda1
-	ldr r7, [sp, #36] //the value of lambda0
+    //Get the data from stack
+    ldr r5, [sp, #28] //the value of lambda2
+    ldr r6, [sp, #32] //the value of lambda1
+    ldr r7, [sp, #36] //the value of lambda0
 
-	vrshr.u16  d25, #1
-	vpaddl.u16 d25, d25
-	vpaddl.u32 d25, d25
-	vmov.u32   r0, d25[0]
-	add  r0, r7
+    vrshr.u16  d25, #1
+    vpaddl.u16 d25, d25
+    vpaddl.u32 d25, d25
+    vmov.u32   r0, d25[0]
+    add  r0, r7
 
-	vrshr.u16  d26, #1
-	vpaddl.u16 d26, d26
-	vpaddl.u32 d26, d26
-	vmov.u32   r1, d26[0]
-	add  r1, r6
+    vrshr.u16  d26, #1
+    vpaddl.u16 d26, d26
+    vpaddl.u32 d26, d26
+    vmov.u32   r1, d26[0]
+    add  r1, r6
 
-	vrshr.u16  d27, #1
-	vpaddl.u16 d27, d27
-	vpaddl.u32 d27, d27
-	vmov.u32   r2, d27[0]
-	add  r2, r5
+    vrshr.u16  d27, #1
+    vpaddl.u16 d27, d27
+    vpaddl.u32 d27, d27
+    vmov.u32   r2, d27[0]
+    add  r2, r5
 
-	ldr r5, [sp, #20] //p_dst
-	ldr r6, [sp, #24] //the addr of Best_mode
+    ldr r5, [sp, #20] //p_dst
+    ldr r6, [sp, #24] //the addr of Best_mode
 
-	mov r4, r0
-	cmp r1, r4
-	movcc r4, r1
-	cmp r2, r4
-	movcc r4, r2
+    mov r4, r0
+    cmp r1, r4
+    movcc r4, r1
+    cmp r2, r4
+    movcc r4, r2
 
-	//The compare sequence affect the resule
-	cmp r4, r2
-	bne satd_intra_4x4_x3_opt_jump0
-	mov r0, #2
-	str r0, [r6]
-	vshr.u32  d0, d30, #4 // {2cb, 2cr} q9
-	vdup.8 q1, d0[0]
-	vst1.8 {q1}, [r5]
-	//...
-	bl satd_intra_4x4_x3_opt_end
+    //The compare sequence affects the result
+    cmp r4, r2
+    bne satd_intra_4x4_x3_opt_jump0
+    mov r0, #2
+    str r0, [r6]
+    vshr.u32  d0, d30, #4 // {2cb, 2cr} q9
+    vdup.8 q1, d0[0]
+    vst1.8 {q1}, [r5]
+    //...
+    bl satd_intra_4x4_x3_opt_end
 satd_intra_4x4_x3_opt_jump0:
 
-	cmp r4, r1
-	bne satd_intra_4x4_x3_opt_jump1
-	mov r0, #1
-	str r0, [r6]
-	vdup.8 d0, d31[4]
-	vdup.8 d1, d31[5]
-	vdup.8 d2, d31[6]
-	vdup.8 d3, d31[7]
-	vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r5]
+    cmp r4, r1
+    bne satd_intra_4x4_x3_opt_jump1
+    mov r0, #1
+    str r0, [r6]
+    vdup.8 d0, d31[4]
+    vdup.8 d1, d31[5]
+    vdup.8 d2, d31[6]
+    vdup.8 d3, d31[7]
+    vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r5]
 
-	bl satd_intra_4x4_x3_opt_end
+    bl satd_intra_4x4_x3_opt_end
 satd_intra_4x4_x3_opt_jump1:
 
-	mov r0, #0
-	str r0, [r6]
-	vst1.32 {d31[0]}, [r5]!
-	vst1.32 {d31[0]}, [r5]!
-	vst1.32 {d31[0]}, [r5]!
-	vst1.32 {d31[0]}, [r5]!
+    mov r0, #0
+    str r0, [r6]
+    vst1.32 {d31[0]}, [r5]!
+    vst1.32 {d31[0]}, [r5]!
+    vst1.32 {d31[0]}, [r5]!
+    vst1.32 {d31[0]}, [r5]!
 
 
 satd_intra_4x4_x3_opt_end:
-	mov r0, r4
+    mov r0, r4
 
-	ldmia sp!, {r4-r7, lr}
+    ldmia sp!, {r4-r7, lr}
 WELS_ASM_FUNC_END
 
 #endif
--- a/codec/encoder/core/arm/pixel_neon.S
+++ b/codec/encoder/core/arm/pixel_neon.S
@@ -66,10 +66,10 @@
     vsub.s16    q3, q12, q13
 
     vadd.s16    q8, q10, q11
-    vsub.s16	q9, q10, q11
+    vsub.s16    q9, q10, q11
 
     vadd.s16    q10, q14, q15
-    vsub.s16	q11, q14, q15
+    vsub.s16    q11, q14, q15
 
     vadd.s16    q12, q0, q2
     vsub.s16    q14, q0, q2
@@ -372,28 +372,28 @@
 WELS_ASM_FUNC_BEGIN WelsSampleSad4x4_neon
     stmdb sp!, {r4-r5, lr}
 
-	//Loading a horizontal line data (4 bytes)
-	//line 0
-	ldr r4, [r0], r1
-	ldr r5, [r2], r3
-	usad8  lr, r4, r5
+    //Loading a horizontal line data (4 bytes)
+    //line 0
+    ldr r4, [r0], r1
+    ldr r5, [r2], r3
+    usad8  lr, r4, r5
 
     //line 1
-	ldr r4, [r0], r1
-	ldr r5, [r2], r3
-	usada8  lr, r4, r5, lr
+    ldr r4, [r0], r1
+    ldr r5, [r2], r3
+    usada8  lr, r4, r5, lr
 
     //line 2
-	ldr r4, [r0], r1
-	ldr r5, [r2], r3
-	usada8  lr, r4, r5, lr
+    ldr r4, [r0], r1
+    ldr r5, [r2], r3
+    usada8  lr, r4, r5, lr
 
-	//line 3
-	ldr r4, [r0]
-	ldr r5, [r2]
-	usada8  r0, r4, r5, lr
+    //line 3
+    ldr r4, [r0]
+    ldr r5, [r2]
+    usada8  r0, r4, r5, lr
 
-	ldmia sp!, {r4-r5, lr}
+    ldmia sp!, {r4-r5, lr}
 WELS_ASM_FUNC_END
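
WelsSampleSad4x4_neon above computes a plain 4x4 sum of absolute differences, loading each 4-byte row of both blocks as a 32-bit word and accumulating with usad8/usada8. The scalar equivalent (stride names are illustrative):

#include <stdint.h>
#include <stdlib.h>

static int Sad4x4Ref (const uint8_t* a, int stride_a,
                      const uint8_t* b, int stride_b) {
    int sad = 0;
    for (int y = 0; y < 4; y++)
        for (int x = 0; x < 4; x++)
            sad += abs (a[y * stride_a + x] - b[y * stride_b + x]);
    return sad;
}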
 
 
@@ -401,76 +401,76 @@
 
     stmdb sp!, {r4-r5, lr}
 
-	//Generate the pix2 start addr
-	sub   r4, r2, #1
-	add   r5, r2, #1
-	sub   r2, r3
+    //Generate the pix2 start addr
+    sub   r4, r2, #1
+    add   r5, r2, #1
+    sub   r2, r3
 
     //Loading a horizontal line data (16 bytes)
-	vld1.8 {q0}, [r0], r1 //save pix1
+    vld1.8 {q0}, [r0], r1 //save pix1
 
-	vld1.8 {q1}, [r2], r3 //save pix2 - stride
-	vld1.8 {q10}, [r2], r3 //save pix2
-	vld1.8 {q2}, [r2], r3 //save pix2 + stride
+    vld1.8 {q1}, [r2], r3 //save pix2 - stride
+    vld1.8 {q10}, [r2], r3 //save pix2
+    vld1.8 {q2}, [r2], r3 //save pix2 + stride
 
-	vld1.8 {q3}, [r4], r3 //save pix2 - 1
-	vld1.8 {q8}, [r5], r3 //save pix2 + 1
+    vld1.8 {q3}, [r4], r3 //save pix2 - 1
+    vld1.8 {q8}, [r5], r3 //save pix2 + 1
 
-	//Do the SAD for 16 bytes
-	vabdl.u8  q15, d0, d2
-	vabal.u8  q15, d1, d3
+    //Do the SAD for 16 bytes
+    vabdl.u8  q15, d0, d2
+    vabal.u8  q15, d1, d3
 
-	vabdl.u8  q13, d0, d4
-	vabal.u8  q13, d1, d5
+    vabdl.u8  q13, d0, d4
+    vabal.u8  q13, d1, d5
 
-	vabdl.u8  q11, d0, d6
-	vabal.u8  q11, d1, d7
+    vabdl.u8  q11, d0, d6
+    vabal.u8  q11, d1, d7
 
-	vabdl.u8  q9, d0, d16
-	vabal.u8  q9, d1, d17
+    vabdl.u8  q9, d0, d16
+    vabal.u8  q9, d1, d17
 
-	mov lr, #15
+    mov lr, #15
 pixel_sad_4_16x16_loop_0:
 
     //Loading a horizontal line data (16 bytes)
-	vld1.8 {q0}, [r0], r1 //save pix1
-	vmov.8 q1,   q10      //save pix2 - stride
-	vmov.8 q10,  q2
-	vabal.u8  q15, d0, d2
-	vld1.8 {q2}, [r2], r3 //save pix2 + stride
-	vabal.u8  q15, d1, d3
-	vld1.8 {q3}, [r4], r3 //save pix2 - 1
-	vabal.u8  q13, d0, d4
-	vld1.8 {q8}, [r5], r3 //save pix2 + 1
+    vld1.8 {q0}, [r0], r1 //save pix1
+    vmov.8 q1,   q10      //save pix2 - stride
+    vmov.8 q10,  q2
+    vabal.u8  q15, d0, d2
+    vld1.8 {q2}, [r2], r3 //save pix2 + stride
+    vabal.u8  q15, d1, d3
+    vld1.8 {q3}, [r4], r3 //save pix2 - 1
+    vabal.u8  q13, d0, d4
+    vld1.8 {q8}, [r5], r3 //save pix2 + 1
     vabal.u8  q13, d1, d5
-	subs lr, #1
+    subs lr, #1
 
-	vabal.u8  q11, d0, d6
-	vabal.u8  q11, d1, d7
+    vabal.u8  q11, d0, d6
+    vabal.u8  q11, d1, d7
 
-	vabal.u8  q9, d0, d16
-	vabal.u8  q9, d1, d17
+    vabal.u8  q9, d0, d16
+    vabal.u8  q9, d1, d17
 
-	bne pixel_sad_4_16x16_loop_0
+    bne pixel_sad_4_16x16_loop_0
 
 
     //Save SAD to 'r0'
-	ldr   r0, [sp, #12]
+    ldr   r0, [sp, #12]
 
-	vadd.u16   d0, d30, d31
-	vadd.u16   d1, d26, d27
-	vadd.u16   d2, d22, d23
-	vadd.u16   d3, d18, d19
+    vadd.u16   d0, d30, d31
+    vadd.u16   d1, d26, d27
+    vadd.u16   d2, d22, d23
+    vadd.u16   d3, d18, d19
 
-	vpaddl.u16 q0, q0
-	vpaddl.u16 q1, q1
+    vpaddl.u16 q0, q0
+    vpaddl.u16 q1, q1
 
-	vpaddl.u32 q0, q0
-	vpaddl.u32 q1, q1
+    vpaddl.u32 q0, q0
+    vpaddl.u32 q1, q1
 
-	vst4.32    {d0[0],d1[0],d2[0],d3[0]}, [r0]
+    vst4.32    {d0[0],d1[0],d2[0],d3[0]}, [r0]
 
-	ldmia sp!, {r4-r5, lr}
+    ldmia sp!, {r4-r5, lr}
 WELS_ASM_FUNC_END
 
 
@@ -477,75 +477,75 @@
 WELS_ASM_FUNC_BEGIN WelsSampleSadFour16x8_neon
     stmdb sp!, {r4-r5, lr}
 
-	//Generate the pix2 start addr
-	sub   r4, r2, #1
-	add   r5, r2, #1
-	sub   r2, r3
+    //Generate the pix2 start addr
+    sub   r4, r2, #1
+    add   r5, r2, #1
+    sub   r2, r3
 
     //Loading a horizontal line data (16 bytes)
-	vld1.8 {q0}, [r0], r1 //save pix1
+    vld1.8 {q0}, [r0], r1 //save pix1
 
-	vld1.8 {q1}, [r2], r3 //save pix2 - stride
-	vld1.8 {q10}, [r2], r3 //save pix2
-	vld1.8 {q2}, [r2], r3 //save pix2 + stride
+    vld1.8 {q1}, [r2], r3 //save pix2 - stride
+    vld1.8 {q10}, [r2], r3 //save pix2
+    vld1.8 {q2}, [r2], r3 //save pix2 + stride
 
-	vld1.8 {q3}, [r4], r3 //save pix2 - 1
-	vld1.8 {q8}, [r5], r3 //save pix2 + 1
+    vld1.8 {q3}, [r4], r3 //save pix2 - 1
+    vld1.8 {q8}, [r5], r3 //save pix2 + 1
 
-	//Do the SAD for 16 bytes
-	vabdl.u8  q15, d0, d2
-	vabal.u8  q15, d1, d3
+    //Do the SAD for 16 bytes
+    vabdl.u8  q15, d0, d2
+    vabal.u8  q15, d1, d3
 
-	vabdl.u8  q13, d0, d4
-	vabal.u8  q13, d1, d5
+    vabdl.u8  q13, d0, d4
+    vabal.u8  q13, d1, d5
 
-	vabdl.u8  q11, d0, d6
-	vabal.u8  q11, d1, d7
+    vabdl.u8  q11, d0, d6
+    vabal.u8  q11, d1, d7
 
-	vabdl.u8  q9, d0, d16
-	vabal.u8  q9, d1, d17
+    vabdl.u8  q9, d0, d16
+    vabal.u8  q9, d1, d17
 
-	mov lr, #7
+    mov lr, #7
 pixel_sad_4_16x8_loop_0:
 
     //Loading a horizontal line data (16 bytes)
-	vld1.8 {q0}, [r0], r1 //save pix1
-	vmov.8 q1,   q10      //save pix2 - stride
-	vmov.8 q10,  q2
-	vabal.u8  q15, d0, d2
-	vld1.8 {q2}, [r2], r3 //save pix2 + stride
-	vabal.u8  q15, d1, d3
-	vld1.8 {q3}, [r4], r3 //save pix2 - 1
-	vabal.u8  q13, d0, d4
-	vld1.8 {q8}, [r5], r3 //save pix2 + 1
+    vld1.8 {q0}, [r0], r1 //save pix1
+    vmov.8 q1,   q10      //save pix2 - stride
+    vmov.8 q10,  q2
+    vabal.u8  q15, d0, d2
+    vld1.8 {q2}, [r2], r3 //save pix2 + stride
+    vabal.u8  q15, d1, d3
+    vld1.8 {q3}, [r4], r3 //save pix2 - 1
+    vabal.u8  q13, d0, d4
+    vld1.8 {q8}, [r5], r3 //save pix2 + 1
     vabal.u8  q13, d1, d5
-	subs lr, #1
+    subs lr, #1
 
-	vabal.u8  q11, d0, d6
-	vabal.u8  q11, d1, d7
+    vabal.u8  q11, d0, d6
+    vabal.u8  q11, d1, d7
 
-	vabal.u8  q9, d0, d16
-	vabal.u8  q9, d1, d17
+    vabal.u8  q9, d0, d16
+    vabal.u8  q9, d1, d17
 
-	bne pixel_sad_4_16x8_loop_0
+    bne pixel_sad_4_16x8_loop_0
 
     //Save SAD to 'r0'
-	ldr   r0, [sp, #12]
+    ldr   r0, [sp, #12]
 
-	vadd.u16   d0, d30, d31
-	vadd.u16   d1, d26, d27
-	vadd.u16   d2, d22, d23
-	vadd.u16   d3, d18, d19
+    vadd.u16   d0, d30, d31
+    vadd.u16   d1, d26, d27
+    vadd.u16   d2, d22, d23
+    vadd.u16   d3, d18, d19
 
-	vpaddl.u16 q0, q0
-	vpaddl.u16 q1, q1
+    vpaddl.u16 q0, q0
+    vpaddl.u16 q1, q1
 
-	vpaddl.u32 q0, q0
-	vpaddl.u32 q1, q1
+    vpaddl.u32 q0, q0
+    vpaddl.u32 q1, q1
 
-	vst4.32    {d0[0],d1[0],d2[0],d3[0]}, [r0]
+    vst4.32    {d0[0],d1[0],d2[0],d3[0]}, [r0]
 
-	ldmia sp!, {r4-r5, lr}
+    ldmia sp!, {r4-r5, lr}
 WELS_ASM_FUNC_END
 
 
@@ -552,189 +552,189 @@
 WELS_ASM_FUNC_BEGIN WelsSampleSadFour8x16_neon
     stmdb sp!, {r4-r5, lr}
 
-	//Generate the pix2 start addr
-	sub   r4, r2, #1
-	add   r5, r2, #1
-	sub   r2, r3
+    //Generate the pix2 start addr
+    sub   r4, r2, #1
+    add   r5, r2, #1
+    sub   r2, r3
 
     //Loading a horizontal line data (8 bytes)
-	vld1.8 {d0}, [r0], r1 //save pix1
+    vld1.8 {d0}, [r0], r1 //save pix1
 
-	vld1.8 {d1}, [r2], r3 //save pix2 - stride
-	vld1.8 {d6}, [r2], r3 //save pix2
-	vld1.8 {d2}, [r2], r3 //save pix2 + stride
+    vld1.8 {d1}, [r2], r3 //save pix2 - stride
+    vld1.8 {d6}, [r2], r3 //save pix2
+    vld1.8 {d2}, [r2], r3 //save pix2 + stride
 
-	vld1.8 {d3}, [r4], r3 //save pix2 - 1
-	vld1.8 {d4}, [r5], r3 //save pix2 + 1
+    vld1.8 {d3}, [r4], r3 //save pix2 - 1
+    vld1.8 {d4}, [r5], r3 //save pix2 + 1
 
-	//Do the SAD for 8 bytes
-	vabdl.u8  q15, d0, d1
-	vabdl.u8  q14, d0, d2
-	vabdl.u8  q13, d0, d3
-	vabdl.u8  q12, d0, d4
+    //Do the SAD for 8 bytes
+    vabdl.u8  q15, d0, d1
+    vabdl.u8  q14, d0, d2
+    vabdl.u8  q13, d0, d3
+    vabdl.u8  q12, d0, d4
 
-	mov lr, #15
+    mov lr, #15
 pixel_sad_4_8x16_loop_0:
 
     //Loading a horizontal line data (8 bytes)
-	vld1.8 {d0}, [r0], r1 //save pix1
-	vmov.8 d1,   d6       //save pix2 - stride
-	vmov.8 d6,   d2
-	vld1.8 {d2}, [r2], r3 //save pix2 + stride
-	vld1.8 {d3}, [r4], r3 //save pix2 - 1
-	vabal.u8  q15, d0, d1
+    vld1.8 {d0}, [r0], r1 //save pix1
+    vmov.8 d1,   d6       //save pix2 - stride
+    vmov.8 d6,   d2
+    vld1.8 {d2}, [r2], r3 //save pix2 + stride
+    vld1.8 {d3}, [r4], r3 //save pix2 - 1
+    vabal.u8  q15, d0, d1
 
-	vld1.8 {d4}, [r5], r3 //save pix2 + 1
-	//Do the SAD for 8 bytes
-	vabal.u8  q14, d0, d2
-	vabal.u8  q13, d0, d3
-	vabal.u8  q12, d0, d4
+    vld1.8 {d4}, [r5], r3 //save pix2 + 1
+    //Do the SAD for 8 bytes
+    vabal.u8  q14, d0, d2
+    vabal.u8  q13, d0, d3
+    vabal.u8  q12, d0, d4
     subs lr, #1
 
-	bne pixel_sad_4_8x16_loop_0
+    bne pixel_sad_4_8x16_loop_0
 
     //Save SAD to 'r0'
-	ldr   r0, [sp, #12]
+    ldr   r0, [sp, #12]
 
-	vadd.u16   d0, d30, d31
-	vadd.u16   d1, d28, d29
-	vadd.u16   d2, d26, d27
-	vadd.u16   d3, d24, d25
+    vadd.u16   d0, d30, d31
+    vadd.u16   d1, d28, d29
+    vadd.u16   d2, d26, d27
+    vadd.u16   d3, d24, d25
 
-	vpaddl.u16 q0, q0
-	vpaddl.u16 q1, q1
+    vpaddl.u16 q0, q0
+    vpaddl.u16 q1, q1
 
-	vpaddl.u32 q0, q0
-	vpaddl.u32 q1, q1
+    vpaddl.u32 q0, q0
+    vpaddl.u32 q1, q1
 
-	vst4.32    {d0[0],d1[0],d2[0],d3[0]}, [r0]
+    vst4.32    {d0[0],d1[0],d2[0],d3[0]}, [r0]
 
-	ldmia sp!, {r4-r5, lr}
+    ldmia sp!, {r4-r5, lr}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN WelsSampleSadFour8x8_neon
-	stmdb sp!, {r4-r5, lr}
+    stmdb sp!, {r4-r5, lr}
 
-	//Generate the pix2 start addr
-	sub   r4, r2, #1
-	add   r5, r2, #1
-	sub   r2, r3
+    //Generate the pix2 start addr
+    sub   r4, r2, #1
+    add   r5, r2, #1
+    sub   r2, r3
 
     //Loading a horizontal line data (8 bytes)
-	vld1.8 {d0}, [r0], r1 //save pix1
+    vld1.8 {d0}, [r0], r1 //save pix1
 
-	vld1.8 {d1}, [r2], r3 //save pix2 - stride
-	vld1.8 {d6}, [r2], r3 //save pix2
-	vld1.8 {d2}, [r2], r3 //save pix2 + stride
+    vld1.8 {d1}, [r2], r3 //save pix2 - stride
+    vld1.8 {d6}, [r2], r3 //save pix2
+    vld1.8 {d2}, [r2], r3 //save pix2 + stride
 
-	vld1.8 {d3}, [r4], r3 //save pix2 - 1
-	vld1.8 {d4}, [r5], r3 //save pix2 + 1
+    vld1.8 {d3}, [r4], r3 //save pix2 - 1
+    vld1.8 {d4}, [r5], r3 //save pix2 + 1
 
-	//Do the SAD for 8 bytes
-	vabdl.u8  q15, d0, d1
-	vabdl.u8  q14, d0, d2
-	vabdl.u8  q13, d0, d3
-	vabdl.u8  q12, d0, d4
+    //Do the SAD for 8 bytes
+    vabdl.u8  q15, d0, d1
+    vabdl.u8  q14, d0, d2
+    vabdl.u8  q13, d0, d3
+    vabdl.u8  q12, d0, d4
 
-	mov lr, #7
+    mov lr, #7
 pixel_sad_4_8x8_loop_0:
 
     //Loading a horizontal line data (8 bytes)
-	vld1.8 {d0}, [r0], r1 //save pix1
-	vmov.8 d1,   d6       //save pix2 - stride
-	vmov.8 d6,   d2
-	vld1.8 {d2}, [r2], r3 //save pix2 + stride
-	vld1.8 {d3}, [r4], r3 //save pix2 - 1
-	vabal.u8  q15, d0, d1
+    vld1.8 {d0}, [r0], r1 //save pix1
+    vmov.8 d1,   d6       //save pix2 - stride
+    vmov.8 d6,   d2
+    vld1.8 {d2}, [r2], r3 //save pix2 + stride
+    vld1.8 {d3}, [r4], r3 //save pix2 - 1
+    vabal.u8  q15, d0, d1
 
-	vld1.8 {d4}, [r5], r3 //save pix2 + 1
-	//Do the SAD for 8 bytes
-	vabal.u8  q14, d0, d2
-	vabal.u8  q13, d0, d3
-	vabal.u8  q12, d0, d4
+    vld1.8 {d4}, [r5], r3 //save pix2 + 1
+    //Do the SAD for 8 bytes
+    vabal.u8  q14, d0, d2
+    vabal.u8  q13, d0, d3
+    vabal.u8  q12, d0, d4
     subs lr, #1
-	bne pixel_sad_4_8x8_loop_0
+    bne pixel_sad_4_8x8_loop_0
 
     //Save SAD to 'r0'
-	ldr   r0, [sp, #12]
+    ldr   r0, [sp, #12]
 
-	vadd.u16   d0, d30, d31
-	vadd.u16   d1, d28, d29
-	vadd.u16   d2, d26, d27
-	vadd.u16   d3, d24, d25
+    vadd.u16   d0, d30, d31
+    vadd.u16   d1, d28, d29
+    vadd.u16   d2, d26, d27
+    vadd.u16   d3, d24, d25
 
-	vpaddl.u16 q0, q0
-	vpaddl.u16 q1, q1
+    vpaddl.u16 q0, q0
+    vpaddl.u16 q1, q1
 
-	vpaddl.u32 q0, q0
-	vpaddl.u32 q1, q1
+    vpaddl.u32 q0, q0
+    vpaddl.u32 q1, q1
 
-	vst4.32    {d0[0],d1[0],d2[0],d3[0]}, [r0]
+    vst4.32    {d0[0],d1[0],d2[0],d3[0]}, [r0]
 
-	ldmia sp!, {r4-r5, lr}
+    ldmia sp!, {r4-r5, lr}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN WelsSampleSadFour4x4_neon
 
-	vld1.32  {d0[0]}, [r0], r1
-	vld1.32  {d0[1]}, [r0], r1
-	vld1.32  {d1[0]}, [r0], r1
-	vld1.32  {d1[1]}, [r0]
+    vld1.32  {d0[0]}, [r0], r1
+    vld1.32  {d0[1]}, [r0], r1
+    vld1.32  {d1[0]}, [r0], r1
+    vld1.32  {d1[1]}, [r0]
 
 
-	sub   r0, r2, r3
-	vld1.32  {d2[0]}, [r0], r3
-	vld1.32  {d2[1]}, [r0], r3
-	vld1.32  {d3[0]}, [r0], r3
-	vld1.32  {d3[1]}, [r0], r3
-	vld1.32  {d4[0]}, [r0], r3
-	vld1.32  {d4[1]}, [r0]
+    sub   r0, r2, r3
+    vld1.32  {d2[0]}, [r0], r3
+    vld1.32  {d2[1]}, [r0], r3
+    vld1.32  {d3[0]}, [r0], r3
+    vld1.32  {d3[1]}, [r0], r3
+    vld1.32  {d4[0]}, [r0], r3
+    vld1.32  {d4[1]}, [r0]
 
-	sub   r0,  r2, #1
-	vld1.32  {d5[0]}, [r0], r3
-	vld1.32  {d5[1]}, [r0], r3
-	vld1.32  {d6[0]}, [r0], r3
-	vld1.32  {d6[1]}, [r0]
+    sub   r0,  r2, #1
+    vld1.32  {d5[0]}, [r0], r3
+    vld1.32  {d5[1]}, [r0], r3
+    vld1.32  {d6[0]}, [r0], r3
+    vld1.32  {d6[1]}, [r0]
 
-	add   r0,  r2, #1
-	vld1.32  {d7[0]}, [r0], r3
-	vld1.32  {d7[1]}, [r0], r3
-	vld1.32  {d8[0]}, [r0], r3
-	vld1.32  {d8[1]}, [r0]
+    add   r0,  r2, #1
+    vld1.32  {d7[0]}, [r0], r3
+    vld1.32  {d7[1]}, [r0], r3
+    vld1.32  {d8[0]}, [r0], r3
+    vld1.32  {d8[1]}, [r0]
 
-	vabdl.u8  q15, d0, d2
-	vabdl.u8  q14, d1, d3
+    vabdl.u8  q15, d0, d2
+    vabdl.u8  q14, d1, d3
 
-	vabdl.u8  q13, d0, d3
-	vabdl.u8  q12, d1, d4
+    vabdl.u8  q13, d0, d3
+    vabdl.u8  q12, d1, d4
 
-	vabdl.u8  q11, d0, d5
-	vabdl.u8  q10, d1, d6
+    vabdl.u8  q11, d0, d5
+    vabdl.u8  q10, d1, d6
 
-	vabdl.u8  q9, d0, d7
-	vabdl.u8  q8, d1, d8
+    vabdl.u8  q9, d0, d7
+    vabdl.u8  q8, d1, d8
 
-	//Save SAD to 'r4'
-	ldr   r0, [sp]
-	vadd.u16   q0, q14, q15
-	vadd.u16   q1, q12, q13
-	vadd.u16   q2, q10, q11
-	vadd.u16   q3, q8 , q9
+    //Save SAD to 'r4'
+    ldr   r0, [sp]
+    vadd.u16   q0, q14, q15
+    vadd.u16   q1, q12, q13
+    vadd.u16   q2, q10, q11
+    vadd.u16   q3, q8 , q9
 
-	vadd.u16   d0, d1
-	vadd.u16   d1, d2, d3
-	vadd.u16   d2, d4, d5
-	vadd.u16   d3, d6, d7
+    vadd.u16   d0, d1
+    vadd.u16   d1, d2, d3
+    vadd.u16   d2, d4, d5
+    vadd.u16   d3, d6, d7
 
-	vpaddl.u16 q0, q0
-	vpaddl.u16 q1, q1
+    vpaddl.u16 q0, q0
+    vpaddl.u16 q1, q1
 
-	vpaddl.u32 q0, q0
-	vpaddl.u32 q1, q1
+    vpaddl.u32 q0, q0
+    vpaddl.u32 q1, q1
 
-	vst4.32    {d0[0],d1[0],d2[0],d3[0]}, [r0]
+    vst4.32    {d0[0],d1[0],d2[0],d3[0]}, [r0]
 
 WELS_ASM_FUNC_END
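
Each of these routines ends with the same reduction: the two halves of every 16-bit accumulator are added (vadd.u16), then widened pairwise twice (vpaddl.u16, vpaddl.u32), and the four resulting 32-bit sums are written with one interleaved vst4.32. A rough NEON-intrinsics equivalent of that tail for a single accumulator follows; the helper name is invented and arm_neon.h (an ARM/NEON target) is assumed.

    #include <arm_neon.h>
    #include <stdint.h>

    /* Horizontal sum of one uint16x8_t SAD accumulator, mirroring the
     * vadd.u16 / vpaddl.u16 / vpaddl.u32 sequence used above. */
    static inline uint32_t SumSadAccumulator (uint16x8_t acc) {
      uint16x4_t half = vadd_u16 (vget_low_u16 (acc), vget_high_u16 (acc));
      uint32x2_t pair = vpaddl_u16 (half);   /* pairwise widen u16 -> u32 */
      uint64x1_t one  = vpaddl_u32 (pair);   /* pairwise widen u32 -> u64 */
      return (uint32_t) vget_lane_u64 (one, 0);
    }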
 
@@ -834,16 +834,16 @@
 WELS_ASM_FUNC_BEGIN WelsSampleSatd4x4_neon
 
     //Load the pix1 data --- 16 bytes
-	vld1.32  {d0[0]}, [r0], r1
-	vld1.32  {d0[1]}, [r0], r1
-	vld1.32  {d1[0]}, [r0], r1
-	vld1.32  {d1[1]}, [r0]
+    vld1.32  {d0[0]}, [r0], r1
+    vld1.32  {d0[1]}, [r0], r1
+    vld1.32  {d1[0]}, [r0], r1
+    vld1.32  {d1[1]}, [r0]
 
     //Load the pix2 data --- 16 bytes
-	vld1.32  {d2[0]}, [r2], r3
-	vld1.32  {d2[1]}, [r2], r3
-	vld1.32  {d3[0]}, [r2], r3
-	vld1.32  {d3[1]}, [r2]
+    vld1.32  {d2[0]}, [r2], r3
+    vld1.32  {d2[1]}, [r2], r3
+    vld1.32  {d3[0]}, [r2], r3
+    vld1.32  {d3[1]}, [r2]
 
     //Get the difference
     vsubl.u8 q15, d0, d2 //{0,1,2,3,4,5,6,7}
@@ -874,7 +874,7 @@
     vpaddl.u16 d0, d0
     vpaddl.u32 d0, d0
 
-	vmov.u32   r0, d0[0]
+    vmov.u32   r0, d0[0]
 
 WELS_ASM_FUNC_END
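
Only the edges of WelsSampleSatd4x4_neon appear in this hunk: the 4x4 blocks are loaded lane by lane, the differences are widened with vsubl.u8, and after the elided transform an accumulator is reduced and returned in r0. For orientation, a textbook Hadamard-based 4x4 SATD in C follows; the transform steps and any final scaling the assembly applies are not visible here, so this is a generic reference, not a transcription.

    #include <stdint.h>

    /* Generic 4x4 SATD: Hadamard-transform the pixel differences in both
     * directions and sum the absolute values (no final scaling applied). */
    static int32_t Satd4x4Ref (const uint8_t* pix1, int32_t stride1,
                               const uint8_t* pix2, int32_t stride2) {
      int32_t d[4][4], t[4][4], sum = 0;
      for (int y = 0; y < 4; y++)
        for (int x = 0; x < 4; x++)
          d[y][x] = pix1[y * stride1 + x] - pix2[y * stride2 + x];
      for (int y = 0; y < 4; y++) {              /* horizontal 4-point Hadamard */
        int32_t a = d[y][0] + d[y][3], b = d[y][1] + d[y][2];
        int32_t c = d[y][1] - d[y][2], e = d[y][0] - d[y][3];
        t[y][0] = a + b; t[y][1] = e + c; t[y][2] = a - b; t[y][3] = e - c;
      }
      for (int x = 0; x < 4; x++) {              /* vertical pass + abs sum */
        int32_t a = t[0][x] + t[3][x], b = t[1][x] + t[2][x];
        int32_t c = t[1][x] - t[2][x], e = t[0][x] - t[3][x];
        int32_t f0 = a + b, f1 = e + c, f2 = a - b, f3 = e - c;
        sum += (f0 < 0 ? -f0 : f0) + (f1 < 0 ? -f1 : f1)
             + (f2 < 0 ? -f2 : f2) + (f3 < 0 ? -f3 : f3);
      }
      return sum;
    }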
 
--- a/codec/encoder/core/arm/reconstruct_neon.S
+++ b/codec/encoder/core/arm/reconstruct_neon.S
@@ -35,592 +35,592 @@
 #include "arm_arch_common_macro.S"
 
 #ifdef __APPLE__
-.macro	LOAD_4x4_DATA_FOR_DCT
-//	{	//	input: $0~$3, src1*, src1_stride, src2*, src2_stride
-    vld2.16	{$0[0],$1[0]}, [$4], $5
-    vld2.16	{$2[0],$3[0]}, [$6], $7
-    vld2.16	{$0[1],$1[1]}, [$4], $5
-    vld2.16	{$2[1],$3[1]}, [$6], $7
+.macro LOAD_4x4_DATA_FOR_DCT
+//  {   //  input: $0~$3, src1*, src1_stride, src2*, src2_stride
+    vld2.16 {$0[0],$1[0]}, [$4], $5
+    vld2.16 {$2[0],$3[0]}, [$6], $7
+    vld2.16 {$0[1],$1[1]}, [$4], $5
+    vld2.16 {$2[1],$3[1]}, [$6], $7
 
-    vld2.16	{$0[2],$1[2]}, [$4], $5
-    vld2.16	{$2[2],$3[2]}, [$6], $7
-    vld2.16	{$0[3],$1[3]}, [$4], $5
-    vld2.16	{$2[3],$3[3]}, [$6], $7
-//	}
+    vld2.16 {$0[2],$1[2]}, [$4], $5
+    vld2.16 {$2[2],$3[2]}, [$6], $7
+    vld2.16 {$0[3],$1[3]}, [$4], $5
+    vld2.16 {$2[3],$3[3]}, [$6], $7
+//  }
 .endm
 
-.macro	LOAD_8x8_DATA_FOR_DCT
-//	{	//	input: $0~$3, src1*, src2*; untouched r2:src1_stride &r4:src2_stride
-    vld1.64	{$0}, [$8], r2
-    vld1.64	{$4}, [$9], r4
-    vld1.64	{$1}, [$8], r2
-    vld1.64	{$5}, [$9], r4
+.macro LOAD_8x8_DATA_FOR_DCT
+//  {   //  input: $0~$3, src1*, src2*; untouched r2:src1_stride &r4:src2_stride
+    vld1.64 {$0}, [$8], r2
+    vld1.64 {$4}, [$9], r4
+    vld1.64 {$1}, [$8], r2
+    vld1.64 {$5}, [$9], r4
 
-    vld1.64	{$2}, [$8], r2
-    vld1.64	{$6}, [$9], r4
-    vld1.64	{$3}, [$8], r2
-    vld1.64	{$7}, [$9], r4
-//	}
+    vld1.64 {$2}, [$8], r2
+    vld1.64 {$6}, [$9], r4
+    vld1.64 {$3}, [$8], r2
+    vld1.64 {$7}, [$9], r4
+//  }
 .endm
 
-.macro	DCT_ROW_TRANSFORM_TOTAL_16BITS
-//	{	//	input: src_d[0]~[3], working: [4]~[7]
-    vadd.s16		$4, $0, $3			//int16 s[0] = data[i] + data[i3];
-    vsub.s16		$7, $0, $3			//int16 s[3] = data[i] - data[i3];
-    vadd.s16		$5, $1, $2			//int16 s[1] = data[i1] + data[i2];
-    vsub.s16		$6, $1, $2			//int16 s[2] = data[i1] - data[i2];
+.macro DCT_ROW_TRANSFORM_TOTAL_16BITS
+//  {   //  input: src_d[0]~[3], working: [4]~[7]
+    vadd.s16        $4, $0, $3          //int16 s[0] = data[i] + data[i3];
+    vsub.s16        $7, $0, $3          //int16 s[3] = data[i] - data[i3];
+    vadd.s16        $5, $1, $2          //int16 s[1] = data[i1] + data[i2];
+    vsub.s16        $6, $1, $2          //int16 s[2] = data[i1] - data[i2];
 
-    vadd.s16		$0, $4, $5			//int16 dct[i ] = s[0] + s[1];
-    vsub.s16		$2, $4, $5			//int16 dct[i2] = s[0] - s[1];
-    vshl.s16		$1, $7, #1
-    vshl.s16		$3, $6, #1
-    vadd.s16		$1, $1, $6			//int16 dct[i1] = (s[3] << 1) + s[2];
-    vsub.s16		$3, $7, $3			//int16 dct[i3] = s[3] - (s[2] << 1);
-//	}
+    vadd.s16        $0, $4, $5          //int16 dct[i ] = s[0] + s[1];
+    vsub.s16        $2, $4, $5          //int16 dct[i2] = s[0] - s[1];
+    vshl.s16        $1, $7, #1
+    vshl.s16        $3, $6, #1
+    vadd.s16        $1, $1, $6          //int16 dct[i1] = (s[3] << 1) + s[2];
+    vsub.s16        $3, $7, $3          //int16 dct[i3] = s[3] - (s[2] << 1);
+//  }
 .endm
 
-.macro	MATRIX_TRANSFORM_EACH_16BITS
-//	{	//	input & output: src_d[0]~[3];[0 1 2 3]+[4 5 6 7]+[8 9 10 11]+[12 13 14 15]
-    vtrn.s16		$0, $1				//[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
-    vtrn.s16		$2, $3				//[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
-    vtrn.32		$0, $2				//[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
-    vtrn.32		$1, $3				//[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
-//	}
+.macro MATRIX_TRANSFORM_EACH_16BITS
+//  {   //  input & output: src_d[0]~[3];[0 1 2 3]+[4 5 6 7]+[8 9 10 11]+[12 13 14 15]
+    vtrn.s16        $0, $1              //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
+    vtrn.s16        $2, $3              //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
+    vtrn.32     $0, $2              //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
+    vtrn.32     $1, $3              //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
+//  }
 .endm
 
-.macro	NEWQUANT_COEF_EACH_16BITS	// if coef <= 0, - coef; else , coef;
-//	{	//	input:	coef, ff (dst), ff_d0, ff_d1, mf_d0, md_d1
-    veor.s16		$6, $6			// init 0 , and keep 0;
-    vaba.s16		$1, $0, $6		// f + abs(coef - 0)
-    vmull.s16		$7, $2, $4
-    vmull.s16		$8, $3, $5
-    vshr.s32		$7, #16
-    vshr.s32		$8, #16
-    vmovn.s32		$2, $7
-    vmovn.s32		$3, $8
+.macro NEWQUANT_COEF_EACH_16BITS    // if coef <= 0, - coef; else , coef;
+//  {   //  input:  coef, ff (dst), ff_d0, ff_d1, mf_d0, md_d1
+    veor.s16        $6, $6          // init 0 , and keep 0;
+    vaba.s16        $1, $0, $6      // f + abs(coef - 0)
+    vmull.s16       $7, $2, $4
+    vmull.s16       $8, $3, $5
+    vshr.s32        $7, #16
+    vshr.s32        $8, #16
+    vmovn.s32       $2, $7
+    vmovn.s32       $3, $8
 
-    vcgt.s16		$7, $0, #0		// if true, location of coef == 11111111
-    vbif.s16		$6, $1, $7		// if (x<0) reserved part; else keep 0 untouched
-    vshl.s16		$6, #1
-    vsub.s16		$1, $1, $6		// if x > 0, -= 0; else x-= 2x
-//	}
+    vcgt.s16        $7, $0, #0      // if true, location of coef == 11111111
+    vbif.s16        $6, $1, $7      // if (x<0) reserved part; else keep 0 untouched
+    vshl.s16        $6, #1
+    vsub.s16        $1, $1, $6      // if x > 0, -= 0; else x-= 2x
+//  }
 .endm
 
-.macro	NEWQUANT_COEF_EACH_16BITS_MAX	// if coef <= 0, - coef; else , coef;
-//	{	//	input:	coef, ff (dst), ff_d0, ff_d1, mf_d0(max), md_d1
-    veor.s16		$6, $6			// init 0 , and keep 0;
-    vaba.s16		$1, $0, $6		// f + abs(coef - 0)
-    vmull.s16		$7, $2, $4
-    vmull.s16		$8, $3, $5
-    vshr.s32		$7, #16
-    vshr.s32		$8, #16
-    vmovn.s32		$2, $7
-    vmovn.s32		$3, $8
+.macro NEWQUANT_COEF_EACH_16BITS_MAX    // if coef <= 0, - coef; else , coef;
+//  {   //  input:  coef, ff (dst), ff_d0, ff_d1, mf_d0(max), md_d1
+    veor.s16        $6, $6          // init 0 , and keep 0;
+    vaba.s16        $1, $0, $6      // f + abs(coef - 0)
+    vmull.s16       $7, $2, $4
+    vmull.s16       $8, $3, $5
+    vshr.s32        $7, #16
+    vshr.s32        $8, #16
+    vmovn.s32       $2, $7
+    vmovn.s32       $3, $8
 
-    vcgt.s16		$7, $0, #0		// if true, location of coef == 11111111
-    vbif.s16		$6, $1, $7		// if (x<0) reserved part; else keep 0 untouched
-    vshl.s16		$6, #1
-    vmax.s16		$9, $2, $3
-    vsub.s16		$1, $1, $6		// if x > 0, -= 0; else x-= 2x
-//	}
+    vcgt.s16        $7, $0, #0      // if true, location of coef == 11111111
+    vbif.s16        $6, $1, $7      // if (x<0) reserved part; else keep 0 untouched
+    vshl.s16        $6, #1
+    vmax.s16        $9, $2, $3
+    vsub.s16        $1, $1, $6      // if x > 0, -= 0; else x-= 2x
+//  }
 .endm
 
-.macro	QUANT_DUALWORD_COEF_EACH_16BITS	// if coef <= 0, - coef; else , coef;
-//	{	//	input:	coef, ff (dst), mf , working_d (all 0), working_q
-    vaba.s16		$1, $0, $3		// f + abs(coef - 0)
-    vmull.s16		$4, $1, $2		// *= mf
-    vshr.s32		$4, #16
-    vmovn.s32		$1, $4			// >> 16
+.macro QUANT_DUALWORD_COEF_EACH_16BITS  // if coef <= 0, - coef; else , coef;
+//  {   //  input:  coef, ff (dst), mf , working_d (all 0), working_q
+    vaba.s16        $1, $0, $3      // f + abs(coef - 0)
+    vmull.s16       $4, $1, $2      // *= mf
+    vshr.s32        $4, #16
+    vmovn.s32       $1, $4          // >> 16
 
-    vcgt.s16		$2, $0, #0		// if true, location of coef == 11111111
-    vbif.s16		$3, $1, $2		// if (x<0) reserved part; else keep 0 untouched
-    vshl.s16		$3, #1
-    vsub.s16		$1, $1, $3		// if x > 0, -= 0; else x-= 2x
-//	}
+    vcgt.s16        $2, $0, #0      // if true, location of coef == 11111111
+    vbif.s16        $3, $1, $2      // if (x<0) reserved part; else keep 0 untouched
+    vshl.s16        $3, #1
+    vsub.s16        $1, $1, $3      // if x > 0, -= 0; else x-= 2x
+//  }
 .endm
 
-.macro	DC_ZERO_COUNT_IN_DUALWORD
-//	{	//	input:	coef, dst_d, working_d (all 0x01)
-    vceq.s16	$1, $0, #0
-    vand.s16	$1, $2
-    vpadd.s16	$1, $1, $1
-    vpadd.s16	$1, $1, $1
-//	}
+.macro DC_ZERO_COUNT_IN_DUALWORD
+//  {   //  input:  coef, dst_d, working_d (all 0x01)
+    vceq.s16    $1, $0, #0
+    vand.s16    $1, $2
+    vpadd.s16   $1, $1, $1
+    vpadd.s16   $1, $1, $1
+//  }
 .endm
 
-.macro	SELECT_MAX_IN_ABS_COEF
-//	{	//	input:	coef_0, coef_1, max_q (identy to follow two)
-    vmax.s16		$2, $0, $1		// max 1st in $3 & max 2nd in $4
-    vpmax.s16		$3, $3, $4		// max 1st in $3[0][1] & max 2nd in $3[2][3]
-    vpmax.s16		$3, $3, $4		// max 1st in $3[0][1]
-//	}
+.macro SELECT_MAX_IN_ABS_COEF
+//  {   //  input:  coef_0, coef_1, max_q (identy to follow two)
+    vmax.s16        $2, $0, $1      // max 1st in $3 & max 2nd in $4
+    vpmax.s16       $3, $3, $4      // max 1st in $3[0][1] & max 2nd in $3[2][3]
+    vpmax.s16       $3, $3, $4      // max 1st in $3[0][1]
+//  }
 .endm
 
-.macro	ZERO_COUNT_IN_2_QUARWORD
-//	{	//	input:	coef_0 (identy to $3 $4), coef_1(identy to $5 $6), mask_q
-    vceq.s16	$0, #0
-    vceq.s16	$1, #0
-    vand.s16	$0, $2
-    vand.s16	$1, $2
+.macro ZERO_COUNT_IN_2_QUARWORD
+//  {   //  input:  coef_0 (identy to $3 $4), coef_1(identy to $5 $6), mask_q
+    vceq.s16    $0, #0
+    vceq.s16    $1, #0
+    vand.s16    $0, $2
+    vand.s16    $1, $2
 
-    vpadd.s16	$3, $3, $5
-    vpadd.s16	$4, $4, $6
-    vpadd.s16	$3, $3, $4		// 8-->4
-    vpadd.s16	$3, $3, $3
-    vpadd.s16	$3, $3, $3
-//	}
+    vpadd.s16   $3, $3, $5
+    vpadd.s16   $4, $4, $6
+    vpadd.s16   $3, $3, $4      // 8-->4
+    vpadd.s16   $3, $3, $3
+    vpadd.s16   $3, $3, $3
+//  }
 .endm
 
-.macro	HDM_QUANT_2x2_TOTAL_16BITS
-//	{	//	input: src_d[0]~[3], working_d, dst_d
-    vshr.s64	$1, $0, #32
-    vadd.s16	$2, $0, $1		// [0] = rs[0] + rs[32];[1] = rs[16] + rs[48];
-    vsub.s16	$1, $0, $1		// [0] = rs[0] - rs[32];[1] = rs[16] - rs[48];
-    vtrn.s16	$2, $1
-    vtrn.s32	$2, $1
-//	}
+.macro HDM_QUANT_2x2_TOTAL_16BITS
+//  {   //  input: src_d[0]~[3], working_d, dst_d
+    vshr.s64    $1, $0, #32
+    vadd.s16    $2, $0, $1      // [0] = rs[0] + rs[32];[1] = rs[16] + rs[48];
+    vsub.s16    $1, $0, $1      // [0] = rs[0] - rs[32];[1] = rs[16] - rs[48];
+    vtrn.s16    $2, $1
+    vtrn.s32    $2, $1
+//  }
 .endm
 
-.macro	IHDM_4x4_TOTAL_16BITS
-//	{	//	input: each src_d[0]~[3](dst), working_q0, working_q1, working_q2
-    vshr.s64	$1, $0, #32
-    vadd.s16	$2, $0, $1		// [0] = rs[0] + rs[2];[1] = rs[1] + rs[3];
-    vsub.s16	$1, $0, $1		// [0] = rs[0] - rs[2];[1] = rs[1] - rs[3];
-    vtrn.s16	$2, $1
-    vrev32.16	$1, $1
-    vtrn.s32	$2, $1			// [0] = rs[0] + rs[2];[1] = rs[0] - rs[2];[2] = rs[1] - rs[3];[3] = rs[1] + rs[3];
+.macro IHDM_4x4_TOTAL_16BITS
+//  {   //  input: each src_d[0]~[3](dst), working_q0, working_q1, working_q2
+    vshr.s64    $1, $0, #32
+    vadd.s16    $2, $0, $1      // [0] = rs[0] + rs[2];[1] = rs[1] + rs[3];
+    vsub.s16    $1, $0, $1      // [0] = rs[0] - rs[2];[1] = rs[1] - rs[3];
+    vtrn.s16    $2, $1
+    vrev32.16   $1, $1
+    vtrn.s32    $2, $1          // [0] = rs[0] + rs[2];[1] = rs[0] - rs[2];[2] = rs[1] - rs[3];[3] = rs[1] + rs[3];
 
-    vrev64.16	$1, $2
-    vadd.s16	$0, $2, $1		// [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];
-    vsub.s16	$1, $2, $1
-    vrev32.16	$1, $1			// [0] = rs[1] - rs[2];[1] = rs[0] - rs[3];
-    vtrn.s32	$0, $1			// [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];[2] = rs[1] - rs[2];[3] = rs[0] - rs[3];
-//	}
+    vrev64.16   $1, $2
+    vadd.s16    $0, $2, $1      // [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];
+    vsub.s16    $1, $2, $1
+    vrev32.16   $1, $1          // [0] = rs[1] - rs[2];[1] = rs[0] - rs[3];
+    vtrn.s32    $0, $1          // [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];[2] = rs[1] - rs[2];[3] = rs[0] - rs[3];
+//  }
 .endm
 
-.macro	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP
-//	{	//	input: pred_d[0]/[1](output), dct_q0/1, working_q0/1;
-    vmovl.u8		$4,$0
-    vmovl.u8		$5,$1
-    vadd.s16		$4,$2
-    vadd.s16		$5,$3
-    vqmovun.s16	$0,$4
-    vqmovun.s16	$1,$5
-//	}
+.macro MB_PRED_8BITS_ADD_DCT_16BITS_CLIP
+//  {   //  input: pred_d[0]/[1](output), dct_q0/1, working_q0/1;
+    vmovl.u8        $4,$0
+    vmovl.u8        $5,$1
+    vadd.s16        $4,$2
+    vadd.s16        $5,$3
+    vqmovun.s16 $0,$4
+    vqmovun.s16 $1,$5
+//  }
 .endm
 
-.macro	ROW_TRANSFORM_1_STEP_TOTAL_16BITS
-//	{	//	input: src_d[0]~[3], output: e_d[0]~[3];
-    vadd.s16		$4, $0, $2			//int16 e[i][0] = src[0] + src[2];
-    vsub.s16		$5, $0, $2			//int16 e[i][1] = src[0] - src[2];
-    vshr.s16		$6, $1, #1
-    vshr.s16		$7, $3, #1
-    vsub.s16		$6, $6, $3			//int16 e[i][2] = (src[1]>>1)-src[3];
-    vadd.s16		$7, $1, $7			//int16 e[i][3] = src[1] + (src[3]>>1);
-//	}
+.macro ROW_TRANSFORM_1_STEP_TOTAL_16BITS
+//  {   //  input: src_d[0]~[3], output: e_d[0]~[3];
+    vadd.s16        $4, $0, $2          //int16 e[i][0] = src[0] + src[2];
+    vsub.s16        $5, $0, $2          //int16 e[i][1] = src[0] - src[2];
+    vshr.s16        $6, $1, #1
+    vshr.s16        $7, $3, #1
+    vsub.s16        $6, $6, $3          //int16 e[i][2] = (src[1]>>1)-src[3];
+    vadd.s16        $7, $1, $7          //int16 e[i][3] = src[1] + (src[3]>>1);
+//  }
 .endm
 
-.macro	TRANSFORM_TOTAL_16BITS	// both row & col transform used
-//	{	//	output: f_q[0]~[3], input: e_q[0]~[3];
-    vadd.s16		$0, $4, $7			//int16 f[i][0] = e[i][0] + e[i][3];
-    vadd.s16		$1, $5, $6			//int16 f[i][1] = e[i][1] + e[i][2];
-    vsub.s16		$2, $5, $6			//int16 f[i][2] = e[i][1] - e[i][2];
-    vsub.s16		$3, $4, $7			//int16 f[i][3] = e[i][0] - e[i][3];
-//	}
+.macro TRANSFORM_TOTAL_16BITS   // both row & col transform used
+//  {   //  output: f_q[0]~[3], input: e_q[0]~[3];
+    vadd.s16        $0, $4, $7          //int16 f[i][0] = e[i][0] + e[i][3];
+    vadd.s16        $1, $5, $6          //int16 f[i][1] = e[i][1] + e[i][2];
+    vsub.s16        $2, $5, $6          //int16 f[i][2] = e[i][1] - e[i][2];
+    vsub.s16        $3, $4, $7          //int16 f[i][3] = e[i][0] - e[i][3];
+//  }
 .endm
 
 
-.macro	ROW_TRANSFORM_0_STEP
-//	{	//	input: src_d[0]~[3], output: e_q[0]~[3];
-    vaddl.s16		$4, $0, $2			//int32 e[i][0] = src[0] + src[2];
-    vsubl.s16		$5, $0, $2			//int32 e[i][1] = src[0] - src[2];
-    vsubl.s16		$6, $1, $3			//int32 e[i][2] = src[1] - src[3];
-    vaddl.s16		$7, $1, $3			//int32 e[i][3] = src[1] + src[3];
-//	}
+.macro ROW_TRANSFORM_0_STEP
+//  {   //  input: src_d[0]~[3], output: e_q[0]~[3];
+    vaddl.s16       $4, $0, $2          //int32 e[i][0] = src[0] + src[2];
+    vsubl.s16       $5, $0, $2          //int32 e[i][1] = src[0] - src[2];
+    vsubl.s16       $6, $1, $3          //int32 e[i][2] = src[1] - src[3];
+    vaddl.s16       $7, $1, $3          //int32 e[i][3] = src[1] + src[3];
+//  }
 .endm
 
-.macro	ROW_TRANSFORM_1_STEP
-//	{	//	input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
-    vaddl.s16		$4, $0, $2			//int32 e[i][0] = src[0] + src[2];
-    vsubl.s16		$5, $0, $2			//int32 e[i][1] = src[0] - src[2];
-    vshr.s16		$8, $1, #1
-    vshr.s16		$9, $3, #1
-    vsubl.s16		$6, $8, $3			//int32 e[i][2] = (src[1]>>1)-src[3];
-    vaddl.s16		$7, $1, $9			//int32 e[i][3] = src[1] + (src[3]>>1);
-//	}
+.macro ROW_TRANSFORM_1_STEP
+//  {   //  input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
+    vaddl.s16       $4, $0, $2          //int32 e[i][0] = src[0] + src[2];
+    vsubl.s16       $5, $0, $2          //int32 e[i][1] = src[0] - src[2];
+    vshr.s16        $8, $1, #1
+    vshr.s16        $9, $3, #1
+    vsubl.s16       $6, $8, $3          //int32 e[i][2] = (src[1]>>1)-src[3];
+    vaddl.s16       $7, $1, $9          //int32 e[i][3] = src[1] + (src[3]>>1);
+//  }
 .endm
 
-.macro	TRANSFORM_4BYTES	// both row & col transform used
-//	{	//	output: f_q[0]~[3], input: e_q[0]~[3];
-    vadd.s32		$0, $4, $7			//int16 f[i][0] = e[i][0] + e[i][3];
-    vadd.s32		$1, $5, $6			//int16 f[i][1] = e[i][1] + e[i][2];
-    vsub.s32		$2, $5, $6			//int16 f[i][2] = e[i][1] - e[i][2];
-    vsub.s32		$3, $4, $7			//int16 f[i][3] = e[i][0] - e[i][3];
-//	}
+.macro TRANSFORM_4BYTES // both row & col transform used
+//  {   //  output: f_q[0]~[3], input: e_q[0]~[3];
+    vadd.s32        $0, $4, $7          //int16 f[i][0] = e[i][0] + e[i][3];
+    vadd.s32        $1, $5, $6          //int16 f[i][1] = e[i][1] + e[i][2];
+    vsub.s32        $2, $5, $6          //int16 f[i][2] = e[i][1] - e[i][2];
+    vsub.s32        $3, $4, $7          //int16 f[i][3] = e[i][0] - e[i][3];
+//  }
 .endm
 
-.macro	COL_TRANSFORM_0_STEP
-//	{	//	input: src_q[0]~[3], output: e_q[0]~[3];
-    vadd.s32		$4, $0, $2			//int32 e[0][j] = f[0][j] + f[2][j];
-    vsub.s32		$5, $0, $2			//int32 e[1][j] = f[0][j] - f[2][j];
-    vsub.s32		$6, $1, $3			//int32 e[2][j] = (f[1][j]>>1) - f[3][j];
-    vadd.s32		$7, $1, $3			//int32 e[3][j] = f[1][j] + (f[3][j]>>1);
-//	}
+.macro COL_TRANSFORM_0_STEP
+//  {   //  input: src_q[0]~[3], output: e_q[0]~[3];
+    vadd.s32        $4, $0, $2          //int32 e[0][j] = f[0][j] + f[2][j];
+    vsub.s32        $5, $0, $2          //int32 e[1][j] = f[0][j] - f[2][j];
+    vsub.s32        $6, $1, $3          //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
+    vadd.s32        $7, $1, $3          //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
+//  }
 .endm
 
-.macro	COL_TRANSFORM_1_STEP
-//	{	//	input: src_q[0]~[3], output: e_q[0]~[3];
-    vadd.s32		$4, $0, $2			//int32 e[0][j] = f[0][j] + f[2][j];
-    vsub.s32		$5, $0, $2			//int32 e[1][j] = f[0][j] - f[2][j];
-    vshr.s32		$6, $1, #1
-    vshr.s32		$7, $3, #1
-    vsub.s32		$6, $6, $3			//int32 e[2][j] = (f[1][j]>>1) - f[3][j];
-    vadd.s32		$7, $1, $7			//int32 e[3][j] = f[1][j] + (f[3][j]>>1);
-//	}
+.macro COL_TRANSFORM_1_STEP
+//  {   //  input: src_q[0]~[3], output: e_q[0]~[3];
+    vadd.s32        $4, $0, $2          //int32 e[0][j] = f[0][j] + f[2][j];
+    vsub.s32        $5, $0, $2          //int32 e[1][j] = f[0][j] - f[2][j];
+    vshr.s32        $6, $1, #1
+    vshr.s32        $7, $3, #1
+    vsub.s32        $6, $6, $3          //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
+    vadd.s32        $7, $1, $7          //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
+//  }
 .endm
 #else
-.macro	LOAD_4x4_DATA_FOR_DCT arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
-//	{	//	input: \arg0~\arg3, src1*, src1_stride, src2*, src2_stride
-    vld2.16	{\arg0[0],\arg1[0]}, [\arg4], \arg5
-    vld2.16	{\arg2[0],\arg3[0]}, [\arg6], \arg7
-    vld2.16	{\arg0[1],\arg1[1]}, [\arg4], \arg5
-    vld2.16	{\arg2[1],\arg3[1]}, [\arg6], \arg7
+.macro LOAD_4x4_DATA_FOR_DCT arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
+//  {   //  input: \arg0~\arg3, src1*, src1_stride, src2*, src2_stride
+    vld2.16 {\arg0[0],\arg1[0]}, [\arg4], \arg5
+    vld2.16 {\arg2[0],\arg3[0]}, [\arg6], \arg7
+    vld2.16 {\arg0[1],\arg1[1]}, [\arg4], \arg5
+    vld2.16 {\arg2[1],\arg3[1]}, [\arg6], \arg7
 
-    vld2.16	{\arg0[2],\arg1[2]}, [\arg4], \arg5
-    vld2.16	{\arg2[2],\arg3[2]}, [\arg6], \arg7
-    vld2.16	{\arg0[3],\arg1[3]}, [\arg4], \arg5
-    vld2.16	{\arg2[3],\arg3[3]}, [\arg6], \arg7
-//	}
+    vld2.16 {\arg0[2],\arg1[2]}, [\arg4], \arg5
+    vld2.16 {\arg2[2],\arg3[2]}, [\arg6], \arg7
+    vld2.16 {\arg0[3],\arg1[3]}, [\arg4], \arg5
+    vld2.16 {\arg2[3],\arg3[3]}, [\arg6], \arg7
+//  }
 .endm
 
-.macro	LOAD_8x8_DATA_FOR_DCT arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
-//	{	//	input: \arg0~\arg3, src1*, src2*; untouched r2:src1_stride &r4:src2_stride
-    vld1.64	{\arg0}, [\arg8], r2
-    vld1.64	{\arg4}, [\arg9], r4
-    vld1.64	{\arg1}, [\arg8], r2
-    vld1.64	{\arg5}, [\arg9], r4
+.macro LOAD_8x8_DATA_FOR_DCT arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
+//  {   //  input: \arg0~\arg3, src1*, src2*; untouched r2:src1_stride &r4:src2_stride
+    vld1.64 {\arg0}, [\arg8], r2
+    vld1.64 {\arg4}, [\arg9], r4
+    vld1.64 {\arg1}, [\arg8], r2
+    vld1.64 {\arg5}, [\arg9], r4
 
-    vld1.64	{\arg2}, [\arg8], r2
-    vld1.64	{\arg6}, [\arg9], r4
-    vld1.64	{\arg3}, [\arg8], r2
-    vld1.64	{\arg7}, [\arg9], r4
-//	}
+    vld1.64 {\arg2}, [\arg8], r2
+    vld1.64 {\arg6}, [\arg9], r4
+    vld1.64 {\arg3}, [\arg8], r2
+    vld1.64 {\arg7}, [\arg9], r4
+//  }
 .endm
 
-.macro	DCT_ROW_TRANSFORM_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
-//	{	//	input: src_d[0]~[3], working: [4]~[7]
-    vadd.s16		\arg4, \arg0, \arg3			//int16 s[0] = data[i] + data[i3];
-    vsub.s16		\arg7, \arg0, \arg3			//int16 s[3] = data[i] - data[i3];
-    vadd.s16		\arg5, \arg1, \arg2			//int16 s[1] = data[i1] + data[i2];
-    vsub.s16		\arg6, \arg1, \arg2			//int16 s[2] = data[i1] - data[i2];
+.macro DCT_ROW_TRANSFORM_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
+//  {   //  input: src_d[0]~[3], working: [4]~[7]
+    vadd.s16        \arg4, \arg0, \arg3         //int16 s[0] = data[i] + data[i3];
+    vsub.s16        \arg7, \arg0, \arg3         //int16 s[3] = data[i] - data[i3];
+    vadd.s16        \arg5, \arg1, \arg2         //int16 s[1] = data[i1] + data[i2];
+    vsub.s16        \arg6, \arg1, \arg2         //int16 s[2] = data[i1] - data[i2];
 
-    vadd.s16		\arg0, \arg4, \arg5			//int16 dct[i ] = s[0] + s[1];
-    vsub.s16		\arg2, \arg4, \arg5			//int16 dct[i2] = s[0] - s[1];
-    vshl.s16		\arg1, \arg7, #1
-    vshl.s16		\arg3, \arg6, #1
-    vadd.s16		\arg1, \arg1, \arg6			//int16 dct[i1] = (s[3] << 1) + s[2];
-    vsub.s16		\arg3, \arg7, \arg3			//int16 dct[i3] = s[3] - (s[2] << 1);
-//	}
+    vadd.s16        \arg0, \arg4, \arg5         //int16 dct[i ] = s[0] + s[1];
+    vsub.s16        \arg2, \arg4, \arg5         //int16 dct[i2] = s[0] - s[1];
+    vshl.s16        \arg1, \arg7, #1
+    vshl.s16        \arg3, \arg6, #1
+    vadd.s16        \arg1, \arg1, \arg6         //int16 dct[i1] = (s[3] << 1) + s[2];
+    vsub.s16        \arg3, \arg7, \arg3         //int16 dct[i3] = s[3] - (s[2] << 1);
+//  }
 .endm
 
-.macro	MATRIX_TRANSFORM_EACH_16BITS arg0, arg1, arg2, arg3
-//	{	//	input & output: src_d[0]~[3];[0 1 2 3]+[4 5 6 7]+[8 9 10 11]+[12 13 14 15]
-    vtrn.s16		\arg0, \arg1				//[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
-    vtrn.s16		\arg2, \arg3				//[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
-    vtrn.32		\arg0, \arg2				//[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
-    vtrn.32		\arg1, \arg3				//[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
-//	}
+.macro MATRIX_TRANSFORM_EACH_16BITS arg0, arg1, arg2, arg3
+//  {   //  input & output: src_d[0]~[3];[0 1 2 3]+[4 5 6 7]+[8 9 10 11]+[12 13 14 15]
+    vtrn.s16        \arg0, \arg1                //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
+    vtrn.s16        \arg2, \arg3                //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
+    vtrn.32     \arg0, \arg2                //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
+    vtrn.32     \arg1, \arg3                //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
+//  }
 .endm
 
-.macro	NEWQUANT_COEF_EACH_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-//	{	//	input:	coef, ff (dst), ff_d0, ff_d1, mf_d0, md_d1
-    veor.s16		\arg6, \arg6			// init 0 , and keep 0;
-    vaba.s16		\arg1, \arg0, \arg6		// f + abs(coef - 0)
-    vmull.s16		\arg7, \arg2, \arg4
-    vmull.s16		\arg8, \arg3, \arg5
-    vshr.s32		\arg7, #16
-    vshr.s32		\arg8, #16
-    vmovn.s32		\arg2, \arg7
-    vmovn.s32		\arg3, \arg8
+.macro NEWQUANT_COEF_EACH_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+//  {   //  input:  coef, ff (dst), ff_d0, ff_d1, mf_d0, md_d1
+    veor.s16        \arg6, \arg6            // init 0 , and keep 0;
+    vaba.s16        \arg1, \arg0, \arg6     // f + abs(coef - 0)
+    vmull.s16       \arg7, \arg2, \arg4
+    vmull.s16       \arg8, \arg3, \arg5
+    vshr.s32        \arg7, #16
+    vshr.s32        \arg8, #16
+    vmovn.s32       \arg2, \arg7
+    vmovn.s32       \arg3, \arg8
 
-    vcgt.s16		\arg7, \arg0, #0		// if true, location of coef == 11111111
-    vbif.s16		\arg6, \arg1, \arg7		// if (x<0) reserved part; else keep 0 untouched
-    vshl.s16		\arg6, #1
-    vsub.s16		\arg1, \arg1, \arg6		// if x > 0, -= 0; else x-= 2x
-//	}
+    vcgt.s16        \arg7, \arg0, #0        // if true, location of coef == 11111111
+    vbif.s16        \arg6, \arg1, \arg7     // if (x<0) reserved part; else keep 0 untouched
+    vshl.s16        \arg6, #1
+    vsub.s16        \arg1, \arg1, \arg6     // if x > 0, -= 0; else x-= 2x
+//  }
 .endm
 
-.macro	NEWQUANT_COEF_EACH_16BITS_MAX arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
-//	{	//	input:	coef, ff (dst), ff_d0, ff_d1, mf_d0(max), md_d1
-    veor.s16		\arg6, \arg6			// init 0 , and keep 0;
-    vaba.s16		\arg1, \arg0, \arg6		// f + abs(coef - 0)
-    vmull.s16		\arg7, \arg2, \arg4
-    vmull.s16		\arg8, \arg3, \arg5
-    vshr.s32		\arg7, #16
-    vshr.s32		\arg8, #16
-    vmovn.s32		\arg2, \arg7
-    vmovn.s32		\arg3, \arg8
+.macro NEWQUANT_COEF_EACH_16BITS_MAX arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
+//  {   //  input:  coef, ff (dst), ff_d0, ff_d1, mf_d0(max), md_d1
+    veor.s16        \arg6, \arg6            // init 0 , and keep 0;
+    vaba.s16        \arg1, \arg0, \arg6     // f + abs(coef - 0)
+    vmull.s16       \arg7, \arg2, \arg4
+    vmull.s16       \arg8, \arg3, \arg5
+    vshr.s32        \arg7, #16
+    vshr.s32        \arg8, #16
+    vmovn.s32       \arg2, \arg7
+    vmovn.s32       \arg3, \arg8
 
-    vcgt.s16		\arg7, \arg0, #0		// if true, location of coef == 11111111
-    vbif.s16		\arg6, \arg1, \arg7		// if (x<0) reserved part; else keep 0 untouched
-    vshl.s16		\arg6, #1
-    vmax.s16		\arg9, \arg2, \arg3
-    vsub.s16		\arg1, \arg1, \arg6		// if x > 0, -= 0; else x-= 2x
-//	}
+    vcgt.s16        \arg7, \arg0, #0        // if true, location of coef == 11111111
+    vbif.s16        \arg6, \arg1, \arg7     // if (x<0) reserved part; else keep 0 untouched
+    vshl.s16        \arg6, #1
+    vmax.s16        \arg9, \arg2, \arg3
+    vsub.s16        \arg1, \arg1, \arg6     // if x > 0, -= 0; else x-= 2x
+//  }
 .endm
 
-.macro	QUANT_DUALWORD_COEF_EACH_16BITS arg0, arg1, arg2, arg3, arg4
-//	{	//	input:	coef, ff (dst), mf , working_d (all 0), working_q
-    vaba.s16		\arg1, \arg0, \arg3		// f + abs(coef - 0)
-    vmull.s16		\arg4, \arg1, \arg2		// *= mf
-    vshr.s32		\arg4, #16
-    vmovn.s32		\arg1, \arg4			// >> 16
+.macro QUANT_DUALWORD_COEF_EACH_16BITS arg0, arg1, arg2, arg3, arg4
+//  {   //  input:  coef, ff (dst), mf , working_d (all 0), working_q
+    vaba.s16        \arg1, \arg0, \arg3     // f + abs(coef - 0)
+    vmull.s16       \arg4, \arg1, \arg2     // *= mf
+    vshr.s32        \arg4, #16
+    vmovn.s32       \arg1, \arg4            // >> 16
 
-    vcgt.s16		\arg2, \arg0, #0		// if true, location of coef == 11111111
-    vbif.s16		\arg3, \arg1, \arg2		// if (x<0) reserved part; else keep 0 untouched
-    vshl.s16		\arg3, #1
-    vsub.s16		\arg1, \arg1, \arg3		// if x > 0, -= 0; else x-= 2x
-//	}
+    vcgt.s16        \arg2, \arg0, #0        // if true, location of coef == 11111111
+    vbif.s16        \arg3, \arg1, \arg2     // if (x<0) reserved part; else keep 0 untouched
+    vshl.s16        \arg3, #1
+    vsub.s16        \arg1, \arg1, \arg3     // if x > 0, -= 0; else x-= 2x
+//  }
 .endm
 
-.macro	DC_ZERO_COUNT_IN_DUALWORD arg0, arg1, arg2
-//	{	//	input:	coef, dst_d, working_d (all 0x01)
-    vceq.s16	\arg1, \arg0, #0
-    vand.s16	\arg1, \arg2
-    vpadd.s16	\arg1, \arg1, \arg1
-    vpadd.s16	\arg1, \arg1, \arg1
-//	}
+.macro DC_ZERO_COUNT_IN_DUALWORD arg0, arg1, arg2
+//  {   //  input:  coef, dst_d, working_d (all 0x01)
+    vceq.s16    \arg1, \arg0, #0
+    vand.s16    \arg1, \arg2
+    vpadd.s16   \arg1, \arg1, \arg1
+    vpadd.s16   \arg1, \arg1, \arg1
+//  }
 .endm
 
-.macro	SELECT_MAX_IN_ABS_COEF arg0, arg1, arg2, arg3, arg4
-//	{	//	input:	coef_0, coef_1, max_q (identy to follow two), output: max_d0, max_d1
-    vmax.s16		\arg2, \arg0, \arg1		// max 1st in \arg3 & max 2nd in \arg4
-    vpmax.s16		\arg3, \arg3, \arg4		// max 1st in \arg3[0][1] & max 2nd in \arg3[2][3]
-    vpmax.s16		\arg3, \arg3, \arg4		// max 1st in \arg3[0][1]
-//	}
+.macro SELECT_MAX_IN_ABS_COEF arg0, arg1, arg2, arg3, arg4
+//  {   //  input:  coef_0, coef_1, max_q (identy to follow two), output: max_d0, max_d1
+    vmax.s16        \arg2, \arg0, \arg1     // max 1st in \arg3 & max 2nd in \arg4
+    vpmax.s16       \arg3, \arg3, \arg4     // max 1st in \arg3[0][1] & max 2nd in \arg3[2][3]
+    vpmax.s16       \arg3, \arg3, \arg4     // max 1st in \arg3[0][1]
+//  }
 .endm
 
-.macro	ZERO_COUNT_IN_2_QUARWORD arg0, arg1, arg2, arg3, arg4, arg5, arg6
-//	{	//	input:	coef_0 (identy to \arg3 \arg4), coef_1(identy to \arg5 \arg6), mask_q
-    vceq.s16	\arg0, #0
-    vceq.s16	\arg1, #0
-    vand.s16	\arg0, \arg2
-    vand.s16	\arg1, \arg2
+.macro ZERO_COUNT_IN_2_QUARWORD arg0, arg1, arg2, arg3, arg4, arg5, arg6
+//  {   //  input:  coef_0 (identy to \arg3 \arg4), coef_1(identy to \arg5 \arg6), mask_q
+    vceq.s16    \arg0, #0
+    vceq.s16    \arg1, #0
+    vand.s16    \arg0, \arg2
+    vand.s16    \arg1, \arg2
 
-    vpadd.s16	\arg3, \arg3, \arg5
-    vpadd.s16	\arg4, \arg4, \arg6
-    vpadd.s16	\arg3, \arg3, \arg4		// 8-->4
-    vpadd.s16	\arg3, \arg3, \arg3
-    vpadd.s16	\arg3, \arg3, \arg3
-//	}
+    vpadd.s16   \arg3, \arg3, \arg5
+    vpadd.s16   \arg4, \arg4, \arg6
+    vpadd.s16   \arg3, \arg3, \arg4     // 8-->4
+    vpadd.s16   \arg3, \arg3, \arg3
+    vpadd.s16   \arg3, \arg3, \arg3
+//  }
 .endm
 
-.macro	HDM_QUANT_2x2_TOTAL_16BITS arg0, arg1, arg2
-//	{	//	input: src_d[0]~[3], working_d, dst_d
-    vshr.s64	\arg1, \arg0, #32
-    vadd.s16	\arg2, \arg0, \arg1		// [0] = rs[0] + rs[32];[1] = rs[16] + rs[48];
-    vsub.s16	\arg1, \arg0, \arg1		// [0] = rs[0] - rs[32];[1] = rs[16] - rs[48];
-    vtrn.s16	\arg2, \arg1
-    vtrn.s32	\arg2, \arg1
-//	}
+.macro HDM_QUANT_2x2_TOTAL_16BITS arg0, arg1, arg2
+//  {   //  input: src_d[0]~[3], working_d, dst_d
+    vshr.s64    \arg1, \arg0, #32
+    vadd.s16    \arg2, \arg0, \arg1     // [0] = rs[0] + rs[32];[1] = rs[16] + rs[48];
+    vsub.s16    \arg1, \arg0, \arg1     // [0] = rs[0] - rs[32];[1] = rs[16] - rs[48];
+    vtrn.s16    \arg2, \arg1
+    vtrn.s32    \arg2, \arg1
+//  }
 .endm
 
-.macro	IHDM_4x4_TOTAL_16BITS arg0, arg1, arg2
-//	{	//	input: each src_d[0]~[3](dst), working_q0, working_q1, working_q2
-    vshr.s64	\arg1, \arg0, #32
-    vadd.s16	\arg2, \arg0, \arg1		// [0] = rs[0] + rs[2];[1] = rs[1] + rs[3];
-    vsub.s16	\arg1, \arg0, \arg1		// [0] = rs[0] - rs[2];[1] = rs[1] - rs[3];
-    vtrn.s16	\arg2, \arg1
-    vrev32.16	\arg1, \arg1
-    vtrn.s32	\arg2, \arg1			// [0] = rs[0] + rs[2];[1] = rs[0] - rs[2];[2] = rs[1] - rs[3];[3] = rs[1] + rs[3];
+.macro IHDM_4x4_TOTAL_16BITS arg0, arg1, arg2
+//  {   //  input: each src_d[0]~[3](dst), working_q0, working_q1, working_q2
+    vshr.s64    \arg1, \arg0, #32
+    vadd.s16    \arg2, \arg0, \arg1     // [0] = rs[0] + rs[2];[1] = rs[1] + rs[3];
+    vsub.s16    \arg1, \arg0, \arg1     // [0] = rs[0] - rs[2];[1] = rs[1] - rs[3];
+    vtrn.s16    \arg2, \arg1
+    vrev32.16   \arg1, \arg1
+    vtrn.s32    \arg2, \arg1            // [0] = rs[0] + rs[2];[1] = rs[0] - rs[2];[2] = rs[1] - rs[3];[3] = rs[1] + rs[3];
 
-    vrev64.16	\arg1, \arg2
-    vadd.s16	\arg0, \arg2, \arg1		// [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];
-    vsub.s16	\arg1, \arg2, \arg1
-    vrev32.16	\arg1, \arg1			// [0] = rs[1] - rs[2];[1] = rs[0] - rs[3];
-    vtrn.s32	\arg0, \arg1			// [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];[2] = rs[1] - rs[2];[3] = rs[0] - rs[3];
-//	}
+    vrev64.16   \arg1, \arg2
+    vadd.s16    \arg0, \arg2, \arg1     // [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];
+    vsub.s16    \arg1, \arg2, \arg1
+    vrev32.16   \arg1, \arg1            // [0] = rs[1] - rs[2];[1] = rs[0] - rs[3];
+    vtrn.s32    \arg0, \arg1            // [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];[2] = rs[1] - rs[2];[3] = rs[0] - rs[3];
+//  }
 .endm
 
-.macro	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP arg0, arg1, arg2, arg3, arg4, arg5
-//	{	//	input: pred_d[0]/[1](output), dct_q0/1, working_q0/1;
-    vmovl.u8		\arg4,\arg0
-    vmovl.u8		\arg5,\arg1
-    vadd.s16		\arg4,\arg2
-    vadd.s16		\arg5,\arg3
-    vqmovun.s16	\arg0,\arg4
-    vqmovun.s16	\arg1,\arg5
-//	}
+.macro MB_PRED_8BITS_ADD_DCT_16BITS_CLIP arg0, arg1, arg2, arg3, arg4, arg5
+//  {   //  input: pred_d[0]/[1](output), dct_q0/1, working_q0/1;
+    vmovl.u8        \arg4,\arg0
+    vmovl.u8        \arg5,\arg1
+    vadd.s16        \arg4,\arg2
+    vadd.s16        \arg5,\arg3
+    vqmovun.s16 \arg0,\arg4
+    vqmovun.s16 \arg1,\arg5
+//  }
 .endm
 
-.macro	ROW_TRANSFORM_1_STEP_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
-//	{	//	input: src_d[0]~[3], output: e_d[0]~[3];
-    vadd.s16		\arg4, \arg0, \arg2			//int16 e[i][0] = src[0] + src[2];
-    vsub.s16		\arg5, \arg0, \arg2			//int16 e[i][1] = src[0] - src[2];
-    vshr.s16		\arg6, \arg1, #1
-    vshr.s16		\arg7, \arg3, #1
-    vsub.s16		\arg6, \arg6, \arg3			//int16 e[i][2] = (src[1]>>1)-src[3];
-    vadd.s16		\arg7, \arg1, \arg7			//int16 e[i][3] = src[1] + (src[3]>>1);
-//	}
+.macro ROW_TRANSFORM_1_STEP_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
+//  {   //  input: src_d[0]~[3], output: e_d[0]~[3];
+    vadd.s16        \arg4, \arg0, \arg2         //int16 e[i][0] = src[0] + src[2];
+    vsub.s16        \arg5, \arg0, \arg2         //int16 e[i][1] = src[0] - src[2];
+    vshr.s16        \arg6, \arg1, #1
+    vshr.s16        \arg7, \arg3, #1
+    vsub.s16        \arg6, \arg6, \arg3         //int16 e[i][2] = (src[1]>>1)-src[3];
+    vadd.s16        \arg7, \arg1, \arg7         //int16 e[i][3] = src[1] + (src[3]>>1);
+//  }
 .endm
 
-.macro	TRANSFORM_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7	// both row & col transform used
-//	{	//	output: f_q[0]~[3], input: e_q[0]~[3];
-    vadd.s16		\arg0, \arg4, \arg7			//int16 f[i][0] = e[i][0] + e[i][3];
-    vadd.s16		\arg1, \arg5, \arg6			//int16 f[i][1] = e[i][1] + e[i][2];
-    vsub.s16		\arg2, \arg5, \arg6			//int16 f[i][2] = e[i][1] - e[i][2];
-    vsub.s16		\arg3, \arg4, \arg7			//int16 f[i][3] = e[i][0] - e[i][3];
-//	}
+.macro TRANSFORM_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7    // both row & col transform used
+//  {   //  output: f_q[0]~[3], input: e_q[0]~[3];
+    vadd.s16        \arg0, \arg4, \arg7         //int16 f[i][0] = e[i][0] + e[i][3];
+    vadd.s16        \arg1, \arg5, \arg6         //int16 f[i][1] = e[i][1] + e[i][2];
+    vsub.s16        \arg2, \arg5, \arg6         //int16 f[i][2] = e[i][1] - e[i][2];
+    vsub.s16        \arg3, \arg4, \arg7         //int16 f[i][3] = e[i][0] - e[i][3];
+//  }
 .endm
 
 
-.macro	ROW_TRANSFORM_0_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
-//	{	//	input: src_d[0]~[3], output: e_q[0]~[3];
-    vaddl.s16		\arg4, \arg0, \arg2			//int32 e[i][0] = src[0] + src[2];
-    vsubl.s16		\arg5, \arg0, \arg2			//int32 e[i][1] = src[0] - src[2];
-    vsubl.s16		\arg6, \arg1, \arg3			//int32 e[i][2] = src[1] - src[3];
-    vaddl.s16		\arg7, \arg1, \arg3			//int32 e[i][3] = src[1] + src[3];
-//	}
+.macro ROW_TRANSFORM_0_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
+//  {   //  input: src_d[0]~[3], output: e_q[0]~[3];
+    vaddl.s16       \arg4, \arg0, \arg2         //int32 e[i][0] = src[0] + src[2];
+    vsubl.s16       \arg5, \arg0, \arg2         //int32 e[i][1] = src[0] - src[2];
+    vsubl.s16       \arg6, \arg1, \arg3         //int32 e[i][2] = src[1] - src[3];
+    vaddl.s16       \arg7, \arg1, \arg3         //int32 e[i][3] = src[1] + src[3];
+//  }
 .endm
 
-.macro	ROW_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
-//	{	//	input: src_d[0]~[3], output: e_q[0]~[3]; working: \arg8 \arg9
-    vaddl.s16		\arg4, \arg0, \arg2			//int32 e[i][0] = src[0] + src[2];
-    vsubl.s16		\arg5, \arg0, \arg2			//int32 e[i][1] = src[0] - src[2];
-    vshr.s16		\arg8, \arg1, #1
-    vshr.s16		\arg9, \arg3, #1
-    vsubl.s16		\arg6, \arg8, \arg3			//int32 e[i][2] = (src[1]>>1)-src[3];
-    vaddl.s16		\arg7, \arg1, \arg9			//int32 e[i][3] = src[1] + (src[3]>>1);
-//	}
+.macro ROW_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
+//  {   //  input: src_d[0]~[3], output: e_q[0]~[3]; working: \arg8 \arg9
+    vaddl.s16       \arg4, \arg0, \arg2         //int32 e[i][0] = src[0] + src[2];
+    vsubl.s16       \arg5, \arg0, \arg2         //int32 e[i][1] = src[0] - src[2];
+    vshr.s16        \arg8, \arg1, #1
+    vshr.s16        \arg9, \arg3, #1
+    vsubl.s16       \arg6, \arg8, \arg3         //int32 e[i][2] = (src[1]>>1)-src[3];
+    vaddl.s16       \arg7, \arg1, \arg9         //int32 e[i][3] = src[1] + (src[3]>>1);
+//  }
 .endm
 
-.macro	TRANSFORM_4BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7	// both row & col transform used
-//	{	//	output: f_q[0]~[3], input: e_q[0]~[3];
-    vadd.s32		\arg0, \arg4, \arg7			//int16 f[i][0] = e[i][0] + e[i][3];
-    vadd.s32		\arg1, \arg5, \arg6			//int16 f[i][1] = e[i][1] + e[i][2];
-    vsub.s32		\arg2, \arg5, \arg6			//int16 f[i][2] = e[i][1] - e[i][2];
-    vsub.s32		\arg3, \arg4, \arg7			//int16 f[i][3] = e[i][0] - e[i][3];
-//	}
+.macro TRANSFORM_4BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7  // both row & col transform used
+//  {   //  output: f_q[0]~[3], input: e_q[0]~[3];
+    vadd.s32        \arg0, \arg4, \arg7         //int16 f[i][0] = e[i][0] + e[i][3];
+    vadd.s32        \arg1, \arg5, \arg6         //int16 f[i][1] = e[i][1] + e[i][2];
+    vsub.s32        \arg2, \arg5, \arg6         //int16 f[i][2] = e[i][1] - e[i][2];
+    vsub.s32        \arg3, \arg4, \arg7         //int16 f[i][3] = e[i][0] - e[i][3];
+//  }
 .endm
 
-.macro	COL_TRANSFORM_0_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
-//	{	//	input: src_q[0]~[3], output: e_q[0]~[3];
-    vadd.s32		\arg4, \arg0, \arg2			//int32 e[0][j] = f[0][j] + f[2][j];
-    vsub.s32		\arg5, \arg0, \arg2			//int32 e[1][j] = f[0][j] - f[2][j];
-    vsub.s32		\arg6, \arg1, \arg3			//int32 e[2][j] = (f[1][j]>>1) - f[3][j];
-    vadd.s32		\arg7, \arg1, \arg3			//int32 e[3][j] = f[1][j] + (f[3][j]>>1);
-//	}
+.macro COL_TRANSFORM_0_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
+//  {   //  input: src_q[0]~[3], output: e_q[0]~[3];
+    vadd.s32        \arg4, \arg0, \arg2         //int32 e[0][j] = f[0][j] + f[2][j];
+    vsub.s32        \arg5, \arg0, \arg2         //int32 e[1][j] = f[0][j] - f[2][j];
+    vsub.s32        \arg6, \arg1, \arg3         //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
+    vadd.s32        \arg7, \arg1, \arg3         //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
+//  }
 .endm
 
-.macro	COL_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
-//	{	//	input: src_q[0]~[3], output: e_q[0]~[3];
-    vadd.s32		\arg4, \arg0, \arg2			//int32 e[0][j] = f[0][j] + f[2][j];
-    vsub.s32		\arg5, \arg0, \arg2			//int32 e[1][j] = f[0][j] - f[2][j];
-    vshr.s32		\arg6, \arg1, #1
-    vshr.s32		\arg7, \arg3, #1
-    vsub.s32		\arg6, \arg6, \arg3			//int32 e[2][j] = (f[1][j]>>1) - f[3][j];
-    vadd.s32		\arg7, \arg1, \arg7			//int32 e[3][j] = f[1][j] + (f[3][j]>>1);
-//	}
+.macro COL_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
+//  {   //  input: src_q[0]~[3], output: e_q[0]~[3];
+    vadd.s32        \arg4, \arg0, \arg2         //int32 e[0][j] = f[0][j] + f[2][j];
+    vsub.s32        \arg5, \arg0, \arg2         //int32 e[1][j] = f[0][j] - f[2][j];
+    vshr.s32        \arg6, \arg1, #1
+    vshr.s32        \arg7, \arg3, #1
+    vsub.s32        \arg6, \arg6, \arg3         //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
+    vadd.s32        \arg7, \arg1, \arg7         //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
+//  }
 .endm
 #endif
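
The comments in DCT_ROW_TRANSFORM_TOTAL_16BITS spell out the integer butterfly that the functions below build on. A plain-C transcription of one 4-point pass (int16_t data, indices as in the macro comments):

    #include <stdint.h>

    /* One pass of the forward 4x4 core transform, copied from the
     * DCT_ROW_TRANSFORM_TOTAL_16BITS comments above. */
    static void DctRowPass (int16_t* data, int i, int i1, int i2, int i3) {
      int16_t s0 = data[i]  + data[i3];
      int16_t s3 = data[i]  - data[i3];
      int16_t s1 = data[i1] + data[i2];
      int16_t s2 = data[i1] - data[i2];
      data[i]  = (int16_t)(s0 + s1);
      data[i2] = (int16_t)(s0 - s1);
      data[i1] = (int16_t)((s3 << 1) + s2);
      data[i3] = (int16_t)(s3 - (s2 << 1));
    }

WelsDctT4_neon below applies this pass to the rows of the widened residual (vsubl.u8 of the two 4x4 blocks), transposes with MATRIX_TRANSFORM_EACH_16BITS, applies the same pass as the column transform, and transposes back before storing; WelsDctFourT4_neon repeats the sequence over an 8x8 region in two 8x4 passes.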
 
 
 WELS_ASM_FUNC_BEGIN WelsDctT4_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
+    push        {r4}
+    ldr         r4, [sp, #4]
 
-	LOAD_4x4_DATA_FOR_DCT	d4, d5, d6, d7, r1, r2, r3, r4
+    LOAD_4x4_DATA_FOR_DCT   d4, d5, d6, d7, r1, r2, r3, r4
 
-	vsubl.u8	q0, d4, d6
-	vsubl.u8	q1, d5, d7
-	vtrn.s32	q0, q1
-	vswp		d1, d2
+    vsubl.u8    q0, d4, d6
+    vsubl.u8    q1, d5, d7
+    vtrn.s32    q0, q1
+    vswp        d1, d2
 
-	// horizontal transform
-	DCT_ROW_TRANSFORM_TOTAL_16BITS		d0, d1, d2, d3, d4, d5, d6, d7
+    // horizontal transform
+    DCT_ROW_TRANSFORM_TOTAL_16BITS      d0, d1, d2, d3, d4, d5, d6, d7
 
-	// transform element
-	MATRIX_TRANSFORM_EACH_16BITS	d0, d1, d2, d3
+    // transform element
+    MATRIX_TRANSFORM_EACH_16BITS    d0, d1, d2, d3
 
-	//	vertical transform
-	DCT_ROW_TRANSFORM_TOTAL_16BITS		d0, d1, d2, d3, d4, d5, d6, d7
+    //  vertical transform
+    DCT_ROW_TRANSFORM_TOTAL_16BITS      d0, d1, d2, d3, d4, d5, d6, d7
 
-	// transform element
-	MATRIX_TRANSFORM_EACH_16BITS	d0, d1, d2, d3
+    // transform element
+    MATRIX_TRANSFORM_EACH_16BITS    d0, d1, d2, d3
 
-	vst1.s16		{q0, q1}, [r0]!
+    vst1.s16        {q0, q1}, [r0]!
 
-	pop		{r4}
+    pop     {r4}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN WelsDctFourT4_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
+    push        {r4}
+    ldr         r4, [sp, #4]
 
-	LOAD_8x8_DATA_FOR_DCT	d16, d17, d18, d19, d20, d21, d22, d23, r1, r3
+    LOAD_8x8_DATA_FOR_DCT   d16, d17, d18, d19, d20, d21, d22, d23, r1, r3
 
-	vsubl.u8	q0, d16, d20
-	vsubl.u8	q1, d17, d21
-	vsubl.u8	q2, d18, d22
-	vsubl.u8	q3, d19, d23
-	MATRIX_TRANSFORM_EACH_16BITS	q0, q1, q2, q3
+    vsubl.u8    q0, d16, d20
+    vsubl.u8    q1, d17, d21
+    vsubl.u8    q2, d18, d22
+    vsubl.u8    q3, d19, d23
+    MATRIX_TRANSFORM_EACH_16BITS    q0, q1, q2, q3
 
-	// horizontal transform
-	DCT_ROW_TRANSFORM_TOTAL_16BITS		q0, q1, q2, q3, q8, q9, q10, q11
+    // horizontal transform
+    DCT_ROW_TRANSFORM_TOTAL_16BITS      q0, q1, q2, q3, q8, q9, q10, q11
 
-	// transform element
-	MATRIX_TRANSFORM_EACH_16BITS	q0, q1, q2, q3
+    // transform element
+    MATRIX_TRANSFORM_EACH_16BITS    q0, q1, q2, q3
 
-	//	vertical transform
-	DCT_ROW_TRANSFORM_TOTAL_16BITS		q0, q1, q2, q3, q8, q9, q10, q11
+    //  vertical transform
+    DCT_ROW_TRANSFORM_TOTAL_16BITS      q0, q1, q2, q3, q8, q9, q10, q11
 
-	vswp		d1, d2
-	vswp		d5, d6
-	vswp		q1, q2
-	vst1.s16		{q0, q1}, [r0]!
-	vst1.s16		{q2, q3}, [r0]!
+    vswp        d1, d2
+    vswp        d5, d6
+    vswp        q1, q2
+    vst1.s16        {q0, q1}, [r0]!
+    vst1.s16        {q2, q3}, [r0]!
 
-	////////////////
-	LOAD_8x8_DATA_FOR_DCT	d16, d17, d18, d19, d20, d21, d22, d23, r1, r3
+    ////////////////
+    LOAD_8x8_DATA_FOR_DCT   d16, d17, d18, d19, d20, d21, d22, d23, r1, r3
 
-	vsubl.u8	q0, d16, d20
-	vsubl.u8	q1, d17, d21
-	vsubl.u8	q2, d18, d22
-	vsubl.u8	q3, d19, d23
-	MATRIX_TRANSFORM_EACH_16BITS	q0, q1, q2, q3
+    vsubl.u8    q0, d16, d20
+    vsubl.u8    q1, d17, d21
+    vsubl.u8    q2, d18, d22
+    vsubl.u8    q3, d19, d23
+    MATRIX_TRANSFORM_EACH_16BITS    q0, q1, q2, q3
 
-	// horizontal transform
-	DCT_ROW_TRANSFORM_TOTAL_16BITS		q0, q1, q2, q3, q8, q9, q10, q11
+    // horizontal transform
+    DCT_ROW_TRANSFORM_TOTAL_16BITS      q0, q1, q2, q3, q8, q9, q10, q11
 
-	// transform element
-	MATRIX_TRANSFORM_EACH_16BITS	q0, q1, q2, q3
+    // transform element
+    MATRIX_TRANSFORM_EACH_16BITS    q0, q1, q2, q3
 
-	//	vertical transform
-	DCT_ROW_TRANSFORM_TOTAL_16BITS		q0, q1, q2, q3, q8, q9, q10, q11
+    //  vertical transform
+    DCT_ROW_TRANSFORM_TOTAL_16BITS      q0, q1, q2, q3, q8, q9, q10, q11
 
-	vswp		d1, d2
-	vswp		d5, d6
-	vswp		q1, q2
-	vst1.s16		{q0, q1}, [r0]!
-	vst1.s16		{q2, q3}, [r0]!
+    vswp        d1, d2
+    vswp        d5, d6
+    vswp        q1, q2
+    vst1.s16        {q0, q1}, [r0]!
+    vst1.s16        {q2, q3}, [r0]!
 
-	pop		{r4}
+    pop     {r4}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN WelsQuant4x4_neon
-	vld1.s16		{q2}, [r1]
-	vld1.s16		{q0, q1}, [r0]
-	vld1.s16		{q3}, [r2]
+    vld1.s16        {q2}, [r1]
+    vld1.s16        {q0, q1}, [r0]
+    vld1.s16        {q3}, [r2]
 
-	vmov			q8, q2
+    vmov            q8, q2
 
-	NEWQUANT_COEF_EACH_16BITS	q0, q2, d4, d5, d6, d7, q9, q10, q11
-	vst1.s16		{q2}, [r0]!
+    NEWQUANT_COEF_EACH_16BITS   q0, q2, d4, d5, d6, d7, q9, q10, q11
+    vst1.s16        {q2}, [r0]!
 
-	NEWQUANT_COEF_EACH_16BITS	q1, q8, d16, d17, d6, d7, q9, q10, q11
-	vst1.s16		{q8}, [r0]!
+    NEWQUANT_COEF_EACH_16BITS   q1, q8, d16, d17, d6, d7, q9, q10, q11
+    vst1.s16        {q8}, [r0]!
 
 WELS_ASM_FUNC_END
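
The NEWQUANT_COEF_EACH_16BITS macro used by these quantizer functions implements, per coefficient, exactly what its comments describe: add the rounding offset to |coef|, multiply by the scale factor, shift down by 16, then restore the sign ("if x > 0, -= 0; else x -= 2x"). A per-coefficient C sketch, with the names ff/mf borrowed from the macro arguments:

    #include <stdint.h>

    /* level = ((|coef| + ff) * mf) >> 16, with the original sign put back. */
    static int16_t QuantCoef (int16_t coef, int32_t ff, int32_t mf) {
      int32_t a = coef >= 0 ? coef : -coef;
      int32_t level = ((a + ff) * mf) >> 16;
      return (int16_t)(coef > 0 ? level : -level);
    }

In the vector code ff and mf are eight-entry tables (q2/q3) applied lane by lane rather than scalars; the per-lane arithmetic is the same.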
 
@@ -627,266 +627,266 @@
 
 WELS_ASM_FUNC_BEGIN WelsQuant4x4Dc_neon
 
-	vld1.s16		{q0, q1}, [r0]
-	vdup.s16		q2, r1		// even ff range [0, 768]
-	vdup.s16		q3, r2
+    vld1.s16        {q0, q1}, [r0]
+    vdup.s16        q2, r1      // even ff range [0, 768]
+    vdup.s16        q3, r2
 
-	vmov			q8, q2
+    vmov            q8, q2
 
-	NEWQUANT_COEF_EACH_16BITS	q0, q2, d4, d5, d6, d7, q9, q10, q11
-	vst1.s16		{q2}, [r0]!
+    NEWQUANT_COEF_EACH_16BITS   q0, q2, d4, d5, d6, d7, q9, q10, q11
+    vst1.s16        {q2}, [r0]!
 
-	NEWQUANT_COEF_EACH_16BITS	q1, q8, d16, d17, d6, d7, q9, q10, q11
-	vst1.s16		{q8}, [r0]!
+    NEWQUANT_COEF_EACH_16BITS   q1, q8, d16, d17, d6, d7, q9, q10, q11
+    vst1.s16        {q8}, [r0]!
 
 WELS_ASM_FUNC_END
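
Note that WelsQuant4x4Dc_neon above differs from the plain 4x4 quantizer only in where ff and mf come from: they arrive as scalars in r1/r2 and are broadcast to every lane with vdup.s16 (the comment notes an ff range of [0, 768]), so all sixteen DC coefficients are quantized with one constant ff/mf pair, i.e. the per-coefficient formula sketched after WelsQuant4x4_neon with fixed arguments.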
 
 
 WELS_ASM_FUNC_BEGIN WelsQuantFour4x4_neon
-	vld1.s16		{q2}, [r1]
-	vld1.s16		{q3}, [r2]
-	mov				r1, r0
+    vld1.s16        {q2}, [r1]
+    vld1.s16        {q3}, [r2]
+    mov             r1, r0
 
-	vld1.s16		{q0, q1}, [r0]!
-	vmov			q8, q2
-	NEWQUANT_COEF_EACH_16BITS	q0, q8, d16, d17, d6, d7, q9, q10, q11
-	vst1.s16		{q8}, [r1]!
-	vmov			q8, q2
-	NEWQUANT_COEF_EACH_16BITS	q1, q8, d16, d17, d6, d7, q9, q10, q11
-	vst1.s16		{q8}, [r1]!
+    vld1.s16        {q0, q1}, [r0]!
+    vmov            q8, q2
+    NEWQUANT_COEF_EACH_16BITS   q0, q8, d16, d17, d6, d7, q9, q10, q11
+    vst1.s16        {q8}, [r1]!
+    vmov            q8, q2
+    NEWQUANT_COEF_EACH_16BITS   q1, q8, d16, d17, d6, d7, q9, q10, q11
+    vst1.s16        {q8}, [r1]!
 
-	vld1.s16		{q0, q1}, [r0]!
-	vmov			q8, q2
-	NEWQUANT_COEF_EACH_16BITS	q0, q8, d16, d17, d6, d7, q9, q10, q11
-	vst1.s16		{q8}, [r1]!
-	vmov			q8, q2
-	NEWQUANT_COEF_EACH_16BITS	q1, q8, d16, d17, d6, d7, q9, q10, q11
-	vst1.s16		{q8}, [r1]!
+    vld1.s16        {q0, q1}, [r0]!
+    vmov            q8, q2
+    NEWQUANT_COEF_EACH_16BITS   q0, q8, d16, d17, d6, d7, q9, q10, q11
+    vst1.s16        {q8}, [r1]!
+    vmov            q8, q2
+    NEWQUANT_COEF_EACH_16BITS   q1, q8, d16, d17, d6, d7, q9, q10, q11
+    vst1.s16        {q8}, [r1]!
 
-	vld1.s16		{q0, q1}, [r0]!
-	vmov			q8, q2
-	NEWQUANT_COEF_EACH_16BITS	q0, q8, d16, d17, d6, d7, q9, q10, q11
-	vst1.s16		{q8}, [r1]!
-	vmov			q8, q2
-	NEWQUANT_COEF_EACH_16BITS	q1, q8, d16, d17, d6, d7, q9, q10, q11
-	vst1.s16		{q8}, [r1]!
+    vld1.s16        {q0, q1}, [r0]!
+    vmov            q8, q2
+    NEWQUANT_COEF_EACH_16BITS   q0, q8, d16, d17, d6, d7, q9, q10, q11
+    vst1.s16        {q8}, [r1]!
+    vmov            q8, q2
+    NEWQUANT_COEF_EACH_16BITS   q1, q8, d16, d17, d6, d7, q9, q10, q11
+    vst1.s16        {q8}, [r1]!
 
-	vld1.s16		{q0, q1}, [r0]!
-	vmov			q8, q2
-	NEWQUANT_COEF_EACH_16BITS	q0, q8, d16, d17, d6, d7, q9, q10, q11
-	vst1.s16		{q8}, [r1]!
-	vmov			q8, q2
-	NEWQUANT_COEF_EACH_16BITS	q1, q8, d16, d17, d6, d7, q9, q10, q11
-	vst1.s16		{q8}, [r1]!
+    vld1.s16        {q0, q1}, [r0]!
+    vmov            q8, q2
+    NEWQUANT_COEF_EACH_16BITS   q0, q8, d16, d17, d6, d7, q9, q10, q11
+    vst1.s16        {q8}, [r1]!
+    vmov            q8, q2
+    NEWQUANT_COEF_EACH_16BITS   q1, q8, d16, d17, d6, d7, q9, q10, q11
+    vst1.s16        {q8}, [r1]!
 
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN WelsQuantFour4x4Max_neon
-	vld1.s16		{q2}, [r1]
-	vld1.s16		{q3}, [r2]
-	mov				r1, r0
+    vld1.s16        {q2}, [r1]
+    vld1.s16        {q3}, [r2]
+    mov             r1, r0
 
-	vld1.s16		{q0, q1}, [r0]!
-	vmov			q8, q2
-	NEWQUANT_COEF_EACH_16BITS_MAX	q0, q8, d16, d17, d6, d7, q9, q10, q11, d26
-	vst1.s16		{q8}, [r1]!
-	vmov			q12, q2
-	NEWQUANT_COEF_EACH_16BITS_MAX	q1, q12, d24, d25, d6, d7, q9, q10, q11, d28
-	vst1.s16		{q12}, [r1]!		// then 1st 16 elem in d26 & d28
+    vld1.s16        {q0, q1}, [r0]!
+    vmov            q8, q2
+    NEWQUANT_COEF_EACH_16BITS_MAX   q0, q8, d16, d17, d6, d7, q9, q10, q11, d26
+    vst1.s16        {q8}, [r1]!
+    vmov            q12, q2
+    NEWQUANT_COEF_EACH_16BITS_MAX   q1, q12, d24, d25, d6, d7, q9, q10, q11, d28
+    vst1.s16        {q12}, [r1]!        // then 1st 16 elem in d26 & d28
 
-	vld1.s16		{q0, q1}, [r0]!
-	vmov			q8, q2
-	NEWQUANT_COEF_EACH_16BITS_MAX	q0, q8, d16, d17, d6, d7, q9, q10, q11, d27
-	vst1.s16		{q8}, [r1]!
-	vmov			q12, q2
-	NEWQUANT_COEF_EACH_16BITS_MAX	q1, q12, d24, d25, d6, d7, q9, q10, q11, d29
-	vst1.s16		{q12}, [r1]!	// then 2nd 16 elem in d27 & d29
+    vld1.s16        {q0, q1}, [r0]!
+    vmov            q8, q2
+    NEWQUANT_COEF_EACH_16BITS_MAX   q0, q8, d16, d17, d6, d7, q9, q10, q11, d27
+    vst1.s16        {q8}, [r1]!
+    vmov            q12, q2
+    NEWQUANT_COEF_EACH_16BITS_MAX   q1, q12, d24, d25, d6, d7, q9, q10, q11, d29
+    vst1.s16        {q12}, [r1]!    // then 2nd 16 elem in d27 & d29
 
-	SELECT_MAX_IN_ABS_COEF	q13, q14, q0, d0, d1
-	vst1.s32		{d0[0]}, [r3]!
+    SELECT_MAX_IN_ABS_COEF  q13, q14, q0, d0, d1
+    vst1.s32        {d0[0]}, [r3]!
 
-	///////////
-	vld1.s16		{q0, q1}, [r0]!
-	vmov			q8, q2
-	NEWQUANT_COEF_EACH_16BITS_MAX	q0, q8, d16, d17, d6, d7, q9, q10, q11, d26
-	vst1.s16		{q8}, [r1]!
-	vmov			q12, q2
-	NEWQUANT_COEF_EACH_16BITS_MAX	q1, q12, d24, d25, d6, d7, q9, q10, q11, d28
-	vst1.s16		{q12}, [r1]!		// then 3rd 16 elem in d26 & d28
+    ///////////
+    vld1.s16        {q0, q1}, [r0]!
+    vmov            q8, q2
+    NEWQUANT_COEF_EACH_16BITS_MAX   q0, q8, d16, d17, d6, d7, q9, q10, q11, d26
+    vst1.s16        {q8}, [r1]!
+    vmov            q12, q2
+    NEWQUANT_COEF_EACH_16BITS_MAX   q1, q12, d24, d25, d6, d7, q9, q10, q11, d28
+    vst1.s16        {q12}, [r1]!        // then 3rd 16 elem in d26 & d28
 
-	vld1.s16		{q0, q1}, [r0]!
-	vmov			q8, q2
-	NEWQUANT_COEF_EACH_16BITS_MAX	q0, q8, d16, d17, d6, d7, q9, q10, q11, d27
-	vst1.s16		{q8}, [r1]!
-	vmov			q12, q2
-	NEWQUANT_COEF_EACH_16BITS_MAX	q1, q12, d24, d25, d6, d7, q9, q10, q11, d29
-	vst1.s16		{q12}, [r1]!	// then 4th 16 elem in d27 & d29
+    vld1.s16        {q0, q1}, [r0]!
+    vmov            q8, q2
+    NEWQUANT_COEF_EACH_16BITS_MAX   q0, q8, d16, d17, d6, d7, q9, q10, q11, d27
+    vst1.s16        {q8}, [r1]!
+    vmov            q12, q2
+    NEWQUANT_COEF_EACH_16BITS_MAX   q1, q12, d24, d25, d6, d7, q9, q10, q11, d29
+    vst1.s16        {q12}, [r1]!    // then 4th 16 elem in d27 & d29
 
-	SELECT_MAX_IN_ABS_COEF	q13, q14, q0, d0, d1
-	vst1.s32		{d0[0]}, [r3]!
+    SELECT_MAX_IN_ABS_COEF  q13, q14, q0, d0, d1
+    vst1.s32        {d0[0]}, [r3]!
 
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN WelsHadamardT4Dc_neon
-	push	{r2,r3}
-	mov		r2, #64	// 2*16*sizeof(int16_t)
-	add		r3, r1, #32
+    push    {r2,r3}
+    mov     r2, #64 // 2*16*sizeof(int16_t)
+    add     r3, r1, #32
 
-	vld1.s16		{d0}, [r1], r2
-	vld1.s16		{d1}, [r3], r2
-	vld1.s16		{d4}, [r1], r2
-	vld1.s16		{d5}, [r3], r2
-	vld1.s16		{d2}, [r1], r2
-	vld1.s16		{d3}, [r3], r2
-	vld1.s16		{d6}, [r1], r2
-	vld1.s16		{d7}, [r3], r2
-	vtrn.16		q0, q2		// d0[0 4], d1[1 5]
-	vtrn.16		q1, q3		// d2[2 6], d3[3 7]
+    vld1.s16        {d0}, [r1], r2
+    vld1.s16        {d1}, [r3], r2
+    vld1.s16        {d4}, [r1], r2
+    vld1.s16        {d5}, [r3], r2
+    vld1.s16        {d2}, [r1], r2
+    vld1.s16        {d3}, [r3], r2
+    vld1.s16        {d6}, [r1], r2
+    vld1.s16        {d7}, [r3], r2
+    vtrn.16     q0, q2      // d0[0 4], d1[1 5]
+    vtrn.16     q1, q3      // d2[2 6], d3[3 7]
 
-	vld1.s16		{d16}, [r1], r2
-	vld1.s16		{d17}, [r3], r2
-	vld1.s16		{d20}, [r1], r2
-	vld1.s16		{d21}, [r3], r2
-	vld1.s16		{d18}, [r1], r2
-	vld1.s16		{d19}, [r3], r2
-	vld1.s16		{d22}, [r1], r2
-	vld1.s16		{d23}, [r3], r2
-	vtrn.16		q8, q10		//d16[08 12],d17[09 13]
-	vtrn.16		q9, q11		//d18[10 14],d19[11 15]
+    vld1.s16        {d16}, [r1], r2
+    vld1.s16        {d17}, [r3], r2
+    vld1.s16        {d20}, [r1], r2
+    vld1.s16        {d21}, [r3], r2
+    vld1.s16        {d18}, [r1], r2
+    vld1.s16        {d19}, [r3], r2
+    vld1.s16        {d22}, [r1], r2
+    vld1.s16        {d23}, [r3], r2
+    vtrn.16     q8, q10     //d16[08 12],d17[09 13]
+    vtrn.16     q9, q11     //d18[10 14],d19[11 15]
 
-	vtrn.32		q0, q8		// d0 [0 4 08 12] = dct[idx],		d1[1 5 09 13] = dct[idx+16]
-	vtrn.32		q1, q9		// d2 [2 6 10 14] = dct[idx+64],	d3[3 7 11 15] = dct[idx+80]
+    vtrn.32     q0, q8      // d0 [0 4 08 12] = dct[idx],       d1[1 5 09 13] = dct[idx+16]
+    vtrn.32     q1, q9      // d2 [2 6 10 14] = dct[idx+64],    d3[3 7 11 15] = dct[idx+80]
 
-	ROW_TRANSFORM_0_STEP	d0, d1, d3, d2, q8, q11, q10, q9
+    ROW_TRANSFORM_0_STEP    d0, d1, d3, d2, q8, q11, q10, q9
 
-	TRANSFORM_4BYTES		q0, q1, q3, q2, q8, q11, q10, q9
+    TRANSFORM_4BYTES        q0, q1, q3, q2, q8, q11, q10, q9
 
-	// transform element 32bits
-	vtrn.s32		q0, q1				//[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
-	vtrn.s32		q2, q3				//[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
-	vswp			d1, d4				//[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
-	vswp			d3, d6				//[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
+    // transform element 32bits
+    vtrn.s32        q0, q1              //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
+    vtrn.s32        q2, q3              //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
+    vswp            d1, d4              //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
+    vswp            d3, d6              //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
 
-	COL_TRANSFORM_0_STEP	q0, q1, q3, q2, q8, q11, q10, q9
+    COL_TRANSFORM_0_STEP    q0, q1, q3, q2, q8, q11, q10, q9
 
-	TRANSFORM_4BYTES		q0, q1, q3, q2, q8, q11, q10, q9
+    TRANSFORM_4BYTES        q0, q1, q3, q2, q8, q11, q10, q9
 
-	vrshrn.s32		d16, q0, #1
-	vrshrn.s32		d17, q1, #1
-	vrshrn.s32		d18, q2, #1
-	vrshrn.s32		d19, q3, #1
-	vst1.16	{q8, q9}, [r0]	//store
+    vrshrn.s32      d16, q0, #1
+    vrshrn.s32      d17, q1, #1
+    vrshrn.s32      d18, q2, #1
+    vrshrn.s32      d19, q3, #1
+    vst1.16 {q8, q9}, [r0]  //store
 
-	pop		{r2,r3}
+    pop     {r2,r3}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN WelsHadamardQuant2x2_neon
 
-	vdup.s16	d1, r1				//ff
-	vdup.s16	d2, r2				//mf
-	veor		d3, d3
+    vdup.s16    d1, r1              //ff
+    vdup.s16    d2, r2              //mf
+    veor        d3, d3
 
-	mov			r1, #32
-	mov			r2, r0
+    mov         r1, #32
+    mov         r2, r0
 
-	vld1.s16	{d0[0]}, [r0], r1		//rs[00]
-	vst1.s16	{d3[0]}, [r2], r1		//rs[00]=0
-	vld1.s16	{d0[1]}, [r0], r1		//rs[16]
-	vst1.s16	{d3[0]}, [r2], r1		//rs[16]=0
-	vld1.s16	{d0[2]}, [r0], r1		//rs[32]
-	vst1.s16	{d3[0]}, [r2], r1		//rs[32]=0
-	vld1.s16	{d0[3]}, [r0], r1		//rs[48]
-	vst1.s16	{d3[0]}, [r2], r1		//rs[48]=0
+    vld1.s16    {d0[0]}, [r0], r1       //rs[00]
+    vst1.s16    {d3[0]}, [r2], r1       //rs[00]=0
+    vld1.s16    {d0[1]}, [r0], r1       //rs[16]
+    vst1.s16    {d3[0]}, [r2], r1       //rs[16]=0
+    vld1.s16    {d0[2]}, [r0], r1       //rs[32]
+    vst1.s16    {d3[0]}, [r2], r1       //rs[32]=0
+    vld1.s16    {d0[3]}, [r0], r1       //rs[48]
+    vst1.s16    {d3[0]}, [r2], r1       //rs[48]=0
 
-	HDM_QUANT_2x2_TOTAL_16BITS	d0, d4, d5		// output d5
+    HDM_QUANT_2x2_TOTAL_16BITS  d0, d4, d5      // output d5
 
-	HDM_QUANT_2x2_TOTAL_16BITS	d5, d4, d0		// output d0
+    HDM_QUANT_2x2_TOTAL_16BITS  d5, d4, d0      // output d0
 
-	QUANT_DUALWORD_COEF_EACH_16BITS	d0, d1, d2, d3, q2
+    QUANT_DUALWORD_COEF_EACH_16BITS d0, d1, d2, d3, q2
 
-	vst1.s16	d1, [r3]		// store to dct
-	ldr			r2, [sp, #0]
-	vst1.s16	d1, [r2]		// store to block
+    vst1.s16    d1, [r3]        // store to dct
+    ldr         r2, [sp, #0]
+    vst1.s16    d1, [r2]        // store to block
 
-	mov			r1, #1
-	vdup.s16	d3, r1
-	DC_ZERO_COUNT_IN_DUALWORD	d1, d0, d3
+    mov         r1, #1
+    vdup.s16    d3, r1
+    DC_ZERO_COUNT_IN_DUALWORD   d1, d0, d3
 
-	vmov	r0, r1, d0
-	and		r0, #0x07		// range [0~4]
-	rsb		r0, #4
+    vmov    r0, r1, d0
+    and     r0, #0x07       // range [0~4]
+    rsb     r0, #4
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN WelsHadamardQuant2x2SkipKernel_neon
 
-	vdup.s16	d3, r1
-	mov			r1, #32
-	vld1.s16	{d0[0]}, [r0], r1		//rs[00]
-	vld1.s16	{d0[1]}, [r0], r1		//rs[16]
-	vld1.s16	{d0[2]}, [r0], r1		//rs[32]
-	vld1.s16	{d0[3]}, [r0], r1		//rs[48]
+    vdup.s16    d3, r1
+    mov         r1, #32
+    vld1.s16    {d0[0]}, [r0], r1       //rs[00]
+    vld1.s16    {d0[1]}, [r0], r1       //rs[16]
+    vld1.s16    {d0[2]}, [r0], r1       //rs[32]
+    vld1.s16    {d0[3]}, [r0], r1       //rs[48]
 
-	HDM_QUANT_2x2_TOTAL_16BITS	d0, d1, d2		// output d2
+    HDM_QUANT_2x2_TOTAL_16BITS  d0, d1, d2      // output d2
 
-	HDM_QUANT_2x2_TOTAL_16BITS	d2, d1, d0		// output d0
+    HDM_QUANT_2x2_TOTAL_16BITS  d2, d1, d0      // output d0
 
-	vabs.s16	d1, d0
-	vcgt.s16	d1, d1, d3		// abs(dct[i])>threshold;
-	vmov	r0, r1, d1
-	orr		r0, r1
+    vabs.s16    d1, d0
+    vcgt.s16    d1, d1, d3      // abs(dct[i])>threshold;
+    vmov    r0, r1, d1
+    orr     r0, r1
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN WelsGetNoneZeroCount_neon
-	push	{r1}
-	vld1.s16	{q0, q1}, [r0]
-	vmov.s16	q8, #1
+    push    {r1}
+    vld1.s16    {q0, q1}, [r0]
+    vmov.s16    q8, #1
 
-	ZERO_COUNT_IN_2_QUARWORD	q0, q1, q8, d0, d1, d2, d3
-	vmov	r0, r1, d0
-	and		r0, #0x1F	// range [0~16]
-	rsb		r0, #16
-	pop		{r1}
+    ZERO_COUNT_IN_2_QUARWORD    q0, q1, q8, d0, d1, d2, d3
+    vmov    r0, r1, d0
+    and     r0, #0x1F   // range [0~16]
+    rsb     r0, #16
+    pop     {r1}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN WelsDequant4x4_neon
-	vld1.s16	{q0, q1}, [r0]
-	vld1.u16	{q2}, [r1]
+    vld1.s16    {q0, q1}, [r0]
+    vld1.u16    {q2}, [r1]
 
-	vmul.s16	q8, q0, q2
-	vmul.s16	q9, q1, q2
+    vmul.s16    q8, q0, q2
+    vmul.s16    q9, q1, q2
 
-	vst1.s16	{q8, q9}, [r0]
+    vst1.s16    {q8, q9}, [r0]
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN WelsDequantFour4x4_neon
-	vld1.u16	{q12}, [r1]
-	mov		r1, r0
-	vld1.s16	{q0, q1}, [r0]!
-	vld1.s16	{q2, q3}, [r0]!
-	vmul.s16	q0, q0, q12
-	vld1.s16	{q8, q9}, [r0]!
-	vmul.s16	q1, q1, q12
-	vld1.s16	{q10, q11}, [r0]!
+    vld1.u16    {q12}, [r1]
+    mov     r1, r0
+    vld1.s16    {q0, q1}, [r0]!
+    vld1.s16    {q2, q3}, [r0]!
+    vmul.s16    q0, q0, q12
+    vld1.s16    {q8, q9}, [r0]!
+    vmul.s16    q1, q1, q12
+    vld1.s16    {q10, q11}, [r0]!
 
-	vst1.s16	{q0, q1}, [r1]!
+    vst1.s16    {q0, q1}, [r1]!
 
-	vmul.s16	q2, q2, q12
-	vmul.s16	q3, q3, q12
-	vmul.s16	q8, q8, q12
-	vst1.s16	{q2, q3}, [r1]!
+    vmul.s16    q2, q2, q12
+    vmul.s16    q3, q3, q12
+    vmul.s16    q8, q8, q12
+    vst1.s16    {q2, q3}, [r1]!
 
-	vmul.s16	q9, q9, q12
-	vmul.s16	q10, q10, q12
-	vmul.s16	q11, q11, q12
-	vst1.s16	{q8, q9}, [r1]!
-	vst1.s16	{q10, q11}, [r1]!
+    vmul.s16    q9, q9, q12
+    vmul.s16    q10, q10, q12
+    vmul.s16    q11, q11, q12
+    vst1.s16    {q8, q9}, [r1]!
+    vst1.s16    {q10, q11}, [r1]!
 
 WELS_ASM_FUNC_END
 
@@ -893,258 +893,258 @@
 
 WELS_ASM_FUNC_BEGIN WelsDequantIHadamard4x4_neon
 
-	vld1.s16	{q0, q1}, [r0]
-	vdup.s16	q8, r1
+    vld1.s16    {q0, q1}, [r0]
+    vdup.s16    q8, r1
 
-	IHDM_4x4_TOTAL_16BITS	q0, q2, q3
-	IHDM_4x4_TOTAL_16BITS	q1, q2, q3
+    IHDM_4x4_TOTAL_16BITS   q0, q2, q3
+    IHDM_4x4_TOTAL_16BITS   q1, q2, q3
 
-	MATRIX_TRANSFORM_EACH_16BITS	d0, d1, d2, d3
+    MATRIX_TRANSFORM_EACH_16BITS    d0, d1, d2, d3
 
-	IHDM_4x4_TOTAL_16BITS	q0, q2, q3
-	vmul.s16	q0, q8
+    IHDM_4x4_TOTAL_16BITS   q0, q2, q3
+    vmul.s16    q0, q8
 
-	IHDM_4x4_TOTAL_16BITS	q1, q2, q3
-	vmul.s16	q1, q8
+    IHDM_4x4_TOTAL_16BITS   q1, q2, q3
+    vmul.s16    q1, q8
 
-	MATRIX_TRANSFORM_EACH_16BITS	d0, d1, d2, d3
-	vst1.s16	{q0, q1}, [r0]
+    MATRIX_TRANSFORM_EACH_16BITS    d0, d1, d2, d3
+    vst1.s16    {q0, q1}, [r0]
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN WelsIDctT4Rec_neon
-	vld1.u32		{d16[0]}, [r2], r3
-	push			{r4}
-	ldr				r4, [sp, #4]
-	vld1.u32		{d16[1]}, [r2], r3
+    vld1.u32        {d16[0]}, [r2], r3
+    push            {r4}
+    ldr             r4, [sp, #4]
+    vld1.u32        {d16[1]}, [r2], r3
 
-	vld4.s16		{d0, d1, d2, d3}, [r4]		// cost 3 cycles!
-	vld1.u32		{d17[0]}, [r2], r3
-	vld1.u32		{d17[1]}, [r2], r3			// q7 is pred
+    vld4.s16        {d0, d1, d2, d3}, [r4]      // cost 3 cycles!
+    vld1.u32        {d17[0]}, [r2], r3
+    vld1.u32        {d17[1]}, [r2], r3          // q7 is pred
 
-	ROW_TRANSFORM_1_STEP_TOTAL_16BITS		d0, d1, d2, d3, d4, d5, d6, d7
+    ROW_TRANSFORM_1_STEP_TOTAL_16BITS       d0, d1, d2, d3, d4, d5, d6, d7
 
-	TRANSFORM_TOTAL_16BITS		d0, d1, d2, d3, d4, d5, d6, d7
+    TRANSFORM_TOTAL_16BITS      d0, d1, d2, d3, d4, d5, d6, d7
 
-	MATRIX_TRANSFORM_EACH_16BITS	d0, d1, d2, d3
+    MATRIX_TRANSFORM_EACH_16BITS    d0, d1, d2, d3
 
-	ROW_TRANSFORM_1_STEP_TOTAL_16BITS		d0, d1, d2, d3, d4, d5, d6, d7
+    ROW_TRANSFORM_1_STEP_TOTAL_16BITS       d0, d1, d2, d3, d4, d5, d6, d7
 
-	TRANSFORM_TOTAL_16BITS		d0, d1, d2, d3, d4, d5, d6, d7
-	vrshr.s16		d0, d0, #6
-	vrshr.s16		d1, d1, #6
-	vrshr.s16		d2, d2, #6
-	vrshr.s16		d3, d3, #6
+    TRANSFORM_TOTAL_16BITS      d0, d1, d2, d3, d4, d5, d6, d7
+    vrshr.s16       d0, d0, #6
+    vrshr.s16       d1, d1, #6
+    vrshr.s16       d2, d2, #6
+    vrshr.s16       d3, d3, #6
 
-	//after rounding 6, clip into [0, 255]
-	vmovl.u8		q2,d16
-	vadd.s16		q0,q2
-	vqmovun.s16	d16,q0
-	vst1.32		{d16[0]},[r0],r1
-	vst1.32		{d16[1]},[r0],r1
+    //after rounding 6, clip into [0, 255]
+    vmovl.u8        q2,d16
+    vadd.s16        q0,q2
+    vqmovun.s16 d16,q0
+    vst1.32     {d16[0]},[r0],r1
+    vst1.32     {d16[1]},[r0],r1
 
-	vmovl.u8		q2,d17
-	vadd.s16		q1,q2
-	vqmovun.s16	d17,q1
-	vst1.32		{d17[0]},[r0],r1
-	vst1.32		{d17[1]},[r0]
+    vmovl.u8        q2,d17
+    vadd.s16        q1,q2
+    vqmovun.s16 d17,q1
+    vst1.32     {d17[0]},[r0],r1
+    vst1.32     {d17[1]},[r0]
 
-	pop			{r4}
+    pop         {r4}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN WelsIDctFourT4Rec_neon
 
-	vld1.u64		{d24}, [r2], r3
-	push			{r4}
-	ldr				r4, [sp, #4]
-	vld1.u64		{d25}, [r2], r3
+    vld1.u64        {d24}, [r2], r3
+    push            {r4}
+    ldr             r4, [sp, #4]
+    vld1.u64        {d25}, [r2], r3
 
-	vld4.s16		{d0, d1, d2, d3}, [r4]!		// cost 3 cycles!
-	vld1.u64		{d26}, [r2], r3
-	vld1.u64		{d27}, [r2], r3
-	vld4.s16		{d4, d5, d6, d7}, [r4]!		// cost 3 cycles!
-	vswp			d1, d4
-	vswp			d3, d6
-	vswp			q1, q2						// q0~q3
+    vld4.s16        {d0, d1, d2, d3}, [r4]!     // cost 3 cycles!
+    vld1.u64        {d26}, [r2], r3
+    vld1.u64        {d27}, [r2], r3
+    vld4.s16        {d4, d5, d6, d7}, [r4]!     // cost 3 cycles!
+    vswp            d1, d4
+    vswp            d3, d6
+    vswp            q1, q2                      // q0~q3
 
-	ROW_TRANSFORM_1_STEP_TOTAL_16BITS		q0, q1, q2, q3, q8, q9, q10, q11
+    ROW_TRANSFORM_1_STEP_TOTAL_16BITS       q0, q1, q2, q3, q8, q9, q10, q11
 
-	TRANSFORM_TOTAL_16BITS		q0, q1, q2, q3, q8, q9, q10, q11
+    TRANSFORM_TOTAL_16BITS      q0, q1, q2, q3, q8, q9, q10, q11
 
-	MATRIX_TRANSFORM_EACH_16BITS	q0, q1, q2, q3
+    MATRIX_TRANSFORM_EACH_16BITS    q0, q1, q2, q3
 
-	ROW_TRANSFORM_1_STEP_TOTAL_16BITS		q0, q1, q2, q3, q8, q9, q10, q11
+    ROW_TRANSFORM_1_STEP_TOTAL_16BITS       q0, q1, q2, q3, q8, q9, q10, q11
 
-	TRANSFORM_TOTAL_16BITS		q0, q1, q2, q3, q8, q9, q10, q11
-	vrshr.s16		q0, q0, #6
-	vrshr.s16		q1, q1, #6
-	vrshr.s16		q2, q2, #6
-	vrshr.s16		q3, q3, #6
+    TRANSFORM_TOTAL_16BITS      q0, q1, q2, q3, q8, q9, q10, q11
+    vrshr.s16       q0, q0, #6
+    vrshr.s16       q1, q1, #6
+    vrshr.s16       q2, q2, #6
+    vrshr.s16       q3, q3, #6
 
-	//after rounding 6, clip into [0, 255]
-	vmovl.u8		q8,d24
-	vadd.s16		q0,q8
-	vqmovun.s16	d24,q0
-	vst1.u8		{d24},[r0],r1
+    //after rounding 6, clip into [0, 255]
+    vmovl.u8        q8,d24
+    vadd.s16        q0,q8
+    vqmovun.s16 d24,q0
+    vst1.u8     {d24},[r0],r1
 
-	vmovl.u8		q8,d25
-	vadd.s16		q1,q8
-	vqmovun.s16	d25,q1
-	vst1.u8		{d25},[r0],r1
+    vmovl.u8        q8,d25
+    vadd.s16        q1,q8
+    vqmovun.s16 d25,q1
+    vst1.u8     {d25},[r0],r1
 
-	vmovl.u8		q8,d26
-	vadd.s16		q2,q8
-	vqmovun.s16	d26,q2
-	vst1.u8		{d26},[r0],r1
+    vmovl.u8        q8,d26
+    vadd.s16        q2,q8
+    vqmovun.s16 d26,q2
+    vst1.u8     {d26},[r0],r1
 
-	vmovl.u8		q8,d27
-	vadd.s16		q3,q8
-	vqmovun.s16	d27,q3
-	vst1.u8		{d27},[r0],r1
+    vmovl.u8        q8,d27
+    vadd.s16        q3,q8
+    vqmovun.s16 d27,q3
+    vst1.u8     {d27},[r0],r1
 
-	vld1.u64		{d24}, [r2], r3
-	vld1.u64		{d25}, [r2], r3
+    vld1.u64        {d24}, [r2], r3
+    vld1.u64        {d25}, [r2], r3
 
-	vld4.s16		{d0, d1, d2, d3}, [r4]!		// cost 3 cycles!
-	vld1.u64		{d26}, [r2], r3
-	vld1.u64		{d27}, [r2], r3
-	vld4.s16		{d4, d5, d6, d7}, [r4]!		// cost 3 cycles!
-	vswp			d1, d4
-	vswp			d3, d6
-	vswp			q1, q2						// q0~q3
+    vld4.s16        {d0, d1, d2, d3}, [r4]!     // cost 3 cycles!
+    vld1.u64        {d26}, [r2], r3
+    vld1.u64        {d27}, [r2], r3
+    vld4.s16        {d4, d5, d6, d7}, [r4]!     // cost 3 cycles!
+    vswp            d1, d4
+    vswp            d3, d6
+    vswp            q1, q2                      // q0~q3
 
-	ROW_TRANSFORM_1_STEP_TOTAL_16BITS		q0, q1, q2, q3, q8, q9, q10, q11
+    ROW_TRANSFORM_1_STEP_TOTAL_16BITS       q0, q1, q2, q3, q8, q9, q10, q11
 
-	TRANSFORM_TOTAL_16BITS		q0, q1, q2, q3, q8, q9, q10, q11
+    TRANSFORM_TOTAL_16BITS      q0, q1, q2, q3, q8, q9, q10, q11
 
-	MATRIX_TRANSFORM_EACH_16BITS	q0, q1, q2, q3
+    MATRIX_TRANSFORM_EACH_16BITS    q0, q1, q2, q3
 
-	ROW_TRANSFORM_1_STEP_TOTAL_16BITS		q0, q1, q2, q3, q8, q9, q10, q11
+    ROW_TRANSFORM_1_STEP_TOTAL_16BITS       q0, q1, q2, q3, q8, q9, q10, q11
 
-	TRANSFORM_TOTAL_16BITS		q0, q1, q2, q3, q8, q9, q10, q11
-	vrshr.s16		q0, q0, #6
-	vrshr.s16		q1, q1, #6
-	vrshr.s16		q2, q2, #6
-	vrshr.s16		q3, q3, #6
+    TRANSFORM_TOTAL_16BITS      q0, q1, q2, q3, q8, q9, q10, q11
+    vrshr.s16       q0, q0, #6
+    vrshr.s16       q1, q1, #6
+    vrshr.s16       q2, q2, #6
+    vrshr.s16       q3, q3, #6
 
-	//after rounding 6, clip into [0, 255]
-	vmovl.u8		q8,d24
-	vadd.s16		q0,q8
-	vqmovun.s16	d24,q0
-	vst1.u8		{d24},[r0],r1
+    //after rounding 6, clip into [0, 255]
+    vmovl.u8        q8,d24
+    vadd.s16        q0,q8
+    vqmovun.s16 d24,q0
+    vst1.u8     {d24},[r0],r1
 
-	vmovl.u8		q8,d25
-	vadd.s16		q1,q8
-	vqmovun.s16	d25,q1
-	vst1.u8		{d25},[r0],r1
+    vmovl.u8        q8,d25
+    vadd.s16        q1,q8
+    vqmovun.s16 d25,q1
+    vst1.u8     {d25},[r0],r1
 
-	vmovl.u8		q8,d26
-	vadd.s16		q2,q8
-	vqmovun.s16	d26,q2
-	vst1.u8		{d26},[r0],r1
+    vmovl.u8        q8,d26
+    vadd.s16        q2,q8
+    vqmovun.s16 d26,q2
+    vst1.u8     {d26},[r0],r1
 
-	vmovl.u8		q8,d27
-	vadd.s16		q3,q8
-	vqmovun.s16	d27,q3
-	vst1.u8		{d27},[r0],r1
+    vmovl.u8        q8,d27
+    vadd.s16        q3,q8
+    vqmovun.s16 d27,q3
+    vst1.u8     {d27},[r0],r1
 
-	pop			{r4}
+    pop         {r4}
 WELS_ASM_FUNC_END
 
 
 WELS_ASM_FUNC_BEGIN WelsIDctRecI16x16Dc_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
+    push        {r4}
+    ldr         r4, [sp, #4]
 
-	vld1.s16	{q8,q9}, [r4]
-	vrshr.s16		q8, q8, #6
-	vrshr.s16		q9, q9, #6
+    vld1.s16    {q8,q9}, [r4]
+    vrshr.s16       q8, q8, #6
+    vrshr.s16       q9, q9, #6
 
-	vdup.s16	d20, d16[0]
-	vdup.s16	d21, d16[1]
-	vdup.s16	d22, d16[2]
-	vdup.s16	d23, d16[3]
+    vdup.s16    d20, d16[0]
+    vdup.s16    d21, d16[1]
+    vdup.s16    d22, d16[2]
+    vdup.s16    d23, d16[3]
 
-	vld1.u8	{q0}, [r2], r3
-	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP	d0, d1, q10, q11, q12, q13
-	vst1.u8	{q0}, [r0], r1
+    vld1.u8 {q0}, [r2], r3
+    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP   d0, d1, q10, q11, q12, q13
+    vst1.u8 {q0}, [r0], r1
 
-	vld1.u8	{q0}, [r2], r3
-	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP	d0, d1, q10, q11, q12, q13
-	vst1.u8	{q0}, [r0], r1
+    vld1.u8 {q0}, [r2], r3
+    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP   d0, d1, q10, q11, q12, q13
+    vst1.u8 {q0}, [r0], r1
 
-	vld1.u8	{q0}, [r2], r3
-	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP	d0, d1, q10, q11, q12, q13
-	vst1.u8	{q0}, [r0], r1
+    vld1.u8 {q0}, [r2], r3
+    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP   d0, d1, q10, q11, q12, q13
+    vst1.u8 {q0}, [r0], r1
 
-	vld1.u8	{q0}, [r2], r3
-	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP	d0, d1, q10, q11, q12, q13
-	vst1.u8	{q0}, [r0], r1
+    vld1.u8 {q0}, [r2], r3
+    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP   d0, d1, q10, q11, q12, q13
+    vst1.u8 {q0}, [r0], r1
 
-	vdup.s16	d20, d17[0]
-	vdup.s16	d21, d17[1]
-	vdup.s16	d22, d17[2]
-	vdup.s16	d23, d17[3]
+    vdup.s16    d20, d17[0]
+    vdup.s16    d21, d17[1]
+    vdup.s16    d22, d17[2]
+    vdup.s16    d23, d17[3]
 
-	vld1.u8	{q0}, [r2], r3
-	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP	d0, d1, q10, q11, q12, q13
-	vst1.u8	{q0}, [r0], r1
+    vld1.u8 {q0}, [r2], r3
+    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP   d0, d1, q10, q11, q12, q13
+    vst1.u8 {q0}, [r0], r1
 
-	vld1.u8	{q0}, [r2], r3
-	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP	d0, d1, q10, q11, q12, q13
-	vst1.u8	{q0}, [r0], r1
+    vld1.u8 {q0}, [r2], r3
+    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP   d0, d1, q10, q11, q12, q13
+    vst1.u8 {q0}, [r0], r1
 
-	vld1.u8	{q0}, [r2], r3
-	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP	d0, d1, q10, q11, q12, q13
-	vst1.u8	{q0}, [r0], r1
+    vld1.u8 {q0}, [r2], r3
+    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP   d0, d1, q10, q11, q12, q13
+    vst1.u8 {q0}, [r0], r1
 
-	vld1.u8	{q0}, [r2], r3
-	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP	d0, d1, q10, q11, q12, q13
-	vst1.u8	{q0}, [r0], r1
+    vld1.u8 {q0}, [r2], r3
+    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP   d0, d1, q10, q11, q12, q13
+    vst1.u8 {q0}, [r0], r1
 
-	vdup.s16	d20, d18[0]
-	vdup.s16	d21, d18[1]
-	vdup.s16	d22, d18[2]
-	vdup.s16	d23, d18[3]
+    vdup.s16    d20, d18[0]
+    vdup.s16    d21, d18[1]
+    vdup.s16    d22, d18[2]
+    vdup.s16    d23, d18[3]
 
-	vld1.u8	{q0}, [r2], r3
-	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP	d0, d1, q10, q11, q12, q13
-	vst1.u8	{q0}, [r0], r1
+    vld1.u8 {q0}, [r2], r3
+    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP   d0, d1, q10, q11, q12, q13
+    vst1.u8 {q0}, [r0], r1
 
-	vld1.u8	{q0}, [r2], r3
-	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP	d0, d1, q10, q11, q12, q13
-	vst1.u8	{q0}, [r0], r1
+    vld1.u8 {q0}, [r2], r3
+    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP   d0, d1, q10, q11, q12, q13
+    vst1.u8 {q0}, [r0], r1
 
-	vld1.u8	{q0}, [r2], r3
-	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP	d0, d1, q10, q11, q12, q13
-	vst1.u8	{q0}, [r0], r1
+    vld1.u8 {q0}, [r2], r3
+    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP   d0, d1, q10, q11, q12, q13
+    vst1.u8 {q0}, [r0], r1
 
-	vld1.u8	{q0}, [r2], r3
-	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP	d0, d1, q10, q11, q12, q13
-	vst1.u8	{q0}, [r0], r1
+    vld1.u8 {q0}, [r2], r3
+    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP   d0, d1, q10, q11, q12, q13
+    vst1.u8 {q0}, [r0], r1
 
-	vdup.s16	d20, d19[0]
-	vdup.s16	d21, d19[1]
-	vdup.s16	d22, d19[2]
-	vdup.s16	d23, d19[3]
+    vdup.s16    d20, d19[0]
+    vdup.s16    d21, d19[1]
+    vdup.s16    d22, d19[2]
+    vdup.s16    d23, d19[3]
 
-	vld1.u8	{q0}, [r2], r3
-	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP	d0, d1, q10, q11, q12, q13
-	vst1.u8	{q0}, [r0], r1
+    vld1.u8 {q0}, [r2], r3
+    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP   d0, d1, q10, q11, q12, q13
+    vst1.u8 {q0}, [r0], r1
 
-	vld1.u8	{q0}, [r2], r3
-	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP	d0, d1, q10, q11, q12, q13
-	vst1.u8	{q0}, [r0], r1
+    vld1.u8 {q0}, [r2], r3
+    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP   d0, d1, q10, q11, q12, q13
+    vst1.u8 {q0}, [r0], r1
 
-	vld1.u8	{q0}, [r2], r3
-	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP	d0, d1, q10, q11, q12, q13
-	vst1.u8	{q0}, [r0], r1
+    vld1.u8 {q0}, [r2], r3
+    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP   d0, d1, q10, q11, q12, q13
+    vst1.u8 {q0}, [r0], r1
 
-	vld1.u8	{q0}, [r2], r3
-	MB_PRED_8BITS_ADD_DCT_16BITS_CLIP	d0, d1, q10, q11, q12, q13
-	vst1.u8	{q0}, [r0], r1
+    vld1.u8 {q0}, [r2], r3
+    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP   d0, d1, q10, q11, q12, q13
+    vst1.u8 {q0}, [r0], r1
 
-	pop			{r4}
+    pop         {r4}
 WELS_ASM_FUNC_END
 #endif
--- a/codec/encoder/core/x86/coeff.asm
+++ b/codec/encoder/core/x86/coeff.asm
@@ -55,262 +55,262 @@
 
 align 16
 byte_1pos_table:
-	db 0,0,0,0,0,0,0,0, ;0
-	db 0,0,0,0,0,0,0,1, ;1
-	db 1,0,0,0,0,0,0,1, ;2
-	db 1,0,0,0,0,0,0,2, ;3
-	db 2,0,0,0,0,0,0,1, ;4
-	db 2,0,0,0,0,0,0,2, ;5
-	db 2,1,0,0,0,0,0,2, ;6
-	db 2,1,0,0,0,0,0,3, ;7
-	db 3,0,0,0,0,0,0,1, ;8
-	db 3,0,0,0,0,0,0,2, ;9
-	db 3,1,0,0,0,0,0,2, ;10
-	db 3,1,0,0,0,0,0,3, ;11
-	db 3,2,0,0,0,0,0,2, ;12
-	db 3,2,0,0,0,0,0,3, ;13
-	db 3,2,1,0,0,0,0,3, ;14
-	db 3,2,1,0,0,0,0,4, ;15
-	db 4,0,0,0,0,0,0,1, ;16
-	db 4,0,0,0,0,0,0,2, ;17
-	db 4,1,0,0,0,0,0,2, ;18
-	db 4,1,0,0,0,0,0,3, ;19
-	db 4,2,0,0,0,0,0,2, ;20
-	db 4,2,0,0,0,0,0,3, ;21
-	db 4,2,1,0,0,0,0,3, ;22
-	db 4,2,1,0,0,0,0,4, ;23
-	db 4,3,0,0,0,0,0,2, ;24
-	db 4,3,0,0,0,0,0,3, ;25
-	db 4,3,1,0,0,0,0,3, ;26
-	db 4,3,1,0,0,0,0,4, ;27
-	db 4,3,2,0,0,0,0,3, ;28
-	db 4,3,2,0,0,0,0,4, ;29
-	db 4,3,2,1,0,0,0,4, ;30
-	db 4,3,2,1,0,0,0,5, ;31
-	db 5,0,0,0,0,0,0,1, ;32
-	db 5,0,0,0,0,0,0,2, ;33
-	db 5,1,0,0,0,0,0,2, ;34
-	db 5,1,0,0,0,0,0,3, ;35
-	db 5,2,0,0,0,0,0,2, ;36
-	db 5,2,0,0,0,0,0,3, ;37
-	db 5,2,1,0,0,0,0,3, ;38
-	db 5,2,1,0,0,0,0,4, ;39
-	db 5,3,0,0,0,0,0,2, ;40
-	db 5,3,0,0,0,0,0,3, ;41
-	db 5,3,1,0,0,0,0,3, ;42
-	db 5,3,1,0,0,0,0,4, ;43
-	db 5,3,2,0,0,0,0,3, ;44
-	db 5,3,2,0,0,0,0,4, ;45
-	db 5,3,2,1,0,0,0,4, ;46
-	db 5,3,2,1,0,0,0,5, ;47
-	db 5,4,0,0,0,0,0,2, ;48
-	db 5,4,0,0,0,0,0,3, ;49
-	db 5,4,1,0,0,0,0,3, ;50
-	db 5,4,1,0,0,0,0,4, ;51
-	db 5,4,2,0,0,0,0,3, ;52
-	db 5,4,2,0,0,0,0,4, ;53
-	db 5,4,2,1,0,0,0,4, ;54
-	db 5,4,2,1,0,0,0,5, ;55
-	db 5,4,3,0,0,0,0,3, ;56
-	db 5,4,3,0,0,0,0,4, ;57
-	db 5,4,3,1,0,0,0,4, ;58
-	db 5,4,3,1,0,0,0,5, ;59
-	db 5,4,3,2,0,0,0,4, ;60
-	db 5,4,3,2,0,0,0,5, ;61
-	db 5,4,3,2,1,0,0,5, ;62
-	db 5,4,3,2,1,0,0,6, ;63
-	db 6,0,0,0,0,0,0,1, ;64
-	db 6,0,0,0,0,0,0,2, ;65
-	db 6,1,0,0,0,0,0,2, ;66
-	db 6,1,0,0,0,0,0,3, ;67
-	db 6,2,0,0,0,0,0,2, ;68
-	db 6,2,0,0,0,0,0,3, ;69
-	db 6,2,1,0,0,0,0,3, ;70
-	db 6,2,1,0,0,0,0,4, ;71
-	db 6,3,0,0,0,0,0,2, ;72
-	db 6,3,0,0,0,0,0,3, ;73
-	db 6,3,1,0,0,0,0,3, ;74
-	db 6,3,1,0,0,0,0,4, ;75
-	db 6,3,2,0,0,0,0,3, ;76
-	db 6,3,2,0,0,0,0,4, ;77
-	db 6,3,2,1,0,0,0,4, ;78
-	db 6,3,2,1,0,0,0,5, ;79
-	db 6,4,0,0,0,0,0,2, ;80
-	db 6,4,0,0,0,0,0,3, ;81
-	db 6,4,1,0,0,0,0,3, ;82
-	db 6,4,1,0,0,0,0,4, ;83
-	db 6,4,2,0,0,0,0,3, ;84
-	db 6,4,2,0,0,0,0,4, ;85
-	db 6,4,2,1,0,0,0,4, ;86
-	db 6,4,2,1,0,0,0,5, ;87
-	db 6,4,3,0,0,0,0,3, ;88
-	db 6,4,3,0,0,0,0,4, ;89
-	db 6,4,3,1,0,0,0,4, ;90
-	db 6,4,3,1,0,0,0,5, ;91
-	db 6,4,3,2,0,0,0,4, ;92
-	db 6,4,3,2,0,0,0,5, ;93
-	db 6,4,3,2,1,0,0,5, ;94
-	db 6,4,3,2,1,0,0,6, ;95
-	db 6,5,0,0,0,0,0,2, ;96
-	db 6,5,0,0,0,0,0,3, ;97
-	db 6,5,1,0,0,0,0,3, ;98
-	db 6,5,1,0,0,0,0,4, ;99
-	db 6,5,2,0,0,0,0,3, ;100
-	db 6,5,2,0,0,0,0,4, ;101
-	db 6,5,2,1,0,0,0,4, ;102
-	db 6,5,2,1,0,0,0,5, ;103
-	db 6,5,3,0,0,0,0,3, ;104
-	db 6,5,3,0,0,0,0,4, ;105
-	db 6,5,3,1,0,0,0,4, ;106
-	db 6,5,3,1,0,0,0,5, ;107
-	db 6,5,3,2,0,0,0,4, ;108
-	db 6,5,3,2,0,0,0,5, ;109
-	db 6,5,3,2,1,0,0,5, ;110
-	db 6,5,3,2,1,0,0,6, ;111
-	db 6,5,4,0,0,0,0,3, ;112
-	db 6,5,4,0,0,0,0,4, ;113
-	db 6,5,4,1,0,0,0,4, ;114
-	db 6,5,4,1,0,0,0,5, ;115
-	db 6,5,4,2,0,0,0,4, ;116
-	db 6,5,4,2,0,0,0,5, ;117
-	db 6,5,4,2,1,0,0,5, ;118
-	db 6,5,4,2,1,0,0,6, ;119
-	db 6,5,4,3,0,0,0,4, ;120
-	db 6,5,4,3,0,0,0,5, ;121
-	db 6,5,4,3,1,0,0,5, ;122
-	db 6,5,4,3,1,0,0,6, ;123
-	db 6,5,4,3,2,0,0,5, ;124
-	db 6,5,4,3,2,0,0,6, ;125
-	db 6,5,4,3,2,1,0,6, ;126
-	db 6,5,4,3,2,1,0,7, ;127
-	db 7,0,0,0,0,0,0,1, ;128
-	db 7,0,0,0,0,0,0,2, ;129
-	db 7,1,0,0,0,0,0,2, ;130
-	db 7,1,0,0,0,0,0,3, ;131
-	db 7,2,0,0,0,0,0,2, ;132
-	db 7,2,0,0,0,0,0,3, ;133
-	db 7,2,1,0,0,0,0,3, ;134
-	db 7,2,1,0,0,0,0,4, ;135
-	db 7,3,0,0,0,0,0,2, ;136
-	db 7,3,0,0,0,0,0,3, ;137
-	db 7,3,1,0,0,0,0,3, ;138
-	db 7,3,1,0,0,0,0,4, ;139
-	db 7,3,2,0,0,0,0,3, ;140
-	db 7,3,2,0,0,0,0,4, ;141
-	db 7,3,2,1,0,0,0,4, ;142
-	db 7,3,2,1,0,0,0,5, ;143
-	db 7,4,0,0,0,0,0,2, ;144
-	db 7,4,0,0,0,0,0,3, ;145
-	db 7,4,1,0,0,0,0,3, ;146
-	db 7,4,1,0,0,0,0,4, ;147
-	db 7,4,2,0,0,0,0,3, ;148
-	db 7,4,2,0,0,0,0,4, ;149
-	db 7,4,2,1,0,0,0,4, ;150
-	db 7,4,2,1,0,0,0,5, ;151
-	db 7,4,3,0,0,0,0,3, ;152
-	db 7,4,3,0,0,0,0,4, ;153
-	db 7,4,3,1,0,0,0,4, ;154
-	db 7,4,3,1,0,0,0,5, ;155
-	db 7,4,3,2,0,0,0,4, ;156
-	db 7,4,3,2,0,0,0,5, ;157
-	db 7,4,3,2,1,0,0,5, ;158
-	db 7,4,3,2,1,0,0,6, ;159
-	db 7,5,0,0,0,0,0,2, ;160
-	db 7,5,0,0,0,0,0,3, ;161
-	db 7,5,1,0,0,0,0,3, ;162
-	db 7,5,1,0,0,0,0,4, ;163
-	db 7,5,2,0,0,0,0,3, ;164
-	db 7,5,2,0,0,0,0,4, ;165
-	db 7,5,2,1,0,0,0,4, ;166
-	db 7,5,2,1,0,0,0,5, ;167
-	db 7,5,3,0,0,0,0,3, ;168
-	db 7,5,3,0,0,0,0,4, ;169
-	db 7,5,3,1,0,0,0,4, ;170
-	db 7,5,3,1,0,0,0,5, ;171
-	db 7,5,3,2,0,0,0,4, ;172
-	db 7,5,3,2,0,0,0,5, ;173
-	db 7,5,3,2,1,0,0,5, ;174
-	db 7,5,3,2,1,0,0,6, ;175
-	db 7,5,4,0,0,0,0,3, ;176
-	db 7,5,4,0,0,0,0,4, ;177
-	db 7,5,4,1,0,0,0,4, ;178
-	db 7,5,4,1,0,0,0,5, ;179
-	db 7,5,4,2,0,0,0,4, ;180
-	db 7,5,4,2,0,0,0,5, ;181
-	db 7,5,4,2,1,0,0,5, ;182
-	db 7,5,4,2,1,0,0,6, ;183
-	db 7,5,4,3,0,0,0,4, ;184
-	db 7,5,4,3,0,0,0,5, ;185
-	db 7,5,4,3,1,0,0,5, ;186
-	db 7,5,4,3,1,0,0,6, ;187
-	db 7,5,4,3,2,0,0,5, ;188
-	db 7,5,4,3,2,0,0,6, ;189
-	db 7,5,4,3,2,1,0,6, ;190
-	db 7,5,4,3,2,1,0,7, ;191
-	db 7,6,0,0,0,0,0,2, ;192
-	db 7,6,0,0,0,0,0,3, ;193
-	db 7,6,1,0,0,0,0,3, ;194
-	db 7,6,1,0,0,0,0,4, ;195
-	db 7,6,2,0,0,0,0,3, ;196
-	db 7,6,2,0,0,0,0,4, ;197
-	db 7,6,2,1,0,0,0,4, ;198
-	db 7,6,2,1,0,0,0,5, ;199
-	db 7,6,3,0,0,0,0,3, ;200
-	db 7,6,3,0,0,0,0,4, ;201
-	db 7,6,3,1,0,0,0,4, ;202
-	db 7,6,3,1,0,0,0,5, ;203
-	db 7,6,3,2,0,0,0,4, ;204
-	db 7,6,3,2,0,0,0,5, ;205
-	db 7,6,3,2,1,0,0,5, ;206
-	db 7,6,3,2,1,0,0,6, ;207
-	db 7,6,4,0,0,0,0,3, ;208
-	db 7,6,4,0,0,0,0,4, ;209
-	db 7,6,4,1,0,0,0,4, ;210
-	db 7,6,4,1,0,0,0,5, ;211
-	db 7,6,4,2,0,0,0,4, ;212
-	db 7,6,4,2,0,0,0,5, ;213
-	db 7,6,4,2,1,0,0,5, ;214
-	db 7,6,4,2,1,0,0,6, ;215
-	db 7,6,4,3,0,0,0,4, ;216
-	db 7,6,4,3,0,0,0,5, ;217
-	db 7,6,4,3,1,0,0,5, ;218
-	db 7,6,4,3,1,0,0,6, ;219
-	db 7,6,4,3,2,0,0,5, ;220
-	db 7,6,4,3,2,0,0,6, ;221
-	db 7,6,4,3,2,1,0,6, ;222
-	db 7,6,4,3,2,1,0,7, ;223
-	db 7,6,5,0,0,0,0,3, ;224
-	db 7,6,5,0,0,0,0,4, ;225
-	db 7,6,5,1,0,0,0,4, ;226
-	db 7,6,5,1,0,0,0,5, ;227
-	db 7,6,5,2,0,0,0,4, ;228
-	db 7,6,5,2,0,0,0,5, ;229
-	db 7,6,5,2,1,0,0,5, ;230
-	db 7,6,5,2,1,0,0,6, ;231
-	db 7,6,5,3,0,0,0,4, ;232
-	db 7,6,5,3,0,0,0,5, ;233
-	db 7,6,5,3,1,0,0,5, ;234
-	db 7,6,5,3,1,0,0,6, ;235
-	db 7,6,5,3,2,0,0,5, ;236
-	db 7,6,5,3,2,0,0,6, ;237
-	db 7,6,5,3,2,1,0,6, ;238
-	db 7,6,5,3,2,1,0,7, ;239
-	db 7,6,5,4,0,0,0,4, ;240
-	db 7,6,5,4,0,0,0,5, ;241
-	db 7,6,5,4,1,0,0,5, ;242
-	db 7,6,5,4,1,0,0,6, ;243
-	db 7,6,5,4,2,0,0,5, ;244
-	db 7,6,5,4,2,0,0,6, ;245
-	db 7,6,5,4,2,1,0,6, ;246
-	db 7,6,5,4,2,1,0,7, ;247
-	db 7,6,5,4,3,0,0,5, ;248
-	db 7,6,5,4,3,0,0,6, ;249
-	db 7,6,5,4,3,1,0,6, ;250
-	db 7,6,5,4,3,1,0,7, ;251
-	db 7,6,5,4,3,2,0,6, ;252
-	db 7,6,5,4,3,2,0,7, ;253
-	db 7,6,5,4,3,2,1,7, ;254
-	db 7,6,5,4,3,2,1,8, ;255
+    db 0,0,0,0,0,0,0,0, ;0
+    db 0,0,0,0,0,0,0,1, ;1
+    db 1,0,0,0,0,0,0,1, ;2
+    db 1,0,0,0,0,0,0,2, ;3
+    db 2,0,0,0,0,0,0,1, ;4
+    db 2,0,0,0,0,0,0,2, ;5
+    db 2,1,0,0,0,0,0,2, ;6
+    db 2,1,0,0,0,0,0,3, ;7
+    db 3,0,0,0,0,0,0,1, ;8
+    db 3,0,0,0,0,0,0,2, ;9
+    db 3,1,0,0,0,0,0,2, ;10
+    db 3,1,0,0,0,0,0,3, ;11
+    db 3,2,0,0,0,0,0,2, ;12
+    db 3,2,0,0,0,0,0,3, ;13
+    db 3,2,1,0,0,0,0,3, ;14
+    db 3,2,1,0,0,0,0,4, ;15
+    db 4,0,0,0,0,0,0,1, ;16
+    db 4,0,0,0,0,0,0,2, ;17
+    db 4,1,0,0,0,0,0,2, ;18
+    db 4,1,0,0,0,0,0,3, ;19
+    db 4,2,0,0,0,0,0,2, ;20
+    db 4,2,0,0,0,0,0,3, ;21
+    db 4,2,1,0,0,0,0,3, ;22
+    db 4,2,1,0,0,0,0,4, ;23
+    db 4,3,0,0,0,0,0,2, ;24
+    db 4,3,0,0,0,0,0,3, ;25
+    db 4,3,1,0,0,0,0,3, ;26
+    db 4,3,1,0,0,0,0,4, ;27
+    db 4,3,2,0,0,0,0,3, ;28
+    db 4,3,2,0,0,0,0,4, ;29
+    db 4,3,2,1,0,0,0,4, ;30
+    db 4,3,2,1,0,0,0,5, ;31
+    db 5,0,0,0,0,0,0,1, ;32
+    db 5,0,0,0,0,0,0,2, ;33
+    db 5,1,0,0,0,0,0,2, ;34
+    db 5,1,0,0,0,0,0,3, ;35
+    db 5,2,0,0,0,0,0,2, ;36
+    db 5,2,0,0,0,0,0,3, ;37
+    db 5,2,1,0,0,0,0,3, ;38
+    db 5,2,1,0,0,0,0,4, ;39
+    db 5,3,0,0,0,0,0,2, ;40
+    db 5,3,0,0,0,0,0,3, ;41
+    db 5,3,1,0,0,0,0,3, ;42
+    db 5,3,1,0,0,0,0,4, ;43
+    db 5,3,2,0,0,0,0,3, ;44
+    db 5,3,2,0,0,0,0,4, ;45
+    db 5,3,2,1,0,0,0,4, ;46
+    db 5,3,2,1,0,0,0,5, ;47
+    db 5,4,0,0,0,0,0,2, ;48
+    db 5,4,0,0,0,0,0,3, ;49
+    db 5,4,1,0,0,0,0,3, ;50
+    db 5,4,1,0,0,0,0,4, ;51
+    db 5,4,2,0,0,0,0,3, ;52
+    db 5,4,2,0,0,0,0,4, ;53
+    db 5,4,2,1,0,0,0,4, ;54
+    db 5,4,2,1,0,0,0,5, ;55
+    db 5,4,3,0,0,0,0,3, ;56
+    db 5,4,3,0,0,0,0,4, ;57
+    db 5,4,3,1,0,0,0,4, ;58
+    db 5,4,3,1,0,0,0,5, ;59
+    db 5,4,3,2,0,0,0,4, ;60
+    db 5,4,3,2,0,0,0,5, ;61
+    db 5,4,3,2,1,0,0,5, ;62
+    db 5,4,3,2,1,0,0,6, ;63
+    db 6,0,0,0,0,0,0,1, ;64
+    db 6,0,0,0,0,0,0,2, ;65
+    db 6,1,0,0,0,0,0,2, ;66
+    db 6,1,0,0,0,0,0,3, ;67
+    db 6,2,0,0,0,0,0,2, ;68
+    db 6,2,0,0,0,0,0,3, ;69
+    db 6,2,1,0,0,0,0,3, ;70
+    db 6,2,1,0,0,0,0,4, ;71
+    db 6,3,0,0,0,0,0,2, ;72
+    db 6,3,0,0,0,0,0,3, ;73
+    db 6,3,1,0,0,0,0,3, ;74
+    db 6,3,1,0,0,0,0,4, ;75
+    db 6,3,2,0,0,0,0,3, ;76
+    db 6,3,2,0,0,0,0,4, ;77
+    db 6,3,2,1,0,0,0,4, ;78
+    db 6,3,2,1,0,0,0,5, ;79
+    db 6,4,0,0,0,0,0,2, ;80
+    db 6,4,0,0,0,0,0,3, ;81
+    db 6,4,1,0,0,0,0,3, ;82
+    db 6,4,1,0,0,0,0,4, ;83
+    db 6,4,2,0,0,0,0,3, ;84
+    db 6,4,2,0,0,0,0,4, ;85
+    db 6,4,2,1,0,0,0,4, ;86
+    db 6,4,2,1,0,0,0,5, ;87
+    db 6,4,3,0,0,0,0,3, ;88
+    db 6,4,3,0,0,0,0,4, ;89
+    db 6,4,3,1,0,0,0,4, ;90
+    db 6,4,3,1,0,0,0,5, ;91
+    db 6,4,3,2,0,0,0,4, ;92
+    db 6,4,3,2,0,0,0,5, ;93
+    db 6,4,3,2,1,0,0,5, ;94
+    db 6,4,3,2,1,0,0,6, ;95
+    db 6,5,0,0,0,0,0,2, ;96
+    db 6,5,0,0,0,0,0,3, ;97
+    db 6,5,1,0,0,0,0,3, ;98
+    db 6,5,1,0,0,0,0,4, ;99
+    db 6,5,2,0,0,0,0,3, ;100
+    db 6,5,2,0,0,0,0,4, ;101
+    db 6,5,2,1,0,0,0,4, ;102
+    db 6,5,2,1,0,0,0,5, ;103
+    db 6,5,3,0,0,0,0,3, ;104
+    db 6,5,3,0,0,0,0,4, ;105
+    db 6,5,3,1,0,0,0,4, ;106
+    db 6,5,3,1,0,0,0,5, ;107
+    db 6,5,3,2,0,0,0,4, ;108
+    db 6,5,3,2,0,0,0,5, ;109
+    db 6,5,3,2,1,0,0,5, ;110
+    db 6,5,3,2,1,0,0,6, ;111
+    db 6,5,4,0,0,0,0,3, ;112
+    db 6,5,4,0,0,0,0,4, ;113
+    db 6,5,4,1,0,0,0,4, ;114
+    db 6,5,4,1,0,0,0,5, ;115
+    db 6,5,4,2,0,0,0,4, ;116
+    db 6,5,4,2,0,0,0,5, ;117
+    db 6,5,4,2,1,0,0,5, ;118
+    db 6,5,4,2,1,0,0,6, ;119
+    db 6,5,4,3,0,0,0,4, ;120
+    db 6,5,4,3,0,0,0,5, ;121
+    db 6,5,4,3,1,0,0,5, ;122
+    db 6,5,4,3,1,0,0,6, ;123
+    db 6,5,4,3,2,0,0,5, ;124
+    db 6,5,4,3,2,0,0,6, ;125
+    db 6,5,4,3,2,1,0,6, ;126
+    db 6,5,4,3,2,1,0,7, ;127
+    db 7,0,0,0,0,0,0,1, ;128
+    db 7,0,0,0,0,0,0,2, ;129
+    db 7,1,0,0,0,0,0,2, ;130
+    db 7,1,0,0,0,0,0,3, ;131
+    db 7,2,0,0,0,0,0,2, ;132
+    db 7,2,0,0,0,0,0,3, ;133
+    db 7,2,1,0,0,0,0,3, ;134
+    db 7,2,1,0,0,0,0,4, ;135
+    db 7,3,0,0,0,0,0,2, ;136
+    db 7,3,0,0,0,0,0,3, ;137
+    db 7,3,1,0,0,0,0,3, ;138
+    db 7,3,1,0,0,0,0,4, ;139
+    db 7,3,2,0,0,0,0,3, ;140
+    db 7,3,2,0,0,0,0,4, ;141
+    db 7,3,2,1,0,0,0,4, ;142
+    db 7,3,2,1,0,0,0,5, ;143
+    db 7,4,0,0,0,0,0,2, ;144
+    db 7,4,0,0,0,0,0,3, ;145
+    db 7,4,1,0,0,0,0,3, ;146
+    db 7,4,1,0,0,0,0,4, ;147
+    db 7,4,2,0,0,0,0,3, ;148
+    db 7,4,2,0,0,0,0,4, ;149
+    db 7,4,2,1,0,0,0,4, ;150
+    db 7,4,2,1,0,0,0,5, ;151
+    db 7,4,3,0,0,0,0,3, ;152
+    db 7,4,3,0,0,0,0,4, ;153
+    db 7,4,3,1,0,0,0,4, ;154
+    db 7,4,3,1,0,0,0,5, ;155
+    db 7,4,3,2,0,0,0,4, ;156
+    db 7,4,3,2,0,0,0,5, ;157
+    db 7,4,3,2,1,0,0,5, ;158
+    db 7,4,3,2,1,0,0,6, ;159
+    db 7,5,0,0,0,0,0,2, ;160
+    db 7,5,0,0,0,0,0,3, ;161
+    db 7,5,1,0,0,0,0,3, ;162
+    db 7,5,1,0,0,0,0,4, ;163
+    db 7,5,2,0,0,0,0,3, ;164
+    db 7,5,2,0,0,0,0,4, ;165
+    db 7,5,2,1,0,0,0,4, ;166
+    db 7,5,2,1,0,0,0,5, ;167
+    db 7,5,3,0,0,0,0,3, ;168
+    db 7,5,3,0,0,0,0,4, ;169
+    db 7,5,3,1,0,0,0,4, ;170
+    db 7,5,3,1,0,0,0,5, ;171
+    db 7,5,3,2,0,0,0,4, ;172
+    db 7,5,3,2,0,0,0,5, ;173
+    db 7,5,3,2,1,0,0,5, ;174
+    db 7,5,3,2,1,0,0,6, ;175
+    db 7,5,4,0,0,0,0,3, ;176
+    db 7,5,4,0,0,0,0,4, ;177
+    db 7,5,4,1,0,0,0,4, ;178
+    db 7,5,4,1,0,0,0,5, ;179
+    db 7,5,4,2,0,0,0,4, ;180
+    db 7,5,4,2,0,0,0,5, ;181
+    db 7,5,4,2,1,0,0,5, ;182
+    db 7,5,4,2,1,0,0,6, ;183
+    db 7,5,4,3,0,0,0,4, ;184
+    db 7,5,4,3,0,0,0,5, ;185
+    db 7,5,4,3,1,0,0,5, ;186
+    db 7,5,4,3,1,0,0,6, ;187
+    db 7,5,4,3,2,0,0,5, ;188
+    db 7,5,4,3,2,0,0,6, ;189
+    db 7,5,4,3,2,1,0,6, ;190
+    db 7,5,4,3,2,1,0,7, ;191
+    db 7,6,0,0,0,0,0,2, ;192
+    db 7,6,0,0,0,0,0,3, ;193
+    db 7,6,1,0,0,0,0,3, ;194
+    db 7,6,1,0,0,0,0,4, ;195
+    db 7,6,2,0,0,0,0,3, ;196
+    db 7,6,2,0,0,0,0,4, ;197
+    db 7,6,2,1,0,0,0,4, ;198
+    db 7,6,2,1,0,0,0,5, ;199
+    db 7,6,3,0,0,0,0,3, ;200
+    db 7,6,3,0,0,0,0,4, ;201
+    db 7,6,3,1,0,0,0,4, ;202
+    db 7,6,3,1,0,0,0,5, ;203
+    db 7,6,3,2,0,0,0,4, ;204
+    db 7,6,3,2,0,0,0,5, ;205
+    db 7,6,3,2,1,0,0,5, ;206
+    db 7,6,3,2,1,0,0,6, ;207
+    db 7,6,4,0,0,0,0,3, ;208
+    db 7,6,4,0,0,0,0,4, ;209
+    db 7,6,4,1,0,0,0,4, ;210
+    db 7,6,4,1,0,0,0,5, ;211
+    db 7,6,4,2,0,0,0,4, ;212
+    db 7,6,4,2,0,0,0,5, ;213
+    db 7,6,4,2,1,0,0,5, ;214
+    db 7,6,4,2,1,0,0,6, ;215
+    db 7,6,4,3,0,0,0,4, ;216
+    db 7,6,4,3,0,0,0,5, ;217
+    db 7,6,4,3,1,0,0,5, ;218
+    db 7,6,4,3,1,0,0,6, ;219
+    db 7,6,4,3,2,0,0,5, ;220
+    db 7,6,4,3,2,0,0,6, ;221
+    db 7,6,4,3,2,1,0,6, ;222
+    db 7,6,4,3,2,1,0,7, ;223
+    db 7,6,5,0,0,0,0,3, ;224
+    db 7,6,5,0,0,0,0,4, ;225
+    db 7,6,5,1,0,0,0,4, ;226
+    db 7,6,5,1,0,0,0,5, ;227
+    db 7,6,5,2,0,0,0,4, ;228
+    db 7,6,5,2,0,0,0,5, ;229
+    db 7,6,5,2,1,0,0,5, ;230
+    db 7,6,5,2,1,0,0,6, ;231
+    db 7,6,5,3,0,0,0,4, ;232
+    db 7,6,5,3,0,0,0,5, ;233
+    db 7,6,5,3,1,0,0,5, ;234
+    db 7,6,5,3,1,0,0,6, ;235
+    db 7,6,5,3,2,0,0,5, ;236
+    db 7,6,5,3,2,0,0,6, ;237
+    db 7,6,5,3,2,1,0,6, ;238
+    db 7,6,5,3,2,1,0,7, ;239
+    db 7,6,5,4,0,0,0,4, ;240
+    db 7,6,5,4,0,0,0,5, ;241
+    db 7,6,5,4,1,0,0,5, ;242
+    db 7,6,5,4,1,0,0,6, ;243
+    db 7,6,5,4,2,0,0,5, ;244
+    db 7,6,5,4,2,0,0,6, ;245
+    db 7,6,5,4,2,1,0,6, ;246
+    db 7,6,5,4,2,1,0,7, ;247
+    db 7,6,5,4,3,0,0,5, ;248
+    db 7,6,5,4,3,0,0,6, ;249
+    db 7,6,5,4,3,1,0,6, ;250
+    db 7,6,5,4,3,1,0,7, ;251
+    db 7,6,5,4,3,2,0,6, ;252
+    db 7,6,5,4,3,2,0,7, ;253
+    db 7,6,5,4,3,2,1,7, ;254
+    db 7,6,5,4,3,2,1,8, ;255
 
 ;***********************************************************************
 ; Code
@@ -323,43 +323,43 @@
 ;int32_t CavlcParamCal_sse2(int16_t*coffLevel, uint8_t* run, int16_t *Level, int32_t* total_coeffs , int32_t endIdx);
 ;***********************************************************************
 WELS_EXTERN CavlcParamCal_sse2
-	push ebx
-	push edi
-	push esi
+    push ebx
+    push edi
+    push esi
 
-	mov			eax,	[esp+16]	;coffLevel
-	mov			edi,	[esp+24]	;Level
-	mov			ebx,	[esp+32]	;endIdx
-	cmp			ebx,	3
-	jne			.Level16
-	pxor		xmm1,	xmm1
-	movq		xmm0,	[eax]	; removed QWORD
-	jmp			.Cal_begin
+    mov         eax,    [esp+16]    ;coffLevel
+    mov         edi,    [esp+24]    ;Level
+    mov         ebx,    [esp+32]    ;endIdx
+    cmp         ebx,    3
+    jne         .Level16
+    pxor        xmm1,   xmm1
+    movq        xmm0,   [eax]   ; removed QWORD
+    jmp         .Cal_begin
 .Level16:
-	movdqa		xmm0,	[eax]
-	movdqa		xmm1,	[eax+16]
+    movdqa      xmm0,   [eax]
+    movdqa      xmm1,   [eax+16]
 .Cal_begin:
-    movdqa		xmm2,	xmm0
-	packsswb	xmm0,	xmm1
-	movdqa		xmm4,	xmm0
-	pxor		xmm3,	xmm3
-	pcmpgtb		xmm0,	xmm3
-	pcmpgtb		xmm3,	xmm4
-	por			xmm0,	xmm3
-	pmovmskb	edx,	xmm0
-	cmp			edx,	0
-	je near   .return
-	movdqa		xmm6,	[sse2_b_1]
-	pcmpeqw		xmm7,	xmm7	;generate -1
-    mov			ebx,	0xff
-    ;pinsrw		xmm6,	ebx,	3
+    movdqa      xmm2,   xmm0
+    packsswb    xmm0,   xmm1
+    movdqa      xmm4,   xmm0
+    pxor        xmm3,   xmm3
+    pcmpgtb     xmm0,   xmm3
+    pcmpgtb     xmm3,   xmm4
+    por         xmm0,   xmm3
+    pmovmskb    edx,    xmm0
+    cmp         edx,    0
+    je near   .return
+    movdqa      xmm6,   [sse2_b_1]
+    pcmpeqw     xmm7,   xmm7    ;generate -1
+    mov         ebx,    0xff
+    ;pinsrw     xmm6,   ebx,    3
 
     mov       bl,   dh
 
-	lea       ebx,  [byte_1pos_table+8*ebx]
-	movq      xmm0, [ebx]
-	pextrw    ecx,  xmm0, 3
-	shr       ecx,  8
+    lea       ebx,  [byte_1pos_table+8*ebx]
+    movq      xmm0, [ebx]
+    pextrw    ecx,  xmm0, 3
+    shr       ecx,  8
     mov       dh,   cl
 
 .loopHighFind0:
@@ -367,19 +367,19 @@
     je        .loopHighFind0End
     ;mov       esi, [ebx]
     ;and       esi, 0xff
-    movzx	  esi, byte [ebx]
+    movzx     esi, byte [ebx]
     add       esi, 8
     mov       esi, [eax+2*esi]
     mov       [edi], si
     add       edi,   2
     ;add       ebx,   1
-    inc		  ebx
+    inc       ebx
     dec       ecx
-	jmp       .loopHighFind0
+    jmp       .loopHighFind0
 .loopHighFind0End:
     mov       cl,   dh
     cmp       cl,   8
-	pand      xmm0, xmm6
+    pand      xmm0, xmm6
     jne       .LowByteFind0
     sub       edi,   2
     mov       esi,   [eax+16]
@@ -387,8 +387,8 @@
     add       edi,   2
 .LowByteFind0:
     and       edx,  0xff
-	lea       ebx,  [byte_1pos_table+8*edx]
-	movq      xmm1, [ebx]
+    lea       ebx,  [byte_1pos_table+8*edx]
+    movq      xmm1, [ebx]
     pextrw    esi,  xmm1, 3
     or        esi,  0xff
     or        ecx,  0xff00
@@ -398,16 +398,16 @@
 .loopLowFind0:
     cmp       esi, 0
     je        .loopLowFind0End
-	;mov       edx, [ebx]
-	;and       edx, 0xff
-	movzx	  edx,	byte [ebx]
-	mov       edx, [eax+2*edx]
-	mov       [edi], dx
-	add       edi,   2
-	;add       ebx,   1
-	inc		  ebx
+    ;mov       edx, [ebx]
+    ;and       edx, 0xff
+    movzx     edx,  byte [ebx]
+    mov       edx, [eax+2*edx]
+    mov       [edi], dx
+    add       edi,   2
+    ;add       ebx,   1
+    inc       ebx
     dec       esi
-	jmp       .loopLowFind0
+    jmp       .loopLowFind0
 .loopLowFind0End:
     cmp       ch,  8
     jne       .getLevelEnd
@@ -415,12 +415,12 @@
     mov       edx, [eax]
     mov       [edi], dx
 .getLevelEnd:
-	mov      edx, [esp+28]	;total_coeffs
+    mov      edx, [esp+28]  ;total_coeffs
     ;mov      ebx,   ecx
     ;and      ebx,   0xff
-    movzx	 ebx,	byte cl
+    movzx    ebx,   byte cl
     add      cl,    ch
-	mov      [edx], cl
+    mov      [edx], cl
 ;getRun
     movq     xmm5, [sse2_b8]
     paddb    xmm0, xmm5
@@ -430,7 +430,7 @@
     sub      eax,  ebx
     shl      eax,  3
     shl      ebx,  3
-	pinsrw   xmm2, ebx, 0
+    pinsrw   xmm2, ebx, 0
     pinsrw   xmm3, eax, 0
     psllq    xmm0, xmm3
     psrlq    xmm0, xmm3
@@ -441,19 +441,19 @@
     por      xmm0,  xmm1
 
     pextrw   eax,   xmm0, 0
-    and		 eax,   0xff
+    and      eax,   0xff
     inc      eax
     sub      al,    cl
-	movdqa   xmm1,  xmm0
-	paddb    xmm1,  xmm7
-	psrldq   xmm0,  1
-	psubb    xmm1,  xmm0
+    movdqa   xmm1,  xmm0
+    paddb    xmm1,  xmm7
+    psrldq   xmm0,  1
+    psubb    xmm1,  xmm0
     mov      ecx,   [esp+20] ;run
-	movdqa   [ecx], xmm1
+    movdqa   [ecx], xmm1
 ;getRunEnd
 .return:
-	pop esi
-	pop edi
-	pop ebx
-	ret
+    pop esi
+    pop edi
+    pop ebx
+    ret
 %endif
--- a/codec/encoder/core/x86/dct.asm
+++ b/codec/encoder/core/x86/dct.asm
@@ -50,17 +50,17 @@
 
 align 16
 SSE2_DeQuant8 dw  10, 13, 10, 13, 13, 16, 13, 16,
-			dw	10, 13, 10, 13, 13, 16, 13, 16,
+            dw  10, 13, 10, 13, 13, 16, 13, 16,
             dw  11, 14, 11, 14, 14, 18, 14, 18,
-			dw  11, 14, 11, 14, 14, 18, 14, 18,
-			dw  13, 16, 13, 16, 16, 20, 16, 20,
-			dw  13, 16, 13, 16, 16, 20, 16, 20,
+            dw  11, 14, 11, 14, 14, 18, 14, 18,
+            dw  13, 16, 13, 16, 16, 20, 16, 20,
+            dw  13, 16, 13, 16, 16, 20, 16, 20,
             dw  14, 18, 14, 18, 18, 23, 18, 23,
-			dw  14, 18, 14, 18, 18, 23, 18, 23,
-			dw  16, 20, 16, 20, 20, 25, 20, 25,
-			dw  16, 20, 16, 20, 20, 25, 20, 25,
+            dw  14, 18, 14, 18, 18, 23, 18, 23,
+            dw  16, 20, 16, 20, 20, 25, 20, 25,
+            dw  16, 20, 16, 20, 20, 25, 20, 25,
             dw  18, 23, 18, 23, 23, 29, 23, 29,
-			dw  18, 23, 18, 23, 23, 29, 23, 29
+            dw  18, 23, 18, 23, 23, 29, 23, 29
 
 
 ;***********************************************************************
@@ -68,27 +68,27 @@
 ;***********************************************************************
 
 %macro MMX_LoadDiff4P 5
-	movd        %1, [%3]
-	movd        %2, [%4]
-	punpcklbw   %1, %5
-	punpcklbw   %2, %5
-	psubw       %1, %2
+    movd        %1, [%3]
+    movd        %2, [%4]
+    punpcklbw   %1, %5
+    punpcklbw   %2, %5
+    psubw       %1, %2
 %endmacro
 
 %macro MMX_LoadDiff4x4P 10 ;d0, d1, d2, d3, pix1address, pix1stride, pix2address, pix2stride, tmp(mm), 0(mm)
-	MMX_LoadDiff4P %1, %9, %5,    %7,    %10
-	MMX_LoadDiff4P %2, %9, %5+%6, %7+%8, %10
-	lea  %5, [%5+2*%6]
-	lea  %7, [%7+2*%8]
-	MMX_LoadDiff4P %3, %9, %5,    %7,    %10
-	MMX_LoadDiff4P %4, %9, %5+%6, %7+%8, %10
+    MMX_LoadDiff4P %1, %9, %5,    %7,    %10
+    MMX_LoadDiff4P %2, %9, %5+%6, %7+%8, %10
+    lea  %5, [%5+2*%6]
+    lea  %7, [%7+2*%8]
+    MMX_LoadDiff4P %3, %9, %5,    %7,    %10
+    MMX_LoadDiff4P %4, %9, %5+%6, %7+%8, %10
 %endmacro
 
 %macro MMX_SumSubMul2 3
-	movq    %3, %1
-	psllw   %1, $01
-	paddw   %1, %2
-	psllw   %2, $01
+    movq    %3, %1
+    psllw   %1, $01
+    paddw   %1, %2
+    psllw   %2, $01
     psubw   %3, %2
 %endmacro
 
@@ -101,15 +101,15 @@
 %endmacro
 
 %macro MMX_SumSub 3
-	movq    %3, %2
+    movq    %3, %2
     psubw   %2, %1
     paddw   %1, %3
 %endmacro
 
 %macro MMX_DCT 6
-    MMX_SumSub		%4, %1, %6
-    MMX_SumSub		%3, %2, %6
-    MMX_SumSub		%3, %4, %6
+    MMX_SumSub      %4, %1, %6
+    MMX_SumSub      %3, %2, %6
+    MMX_SumSub      %3, %4, %6
     MMX_SumSubMul2  %1, %2, %5
 %endmacro
 
@@ -116,8 +116,8 @@
 %macro MMX_IDCT 6
     MMX_SumSub      %4, %5, %6
     MMX_SumSubDiv2  %3, %2, %1
-    MMX_SumSub		%1, %4, %6
-	MMX_SumSub		%3, %5, %6
+    MMX_SumSub      %1, %4, %6
+    MMX_SumSub      %3, %5, %6
 %endmacro
 
 %macro MMX_StoreDiff4P 6
@@ -142,11 +142,11 @@
 
     MMX_LoadDiff4x4P mm1, mm2, mm3, mm4, r1, r2, r3, r4, mm0, mm7
 
-    MMX_DCT			mm1, mm2, mm3 ,mm4, mm5, mm6
-    MMX_Trans4x4W	mm3, mm1, mm4, mm5, mm2
+    MMX_DCT         mm1, mm2, mm3 ,mm4, mm5, mm6
+    MMX_Trans4x4W   mm3, mm1, mm4, mm5, mm2
 
-    MMX_DCT			mm3, mm5, mm2 ,mm4, mm1, mm6
-    MMX_Trans4x4W	mm2, mm3, mm4, mm1, mm5
+    MMX_DCT         mm3, mm5, mm2 ,mm4, mm1, mm6
+    MMX_Trans4x4W   mm2, mm3, mm4, mm1, mm5
 
     movq    [r0+ 0],   mm2
     movq    [r0+ 8],   mm1
@@ -170,22 +170,22 @@
     movq    mm2, [r4+16]
     movq    mm3, [r4+24]
 
-	MMX_Trans4x4W		mm0, mm1, mm2, mm3, mm4
-	MMX_IDCT			mm1, mm2, mm3, mm4, mm0, mm6
-    MMX_Trans4x4W		mm1, mm3, mm0, mm4, mm2
-	MMX_IDCT			mm3, mm0, mm4, mm2, mm1, mm6
+    MMX_Trans4x4W       mm0, mm1, mm2, mm3, mm4
+    MMX_IDCT            mm1, mm2, mm3, mm4, mm0, mm6
+    MMX_Trans4x4W       mm1, mm3, mm0, mm4, mm2
+    MMX_IDCT            mm3, mm0, mm4, mm2, mm1, mm6
 
-    WELS_Zero			mm7
-    WELS_DW32			mm6
+    WELS_Zero           mm7
+    WELS_DW32           mm6
 
-    MMX_StoreDiff4P		mm3, mm0, mm6, mm7, [r0], [r2]
-    MMX_StoreDiff4P		mm4, mm0, mm6, mm7, [r0+r1], [r2+r3]
+    MMX_StoreDiff4P     mm3, mm0, mm6, mm7, [r0], [r2]
+    MMX_StoreDiff4P     mm4, mm0, mm6, mm7, [r0+r1], [r2+r3]
     lea     r0, [r0+2*r1]
     lea     r2, [r2+2*r3]
-    MMX_StoreDiff4P		mm1, mm0, mm6, mm7, [r0], [r2]
-    MMX_StoreDiff4P		mm2, mm0, mm6, mm7, [r0+r1], [r2+r3]
+    MMX_StoreDiff4P     mm1, mm0, mm6, mm7, [r0], [r2]
+    MMX_StoreDiff4P     mm2, mm0, mm6, mm7, [r0+r1], [r2+r3]
 
-	WELSEMMS
+    WELSEMMS
     LOAD_5_PARA_POP
     ret
 
@@ -194,21 +194,21 @@
 ; SSE2 functions
 ;***********************************************************************
 %macro SSE2_Store4x8p 6
-	SSE2_XSawp qdq, %2, %3, %6
-	SSE2_XSawp qdq, %4, %5, %3
-	MOVDQ    [%1+0x00], %2
-	MOVDQ    [%1+0x10], %4
-	MOVDQ    [%1+0x20], %6
-	MOVDQ    [%1+0x30], %3
+    SSE2_XSawp qdq, %2, %3, %6
+    SSE2_XSawp qdq, %4, %5, %3
+    MOVDQ    [%1+0x00], %2
+    MOVDQ    [%1+0x10], %4
+    MOVDQ    [%1+0x20], %6
+    MOVDQ    [%1+0x30], %3
 %endmacro
 
 %macro SSE2_Load4x8p 6
-	MOVDQ    %2,	[%1+0x00]
-	MOVDQ    %4,	[%1+0x10]
-	MOVDQ    %6,	[%1+0x20]
-	MOVDQ    %3,	[%1+0x30]
-	SSE2_XSawp qdq, %4, %3, %5
-	SSE2_XSawp qdq, %2, %6, %3
+    MOVDQ    %2,    [%1+0x00]
+    MOVDQ    %4,    [%1+0x10]
+    MOVDQ    %6,    [%1+0x20]
+    MOVDQ    %3,    [%1+0x30]
+    SSE2_XSawp qdq, %4, %3, %5
+    SSE2_XSawp qdq, %2, %6, %3
 %endmacro
 
 %macro SSE2_SumSubMul2 3
@@ -231,57 +231,57 @@
 %macro SSE2_StoreDiff8p 6
     paddw       %1, %3
     psraw       %1, $06
-    movq		%2, %6
+    movq        %2, %6
     punpcklbw   %2, %4
     paddsw      %2, %1
     packuswb    %2, %2
-    movq	    %5, %2
+    movq        %5, %2
 %endmacro
 
 %macro SSE2_StoreDiff8p 5
-    movq		%2, %5
+    movq        %2, %5
     punpcklbw   %2, %3
     paddsw      %2, %1
     packuswb    %2, %2
-    movq	    %4, %2
+    movq        %4, %2
 %endmacro
 
-%macro SSE2_Load8DC	6
-	movdqa		%1,		%6		; %1 = dc0 dc1
-	paddw       %1,		%5
-    psraw       %1,		$06		; (dc + 32) >> 6
+%macro SSE2_Load8DC 6
+    movdqa      %1,     %6      ; %1 = dc0 dc1
+    paddw       %1,     %5
+    psraw       %1,     $06     ; (dc + 32) >> 6
 
-    movdqa		%2,		%1
-    psrldq		%2,		4
- 	punpcklwd	%2,		%2
-	punpckldq	%2,		%2		; %2 = dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3
+    movdqa      %2,     %1
+    psrldq      %2,     4
+    punpcklwd   %2,     %2
+    punpckldq   %2,     %2      ; %2 = dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3
 
-    movdqa		%3,		%1
-    psrldq		%3,		8
- 	punpcklwd	%3,		%3
-	punpckldq	%3,		%3		; %3 = dc4 dc4 dc4 dc4 dc5 dc5 dc5 dc5
+    movdqa      %3,     %1
+    psrldq      %3,     8
+    punpcklwd   %3,     %3
+    punpckldq   %3,     %3      ; %3 = dc4 dc4 dc4 dc4 dc5 dc5 dc5 dc5
 
-	movdqa		%4,		%1
-    psrldq		%4,		12
- 	punpcklwd	%4,		%4
-	punpckldq	%4,		%4		; %4 = dc6 dc6 dc6 dc6 dc7 dc7 dc7 dc7
+    movdqa      %4,     %1
+    psrldq      %4,     12
+    punpcklwd   %4,     %4
+    punpckldq   %4,     %4      ; %4 = dc6 dc6 dc6 dc6 dc7 dc7 dc7 dc7
 
-	punpcklwd	%1,		%1
-	punpckldq	%1,		%1		; %1 = dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1
+    punpcklwd   %1,     %1
+    punpckldq   %1,     %1      ; %1 = dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1
 %endmacro
 
 %macro SSE2_DCT 6
-    SSE2_SumSub		%6, %3,	%5
-	SSE2_SumSub		%1, %2, %5
-	SSE2_SumSub		%3, %2, %5
-	SSE2_SumSubMul2		%6, %1, %4
+    SSE2_SumSub     %6, %3, %5
+    SSE2_SumSub     %1, %2, %5
+    SSE2_SumSub     %3, %2, %5
+    SSE2_SumSubMul2     %6, %1, %4
 %endmacro
 
 %macro SSE2_IDCT 7
     SSE2_SumSub       %7, %2, %6
     SSE2_SumSubDiv2     %1, %3, %5, %4
-    SSE2_SumSub	     %2, %1, %5
-    SSE2_SumSub		 %7, %4, %5
+    SSE2_SumSub      %2, %1, %5
+    SSE2_SumSub      %7, %4, %5
 %endmacro
 
 ;***********************************************************************
@@ -294,42 +294,42 @@
     SIGN_EXTENSION r2, r2d
     SIGN_EXTENSION r4, r4d
     pxor    xmm7, xmm7
-	;Load 4x8
-	SSE2_LoadDiff8P    xmm0, xmm6, xmm7, [r1], [r3]
+    ;Load 4x8
+    SSE2_LoadDiff8P    xmm0, xmm6, xmm7, [r1], [r3]
     SSE2_LoadDiff8P    xmm1, xmm6, xmm7, [r1+r2], [r3+r4]
-	lea		r1, [r1 + 2 * r2]
-	lea		r3, [r3 + 2 * r4]
-	SSE2_LoadDiff8P    xmm2, xmm6, xmm7, [r1], [r3]
+    lea     r1, [r1 + 2 * r2]
+    lea     r3, [r3 + 2 * r4]
+    SSE2_LoadDiff8P    xmm2, xmm6, xmm7, [r1], [r3]
     SSE2_LoadDiff8P    xmm3, xmm6, xmm7, [r1+r2], [r3+r4]
 
-	SSE2_DCT			xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
-	SSE2_TransTwo4x4W	xmm2, xmm0, xmm3, xmm4, xmm1
-	SSE2_DCT			xmm0, xmm4, xmm1, xmm3, xmm5, xmm2
-	SSE2_TransTwo4x4W	xmm4, xmm2, xmm1, xmm3, xmm0
+    SSE2_DCT            xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
+    SSE2_TransTwo4x4W   xmm2, xmm0, xmm3, xmm4, xmm1
+    SSE2_DCT            xmm0, xmm4, xmm1, xmm3, xmm5, xmm2
+    SSE2_TransTwo4x4W   xmm4, xmm2, xmm1, xmm3, xmm0
 
-	SSE2_Store4x8p r0, xmm4, xmm2, xmm3, xmm0, xmm5
+    SSE2_Store4x8p r0, xmm4, xmm2, xmm3, xmm0, xmm5
 
-	lea		r1, [r1 + 2 * r2]
-	lea		r3, [r3 + 2 * r4]
+    lea     r1, [r1 + 2 * r2]
+    lea     r3, [r3 + 2 * r4]
 
-	;Load 4x8
-	SSE2_LoadDiff8P    xmm0, xmm6, xmm7, [r1      ], [r3    ]
+    ;Load 4x8
+    SSE2_LoadDiff8P    xmm0, xmm6, xmm7, [r1      ], [r3    ]
     SSE2_LoadDiff8P    xmm1, xmm6, xmm7, [r1+r2  ], [r3+r4]
-	lea		r1, [r1 + 2 * r2]
-	lea		r3, [r3 + 2 * r4]
+    lea     r1, [r1 + 2 * r2]
+    lea     r3, [r3 + 2 * r4]
     SSE2_LoadDiff8P    xmm2, xmm6, xmm7, [r1], [r3]
     SSE2_LoadDiff8P    xmm3, xmm6, xmm7, [r1+r2], [r3+r4]
 
-	SSE2_DCT			xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
-	SSE2_TransTwo4x4W	xmm2, xmm0, xmm3, xmm4, xmm1
-    SSE2_DCT			xmm0, xmm4, xmm1, xmm3, xmm5, xmm2
-	SSE2_TransTwo4x4W	xmm4, xmm2, xmm1, xmm3, xmm0
+    SSE2_DCT            xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
+    SSE2_TransTwo4x4W   xmm2, xmm0, xmm3, xmm4, xmm1
+    SSE2_DCT            xmm0, xmm4, xmm1, xmm3, xmm5, xmm2
+    SSE2_TransTwo4x4W   xmm4, xmm2, xmm1, xmm3, xmm0
 
-	lea		r0, [r0+64]
-	SSE2_Store4x8p r0, xmm4, xmm2, xmm3, xmm0, xmm5
+    lea     r0, [r0+64]
+    SSE2_Store4x8p r0, xmm4, xmm2, xmm3, xmm0, xmm5
 
-	POP_XMM
-	LOAD_5_PARA_POP
+    POP_XMM
+    LOAD_5_PARA_POP
     ret
 
 
@@ -337,59 +337,59 @@
 ; void WelsIDctFourT4Rec_sse2(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *rs);
 ;***********************************************************************
 WELS_EXTERN WelsIDctFourT4Rec_sse2
-	%assign push_num 0
-	LOAD_5_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
-	;Load 4x8
-	SSE2_Load4x8p  r4, xmm0, xmm1, xmm4, xmm2, xmm5
+    %assign push_num 0
+    LOAD_5_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
+    ;Load 4x8
+    SSE2_Load4x8p  r4, xmm0, xmm1, xmm4, xmm2, xmm5
 
-	SSE2_TransTwo4x4W	xmm0, xmm1, xmm4, xmm2, xmm3
-  	SSE2_IDCT			xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0
-    SSE2_TransTwo4x4W	xmm1, xmm4, xmm0, xmm2, xmm3
-    SSE2_IDCT			xmm4, xmm2, xmm3, xmm0, xmm5, xmm6, xmm1
+    SSE2_TransTwo4x4W   xmm0, xmm1, xmm4, xmm2, xmm3
+    SSE2_IDCT           xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0
+    SSE2_TransTwo4x4W   xmm1, xmm4, xmm0, xmm2, xmm3
+    SSE2_IDCT           xmm4, xmm2, xmm3, xmm0, xmm5, xmm6, xmm1
 
-	WELS_Zero			xmm7
-    WELS_DW32			xmm6
+    WELS_Zero           xmm7
+    WELS_DW32           xmm6
 
-	SSE2_StoreDiff8p   xmm4, xmm5, xmm6, xmm7, [r0		],	[r2]
-	SSE2_StoreDiff8p   xmm0, xmm5, xmm6, xmm7, [r0 + r1	],	[r2 + r3]
-	lea		r0, [r0 + 2 * r1]
-	lea		r2, [r2 + 2 * r3]
-	SSE2_StoreDiff8p   xmm1, xmm5, xmm6, xmm7, [r0],			[r2]
-	SSE2_StoreDiff8p   xmm2, xmm5, xmm6, xmm7, [r0 + r1	],	[r2 + r3]
+    SSE2_StoreDiff8p   xmm4, xmm5, xmm6, xmm7, [r0      ],  [r2]
+    SSE2_StoreDiff8p   xmm0, xmm5, xmm6, xmm7, [r0 + r1 ],  [r2 + r3]
+    lea     r0, [r0 + 2 * r1]
+    lea     r2, [r2 + 2 * r3]
+    SSE2_StoreDiff8p   xmm1, xmm5, xmm6, xmm7, [r0],            [r2]
+    SSE2_StoreDiff8p   xmm2, xmm5, xmm6, xmm7, [r0 + r1 ],  [r2 + r3]
 
-    add		r4, 64
-	lea		r0, [r0 + 2 * r1]
-	lea		r2, [r2 + 2 * r3]
-   	SSE2_Load4x8p  r4, xmm0, xmm1, xmm4, xmm2, xmm5
+    add     r4, 64
+    lea     r0, [r0 + 2 * r1]
+    lea     r2, [r2 + 2 * r3]
+    SSE2_Load4x8p  r4, xmm0, xmm1, xmm4, xmm2, xmm5
 
-	SSE2_TransTwo4x4W   xmm0, xmm1, xmm4, xmm2, xmm3
-	SSE2_IDCT			xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0
+    SSE2_TransTwo4x4W   xmm0, xmm1, xmm4, xmm2, xmm3
+    SSE2_IDCT           xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0
     SSE2_TransTwo4x4W   xmm1, xmm4, xmm0, xmm2, xmm3
-	SSE2_IDCT			xmm4, xmm2, xmm3, xmm0, xmm5, xmm6, xmm1
+    SSE2_IDCT           xmm4, xmm2, xmm3, xmm0, xmm5, xmm6, xmm1
 
-	WELS_Zero			xmm7
-    WELS_DW32			xmm6
+    WELS_Zero           xmm7
+    WELS_DW32           xmm6
 
-	SSE2_StoreDiff8p   xmm4, xmm5, xmm6, xmm7, [r0		],	[r2]
-	SSE2_StoreDiff8p   xmm0, xmm5, xmm6, xmm7, [r0 + r1	],	[r2 + r3]
-	lea		r0, [r0 + 2 * r1]
-	lea		r2, [r2 + 2 * r3]
-	SSE2_StoreDiff8p   xmm1, xmm5, xmm6, xmm7, [r0],			[r2]
-	SSE2_StoreDiff8p   xmm2, xmm5, xmm6, xmm7, [r0 + r1],	[r2 + r3]
-	POP_XMM
-	LOAD_5_PARA_POP
-   ; pop		esi
-   ; pop		ebx
+    SSE2_StoreDiff8p   xmm4, xmm5, xmm6, xmm7, [r0      ],  [r2]
+    SSE2_StoreDiff8p   xmm0, xmm5, xmm6, xmm7, [r0 + r1 ],  [r2 + r3]
+    lea     r0, [r0 + 2 * r1]
+    lea     r2, [r2 + 2 * r3]
+    SSE2_StoreDiff8p   xmm1, xmm5, xmm6, xmm7, [r0],            [r2]
+    SSE2_StoreDiff8p   xmm2, xmm5, xmm6, xmm7, [r0 + r1],   [r2 + r3]
+    POP_XMM
+    LOAD_5_PARA_POP
+    ; pop        esi
+    ; pop        ebx
     ret
 
 %macro SSE2_StoreDiff4x8p 8
-   	SSE2_StoreDiff8p    %1, %3, %4, [%5],			[%6]
-	SSE2_StoreDiff8p    %1, %3, %4, [%5 + %7],		[%6 + %8]
-	SSE2_StoreDiff8p    %2, %3, %4, [%5 + 8],		[%6 + 8]
-	SSE2_StoreDiff8p    %2, %3, %4, [%5 + %7 + 8],	[%6 + %8 + 8]
+    SSE2_StoreDiff8p    %1, %3, %4, [%5],           [%6]
+    SSE2_StoreDiff8p    %1, %3, %4, [%5 + %7],      [%6 + %8]
+    SSE2_StoreDiff8p    %2, %3, %4, [%5 + 8],       [%6 + 8]
+    SSE2_StoreDiff8p    %2, %3, %4, [%5 + %7 + 8],  [%6 + %8 + 8]
 %endmacro
 
  ;***********************************************************************
@@ -396,76 +396,76 @@
 ; void WelsIDctRecI16x16Dc_sse2(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *dct_dc)
 ;***********************************************************************
 WELS_EXTERN WelsIDctRecI16x16Dc_sse2
-	%assign push_num 0
-	LOAD_5_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
-	pxor		xmm7,		xmm7
-    WELS_DW32	xmm6
+    %assign push_num 0
+    LOAD_5_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r3, r3d
+    pxor        xmm7,       xmm7
+    WELS_DW32   xmm6
 
-	SSE2_Load8DC			xmm0, xmm1, xmm2, xmm3, xmm6, [r4]
-	SSE2_StoreDiff4x8p		xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
+    SSE2_Load8DC            xmm0, xmm1, xmm2, xmm3, xmm6, [r4]
+    SSE2_StoreDiff4x8p      xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
 
-	lea			r0,		[r0 + 2 * r1]
-	lea			r2,		[r2 + 2 * r3]
-	SSE2_StoreDiff4x8p		xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
+    lea         r0,     [r0 + 2 * r1]
+    lea         r2,     [r2 + 2 * r3]
+    SSE2_StoreDiff4x8p      xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
 
-	lea			r0,		[r0 + 2 * r1]
-	lea			r2,		[r2 + 2 * r3]
-	SSE2_StoreDiff4x8p		xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
+    lea         r0,     [r0 + 2 * r1]
+    lea         r2,     [r2 + 2 * r3]
+    SSE2_StoreDiff4x8p      xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
 
-	lea			r0,		[r0 + 2 * r1]
-	lea			r2,		[r2 + 2 * r3]
-	SSE2_StoreDiff4x8p		xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
+    lea         r0,     [r0 + 2 * r1]
+    lea         r2,     [r2 + 2 * r3]
+    SSE2_StoreDiff4x8p      xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
 
-	SSE2_Load8DC			xmm0, xmm1, xmm2, xmm3, xmm6, [r4 + 16]
-	lea			r0,		[r0 + 2 * r1]
-	lea			r2,		[r2 + 2 * r3]
-	SSE2_StoreDiff4x8p		xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
+    SSE2_Load8DC            xmm0, xmm1, xmm2, xmm3, xmm6, [r4 + 16]
+    lea         r0,     [r0 + 2 * r1]
+    lea         r2,     [r2 + 2 * r3]
+    SSE2_StoreDiff4x8p      xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
 
-	lea			r0,		[r0 + 2 * r1]
-	lea			r2,		[r2 + 2 * r3]
-	SSE2_StoreDiff4x8p		xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
+    lea         r0,     [r0 + 2 * r1]
+    lea         r2,     [r2 + 2 * r3]
+    SSE2_StoreDiff4x8p      xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
 
-	lea			r0,		[r0 + 2 * r1]
-	lea			r2,		[r2 + 2 * r3]
-	SSE2_StoreDiff4x8p		xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
+    lea         r0,     [r0 + 2 * r1]
+    lea         r2,     [r2 + 2 * r3]
+    SSE2_StoreDiff4x8p      xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
 
-	lea			r0,		[r0 + 2 * r1]
-	lea			r2,		[r2 + 2 * r3]
-	SSE2_StoreDiff4x8p		xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
-	POP_XMM
-	LOAD_5_PARA_POP
+    lea         r0,     [r0 + 2 * r1]
+    lea         r2,     [r2 + 2 * r3]
+    SSE2_StoreDiff4x8p      xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
+    POP_XMM
+    LOAD_5_PARA_POP
     ret
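For context, the function above (WelsIDctRecI16x16Dc_sse2) reconstructs a 16x16 luma block from DC-only coefficients: each 4x4 sub-block receives its rounded DC added to the prediction, with the +32 coming from the WELS_DW32 constant. A minimal scalar sketch of that behaviour; the C name, the raster ordering of dct_dc and the >>6 rounding are assumptions based on the usual H.264 DC reconstruction, not taken from this patch:

#include <stdint.h>

static uint8_t clip255(int v) { return v < 0 ? 0 : (v > 255 ? 255 : (uint8_t)v); }

/* Hypothetical scalar model of WelsIDctRecI16x16Dc_sse2: rec/pred are 16x16
 * blocks with their own strides, dct_dc holds 16 DC coefficients, one per
 * 4x4 sub-block (raster order assumed here). */
static void IDctRecI16x16Dc_c(uint8_t *rec, int32_t stride,
                              const uint8_t *pred, int32_t pred_stride,
                              const int16_t *dct_dc) {
    for (int by = 0; by < 4; by++)
        for (int bx = 0; bx < 4; bx++) {
            int dc = (dct_dc[by * 4 + bx] + 32) >> 6;   /* rounded DC, cf. WELS_DW32 */
            for (int y = 0; y < 4; y++)
                for (int x = 0; x < 4; x++) {
                    int px = bx * 4 + x, py = by * 4 + y;
                    rec[py * stride + px] =
                        clip255(pred[py * pred_stride + px] + dc);
                }
        }
}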
 
 
 
 %macro SSE2_SumSubD 3
-	movdqa  %3, %2
+    movdqa  %3, %2
     paddd   %2, %1
     psubd   %1, %3
 %endmacro
 
 %macro SSE2_SumSubDiv2D 4
-	paddd   %1, %2
-	paddd	%1, %3
-	psrad	%1,	 1
-	movdqa	%4, %1
-	psubd	%4, %2
+    paddd   %1, %2
+    paddd   %1, %3
+    psrad   %1,  1
+    movdqa  %4, %1
+    psubd   %4, %2
 %endmacro
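The two macros above are add/sub butterflies on packed 32-bit lanes: SSE2_SumSubD turns (a, b) into (a - b, a + b), and SSE2_SumSubDiv2D, with its third operand holding 1 (WELS_DD1), yields (a + b + 1) >> 1 and, through the trailing subtraction, (a - b + 1) >> 1, as the pOut comments further down note. A scalar sketch (names illustrative, arithmetic right shift assumed, as with psrad):

#include <stdint.h>

/* Scalar model of SSE2_SumSubD: (%1, %2) -> (%1 - %2, %1 + %2). */
static void sum_sub(int32_t *a, int32_t *b) {
    int32_t t = *b;
    *b = *b + *a;          /* %2 = a + b */
    *a = *a - t;           /* %1 = a - b */
}

/* Scalar model of SSE2_SumSubDiv2D; 'one' is the WELS_DD1 constant. */
static void sum_sub_div2(int32_t *a, int32_t b, int32_t one, int32_t *diff) {
    *a = (*a + b + one) >> 1;   /* (a + b + 1) >> 1 when one == 1 */
    *diff = *a - b;             /* equals (a - b + 1) >> 1        */
}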
-%macro		SSE2_Load4Col	5
-	movsx		r2,		WORD[%5]
- 	movd		%1,			r2d
- 	movsx		r2,		WORD[%5 + 0x20]
- 	movd		%2,			r2d
-	punpckldq	%1,			%2
-	movsx		r2,		WORD[%5 + 0x80]
- 	movd		%3,			r2d
-	movsx		r2,		WORD[%5 + 0xa0]
- 	movd		%4,			r2d
-	punpckldq	%3,			%4
-	punpcklqdq	%1,			%3
+%macro SSE2_Load4Col    5
+    movsx       r2,     WORD[%5]
+    movd        %1,         r2d
+    movsx       r2,     WORD[%5 + 0x20]
+    movd        %2,         r2d
+    punpckldq   %1,         %2
+    movsx       r2,     WORD[%5 + 0x80]
+    movd        %3,         r2d
+    movsx       r2,     WORD[%5 + 0xa0]
+    movd        %4,         r2d
+    punpckldq   %3,         %4
+    punpcklqdq  %1,         %3
 %endmacro
 
 ;***********************************************************************
@@ -472,33 +472,33 @@
 ;void WelsHadamardT4Dc_sse2( int16_t *luma_dc, int16_t *pDct)
 ;***********************************************************************
 WELS_EXTERN WelsHadamardT4Dc_sse2
-		%assign push_num 0
-		LOAD_2_PARA
-		PUSH_XMM 8
-		SSE2_Load4Col	    xmm1, xmm5, xmm6, xmm0, r1
-		SSE2_Load4Col	    xmm2, xmm5, xmm6, xmm0, r1 + 0x40
-		SSE2_Load4Col	    xmm3, xmm5, xmm6, xmm0, r1 + 0x100
-		SSE2_Load4Col	    xmm4, xmm5, xmm6, xmm0, r1 + 0x140
+    %assign push_num 0
+    LOAD_2_PARA
+    PUSH_XMM 8
+    SSE2_Load4Col       xmm1, xmm5, xmm6, xmm0, r1
+    SSE2_Load4Col       xmm2, xmm5, xmm6, xmm0, r1 + 0x40
+    SSE2_Load4Col       xmm3, xmm5, xmm6, xmm0, r1 + 0x100
+    SSE2_Load4Col       xmm4, xmm5, xmm6, xmm0, r1 + 0x140
 
-		SSE2_SumSubD		xmm1, xmm2, xmm7
-		SSE2_SumSubD		xmm3, xmm4, xmm7
-		SSE2_SumSubD		xmm2, xmm4, xmm7
-		SSE2_SumSubD		xmm1, xmm3, xmm7
+    SSE2_SumSubD        xmm1, xmm2, xmm7
+    SSE2_SumSubD        xmm3, xmm4, xmm7
+    SSE2_SumSubD        xmm2, xmm4, xmm7
+    SSE2_SumSubD        xmm1, xmm3, xmm7
 
-		SSE2_Trans4x4D		xmm4, xmm2, xmm1, xmm3, xmm5	; pOut: xmm4,xmm3,xmm5,xmm1
+    SSE2_Trans4x4D      xmm4, xmm2, xmm1, xmm3, xmm5    ; pOut: xmm4,xmm3,xmm5,xmm1
 
-		SSE2_SumSubD		xmm4, xmm3, xmm7
-		SSE2_SumSubD		xmm5, xmm1, xmm7
+    SSE2_SumSubD        xmm4, xmm3, xmm7
+    SSE2_SumSubD        xmm5, xmm1, xmm7
 
-		WELS_DD1 xmm6
-		SSE2_SumSubDiv2D	xmm3, xmm1, xmm6, xmm0			; pOut: xmm3 = (xmm3+xmm1+1)/2, xmm0 = (xmm3-xmm1+1)/2
-		SSE2_SumSubDiv2D	xmm4, xmm5, xmm6, xmm1			; pOut: xmm4 = (xmm4+xmm5+1)/2, xmm1 = (xmm4-xmm5+1)/2
-        SSE2_Trans4x4D		xmm3, xmm0, xmm1, xmm4, xmm2	; pOut: xmm3,xmm4,xmm2,xmm1
+    WELS_DD1 xmm6
+    SSE2_SumSubDiv2D    xmm3, xmm1, xmm6, xmm0          ; pOut: xmm3 = (xmm3+xmm1+1)/2, xmm0 = (xmm3-xmm1+1)/2
+    SSE2_SumSubDiv2D    xmm4, xmm5, xmm6, xmm1          ; pOut: xmm4 = (xmm4+xmm5+1)/2, xmm1 = (xmm4-xmm5+1)/2
+    SSE2_Trans4x4D      xmm3, xmm0, xmm1, xmm4, xmm2    ; pOut: xmm3,xmm4,xmm2,xmm1
 
-		packssdw	xmm3,	xmm4
-		packssdw	xmm2,	xmm1
-		movdqa	[r0+ 0],   xmm3
-		movdqa	[r0+16],   xmm2
+    packssdw    xmm3,   xmm4
+    packssdw    xmm2,   xmm1
+    movdqa  [r0+ 0],   xmm3
+    movdqa  [r0+16],   xmm2
 
-		POP_XMM
-		ret
+    POP_XMM
+    ret
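WelsHadamardT4Dc_sse2 gathers the 16 per-block DC coefficients (the 0x20/0x40/0x80... offsets in SSE2_Load4Col are the 4x4 blocks' positions in the macroblock DCT buffer) and applies a 4x4 Hadamard: plain butterflies one way, then the same butterflies halved with rounding the other way, per the pOut comments. A rough scalar sketch that assumes a simple raster 4x4 layout instead of those buffer offsets, and glosses over the row/column ordering handled by the transposes:

#include <stdint.h>

/* Hypothetical scalar model of WelsHadamardT4Dc_sse2; 'dc' is the 4x4 matrix
 * of per-block DC coefficients in raster order. Arithmetic shift assumed. */
static void HadamardT4Dc_c(int16_t luma_dc[16], const int16_t dc[16]) {
    int32_t tmp[16];

    /* first pass: plain add/sub butterflies (SSE2_SumSubD) */
    for (int x = 0; x < 4; x++) {
        int32_t a = dc[0 * 4 + x], b = dc[1 * 4 + x];
        int32_t c = dc[2 * 4 + x], d = dc[3 * 4 + x];
        int32_t s1 = a + b, d1 = a - b, s2 = c + d, d2 = c - d;
        tmp[0 * 4 + x] = s1 + s2;
        tmp[1 * 4 + x] = d1 + d2;
        tmp[2 * 4 + x] = s1 - s2;
        tmp[3 * 4 + x] = d1 - d2;
    }
    /* second pass: same butterfly, but the final combination is halved with
     * rounding (SSE2_SumSubDiv2D with WELS_DD1), then narrowed to int16 as
     * packssdw does. */
    for (int y = 0; y < 4; y++) {
        int32_t a = tmp[y * 4 + 0], b = tmp[y * 4 + 1];
        int32_t c = tmp[y * 4 + 2], d = tmp[y * 4 + 3];
        int32_t s1 = a + b, d1 = a - b, s2 = c + d, d2 = c - d;
        luma_dc[y * 4 + 0] = (int16_t)((s1 + s2 + 1) >> 1);
        luma_dc[y * 4 + 1] = (int16_t)((d1 + d2 + 1) >> 1);
        luma_dc[y * 4 + 2] = (int16_t)((s1 - s2 + 1) >> 1);
        luma_dc[y * 4 + 3] = (int16_t)((d1 - d2 + 1) >> 1);
    }
}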
--- a/codec/encoder/core/x86/intra_pred.asm
+++ b/codec/encoder/core/x86/intra_pred.asm
@@ -61,7 +61,7 @@
 sse2_plane_mul_b_c dw -3, -2, -1, 0, 1, 2, 3, 4
 
 align 16
-mmx_01bytes:		times 16	db 1
+mmx_01bytes:        times 16    db 1
 
 align 16
 mmx_0x02: dw 0x02, 0x00, 0x00, 0x00
@@ -73,106 +73,106 @@
 ;dB 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
 ;%1 will keep the last result
 %macro SSE_DB_1_2REG 2
-      pxor %1, %1
-      pcmpeqw %2, %2
-      psubb %1, %2
+    pxor %1, %1
+    pcmpeqw %2, %2
+    psubb %1, %2
 %endmacro
 
 ;xmm0, xmm1, xmm2, eax, ecx
 ;lower 64 bits of xmm0 save the result
 %macro SSE2_PRED_H_4X4_TWO_LINE 5
-    movd		%1,	[%4-1]
-	movdqa		%3,	%1
-	punpcklbw	%1,	%3
-	movdqa		%3,	%1
-	punpcklbw	%1,	%3
+    movd        %1, [%4-1]
+    movdqa      %3, %1
+    punpcklbw   %1, %3
+    movdqa      %3, %1
+    punpcklbw   %1, %3
 
-	;add			%4,	%5
-	movd		%2,	[%4+%5-1]
-	movdqa		%3,	%2
-	punpcklbw	%2,	%3
-	movdqa		%3,	%2
-	punpcklbw	%2,	%3
-	punpckldq	%1,	%2
+    ;add            %4, %5
+    movd        %2, [%4+%5-1]
+    movdqa      %3, %2
+    punpcklbw   %2, %3
+    movdqa      %3, %2
+    punpcklbw   %2, %3
+    punpckldq   %1, %2
 %endmacro
 
-%macro  SUMW_HORIZON1 2
-	movdqa      %2, %1
-	psrldq      %2, 8
-	paddusw     %1, %2
-	movdqa      %2, %1
-	psrldq      %2, 4
-	paddusw     %1, %2
-	movdqa      %2, %1
-	psrldq      %2, 2
-	paddusw     %1, %2
+%macro SUMW_HORIZON1 2
+    movdqa      %2, %1
+    psrldq      %2, 8
+    paddusw     %1, %2
+    movdqa      %2, %1
+    psrldq      %2, 4
+    paddusw     %1, %2
+    movdqa      %2, %1
+    psrldq      %2, 2
+    paddusw     %1, %2
 %endmacro
 
-%macro	LOAD_COLUMN 6
-		movd	%1,	[%5]
-		movd	%2,	[%5+%6]
-		punpcklbw %1,	%2
-		lea		%5,	[%5+2*%6]
-		movd	%3,	[%5]
-		movd	%2,	[%5+%6]
-		punpcklbw %3,	%2
-		punpcklwd %1,	%3
-		lea		%5,	[%5+2*%6]
-		movd	%4,	[%5]
-		movd	%2,	[%5+%6]
-		punpcklbw %4,	%2
-		lea		%5,	[%5+2*%6]
-		movd	%3,	[%5]
-		movd	%2,	[%5+%6]
-		lea		%5,	[%5+2*%6]
-		punpcklbw %3,	%2
-		punpcklwd %4,	%3
-		punpckhdq %1,	%4
+%macro LOAD_COLUMN 6
+    movd    %1, [%5]
+    movd    %2, [%5+%6]
+    punpcklbw %1,   %2
+    lea     %5, [%5+2*%6]
+    movd    %3, [%5]
+    movd    %2, [%5+%6]
+    punpcklbw %3,   %2
+    punpcklwd %1,   %3
+    lea     %5, [%5+2*%6]
+    movd    %4, [%5]
+    movd    %2, [%5+%6]
+    punpcklbw %4,   %2
+    lea     %5, [%5+2*%6]
+    movd    %3, [%5]
+    movd    %2, [%5+%6]
+    lea     %5, [%5+2*%6]
+    punpcklbw %3,   %2
+    punpcklwd %4,   %3
+    punpckhdq %1,   %4
 %endmacro
 
-%macro  SUMW_HORIZON 3
-	movhlps		%2, %1			; x2 = xx xx xx xx d7 d6 d5 d4
-	paddw		%1, %2			; x1 = xx xx xx xx d37 d26 d15 d04
-	punpcklwd	%1, %3			; x1 =  d37  d26 d15 d04
-	movhlps		%2, %1			; x2 = xxxx xxxx d37 d26
-	paddd		%1, %2			; x1 = xxxx xxxx d1357 d0246
-	pshuflw		%2, %1, 0x4e	; x2 = xxxx xxxx d0246 d1357
-	paddd		%1, %2			; x1 = xxxx xxxx xxxx  d01234567
+%macro SUMW_HORIZON 3
+    movhlps     %2, %1          ; x2 = xx xx xx xx d7 d6 d5 d4
+    paddw       %1, %2          ; x1 = xx xx xx xx d37 d26 d15 d04
+    punpcklwd   %1, %3          ; x1 =  d37  d26 d15 d04
+    movhlps     %2, %1          ; x2 = xxxx xxxx d37 d26
+    paddd       %1, %2          ; x1 = xxxx xxxx d1357 d0246
+    pshuflw     %2, %1, 0x4e    ; x2 = xxxx xxxx d0246 d1357
+    paddd       %1, %2          ; x1 = xxxx xxxx xxxx  d01234567
 %endmacro
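SUMW_HORIZON (and SUMW_HORIZON1 above it) is a horizontal reduction: the eight 16-bit lanes of a register are folded into a single sum left in the low element, exactly as the d04/d15/... comments trace. Scalar equivalent (a sketch):

#include <stdint.h>

/* Scalar equivalent of SUMW_HORIZON: sum eight 16-bit lanes; the assembly
 * leaves the result in the low dword of its first operand. */
static int32_t sumw_horizon(const int16_t v[8]) {
    int32_t sum = 0;
    for (int i = 0; i < 8; i++)
        sum += v[i];
    return sum;
}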
 
 
-%macro  COPY_16_TIMES 2
-		movdqa		%2,	[%1-16]
-		psrldq		%2,	15
-		pmuludq		%2,	[mmx_01bytes]
-		pshufd		%2,	%2, 0
+%macro COPY_16_TIMES 2
+    movdqa      %2, [%1-16]
+    psrldq      %2, 15
+    pmuludq     %2, [mmx_01bytes]
+    pshufd      %2, %2, 0
 %endmacro
 
-%macro  COPY_16_TIMESS 3
-		movdqa		%2,	[%1+%3-16]
-		psrldq		%2,	15
-		pmuludq		%2,	[mmx_01bytes]
-		pshufd		%2,	%2, 0
+%macro COPY_16_TIMESS 3
+    movdqa      %2, [%1+%3-16]
+    psrldq      %2, 15
+    pmuludq     %2, [mmx_01bytes]
+    pshufd      %2, %2, 0
 %endmacro
 
-%macro	LOAD_COLUMN_C 6
-		movd	%1,	[%5]
-		movd	%2,	[%5+%6]
-		punpcklbw %1,%2
-		lea		%5,	[%5+2*%6]
-		movd	%3,	[%5]
-		movd	%2,	[%5+%6]
-		punpcklbw %3,	%2
-		punpckhwd %1,	%3
-		lea		%5,	[%5+2*%6]
+%macro LOAD_COLUMN_C 6
+    movd    %1, [%5]
+    movd    %2, [%5+%6]
+    punpcklbw %1,%2
+    lea     %5, [%5+2*%6]
+    movd    %3, [%5]
+    movd    %2, [%5+%6]
+    punpcklbw %3,   %2
+    punpckhwd %1,   %3
+    lea     %5, [%5+2*%6]
 %endmacro
 
 %macro LOAD_2_LEFT_AND_ADD 0
-        lea         r1, [r1+2*r2]
-        movzx		r4, byte [r1-0x01]
-        add			r3, r4
-        movzx		r4, byte [r1+r2-0x01]
-        add			r3, r4
+    lea         r1, [r1+2*r2]
+    movzx       r4, byte [r1-0x01]
+    add         r3, r4
+    movzx       r4, byte [r1+r2-0x01]
+    add         r3, r4
 %endmacro
 
 ;***********************************************************************
@@ -184,127 +184,127 @@
 ;***********************************************************************
 ;   void WelsI4x4LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
 ;
-;	pred must align to 16
+;   pred must align to 16
 ;***********************************************************************
 WELS_EXTERN WelsI4x4LumaPredH_sse2
-	push r3
-	%assign push_num 1
-	LOAD_3_PARA
-	SIGN_EXTENSION r2, r2d
-	movzx		r3,	byte [r1-1]
-	movd		xmm0,	r3d
-	pmuludq		xmm0,	[mmx_01bytes]
+    push r3
+    %assign push_num 1
+    LOAD_3_PARA
+    SIGN_EXTENSION r2, r2d
+    movzx       r3, byte [r1-1]
+    movd        xmm0,   r3d
+    pmuludq     xmm0,   [mmx_01bytes]
 
-	movzx		r3,	byte [r1+r2-1]
-	movd		xmm1,	r3d
-	pmuludq		xmm1,	[mmx_01bytes]
+    movzx       r3, byte [r1+r2-1]
+    movd        xmm1,   r3d
+    pmuludq     xmm1,   [mmx_01bytes]
 
-	unpcklps	xmm0,	xmm1
+    unpcklps    xmm0,   xmm1
 
-	lea			r1,	[r1+r2*2]
-	movzx		r3,	byte [r1-1]
-	movd		xmm2,	r3d
-	pmuludq		xmm2,	[mmx_01bytes]
+    lea         r1, [r1+r2*2]
+    movzx       r3, byte [r1-1]
+    movd        xmm2,   r3d
+    pmuludq     xmm2,   [mmx_01bytes]
 
-	movzx		r3,	byte [r1+r2-1]
-	movd		xmm3,	r3d
-	pmuludq		xmm3,	[mmx_01bytes]
+    movzx       r3, byte [r1+r2-1]
+    movd        xmm3,   r3d
+    pmuludq     xmm3,   [mmx_01bytes]
 
-	unpcklps	xmm2,	xmm3
-	unpcklpd	xmm0,	xmm2
+    unpcklps    xmm2,   xmm3
+    unpcklpd    xmm0,   xmm2
 
-	movdqa		[r0],	xmm0
-	pop r3
-	ret
+    movdqa      [r0],   xmm0
+    pop r3
+    ret
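WelsI4x4LumaPredH_sse2 fills each row of the 4x4 prediction with the pixel to its left; the pmuludq by mmx_01bytes is just a byte broadcast. Since the result is written with a single movdqa, pred is a packed 16-byte block. A scalar sketch (C function name illustrative):

#include <stdint.h>

/* Scalar model of WelsI4x4LumaPredH_sse2: pred is 4x4 packed into 16 bytes,
 * pRef points at the top-left pixel of the block in the reference picture. */
static void I4x4LumaPredH_c(uint8_t pred[16], const uint8_t *pRef, int32_t stride) {
    for (int y = 0; y < 4; y++) {
        uint8_t left = pRef[y * stride - 1];
        for (int x = 0; x < 4; x++)
            pred[y * 4 + x] = left;
    }
}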
 
 ;***********************************************************************
 ; void WelsI16x16LumaPredPlane_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
 ;***********************************************************************
 WELS_EXTERN WelsI16x16LumaPredPlane_sse2
-		push r3
-		push r4
-		%assign push_num 2
-		LOAD_3_PARA
-		PUSH_XMM 8
-		SIGN_EXTENSION r2, r2d
-		sub		r1,	1
-		sub		r1,	r2
+    push r3
+    push r4
+    %assign push_num 2
+    LOAD_3_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r2, r2d
+    sub     r1, 1
+    sub     r1, r2
 
-		;for H
-		pxor	xmm7,	xmm7
-		movq	xmm0,	[r1]
-		movdqa	xmm5,	[sse2_plane_dec]
-		punpcklbw xmm0,	xmm7
-		pmullw	xmm0,	xmm5
-		movq	xmm1,	[r1 + 9]
-		movdqa	xmm6,	[sse2_plane_inc]
-		punpcklbw xmm1,	xmm7
-		pmullw	xmm1,	xmm6
-		psubw	xmm1,	xmm0
+    ;for H
+    pxor    xmm7,   xmm7
+    movq    xmm0,   [r1]
+    movdqa  xmm5,   [sse2_plane_dec]
+    punpcklbw xmm0, xmm7
+    pmullw  xmm0,   xmm5
+    movq    xmm1,   [r1 + 9]
+    movdqa  xmm6,   [sse2_plane_inc]
+    punpcklbw xmm1, xmm7
+    pmullw  xmm1,   xmm6
+    psubw   xmm1,   xmm0
 
-		SUMW_HORIZON	xmm1,xmm0,xmm2
-		movd    r3d,	xmm1		; H += (i + 1) * (top[8 + i] - top[6 - i]);
-		movsx	r3,	r3w
-		imul	r3,	5
-		add		r3,	32
-		sar		r3,	6			; b = (5 * H + 32) >> 6;
-		SSE2_Copy8Times	xmm1, r3d	; xmm1 = b,b,b,b,b,b,b,b
+    SUMW_HORIZON    xmm1,xmm0,xmm2
+    movd    r3d,    xmm1        ; H += (i + 1) * (top[8 + i] - top[6 - i]);
+    movsx   r3, r3w
+    imul    r3, 5
+    add     r3, 32
+    sar     r3, 6           ; b = (5 * H + 32) >> 6;
+    SSE2_Copy8Times xmm1, r3d   ; xmm1 = b,b,b,b,b,b,b,b
 
-		movzx	r4,	BYTE [r1+16]
-		sub	r1, 3
-		LOAD_COLUMN		xmm0, xmm2, xmm3, xmm4, r1, r2
+    movzx   r4, BYTE [r1+16]
+    sub r1, 3
+    LOAD_COLUMN     xmm0, xmm2, xmm3, xmm4, r1, r2
 
-		add		r1,	3
-		movzx	r3,	BYTE [r1+8*r2]
-		add		r4,	r3
-		shl		r4,	4			;	a = (left[15*stride] + top[15]) << 4;
+    add     r1, 3
+    movzx   r3, BYTE [r1+8*r2]
+    add     r4, r3
+    shl     r4, 4           ;   a = (left[15*stride] + top[15]) << 4;
 
-		sub	r1, 3
-		add		r1,	r2
-		LOAD_COLUMN		xmm7, xmm2, xmm3, xmm4, r1, r2
-		pxor	xmm4,	xmm4
-		punpckhbw xmm0,	xmm4
-		pmullw	xmm0,	xmm5
-		punpckhbw xmm7,	xmm4
-		pmullw	xmm7,	xmm6
-		psubw	xmm7,	xmm0
+    sub r1, 3
+    add     r1, r2
+    LOAD_COLUMN     xmm7, xmm2, xmm3, xmm4, r1, r2
+    pxor    xmm4,   xmm4
+    punpckhbw xmm0, xmm4
+    pmullw  xmm0,   xmm5
+    punpckhbw xmm7, xmm4
+    pmullw  xmm7,   xmm6
+    psubw   xmm7,   xmm0
 
-		SUMW_HORIZON   xmm7,xmm0,xmm2
-		movd    r3d,   xmm7			; V
-		movsx	r3,	r3w
-		imul	r3,	5
-		add		r3,	32
-		sar		r3,	6				; c = (5 * V + 32) >> 6;
-		SSE2_Copy8Times	xmm4, r3d		; xmm4 = c,c,c,c,c,c,c,c
+    SUMW_HORIZON   xmm7,xmm0,xmm2
+    movd    r3d,   xmm7         ; V
+    movsx   r3, r3w
+    imul    r3, 5
+    add     r3, 32
+    sar     r3, 6               ; c = (5 * V + 32) >> 6;
+    SSE2_Copy8Times xmm4, r3d       ; xmm4 = c,c,c,c,c,c,c,c
 
-		add		r4,	16
-		imul	r3,	-7
-		add		r3,	r4				; s = a + 16 + (-7)*c
-		SSE2_Copy8Times	xmm0, r3d		; xmm0 = s,s,s,s,s,s,s,s
+    add     r4, 16
+    imul    r3, -7
+    add     r3, r4              ; s = a + 16 + (-7)*c
+    SSE2_Copy8Times xmm0, r3d       ; xmm0 = s,s,s,s,s,s,s,s
 
-		xor		r3,	r3
-		movdqa	xmm5,	[sse2_plane_inc_minus]
+    xor     r3, r3
+    movdqa  xmm5,   [sse2_plane_inc_minus]
 
 get_i16x16_luma_pred_plane_sse2_1:
-		movdqa	xmm2,	xmm1
-		pmullw	xmm2,	xmm5
-		paddw	xmm2,	xmm0
-		psraw	xmm2,	5
-		movdqa	xmm3,	xmm1
-		pmullw	xmm3,	xmm6
-		paddw	xmm3,	xmm0
-		psraw	xmm3,	5
-		packuswb xmm2,	xmm3
-		movdqa	[r0],	xmm2
-		paddw	xmm0,	xmm4
-		add		r0,	16
-		inc		r3
-		cmp		r3,	16
-		jnz get_i16x16_luma_pred_plane_sse2_1
-		POP_XMM
-		pop r4
-		pop r3
-		ret
+    movdqa  xmm2,   xmm1
+    pmullw  xmm2,   xmm5
+    paddw   xmm2,   xmm0
+    psraw   xmm2,   5
+    movdqa  xmm3,   xmm1
+    pmullw  xmm3,   xmm6
+    paddw   xmm3,   xmm0
+    psraw   xmm3,   5
+    packuswb xmm2,  xmm3
+    movdqa  [r0],   xmm2
+    paddw   xmm0,   xmm4
+    add     r0, 16
+    inc     r3
+    cmp     r3, 16
+    jnz get_i16x16_luma_pred_plane_sse2_1
+    POP_XMM
+    pop r4
+    pop r3
+    ret
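Taken together, the inline comments (b = (5*H+32)>>6, a = (left[15*stride]+top[15])<<4, c = (5*V+32)>>6, s = a+16+(-7)*c, and the final >>5 in the loop) are the H.264 Intra_16x16 plane mode. A scalar sketch of the same arithmetic; pred is written as 16 contiguous 16-byte rows, matching the movdqa/add r0,16 loop:

#include <stdint.h>

static uint8_t clip255(int v) { return v < 0 ? 0 : (v > 255 ? 255 : (uint8_t)v); }

/* Scalar model of WelsI16x16LumaPredPlane_sse2. */
static void I16x16LumaPredPlane_c(uint8_t *pred, const uint8_t *pRef, int32_t stride) {
    const uint8_t *top  = pRef - stride;   /* 16 pixels above; top[-1] is the corner  */
    const uint8_t *left = pRef - 1;        /* left[y*stride] is the pixel left of row y */
    int H = 0, V = 0;
    for (int i = 0; i < 8; i++) {
        H += (i + 1) * (top[8 + i] - top[6 - i]);     /* cf. the asm comment */
        V += (i + 1) * (left[(8 + i) * stride] - left[(6 - i) * stride]);
    }
    int a = (left[15 * stride] + top[15]) << 4;       /* a = (left[15*stride] + top[15]) << 4 */
    int b = (5 * H + 32) >> 6;
    int c = (5 * V + 32) >> 6;
    for (int y = 0; y < 16; y++)
        for (int x = 0; x < 16; x++)
            pred[y * 16 + x] = clip255((a + b * (x - 7) + c * (y - 7) + 16) >> 5);
}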
 
 ;***********************************************************************
 ; void WelsI16x16LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
@@ -311,38 +311,38 @@
 ;***********************************************************************
 
 %macro SSE2_PRED_H_16X16_ONE_LINE 0
-	add r0, 16
-	add r1, r2
-	movzx r3, byte [r1]
-	SSE2_Copy16Times xmm0, r3d
-	movdqa [r0], xmm0
+    add r0, 16
+    add r1, r2
+    movzx r3, byte [r1]
+    SSE2_Copy16Times xmm0, r3d
+    movdqa [r0], xmm0
 %endmacro
 
 WELS_EXTERN WelsI16x16LumaPredH_sse2
-	push r3
-	%assign push_num 1
-	LOAD_3_PARA
-	SIGN_EXTENSION r2, r2d
-	dec r1
-	movzx r3, byte [r1]
-	SSE2_Copy16Times xmm0, r3d
-	movdqa [r0], xmm0
-	SSE2_PRED_H_16X16_ONE_LINE
-	SSE2_PRED_H_16X16_ONE_LINE
-	SSE2_PRED_H_16X16_ONE_LINE
-	SSE2_PRED_H_16X16_ONE_LINE
-	SSE2_PRED_H_16X16_ONE_LINE
-	SSE2_PRED_H_16X16_ONE_LINE
-	SSE2_PRED_H_16X16_ONE_LINE
-	SSE2_PRED_H_16X16_ONE_LINE
-	SSE2_PRED_H_16X16_ONE_LINE
-	SSE2_PRED_H_16X16_ONE_LINE
-	SSE2_PRED_H_16X16_ONE_LINE
-	SSE2_PRED_H_16X16_ONE_LINE
-	SSE2_PRED_H_16X16_ONE_LINE
-	SSE2_PRED_H_16X16_ONE_LINE
-	SSE2_PRED_H_16X16_ONE_LINE
-	pop r3
+    push r3
+    %assign push_num 1
+    LOAD_3_PARA
+    SIGN_EXTENSION r2, r2d
+    dec r1
+    movzx r3, byte [r1]
+    SSE2_Copy16Times xmm0, r3d
+    movdqa [r0], xmm0
+    SSE2_PRED_H_16X16_ONE_LINE
+    SSE2_PRED_H_16X16_ONE_LINE
+    SSE2_PRED_H_16X16_ONE_LINE
+    SSE2_PRED_H_16X16_ONE_LINE
+    SSE2_PRED_H_16X16_ONE_LINE
+    SSE2_PRED_H_16X16_ONE_LINE
+    SSE2_PRED_H_16X16_ONE_LINE
+    SSE2_PRED_H_16X16_ONE_LINE
+    SSE2_PRED_H_16X16_ONE_LINE
+    SSE2_PRED_H_16X16_ONE_LINE
+    SSE2_PRED_H_16X16_ONE_LINE
+    SSE2_PRED_H_16X16_ONE_LINE
+    SSE2_PRED_H_16X16_ONE_LINE
+    SSE2_PRED_H_16X16_ONE_LINE
+    SSE2_PRED_H_16X16_ONE_LINE
+    pop r3
     ret
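WelsI16x16LumaPredH_sse2 is the same idea over 16 rows: broadcast the left neighbour across each row. Scalar sketch:

#include <stdint.h>

/* Scalar model of WelsI16x16LumaPredH_sse2: pred is 16 contiguous 16-byte rows. */
static void I16x16LumaPredH_c(uint8_t *pred, const uint8_t *pRef, int32_t stride) {
    for (int y = 0; y < 16; y++) {
        uint8_t left = pRef[y * stride - 1];
        for (int x = 0; x < 16; x++)
            pred[y * 16 + x] = left;
    }
}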
 
 ;***********************************************************************
@@ -378,289 +378,289 @@
 ; void WelsIChromaPredPlane_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
 ;***********************************************************************
 WELS_EXTERN WelsIChromaPredPlane_sse2
-		push r3
-		push r4
-		%assign push_num 2
-		LOAD_3_PARA
-		PUSH_XMM 8
-		SIGN_EXTENSION r2, r2d
-		sub		r1,	1
-		sub		r1,	r2
+    push r3
+    push r4
+    %assign push_num 2
+    LOAD_3_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r2, r2d
+    sub     r1, 1
+    sub     r1, r2
 
-		pxor	mm7,	mm7
-		movq	mm0,	[r1]
-		movq	mm5,	[sse2_plane_dec_c]
-		punpcklbw mm0,	mm7
-		pmullw	mm0,	mm5
-		movq	mm1,	[r1 + 5]
-		movq	mm6,	[sse2_plane_inc_c]
-		punpcklbw mm1,	mm7
-		pmullw	mm1,	mm6
-		psubw	mm1,	mm0
+    pxor    mm7,    mm7
+    movq    mm0,    [r1]
+    movq    mm5,    [sse2_plane_dec_c]
+    punpcklbw mm0,  mm7
+    pmullw  mm0,    mm5
+    movq    mm1,    [r1 + 5]
+    movq    mm6,    [sse2_plane_inc_c]
+    punpcklbw mm1,  mm7
+    pmullw  mm1,    mm6
+    psubw   mm1,    mm0
 
-		movq2dq xmm1,   mm1
-		pxor    xmm2,   xmm2
-		SUMW_HORIZON	xmm1,xmm0,xmm2
-		movd    r3d,	xmm1
-		movsx	r3,	r3w
-		imul	r3,	17
-		add		r3,	16
-		sar		r3,	5			; b = (17 * H + 16) >> 5;
-		SSE2_Copy8Times	xmm1, r3d	; mm1 = b,b,b,b,b,b,b,b
+    movq2dq xmm1,   mm1
+    pxor    xmm2,   xmm2
+    SUMW_HORIZON    xmm1,xmm0,xmm2
+    movd    r3d,    xmm1
+    movsx   r3, r3w
+    imul    r3, 17
+    add     r3, 16
+    sar     r3, 5           ; b = (17 * H + 16) >> 5;
+    SSE2_Copy8Times xmm1, r3d   ; mm1 = b,b,b,b,b,b,b,b
 
-		movzx	r3,	BYTE [r1+8]
-		sub	r1, 3
-		LOAD_COLUMN_C	mm0, mm2, mm3, mm4, r1, r2
+    movzx   r3, BYTE [r1+8]
+    sub r1, 3
+    LOAD_COLUMN_C   mm0, mm2, mm3, mm4, r1, r2
 
-		add		r1,	3
-		movzx	r4,	BYTE [r1+4*r2]
-		add		r4,	r3
-		shl		r4,	4			; a = (left[7*stride] + top[7]) << 4;
+    add     r1, 3
+    movzx   r4, BYTE [r1+4*r2]
+    add     r4, r3
+    shl     r4, 4           ; a = (left[7*stride] + top[7]) << 4;
 
-		sub	r1, 3
-		add		r1,	r2
-		LOAD_COLUMN_C	mm7, mm2, mm3, mm4, r1, r2
-		pxor	mm4,	mm4
-		punpckhbw mm0,	mm4
-		pmullw	mm0,	mm5
-		punpckhbw mm7,	mm4
-		pmullw	mm7,	mm6
-		psubw	mm7,	mm0
+    sub r1, 3
+    add     r1, r2
+    LOAD_COLUMN_C   mm7, mm2, mm3, mm4, r1, r2
+    pxor    mm4,    mm4
+    punpckhbw mm0,  mm4
+    pmullw  mm0,    mm5
+    punpckhbw mm7,  mm4
+    pmullw  mm7,    mm6
+    psubw   mm7,    mm0
 
-		movq2dq xmm7,   mm7
-		pxor    xmm2,   xmm2
-		SUMW_HORIZON	xmm7,xmm0,xmm2
-		movd    r3d,    xmm7			; V
-		movsx	r3,	r3w
-		imul	r3,	17
-		add		r3,	16
-		sar		r3,	5				; c = (17 * V + 16) >> 5;
-		SSE2_Copy8Times	xmm4, r3d	; mm4 = c,c,c,c,c,c,c,c
+    movq2dq xmm7,   mm7
+    pxor    xmm2,   xmm2
+    SUMW_HORIZON    xmm7,xmm0,xmm2
+    movd    r3d,    xmm7            ; V
+    movsx   r3, r3w
+    imul    r3, 17
+    add     r3, 16
+    sar     r3, 5               ; c = (17 * V + 16) >> 5;
+    SSE2_Copy8Times xmm4, r3d   ; mm4 = c,c,c,c,c,c,c,c
 
-		add		r4,	16
-		imul	r3,	-3
-		add		r3,	r4		; s = a + 16 + (-3)*c
-		SSE2_Copy8Times	xmm0, r3d	; xmm0 = s,s,s,s,s,s,s,s
+    add     r4, 16
+    imul    r3, -3
+    add     r3, r4      ; s = a + 16 + (-3)*c
+    SSE2_Copy8Times xmm0, r3d   ; xmm0 = s,s,s,s,s,s,s,s
 
-		xor		r3,	r3
-		movdqa	xmm5,	[sse2_plane_mul_b_c]
+    xor     r3, r3
+    movdqa  xmm5,   [sse2_plane_mul_b_c]
 
 get_i_chroma_pred_plane_sse2_1:
-		movdqa	xmm2,	xmm1
-		pmullw	xmm2,	xmm5
-		paddw	xmm2,	xmm0
-		psraw	xmm2,	5
-		packuswb xmm2,	xmm2
-		movq	[r0],	xmm2
-		paddw	xmm0,	xmm4
-		add		r0,	8
-		inc		r3
-		cmp		r3,	8
-		jnz get_i_chroma_pred_plane_sse2_1
-		POP_XMM
-		pop r4
-		pop r3
-		WELSEMMS
-		ret
+    movdqa  xmm2,   xmm1
+    pmullw  xmm2,   xmm5
+    paddw   xmm2,   xmm0
+    psraw   xmm2,   5
+    packuswb xmm2,  xmm2
+    movq    [r0],   xmm2
+    paddw   xmm0,   xmm4
+    add     r0, 8
+    inc     r3
+    cmp     r3, 8
+    jnz get_i_chroma_pred_plane_sse2_1
+    POP_XMM
+    pop r4
+    pop r3
+    WELSEMMS
+    ret
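WelsIChromaPredPlane_sse2 is the 8x8 chroma counterpart of the luma plane mode, with the 17*H/17*V weighting and the (x-3)/(y-3) offsets implied by a = (left[7*stride]+top[7])<<4 and s = a+16+(-3)*c. Scalar sketch (pred written as 8 contiguous 8-byte rows, matching the movq/add r0,8 loop):

#include <stdint.h>

static uint8_t clip255(int v) { return v < 0 ? 0 : (v > 255 ? 255 : (uint8_t)v); }

/* Scalar model of WelsIChromaPredPlane_sse2 (8x8 chroma plane mode). */
static void IChromaPredPlane_c(uint8_t *pred, const uint8_t *pRef, int32_t stride) {
    const uint8_t *top  = pRef - stride;
    const uint8_t *left = pRef - 1;
    int H = 0, V = 0;
    for (int i = 0; i < 4; i++) {
        H += (i + 1) * (top[4 + i] - top[2 - i]);
        V += (i + 1) * (left[(4 + i) * stride] - left[(2 - i) * stride]);
    }
    int a = (left[7 * stride] + top[7]) << 4;   /* cf. "a = (left[7*stride] + top[7]) << 4" */
    int b = (17 * H + 16) >> 5;                 /* cf. "b = (17 * H + 16) >> 5" */
    int c = (17 * V + 16) >> 5;
    for (int y = 0; y < 8; y++)
        for (int x = 0; x < 8; x++)
            pred[y * 8 + x] = clip255((a + b * (x - 3) + c * (y - 3) + 16) >> 5);
}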
 
 ;***********************************************************************
-;	0 |1 |2 |3 |4 |
-;	6 |7 |8 |9 |10|
-;	11|12|13|14|15|
-;	16|17|18|19|20|
-;	21|22|23|24|25|
-;	7 is the start pixel of current 4x4 block
-;	pred[7] = ([6]+[0]*2+[1]+2)/4
+;   0 |1 |2 |3 |4 |
+;   6 |7 |8 |9 |10|
+;   11|12|13|14|15|
+;   16|17|18|19|20|
+;   21|22|23|24|25|
+;   7 is the start pixel of current 4x4 block
+;   pred[7] = ([6]+[0]*2+[1]+2)/4
 ;
 ;   void WelsI4x4LumaPredDDR_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
 ;
 ;***********************************************************************
 WELS_EXTERN WelsI4x4LumaPredDDR_mmx
-	%assign push_num 0
-	LOAD_3_PARA
-	SIGN_EXTENSION r2, r2d
-	movq        mm1,[r1+r2-8]		;get value of 11,decreasing 8 is trying to improve the performance of movq mm1[8] = 11
-	movq        mm2,[r1-8]			;get value of 6 mm2[8] = 6
-	sub		r1, r2			;mov eax to above line of current block(postion of 1)
-	punpckhbw   mm2,[r1-8]			;mm2[8](high 8th byte of mm2) = [0](value of 0), mm2[7]= [6]
-	movd        mm3,[r1]			;get value 1, mm3[1] = [1],mm3[2]=[2],mm3[3]=[3]
-	punpckhwd   mm1,mm2				;mm1[8]=[0],mm1[7]=[6],mm1[6]=[11]
-	psllq       mm3,18h				;mm3[5]=[1]
-	psrlq       mm1,28h				;mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
-	por         mm3,mm1				;mm3[6]=[3],mm3[5]=[2],mm3[4]=[1],mm3[3]=[0],mm3[2]=[6],mm3[1]=[11]
-	movq        mm1,mm3				;mm1[6]=[3],mm1[5]=[2],mm1[4]=[1],mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
-	lea  	    r1,[r1+r2*2-8h]		;set eax point to 12
-	movq        mm4,[r1+r2]		;get value of 16, mm4[8]=[16]
-	psllq       mm3,8				;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=0
-	psrlq       mm4,38h				;mm4[1]=[16]
-	por         mm3,mm4				;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=[16]
-	movq        mm2,mm3				;mm2[7]=[3],mm2[6]=[2],mm2[5]=[1],mm2[4]=[0],mm2[3]=[6],mm2[2]=[11],mm2[1]=[16]
-	movq        mm4,[r1+r2*2]		;mm4[8]=[21]
-	psllq       mm3,8				;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=0
-	psrlq       mm4,38h				;mm4[1]=[21]
-	por         mm3,mm4				;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=[21]
-	movq        mm4,mm3				;mm4[8]=[3],mm4[7]=[2],mm4[6]=[1],mm4[5]=[0],mm4[4]=[6],mm4[3]=[11],mm4[2]=[16],mm4[1]=[21]
-	pavgb       mm3,mm1				;mm3=([11]+[21]+1)/2
-	pxor        mm1,mm4				;find odd value in the lowest bit of each byte
-	pand        mm1,[mmx_01bytes]	;set the odd bit
-	psubusb     mm3,mm1				;decrease 1 from odd bytes
-	pavgb       mm2,mm3				;mm2=(([11]+[21]+1)/2+1+[16])/2
+    %assign push_num 0
+    LOAD_3_PARA
+    SIGN_EXTENSION r2, r2d
+    movq        mm1,[r1+r2-8]       ;get value of 11,decreasing 8 is trying to improve the performance of movq mm1[8] = 11
+    movq        mm2,[r1-8]          ;get value of 6 mm2[8] = 6
+    sub     r1, r2          ;mov eax to above line of current block(postion of 1)
+    punpckhbw   mm2,[r1-8]          ;mm2[8](high 8th byte of mm2) = [0](value of 0), mm2[7]= [6]
+    movd        mm3,[r1]            ;get value 1, mm3[1] = [1],mm3[2]=[2],mm3[3]=[3]
+    punpckhwd   mm1,mm2             ;mm1[8]=[0],mm1[7]=[6],mm1[6]=[11]
+    psllq       mm3,18h             ;mm3[5]=[1]
+    psrlq       mm1,28h             ;mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
+    por         mm3,mm1             ;mm3[6]=[3],mm3[5]=[2],mm3[4]=[1],mm3[3]=[0],mm3[2]=[6],mm3[1]=[11]
+    movq        mm1,mm3             ;mm1[6]=[3],mm1[5]=[2],mm1[4]=[1],mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
+    lea         r1,[r1+r2*2-8h]     ;set eax point to 12
+    movq        mm4,[r1+r2]     ;get value of 16, mm4[8]=[16]
+    psllq       mm3,8               ;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=0
+    psrlq       mm4,38h             ;mm4[1]=[16]
+    por         mm3,mm4             ;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=[16]
+    movq        mm2,mm3             ;mm2[7]=[3],mm2[6]=[2],mm2[5]=[1],mm2[4]=[0],mm2[3]=[6],mm2[2]=[11],mm2[1]=[16]
+    movq        mm4,[r1+r2*2]       ;mm4[8]=[21]
+    psllq       mm3,8               ;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=0
+    psrlq       mm4,38h             ;mm4[1]=[21]
+    por         mm3,mm4             ;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=[21]
+    movq        mm4,mm3             ;mm4[8]=[3],mm4[7]=[2],mm4[6]=[1],mm4[5]=[0],mm4[4]=[6],mm4[3]=[11],mm4[2]=[16],mm4[1]=[21]
+    pavgb       mm3,mm1             ;mm3=([11]+[21]+1)/2
+    pxor        mm1,mm4             ;find odd value in the lowest bit of each byte
+    pand        mm1,[mmx_01bytes]   ;set the odd bit
+    psubusb     mm3,mm1             ;decrease 1 from odd bytes
+    pavgb       mm2,mm3             ;mm2=(([11]+[21]+1)/2+1+[16])/2
 
-	movd        [r0+12],mm2
-	psrlq       mm2,8
-	movd        [r0+8],mm2
-	psrlq       mm2,8
-	movd        [r0+4],mm2
-	psrlq       mm2,8
-	movd        [r0],mm2
-	WELSEMMS
-	ret
+    movd        [r0+12],mm2
+    psrlq       mm2,8
+    movd        [r0+8],mm2
+    psrlq       mm2,8
+    movd        [r0+4],mm2
+    psrlq       mm2,8
+    movd        [r0],mm2
+    WELSEMMS
+    ret
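The pavgb/pxor/pand/psubusb pattern used here (and in the HD/HU/VR/DDL/VL routines below) evaluates the 3-tap filter (a + 2*b + c + 2) >> 2 entirely in bytes: pavgb rounds up, the correction subtracts 1 wherever a and c have different parity, and the second pavgb then gives the exact 3-tap value. A scalar sketch of the trick and of the resulting diagonal-down-right block; only the pred[7] formula appears in the comments above, the remaining taps follow the standard H.264 down-right diagonal:

#include <stdint.h>

/* (a + 2*b + c + 2) >> 2, computed the way the MMX code does it. */
static uint8_t filt3_pavgb(uint8_t a, uint8_t b, uint8_t c) {
    uint8_t ac = (uint8_t)((a + c + 1) >> 1);   /* pavgb a,c           */
    ac -= (a ^ c) & 1;                          /* undo the round-up   */
    return (uint8_t)((ac + b + 1) >> 1);        /* pavgb with centre b */
}

/* Scalar model of WelsI4x4LumaPredDDR_mmx. lt is the corner, t[0..3] the row
 * above, l[0..3] the column to the left; pred is 4x4 packed into 16 bytes. */
static void I4x4LumaPredDDR_c(uint8_t pred[16], const uint8_t *pRef, int32_t stride) {
    uint8_t lt = pRef[-stride - 1];
    uint8_t t[4], l[4];
    for (int i = 0; i < 4; i++) {
        t[i] = pRef[-stride + i];
        l[i] = pRef[i * stride - 1];
    }
    /* diagonal sequence l3 l2 l1 l0 lt t0 t1 t2 t3, filtered with the 3-tap */
    uint8_t s[9] = { l[3], l[2], l[1], l[0], lt, t[0], t[1], t[2], t[3] };
    uint8_t f[7];
    for (int i = 0; i < 7; i++)
        f[i] = filt3_pavgb(s[i], s[i + 1], s[i + 2]);
    /* f[3] = (l0 + 2*lt + t0 + 2) >> 2, i.e. pred[7] in the comment's numbering */
    for (int y = 0; y < 4; y++)
        for (int x = 0; x < 4; x++)
            pred[y * 4 + x] = f[3 + x - y];
}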
 
 ;***********************************************************************
-;	0 |1 |2 |3 |4 |
-;	5 |6 |7 |8 |9 |
-;	10|11|12|13|14|
-;	15|16|17|18|19|
-;	20|21|22|23|24|
-;	6 is the start pixel of current 4x4 block
-;	pred[6] = ([1]+[2]+[3]+[4]+[5]+[10]+[15]+[20]+4)/8
+;   0 |1 |2 |3 |4 |
+;   5 |6 |7 |8 |9 |
+;   10|11|12|13|14|
+;   15|16|17|18|19|
+;   20|21|22|23|24|
+;   6 is the start pixel of current 4x4 block
+;   pred[6] = ([1]+[2]+[3]+[4]+[5]+[10]+[15]+[20]+4)/8
 ;
 ;   void WelsI4x4LumaPredDc_sse2(uint8_t *pred,uint8_t *pRef,int32_t stride)
 ;
 ;***********************************************************************
 WELS_EXTERN WelsI4x4LumaPredDc_sse2
-	push r3
-	push r4
-	%assign push_num 2
-	LOAD_3_PARA
-	SIGN_EXTENSION r2, r2d
-	movzx		r4,	byte [r1-1h]
-	sub			r1,	r2
-	movd		xmm0,	[r1]
-	pxor		xmm1,	xmm1
-	psadbw		xmm0,	xmm1
-	xor r3, r3
-	movd		r3d,	xmm0
-	add			r3,	r4
-	movzx		r4,	byte [r1+r2*2-1h]
-	add			r3,	r4
+    push r3
+    push r4
+    %assign push_num 2
+    LOAD_3_PARA
+    SIGN_EXTENSION r2, r2d
+    movzx       r4, byte [r1-1h]
+    sub         r1, r2
+    movd        xmm0,   [r1]
+    pxor        xmm1,   xmm1
+    psadbw      xmm0,   xmm1
+    xor r3, r3
+    movd        r3d,    xmm0
+    add         r3, r4
+    movzx       r4, byte [r1+r2*2-1h]
+    add         r3, r4
 
-	lea			r1,	[r1+r2*2-1]
-	movzx		r4,	byte [r1+r2]
-	add			r3,	r4
+    lea         r1, [r1+r2*2-1]
+    movzx       r4, byte [r1+r2]
+    add         r3, r4
 
-	movzx		r4,	byte [r1+r2*2]
-	add			r3,	r4
-	add			r3,	4
-	sar			r3,	3
-	imul		r3,	0x01010101
+    movzx       r4, byte [r1+r2*2]
+    add         r3, r4
+    add         r3, 4
+    sar         r3, 3
+    imul        r3, 0x01010101
 
-	movd		xmm0,	r3d
-	pshufd		xmm0,	xmm0,	0
-	movdqa		[r0],	xmm0
-	pop r4
-	pop r3
-	ret
+    movd        xmm0,   r3d
+    pshufd      xmm0,   xmm0,   0
+    movdqa      [r0],   xmm0
+    pop r4
+    pop r3
+    ret
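WelsI4x4LumaPredDc_sse2 is exactly the formula in the comment: average the four pixels above and the four to the left with rounding, then broadcast (the imul 0x01010101 plus pshufd is the byte broadcast). Scalar sketch:

#include <stdint.h>

/* Scalar model of WelsI4x4LumaPredDc_sse2: dc = (top0..3 + left0..3 + 4) >> 3. */
static void I4x4LumaPredDc_c(uint8_t pred[16], const uint8_t *pRef, int32_t stride) {
    int sum = 4;
    for (int i = 0; i < 4; i++)
        sum += pRef[-stride + i] + pRef[i * stride - 1];
    uint8_t dc = (uint8_t)(sum >> 3);
    for (int i = 0; i < 16; i++)
        pred[i] = dc;
}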
 
 ;***********************************************************************
-;	void WelsIChromaPredH_mmx(uint8_t *pred, uint8_t *pRef, int32_t stride)
+;   void WelsIChromaPredH_mmx(uint8_t *pred, uint8_t *pRef, int32_t stride)
 ;   copy 8 pixel of 8 line from left
 ;***********************************************************************
 %macro MMX_PRED_H_8X8_ONE_LINE 4
-	movq		%1,		[%3-8]
-	psrlq		%1,		38h
+    movq        %1,     [%3-8]
+    psrlq       %1,     38h
 
-	;pmuludq		%1,		[mmx_01bytes]		;extend to 4 bytes
-	pmullw		%1,		[mmx_01bytes]
-	pshufw		%1,		%1,	0
-	movq		[%4],	%1
+    ;pmuludq        %1,     [mmx_01bytes]       ;extend to 4 bytes
+    pmullw      %1,     [mmx_01bytes]
+    pshufw      %1,     %1, 0
+    movq        [%4],   %1
 %endmacro
 
 %macro MMX_PRED_H_8X8_ONE_LINEE 4
-	movq		%1,		[%3+r2-8]
-	psrlq		%1,		38h
+    movq        %1,     [%3+r2-8]
+    psrlq       %1,     38h
 
-	;pmuludq		%1,		[mmx_01bytes]		;extend to 4 bytes
-	pmullw		%1,		[mmx_01bytes]
-	pshufw		%1,		%1,	0
-	movq		[%4],	%1
+    ;pmuludq        %1,     [mmx_01bytes]       ;extend to 4 bytes
+    pmullw      %1,     [mmx_01bytes]
+    pshufw      %1,     %1, 0
+    movq        [%4],   %1
 %endmacro
 
 WELS_EXTERN WelsIChromaPredH_mmx
-	%assign push_num 0
-	LOAD_3_PARA
-	SIGN_EXTENSION r2, r2d
-	movq		mm0,	[r1-8]
-	psrlq		mm0,	38h
+    %assign push_num 0
+    LOAD_3_PARA
+    SIGN_EXTENSION r2, r2d
+    movq        mm0,    [r1-8]
+    psrlq       mm0,    38h
 
-	;pmuludq		mm0,	[mmx_01bytes]		;extend to 4 bytes
-	pmullw		mm0,		[mmx_01bytes]
-	pshufw		mm0,	mm0,	0
-	movq		[r0],	mm0
+    ;pmuludq        mm0,    [mmx_01bytes]       ;extend to 4 bytes
+    pmullw      mm0,        [mmx_01bytes]
+    pshufw      mm0,    mm0,    0
+    movq        [r0],   mm0
 
-	MMX_PRED_H_8X8_ONE_LINEE	mm0, mm1, r1,r0+8
+    MMX_PRED_H_8X8_ONE_LINEE    mm0, mm1, r1,r0+8
 
-	lea			r1,[r1+r2*2]
-	MMX_PRED_H_8X8_ONE_LINE	mm0, mm1, r1,r0+16
+    lea         r1,[r1+r2*2]
+    MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r1,r0+16
 
-	MMX_PRED_H_8X8_ONE_LINEE	mm0, mm1, r1,r0+24
+    MMX_PRED_H_8X8_ONE_LINEE    mm0, mm1, r1,r0+24
 
-	lea			r1,[r1+r2*2]
-	MMX_PRED_H_8X8_ONE_LINE	mm0, mm1, r1,r0+32
+    lea         r1,[r1+r2*2]
+    MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r1,r0+32
 
-	MMX_PRED_H_8X8_ONE_LINEE	mm0, mm1, r1,r0+40
+    MMX_PRED_H_8X8_ONE_LINEE    mm0, mm1, r1,r0+40
 
-	lea			r1,[r1+r2*2]
-	MMX_PRED_H_8X8_ONE_LINE	mm0, mm1, r1,r0+48
+    lea         r1,[r1+r2*2]
+    MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r1,r0+48
 
-	MMX_PRED_H_8X8_ONE_LINEE	mm0, mm1, r1,r0+56
-	WELSEMMS
-	ret
+    MMX_PRED_H_8X8_ONE_LINEE    mm0, mm1, r1,r0+56
+    WELSEMMS
+    ret
 
 ;***********************************************************************
-;	void WelsI4x4LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
+;   void WelsI4x4LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
 ;   copy pixels from top 4 pixels
 ;***********************************************************************
 WELS_EXTERN WelsI4x4LumaPredV_sse2
-	%assign push_num 0
-	LOAD_3_PARA
-	SIGN_EXTENSION r2, r2d
-	sub			r1,	r2
-	movd		xmm0,	[r1]
-	pshufd		xmm0,	xmm0,	0
-	movdqa		[r0],	xmm0
-	ret
+    %assign push_num 0
+    LOAD_3_PARA
+    SIGN_EXTENSION r2, r2d
+    sub         r1, r2
+    movd        xmm0,   [r1]
+    pshufd      xmm0,   xmm0,   0
+    movdqa      [r0],   xmm0
+    ret
 
 ;***********************************************************************
-;	void WelsIChromaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
+;   void WelsIChromaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
 ;   copy 8 pixels from top 8 pixels
 ;***********************************************************************
 WELS_EXTERN WelsIChromaPredV_sse2
-	%assign push_num 0
-	LOAD_3_PARA
-	SIGN_EXTENSION r2, r2d
-	sub		r1,		r2
-	movq		xmm0,		[r1]
-	movdqa		xmm1,		xmm0
-	punpcklqdq	xmm0,		xmm1
-	movdqa		[r0],		xmm0
-	movdqa		[r0+16],	xmm0
-	movdqa		[r0+32],	xmm0
-	movdqa		[r0+48],	xmm0
-	ret
+    %assign push_num 0
+    LOAD_3_PARA
+    SIGN_EXTENSION r2, r2d
+    sub     r1,     r2
+    movq        xmm0,       [r1]
+    movdqa      xmm1,       xmm0
+    punpcklqdq  xmm0,       xmm1
+    movdqa      [r0],       xmm0
+    movdqa      [r0+16],    xmm0
+    movdqa      [r0+32],    xmm0
+    movdqa      [r0+48],    xmm0
+    ret
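Both vertical predictors (WelsI4x4LumaPredV_sse2 above and WelsIChromaPredV_sse2 here) simply repeat the row of pixels directly above the block in every output row. Scalar sketch:

#include <stdint.h>
#include <string.h>

/* Scalar models of the two vertical predictors. */
static void I4x4LumaPredV_c(uint8_t pred[16], const uint8_t *pRef, int32_t stride) {
    for (int y = 0; y < 4; y++)
        memcpy(pred + y * 4, pRef - stride, 4);
}

static void IChromaPredV_c(uint8_t pred[64], const uint8_t *pRef, int32_t stride) {
    for (int y = 0; y < 8; y++)
        memcpy(pred + y * 8, pRef - stride, 8);
}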
 
 ;***********************************************************************
-;	lt|t0|t1|t2|t3|
-;	l0|
-;	l1|
-;	l2|
-;	l3|
-;	t3 will never been used
+;   lt|t0|t1|t2|t3|
+;   l0|
+;   l1|
+;   l2|
+;   l3|
+;   t3 will never been used
 ;   destination:
-;	|a |b |c |d |
-;	|e |f |a |b |
-;	|g |h |e |f |
-;	|i |j |g |h |
+;   |a |b |c |d |
+;   |e |f |a |b |
+;   |g |h |e |f |
+;   |i |j |g |h |
 
 ;   a = (1 + lt + l0)>>1
 ;   e = (1 + l0 + l1)>>1
@@ -679,68 +679,68 @@
 ;   void WelsI4x4LumaPredHD_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
 ;***********************************************************************
 WELS_EXTERN WelsI4x4LumaPredHD_mmx
-	%assign push_num 0
-	LOAD_3_PARA
-	SIGN_EXTENSION r2, r2d
-	sub         r1, r2
-	movd        mm0, [r1-1]            ; mm0 = [xx xx xx xx t2 t1 t0 lt]
-	psllq       mm0, 20h                ; mm0 = [t2 t1 t0 lt xx xx xx xx]
+    %assign push_num 0
+    LOAD_3_PARA
+    SIGN_EXTENSION r2, r2d
+    sub         r1, r2
+    movd        mm0, [r1-1]            ; mm0 = [xx xx xx xx t2 t1 t0 lt]
+    psllq       mm0, 20h                ; mm0 = [t2 t1 t0 lt xx xx xx xx]
 
-	movd        mm1, [r1+2*r2-4]
-	punpcklbw   mm1, [r1+r2-4]        ; mm1[7] = l0, mm1[6] = l1
-	lea         r1, [r1+2*r2]
-	movd        mm2, [r1+2*r2-4]
-	punpcklbw   mm2, [r1+r2-4]        ; mm2[7] = l2, mm2[6] = l3
-	punpckhwd   mm2, mm1                ; mm2 = [l0 l1 l2 l3 xx xx xx xx]
-	psrlq       mm2, 20h
-	pxor        mm0, mm2                ; mm0 = [t2 t1 t0 lt l0 l1 l2 l3]
+    movd        mm1, [r1+2*r2-4]
+    punpcklbw   mm1, [r1+r2-4]        ; mm1[7] = l0, mm1[6] = l1
+    lea         r1, [r1+2*r2]
+    movd        mm2, [r1+2*r2-4]
+    punpcklbw   mm2, [r1+r2-4]        ; mm2[7] = l2, mm2[6] = l3
+    punpckhwd   mm2, mm1                ; mm2 = [l0 l1 l2 l3 xx xx xx xx]
+    psrlq       mm2, 20h
+    pxor        mm0, mm2                ; mm0 = [t2 t1 t0 lt l0 l1 l2 l3]
 
-	movq        mm1, mm0
-	psrlq       mm1, 10h                ; mm1 = [xx xx t2 t1 t0 lt l0 l1]
-	movq        mm2, mm0
-	psrlq       mm2, 8h                 ; mm2 = [xx t2 t1 t0 lt l0 l1 l2]
-	movq        mm3, mm2
-	movq        mm4, mm1
-	pavgb       mm1, mm0
+    movq        mm1, mm0
+    psrlq       mm1, 10h                ; mm1 = [xx xx t2 t1 t0 lt l0 l1]
+    movq        mm2, mm0
+    psrlq       mm2, 8h                 ; mm2 = [xx t2 t1 t0 lt l0 l1 l2]
+    movq        mm3, mm2
+    movq        mm4, mm1
+    pavgb       mm1, mm0
 
-	pxor        mm4, mm0				; find odd value in the lowest bit of each byte
-	pand        mm4, [mmx_01bytes]	    ; set the odd bit
-	psubusb     mm1, mm4				; decrease 1 from odd bytes
+    pxor        mm4, mm0                ; find odd value in the lowest bit of each byte
+    pand        mm4, [mmx_01bytes]      ; set the odd bit
+    psubusb     mm1, mm4                ; decrease 1 from odd bytes
 
-	pavgb       mm2, mm1                ; mm2 = [xx xx d  c  b  f  h  j]
+    pavgb       mm2, mm1                ; mm2 = [xx xx d  c  b  f  h  j]
 
-	movq        mm4, mm0
-	pavgb       mm3, mm4                ; mm3 = [xx xx xx xx a  e  g  i]
-	punpcklbw   mm3, mm2                ; mm3 = [b  a  f  e  h  g  j  i]
+    movq        mm4, mm0
+    pavgb       mm3, mm4                ; mm3 = [xx xx xx xx a  e  g  i]
+    punpcklbw   mm3, mm2                ; mm3 = [b  a  f  e  h  g  j  i]
 
-	psrlq       mm2, 20h
-	psllq       mm2, 30h                ; mm2 = [d  c  0  0  0  0  0  0]
-	movq        mm4, mm3
-	psrlq       mm4, 10h                ; mm4 = [0  0  b  a  f  e  h  j]
-	pxor        mm2, mm4                ; mm2 = [d  c  b  a  xx xx xx xx]
-	psrlq       mm2, 20h                ; mm2 = [xx xx xx xx  d  c  b  a]
+    psrlq       mm2, 20h
+    psllq       mm2, 30h                ; mm2 = [d  c  0  0  0  0  0  0]
+    movq        mm4, mm3
+    psrlq       mm4, 10h                ; mm4 = [0  0  b  a  f  e  h  j]
+    pxor        mm2, mm4                ; mm2 = [d  c  b  a  xx xx xx xx]
+    psrlq       mm2, 20h                ; mm2 = [xx xx xx xx  d  c  b  a]
 
-	movd        [r0], mm2
-	movd        [r0+12], mm3
-	psrlq       mm3, 10h
-	movd        [r0+8], mm3
-	psrlq       mm3, 10h
-	movd        [r0+4], mm3
-	WELSEMMS
-	ret
+    movd        [r0], mm2
+    movd        [r0+12], mm3
+    psrlq       mm3, 10h
+    movd        [r0+8], mm3
+    psrlq       mm3, 10h
+    movd        [r0+4], mm3
+    WELSEMMS
+    ret
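WelsI4x4LumaPredHD_mmx builds the ten distinct values a..j of the horizontal-down mode and stores them in the layout of the comment block, each row reusing two values from the row above. Scalar sketch using the same letters; a and e are the formulas shown above, the remaining taps follow the standard 2-tap/3-tap horizontal-down pattern (t3 stays unused, as noted):

#include <stdint.h>

static uint8_t avg2(uint8_t a, uint8_t b)             { return (uint8_t)((a + b + 1) >> 1); }
static uint8_t filt3(uint8_t a, uint8_t b, uint8_t c) { return (uint8_t)((a + 2 * b + c + 2) >> 2); }

/* Scalar model of WelsI4x4LumaPredHD_mmx (horizontal-down). */
static void I4x4LumaPredHD_c(uint8_t pred[16], const uint8_t *pRef, int32_t stride) {
    uint8_t lt = pRef[-stride - 1];
    uint8_t t0 = pRef[-stride], t1 = pRef[-stride + 1], t2 = pRef[-stride + 2];
    uint8_t l0 = pRef[-1], l1 = pRef[stride - 1];
    uint8_t l2 = pRef[2 * stride - 1], l3 = pRef[3 * stride - 1];

    uint8_t a = avg2(lt, l0),      b = filt3(l0, lt, t0);
    uint8_t c = filt3(lt, t0, t1), d = filt3(t0, t1, t2);
    uint8_t e = avg2(l0, l1),      f = filt3(lt, l0, l1);
    uint8_t g = avg2(l1, l2),      h = filt3(l0, l1, l2);
    uint8_t i = avg2(l2, l3),      j = filt3(l1, l2, l3);

    const uint8_t out[16] = { a, b, c, d,
                              e, f, a, b,
                              g, h, e, f,
                              i, j, g, h };
    for (int k = 0; k < 16; k++) pred[k] = out[k];
}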
 
 ;***********************************************************************
-;	lt|t0|t1|t2|t3|
-;	l0|
-;	l1|
-;	l2|
-;	l3|
-;	t3 will never been used
+;   lt|t0|t1|t2|t3|
+;   l0|
+;   l1|
+;   l2|
+;   l3|
+;   t3 will never been used
 ;   destination:
-;	|a |b |c |d |
-;	|c |d |e |f |
-;	|e |f |g |g |
-;	|g |g |g |g |
+;   |a |b |c |d |
+;   |c |d |e |f |
+;   |e |f |g |g |
+;   |g |g |g |g |
 
 ;   a = (1 + l0 + l1)>>1
 ;   c = (1 + l1 + l2)>>1
@@ -756,70 +756,70 @@
 ;   void WelsI4x4LumaPredHU_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
 ;***********************************************************************
 WELS_EXTERN WelsI4x4LumaPredHU_mmx
-	%assign push_num 0
-	LOAD_3_PARA
-	SIGN_EXTENSION r2, r2d
-	movd        mm0, [r1-4]            ; mm0[3] = l0
-	punpcklbw   mm0, [r1+r2-4]        ; mm0[7] = l1, mm0[6] = l0
-	lea         r1, [r1+2*r2]
-	movd        mm2, [r1-4]            ; mm2[3] = l2
-	movd        mm4, [r1+r2-4]        ; mm4[3] = l3
-	punpcklbw   mm2, mm4
-	punpckhwd   mm0, mm2                ; mm0 = [l3 l2 l1 l0 xx xx xx xx]
+    %assign push_num 0
+    LOAD_3_PARA
+    SIGN_EXTENSION r2, r2d
+    movd        mm0, [r1-4]            ; mm0[3] = l0
+    punpcklbw   mm0, [r1+r2-4]        ; mm0[7] = l1, mm0[6] = l0
+    lea         r1, [r1+2*r2]
+    movd        mm2, [r1-4]            ; mm2[3] = l2
+    movd        mm4, [r1+r2-4]        ; mm4[3] = l3
+    punpcklbw   mm2, mm4
+    punpckhwd   mm0, mm2                ; mm0 = [l3 l2 l1 l0 xx xx xx xx]
 
-	psrlq       mm4, 18h
-	psllq       mm4, 38h                ; mm4 = [l3 xx xx xx xx xx xx xx]
-	psrlq       mm0, 8h
-	pxor        mm0, mm4                ; mm0 = [l3 l3 l2 l1 l0 xx xx xx]
+    psrlq       mm4, 18h
+    psllq       mm4, 38h                ; mm4 = [l3 xx xx xx xx xx xx xx]
+    psrlq       mm0, 8h
+    pxor        mm0, mm4                ; mm0 = [l3 l3 l2 l1 l0 xx xx xx]
 
-	movq        mm1, mm0
-	psllq       mm1, 8h                 ; mm1 = [l3 l2 l1 l0 xx xx xx xx]
-	movq        mm3, mm1                ; mm3 = [l3 l2 l1 l0 xx xx xx xx]
-	pavgb       mm1, mm0                ; mm1 = [g  e  c  a  xx xx xx xx]
+    movq        mm1, mm0
+    psllq       mm1, 8h                 ; mm1 = [l3 l2 l1 l0 xx xx xx xx]
+    movq        mm3, mm1                ; mm3 = [l3 l2 l1 l0 xx xx xx xx]
+    pavgb       mm1, mm0                ; mm1 = [g  e  c  a  xx xx xx xx]
 
-	movq        mm2, mm0
-	psllq       mm2, 10h                ; mm2 = [l2 l1 l0 xx xx xx xx xx]
-	movq        mm5, mm2
-	pavgb       mm2, mm0
+    movq        mm2, mm0
+    psllq       mm2, 10h                ; mm2 = [l2 l1 l0 xx xx xx xx xx]
+    movq        mm5, mm2
+    pavgb       mm2, mm0
 
-	pxor        mm5, mm0				; find odd value in the lowest bit of each byte
-	pand        mm5, [mmx_01bytes]	    ; set the odd bit
-	psubusb     mm2, mm5				; decrease 1 from odd bytes
+    pxor        mm5, mm0                ; find odd value in the lowest bit of each byte
+    pand        mm5, [mmx_01bytes]      ; set the odd bit
+    psubusb     mm2, mm5                ; decrease 1 from odd bytes
 
-	pavgb       mm2, mm3                ; mm2 = [f  d  b  xx xx xx xx xx]
+    pavgb       mm2, mm3                ; mm2 = [f  d  b  xx xx xx xx xx]
 
-	psrlq       mm2, 8h
-	pxor        mm2, mm4                ; mm2 = [g  f  d  b  xx xx xx xx]
+    psrlq       mm2, 8h
+    pxor        mm2, mm4                ; mm2 = [g  f  d  b  xx xx xx xx]
 
-	punpckhbw   mm1, mm2                ; mm1 = [g  g  f  e  d  c  b  a]
-	punpckhbw   mm4, mm4                ; mm4 = [g  g  xx xx xx xx xx xx]
-	punpckhbw   mm4, mm4                ; mm4 = [g  g  g  g  xx xx xx xx]
+    punpckhbw   mm1, mm2                ; mm1 = [g  g  f  e  d  c  b  a]
+    punpckhbw   mm4, mm4                ; mm4 = [g  g  xx xx xx xx xx xx]
+    punpckhbw   mm4, mm4                ; mm4 = [g  g  g  g  xx xx xx xx]
 
-	psrlq       mm4, 20h
-	movd        [r0+12], mm4
+    psrlq       mm4, 20h
+    movd        [r0+12], mm4
 
-	movd        [r0], mm1
-	psrlq       mm1, 10h
-	movd        [r0+4], mm1
-	psrlq       mm1, 10h
-	movd        [r0+8], mm1
-	WELSEMMS
-	ret
+    movd        [r0], mm1
+    psrlq       mm1, 10h
+    movd        [r0+4], mm1
+    psrlq       mm1, 10h
+    movd        [r0+8], mm1
+    WELSEMMS
+    ret
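Horizontal-up reads only the four left neighbours; once the diagonal runs past l3 every remaining position is l3 itself, which is what the double punpckhbw broadcast into mm4 produces. Scalar sketch with the comment's letter layout (only a and c are spelled out above; the other letters follow the standard horizontal-up taps):

#include <stdint.h>

static uint8_t avg2(uint8_t a, uint8_t b)             { return (uint8_t)((a + b + 1) >> 1); }
static uint8_t filt3(uint8_t a, uint8_t b, uint8_t c) { return (uint8_t)((a + 2 * b + c + 2) >> 2); }

/* Scalar model of WelsI4x4LumaPredHU_mmx (horizontal-up). */
static void I4x4LumaPredHU_c(uint8_t pred[16], const uint8_t *pRef, int32_t stride) {
    uint8_t l0 = pRef[-1],             l1 = pRef[stride - 1];
    uint8_t l2 = pRef[2 * stride - 1], l3 = pRef[3 * stride - 1];

    uint8_t a = avg2(l0, l1), b = filt3(l0, l1, l2);
    uint8_t c = avg2(l1, l2), d = filt3(l1, l2, l3);
    uint8_t e = avg2(l2, l3), f = filt3(l2, l3, l3);
    uint8_t g = l3;

    const uint8_t out[16] = { a, b, c, d,
                              c, d, e, f,
                              e, f, g, g,
                              g, g, g, g };
    for (int k = 0; k < 16; k++) pred[k] = out[k];
}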
 
 
 
 ;***********************************************************************
-;	lt|t0|t1|t2|t3|
-;	l0|
-;	l1|
-;	l2|
-;	l3|
-;	l3 will never been used
+;   lt|t0|t1|t2|t3|
+;   l0|
+;   l1|
+;   l2|
+;   l3|
+;   l3 will never been used
 ;   destination:
-;	|a |b |c |d |
-;	|e |f |g |h |
-;	|i |a |b |c |
-;	|j |e |f |g |
+;   |a |b |c |d |
+;   |e |f |g |h |
+;   |i |a |b |c |
+;   |j |e |f |g |
 
 ;   a = (1 + lt + t0)>>1
 ;   b = (1 + t0 + t1)>>1
@@ -837,75 +837,75 @@
 ;   void WelsI4x4LumaPredVR_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
 ;***********************************************************************
 WELS_EXTERN WelsI4x4LumaPredVR_mmx
-	%assign push_num 0
-	LOAD_3_PARA
-	SIGN_EXTENSION r2, r2d
-	sub         r1, r2
-	movq        mm0, [r1-1]            ; mm0 = [xx xx xx t3 t2 t1 t0 lt]
-	psllq       mm0, 18h                ; mm0 = [t3 t2 t1 t0 lt xx xx xx]
+    %assign push_num 0
+    LOAD_3_PARA
+    SIGN_EXTENSION r2, r2d
+    sub         r1, r2
+    movq        mm0, [r1-1]            ; mm0 = [xx xx xx t3 t2 t1 t0 lt]
+    psllq       mm0, 18h                ; mm0 = [t3 t2 t1 t0 lt xx xx xx]
 
-	movd        mm1, [r1+2*r2-4]
-	punpcklbw   mm1, [r1+r2-4]        ; mm1[7] = l0, mm1[6] = l1
-	lea         r1, [r1+2*r2]
-	movq        mm2, [r1+r2-8]        ; mm2[7] = l2
-	punpckhwd   mm2, mm1                ; mm2 = [l0 l1 l2 xx xx xx xx xx]
-	psrlq       mm2, 28h
-	pxor        mm0, mm2                ; mm0 = [t3 t2 t1 t0 lt l0 l1 l2]
+    movd        mm1, [r1+2*r2-4]
+    punpcklbw   mm1, [r1+r2-4]        ; mm1[7] = l0, mm1[6] = l1
+    lea         r1, [r1+2*r2]
+    movq        mm2, [r1+r2-8]        ; mm2[7] = l2
+    punpckhwd   mm2, mm1                ; mm2 = [l0 l1 l2 xx xx xx xx xx]
+    psrlq       mm2, 28h
+    pxor        mm0, mm2                ; mm0 = [t3 t2 t1 t0 lt l0 l1 l2]
 
-	movq        mm1, mm0
-	psllq       mm1, 8h                 ; mm1 = [t2 t1 t0 lt l0 l1 l2 xx]
-	pavgb       mm1, mm0                ; mm1 = [d  c  b  a  xx xx xx xx]
+    movq        mm1, mm0
+    psllq       mm1, 8h                 ; mm1 = [t2 t1 t0 lt l0 l1 l2 xx]
+    pavgb       mm1, mm0                ; mm1 = [d  c  b  a  xx xx xx xx]
 
-	movq        mm2, mm0
-	psllq       mm2, 10h                ; mm2 = [t1 t0 lt l0 l1 l2 xx xx]
-	movq        mm3, mm2
-	pavgb       mm2, mm0
+    movq        mm2, mm0
+    psllq       mm2, 10h                ; mm2 = [t1 t0 lt l0 l1 l2 xx xx]
+    movq        mm3, mm2
+    pavgb       mm2, mm0
 
-	pxor        mm3, mm0				; find odd value in the lowest bit of each byte
-	pand        mm3, [mmx_01bytes]	    ; set the odd bit
-	psubusb     mm2, mm3				; decrease 1 from odd bytes
+    pxor        mm3, mm0                ; find odd value in the lowest bit of each byte
+    pand        mm3, [mmx_01bytes]      ; set the odd bit
+    psubusb     mm2, mm3                ; decrease 1 from odd bytes
 
-	movq        mm3, mm0
-	psllq       mm3, 8h                 ; mm3 = [t2 t1 t0 lt l0 l1 l2 xx]
-	pavgb       mm3, mm2                ; mm3 = [h  g  f  e  i  j  xx xx]
-	movq        mm2, mm3
+    movq        mm3, mm0
+    psllq       mm3, 8h                 ; mm3 = [t2 t1 t0 lt l0 l1 l2 xx]
+    pavgb       mm3, mm2                ; mm3 = [h  g  f  e  i  j  xx xx]
+    movq        mm2, mm3
 
-	psrlq       mm1, 20h                ; mm1 = [xx xx xx xx d  c  b  a]
-	movd        [r0], mm1
+    psrlq       mm1, 20h                ; mm1 = [xx xx xx xx d  c  b  a]
+    movd        [r0], mm1
 
-	psrlq       mm2, 20h                ; mm2 = [xx xx xx xx h  g  f  e]
-	movd        [r0+4], mm2
+    psrlq       mm2, 20h                ; mm2 = [xx xx xx xx h  g  f  e]
+    movd        [r0+4], mm2
 
-	movq        mm4, mm3
-	psllq       mm4, 20h
-	psrlq       mm4, 38h                ; mm4 = [xx xx xx xx xx xx xx i]
+    movq        mm4, mm3
+    psllq       mm4, 20h
+    psrlq       mm4, 38h                ; mm4 = [xx xx xx xx xx xx xx i]
 
-	movq        mm5, mm3
-	psllq       mm5, 28h
-	psrlq       mm5, 38h                ; mm5 = [xx xx xx xx xx xx xx j]
+    movq        mm5, mm3
+    psllq       mm5, 28h
+    psrlq       mm5, 38h                ; mm5 = [xx xx xx xx xx xx xx j]
 
-	psllq       mm1, 8h
-	pxor        mm4, mm1                ; mm4 = [xx xx xx xx c  b  a  i]
-	movd        [r0+8], mm4
+    psllq       mm1, 8h
+    pxor        mm4, mm1                ; mm4 = [xx xx xx xx c  b  a  i]
+    movd        [r0+8], mm4
 
-	psllq       mm2, 8h
-	pxor        mm5, mm2                ; mm5 = [xx xx xx xx g  f  e  j]
-	movd        [r0+12], mm5
-	WELSEMMS
-	ret
+    psllq       mm2, 8h
+    pxor        mm5, mm2                ; mm5 = [xx xx xx xx g  f  e  j]
+    movd        [r0+12], mm5
+    WELSEMMS
+    ret
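Vertical-right mirrors horizontal-down: the top row gives the 2-tap values a..d, the second row the 3-tap values e..h, and i/j walk down the left column (l3 is unused, as the comment says). Scalar sketch; a and b are from the comments, the rest follow the standard vertical-right taps:

#include <stdint.h>

static uint8_t avg2(uint8_t a, uint8_t b)             { return (uint8_t)((a + b + 1) >> 1); }
static uint8_t filt3(uint8_t a, uint8_t b, uint8_t c) { return (uint8_t)((a + 2 * b + c + 2) >> 2); }

/* Scalar model of WelsI4x4LumaPredVR_mmx (vertical-right). */
static void I4x4LumaPredVR_c(uint8_t pred[16], const uint8_t *pRef, int32_t stride) {
    uint8_t lt = pRef[-stride - 1];
    uint8_t t0 = pRef[-stride],     t1 = pRef[-stride + 1];
    uint8_t t2 = pRef[-stride + 2], t3 = pRef[-stride + 3];
    uint8_t l0 = pRef[-1], l1 = pRef[stride - 1], l2 = pRef[2 * stride - 1];

    uint8_t a = avg2(lt, t0), b = avg2(t0, t1), c = avg2(t1, t2), d = avg2(t2, t3);
    uint8_t e = filt3(l0, lt, t0), f = filt3(lt, t0, t1);
    uint8_t g = filt3(t0, t1, t2), h = filt3(t1, t2, t3);
    uint8_t i = filt3(l1, l0, lt), j = filt3(l2, l1, l0);

    const uint8_t out[16] = { a, b, c, d,
                              e, f, g, h,
                              i, a, b, c,
                              j, e, f, g };
    for (int k = 0; k < 16; k++) pred[k] = out[k];
}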
 
 ;***********************************************************************
-;	lt|t0|t1|t2|t3|t4|t5|t6|t7
-;	l0|
-;	l1|
-;	l2|
-;	l3|
-;	lt,t0,t1,t2,t3 will never been used
+;   lt|t0|t1|t2|t3|t4|t5|t6|t7
+;   l0|
+;   l1|
+;   l2|
+;   l3|
+;   lt,t0,t1,t2,t3 will never been used
 ;   destination:
-;	|a |b |c |d |
-;	|b |c |d |e |
-;	|c |d |e |f |
-;	|d |e |f |g |
+;   |a |b |c |d |
+;   |b |c |d |e |
+;   |c |d |e |f |
+;   |d |e |f |g |
 
 ;   a = (2 + t0 + t2 + (t1<<1))>>2
 ;   b = (2 + t1 + t3 + (t2<<1))>>2
@@ -921,54 +921,54 @@
 ;   void WelsI4x4LumaPredDDL_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
 ;***********************************************************************
 WELS_EXTERN WelsI4x4LumaPredDDL_mmx
-	%assign push_num 0
-	LOAD_3_PARA
-	SIGN_EXTENSION r2, r2d
-	sub         r1, r2
-	movq        mm0, [r1]              ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
-	movq        mm1, mm0
-	movq        mm2, mm0
+    %assign push_num 0
+    LOAD_3_PARA
+    SIGN_EXTENSION r2, r2d
+    sub         r1, r2
+    movq        mm0, [r1]              ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
+    movq        mm1, mm0
+    movq        mm2, mm0
 
-	movq        mm3, mm0
-	psrlq       mm3, 38h
-	psllq       mm3, 38h                ; mm3 = [t7 xx xx xx xx xx xx xx]
+    movq        mm3, mm0
+    psrlq       mm3, 38h
+    psllq       mm3, 38h                ; mm3 = [t7 xx xx xx xx xx xx xx]
 
-	psllq       mm1, 8h                 ; mm1 = [t6 t5 t4 t3 t2 t1 t0 xx]
-	psrlq       mm2, 8h
-	pxor        mm2, mm3                ; mm2 = [t7 t7 t6 t5 t4 t3 t2 t1]
+    psllq       mm1, 8h                 ; mm1 = [t6 t5 t4 t3 t2 t1 t0 xx]
+    psrlq       mm2, 8h
+    pxor        mm2, mm3                ; mm2 = [t7 t7 t6 t5 t4 t3 t2 t1]
 
-	movq        mm3, mm1
-	pavgb       mm1, mm2
-	pxor        mm3, mm2				; find odd value in the lowest bit of each byte
-	pand        mm3, [mmx_01bytes]	    ; set the odd bit
-	psubusb     mm1, mm3				; decrease 1 from odd bytes
+    movq        mm3, mm1
+    pavgb       mm1, mm2
+    pxor        mm3, mm2                ; find odd value in the lowest bit of each byte
+    pand        mm3, [mmx_01bytes]      ; set the odd bit
+    psubusb     mm1, mm3                ; decrease 1 from odd bytes
 
-	pavgb       mm0, mm1                ; mm0 = [g f e d c b a xx]
+    pavgb       mm0, mm1                ; mm0 = [g f e d c b a xx]
 
-	psrlq       mm0, 8h
-	movd        [r0], mm0
-	psrlq       mm0, 8h
-	movd        [r0+4], mm0
-	psrlq       mm0, 8h
-	movd        [r0+8], mm0
-	psrlq       mm0, 8h
-	movd        [r0+12], mm0
-	WELSEMMS
-	ret
+    psrlq       mm0, 8h
+    movd        [r0], mm0
+    psrlq       mm0, 8h
+    movd        [r0+4], mm0
+    psrlq       mm0, 8h
+    movd        [r0+8], mm0
+    psrlq       mm0, 8h
+    movd        [r0+12], mm0
+    WELSEMMS
+    ret
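Diagonal-down-left reads only the eight pixels above the block; the comment's formulas (a = (2 + t0 + t2 + (t1<<1))>>2, b = (2 + t1 + t3 + (t2<<1))>>2, ...) are the 3-tap filter slid along the top row, with the last value using t7 twice. Scalar sketch:

#include <stdint.h>

static uint8_t filt3(uint8_t a, uint8_t b, uint8_t c) { return (uint8_t)((a + 2 * b + c + 2) >> 2); }

/* Scalar model of WelsI4x4LumaPredDDL_mmx (diagonal-down-left). */
static void I4x4LumaPredDDL_c(uint8_t pred[16], const uint8_t *pRef, int32_t stride) {
    const uint8_t *t = pRef - stride;     /* t[0..7] are the eight pixels above */
    uint8_t f[7];                         /* f[0] = a ... f[6] = g in the comment */
    for (int i = 0; i < 7; i++)
        f[i] = filt3(t[i], t[i + 1], (i + 2 < 8) ? t[i + 2] : t[7]);
    for (int y = 0; y < 4; y++)
        for (int x = 0; x < 4; x++)
            pred[y * 4 + x] = f[x + y];
}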
 
 
 ;***********************************************************************
-;	lt|t0|t1|t2|t3|t4|t5|t6|t7
-;	l0|
-;	l1|
-;	l2|
-;	l3|
-;	lt,t0,t1,t2,t3 will never been used
+;   lt|t0|t1|t2|t3|t4|t5|t6|t7
+;   l0|
+;   l1|
+;   l2|
+;   l3|
+;   lt,t0,t1,t2,t3 will never been used
 ;   destination:
-;	|a |b |c |d |
-;	|e |f |g |h |
-;	|b |c |d |i |
-;	|f |g |h |j |
+;   |a |b |c |d |
+;   |e |f |g |h |
+;   |b |c |d |i |
+;   |f |g |h |j |
 
 ;   a = (1 + t0 + t1)>>1
 ;   b = (1 + t1 + t2)>>1
@@ -987,37 +987,37 @@
 ;   void WelsI4x4LumaPredVL_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
 ;***********************************************************************
 WELS_EXTERN WelsI4x4LumaPredVL_mmx
-	%assign push_num 0
-	LOAD_3_PARA
-	SIGN_EXTENSION r2, r2d
-	sub         r1, r2
-	movq        mm0, [r1]              ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
-	movq        mm1, mm0
-	movq        mm2, mm0
+    %assign push_num 0
+    LOAD_3_PARA
+    SIGN_EXTENSION r2, r2d
+    sub         r1, r2
+    movq        mm0, [r1]              ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
+    movq        mm1, mm0
+    movq        mm2, mm0
 
-	psrlq       mm1, 8h                 ; mm1 = [xx t7 t6 t5 t4 t3 t2 t1]
-	psrlq       mm2, 10h                ; mm2 = [xx xx t7 t6 t5 t4 t3 t2]
+    psrlq       mm1, 8h                 ; mm1 = [xx t7 t6 t5 t4 t3 t2 t1]
+    psrlq       mm2, 10h                ; mm2 = [xx xx t7 t6 t5 t4 t3 t2]
 
-	movq        mm3, mm1
-	pavgb       mm3, mm0                ; mm3 = [xx xx xx i  d  c  b  a]
+    movq        mm3, mm1
+    pavgb       mm3, mm0                ; mm3 = [xx xx xx i  d  c  b  a]
 
-	movq        mm4, mm2
-	pavgb       mm2, mm0
-	pxor        mm4, mm0				; find odd value in the lowest bit of each byte
-	pand        mm4, [mmx_01bytes]	    ; set the odd bit
-	psubusb     mm2, mm4				; decrease 1 from odd bytes
+    movq        mm4, mm2
+    pavgb       mm2, mm0
+    pxor        mm4, mm0                ; find odd value in the lowest bit of each byte
+    pand        mm4, [mmx_01bytes]      ; set the odd bit
+    psubusb     mm2, mm4                ; decrease 1 from odd bytes
 
-	pavgb       mm2, mm1                ; mm2 = [xx xx xx j  h  g  f  e]
+    pavgb       mm2, mm1                ; mm2 = [xx xx xx j  h  g  f  e]
 
-	movd        [r0], mm3
-	psrlq       mm3, 8h
-	movd        [r0+8], mm3
+    movd        [r0], mm3
+    psrlq       mm3, 8h
+    movd        [r0+8], mm3
 
-	movd        [r0+4], mm2
-	psrlq       mm2, 8h
-	movd        [r0+12], mm2
-	WELSEMMS
-	ret
+    movd        [r0+4], mm2
+    psrlq       mm2, 8h
+    movd        [r0+12], mm2
+    WELSEMMS
+    ret
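Vertical-left also reads only the row above: even output rows use 2-tap averages, odd rows 3-tap filters, each lower pair shifted one pixel right relative to the pair two rows up, giving the a..j layout in the comment block. Scalar sketch; a and b are from the comments, the rest follow the standard vertical-left taps:

#include <stdint.h>

static uint8_t avg2(uint8_t a, uint8_t b)             { return (uint8_t)((a + b + 1) >> 1); }
static uint8_t filt3(uint8_t a, uint8_t b, uint8_t c) { return (uint8_t)((a + 2 * b + c + 2) >> 2); }

/* Scalar model of WelsI4x4LumaPredVL_mmx (vertical-left). */
static void I4x4LumaPredVL_c(uint8_t pred[16], const uint8_t *pRef, int32_t stride) {
    const uint8_t *t = pRef - stride;     /* pixels above the block */
    uint8_t a = avg2(t[0], t[1]), b = avg2(t[1], t[2]), c = avg2(t[2], t[3]);
    uint8_t d = avg2(t[3], t[4]), i = avg2(t[4], t[5]);
    uint8_t e = filt3(t[0], t[1], t[2]), f = filt3(t[1], t[2], t[3]);
    uint8_t g = filt3(t[2], t[3], t[4]), h = filt3(t[3], t[4], t[5]);
    uint8_t j = filt3(t[4], t[5], t[6]);

    const uint8_t out[16] = { a, b, c, d,
                              e, f, g, h,
                              b, c, d, i,
                              f, g, h, j };
    for (int k = 0; k < 16; k++) pred[k] = out[k];
}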
 
 ;***********************************************************************
 ;
@@ -1024,88 +1024,88 @@
 ;   void WelsIChromaPredDc_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
 ;***********************************************************************
 WELS_EXTERN WelsIChromaPredDc_sse2
-	push r3
-	push r4
-	%assign push_num 2
-	LOAD_3_PARA
-	SIGN_EXTENSION r2, r2d
-	sub         r1, r2
-	movq        mm0, [r1]
+    push r3
+    push r4
+    %assign push_num 2
+    LOAD_3_PARA
+    SIGN_EXTENSION r2, r2d
+    sub         r1, r2
+    movq        mm0, [r1]
 
-	movzx		r3, byte [r1+r2-0x01] ; l1
-	lea         	r1, [r1+2*r2]
-	movzx		r4, byte [r1-0x01]     ; l2
-	add		r3, r4
-	movzx		r4, byte [r1+r2-0x01] ; l3
-	add		r3, r4
-	lea         	r1, [r1+2*r2]
-	movzx		r4, byte [r1-0x01]     ; l4
-	add		r3, r4
-	movd        	mm1, r3d                 ; mm1 = l1+l2+l3+l4
+    movzx       r3, byte [r1+r2-0x01] ; l1
+    lea             r1, [r1+2*r2]
+    movzx       r4, byte [r1-0x01]     ; l2
+    add     r3, r4
+    movzx       r4, byte [r1+r2-0x01] ; l3
+    add     r3, r4
+    lea             r1, [r1+2*r2]
+    movzx       r4, byte [r1-0x01]     ; l4
+    add     r3, r4
+    movd            mm1, r3d                 ; mm1 = l1+l2+l3+l4
 
-	movzx		r3, byte [r1+r2-0x01] ; l5
-	lea         	r1, [r1+2*r2]
-	movzx		r4, byte [r1-0x01]     ; l6
-	add		r3, r4
-	movzx		r4, byte [r1+r2-0x01] ; l7
-	add		r3, r4
-	lea         	r1, [r1+2*r2]
-	movzx		r4, byte [r1-0x01]     ; l8
-	add		r3, r4
-	movd        	mm2, r3d                 ; mm2 = l5+l6+l7+l8
+    movzx       r3, byte [r1+r2-0x01] ; l5
+    lea             r1, [r1+2*r2]
+    movzx       r4, byte [r1-0x01]     ; l6
+    add     r3, r4
+    movzx       r4, byte [r1+r2-0x01] ; l7
+    add     r3, r4
+    lea             r1, [r1+2*r2]
+    movzx       r4, byte [r1-0x01]     ; l8
+    add     r3, r4
+    movd            mm2, r3d                 ; mm2 = l5+l6+l7+l8
 
-	movq        mm3, mm0
-	psrlq       mm0, 0x20
-	psllq       mm3, 0x20
-	psrlq       mm3, 0x20
-	pxor		mm4, mm4
-	psadbw		mm0, mm4
-	psadbw		mm3, mm4                 ; sum1 = mm3+mm1, sum2 = mm0, sum3 = mm2
+    movq        mm3, mm0
+    psrlq       mm0, 0x20
+    psllq       mm3, 0x20
+    psrlq       mm3, 0x20
+    pxor        mm4, mm4
+    psadbw      mm0, mm4
+    psadbw      mm3, mm4                 ; sum1 = mm3+mm1, sum2 = mm0, sum3 = mm2
 
-	paddq       mm3, mm1
-	movq        mm1, mm2
-	paddq       mm1, mm0;                ; sum1 = mm3, sum2 = mm0, sum3 = mm2, sum4 = mm1
+    paddq       mm3, mm1
+    movq        mm1, mm2
+    paddq       mm1, mm0;                ; sum1 = mm3, sum2 = mm0, sum3 = mm2, sum4 = mm1
 
-	movq        mm4, [mmx_0x02]
+    movq        mm4, [mmx_0x02]
 
-	paddq       mm0, mm4
-	psrlq       mm0, 0x02
+    paddq       mm0, mm4
+    psrlq       mm0, 0x02
 
-	paddq       mm2, mm4
-	psrlq       mm2, 0x02
+    paddq       mm2, mm4
+    psrlq       mm2, 0x02
 
-	paddq       mm3, mm4
-	paddq       mm3, mm4
-	psrlq       mm3, 0x03
+    paddq       mm3, mm4
+    paddq       mm3, mm4
+    psrlq       mm3, 0x03
 
-	paddq       mm1, mm4
-	paddq       mm1, mm4
-	psrlq       mm1, 0x03
+    paddq       mm1, mm4
+    paddq       mm1, mm4
+    psrlq       mm1, 0x03
 
-	pmuludq     mm0, [mmx_01bytes]
-	pmuludq     mm3, [mmx_01bytes]
-	psllq       mm0, 0x20
-	pxor        mm0, mm3                 ; mm0 = m_up
+    pmuludq     mm0, [mmx_01bytes]
+    pmuludq     mm3, [mmx_01bytes]
+    psllq       mm0, 0x20
+    pxor        mm0, mm3                 ; mm0 = m_up
 
-	pmuludq     mm2, [mmx_01bytes]
-	pmuludq     mm1, [mmx_01bytes]
-	psllq       mm1, 0x20
-	pxor        mm1, mm2                 ; mm2 = m_down
+    pmuludq     mm2, [mmx_01bytes]
+    pmuludq     mm1, [mmx_01bytes]
+    psllq       mm1, 0x20
+    pxor        mm1, mm2                 ; mm2 = m_down
 
-	movq        [r0], mm0
-	movq        [r0+0x08], mm0
-	movq        [r0+0x10], mm0
-	movq        [r0+0x18], mm0
+    movq        [r0], mm0
+    movq        [r0+0x08], mm0
+    movq        [r0+0x10], mm0
+    movq        [r0+0x18], mm0
 
-	movq        [r0+0x20], mm1
-	movq        [r0+0x28], mm1
-	movq        [r0+0x30], mm1
-	movq        [r0+0x38], mm1
+    movq        [r0+0x20], mm1
+    movq        [r0+0x28], mm1
+    movq        [r0+0x30], mm1
+    movq        [r0+0x38], mm1
 
-	pop r4
-	pop r3
-	WELSEMMS
-	ret
+    pop r4
+    pop r3
+    WELSEMMS
+    ret
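WelsIChromaPredDc_sse2 computes four separate DC values, one per 4x4 quadrant, which is what the two psadbw halves plus the two four-pixel left sums feed: the top-left and bottom-right quadrants average eight neighbours (>>3), the other two only their four available neighbours (>>2), matching the sum1..sum4 comments. Scalar sketch:

#include <stdint.h>

/* Scalar model of WelsIChromaPredDc_sse2. pred is 8 contiguous 8-byte rows. */
static void IChromaPredDc_c(uint8_t *pred, const uint8_t *pRef, int32_t stride) {
    int top_l = 0, top_r = 0, left_u = 0, left_d = 0;
    for (int i = 0; i < 4; i++) {
        top_l  += pRef[-stride + i];            /* top[0..3]  */
        top_r  += pRef[-stride + 4 + i];        /* top[4..7]  */
        left_u += pRef[i * stride - 1];         /* left[0..3] */
        left_d += pRef[(4 + i) * stride - 1];   /* left[4..7] */
    }
    uint8_t dc00 = (uint8_t)((top_l + left_u + 4) >> 3);
    uint8_t dc01 = (uint8_t)((top_r + 2) >> 2);
    uint8_t dc10 = (uint8_t)((left_d + 2) >> 2);
    uint8_t dc11 = (uint8_t)((top_r + left_d + 4) >> 3);
    for (int y = 0; y < 8; y++)
        for (int x = 0; x < 8; x++)
            pred[y * 8 + x] = (y < 4) ? (x < 4 ? dc00 : dc01)
                                      : (x < 4 ? dc10 : dc11);
}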
 
 
 
@@ -1114,56 +1114,56 @@
 ;   void WelsI16x16LumaPredDc_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
 ;***********************************************************************
 WELS_EXTERN WelsI16x16LumaPredDc_sse2
-	push r3
-	push r4
-	%assign push_num 2
-	LOAD_3_PARA
-	SIGN_EXTENSION r2, r2d
-	sub         r1, r2
-	movdqa      xmm0, [r1]             ; read one row
-	pxor		xmm1, xmm1
-	psadbw		xmm0, xmm1
-	movdqa      xmm1, xmm0
-	psrldq      xmm1, 0x08
-	pslldq      xmm0, 0x08
-	psrldq      xmm0, 0x08
-	paddw       xmm0, xmm1
+    push r3
+    push r4
+    %assign push_num 2
+    LOAD_3_PARA
+    SIGN_EXTENSION r2, r2d
+    sub         r1, r2
+    movdqa      xmm0, [r1]             ; read one row
+    pxor        xmm1, xmm1
+    psadbw      xmm0, xmm1
+    movdqa      xmm1, xmm0
+    psrldq      xmm1, 0x08
+    pslldq      xmm0, 0x08
+    psrldq      xmm0, 0x08
+    paddw       xmm0, xmm1
 
-	movzx		r3, byte [r1+r2-0x01]
-	movzx		r4, byte [r1+2*r2-0x01]
-	add		r3, r4
-	lea         r1, [r1+r2]
-	LOAD_2_LEFT_AND_ADD
-	LOAD_2_LEFT_AND_ADD
-	LOAD_2_LEFT_AND_ADD
-	LOAD_2_LEFT_AND_ADD
-	LOAD_2_LEFT_AND_ADD
-	LOAD_2_LEFT_AND_ADD
-	LOAD_2_LEFT_AND_ADD
-	add         r3, 0x10
-	movd        xmm1, r3d
-	paddw       xmm0, xmm1
-	psrld       xmm0, 0x05
-	pmuludq     xmm0, [mmx_01bytes]
-	pshufd      xmm0, xmm0, 0
+    movzx       r3, byte [r1+r2-0x01]
+    movzx       r4, byte [r1+2*r2-0x01]
+    add     r3, r4
+    lea         r1, [r1+r2]
+    LOAD_2_LEFT_AND_ADD
+    LOAD_2_LEFT_AND_ADD
+    LOAD_2_LEFT_AND_ADD
+    LOAD_2_LEFT_AND_ADD
+    LOAD_2_LEFT_AND_ADD
+    LOAD_2_LEFT_AND_ADD
+    LOAD_2_LEFT_AND_ADD
+    add         r3, 0x10
+    movd        xmm1, r3d
+    paddw       xmm0, xmm1
+    psrld       xmm0, 0x05
+    pmuludq     xmm0, [mmx_01bytes]
+    pshufd      xmm0, xmm0, 0
 
-	movdqa      [r0], xmm0
-	movdqa      [r0+0x10], xmm0
-	movdqa      [r0+0x20], xmm0
-	movdqa      [r0+0x30], xmm0
-	movdqa      [r0+0x40], xmm0
-	movdqa      [r0+0x50], xmm0
-	movdqa      [r0+0x60], xmm0
-	movdqa      [r0+0x70], xmm0
-	movdqa      [r0+0x80], xmm0
-	movdqa      [r0+0x90], xmm0
-	movdqa      [r0+0xa0], xmm0
-	movdqa      [r0+0xb0], xmm0
-	movdqa      [r0+0xc0], xmm0
-	movdqa      [r0+0xd0], xmm0
-	movdqa      [r0+0xe0], xmm0
-	movdqa      [r0+0xf0], xmm0
+    movdqa      [r0], xmm0
+    movdqa      [r0+0x10], xmm0
+    movdqa      [r0+0x20], xmm0
+    movdqa      [r0+0x30], xmm0
+    movdqa      [r0+0x40], xmm0
+    movdqa      [r0+0x50], xmm0
+    movdqa      [r0+0x60], xmm0
+    movdqa      [r0+0x70], xmm0
+    movdqa      [r0+0x80], xmm0
+    movdqa      [r0+0x90], xmm0
+    movdqa      [r0+0xa0], xmm0
+    movdqa      [r0+0xb0], xmm0
+    movdqa      [r0+0xc0], xmm0
+    movdqa      [r0+0xd0], xmm0
+    movdqa      [r0+0xe0], xmm0
+    movdqa      [r0+0xf0], xmm0
 
-	pop r4
-	pop r3
-	ret
\ No newline at end of file
+    pop r4
+    pop r3
+    ret
\ No newline at end of file
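
The SSE2 routine above is easier to follow next to a scalar model. Below is a minimal plain-C sketch of what WelsI16x16LumaPredDc_sse2 computes, assuming both the top row and the left column of neighbours are available; the C function name and variable names are illustrative and not part of the source.

#include <stdint.h>

/* Sum the 16 samples above and the 16 samples left of the block,
 * round (+16, >>5) and fill the contiguous 16x16 prediction buffer. */
static void I16x16LumaPredDc_c (uint8_t* pPred, const uint8_t* pRef, int32_t iStride) {
    const uint8_t* pTop = pRef - iStride;        /* row above the block */
    int32_t iSum = 16;                           /* rounding term       */
    int32_t i;
    for (i = 0; i < 16; i++)
        iSum += pTop[i] + pRef[i * iStride - 1]; /* top + left samples  */
    for (i = 0; i < 256; i++)
        pPred[i] = (uint8_t) (iSum >> 5);
}
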
--- a/codec/encoder/core/x86/matrix_transpose.asm
+++ b/codec/encoder/core/x86/matrix_transpose.asm
@@ -34,153 +34,153 @@
 ;in:  m0, m1, m2, m3, m4, m5, m6, m7
 ;out: m0, m3, m5, m2, m7, m1, m6, m4
 %macro TRANSPOSE_8x8B_MMX 10
-	MMX_XSwap bw,  %1, %2, %8
-	MMX_XSwap bw,  %3, %4, %2
-	MMX_XSwap bw,  %5, %6, %4
-	movq	%6, %9
-	movq	%10, %4
-	MMX_XSwap bw,  %7, %6, %4
+    MMX_XSwap bw,  %1, %2, %8
+    MMX_XSwap bw,  %3, %4, %2
+    MMX_XSwap bw,  %5, %6, %4
+    movq    %6, %9
+    movq    %10, %4
+    MMX_XSwap bw,  %7, %6, %4
 
-	MMX_XSwap wd,  %1, %3, %6
-	MMX_XSwap wd,  %8, %2, %3
-	MMX_XSwap wd,  %5, %7, %2
-	movq	%7, %10
-	movq	%10, %3
-	MMX_XSwap wd,  %7, %4, %3
+    MMX_XSwap wd,  %1, %3, %6
+    MMX_XSwap wd,  %8, %2, %3
+    MMX_XSwap wd,  %5, %7, %2
+    movq    %7, %10
+    movq    %10, %3
+    MMX_XSwap wd,  %7, %4, %3
 
-	MMX_XSwap dq,  %1, %5, %4
-	MMX_XSwap dq,  %6, %2, %5
-	MMX_XSwap dq,  %8, %7, %2
-	movq	%7, %10
-	movq	%10, %5
-	MMX_XSwap dq,  %7, %3, %5
+    MMX_XSwap dq,  %1, %5, %4
+    MMX_XSwap dq,  %6, %2, %5
+    MMX_XSwap dq,  %8, %7, %2
+    movq    %7, %10
+    movq    %10, %5
+    MMX_XSwap dq,  %7, %3, %5
 
-	movq	%3, %10
+    movq    %3, %10
 %endmacro
 
 ;in: m0, m3, m5, m2, m7, m1, m6, m4
-%macro TRANSPOSE8x8_WRITE_MMX 2	; dst, dst_stride
-	movq [%1], mm0			; result of line 1, x8 bytes
-	movq [%1+%2], mm3		; result of line 2
-	lea %1, [%1+2*%2]
-	movq [%1], mm5			; result of line 3
-	movq [%1+%2], mm2		; result of line 4
-	lea %1, [%1+2*%2]
-	movq [%1], mm7			; result of line 5
-	movq [%1+%2], mm1		; result of line 6
-	lea %1, [%1+2*%2]
-	movq [%1], mm6			; result of line 7
-	movq [%1+%2], mm4		; result of line 8
+%macro TRANSPOSE8x8_WRITE_MMX 2 ; dst, dst_stride
+    movq [%1], mm0          ; result of line 1, x8 bytes
+    movq [%1+%2], mm3       ; result of line 2
+    lea %1, [%1+2*%2]
+    movq [%1], mm5          ; result of line 3
+    movq [%1+%2], mm2       ; result of line 4
+    lea %1, [%1+2*%2]
+    movq [%1], mm7          ; result of line 5
+    movq [%1+%2], mm1       ; result of line 6
+    lea %1, [%1+2*%2]
+    movq [%1], mm6          ; result of line 7
+    movq [%1+%2], mm4       ; result of line 8
 %endmacro
 
 ;in: m0, m3, m5, m2, m7, m1, m6, m4
-%macro TRANSPOSE8x8_WRITE_ALT_MMX 3	; dst, dst_stride, reg32
-	movq [%1], mm0			; result of line 1, x8 bytes
-	movq [%1+%2], mm3		; result of line 2
-	lea %3, [%1+2*%2]
-	movq [%3], mm5			; result of line 3
-	movq [%3+%2], mm2		; result of line 4
-	lea %3, [%3+2*%2]
-	movq [%3], mm7			; result of line 5
-	movq [%3+%2], mm1		; result of line 6
-	lea %3, [%3+2*%2]
-	movq [%3], mm6			; result of line 7
-	movq [%3+%2], mm4		; result of line 8
-%endmacro	; end of TRANSPOSE8x8_WRITE_ALT_MMX
+%macro TRANSPOSE8x8_WRITE_ALT_MMX 3 ; dst, dst_stride, reg32
+    movq [%1], mm0          ; result of line 1, x8 bytes
+    movq [%1+%2], mm3       ; result of line 2
+    lea %3, [%1+2*%2]
+    movq [%3], mm5          ; result of line 3
+    movq [%3+%2], mm2       ; result of line 4
+    lea %3, [%3+2*%2]
+    movq [%3], mm7          ; result of line 5
+    movq [%3+%2], mm1       ; result of line 6
+    lea %3, [%3+2*%2]
+    movq [%3], mm6          ; result of line 7
+    movq [%3+%2], mm4       ; result of line 8
+%endmacro   ; end of TRANSPOSE8x8_WRITE_ALT_MMX
 
 ; for transpose 16x8
 
 ;in:  m0, m1, m2, m3, m4, m5, m6, m7
 ;out: m4, m2, m3, m7, m5, m1, m6, m0
-%macro TRANSPOSE_8x16B_SSE2		10
-	SSE2_XSawp bw,  %1, %2, %8
-	SSE2_XSawp bw,  %3, %4, %2
-	SSE2_XSawp bw,  %5, %6, %4
-	movdqa	%6, %9
-	movdqa	%10, %4
-	SSE2_XSawp bw,  %7, %6, %4
+%macro TRANSPOSE_8x16B_SSE2     10
+    SSE2_XSawp bw,  %1, %2, %8
+    SSE2_XSawp bw,  %3, %4, %2
+    SSE2_XSawp bw,  %5, %6, %4
+    movdqa  %6, %9
+    movdqa  %10, %4
+    SSE2_XSawp bw,  %7, %6, %4
 
-	SSE2_XSawp wd,  %1, %3, %6
-	SSE2_XSawp wd,  %8, %2, %3
-	SSE2_XSawp wd,  %5, %7, %2
-	movdqa	%7, %10
-	movdqa	%10, %3
-	SSE2_XSawp wd,  %7, %4, %3
+    SSE2_XSawp wd,  %1, %3, %6
+    SSE2_XSawp wd,  %8, %2, %3
+    SSE2_XSawp wd,  %5, %7, %2
+    movdqa  %7, %10
+    movdqa  %10, %3
+    SSE2_XSawp wd,  %7, %4, %3
 
-	SSE2_XSawp dq,  %1, %5, %4
-	SSE2_XSawp dq,  %6, %2, %5
-	SSE2_XSawp dq,  %8, %7, %2
-	movdqa	%7, %10
-	movdqa	%10, %5
-	SSE2_XSawp dq,  %7, %3, %5
+    SSE2_XSawp dq,  %1, %5, %4
+    SSE2_XSawp dq,  %6, %2, %5
+    SSE2_XSawp dq,  %8, %7, %2
+    movdqa  %7, %10
+    movdqa  %10, %5
+    SSE2_XSawp dq,  %7, %3, %5
 
-	SSE2_XSawp qdq,  %1, %8, %3
-	SSE2_XSawp qdq,  %4, %2, %8
-	SSE2_XSawp qdq,  %6, %7, %2
-	movdqa	%7, %10
-	movdqa	%10, %1
-	SSE2_XSawp qdq,  %7, %5, %1
-	movdqa	%5, %10
-%endmacro	; end of TRANSPOSE_8x16B_SSE2
+    SSE2_XSawp qdq,  %1, %8, %3
+    SSE2_XSawp qdq,  %4, %2, %8
+    SSE2_XSawp qdq,  %6, %7, %2
+    movdqa  %7, %10
+    movdqa  %10, %1
+    SSE2_XSawp qdq,  %7, %5, %1
+    movdqa  %5, %10
+%endmacro   ; end of TRANSPOSE_8x16B_SSE2
 
 
-%macro TRANSPOSE8x16_WRITE_SSE2	2	; dst, dst_stride
-	movq [%1], xmm4			; result of line 1, x8 bytes
-	movq [%1+%2], xmm2		; result of line 2
-	lea %1, [%1+2*%2]
-	movq [%1], xmm3			; result of line 3
-	movq [%1+%2], xmm7		; result of line 4
+%macro TRANSPOSE8x16_WRITE_SSE2 2   ; dst, dst_stride
+    movq [%1], xmm4         ; result of line 1, x8 bytes
+    movq [%1+%2], xmm2      ; result of line 2
+    lea %1, [%1+2*%2]
+    movq [%1], xmm3         ; result of line 3
+    movq [%1+%2], xmm7      ; result of line 4
 
-	lea %1, [%1+2*%2]
-	movq [%1], xmm5			; result of line 5
-	movq [%1+%2], xmm1		; result of line 6
-	lea %1, [%1+2*%2]
-	movq [%1], xmm6			; result of line 7
-	movq [%1+%2], xmm0		; result of line 8
+    lea %1, [%1+2*%2]
+    movq [%1], xmm5         ; result of line 5
+    movq [%1+%2], xmm1      ; result of line 6
+    lea %1, [%1+2*%2]
+    movq [%1], xmm6         ; result of line 7
+    movq [%1+%2], xmm0      ; result of line 8
 
-	lea %1, [%1+2*%2]
-	movhpd [%1], xmm4		; result of line 9
-	movhpd [%1+%2], xmm2	; result of line 10
-	lea %1, [%1+2*%2]
-	movhpd [%1], xmm3		; result of line 11
-	movhpd [%1+%2], xmm7	; result of line 12
+    lea %1, [%1+2*%2]
+    movhpd [%1], xmm4       ; result of line 9
+    movhpd [%1+%2], xmm2    ; result of line 10
+    lea %1, [%1+2*%2]
+    movhpd [%1], xmm3       ; result of line 11
+    movhpd [%1+%2], xmm7    ; result of line 12
 
-	lea %1, [%1+2*%2]
-	movhpd [%1], xmm5		; result of line 13
-	movhpd [%1+%2], xmm1	; result of line 14
-	lea %1, [%1+2*%2]
-	movhpd [%1], xmm6		; result of line 15
-	movhpd [%1+%2], xmm0	; result of line 16
-%endmacro	; end of TRANSPOSE_WRITE_RESULT_SSE2
+    lea %1, [%1+2*%2]
+    movhpd [%1], xmm5       ; result of line 13
+    movhpd [%1+%2], xmm1    ; result of line 14
+    lea %1, [%1+2*%2]
+    movhpd [%1], xmm6       ; result of line 15
+    movhpd [%1+%2], xmm0    ; result of line 16
+%endmacro   ; end of TRANSPOSE_WRITE_RESULT_SSE2
 
-%macro TRANSPOSE8x16_WRITE_ALT_SSE2	3	; dst, dst_stride, reg32
-	movq [%1], xmm4			; result of line 1, x8 bytes
-	movq [%1+%2], xmm2		; result of line 2
-	lea %3, [%1+2*%2]
-	movq [%3], xmm3			; result of line 3
-	movq [%3+%2], xmm7		; result of line 4
+%macro TRANSPOSE8x16_WRITE_ALT_SSE2 3   ; dst, dst_stride, reg32
+    movq [%1], xmm4         ; result of line 1, x8 bytes
+    movq [%1+%2], xmm2      ; result of line 2
+    lea %3, [%1+2*%2]
+    movq [%3], xmm3         ; result of line 3
+    movq [%3+%2], xmm7      ; result of line 4
 
-	lea %3, [%3+2*%2]
-	movq [%3], xmm5			; result of line 5
-	movq [%3+%2], xmm1		; result of line 6
-	lea %3, [%3+2*%2]
-	movq [%3], xmm6			; result of line 7
-	movq [%3+%2], xmm0		; result of line 8
+    lea %3, [%3+2*%2]
+    movq [%3], xmm5         ; result of line 5
+    movq [%3+%2], xmm1      ; result of line 6
+    lea %3, [%3+2*%2]
+    movq [%3], xmm6         ; result of line 7
+    movq [%3+%2], xmm0      ; result of line 8
 
-	lea %3, [%3+2*%2]
-	movhpd [%3], xmm4		; result of line 9
-	movhpd [%3+%2], xmm2	; result of line 10
-	lea %3, [%3+2*%2]
-	movhpd [%3], xmm3		; result of line 11
-	movhpd [%3+%2], xmm7	; result of line 12
+    lea %3, [%3+2*%2]
+    movhpd [%3], xmm4       ; result of line 9
+    movhpd [%3+%2], xmm2    ; result of line 10
+    lea %3, [%3+2*%2]
+    movhpd [%3], xmm3       ; result of line 11
+    movhpd [%3+%2], xmm7    ; result of line 12
 
-	lea %3, [%3+2*%2]
-	movhpd [%3], xmm5		; result of line 13
-	movhpd [%3+%2], xmm1	; result of line 14
-	lea %3, [%3+2*%2]
-	movhpd [%3], xmm6		; result of line 15
-	movhpd [%3+%2], xmm0	; result of line 16
-%endmacro	; end of TRANSPOSE8x16_WRITE_ALT_SSE2
+    lea %3, [%3+2*%2]
+    movhpd [%3], xmm5       ; result of line 13
+    movhpd [%3+%2], xmm1    ; result of line 14
+    lea %3, [%3+2*%2]
+    movhpd [%3], xmm6       ; result of line 15
+    movhpd [%3+%2], xmm0    ; result of line 16
+%endmacro   ; end of TRANSPOSE8x16_WRITE_ALT_SSE2
 
 
 SECTION .text
@@ -187,209 +187,209 @@
 
 WELS_EXTERN TransposeMatrixBlock16x16_sse2
 ; void TransposeMatrixBlock16x16_sse2( void *dst/*16x16*/, const int32_t dst_stride, void *src/*16x16*/, const int32_t src_stride );
-	push r4
-	push r5
-	%assign push_num 2
-	LOAD_4_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION	r1, r1d
-	SIGN_EXTENSION	r3, r3d
+    push r4
+    push r5
+    %assign push_num 2
+    LOAD_4_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
 
-	mov r4, r7
-	and r4, 0Fh
-	sub r7, 10h
-	sub r7, r4
-	lea r5, [r3+r3*2]
-	; top 8x16 block
-	movdqa xmm0, [r2]
-	movdqa xmm1, [r2+r3]
-	movdqa xmm2, [r2+r3*2]
-	movdqa xmm3, [r2+r5]
-	lea r2, [r2+r3*4]
-	movdqa xmm4, [r2]
-	movdqa xmm5, [r2+r3]
-	movdqa xmm6, [r2+r3*2]
+    mov r4, r7
+    and r4, 0Fh
+    sub r7, 10h
+    sub r7, r4
+    lea r5, [r3+r3*2]
+    ; top 8x16 block
+    movdqa xmm0, [r2]
+    movdqa xmm1, [r2+r3]
+    movdqa xmm2, [r2+r3*2]
+    movdqa xmm3, [r2+r5]
+    lea r2, [r2+r3*4]
+    movdqa xmm4, [r2]
+    movdqa xmm5, [r2+r3]
+    movdqa xmm6, [r2+r3*2]
 
-	;in:  m0, m1, m2, m3, m4, m5, m6, m7
-	;out: m4, m2, m3, m7, m5, m1, m6, m0
-	TRANSPOSE_8x16B_SSE2	xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r5], [r7]
+    ;in:  m0, m1, m2, m3, m4, m5, m6, m7
+    ;out: m4, m2, m3, m7, m5, m1, m6, m0
+    TRANSPOSE_8x16B_SSE2    xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r5], [r7]
 
-	TRANSPOSE8x16_WRITE_SSE2		r0, r1
+    TRANSPOSE8x16_WRITE_SSE2        r0, r1
 
-	; bottom 8x16 block
-	lea	r2, [r2+r3*4]
-	movdqa xmm0, [r2]
-	movdqa xmm1, [r2+r3]
-	movdqa xmm2, [r2+r3*2]
-	movdqa xmm3, [r2+r5]
-	lea r2, [r2+r3*4]
-	movdqa xmm4, [r2]
-	movdqa xmm5, [r2+r3]
-	movdqa xmm6, [r2+r3*2]
+    ; bottom 8x16 block
+    lea r2, [r2+r3*4]
+    movdqa xmm0, [r2]
+    movdqa xmm1, [r2+r3]
+    movdqa xmm2, [r2+r3*2]
+    movdqa xmm3, [r2+r5]
+    lea r2, [r2+r3*4]
+    movdqa xmm4, [r2]
+    movdqa xmm5, [r2+r3]
+    movdqa xmm6, [r2+r3*2]
 
-	;in:  m0, m1, m2, m3, m4, m5, m6, m7
-	;out: m4, m2, m3, m7, m5, m1, m6, m0
-	TRANSPOSE_8x16B_SSE2	xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r5], [r7]
+    ;in:  m0, m1, m2, m3, m4, m5, m6, m7
+    ;out: m4, m2, m3, m7, m5, m1, m6, m0
+    TRANSPOSE_8x16B_SSE2    xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r5], [r7]
 
-	mov r5, r1
-	sal r5, 4
-	sub r0, r5
-	lea r0, [r0+r1*2+8]
-	TRANSPOSE8x16_WRITE_SSE2		r0, r1
+    mov r5, r1
+    sal r5, 4
+    sub r0, r5
+    lea r0, [r0+r1*2+8]
+    TRANSPOSE8x16_WRITE_SSE2        r0, r1
 
-	add r7, r4
-	add r7, 10h
-	POP_XMM
-	LOAD_4_PARA_POP
-	pop r5
-	pop r4
-	ret
+    add r7, r4
+    add r7, 10h
+    POP_XMM
+    LOAD_4_PARA_POP
+    pop r5
+    pop r4
+    ret
 
 WELS_EXTERN TransposeMatrixBlocksx16_sse2
 ; void TransposeMatrixBlocksx16_sse2( void *dst/*W16x16*/, const int32_t dst_stride, void *src/*16xW16*/, const int32_t src_stride, const int32_t num_blocks );
-	push r5
-	push r6
-	%assign push_num 2
-	LOAD_5_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION  r1, r1d
-	SIGN_EXTENSION  r3, r3d
-	SIGN_EXTENSION  r4, r4d
-	mov r5, r7
-	and r5, 0Fh
-	sub r7, 10h
-	sub r7, r5
+    push r5
+    push r6
+    %assign push_num 2
+    LOAD_5_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
+    SIGN_EXTENSION  r4, r4d
+    mov r5, r7
+    and r5, 0Fh
+    sub r7, 10h
+    sub r7, r5
 TRANSPOSE_LOOP_SSE2:
-	; explictly loading next loop data
-	lea	r6, [r2+r3*8]
-	push r4
+    ; explictly loading next loop data
+    lea r6, [r2+r3*8]
+    push r4
 %rep 8
-	mov	r4, [r6]
-	mov	r4, [r6+r3]
-	lea	r6, [r6+r3*2]
+    mov r4, [r6]
+    mov r4, [r6+r3]
+    lea r6, [r6+r3*2]
 %endrep
-	pop r4
-	; top 8x16 block
-	movdqa xmm0, [r2]
-	movdqa xmm1, [r2+r3]
-	lea r2, [r2+r3*2]
-	movdqa xmm2, [r2]
-	movdqa xmm3, [r2+r3]
-	lea r2, [r2+r3*2]
-	movdqa xmm4, [r2]
-	movdqa xmm5, [r2+r3]
-	lea r2, [r2+r3*2]
-	movdqa xmm6, [r2]
+    pop r4
+    ; top 8x16 block
+    movdqa xmm0, [r2]
+    movdqa xmm1, [r2+r3]
+    lea r2, [r2+r3*2]
+    movdqa xmm2, [r2]
+    movdqa xmm3, [r2+r3]
+    lea r2, [r2+r3*2]
+    movdqa xmm4, [r2]
+    movdqa xmm5, [r2+r3]
+    lea r2, [r2+r3*2]
+    movdqa xmm6, [r2]
 
-	;in:  m0, m1, m2, m3, m4, m5, m6, m7
-	;out: m4, m2, m3, m7, m5, m1, m6, m0
-	TRANSPOSE_8x16B_SSE2	xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r3], [r7]
-	TRANSPOSE8x16_WRITE_ALT_SSE2		r0, r1, r6
-	lea	r2, [r2+r3*2]
+    ;in:  m0, m1, m2, m3, m4, m5, m6, m7
+    ;out: m4, m2, m3, m7, m5, m1, m6, m0
+    TRANSPOSE_8x16B_SSE2    xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r3], [r7]
+    TRANSPOSE8x16_WRITE_ALT_SSE2        r0, r1, r6
+    lea r2, [r2+r3*2]
 
-	; bottom 8x16 block
-	movdqa xmm0, [r2]
-	movdqa xmm1, [r2+r3]
-	lea	r2, [r2+r3*2]
-	movdqa xmm2, [r2]
-	movdqa xmm3, [r2+r3]
-	lea r2, [r2+r3*2]
-	movdqa xmm4, [r2]
-	movdqa xmm5, [r2+r3]
-	lea	r2, [r2+r3*2]
-	movdqa xmm6, [r2]
+    ; bottom 8x16 block
+    movdqa xmm0, [r2]
+    movdqa xmm1, [r2+r3]
+    lea r2, [r2+r3*2]
+    movdqa xmm2, [r2]
+    movdqa xmm3, [r2+r3]
+    lea r2, [r2+r3*2]
+    movdqa xmm4, [r2]
+    movdqa xmm5, [r2+r3]
+    lea r2, [r2+r3*2]
+    movdqa xmm6, [r2]
 
-	;in:  m0, m1, m2, m3, m4, m5, m6, m7
-	;out: m4, m2, m3, m7, m5, m1, m6, m0
-	TRANSPOSE_8x16B_SSE2	xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r3], [r7]
-	TRANSPOSE8x16_WRITE_ALT_SSE2		r0+8, r1, r6
-	lea	r2, [r2+r3*2]
-	lea r0, [r0+16]
-	dec r4
-	jg near TRANSPOSE_LOOP_SSE2
+    ;in:  m0, m1, m2, m3, m4, m5, m6, m7
+    ;out: m4, m2, m3, m7, m5, m1, m6, m0
+    TRANSPOSE_8x16B_SSE2    xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r3], [r7]
+    TRANSPOSE8x16_WRITE_ALT_SSE2        r0+8, r1, r6
+    lea r2, [r2+r3*2]
+    lea r0, [r0+16]
+    dec r4
+    jg near TRANSPOSE_LOOP_SSE2
 
-	add r7, r5
-	add r7, 10h
-	POP_XMM
-	LOAD_5_PARA_POP
-	pop r6
-	pop r5
-	ret
+    add r7, r5
+    add r7, 10h
+    POP_XMM
+    LOAD_5_PARA_POP
+    pop r6
+    pop r5
+    ret
 
 WELS_EXTERN TransposeMatrixBlock8x8_mmx
 ; void TransposeMatrixBlock8x8_mmx( void *dst/*8x8*/, const int32_t dst_stride, void *src/*8x8*/, const int32_t src_stride );
-	%assign push_num 0
-	LOAD_4_PARA
-	SIGN_EXTENSION  r1, r1d
-	SIGN_EXTENSION  r3, r3d
-	sub	r7, 8
+    %assign push_num 0
+    LOAD_4_PARA
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
+    sub r7, 8
 
-	movq mm0, [r2]
-	movq mm1, [r2+r3]
-	lea r2, [r2+2*r3]
-	movq mm2, [r2]
-	movq mm3, [r2+r3]
-	lea r2, [r2+2*r3]
-	movq mm4, [r2]
-	movq mm5, [r2+r3]
-	lea r2, [r2+2*r3]
-	movq mm6, [r2]
+    movq mm0, [r2]
+    movq mm1, [r2+r3]
+    lea r2, [r2+2*r3]
+    movq mm2, [r2]
+    movq mm3, [r2+r3]
+    lea r2, [r2+2*r3]
+    movq mm4, [r2]
+    movq mm5, [r2+r3]
+    lea r2, [r2+2*r3]
+    movq mm6, [r2]
 
-	;in:  m0, m1, m2, m3, m4, m5, m6, m7
-	;out: m0, m3, m5, m2, m7, m1, m6, m4
-	TRANSPOSE_8x8B_MMX mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, [r2+r3], [r7]
+    ;in:  m0, m1, m2, m3, m4, m5, m6, m7
+    ;out: m0, m3, m5, m2, m7, m1, m6, m4
+    TRANSPOSE_8x8B_MMX mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, [r2+r3], [r7]
 
-	TRANSPOSE8x8_WRITE_MMX r0, r1
+    TRANSPOSE8x8_WRITE_MMX r0, r1
 
-	emms
-	add r7, 8
-	LOAD_4_PARA_POP
-	ret
+    emms
+    add r7, 8
+    LOAD_4_PARA_POP
+    ret
 
 WELS_EXTERN TransposeMatrixBlocksx8_mmx
 ; void TransposeMatrixBlocksx8_mmx( void *dst/*8xW8*/, const int32_t dst_stride, void *src/*W8x8*/, const int32_t src_stride, const int32_t num_blocks );
-	push r5
-	push r6
-	%assign push_num 2
-	LOAD_5_PARA
-	SIGN_EXTENSION  r1, r1d
-	SIGN_EXTENSION  r3, r3d
-	SIGN_EXTENSION  r4, r4d
-	sub	r7, 8
+    push r5
+    push r6
+    %assign push_num 2
+    LOAD_5_PARA
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
+    SIGN_EXTENSION  r4, r4d
+    sub r7, 8
 
-	lea	r5, [r2+r3*8]
+    lea r5, [r2+r3*8]
 
 TRANSPOSE_BLOCKS_X8_LOOP_MMX:
-	; explictly loading next loop data
+    ; explictly loading next loop data
 %rep 4
-	mov r6, [r5]
-	mov r6, [r5+r3]
-	lea	r5, [r5+r3*2]
+    mov r6, [r5]
+    mov r6, [r5+r3]
+    lea r5, [r5+r3*2]
 %endrep
-	movq mm0, [r2]
-	movq mm1, [r2+r3]
-	lea r2, [r2+2*r3]
-	movq mm2, [r2]
-	movq mm3, [r2+r3]
-	lea r2, [r2+2*r3]
-	movq mm4, [r2]
-	movq mm5, [r2+r3]
-	lea r2, [r2+2*r3]
-	movq mm6, [r2]
+    movq mm0, [r2]
+    movq mm1, [r2+r3]
+    lea r2, [r2+2*r3]
+    movq mm2, [r2]
+    movq mm3, [r2+r3]
+    lea r2, [r2+2*r3]
+    movq mm4, [r2]
+    movq mm5, [r2+r3]
+    lea r2, [r2+2*r3]
+    movq mm6, [r2]
 
-	;in:  m0, m1, m2, m3, m4, m5, m6, m7
-	;out: m0, m3, m5, m2, m7, m1, m6, m4
-	TRANSPOSE_8x8B_MMX mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, [r2+r3], [r7]
+    ;in:  m0, m1, m2, m3, m4, m5, m6, m7
+    ;out: m0, m3, m5, m2, m7, m1, m6, m4
+    TRANSPOSE_8x8B_MMX mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, [r2+r3], [r7]
 
-	TRANSPOSE8x8_WRITE_ALT_MMX r0, r1, r6
-	lea r0, [r0+8]
-	lea r2, [r2+2*r3]
-	dec r4
-	jg near TRANSPOSE_BLOCKS_X8_LOOP_MMX
+    TRANSPOSE8x8_WRITE_ALT_MMX r0, r1, r6
+    lea r0, [r0+8]
+    lea r2, [r2+2*r3]
+    dec r4
+    jg near TRANSPOSE_BLOCKS_X8_LOOP_MMX
 
-	emms
-	add r7, 8
-	LOAD_5_PARA_POP
-	pop r6
-	pop r5
-	ret
+    emms
+    add r7, 8
+    LOAD_5_PARA_POP
+    pop r6
+    pop r5
+    ret
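
As a reading aid for the block above, here is a hedged scalar sketch of what these transpose routines achieve; the helper name is illustrative, and the SIMD code reaches the same result through punpck*/interleave passes rather than per-byte loads.

#include <stdint.h>

/* dst receives the transpose of an n x n block of bytes:
 * dst[x][y] = src[y][x], with independent strides for the two buffers. */
static void TransposeBlock_c (uint8_t* pDst, int32_t iDstStride,
                              const uint8_t* pSrc, int32_t iSrcStride, int32_t n) {
    int32_t x, y;
    for (y = 0; y < n; y++)
        for (x = 0; x < n; x++)
            pDst[x * iDstStride + y] = pSrc[y * iSrcStride + x];
}
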
--- a/codec/encoder/core/x86/memzero.asm
+++ b/codec/encoder/core/x86/memzero.asm
@@ -51,10 +51,10 @@
 ;void WelsPrefetchZero_mmx(int8_t const*_A);
 ;***********************************************************************
 WELS_EXTERN WelsPrefetchZero_mmx
-	%assign  push_num 0
-	LOAD_1_PARA
-	prefetchnta [r0]
-	ret
+    %assign  push_num 0
+    LOAD_1_PARA
+    prefetchnta [r0]
+    ret
 
 
 ;***********************************************************************
@@ -62,23 +62,23 @@
 ;***********************************************************************
 WELS_EXTERN WelsSetMemZeroAligned64_sse2
 
-		%assign  push_num 0
-		LOAD_2_PARA
-		SIGN_EXTENSION r1, r1d
-		neg		r1
+    %assign  push_num 0
+    LOAD_2_PARA
+    SIGN_EXTENSION r1, r1d
+    neg     r1
 
-		pxor	xmm0,		xmm0
+    pxor    xmm0,       xmm0
 .memzeroa64_sse2_loops:
-		movdqa	[r0],		xmm0
-		movdqa	[r0+16],	xmm0
-		movdqa	[r0+32],	xmm0
-		movdqa	[r0+48],	xmm0
-		add		r0, 0x40
+    movdqa  [r0],       xmm0
+    movdqa  [r0+16],    xmm0
+    movdqa  [r0+32],    xmm0
+    movdqa  [r0+48],    xmm0
+    add     r0, 0x40
 
-		add r1, 0x40
-		jnz near .memzeroa64_sse2_loops
+    add r1, 0x40
+    jnz near .memzeroa64_sse2_loops
 
-		ret
+    ret
 
 ;***********************************************************************
 ;   void WelsSetMemZeroSize64_mmx(void *dst, int32_t size)
@@ -85,28 +85,28 @@
 ;***********************************************************************
 WELS_EXTERN WelsSetMemZeroSize64_mmx
 
-		%assign  push_num 0
-		LOAD_2_PARA
-		SIGN_EXTENSION r1, r1d
-		neg		r1
+    %assign  push_num 0
+    LOAD_2_PARA
+    SIGN_EXTENSION r1, r1d
+    neg     r1
 
-		pxor	mm0,		mm0
+    pxor    mm0,        mm0
 .memzero64_mmx_loops:
-		movq	[r0],		mm0
-		movq	[r0+8],	mm0
-		movq	[r0+16],	mm0
-		movq	[r0+24],	mm0
-		movq	[r0+32],	mm0
-		movq	[r0+40],	mm0
-		movq	[r0+48],	mm0
-		movq	[r0+56],	mm0
-		add		r0,		0x40
+    movq    [r0],       mm0
+    movq    [r0+8], mm0
+    movq    [r0+16],    mm0
+    movq    [r0+24],    mm0
+    movq    [r0+32],    mm0
+    movq    [r0+40],    mm0
+    movq    [r0+48],    mm0
+    movq    [r0+56],    mm0
+    add     r0,     0x40
 
-		add r1, 0x40
-		jnz near .memzero64_mmx_loops
+    add r1, 0x40
+    jnz near .memzero64_mmx_loops
 
-		WELSEMMS
-		ret
+    WELSEMMS
+    ret
 
 ;***********************************************************************
 ;   void WelsSetMemZeroSize8_mmx(void *dst, int32_t size)
@@ -113,20 +113,20 @@
 ;***********************************************************************
 WELS_EXTERN WelsSetMemZeroSize8_mmx
 
-		%assign  push_num 0
-		LOAD_2_PARA
-		SIGN_EXTENSION r1, r1d
-		neg		r1
-		pxor	mm0,		mm0
+    %assign  push_num 0
+    LOAD_2_PARA
+    SIGN_EXTENSION r1, r1d
+    neg     r1
+    pxor    mm0,        mm0
 
 .memzero8_mmx_loops:
-		movq	[r0],		mm0
-		add		r0,		0x08
+    movq    [r0],       mm0
+    add     r0,     0x08
 
-		add		r1,		0x08
-		jnz near .memzero8_mmx_loops
+    add     r1,     0x08
+    jnz near .memzero8_mmx_loops
 
-		WELSEMMS
-		ret
+    WELSEMMS
+    ret
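
These memzero helpers all reduce to clearing a size-aligned buffer. A scalar sketch of WelsSetMemZeroSize64_mmx that keeps the negated-counter idiom of the assembly follows; the C name is illustrative, not from the source.

#include <string.h>

/* Clear iSize bytes (iSize assumed a positive multiple of 64), walking a
 * negated counter up towards zero exactly as the assembly loop does. */
static void SetMemZeroSize64_c (void* pDst, int iSize) {
    unsigned char* p = (unsigned char*) pDst;
    int i = -iSize;
    while (i != 0) {
        memset (p, 0, 64);
        p += 64;
        i += 64;
    }
}
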
 
 
--- a/codec/encoder/core/x86/quant.asm
+++ b/codec/encoder/core/x86/quant.asm
@@ -49,140 +49,140 @@
 ;************************************************
 
 %macro SSE2_Quant8  5
-		MOVDQ	%1, %5
-		pxor	%2, %2
-		pcmpgtw	%2, %1
-		pxor	%1, %2
-		psubw	%1, %2
-		paddusw	%1, %3
-		pmulhuw	%1, %4
-		pxor	%1, %2
-		psubw	%1, %2
-		MOVDQ	%5, %1
+    MOVDQ   %1, %5
+    pxor    %2, %2
+    pcmpgtw %2, %1
+    pxor    %1, %2
+    psubw   %1, %2
+    paddusw %1, %3
+    pmulhuw %1, %4
+    pxor    %1, %2
+    psubw   %1, %2
+    MOVDQ   %5, %1
 %endmacro
 
 %macro SSE2_QuantMax8  6
-		MOVDQ	%1, %5
-		pxor	%2, %2
-		pcmpgtw	%2, %1
-		pxor	%1, %2
-		psubw	%1, %2
-		paddusw	%1, %3
-		pmulhuw	%1, %4
-		pmaxsw	%6, %1
-		pxor	%1, %2
-		psubw	%1, %2
-		MOVDQ	%5, %1
+    MOVDQ   %1, %5
+    pxor    %2, %2
+    pcmpgtw %2, %1
+    pxor    %1, %2
+    psubw   %1, %2
+    paddusw %1, %3
+    pmulhuw %1, %4
+    pmaxsw  %6, %1
+    pxor    %1, %2
+    psubw   %1, %2
+    MOVDQ   %5, %1
 %endmacro
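
A scalar model of what SSE2_Quant8/SSE2_QuantMax8 do per 16-bit coefficient may help: take the absolute value, add the rounding offset ff with unsigned saturation, keep the high half of the unsigned product with mf, then restore the sign. The helper name below is illustrative.

#include <stdint.h>

/* q = sign(x) * (((|x| + ff) * mf) >> 16)  -- the per-lane effect of the
 * pxor/psubw (abs), paddusw (+ff, saturating) and pmulhuw (high product)
 * sequence in the macros above. */
static int16_t Quant1_c (int16_t x, uint16_t ff, uint16_t mf) {
    uint32_t uiAbs = (uint32_t) (x < 0 ? -x : x);
    uint32_t uiSum = uiAbs + ff;
    if (uiSum > 65535) uiSum = 65535;            /* paddusw saturates */
    uint16_t q = (uint16_t) ((uiSum * mf) >> 16);
    return (int16_t) (x < 0 ? -q : q);
}
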
 
-%define pDct				esp + 4
-%define ff					esp + 8
-%define mf					esp + 12
-%define max					esp + 16
+%define pDct                esp + 4
+%define ff                  esp + 8
+%define mf                  esp + 12
+%define max                 esp + 16
 ;***********************************************************************
-;	void WelsQuant4x4_sse2(int16_t *pDct, int16_t* ff,  int16_t *mf);
+;   void WelsQuant4x4_sse2(int16_t *pDct, int16_t* ff,  int16_t *mf);
 ;***********************************************************************
 WELS_EXTERN WelsQuant4x4_sse2
-		%assign push_num 0
-                LOAD_3_PARA
-		movdqa	xmm2, [r1]
-		movdqa	xmm3, [r2]
+    %assign push_num 0
+    LOAD_3_PARA
+    movdqa  xmm2, [r1]
+    movdqa  xmm3, [r2]
 
-		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [r0]
-		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x10]
+    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0]
+    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10]
 
-		ret
+    ret
 
 ;***********************************************************************
 ;void WelsQuant4x4Dc_sse2(int16_t *pDct, const int16_t ff, int16_t mf);
 ;***********************************************************************
 WELS_EXTERN WelsQuant4x4Dc_sse2
- 		%assign push_num 0
-		LOAD_3_PARA
-		SIGN_EXTENSIONW r1, r1w
-		SIGN_EXTENSIONW r2, r2w
-		SSE2_Copy8Times xmm3, r2d
+    %assign push_num 0
+    LOAD_3_PARA
+    SIGN_EXTENSIONW r1, r1w
+    SIGN_EXTENSIONW r2, r2w
+    SSE2_Copy8Times xmm3, r2d
 
-		SSE2_Copy8Times xmm2, r1d
+    SSE2_Copy8Times xmm2, r1d
 
-		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [r0]
-		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x10]
+    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0]
+    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10]
 
-		ret
+    ret
 
 ;***********************************************************************
-;	void WelsQuantFour4x4_sse2(int16_t *pDct, int16_t* ff,  int16_t *mf);
+;   void WelsQuantFour4x4_sse2(int16_t *pDct, int16_t* ff,  int16_t *mf);
 ;***********************************************************************
 WELS_EXTERN WelsQuantFour4x4_sse2
-		%assign push_num 0
-		LOAD_3_PARA
-		MOVDQ	xmm2, [r1]
-		MOVDQ	xmm3, [r2]
+    %assign push_num 0
+    LOAD_3_PARA
+    MOVDQ   xmm2, [r1]
+    MOVDQ   xmm3, [r2]
 
-		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [r0]
-		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x10]
-		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x20]
-		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x30]
-		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x40]
-		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x50]
-		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x60]
-		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x70]
+    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0]
+    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10]
+    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x20]
+    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x30]
+    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x40]
+    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x50]
+    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x60]
+    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x70]
 
-		ret
+    ret
 
 ;***********************************************************************
-;	void WelsQuantFour4x4Max_sse2(int16_t *pDct, int32_t* f,  int16_t *mf, int16_t *max);
+;   void WelsQuantFour4x4Max_sse2(int16_t *pDct, int32_t* f,  int16_t *mf, int16_t *max);
 ;***********************************************************************
 WELS_EXTERN WelsQuantFour4x4Max_sse2
-		%assign push_num 0
-		LOAD_4_PARA
-		PUSH_XMM 8
-		MOVDQ	xmm2, [r1]
-		MOVDQ	xmm3, [r2]
+    %assign push_num 0
+    LOAD_4_PARA
+    PUSH_XMM 8
+    MOVDQ   xmm2, [r1]
+    MOVDQ   xmm3, [r2]
 
-		pxor	xmm4, xmm4
-		pxor	xmm5, xmm5
-		pxor	xmm6, xmm6
-		pxor	xmm7, xmm7
-		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [r0	  ], xmm4
-		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x10], xmm4
-		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x20], xmm5
-		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x30], xmm5
-		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x40], xmm6
-		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x50], xmm6
-		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x60], xmm7
-		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [r0 + 0x70], xmm7
+    pxor    xmm4, xmm4
+    pxor    xmm5, xmm5
+    pxor    xmm6, xmm6
+    pxor    xmm7, xmm7
+    SSE2_QuantMax8  xmm0, xmm1, xmm2, xmm3, [r0   ], xmm4
+    SSE2_QuantMax8  xmm0, xmm1, xmm2, xmm3, [r0 + 0x10], xmm4
+    SSE2_QuantMax8  xmm0, xmm1, xmm2, xmm3, [r0 + 0x20], xmm5
+    SSE2_QuantMax8  xmm0, xmm1, xmm2, xmm3, [r0 + 0x30], xmm5
+    SSE2_QuantMax8  xmm0, xmm1, xmm2, xmm3, [r0 + 0x40], xmm6
+    SSE2_QuantMax8  xmm0, xmm1, xmm2, xmm3, [r0 + 0x50], xmm6
+    SSE2_QuantMax8  xmm0, xmm1, xmm2, xmm3, [r0 + 0x60], xmm7
+    SSE2_QuantMax8  xmm0, xmm1, xmm2, xmm3, [r0 + 0x70], xmm7
 
-		SSE2_TransTwo4x4W xmm4, xmm5, xmm6, xmm7, xmm0
-		pmaxsw  xmm0,  xmm4
-		pmaxsw  xmm0,  xmm5
-		pmaxsw  xmm0,  xmm7
-		movdqa	xmm1,  xmm0
-		punpckhqdq	xmm0, xmm1
-		pmaxsw	xmm0, xmm1
+    SSE2_TransTwo4x4W xmm4, xmm5, xmm6, xmm7, xmm0
+    pmaxsw  xmm0,  xmm4
+    pmaxsw  xmm0,  xmm5
+    pmaxsw  xmm0,  xmm7
+    movdqa  xmm1,  xmm0
+    punpckhqdq  xmm0, xmm1
+    pmaxsw  xmm0, xmm1
 
-		movq	[r3], xmm0
-		POP_XMM
-		LOAD_4_PARA_POP
-		ret
+    movq    [r3], xmm0
+    POP_XMM
+    LOAD_4_PARA_POP
+    ret
 
-%macro  MMX_Copy4Times 2
-		movd		%1, %2
-		punpcklwd	%1, %1
-		punpckldq	%1,	%1
+%macro MMX_Copy4Times 2
+    movd        %1, %2
+    punpcklwd   %1, %1
+    punpckldq   %1, %1
 %endmacro
 
 SECTION .text
 
 %macro MMX_Quant4  4
-		pxor	%2, %2
-		pcmpgtw	%2, %1
-		pxor	%1, %2
-		psubw	%1, %2
-		paddusw	%1, %3
-		pmulhuw	%1, %4
-		pxor	%1, %2
-		psubw	%1, %2
+    pxor    %2, %2
+    pcmpgtw %2, %1
+    pxor    %1, %2
+    psubw   %1, %2
+    paddusw %1, %3
+    pmulhuw %1, %4
+    pxor    %1, %2
+    psubw   %1, %2
 %endmacro
 
 ;***********************************************************************
@@ -189,101 +189,101 @@
 ;int32_t WelsHadamardQuant2x2_mmx(int16_t *rs, const int16_t ff, int16_t mf, int16_t * pDct, int16_t * block);
 ;***********************************************************************
 WELS_EXTERN WelsHadamardQuant2x2_mmx
-		%assign push_num 0
-		LOAD_5_PARA
-		SIGN_EXTENSIONW r1, r1w
-		SIGN_EXTENSIONW r2, r2w
-		movd		mm0,			[r0]
-		movd		mm1,			[r0 + 0x20]
-		punpcklwd	mm0,			mm1
-		movd		mm3,			[r0 + 0x40]
-		movd		mm1,			[r0 + 0x60]
-		punpcklwd	mm3,			mm1
+    %assign push_num 0
+    LOAD_5_PARA
+    SIGN_EXTENSIONW r1, r1w
+    SIGN_EXTENSIONW r2, r2w
+    movd        mm0,            [r0]
+    movd        mm1,            [r0 + 0x20]
+    punpcklwd   mm0,            mm1
+    movd        mm3,            [r0 + 0x40]
+    movd        mm1,            [r0 + 0x60]
+    punpcklwd   mm3,            mm1
 
-		;hdm_2x2,	mm0 = dct0 dct1, mm3 = dct2 dct3
-		movq		mm5,			mm3
-		paddw		mm3,			mm0
-		psubw		mm0,			mm5
-		punpcklwd	mm3,			mm0
-		movq		mm1,			mm3
-		psrlq		mm1,			32
-		movq		mm5,			mm1
-		paddw		mm1,			mm3
-		psubw		mm3,			mm5
-		punpcklwd	mm1,			mm3
+    ;hdm_2x2,   mm0 = dct0 dct1, mm3 = dct2 dct3
+    movq        mm5,            mm3
+    paddw       mm3,            mm0
+    psubw       mm0,            mm5
+    punpcklwd   mm3,            mm0
+    movq        mm1,            mm3
+    psrlq       mm1,            32
+    movq        mm5,            mm1
+    paddw       mm1,            mm3
+    psubw       mm3,            mm5
+    punpcklwd   mm1,            mm3
 
-		;quant_2x2_dc
-		MMX_Copy4Times	mm3,		r2d
-		MMX_Copy4Times	mm2,		r1d
-		MMX_Quant4		mm1,	mm0,	mm2,	mm3
+    ;quant_2x2_dc
+    MMX_Copy4Times  mm3,        r2d
+    MMX_Copy4Times  mm2,        r1d
+    MMX_Quant4      mm1,    mm0,    mm2,    mm3
 
-		; store dct_2x2
-		movq		[r3],			mm1
-		movq		[r4],			mm1
+    ; store dct_2x2
+    movq        [r3],           mm1
+    movq        [r4],           mm1
 
-		; pNonZeroCount of dct_2x2
-		pcmpeqb		mm2,			mm2		; mm2 = FF
-		pxor		mm3,			mm3
-		packsswb	mm1,			mm3
-		pcmpeqb		mm1,			mm3		; set FF if equal, 0 if not equal
-		psubsb		mm1,			mm2		; set 0 if equal, 1 if not equal
-		psadbw		mm1,			mm3		;
-		mov			r1w,				0
-		mov			[r0],			r1w
-		mov			[r0 + 0x20],	r1w
-		mov			[r0 + 0x40],	r1w
-		mov			[r0 + 0x60],	r1w
+    ; pNonZeroCount of dct_2x2
+    pcmpeqb     mm2,            mm2     ; mm2 = FF
+    pxor        mm3,            mm3
+    packsswb    mm1,            mm3
+    pcmpeqb     mm1,            mm3     ; set FF if equal, 0 if not equal
+    psubsb      mm1,            mm2     ; set 0 if equal, 1 if not equal
+    psadbw      mm1,            mm3     ;
+    mov         r1w,                0
+    mov         [r0],           r1w
+    mov         [r0 + 0x20],    r1w
+    mov         [r0 + 0x40],    r1w
+    mov         [r0 + 0x60],    r1w
 
 
-		movd		retrd,		mm1
+    movd        retrd,      mm1
 
-		WELSEMMS
-		LOAD_5_PARA_POP
-		ret
+    WELSEMMS
+    LOAD_5_PARA_POP
+    ret
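
A scalar model of the routine above, reusing the Quant1_c helper sketched after the quant macros (all C names illustrative): gather the four chroma DC terms stored 16 coefficients apart, run a 2x2 Hadamard, quantize, write the result to both outputs, clear the source slots, and return the nonzero count.

static int32_t HadamardQuant2x2_c (int16_t* rs, int16_t ff, int16_t mf,
                                   int16_t* pDct, int16_t* pBlock) {
    int16_t d0 = rs[0], d1 = rs[16], d2 = rs[32], d3 = rs[48];
    int16_t t[4] = {
        (int16_t) (d0 + d1 + d2 + d3), (int16_t) (d0 - d1 + d2 - d3),
        (int16_t) (d0 + d1 - d2 - d3), (int16_t) (d0 - d1 - d2 + d3)
    };
    int32_t iNonZero = 0, i;
    for (i = 0; i < 4; i++) {
        int16_t q = Quant1_c (t[i], (uint16_t) ff, (uint16_t) mf);
        pDct[i] = pBlock[i] = q;
        iNonZero += (q != 0);
    }
    rs[0] = rs[16] = rs[32] = rs[48] = 0;
    return iNonZero;
}
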
 
 ;***********************************************************************
 ;int32_t WelsHadamardQuant2x2Skip_mmx(int16_t *pDct, int16_t ff,  int16_t mf);
 ;***********************************************************************
 WELS_EXTERN WelsHadamardQuant2x2Skip_mmx
-		%assign push_num 0
-		LOAD_3_PARA
-		SIGN_EXTENSIONW r1, r1w
-		SIGN_EXTENSIONW r2, r2w
-		movd		mm0,			[r0]
-		movd		mm1,			[r0 + 0x20]
-		punpcklwd	mm0,			mm1
-		movd		mm3,			[r0 + 0x40]
-		movd		mm1,			[r0 + 0x60]
-		punpcklwd	mm3,			mm1
+    %assign push_num 0
+    LOAD_3_PARA
+    SIGN_EXTENSIONW r1, r1w
+    SIGN_EXTENSIONW r2, r2w
+    movd        mm0,            [r0]
+    movd        mm1,            [r0 + 0x20]
+    punpcklwd   mm0,            mm1
+    movd        mm3,            [r0 + 0x40]
+    movd        mm1,            [r0 + 0x60]
+    punpcklwd   mm3,            mm1
 
-		;hdm_2x2,	mm0 = dct0 dct1, mm3 = dct2 dct3
-		movq		mm5,			mm3
-		paddw		mm3,			mm0
-		psubw		mm0,			mm5
-		punpcklwd	mm3,			mm0
-		movq		mm1,			mm3
-		psrlq		mm1,			32
-		movq		mm5,			mm1
-		paddw		mm1,			mm3
-		psubw		mm3,			mm5
-		punpcklwd	mm1,			mm3
+    ;hdm_2x2,   mm0 = dct0 dct1, mm3 = dct2 dct3
+    movq        mm5,            mm3
+    paddw       mm3,            mm0
+    psubw       mm0,            mm5
+    punpcklwd   mm3,            mm0
+    movq        mm1,            mm3
+    psrlq       mm1,            32
+    movq        mm5,            mm1
+    paddw       mm1,            mm3
+    psubw       mm3,            mm5
+    punpcklwd   mm1,            mm3
 
-		;quant_2x2_dc
-		MMX_Copy4Times	mm3,		r2d
-		MMX_Copy4Times	mm2,		r1d
-		MMX_Quant4		mm1,	mm0,	mm2,	mm3
+    ;quant_2x2_dc
+    MMX_Copy4Times  mm3,        r2d
+    MMX_Copy4Times  mm2,        r1d
+    MMX_Quant4      mm1,    mm0,    mm2,    mm3
 
-		; pNonZeroCount of dct_2x2
-		pcmpeqb		mm2,			mm2		; mm2 = FF
-		pxor		mm3,			mm3
-		packsswb	mm1,			mm3
-		pcmpeqb		mm1,			mm3		; set FF if equal, 0 if not equal
-		psubsb		mm1,			mm2		; set 0 if equal, 1 if not equal
-		psadbw		mm1,			mm3		;
-		movd		retrd,			mm1
+    ; pNonZeroCount of dct_2x2
+    pcmpeqb     mm2,            mm2     ; mm2 = FF
+    pxor        mm3,            mm3
+    packsswb    mm1,            mm3
+    pcmpeqb     mm1,            mm3     ; set FF if equal, 0 if not equal
+    psubsb      mm1,            mm2     ; set 0 if equal, 1 if not equal
+    psadbw      mm1,            mm3     ;
+    movd        retrd,          mm1
 
-		WELSEMMS
-		ret
+    WELSEMMS
+    ret
 
 
 %macro SSE2_DeQuant8 3
@@ -297,12 +297,12 @@
 ; void WelsDequant4x4_sse2(int16_t *pDct, const uint16_t* mf);
 ;***********************************************************************
 WELS_EXTERN WelsDequant4x4_sse2
-	%assign push_num 0
-	LOAD_2_PARA
+    %assign push_num 0
+    LOAD_2_PARA
 
-	movdqa  xmm1, [r1]
-	SSE2_DeQuant8 [r0	],  xmm0, xmm1
-	SSE2_DeQuant8 [r0 + 0x10],  xmm0, xmm1
+    movdqa  xmm1, [r1]
+    SSE2_DeQuant8 [r0   ],  xmm0, xmm1
+    SSE2_DeQuant8 [r0 + 0x10],  xmm0, xmm1
 
     ret
 
@@ -311,18 +311,18 @@
 ;***********************************************************************====
 
 WELS_EXTERN WelsDequantFour4x4_sse2
-	%assign push_num 0
-	LOAD_2_PARA
+    %assign push_num 0
+    LOAD_2_PARA
 
-	movdqa  xmm1, [r1]
-	SSE2_DeQuant8 [r0	],  xmm0, xmm1
-	SSE2_DeQuant8 [r0+0x10	],  xmm0, xmm1
-	SSE2_DeQuant8 [r0+0x20	],  xmm0, xmm1
-	SSE2_DeQuant8 [r0+0x30	],  xmm0, xmm1
-	SSE2_DeQuant8 [r0+0x40	],  xmm0, xmm1
-	SSE2_DeQuant8 [r0+0x50	],  xmm0, xmm1
-	SSE2_DeQuant8 [r0+0x60	],  xmm0, xmm1
-	SSE2_DeQuant8 [r0+0x70	],  xmm0, xmm1
+    movdqa  xmm1, [r1]
+    SSE2_DeQuant8 [r0   ],  xmm0, xmm1
+    SSE2_DeQuant8 [r0+0x10  ],  xmm0, xmm1
+    SSE2_DeQuant8 [r0+0x20  ],  xmm0, xmm1
+    SSE2_DeQuant8 [r0+0x30  ],  xmm0, xmm1
+    SSE2_DeQuant8 [r0+0x40  ],  xmm0, xmm1
+    SSE2_DeQuant8 [r0+0x50  ],  xmm0, xmm1
+    SSE2_DeQuant8 [r0+0x60  ],  xmm0, xmm1
+    SSE2_DeQuant8 [r0+0x70  ],  xmm0, xmm1
 
     ret
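
A scalar reading of the two dequant entry points above, on the assumption that SSE2_DeQuant8 performs a plain pmullw against the 8-word scale row loaded from mf (which then covers two rows of a 4x4 block and is reused for the other two); the helper name is illustrative.

#include <stdint.h>

/* Each coefficient is scaled (low 16 bits kept) by the matching mf entry. */
static void Dequant4x4_c (int16_t* pDct, const uint16_t* mf) {
    int32_t i;
    for (i = 0; i < 16; i++)
        pDct[i] = (int16_t) (pDct[i] * mf[i & 7]);
}
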
 
@@ -330,41 +330,41 @@
 ;void WelsDequantIHadamard4x4_sse2(int16_t *rs, const uint16_t mf);
 ;***********************************************************************
 WELS_EXTERN WelsDequantIHadamard4x4_sse2
-		%assign push_num 0
-		LOAD_2_PARA
-		%ifndef X86_32
-		movzx r1, r1w
-		%endif
+    %assign push_num 0
+    LOAD_2_PARA
+    %ifndef X86_32
+    movzx r1, r1w
+    %endif
 
-		; WelsDequantLumaDc4x4
-		SSE2_Copy8Times	xmm1,		r1d
-		;psrlw		xmm1,		2		; for the (>>2) in ihdm
-		MOVDQ		xmm0,		[r0]
-		MOVDQ		xmm2,		[r0+0x10]
-		pmullw		xmm0,		xmm1
-		pmullw		xmm2,		xmm1
+    ; WelsDequantLumaDc4x4
+    SSE2_Copy8Times xmm1,       r1d
+    ;psrlw      xmm1,       2       ; for the (>>2) in ihdm
+    MOVDQ       xmm0,       [r0]
+    MOVDQ       xmm2,       [r0+0x10]
+    pmullw      xmm0,       xmm1
+    pmullw      xmm2,       xmm1
 
-		; ihdm_4x4
-		movdqa		xmm1,		xmm0
-		psrldq		xmm1,		8
-		movdqa		xmm3,		xmm2
-		psrldq		xmm3,		8
+    ; ihdm_4x4
+    movdqa      xmm1,       xmm0
+    psrldq      xmm1,       8
+    movdqa      xmm3,       xmm2
+    psrldq      xmm3,       8
 
-		SSE2_SumSub		xmm0, xmm3,	xmm5					; xmm0 = xmm0 - xmm3, xmm3 = xmm0 + xmm3
-		SSE2_SumSub		xmm1, xmm2, xmm5					; xmm1 = xmm1 - xmm2, xmm2 = xmm1 + xmm2
-		SSE2_SumSub		xmm3, xmm2, xmm5					; xmm3 = xmm3 - xmm2, xmm2 = xmm3 + xmm2
-		SSE2_SumSub		xmm0, xmm1, xmm5               		; xmm0 = xmm0 - xmm1, xmm1 = xmm0 + xmm1
+    SSE2_SumSub     xmm0, xmm3, xmm5                    ; xmm0 = xmm0 - xmm3, xmm3 = xmm0 + xmm3
+    SSE2_SumSub     xmm1, xmm2, xmm5                    ; xmm1 = xmm1 - xmm2, xmm2 = xmm1 + xmm2
+    SSE2_SumSub     xmm3, xmm2, xmm5                    ; xmm3 = xmm3 - xmm2, xmm2 = xmm3 + xmm2
+    SSE2_SumSub     xmm0, xmm1, xmm5                    ; xmm0 = xmm0 - xmm1, xmm1 = xmm0 + xmm1
 
-		SSE2_TransTwo4x4W	xmm2, xmm1, xmm3, xmm0, xmm4
-		SSE2_SumSub		xmm2, xmm4,	xmm5
-		SSE2_SumSub		xmm1, xmm0, xmm5
-		SSE2_SumSub		xmm4, xmm0, xmm5
-		SSE2_SumSub		xmm2, xmm1, xmm5
-		SSE2_TransTwo4x4W	xmm0, xmm1, xmm4, xmm2, xmm3
+    SSE2_TransTwo4x4W   xmm2, xmm1, xmm3, xmm0, xmm4
+    SSE2_SumSub     xmm2, xmm4, xmm5
+    SSE2_SumSub     xmm1, xmm0, xmm5
+    SSE2_SumSub     xmm4, xmm0, xmm5
+    SSE2_SumSub     xmm2, xmm1, xmm5
+    SSE2_TransTwo4x4W   xmm0, xmm1, xmm4, xmm2, xmm3
 
-		punpcklqdq	xmm0,		xmm1
-		MOVDQ		[r0],		xmm0
+    punpcklqdq  xmm0,       xmm1
+    MOVDQ       [r0],       xmm0
 
-		punpcklqdq	xmm2,		xmm3
-		MOVDQ		[r0+16],	xmm2
-		ret
+    punpcklqdq  xmm2,       xmm3
+    MOVDQ       [r0+16],    xmm2
+    ret
--- a/codec/encoder/core/x86/sample_sc.asm
+++ b/codec/encoder/core/x86/sample_sc.asm
@@ -35,123 +35,123 @@
 
 ;**********************************************************************************************************************************
 ;
-;	uint32_t SampleSad16x16Hor8_sse41( uint8_t *src, int32_t stride_src, uint8_t *ref, int32_t stride_ref, uint16 base_cost[8], int32_t *index_min_cost )
+;   uint32_t SampleSad16x16Hor8_sse41( uint8_t *src, int32_t stride_src, uint8_t *ref, int32_t stride_ref, uint16 base_cost[8], int32_t *index_min_cost )
 ;
-;	\note:
-;		src need align with 16 bytes, ref is optional
-;	\return value:
-;		return minimal SAD cost, according index carried by index_min_cost
+;   \note:
+;       src need align with 16 bytes, ref is optional
+;   \return value:
+;       return minimal SAD cost, according index carried by index_min_cost
 ;**********************************************************************************************************************************
 ; try 8 mv via offset
 ; xmm7 store sad costs
-%macro   SAD_16x16_LINE_SSE41  4	; src, ref, stride_src, stride_ref
-    movdqa		xmm0, [%1]
-    movdqu		xmm1, [%2]
-    movdqu		xmm2, [%2+8h]
-    movdqa		xmm3, xmm1
-    movdqa		xmm4, xmm2
+%macro SAD_16x16_LINE_SSE41  4  ; src, ref, stride_src, stride_ref
+    movdqa      xmm0, [%1]
+    movdqu      xmm1, [%2]
+    movdqu      xmm2, [%2+8h]
+    movdqa      xmm3, xmm1
+    movdqa      xmm4, xmm2
 
-    mpsadbw		xmm1, xmm0, 0	; 000 B
-    paddw		xmm7, xmm1		; accumulate cost
+    mpsadbw     xmm1, xmm0, 0   ; 000 B
+    paddw       xmm7, xmm1      ; accumulate cost
 
-    mpsadbw		xmm3, xmm0, 5	; 101 B
-    paddw		xmm7, xmm3		; accumulate cost
+    mpsadbw     xmm3, xmm0, 5   ; 101 B
+    paddw       xmm7, xmm3      ; accumulate cost
 
-    mpsadbw		xmm2, xmm0, 2	; 010 B
-    paddw		xmm7, xmm2		; accumulate cost
+    mpsadbw     xmm2, xmm0, 2   ; 010 B
+    paddw       xmm7, xmm2      ; accumulate cost
 
-    mpsadbw		xmm4, xmm0, 7	; 111 B
-    paddw		xmm7, xmm4		; accumulate cost
+    mpsadbw     xmm4, xmm0, 7   ; 111 B
+    paddw       xmm7, xmm4      ; accumulate cost
 
-    add			%1, %3
-    add			%2, %4
-%endmacro	; end of SAD_16x16_LINE_SSE41
-%macro   SAD_16x16_LINE_SSE41E  4	; src, ref, stride_src, stride_ref
-    movdqa		xmm0, [%1]
-    movdqu		xmm1, [%2]
-    movdqu		xmm2, [%2+8h]
-    movdqa		xmm3, xmm1
-    movdqa		xmm4, xmm2
+    add         %1, %3
+    add         %2, %4
+%endmacro   ; end of SAD_16x16_LINE_SSE41
+%macro SAD_16x16_LINE_SSE41E  4 ; src, ref, stride_src, stride_ref
+    movdqa      xmm0, [%1]
+    movdqu      xmm1, [%2]
+    movdqu      xmm2, [%2+8h]
+    movdqa      xmm3, xmm1
+    movdqa      xmm4, xmm2
 
-    mpsadbw		xmm1, xmm0, 0	; 000 B
-    paddw		xmm7, xmm1		; accumulate cost
+    mpsadbw     xmm1, xmm0, 0   ; 000 B
+    paddw       xmm7, xmm1      ; accumulate cost
 
-    mpsadbw		xmm3, xmm0, 5	; 101 B
-    paddw		xmm7, xmm3		; accumulate cost
+    mpsadbw     xmm3, xmm0, 5   ; 101 B
+    paddw       xmm7, xmm3      ; accumulate cost
 
-    mpsadbw		xmm2, xmm0, 2	; 010 B
-    paddw		xmm7, xmm2		; accumulate cost
+    mpsadbw     xmm2, xmm0, 2   ; 010 B
+    paddw       xmm7, xmm2      ; accumulate cost
 
-    mpsadbw		xmm4, xmm0, 7	; 111 B
-    paddw		xmm7, xmm4		; accumulate cost
-%endmacro	; end of SAD_16x16_LINE_SSE41E
+    mpsadbw     xmm4, xmm0, 7   ; 111 B
+    paddw       xmm7, xmm4      ; accumulate cost
+%endmacro   ; end of SAD_16x16_LINE_SSE41E
 
 WELS_EXTERN SampleSad16x16Hor8_sse41
     ;push ebx
     ;push esi
-    ;mov eax, [esp+12]	;   src
-    ;mov ecx, [esp+16]	;   stride_src
-    ;mov ebx, [esp+20]	;   ref
-    ;mov edx, [esp+24]	;   stride_ref
-    ;mov esi, [esp+28]	;   base_cost
+    ;mov eax, [esp+12]  ;   src
+    ;mov ecx, [esp+16]  ;   stride_src
+    ;mov ebx, [esp+20]  ;   ref
+    ;mov edx, [esp+24]  ;   stride_ref
+    ;mov esi, [esp+28]  ;   base_cost
     %assign  push_num 0
     LOAD_6_PARA
     PUSH_XMM 8
-    SIGN_EXTENSION	r1, r1d
-    SIGN_EXTENSION	r3, r3d
-    pxor	xmm7,	xmm7
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
+    pxor    xmm7,   xmm7
 
-    SAD_16x16_LINE_SSE41	r0, r2, r1, r3
-    SAD_16x16_LINE_SSE41	r0, r2, r1, r3
-    SAD_16x16_LINE_SSE41	r0, r2, r1, r3
-    SAD_16x16_LINE_SSE41	r0, r2, r1, r3
+    SAD_16x16_LINE_SSE41    r0, r2, r1, r3
+    SAD_16x16_LINE_SSE41    r0, r2, r1, r3
+    SAD_16x16_LINE_SSE41    r0, r2, r1, r3
+    SAD_16x16_LINE_SSE41    r0, r2, r1, r3
 
-    SAD_16x16_LINE_SSE41	r0, r2, r1, r3
-    SAD_16x16_LINE_SSE41	r0, r2, r1, r3
-    SAD_16x16_LINE_SSE41	r0, r2, r1, r3
-    SAD_16x16_LINE_SSE41	r0, r2, r1, r3
+    SAD_16x16_LINE_SSE41    r0, r2, r1, r3
+    SAD_16x16_LINE_SSE41    r0, r2, r1, r3
+    SAD_16x16_LINE_SSE41    r0, r2, r1, r3
+    SAD_16x16_LINE_SSE41    r0, r2, r1, r3
 
-    SAD_16x16_LINE_SSE41	r0, r2, r1, r3
-    SAD_16x16_LINE_SSE41	r0, r2, r1, r3
-    SAD_16x16_LINE_SSE41	r0, r2, r1, r3
-    SAD_16x16_LINE_SSE41	r0, r2, r1, r3
+    SAD_16x16_LINE_SSE41    r0, r2, r1, r3
+    SAD_16x16_LINE_SSE41    r0, r2, r1, r3
+    SAD_16x16_LINE_SSE41    r0, r2, r1, r3
+    SAD_16x16_LINE_SSE41    r0, r2, r1, r3
 
-    SAD_16x16_LINE_SSE41	r0, r2, r1, r3
-    SAD_16x16_LINE_SSE41	r0, r2, r1, r3
-    SAD_16x16_LINE_SSE41	r0, r2, r1, r3
-    SAD_16x16_LINE_SSE41E	r0, r2, r1, r3
+    SAD_16x16_LINE_SSE41    r0, r2, r1, r3
+    SAD_16x16_LINE_SSE41    r0, r2, r1, r3
+    SAD_16x16_LINE_SSE41    r0, r2, r1, r3
+    SAD_16x16_LINE_SSE41E   r0, r2, r1, r3
 
-    pxor	xmm0,	xmm0
-    movdqa	xmm6,	xmm7
-    punpcklwd	xmm6,	xmm0
-    punpckhwd	xmm7,	xmm0
+    pxor    xmm0,   xmm0
+    movdqa  xmm6,   xmm7
+    punpcklwd   xmm6,   xmm0
+    punpckhwd   xmm7,   xmm0
 
-    movdqa	xmm5,	[r4]
-    movdqa	xmm4,	xmm5
-    punpcklwd	xmm4,	xmm0
-    punpckhwd	xmm5,	xmm0
+    movdqa  xmm5,   [r4]
+    movdqa  xmm4,   xmm5
+    punpcklwd   xmm4,   xmm0
+    punpckhwd   xmm5,   xmm0
 
-    paddd	xmm4,	xmm6
-    paddd	xmm5,	xmm7
-    movdqa	xmm3,	xmm4
-    pminud	xmm3,	xmm5
-    pshufd	xmm2,	xmm3,	01001110B
-    pminud	xmm2,	xmm3
-    pshufd	xmm3,	xmm2,	10110001B
-    pminud	xmm2,	xmm3
-    movd	retrd,	xmm2
-    pcmpeqd	xmm4,	xmm2
-    movmskps	r2d, xmm4
-    bsf		r1d,	r2d
-    jnz	near WRITE_INDEX
+    paddd   xmm4,   xmm6
+    paddd   xmm5,   xmm7
+    movdqa  xmm3,   xmm4
+    pminud  xmm3,   xmm5
+    pshufd  xmm2,   xmm3,   01001110B
+    pminud  xmm2,   xmm3
+    pshufd  xmm3,   xmm2,   10110001B
+    pminud  xmm2,   xmm3
+    movd    retrd,  xmm2
+    pcmpeqd xmm4,   xmm2
+    movmskps    r2d, xmm4
+    bsf     r1d,    r2d
+    jnz near WRITE_INDEX
 
-    pcmpeqd	xmm5,	xmm2
-    movmskps	r2d, xmm5
-    bsf		r1d,	r2d
-    add		r1d,	4
+    pcmpeqd xmm5,   xmm2
+    movmskps    r2d, xmm5
+    bsf     r1d,    r2d
+    add     r1d,    4
 
 WRITE_INDEX:
-    mov		[r5],	r1d
+    mov     [r5],   r1d
     POP_XMM
     LOAD_6_PARA_POP
     ret
@@ -158,66 +158,66 @@
 
 ;**********************************************************************************************************************************
 ;
-;	uint32_t SampleSad8x8Hor8_sse41( uint8_t *src, int32_t stride_src, uint8_t *ref, int32_t stride_ref, uint16_t base_cost[8], int32_t *index_min_cost )
+;   uint32_t SampleSad8x8Hor8_sse41( uint8_t *src, int32_t stride_src, uint8_t *ref, int32_t stride_ref, uint16_t base_cost[8], int32_t *index_min_cost )
 ;
-;	\note:
-;		src and ref is optional to align with 16 due inter 8x8
-;	\return value:
-;		return minimal SAD cost, according index carried by index_min_cost
+;   \note:
+;       src and ref is optional to align with 16 due inter 8x8
+;   \return value:
+;       return minimal SAD cost, according index carried by index_min_cost
 ;
 ;**********************************************************************************************************************************
 ; try 8 mv via offset
 ; xmm7 store sad costs
-%macro   SAD_8x8_LINE_SSE41  4	; src, ref, stride_src, stride_ref
-    movdqu		xmm0, [%1]
-    movdqu		xmm1, [%2]
-    movdqa		xmm2, xmm1
+%macro SAD_8x8_LINE_SSE41  4    ; src, ref, stride_src, stride_ref
+    movdqu      xmm0, [%1]
+    movdqu      xmm1, [%2]
+    movdqa      xmm2, xmm1
 
-    mpsadbw		xmm1, xmm0, 0	; 000 B
-    paddw		xmm7, xmm1		; accumulate cost
+    mpsadbw     xmm1, xmm0, 0   ; 000 B
+    paddw       xmm7, xmm1      ; accumulate cost
 
-    mpsadbw		xmm2, xmm0, 5	; 101 B
-    paddw		xmm7, xmm2		; accumulate cost
+    mpsadbw     xmm2, xmm0, 5   ; 101 B
+    paddw       xmm7, xmm2      ; accumulate cost
 
-    add			%1, %3
-    add			%2, %4
-%endmacro	; end of SAD_8x8_LINE_SSE41
-%macro   SAD_8x8_LINE_SSE41E  4	; src, ref, stride_src, stride_ref
-    movdqu		xmm0, [%1]
-    movdqu		xmm1, [%2]
-    movdqa		xmm2, xmm1
+    add         %1, %3
+    add         %2, %4
+%endmacro   ; end of SAD_8x8_LINE_SSE41
+%macro SAD_8x8_LINE_SSE41E  4   ; src, ref, stride_src, stride_ref
+    movdqu      xmm0, [%1]
+    movdqu      xmm1, [%2]
+    movdqa      xmm2, xmm1
 
-    mpsadbw		xmm1, xmm0, 0	; 000 B
-    paddw		xmm7, xmm1		; accumulate cost
+    mpsadbw     xmm1, xmm0, 0   ; 000 B
+    paddw       xmm7, xmm1      ; accumulate cost
 
-    mpsadbw		xmm2, xmm0, 5	; 101 B
-    paddw		xmm7, xmm2		; accumulate cost
-%endmacro	; end of SAD_8x8_LINE_SSE41E
+    mpsadbw     xmm2, xmm0, 5   ; 101 B
+    paddw       xmm7, xmm2      ; accumulate cost
+%endmacro   ; end of SAD_8x8_LINE_SSE41E
 
 WELS_EXTERN SampleSad8x8Hor8_sse41
     %assign  push_num 0
     LOAD_6_PARA
     PUSH_XMM 8
-    SIGN_EXTENSION	r1, r1d
-    SIGN_EXTENSION	r3, r3d
-    movdqa xmm7, [r4]	;	load base cost list
+    SIGN_EXTENSION  r1, r1d
+    SIGN_EXTENSION  r3, r3d
+    movdqa xmm7, [r4]   ;   load base cost list
 
-    SAD_8x8_LINE_SSE41	r0, r2, r1, r3
-    SAD_8x8_LINE_SSE41	r0, r2, r1, r3
-    SAD_8x8_LINE_SSE41	r0, r2, r1, r3
-    SAD_8x8_LINE_SSE41	r0, r2, r1, r3
+    SAD_8x8_LINE_SSE41  r0, r2, r1, r3
+    SAD_8x8_LINE_SSE41  r0, r2, r1, r3
+    SAD_8x8_LINE_SSE41  r0, r2, r1, r3
+    SAD_8x8_LINE_SSE41  r0, r2, r1, r3
 
-    SAD_8x8_LINE_SSE41	r0, r2, r1, r3
-    SAD_8x8_LINE_SSE41	r0, r2, r1, r3
-    SAD_8x8_LINE_SSE41	r0, r2, r1, r3
-    SAD_8x8_LINE_SSE41E	r0, r2, r1, r3
+    SAD_8x8_LINE_SSE41  r0, r2, r1, r3
+    SAD_8x8_LINE_SSE41  r0, r2, r1, r3
+    SAD_8x8_LINE_SSE41  r0, r2, r1, r3
+    SAD_8x8_LINE_SSE41E r0, r2, r1, r3
 
-    phminposuw	xmm0, xmm7	; horizon search the minimal sad cost and its index
-    movd	retrd, xmm0	; for return: DEST[15:0] <- MIN, DEST[31:16] <- INDEX
-    mov		r1d, retrd
-    and		retrd, 0xFFFF
-    sar		r1d, 16
-    mov		[r5], r1d
+    phminposuw  xmm0, xmm7  ; horizon search the minimal sad cost and its index
+    movd    retrd, xmm0 ; for return: DEST[15:0] <- MIN, DEST[31:16] <- INDEX
+    mov     r1d, retrd
+    and     retrd, 0xFFFF
+    sar     r1d, 16
+    mov     [r5], r1d
 
     POP_XMM
     LOAD_6_PARA_POP
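
Both SSE4.1 routines above test the same eight horizontal candidate offsets via mpsadbw. A scalar model of the 8x8 variant follows (modulo the 16-bit accumulation of the SIMD path; all C names are illustrative).

#include <stdint.h>

/* cost[d] = base_cost[d] + SAD(src, ref shifted by d) over the 8x8 block,
 * for d = 0..7; the minimum is returned and its offset reported. */
static uint32_t Sad8x8Hor8_c (const uint8_t* pSrc, int32_t iSrcStride,
                              const uint8_t* pRef, int32_t iRefStride,
                              const uint16_t base_cost[8], int32_t* pIdxMinCost) {
    uint32_t uiCost[8];
    int32_t d, x, y, iBest = 0;
    for (d = 0; d < 8; d++) {
        uint32_t s = base_cost[d];
        for (y = 0; y < 8; y++)
            for (x = 0; x < 8; x++) {
                int32_t iDiff = pSrc[y * iSrcStride + x] - pRef[y * iRefStride + x + d];
                s += (iDiff < 0) ? -iDiff : iDiff;
            }
        uiCost[d] = s;
    }
    for (d = 1; d < 8; d++)
        if (uiCost[d] < uiCost[iBest]) iBest = d;
    *pIdxMinCost = iBest;
    return uiCost[iBest];
}
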
--- a/codec/encoder/core/x86/score.asm
+++ b/codec/encoder/core/x86/score.asm
@@ -104,32 +104,32 @@
 
 align 16
 high_mask_table:
-	db  0, 0, 0, 3, 0, 2, 3, 6, 0, 2
-	db  2, 5, 3, 5, 6, 9, 0, 1, 2, 5
-	db  2, 4, 5, 8, 3, 5, 5, 8, 6, 8
-	db  9,12, 0, 1, 1, 4, 2, 4, 5, 8
-	db  2, 4, 4, 7, 5, 7, 8,11, 3, 4
-	db  5, 8, 5, 7, 8,11, 6, 8, 8,11
-	db  9,11,12,15, 0, 1, 1, 4, 1, 3
-	db  4, 7, 2, 4, 4, 7, 5, 7, 8,11
-	db  2, 3, 4, 7, 4, 6, 7,10, 5, 7
-	db  7,10, 8,10,11,14, 3, 4, 4, 7
-	db  5, 7, 8,11, 5, 7, 7,10, 8,10
-	db 11,14, 6, 7, 8,11, 8,10,11,14
-	db  9,11,11,14,12,14,15,18, 0, 0
-	db  1, 4, 1, 3, 4, 7, 1, 3, 3, 6
-	db  4, 6, 7,10, 2, 3, 4, 7, 4, 6
-	db  7,10, 5, 7, 7,10, 8,10,11,14
-	db  2, 3, 3, 6, 4, 6, 7,10, 4, 6
-	db  6, 9, 7, 9,10,13, 5, 6, 7,10
-	db  7, 9,10,13, 8,10,10,13,11,13
-	db 14,17, 3, 4, 4, 7, 4, 6, 7,10
-	db  5, 7, 7,10, 8,10,11,14, 5, 6
-	db  7,10, 7, 9,10,13, 8,10,10,13
-	db 11,13,14,17, 6, 7, 7,10, 8,10
-	db 11,14, 8,10,10,13,11,13,14,17
-	db  9,10,11,14,11,13,14,17,12,14
-	db 14,17,15,17,18,21
+    db  0, 0, 0, 3, 0, 2, 3, 6, 0, 2
+    db  2, 5, 3, 5, 6, 9, 0, 1, 2, 5
+    db  2, 4, 5, 8, 3, 5, 5, 8, 6, 8
+    db  9,12, 0, 1, 1, 4, 2, 4, 5, 8
+    db  2, 4, 4, 7, 5, 7, 8,11, 3, 4
+    db  5, 8, 5, 7, 8,11, 6, 8, 8,11
+    db  9,11,12,15, 0, 1, 1, 4, 1, 3
+    db  4, 7, 2, 4, 4, 7, 5, 7, 8,11
+    db  2, 3, 4, 7, 4, 6, 7,10, 5, 7
+    db  7,10, 8,10,11,14, 3, 4, 4, 7
+    db  5, 7, 8,11, 5, 7, 7,10, 8,10
+    db 11,14, 6, 7, 8,11, 8,10,11,14
+    db  9,11,11,14,12,14,15,18, 0, 0
+    db  1, 4, 1, 3, 4, 7, 1, 3, 3, 6
+    db  4, 6, 7,10, 2, 3, 4, 7, 4, 6
+    db  7,10, 5, 7, 7,10, 8,10,11,14
+    db  2, 3, 3, 6, 4, 6, 7,10, 4, 6
+    db  6, 9, 7, 9,10,13, 5, 6, 7,10
+    db  7, 9,10,13, 8,10,10,13,11,13
+    db 14,17, 3, 4, 4, 7, 4, 6, 7,10
+    db  5, 7, 7,10, 8,10,11,14, 5, 6
+    db  7,10, 7, 9,10,13, 8,10,10,13
+    db 11,13,14,17, 6, 7, 7,10, 8,10
+    db 11,14, 8,10,10,13,11,13,14,17
+    db  9,10,11,14,11,13,14,17,12,14
+    db 14,17,15,17,18,21
 
 align 16
 low_mask_table:
@@ -167,78 +167,78 @@
 ;void WelsScan4x4DcAc_sse2( int16_t level[16], int16_t *pDct )
 ;***********************************************************************
 WELS_EXTERN WelsScan4x4DcAc_sse2
-	%ifdef X86_32
-	push r3
-	%assign push_num 1
-	%else
-	%assign push_num 0
-	%endif
-	LOAD_2_PARA
-	movdqa     xmm0, [r1]			; 7 6 5 4 3 2 1 0
-	movdqa     xmm1, [r1+16]		; f e d c b a 9 8
-	pextrw     r2d, xmm0, 7			; ecx = 7
-	pextrw     r3d, xmm1, 2			; edx = a
-	pextrw     r1d, xmm0, 5			; eax = 5
-	pinsrw     xmm1, r2d, 2			; f e d c b 7 9 8
-	pinsrw     xmm0, r1d, 7			; 5 6 5 4 3 2 1 0
-	pextrw     r2d, xmm1, 0			; ecx = 8
-	pinsrw     xmm0, r2d, 5			; 5 6 8 4 3 2 1 0
-	pinsrw     xmm1, r3d, 0			; f e d c b 7 9 a
-	pshufd     xmm2, xmm0, 0xd8		; 5 6 3 2 8 4 1 0
-	pshufd     xmm3, xmm1, 0xd8		; f e b 7 d c 9 a
-	pshufhw    xmm0, xmm2, 0x93		; 6 3 2 5 8 4 1 0
-	pshuflw    xmm1, xmm3, 0x39		; f e b 7 a d c 9
-	movdqa     [r0],xmm0
-	movdqa     [r0+16], xmm1
-	%ifdef X86_32
-	pop r3
-	%endif
-	ret
+    %ifdef X86_32
+    push r3
+    %assign push_num 1
+    %else
+    %assign push_num 0
+    %endif
+    LOAD_2_PARA
+    movdqa     xmm0, [r1]           ; 7 6 5 4 3 2 1 0
+    movdqa     xmm1, [r1+16]        ; f e d c b a 9 8
+    pextrw     r2d, xmm0, 7         ; ecx = 7
+    pextrw     r3d, xmm1, 2         ; edx = a
+    pextrw     r1d, xmm0, 5         ; eax = 5
+    pinsrw     xmm1, r2d, 2         ; f e d c b 7 9 8
+    pinsrw     xmm0, r1d, 7         ; 5 6 5 4 3 2 1 0
+    pextrw     r2d, xmm1, 0         ; ecx = 8
+    pinsrw     xmm0, r2d, 5         ; 5 6 8 4 3 2 1 0
+    pinsrw     xmm1, r3d, 0         ; f e d c b 7 9 a
+    pshufd     xmm2, xmm0, 0xd8     ; 5 6 3 2 8 4 1 0
+    pshufd     xmm3, xmm1, 0xd8     ; f e b 7 d c 9 a
+    pshufhw    xmm0, xmm2, 0x93     ; 6 3 2 5 8 4 1 0
+    pshuflw    xmm1, xmm3, 0x39     ; f e b 7 a d c 9
+    movdqa     [r0],xmm0
+    movdqa     [r0+16], xmm1
+    %ifdef X86_32
+    pop r3
+    %endif
+    ret
 
 ;***********************************************************************
 ;void WelsScan4x4DcAc_ssse3( int16_t level[16], int16_t *pDct )
 ;***********************************************************************
 WELS_EXTERN WelsScan4x4DcAc_ssse3
-	%assign push_num 0
-	LOAD_2_PARA
-	movdqa     xmm0, [r1]
-	movdqa     xmm1, [r1+16]
-	pextrw		r2d,  xmm0, 7			; ecx = [7]
-	pextrw		r1d,  xmm1, 0			; eax = [8]
-	pinsrw		xmm0, r1d, 7			; xmm0[7]	=	[8]
-	pinsrw		xmm1, r2d, 0			; xmm1[0]	=	[7]
-	pshufb		xmm1, [pb_scanacdc_maskb]
-	pshufb		xmm0, [pb_scanacdc_maska]
+    %assign push_num 0
+    LOAD_2_PARA
+    movdqa     xmm0, [r1]
+    movdqa     xmm1, [r1+16]
+    pextrw      r2d,  xmm0, 7           ; ecx = [7]
+    pextrw      r1d,  xmm1, 0           ; eax = [8]
+    pinsrw      xmm0, r1d, 7            ; xmm0[7]   =   [8]
+    pinsrw      xmm1, r2d, 0            ; xmm1[0]   =   [7]
+    pshufb      xmm1, [pb_scanacdc_maskb]
+    pshufb      xmm0, [pb_scanacdc_maska]
 
-	movdqa     [r0],xmm0
-	movdqa     [r0+16], xmm1
-	ret
+    movdqa     [r0],xmm0
+    movdqa     [r0+16], xmm1
+    ret
 ;***********************************************************************
 ;void WelsScan4x4Ac_sse2( int16_t* zig_value, int16_t* pDct )
 ;***********************************************************************
 WELS_EXTERN WelsScan4x4Ac_sse2
-	%assign push_num 0
-	LOAD_2_PARA
-	movdqa     xmm0, [r1]
-	movdqa     xmm1, [r1+16]
-	movdqa     xmm2, xmm0
-	punpcklqdq xmm0, xmm1
-	punpckhqdq xmm2, xmm1
+    %assign push_num 0
+    LOAD_2_PARA
+    movdqa     xmm0, [r1]
+    movdqa     xmm1, [r1+16]
+    movdqa     xmm2, xmm0
+    punpcklqdq xmm0, xmm1
+    punpckhqdq xmm2, xmm1
 
-	movdqa     xmm3, xmm0
-	punpckldq  xmm0, xmm2
-	punpckhdq  xmm3, xmm2
-	pextrw     r1d , xmm0, 3
-	pextrw     r2d , xmm0, 7
-	pinsrw     xmm0, r1d,  7
-	pextrw     r1d,  xmm3, 4
-	pinsrw     xmm3, r2d,  4
-	pextrw     r2d,  xmm3, 0
-	pinsrw     xmm3, r1d,  0
-	pinsrw     xmm0, r2d,  3
+    movdqa     xmm3, xmm0
+    punpckldq  xmm0, xmm2
+    punpckhdq  xmm3, xmm2
+    pextrw     r1d , xmm0, 3
+    pextrw     r2d , xmm0, 7
+    pinsrw     xmm0, r1d,  7
+    pextrw     r1d,  xmm3, 4
+    pinsrw     xmm3, r2d,  4
+    pextrw     r2d,  xmm3, 0
+    pinsrw     xmm3, r1d,  0
+    pinsrw     xmm0, r2d,  3
 
-	pshufhw    xmm1, xmm0, 0x93
-	pshuflw    xmm2, xmm3, 0x39
+    pshufhw    xmm1, xmm0, 0x93
+    pshuflw    xmm2, xmm3, 0x39
 
     movdqa     xmm3, xmm2
     psrldq     xmm1, 2
@@ -245,9 +245,9 @@
     pslldq     xmm3, 14
     por        xmm1, xmm3
     psrldq     xmm2, 2
-	movdqa     [r0],xmm1
-	movdqa     [r0+16], xmm2
-	ret
+    movdqa     [r0],xmm1
+    movdqa     [r0+16], xmm2
+    ret
 
 
 ;***********************************************************************
@@ -254,19 +254,19 @@
 ;void int32_t WelsCalculateSingleCtr4x4_sse2( int16_t *pDct );
 ;***********************************************************************
 WELS_EXTERN WelsCalculateSingleCtr4x4_sse2
-	%ifdef X86_32
-	push r3
-	%assign push_num 1
-	%else
-	%assign push_num 0
-	%endif
-	LOAD_1_PARA
-	movdqa    xmm0, [r0]
-	movdqa    xmm1, [r0+16]
+    %ifdef X86_32
+    push r3
+    %assign push_num 1
+    %else
+    %assign push_num 0
+    %endif
+    LOAD_1_PARA
+    movdqa    xmm0, [r0]
+    movdqa    xmm1, [r0+16]
 
-	packsswb  xmm0, xmm1
-	; below is the register map: r0 - eax, r1 - ebx, r2 - ecx, r3 - edx
-	xor r3, r3
+    packsswb  xmm0, xmm1
+    ; below is the register map: r0 - eax, r1 - ebx, r2 - ecx, r3 - edx
+    xor r3, r3
     pxor      xmm3, xmm3
     pcmpeqb   xmm0, xmm3
     pmovmskb  r3d,  xmm0
@@ -273,39 +273,39 @@
 
     xor       r3,  0xffff
 
-	xor       r0,  r0
-	mov       r2,  7
-	mov       r1,  8
+    xor       r0,  r0
+    mov       r2,  7
+    mov       r1,  8
 .loop_low8_find1:
-	bt        r3,  r2
-	jc        .loop_high8_find1
-	dec		  r2
-	jnz      .loop_low8_find1
+    bt        r3,  r2
+    jc        .loop_high8_find1
+    dec       r2
+    jnz      .loop_low8_find1
 .loop_high8_find1:
-	bt        r3, r1
-	jc        .find1end
-	inc       r1
-	cmp       r1,16
-	jb        .loop_high8_find1
+    bt        r3, r1
+    jc        .find1end
+    inc       r1
+    cmp       r1,16
+    jb        .loop_high8_find1
 .find1end:
-	sub       r1, r2
-	sub       r1, 1
-	lea	  r2,  [i_ds_table]
-	add       r0b,  [r2+r1]
-	mov       r1, r3
-	and       r3, 0xff
-	shr       r1, 8
-	and       r1, 0xff
-	lea	  r2 , [low_mask_table]
-	add       r0b,  [r2 +r3]
-	lea	  r2, [high_mask_table]
-	add       r0b,  [r2+r1]
-	%ifdef X86_32
-	pop r3
-	%else
-	mov retrd, r0d
-	%endif
-	ret
+    sub       r1, r2
+    sub       r1, 1
+    lea   r2,  [i_ds_table]
+    add       r0b,  [r2+r1]
+    mov       r1, r3
+    and       r3, 0xff
+    shr       r1, 8
+    and       r1, 0xff
+    lea   r2 , [low_mask_table]
+    add       r0b,  [r2 +r3]
+    lea   r2, [high_mask_table]
+    add       r0b,  [r2+r1]
+    %ifdef X86_32
+    pop r3
+    %else
+    mov retrd, r0d
+    %endif
+    ret
 
 
 ;***********************************************************************
@@ -312,28 +312,28 @@
 ; int32_t WelsGetNoneZeroCount_sse2(int16_t* level);
 ;***********************************************************************
 WELS_EXTERN WelsGetNoneZeroCount_sse2
-	%assign push_num 0
-	LOAD_1_PARA
-	movdqa    xmm0, [r0]
-	movdqa    xmm1, [r0+16]
-	pxor      xmm2, xmm2
-	pcmpeqw   xmm0, xmm2
-	pcmpeqw   xmm1, xmm2
-	packsswb  xmm1, xmm0
-	xor r1, r1
-	pmovmskb  r1d,  xmm1
-	xor       r1d,  0xffff
-	mov       r2,  r1
-	and       r1,  0xff
-	shr       r2,  8
-;	and       ecx,  0xff	; we do not need this due to high 16bits equal to 0 yet
-;	xor       retr,  retr
-	;add       al,  [nozero_count_table+r2]
-	lea 	  r0 , [nozero_count_table]
-	movzx	  r2, byte [r0+r2]
-	movzx	  r1,   byte [r0+r1]
-	mov	  retrq, r2
-	add	  retrq, r1
-	;add       al,  [nozero_count_table+r1]
-	ret
+    %assign push_num 0
+    LOAD_1_PARA
+    movdqa    xmm0, [r0]
+    movdqa    xmm1, [r0+16]
+    pxor      xmm2, xmm2
+    pcmpeqw   xmm0, xmm2
+    pcmpeqw   xmm1, xmm2
+    packsswb  xmm1, xmm0
+    xor r1, r1
+    pmovmskb  r1d,  xmm1
+    xor       r1d,  0xffff
+    mov       r2,  r1
+    and       r1,  0xff
+    shr       r2,  8
+;   and       ecx,  0xff    ; we do not need this due to high 16bits equal to 0 yet
+;   xor       retr,  retr
+    ;add       al,  [nozero_count_table+r2]
+    lea       r0 , [nozero_count_table]
+    movzx     r2, byte [r0+r2]
+    movzx     r1,   byte [r0+r1]
+    mov   retrq, r2
+    add   retrq, r1
+    ;add       al,  [nozero_count_table+r1]
+    ret
 
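Note: the SIMD scan routines in the hunk above reorder the 16 coefficients of a 4x4 transform block into zig-zag order and count non-zero levels. A minimal scalar C sketch of those two operations is given below for orientation; it assumes the standard H.264 4x4 zig-zag table, and the names WelsScan4x4Ref / GetNoneZeroCountRef / kZigzag4x4 are illustrative only (not part of this patch), with the exact DC/AC handling of each SIMD variant left aside.

    #include <stdint.h>

    /* Standard H.264 zig-zag scan order for a 4x4 block (raster indices). */
    static const int kZigzag4x4[16] = {
        0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
    };

    /* Scalar reference: level[i] receives the i-th coefficient in scan order. */
    static void WelsScan4x4Ref (int16_t level[16], const int16_t pDct[16]) {
        for (int i = 0; i < 16; i++)
            level[i] = pDct[kZigzag4x4[i]];
    }

    /* Scalar reference for counting non-zero coefficients, in the spirit of
     * WelsGetNoneZeroCount_sse2 above. */
    static int GetNoneZeroCountRef (const int16_t level[16]) {
        int n = 0;
        for (int i = 0; i < 16; i++)
            n += (level[i] != 0);
        return n;
    }
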
--- a/codec/processing/src/arm/adaptive_quantization.S
+++ b/codec/processing/src/arm/adaptive_quantization.S
@@ -36,17 +36,17 @@
 
 #ifdef __APPLE__
 .macro SQR_ADD_16BYTES
-	vmull.u8 q3, $0, $0
-	vmull.u8 q8, $1, $1
-	vpadal.u16 $2, q3
-	vpadal.u16 $2, q8
+    vmull.u8 q3, $0, $0
+    vmull.u8 q8, $1, $1
+    vpadal.u16 $2, q3
+    vpadal.u16 $2, q8
 .endm
 #else
 .macro SQR_ADD_16BYTES arg0, arg1, arg2
-	vmull.u8 q3, \arg0, \arg0
-	vmull.u8 q8, \arg1, \arg1
-	vpadal.u16 \arg2, q3
-	vpadal.u16 \arg2, q8
+    vmull.u8 q3, \arg0, \arg0
+    vmull.u8 q8, \arg1, \arg1
+    vpadal.u16 \arg2, q3
+    vpadal.u16 \arg2, q8
 .endm
 #endif
 
@@ -54,66 +54,66 @@
 WELS_ASM_FUNC_BEGIN SampleVariance16x16_neon
     stmdb sp!, {r4}
 
-	vld1.8   {q15}, [r0], r1 //save the ref data (16bytes)
-	vld1.8   {q14}, [r2], r3 //save the src data (16bytes)
+    vld1.8   {q15}, [r0], r1 //save the ref data (16bytes)
+    vld1.8   {q14}, [r2], r3 //save the src data (16bytes)
 
 
-	vabd.u8  q13, q14, q15
-	vmull.u8 q12, d27, d27
-	vmull.u8 q11, d26, d26
-	vaddl.u16 q12, d24, d25
-	vpadal.u16 q12, q11     //sqr
+    vabd.u8  q13, q14, q15
+    vmull.u8 q12, d27, d27
+    vmull.u8 q11, d26, d26
+    vaddl.u16 q12, d24, d25
+    vpadal.u16 q12, q11     //sqr
 
     vaddl.u8 q13, d26, d27 //sum
 
-	vaddl.u8 q10, d28, d29 //sum_cur
+    vaddl.u8 q10, d28, d29 //sum_cur
 
-	vmull.u8 q9,  d29, d29
-	vmull.u8 q8,  d28, d28
-	vaddl.u16 q9, d18, d19       //sqr_cur
-	vpadal.u16 q9, q8
+    vmull.u8 q9,  d29, d29
+    vmull.u8 q8,  d28, d28
+    vaddl.u16 q9, d18, d19       //sqr_cur
+    vpadal.u16 q9, q8
 
-	mov r4, #15
+    mov r4, #15
 pixel_var_16x16_loop0:
 
-	vld1.8 {q0}, [r0], r1 //save the ref data (16bytes)
-	vld1.8 {q1}, [r2], r3 //save the src data (16bytes)
+    vld1.8 {q0}, [r0], r1 //save the ref data (16bytes)
+    vld1.8 {q1}, [r2], r3 //save the src data (16bytes)
 
-	vabd.u8 q2, q0, q1
+    vabd.u8 q2, q0, q1
 
-	//q10 save sum_cur
-	vpadal.u8 q10, q1
+    //q10 save sum_cur
+    vpadal.u8 q10, q1
 
-	//q12 save sqr
-	SQR_ADD_16BYTES d4, d5, q12
+    //q12 save sqr
+    SQR_ADD_16BYTES d4, d5, q12
 
     //q13 save sum
-	vpadal.u8 q13, q2
+    vpadal.u8 q13, q2
 
-	subs r4, #1
+    subs r4, #1
 
-	//q9 save sqr_cur
-	SQR_ADD_16BYTES d2, d3, q9
+    //q9 save sqr_cur
+    SQR_ADD_16BYTES d2, d3, q9
 
-	bne pixel_var_16x16_loop0
+    bne pixel_var_16x16_loop0
 
-	vadd.u16 d0, d26, d27 //sum
-	vadd.u16 d1, d20, d21 //sum_cur
-	vpaddl.u16 q0, q0
-	vadd.u32 d2, d24, d25 //sqr
-	vadd.u32 d3, d18, d19 //sqr_cur
-	vpadd.u32 d0, d0, d1
-	vpadd.u32 d1, d2, d3
+    vadd.u16 d0, d26, d27 //sum
+    vadd.u16 d1, d20, d21 //sum_cur
+    vpaddl.u16 q0, q0
+    vadd.u32 d2, d24, d25 //sqr
+    vadd.u32 d3, d18, d19 //sqr_cur
+    vpadd.u32 d0, d0, d1
+    vpadd.u32 d1, d2, d3
 
-	ldr       r4, [sp, #4]
+    ldr       r4, [sp, #4]
 
-	vshr.u32  q0, q0, #8
-	vmul.u32  d0, d0
-	vsub.u32  d0, d1, d0
+    vshr.u32  q0, q0, #8
+    vmul.u32  d0, d0
+    vsub.u32  d0, d1, d0
     vmovl.u32 q0, d0
-	vst2.16  {d0[0], d1[0]}, [r4]
+    vst2.16  {d0[0], d1[0]}, [r4]
 
-	ldmia sp!, {r4}
+    ldmia sp!, {r4}
 
 WELS_ASM_FUNC_END
 
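Note: SampleVariance16x16_neon above accumulates per-block sums and sums of squares. A loose scalar sketch of that kind of 16x16 sample-variance computation follows; the helper name, the exact output packing, and the omission of the abs-diff accumulation against the reference block are all assumptions made for illustration, not taken from the patch.

    #include <stdint.h>

    /* Rough scalar sketch: accumulate sum and sum of squares over a 16x16
     * block, then derive a variance-style value as sq_sum/N - (sum/N)^2
     * with N = 256.  Integer truncation only; the NEON routine above also
     * tracks the absolute difference against a reference block, which is
     * omitted here. */
    static uint32_t SampleVar16x16Ref (const uint8_t* pSrc, int iStride) {
        uint32_t sum = 0, sq_sum = 0;
        for (int y = 0; y < 16; y++) {
            for (int x = 0; x < 16; x++) {
                uint32_t v = pSrc[y * iStride + x];
                sum    += v;
                sq_sum += v * v;
            }
        }
        return sq_sum / 256 - (sum / 256) * (sum / 256);
    }
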
--- a/codec/processing/src/arm/down_sample_neon.S
+++ b/codec/processing/src/arm/down_sample_neon.S
@@ -30,196 +30,196 @@
  *
  */
 
-#ifdef	HAVE_NEON
+#ifdef  HAVE_NEON
 .text
 #include "arm_arch_common_macro.S"
 
 
-WELS_ASM_FUNC_BEGIN	DyadicBilinearDownsampler_neon
-	stmdb	sp!, {r4-r8, lr}
+WELS_ASM_FUNC_BEGIN DyadicBilinearDownsampler_neon
+    stmdb   sp!, {r4-r8, lr}
 
-	//Get	the	width	and	height
-	ldr	 r4, [sp,	#24]	//src_width
-	ldr	 r5, [sp,	#28]	//src_height
+    //Get   the width   and height
+    ldr  r4, [sp,   #24]    //src_width
+    ldr  r5, [sp,   #28]    //src_height
 
-	//Initialize the register
-	mov	r6,	r2
-	mov	r8,	r0
-	mov	lr,	#0
-	lsr	r5,	#1
+    //Initialize the register
+    mov r6, r2
+    mov r8, r0
+    mov lr, #0
+    lsr r5, #1
 
-	//Save the tailer	for	the	unasigned	size
-	mla	 r7, r1, r5, r0
-	vld1.32	{q15}, [r7]
+    //Save the tailer   for the unasigned   size
+    mla  r7, r1, r5, r0
+    vld1.32 {q15}, [r7]
 
-	add	r7,	r2,	r3
-	//processing a colume	data
+    add r7, r2, r3
+    //processing a colume   data
 comp_ds_bilinear_loop0:
 
-	vld1.8 {q0,q1},	[r2]!
-	vld1.8 {q2,q3},	[r7]!
-	vpaddl.u8	q0,	q0
-	vpaddl.u8	q1,	q1
-	vpaddl.u8	q2,	q2
-	vpaddl.u8	q3,	q3
-	vrshr.u16	q0,	#1
-	vrshr.u16	q1,	#1
-	vrshr.u16	q2,	#1
-	vrshr.u16	q3,	#1
-	vrhadd.u16 q0, q2
-	vrhadd.u16 q1, q3
-	vmovn.u16	d0,	q0
-	vmovn.u16	d1,	q1
-	vst1.32	{q0},	[r0]!
-	add	lr,	#32
+    vld1.8 {q0,q1}, [r2]!
+    vld1.8 {q2,q3}, [r7]!
+    vpaddl.u8   q0, q0
+    vpaddl.u8   q1, q1
+    vpaddl.u8   q2, q2
+    vpaddl.u8   q3, q3
+    vrshr.u16   q0, #1
+    vrshr.u16   q1, #1
+    vrshr.u16   q2, #1
+    vrshr.u16   q3, #1
+    vrhadd.u16 q0, q2
+    vrhadd.u16 q1, q3
+    vmovn.u16   d0, q0
+    vmovn.u16   d1, q1
+    vst1.32 {q0},   [r0]!
+    add lr, #32
 
-	cmp	lr,	r4
-	movcs	lr,	#0
-	addcs	r6,	r6,	r3,	lsl	#1
-	movcs	r2,	r6
-	addcs	r7,	r2,	r3
-	addcs	r8,	r1
-	movcs	r0,	r8
-	subscs r5, #1
-	bne	comp_ds_bilinear_loop0
+    cmp lr, r4
+    movcs   lr, #0
+    addcs   r6, r6, r3, lsl #1
+    movcs   r2, r6
+    addcs   r7, r2, r3
+    addcs   r8, r1
+    movcs   r0, r8
+    subscs r5, #1
+    bne comp_ds_bilinear_loop0
 
-	//restore	the	tailer for the unasigned size
-	vst1.32	{q15}, [r0]
+    //restore   the tailer for the unasigned size
+    vst1.32 {q15}, [r0]
 
-	ldmia	sp!, {r4-r8,lr}
+    ldmia   sp!, {r4-r8,lr}
 WELS_ASM_FUNC_END
 
 
-WELS_ASM_FUNC_BEGIN	comp_ds_bilinear_w_x8_neon
-    stmdb	sp!, {r4-r7, lr}
+WELS_ASM_FUNC_BEGIN comp_ds_bilinear_w_x8_neon
+    stmdb   sp!, {r4-r7, lr}
 
-    //Get	the	width	and	height
-	ldr	 r4, [sp,	#20]	//src_width
-	ldr	 r5, [sp,	#24]	//src_height
+    //Get   the width   and height
+    ldr  r4, [sp,   #20]    //src_width
+    ldr  r5, [sp,   #24]    //src_height
 
-	//Get	the	difference
-	sub	lr,	r3,	r4
-	sub	r1,	r1,	r4,	lsr	#1
+    //Get   the difference
+    sub lr, r3, r4
+    sub r1, r1, r4, lsr #1
 
-	lsr	r5,	#1
+    lsr r5, #1
 
-	//processing a colume	data
+    //processing a colume   data
 comp_ds_bilinear_w_x8_loop0:
 
-	lsr	r6,	r4,	#3
-	add	r7,	r2,	r3
-	//processing a line	data
+    lsr r6, r4, #3
+    add r7, r2, r3
+    //processing a line data
 comp_ds_bilinear_w_x8_loop1:
 
-	vld1.8 {d0}, [r2]!
-	vld1.8 {d1}, [r7]!
-	vpaddl.u8	q0,	q0
-	vrshr.u16	q0,	#1
-	vrhadd.u16 d0, d1
+    vld1.8 {d0}, [r2]!
+    vld1.8 {d1}, [r7]!
+    vpaddl.u8   q0, q0
+    vrshr.u16   q0, #1
+    vrhadd.u16 d0, d1
 
-	vmovn.u16	d0,	q0
-	vst1.32	{d0[0]}, [r0]!
-	subs r6, #1
-	bne	comp_ds_bilinear_w_x8_loop1
+    vmovn.u16   d0, q0
+    vst1.32 {d0[0]}, [r0]!
+    subs r6, #1
+    bne comp_ds_bilinear_w_x8_loop1
 
-	add	r2,	r7,	lr
-	add	r0,	r1
-	subs r5, #1
-	bne	comp_ds_bilinear_w_x8_loop0
+    add r2, r7, lr
+    add r0, r1
+    subs r5, #1
+    bne comp_ds_bilinear_w_x8_loop0
 
-    ldmia	sp!, {r4-r7,lr}
+    ldmia   sp!, {r4-r7,lr}
 WELS_ASM_FUNC_END
 
 
-WELS_ASM_FUNC_BEGIN	comp_ds_bilinear_w_x16_neon
-    stmdb	sp!, {r4-r7, lr}
+WELS_ASM_FUNC_BEGIN comp_ds_bilinear_w_x16_neon
+    stmdb   sp!, {r4-r7, lr}
 
-    //Get	the	width	and	height
-	ldr	 r4, [sp,	#20]	//src_width
-	ldr	 r5, [sp,	#24]	//src_height
+    //Get   the width   and height
+    ldr  r4, [sp,   #20]    //src_width
+    ldr  r5, [sp,   #24]    //src_height
 
-	//Get	the	difference
-	sub	lr,	r3,	r4
-	sub	r1,	r1,	r4,	lsr	#1
+    //Get   the difference
+    sub lr, r3, r4
+    sub r1, r1, r4, lsr #1
 
-	lsr	r5,	#1
+    lsr r5, #1
 
-	//processing a colume	data
+    //processing a colume   data
 comp_ds_bilinear_w_x16_loop0:
 
-	lsr	r6,	r4,	#4
-	add	r7,	r2,	r3
-	//processing a line	data
+    lsr r6, r4, #4
+    add r7, r2, r3
+    //processing a line data
 comp_ds_bilinear_w_x16_loop1:
 
-	vld1.8 {q0}, [r2]!
-	vld1.8 {q1}, [r7]!
-	vpaddl.u8	q0,	q0
-	vpaddl.u8	q1,	q1
-	vrshr.u16	q0,	#1
-	vrshr.u16	q1,	#1
-	vrhadd.u16 q0, q1
+    vld1.8 {q0}, [r2]!
+    vld1.8 {q1}, [r7]!
+    vpaddl.u8   q0, q0
+    vpaddl.u8   q1, q1
+    vrshr.u16   q0, #1
+    vrshr.u16   q1, #1
+    vrhadd.u16 q0, q1
 
-	vmovn.u16	d0,	q0
-	vst1.32	{d0},	[r0]!
-	subs r6, #1
-	bne	comp_ds_bilinear_w_x16_loop1
+    vmovn.u16   d0, q0
+    vst1.32 {d0},   [r0]!
+    subs r6, #1
+    bne comp_ds_bilinear_w_x16_loop1
 
-	add	r2,	r7,	lr
-	add	r0,	r1
-	subs r5, #1
-	bne	comp_ds_bilinear_w_x16_loop0
+    add r2, r7, lr
+    add r0, r1
+    subs r5, #1
+    bne comp_ds_bilinear_w_x16_loop0
 
-	ldmia	sp!, {r4-r7,lr}
+    ldmia   sp!, {r4-r7,lr}
 WELS_ASM_FUNC_END
 
 
-WELS_ASM_FUNC_BEGIN	DyadicBilinearDownsamplerWidthx32_neon
-	stmdb	sp!, {r4-r7, lr}
+WELS_ASM_FUNC_BEGIN DyadicBilinearDownsamplerWidthx32_neon
+    stmdb   sp!, {r4-r7, lr}
 
-	//Get	the	width	and	height
-	ldr	 r4, [sp,	#20]	//src_width
-	ldr	 r5, [sp,	#24]	//src_height
+    //Get   the width   and height
+    ldr  r4, [sp,   #20]    //src_width
+    ldr  r5, [sp,   #24]    //src_height
 
-	//Get	the	difference
-	sub	lr,	r3,	r4
-	sub	r1,	r1,	r4,	lsr	#1
+    //Get   the difference
+    sub lr, r3, r4
+    sub r1, r1, r4, lsr #1
 
-	lsr	r5,	#1
+    lsr r5, #1
 
-	//processing a colume	data
+    //processing a colume   data
 comp_ds_bilinear_w_x32_loop0:
 
-	lsr	r6,	r4,	#5
-	add	r7,	r2,	r3
-	//processing a line	data
+    lsr r6, r4, #5
+    add r7, r2, r3
+    //processing a line data
 comp_ds_bilinear_w_x32_loop1:
 
-	vld1.8 {q0,q1},	[r2]!
-	vld1.8 {q2,q3},	[r7]!
-	vpaddl.u8	q0,	q0
-	vpaddl.u8	q1,	q1
-	vpaddl.u8	q2,	q2
-	vpaddl.u8	q3,	q3
-	vrshr.u16	q0,	#1
-	vrshr.u16	q1,	#1
-	vrshr.u16	q2,	#1
-	vrshr.u16	q3,	#1
-	vrhadd.u16 q0, q2
-	vrhadd.u16 q1, q3
+    vld1.8 {q0,q1}, [r2]!
+    vld1.8 {q2,q3}, [r7]!
+    vpaddl.u8   q0, q0
+    vpaddl.u8   q1, q1
+    vpaddl.u8   q2, q2
+    vpaddl.u8   q3, q3
+    vrshr.u16   q0, #1
+    vrshr.u16   q1, #1
+    vrshr.u16   q2, #1
+    vrshr.u16   q3, #1
+    vrhadd.u16 q0, q2
+    vrhadd.u16 q1, q3
 
-	vmovn.u16	d0,	q0
-	vmovn.u16	d1,	q1
-	vst1.32	{q0},	[r0]!
-	subs r6, #1
-	bne	comp_ds_bilinear_w_x32_loop1
+    vmovn.u16   d0, q0
+    vmovn.u16   d1, q1
+    vst1.32 {q0},   [r0]!
+    subs r6, #1
+    bne comp_ds_bilinear_w_x32_loop1
 
-	add	r2,	r7,	lr
-	add	r0,	r1
-	subs r5, #1
-	bne	comp_ds_bilinear_w_x32_loop0
+    add r2, r7, lr
+    add r0, r1
+    subs r5, #1
+    bne comp_ds_bilinear_w_x32_loop0
 
-	ldmia	sp!, {r4-r7,lr}
+    ldmia   sp!, {r4-r7,lr}
 WELS_ASM_FUNC_END
 
 
@@ -226,117 +226,117 @@
 WELS_ASM_FUNC_BEGIN GeneralBilinearAccurateDownsampler_neon
     stmdb sp!, {r4-r12, lr}
 
-	//Get the data from stack
-	ldr r4, [sp, #40] //the addr of src
-	ldr r5, [sp, #44] //the value of src_stride
+    //Get the data from stack
+    ldr r4, [sp, #40] //the addr of src
+    ldr r5, [sp, #44] //the value of src_stride
     ldr r6, [sp, #48] //the value of scaleX
     ldr r7, [sp, #52] //the value of scaleY
 
     mov     r10, #32768
     sub     r10, #1
-    and		r8, r6, r10			// r8 uinc(scaleX mod 32767)
+    and     r8, r6, r10         // r8 uinc(scaleX mod 32767)
     mov     r11, #-1
-	mul		r11, r8			// r11 -uinc
+    mul     r11, r8         // r11 -uinc
 
     vdup.s16 d2, r8
     vdup.s16 d0, r11
     vzip.s16 d0, d2         // uinc -uinc uinc -uinc
 
-	and		r9, r7, r10			// r9 vinc(scaleY mod 32767)
+    and     r9, r7, r10         // r9 vinc(scaleY mod 32767)
     mov     r11, #-1
-	mul		r11, r9			// r11 -vinc
+    mul     r11, r9         // r11 -vinc
 
-	vdup.s16 d2, r9
-	vdup.s16 d3, r11
-	vext.8   d5, d3, d2, #4		// vinc vinc -vinc -vinc
+    vdup.s16 d2, r9
+    vdup.s16 d3, r11
+    vext.8   d5, d3, d2, #4     // vinc vinc -vinc -vinc
 
-    mov		 r11, #0x40000000
+    mov      r11, #0x40000000
     mov      r12, #0x4000
     sub      r12, #1
     add      r11, r12
-	vdup.s32 d1, r11;			//init u  16384 16383 16384 16383
+    vdup.s32 d1, r11;           //init u  16384 16383 16384 16383
 
-	mov		 r11, #16384
+    mov      r11, #16384
     vdup.s16 d16, r11
     sub      r11, #1
-	vdup.s16 d17, r11
-	vext.8	 d7, d17, d16, #4		//init v  16384 16384 16383 16383
+    vdup.s16 d17, r11
+    vext.8   d7, d17, d16, #4       //init v  16384 16384 16383 16383
 
-	veor    q14,     q14
-	sub		r1,		r2			// stride - width
-	mov		r8,		#16384		// yInverse
-	sub		r3,		#1
+    veor    q14,     q14
+    sub     r1,     r2          // stride - width
+    mov     r8,     #16384      // yInverse
+    sub     r3,     #1
 
 _HEIGHT:
     ldr     r4, [sp, #40]           //the addr of src
-    mov		r11,	r8
-    lsr		r11,	#15
-	mul		r11,	r5
-	add		r11,	r4					// get current row address
-	mov		r12,	r11
-	add		r12,	r5
+    mov     r11,    r8
+    lsr     r11,    #15
+    mul     r11,    r5
+    add     r11,    r4                  // get current row address
+    mov     r12,    r11
+    add     r12,    r5
 
-	mov		r9,		#16384				// xInverse
-	sub		r10, r2, #1
+    mov     r9,     #16384              // xInverse
+    sub     r10, r2, #1
     vmov.s16 d6, d1
 
 _WIDTH:
-	mov		lr,		r9
-    lsr		lr,		#15
+    mov     lr,     r9
+    lsr     lr,     #15
     add     r4,     r11,lr
-	vld2.8	{d28[0],d29[0]},	[r4]		//q14: 0000000b0000000a;
+    vld2.8  {d28[0],d29[0]},    [r4]        //q14: 0000000b0000000a;
     add     r4,     r12,lr
-	vld2.8	{d28[4],d29[4]},	[r4]		//q14: 000d000b000c000a;
-	vzip.32		d28, d29					//q14: 000d000c000b000a;
+    vld2.8  {d28[4],d29[4]},    [r4]        //q14: 000d000b000c000a;
+    vzip.32     d28, d29                    //q14: 000d000c000b000a;
 
-	vmull.u16	q13, d6, d7			//q13: init u  *  init  v
-	vmull.u32	q12, d26,d28
-	vmlal.u32	q12, d27,d29
-	vqadd.u64	d24, d24,d25
-	vrshr.u64	d24, #30
+    vmull.u16   q13, d6, d7         //q13: init u  *  init  v
+    vmull.u32   q12, d26,d28
+    vmlal.u32   q12, d27,d29
+    vqadd.u64   d24, d24,d25
+    vrshr.u64   d24, #30
 
-	vst1.8	{d24[0]},	[r0]!
-	add		r9,	r6
-	vadd.u16	d6, d0				// inc u
-	vshl.u16	d6, #1
-	vshr.u16	d6, #1
-	subs	r10, #1
-	bne		_WIDTH
+    vst1.8  {d24[0]},   [r0]!
+    add     r9, r6
+    vadd.u16    d6, d0              // inc u
+    vshl.u16    d6, #1
+    vshr.u16    d6, #1
+    subs    r10, #1
+    bne     _WIDTH
 
 WIDTH_END:
-    lsr		r9,		#15
+    lsr     r9,     #15
     add     r4,r11,r9
-	vld1.8	{d24[0]},	[r4]
-	vst1.8	{d24[0]},   [r0]
-	add		r0,		#1
-	add		r8,		r7
-	add		r0,		r1
-	vadd.s16	d7,	d5				// inc v
-	vshl.u16	d7, #1
-	vshr.u16	d7, #1
-	subs	r3,		#1
-	bne		_HEIGHT
+    vld1.8  {d24[0]},   [r4]
+    vst1.8  {d24[0]},   [r0]
+    add     r0,     #1
+    add     r8,     r7
+    add     r0,     r1
+    vadd.s16    d7, d5              // inc v
+    vshl.u16    d7, #1
+    vshr.u16    d7, #1
+    subs    r3,     #1
+    bne     _HEIGHT
 
 LAST_ROW:
     ldr     r4, [sp, #40]           //the addr of src
-    lsr		r8,	#15
-	mul		r8, r5
-	add		r4,	r8					// get current row address
-	mov		r9,		#16384
+    lsr     r8, #15
+    mul     r8, r5
+    add     r4, r8                  // get current row address
+    mov     r9,     #16384
 
 _LAST_ROW_WIDTH:
-	mov		r11,	r9
-    lsr		r11,	#15
+    mov     r11,    r9
+    lsr     r11,    #15
 
-	add     r3,     r4,r11
-	vld1.8	{d0[0]},	[r3]
-	vst1.8	{d0[0]},	[r0]
-	add		r0,		#1
-	add		r9,		r6
-	subs	r2,		#1
-	bne		_LAST_ROW_WIDTH
+    add     r3,     r4,r11
+    vld1.8  {d0[0]},    [r3]
+    vst1.8  {d0[0]},    [r0]
+    add     r0,     #1
+    add     r9,     r6
+    subs    r2,     #1
+    bne     _LAST_ROW_WIDTH
 
-	ldmia sp!, {r4-r12, lr}
+    ldmia sp!, {r4-r12, lr}
 WELS_ASM_FUNC_END
 
 #endif
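Note: the dyadic downsamplers above halve width and height by averaging each 2x2 block of the source with rounding. A scalar C sketch of that operation follows; the function name is illustrative, and the rounding here is a single (sum + 2) >> 2 per output pixel, whereas the NEON sequence (vpaddl / vrshr / vrhadd) rounds at each intermediate step, so results can differ by one in the last bit.

    #include <stdint.h>

    /* Scalar reference for 2:1 dyadic bilinear downsampling: every output
     * pixel is the rounded mean of a 2x2 block of the source. */
    static void DyadicBilinearDownsampleRef (uint8_t* pDst, int iDstStride,
                                             const uint8_t* pSrc, int iSrcStride,
                                             int iSrcWidth, int iSrcHeight) {
        for (int y = 0; y < iSrcHeight / 2; y++) {
            const uint8_t* pRow0 = pSrc + 2 * y * iSrcStride;
            const uint8_t* pRow1 = pRow0 + iSrcStride;
            for (int x = 0; x < iSrcWidth / 2; x++) {
                pDst[y * iDstStride + x] = (uint8_t)
                    ((pRow0[2 * x] + pRow0[2 * x + 1] +
                      pRow1[2 * x] + pRow1[2 * x + 1] + 2) >> 2);
            }
        }
    }
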
--- a/codec/processing/src/arm/pixel_sad_neon.S
+++ b/codec/processing/src/arm/pixel_sad_neon.S
@@ -37,32 +37,32 @@
 
 WELS_ASM_FUNC_BEGIN WelsProcessingSampleSad8x8_neon
     stmdb sp!, {lr}
-	//Loading a horizontal line data (8 bytes)
-	vld1.8 {d0}, [r0], r1
-	vld1.8 {d1}, [r2], r3
+    //Loading a horizontal line data (8 bytes)
+    vld1.8 {d0}, [r0], r1
+    vld1.8 {d1}, [r2], r3
 
-	//Do the SAD for 8 bytes
-	vabdl.u8  q1, d0, d1
+    //Do the SAD for 8 bytes
+    vabdl.u8  q1, d0, d1
 
-	mov lr, #7
+    mov lr, #7
 pixel_sad_8x8_loop0:
 
     //Loading a horizontal line data (8 bytes)
-	vld1.8 {d0}, [r0], r1
-	vld1.8 {d1}, [r2], r3
+    vld1.8 {d0}, [r0], r1
+    vld1.8 {d1}, [r2], r3
 
-	subs lr, #1
+    subs lr, #1
 
-	//Do the SAD for 8 bytes
-	vabal.u8  q1, d0, d1
-	bne pixel_sad_8x8_loop0
+    //Do the SAD for 8 bytes
+    vabal.u8  q1, d0, d1
+    bne pixel_sad_8x8_loop0
 
-	vadd.u16   d2, d3
-	vpaddl.u16 d2, d2
-	vpaddl.u32 d2, d2
-	vmov.u32   r0, d2[0]//TBO...
+    vadd.u16   d2, d3
+    vpaddl.u16 d2, d2
+    vpaddl.u32 d2, d2
+    vmov.u32   r0, d2[0]//TBO...
 
-	ldmia sp!, {lr}
+    ldmia sp!, {lr}
 WELS_ASM_FUNC_END
 
 #endif
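Note: WelsProcessingSampleSad8x8_neon above is a plain 8x8 sum of absolute differences between two strided blocks. The scalar equivalent is sketched below; the reference name is illustrative only.

    #include <stdint.h>
    #include <stdlib.h>

    /* Scalar reference for an 8x8 SAD between two strided blocks. */
    static int32_t Sad8x8Ref (const uint8_t* pSrc, int iSrcStride,
                              const uint8_t* pRef, int iRefStride) {
        int32_t iSad = 0;
        for (int y = 0; y < 8; y++) {
            for (int x = 0; x < 8; x++)
                iSad += abs (pSrc[y * iSrcStride + x] - pRef[y * iRefStride + x]);
        }
        return iSad;
    }
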
--- a/codec/processing/src/arm/vaa_calc_neon.S
+++ b/codec/processing/src/arm/vaa_calc_neon.S
@@ -37,61 +37,61 @@
 #ifdef __APPLE__
 
 .macro ABS_SUB_SUM_16BYTES
-	vld1.32 {q15}, [$0], $2
-	vld1.32 {q14}, [$1], $2
-	vabal.u8 $3, d30, d28
-	vabal.u8 $4, d31, d29
+    vld1.32 {q15}, [$0], $2
+    vld1.32 {q14}, [$1], $2
+    vabal.u8 $3, d30, d28
+    vabal.u8 $4, d31, d29
 .endm
 
 .macro ABS_SUB_SUM_8x16BYTES
-	vld1.32 {q15}, [$0], $2
-	vld1.32 {q14}, [$1], $2
-	vabdl.u8 $3, d30, d28
-	vabdl.u8 $4, d31, d29
+    vld1.32 {q15}, [$0], $2
+    vld1.32 {q14}, [$1], $2
+    vabdl.u8 $3, d30, d28
+    vabdl.u8 $4, d31, d29
 
-	ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
-	ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
-	ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
-	ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
-	ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
-	ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
-	ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
+    ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
+    ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
+    ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
+    ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
+    ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
+    ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
+    ABS_SUB_SUM_16BYTES $0, $1, $2, $3, $4
 .endm
 
 .macro SAD_8X16BITS
-	vadd.u16 d31, $0, $1
-	vpaddl.u16 d31, d31
-	vpaddl.u32 $2, d31
+    vadd.u16 d31, $0, $1
+    vpaddl.u16 d31, d31
+    vpaddl.u32 $2, d31
 .endm
 
 #else
 
 .macro ABS_SUB_SUM_16BYTES arg0, arg1, arg2, arg3, arg4
-	vld1.32 {q15}, [\arg0], \arg2
-	vld1.32 {q14}, [\arg1], \arg2
-	vabal.u8 \arg3, d30, d28
-	vabal.u8 \arg4, d31, d29
+    vld1.32 {q15}, [\arg0], \arg2
+    vld1.32 {q14}, [\arg1], \arg2
+    vabal.u8 \arg3, d30, d28
+    vabal.u8 \arg4, d31, d29
 .endm
 
 .macro ABS_SUB_SUM_8x16BYTES arg0, arg1, arg2, arg3, arg4
-	vld1.32 {q15}, [\arg0], \arg2
-	vld1.32 {q14}, [\arg1], \arg2
-	vabdl.u8 \arg3, d30, d28
-	vabdl.u8 \arg4, d31, d29
+    vld1.32 {q15}, [\arg0], \arg2
+    vld1.32 {q14}, [\arg1], \arg2
+    vabdl.u8 \arg3, d30, d28
+    vabdl.u8 \arg4, d31, d29
 
-	ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
-	ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
-	ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
-	ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
-	ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
-	ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
-	ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
+    ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
+    ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
+    ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
+    ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
+    ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
+    ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
+    ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
 .endm
 
 .macro SAD_8X16BITS arg0, arg1, arg2
-	vadd.u16 d31, \arg0, \arg1
-	vpaddl.u16 d31, d31
-	vpaddl.u32 \arg2, d31
+    vadd.u16 d31, \arg0, \arg1
+    vpaddl.u16 d31, d31
+    vpaddl.u32 \arg2, d31
 .endm
 #endif
 
@@ -100,16 +100,16 @@
 
     stmdb sp!, {r4-r8}
 
-	ldr r4, [sp, #20] //load pic_stride
-	ldr r5, [sp, #28] //load psad8x8
+    ldr r4, [sp, #20] //load pic_stride
+    ldr r5, [sp, #28] //load psad8x8
 
-	//Initial the Q8 register for save the "psadframe"
-	vmov.s64 q8, #0
+    //Initial the Q8 register for save the "psadframe"
+    vmov.s64 q8, #0
 
-	//Get the jump distance to use on loop codes
-	lsl r8, r4, #4
-	sub r7, r8, #16 //R7 keep the 16*pic_stride-16
-	sub r8, r2      //R8 keep the 16*pic_stride-pic_width
+    //Get the jump distance to use on loop codes
+    lsl r8, r4, #4
+    sub r7, r8, #16 //R7 keep the 16*pic_stride-16
+    sub r8, r2      //R8 keep the 16*pic_stride-pic_width
 
 vaa_calc_sad_loop0:
 
@@ -118,70 +118,70 @@
 
 vaa_calc_sad_loop1:
 
-	//Process the 16x16 bytes
-	ABS_SUB_SUM_8x16BYTES r0, r1, r4, q0, q1
-	ABS_SUB_SUM_8x16BYTES r0, r1, r4, q2, q3
+    //Process the 16x16 bytes
+    ABS_SUB_SUM_8x16BYTES r0, r1, r4, q0, q1
+    ABS_SUB_SUM_8x16BYTES r0, r1, r4, q2, q3
 
-	//Do the SAD
-	SAD_8X16BITS d0, d1, d0
-	SAD_8X16BITS d2, d3, d1
-	SAD_8X16BITS d4, d5, d2
-	SAD_8X16BITS d6, d7, d3
+    //Do the SAD
+    SAD_8X16BITS d0, d1, d0
+    SAD_8X16BITS d2, d3, d1
+    SAD_8X16BITS d4, d5, d2
+    SAD_8X16BITS d6, d7, d3
 
-	//Write to "psad8x8" buffer
-	vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r5]!
+    //Write to "psad8x8" buffer
+    vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r5]!
 
 
-	//Adjust the input address
-	sub r0, r7
-	sub r1, r7
+    //Adjust the input address
+    sub r0, r7
+    sub r1, r7
 
-	subs r6, #16
+    subs r6, #16
 
-	//Save to calculate "psadframe"
-	vadd.u32 q0, q1
-	vadd.u32 q8, q0
+    //Save to calculate "psadframe"
+    vadd.u32 q0, q1
+    vadd.u32 q8, q0
 
-	bne vaa_calc_sad_loop1
+    bne vaa_calc_sad_loop1
 
-	//Adjust the input address
-	add r0, r8
-	add r1, r8
+    //Adjust the input address
+    add r0, r8
+    add r1, r8
 
     subs r3, #16
-	bne vaa_calc_sad_loop0
+    bne vaa_calc_sad_loop0
 
-	ldr r6, [sp, #24] //load psadframe
-	vadd.u32 d16, d17
-	vst1.32 {d16[0]}, [r6]
+    ldr r6, [sp, #24] //load psadframe
+    vadd.u32 d16, d17
+    vst1.32 {d16[0]}, [r6]
 
-	ldmia sp!, {r4-r8}
+    ldmia sp!, {r4-r8}
 
 WELS_ASM_FUNC_END
 
 
 #ifdef __APPLE__
-.macro  SAD_SD_MAD_16BYTES
-	vld1.32 {q0}, [$0], $2
-	vld1.32 {q1}, [$1], $2
+.macro SAD_SD_MAD_16BYTES
+    vld1.32 {q0}, [$0], $2
+    vld1.32 {q1}, [$1], $2
 
-	vpadal.u8 $3, q0
-	vpadal.u8 $4, q1
+    vpadal.u8 $3, q0
+    vpadal.u8 $4, q1
 
-	vabd.u8 q0, q0, q1
-	vmax.u8 $5, q0
-	vpadal.u8 $6, q0
+    vabd.u8 q0, q0, q1
+    vmax.u8 $5, q0
+    vpadal.u8 $6, q0
 .endm
 
-.macro  SAD_SD_MAD_8x16BYTES
-	vld1.32 {q0}, [$0], $2
-	vld1.32 {q1}, [$1], $2
+.macro SAD_SD_MAD_8x16BYTES
+    vld1.32 {q0}, [$0], $2
+    vld1.32 {q1}, [$1], $2
 
-	vpaddl.u8 q2, q0
-	vpaddl.u8 q3, q1
+    vpaddl.u8 q2, q0
+    vpaddl.u8 q3, q1
 
-	vabd.u8 $3, q0, q1
-	vpaddl.u8 $4, $3       //abs_diff
+    vabd.u8 $3, q0, q1
+    vpaddl.u8 $4, $3       //abs_diff
 
 
     SAD_SD_MAD_16BYTES $0,$1,$2,q2,q3,$3,$4
@@ -192,41 +192,41 @@
     SAD_SD_MAD_16BYTES $0,$1,$2,q2,q3,$3,$4
     SAD_SD_MAD_16BYTES $0,$1,$2,q2,q3,$3,$4
 
-	vsub.u16 $5, q2, q3
+    vsub.u16 $5, q2, q3
 .endm
 
-.macro  SAD_SD_MAD_CALC
-	vpmax.u8 d0, $0, $1 //8bytes
-	vpmax.u8 d0, d0, d0 //4bytes
-	vpmax.u8 $2, d0, d0 //2bytes
+.macro SAD_SD_MAD_CALC
+    vpmax.u8 d0, $0, $1 //8bytes
+    vpmax.u8 d0, d0, d0 //4bytes
+    vpmax.u8 $2, d0, d0 //2bytes
 
-	vpaddl.u16 $3, $3
-	vpaddl.u32 $3, $3
-	vpaddl.s16 $4, $4
-	vpaddl.s32 $4, $4
+    vpaddl.u16 $3, $3
+    vpaddl.u32 $3, $3
+    vpaddl.s16 $4, $4
+    vpaddl.s32 $4, $4
 .endm
 #else
-.macro  SAD_SD_MAD_16BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6
-	vld1.32 {q0}, [\arg0], \arg2
-	vld1.32 {q1}, [\arg1], \arg2
+.macro SAD_SD_MAD_16BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6
+    vld1.32 {q0}, [\arg0], \arg2
+    vld1.32 {q1}, [\arg1], \arg2
 
-	vpadal.u8 \arg3, q0
-	vpadal.u8 \arg4, q1
+    vpadal.u8 \arg3, q0
+    vpadal.u8 \arg4, q1
 
-	vabd.u8 q0, q0, q1
-	vmax.u8 \arg5, q0
-	vpadal.u8 \arg6, q0
+    vabd.u8 q0, q0, q1
+    vmax.u8 \arg5, q0
+    vpadal.u8 \arg6, q0
 .endm
 
-.macro  SAD_SD_MAD_8x16BYTES arg0, arg1, arg2, arg3, arg4, arg5
-	vld1.32 {q0}, [\arg0], \arg2
-	vld1.32 {q1}, [\arg1], \arg2
+.macro SAD_SD_MAD_8x16BYTES arg0, arg1, arg2, arg3, arg4, arg5
+    vld1.32 {q0}, [\arg0], \arg2
+    vld1.32 {q1}, [\arg1], \arg2
 
-	vpaddl.u8 q2, q0
-	vpaddl.u8 q3, q1
+    vpaddl.u8 q2, q0
+    vpaddl.u8 q3, q1
 
-	vabd.u8 \arg3, q0, q1
-	vpaddl.u8 \arg4, \arg3       //abs_diff
+    vabd.u8 \arg3, q0, q1
+    vpaddl.u8 \arg4, \arg3       //abs_diff
 
 
     SAD_SD_MAD_16BYTES \arg0,\arg1,\arg2,q2,q3,\arg3,\arg4
@@ -237,18 +237,18 @@
     SAD_SD_MAD_16BYTES \arg0,\arg1,\arg2,q2,q3,\arg3,\arg4
     SAD_SD_MAD_16BYTES \arg0,\arg1,\arg2,q2,q3,\arg3,\arg4
 
-	vsub.u16 \arg5, q2, q3
+    vsub.u16 \arg5, q2, q3
 .endm
 
-.macro  SAD_SD_MAD_CALC arg0, arg1, arg2, arg3, arg4
-	vpmax.u8 d0, \arg0, \arg1 //8bytes
-	vpmax.u8 d0, d0, d0 //4bytes
-	vpmax.u8 \arg2, d0, d0 //2bytes
+.macro SAD_SD_MAD_CALC arg0, arg1, arg2, arg3, arg4
+    vpmax.u8 d0, \arg0, \arg1 //8bytes
+    vpmax.u8 d0, d0, d0 //4bytes
+    vpmax.u8 \arg2, d0, d0 //2bytes
 
-	vpaddl.u16 \arg3, \arg3
-	vpaddl.u32 \arg3, \arg3
-	vpaddl.s16 \arg4, \arg4
-	vpaddl.s32 \arg4, \arg4
+    vpaddl.u16 \arg3, \arg3
+    vpaddl.u32 \arg3, \arg3
+    vpaddl.s16 \arg4, \arg4
+    vpaddl.s32 \arg4, \arg4
 .endm
 #endif
 
@@ -256,18 +256,18 @@
 
     stmdb sp!, {r4-r10}
 
-	ldr r4, [sp, #28] //load pic_stride
-	ldr r5, [sp, #36] //load psad8x8
+    ldr r4, [sp, #28] //load pic_stride
+    ldr r5, [sp, #36] //load psad8x8
     ldr r6, [sp, #40] //load psd8x8
     ldr r7, [sp, #44] //load pmad8x8
 
-	//Initial the Q4 register for save the "psadframe"
-	vmov.s64 q15, #0
+    //Initial the Q4 register for save the "psadframe"
+    vmov.s64 q15, #0
 
-	//Get the jump distance to use on loop codes
-	lsl r10, r4, #4
-	sub r9, r10, #16 //R9 keep the 16*pic_stride-16
-	sub r10, r2      //R10 keep the 16*pic_stride-pic_width
+    //Get the jump distance to use on loop codes
+    lsl r10, r4, #4
+    sub r9, r10, #16 //R9 keep the 16*pic_stride-16
+    sub r10, r2      //R10 keep the 16*pic_stride-pic_width
 
 vaa_calc_sad_bgd_loop0:
 
@@ -276,384 +276,384 @@
 
 vaa_calc_sad_bgd_loop1:
 
-	//Process the 16x16 bytes        pmad psad psd
-	SAD_SD_MAD_8x16BYTES r0, r1, r4, q13, q11, q9
-	SAD_SD_MAD_8x16BYTES r0, r1, r4, q14, q12, q10
+    //Process the 16x16 bytes        pmad psad psd
+    SAD_SD_MAD_8x16BYTES r0, r1, r4, q13, q11, q9
+    SAD_SD_MAD_8x16BYTES r0, r1, r4, q14, q12, q10
 
     SAD_SD_MAD_CALC d26, d27, d16, q11, q9
     SAD_SD_MAD_CALC d28, d29, d17, q12, q10
 
-	//Write to "psad8x8" buffer
-	vst4.32 {d22[0],d23[0],d24[0],d25[0]}, [r5]!
-	//Adjust the input address
-	sub r0, r9
-	sub r1, r9
-	//Write to "psd8x8" buffer
-	vst4.32 {d18[0],d19[0],d20[0],d21[0]}, [r6]!
-	subs r8, #16
-	//Write to "pmad8x8" buffer
-	vst2.16 {d16[0],d17[0]}, [r7]!
-	//Save to calculate "psadframe"
-	vadd.u32 q11, q12
-	vadd.u32 q15, q11
+    //Write to "psad8x8" buffer
+    vst4.32 {d22[0],d23[0],d24[0],d25[0]}, [r5]!
+    //Adjust the input address
+    sub r0, r9
+    sub r1, r9
+    //Write to "psd8x8" buffer
+    vst4.32 {d18[0],d19[0],d20[0],d21[0]}, [r6]!
+    subs r8, #16
+    //Write to "pmad8x8" buffer
+    vst2.16 {d16[0],d17[0]}, [r7]!
+    //Save to calculate "psadframe"
+    vadd.u32 q11, q12
+    vadd.u32 q15, q11
 
-	bne vaa_calc_sad_bgd_loop1
+    bne vaa_calc_sad_bgd_loop1
 
-	//Adjust the input address
-	add r0, r10
-	add r1, r10
+    //Adjust the input address
+    add r0, r10
+    add r1, r10
 
     subs r3, #16
-	bne vaa_calc_sad_bgd_loop0
+    bne vaa_calc_sad_bgd_loop0
 
-	ldr r8, [sp, #32] //load psadframe
-	vadd.u32 d30, d31
-	vst1.32 {d30[0]}, [r8]
-	ldmia sp!, {r4-r10}
+    ldr r8, [sp, #32] //load psadframe
+    vadd.u32 d30, d31
+    vst1.32 {d30[0]}, [r8]
+    ldmia sp!, {r4-r10}
 
 WELS_ASM_FUNC_END
 
 
 #ifdef __APPLE__
-.macro  SSD_MUL_SUM_16BYTES_RESET
-	vmull.u8 $3, $0, $0
-	vpaddl.u16 $2, $3
+.macro SSD_MUL_SUM_16BYTES_RESET
+    vmull.u8 $3, $0, $0
+    vpaddl.u16 $2, $3
 
-	vmull.u8 $3, $1, $1
-	vpadal.u16 $2, $3
+    vmull.u8 $3, $1, $1
+    vpadal.u16 $2, $3
 .endm
 
-.macro  SSD_MUL_SUM_16BYTES
-	vmull.u8 $3, $0, $0
-	vpadal.u16 $2, $3
+.macro SSD_MUL_SUM_16BYTES
+    vmull.u8 $3, $0, $0
+    vpadal.u16 $2, $3
 
-	vmull.u8 $3, $1, $1
-	vpadal.u16 $2, $3
+    vmull.u8 $3, $1, $1
+    vpadal.u16 $2, $3
 .endm
 
 .macro SAD_SSD_BGD_16
-	vld1.8 {q0}, [$0], $2 //load cur_row
+    vld1.8 {q0}, [$0], $2 //load cur_row
 
-	vpadal.u8 q3, q0	//add cur_row together
-	vpadal.u8 q4, q1	//add ref_row together
+    vpadal.u8 q3, q0    //add cur_row together
+    vpadal.u8 q4, q1    //add ref_row together
 
-	vabd.u8 q2, q0, q1	//abs_diff
+    vabd.u8 q2, q0, q1  //abs_diff
 
-	vmax.u8 q5, q2								//l_mad for 16 bytes reset for every 8x16
+    vmax.u8 q5, q2                              //l_mad for 16 bytes reset for every 8x16
 
-	vpadal.u8 $3, q2							//l_sad for 16 bytes reset for every 8x16
+    vpadal.u8 $3, q2                            //l_sad for 16 bytes reset for every 8x16
 
-	SSD_MUL_SUM_16BYTES d4,d5, q8, q11			//q8 for l_sqiff	reset for every 16x16
+    SSD_MUL_SUM_16BYTES d4,d5, q8, q11          //q8 for l_sqiff    reset for every 16x16
 
-	vld1.8 {q1}, [$1], $2 //load ref_row
-	vpadal.u8 q9, q0							//q9 for l_sum		reset for every 16x16
+    vld1.8 {q1}, [$1], $2 //load ref_row
+    vpadal.u8 q9, q0                            //q9 for l_sum      reset for every 16x16
 
-	SSD_MUL_SUM_16BYTES d0,d1, q10, q11			//q10 for lsqsum	reset for every 16x16
+    SSD_MUL_SUM_16BYTES d0,d1, q10, q11         //q10 for lsqsum    reset for every 16x16
 .endm
 
 //the last row of a 16x16 block
 .macro SAD_SSD_BGD_16_end
-	vld1.8 {q0}, [$0], $1 //load cur_row
+    vld1.8 {q0}, [$0], $1 //load cur_row
 
-	vpadal.u8 q3, q0	//add cur_row together
-	vpadal.u8 q4, q1	//add ref_row together
+    vpadal.u8 q3, q0    //add cur_row together
+    vpadal.u8 q4, q1    //add ref_row together
 
-	vabd.u8 q2, q0, q1	//abs_diff
+    vabd.u8 q2, q0, q1  //abs_diff
 
-	vmax.u8 q5, q2								//l_mad for 16 bytes reset for every 8x16
+    vmax.u8 q5, q2                              //l_mad for 16 bytes reset for every 8x16
 
-	vpadal.u8 $2, q2							//l_sad for 16 bytes reset for every 8x16
+    vpadal.u8 $2, q2                            //l_sad for 16 bytes reset for every 8x16
 
-	SSD_MUL_SUM_16BYTES d4,d5, q8, q11			//q8 for l_sqiff	reset for every 16x16
+    SSD_MUL_SUM_16BYTES d4,d5, q8, q11          //q8 for l_sqiff    reset for every 16x16
 
-	vpadal.u8 q9, q0							//q9 for l_sum		reset for every 16x16
+    vpadal.u8 q9, q0                            //q9 for l_sum      reset for every 16x16
 
-	SSD_MUL_SUM_16BYTES d0,d1, q10, q11			//q10 for lsqsum	reset for every 16x16
+    SSD_MUL_SUM_16BYTES d0,d1, q10, q11         //q10 for lsqsum    reset for every 16x16
 .endm
 
 //for the begin of a 8x16 block, use some instructions to reset the register
 .macro SAD_SSD_BGD_16_RESET_8x8
-	vld1.8 {q0}, [$0], $2 //load cur_row
+    vld1.8 {q0}, [$0], $2 //load cur_row
 
-	vpaddl.u8 q3, q0	//add cur_row together
-	vpaddl.u8 q4, q1	//add ref_row together
+    vpaddl.u8 q3, q0    //add cur_row together
+    vpaddl.u8 q4, q1    //add ref_row together
 
-	vabd.u8 q2, q0, q1	//abs_diff
+    vabd.u8 q2, q0, q1  //abs_diff
 
-	vmov q5,q2         //calculate max and avoid reset to zero, l_mad for 16 bytes reset for every 8x16
+    vmov q5,q2         //calculate max and avoid reset to zero, l_mad for 16 bytes reset for every 8x16
 
-	vpaddl.u8 $3, q2							//l_sad for 16 bytes reset for every 8x16
+    vpaddl.u8 $3, q2                            //l_sad for 16 bytes reset for every 8x16
 
 
-	SSD_MUL_SUM_16BYTES d4,d5, q8, q11			//q8 for l_sqiff	reset for every 16x16
+    SSD_MUL_SUM_16BYTES d4,d5, q8, q11          //q8 for l_sqiff    reset for every 16x16
 
-	vld1.8 {q1}, [$1], $2 //load ref_row
+    vld1.8 {q1}, [$1], $2 //load ref_row
 
-	vpadal.u8 q9, q0							//q9 for l_sum		reset for every 16x16
+    vpadal.u8 q9, q0                            //q9 for l_sum      reset for every 16x16
 
-	SSD_MUL_SUM_16BYTES d0,d1, q10, q11			//q10 for lsqsum	reset for every 16x16
+    SSD_MUL_SUM_16BYTES d0,d1, q10, q11         //q10 for lsqsum    reset for every 16x16
 .endm
 
 //for the begin of a 16x16 block, use some instructions to reset the register
 .macro SAD_SSD_BGD_16_RESET_16x16
-	vld1.8 {q0}, [$0], $2 //load cur_row
-	vld1.8 {q1}, [$1], $2 //load ref_row
+    vld1.8 {q0}, [$0], $2 //load cur_row
+    vld1.8 {q1}, [$1], $2 //load ref_row
 
-	vpaddl.u8 q3, q0	//add cur_row together
-	vpaddl.u8 q4, q1	//add ref_row together
+    vpaddl.u8 q3, q0    //add cur_row together
+    vpaddl.u8 q4, q1    //add ref_row together
 
-	vabd.u8 q2, q0, q1	//abs_diff
+    vabd.u8 q2, q0, q1  //abs_diff
 
-	vmov q5,q2         //calculate max and avoid reset to zero, l_mad for 16 bytes reset for every 8x16
+    vmov q5,q2         //calculate max and avoid reset to zero, l_mad for 16 bytes reset for every 8x16
 
-	vpaddl.u8 $3, q2							//l_sad for 16 bytes reset for every 8x16
+    vpaddl.u8 $3, q2                            //l_sad for 16 bytes reset for every 8x16
 
-	SSD_MUL_SUM_16BYTES_RESET d4,d5,q8, q11			//q8 for l_sqiff	reset for every 16x16
+    SSD_MUL_SUM_16BYTES_RESET d4,d5,q8, q11         //q8 for l_sqiff    reset for every 16x16
 
-	vld1.8 {q1}, [$1], $2 //load ref_row
+    vld1.8 {q1}, [$1], $2 //load ref_row
 
-	vpaddl.u8 q9, q0								//q9 for l_sum		reset for every 16x16
+    vpaddl.u8 q9, q0                                //q9 for l_sum      reset for every 16x16
 
-	SSD_MUL_SUM_16BYTES_RESET d0,d1,q10,q11			//q10 for lsqsum	reset for every 16x16
+    SSD_MUL_SUM_16BYTES_RESET d0,d1,q10,q11         //q10 for lsqsum    reset for every 16x16
 .endm
 
 //for each 8x16 block
 .macro SAD_SSD_BGD_CALC_8x16
 
-	vpmax.u8 d10, d10, d11 //4 numbers
-	vpmax.u8 d10, d10, d10 //2 numbers
-	vpmax.u8 d10, d10, d10 //1 number1
+    vpmax.u8 d10, d10, d11 //4 numbers
+    vpmax.u8 d10, d10, d10 //2 numbers
+    vpmax.u8 d10, d10, d10 //1 number1
 
-	vmov $0, d10			//d26 d27 keeps the l_mad
+    vmov $0, d10            //d26 d27 keeps the l_mad
 
-	//p_sd8x8			fix me
-	vpaddl.u16 q3, q3
-	vpaddl.u16 q4, q4
+    //p_sd8x8           fix me
+    vpaddl.u16 q3, q3
+    vpaddl.u16 q4, q4
 
-	vsub.i32 $1, q3, q4
-	vpaddl.u32 $1, $1
+    vsub.i32 $1, q3, q4
+    vpaddl.u32 $1, $1
 
-	//psad8x8
-	vpaddl.u16 $2, $2
-	vpaddl.u32 $2, $2
+    //psad8x8
+    vpaddl.u16 $2, $2
+    vpaddl.u32 $2, $2
 
-	//psadframe
-	vadd.i32 q12, $2
+    //psadframe
+    vadd.i32 q12, $2
 .endm
 
 .macro SAD_SSD_BGD_16x16
-	//for one 8x16
-	SAD_SSD_BGD_16_RESET_16x16 $0, $1, $2, q6
-	SAD_SSD_BGD_16 $0, $1, $2, q6
-	SAD_SSD_BGD_16 $0, $1, $2, q6
-	SAD_SSD_BGD_16 $0, $1, $2, q6
-	SAD_SSD_BGD_16 $0, $1, $2, q6
-	SAD_SSD_BGD_16 $0, $1, $2, q6
-	SAD_SSD_BGD_16 $0, $1, $2, q6
-	SAD_SSD_BGD_16 $0, $1, $2, q6
+    //for one 8x16
+    SAD_SSD_BGD_16_RESET_16x16 $0, $1, $2, q6
+    SAD_SSD_BGD_16 $0, $1, $2, q6
+    SAD_SSD_BGD_16 $0, $1, $2, q6
+    SAD_SSD_BGD_16 $0, $1, $2, q6
+    SAD_SSD_BGD_16 $0, $1, $2, q6
+    SAD_SSD_BGD_16 $0, $1, $2, q6
+    SAD_SSD_BGD_16 $0, $1, $2, q6
+    SAD_SSD_BGD_16 $0, $1, $2, q6
 
-	SAD_SSD_BGD_CALC_8x16 d26, q14, q6
+    SAD_SSD_BGD_CALC_8x16 d26, q14, q6
 
-	//for another 8x16
-	SAD_SSD_BGD_16_RESET_8x8 $0, $1, $2, q7
-	SAD_SSD_BGD_16 $0, $1, $2, q7
-	SAD_SSD_BGD_16 $0, $1, $2, q7
-	SAD_SSD_BGD_16 $0, $1, $2, q7
-	SAD_SSD_BGD_16 $0, $1, $2, q7
-	SAD_SSD_BGD_16 $0, $1, $2, q7
-	SAD_SSD_BGD_16 $0, $1, $2, q7
-	SAD_SSD_BGD_16_end $0, $2, q7
+    //for another 8x16
+    SAD_SSD_BGD_16_RESET_8x8 $0, $1, $2, q7
+    SAD_SSD_BGD_16 $0, $1, $2, q7
+    SAD_SSD_BGD_16 $0, $1, $2, q7
+    SAD_SSD_BGD_16 $0, $1, $2, q7
+    SAD_SSD_BGD_16 $0, $1, $2, q7
+    SAD_SSD_BGD_16 $0, $1, $2, q7
+    SAD_SSD_BGD_16 $0, $1, $2, q7
+    SAD_SSD_BGD_16_end $0, $2, q7
 
-	SAD_SSD_BGD_CALC_8x16 d27, q15, q7
+    SAD_SSD_BGD_CALC_8x16 d27, q15, q7
 .endm
 
-.macro  SSD_SAD_SD_MAD_PADDL
-	vpaddl.s16 $0, $0
-	vpaddl.s32 $0, $0
-	vadd.i32 $1, $1, $2
+.macro SSD_SAD_SD_MAD_PADDL
+    vpaddl.s16 $0, $0
+    vpaddl.s32 $0, $0
+    vadd.i32 $1, $1, $2
 .endm
 #else
-.macro  SSD_MUL_SUM_16BYTES_RESET arg0, arg1, arg2, arg3
-	vmull.u8   \arg3, \arg0, \arg0
-	vpaddl.u16 \arg2, \arg3
+.macro SSD_MUL_SUM_16BYTES_RESET arg0, arg1, arg2, arg3
+    vmull.u8   \arg3, \arg0, \arg0
+    vpaddl.u16 \arg2, \arg3
 
-	vmull.u8   \arg3, \arg1, \arg1
-	vpadal.u16 \arg2, \arg3
+    vmull.u8   \arg3, \arg1, \arg1
+    vpadal.u16 \arg2, \arg3
 .endm
 
-.macro  SSD_MUL_SUM_16BYTES arg0, arg1, arg2, arg3
-	vmull.u8   \arg3, \arg0, \arg0
-	vpadal.u16 \arg2, \arg3
+.macro SSD_MUL_SUM_16BYTES arg0, arg1, arg2, arg3
+    vmull.u8   \arg3, \arg0, \arg0
+    vpadal.u16 \arg2, \arg3
 
-	vmull.u8   \arg3, \arg1, \arg1
-	vpadal.u16 \arg2, \arg3
+    vmull.u8   \arg3, \arg1, \arg1
+    vpadal.u16 \arg2, \arg3
 .endm
 
 .macro SAD_SSD_BGD_16 arg0, arg1, arg2, arg3
-	vld1.8 {q0}, [\arg0], \arg2 //load cur_row
+    vld1.8 {q0}, [\arg0], \arg2 //load cur_row
 
-	vpadal.u8 q3, q0	//add cur_row together
-	vpadal.u8 q4, q1	//add ref_row together
+    vpadal.u8 q3, q0    //add cur_row together
+    vpadal.u8 q4, q1    //add ref_row together
 
-	vabd.u8 q2, q0, q1	//abs_diff
+    vabd.u8 q2, q0, q1  //abs_diff
 
-	vmax.u8 q5, q2								//l_mad for 16 bytes reset for every 8x16
+    vmax.u8 q5, q2                              //l_mad for 16 bytes reset for every 8x16
 
-	vpadal.u8 \arg3, q2							//l_sad for 16 bytes reset for every 8x16
+    vpadal.u8 \arg3, q2                         //l_sad for 16 bytes reset for every 8x16
 
-	SSD_MUL_SUM_16BYTES d4,d5, q8, q11			//q8 for l_sqiff	reset for every 16x16
+    SSD_MUL_SUM_16BYTES d4,d5, q8, q11          //q8 for l_sqiff    reset for every 16x16
 
-	vld1.8 {q1}, [\arg1], \arg2 //load ref_row
-	vpadal.u8 q9, q0							//q9 for l_sum		reset for every 16x16
+    vld1.8 {q1}, [\arg1], \arg2 //load ref_row
+    vpadal.u8 q9, q0                            //q9 for l_sum      reset for every 16x16
 
-	SSD_MUL_SUM_16BYTES d0,d1, q10, q11			//q10 for lsqsum	reset for every 16x16
+    SSD_MUL_SUM_16BYTES d0,d1, q10, q11         //q10 for lsqsum    reset for every 16x16
 .endm
 
 //the last row of a 16x16 block
 .macro SAD_SSD_BGD_16_end arg0, arg1, arg2
-	vld1.8 {q0}, [\arg0], \arg1 //load cur_row
+    vld1.8 {q0}, [\arg0], \arg1 //load cur_row
 
-	vpadal.u8 q3, q0	//add cur_row together
-	vpadal.u8 q4, q1	//add ref_row together
+    vpadal.u8 q3, q0    //add cur_row together
+    vpadal.u8 q4, q1    //add ref_row together
 
-	vabd.u8 q2, q0, q1	//abs_diff
+    vabd.u8 q2, q0, q1  //abs_diff
 
-	vmax.u8 q5, q2								//l_mad for 16 bytes reset for every 8x16
+    vmax.u8 q5, q2                              //l_mad for 16 bytes reset for every 8x16
 
-	vpadal.u8 \arg2, q2							//l_sad for 16 bytes reset for every 8x16
+    vpadal.u8 \arg2, q2                         //l_sad for 16 bytes reset for every 8x16
 
-	SSD_MUL_SUM_16BYTES d4,d5, q8, q11			//q8 for l_sqiff	reset for every 16x16
+    SSD_MUL_SUM_16BYTES d4,d5, q8, q11          //q8 for l_sqiff    reset for every 16x16
 
-	vpadal.u8 q9, q0							//q9 for l_sum		reset for every 16x16
+    vpadal.u8 q9, q0                            //q9 for l_sum      reset for every 16x16
 
-	SSD_MUL_SUM_16BYTES d0,d1, q10, q11			//q10 for lsqsum	reset for every 16x16
+    SSD_MUL_SUM_16BYTES d0,d1, q10, q11         //q10 for lsqsum    reset for every 16x16
 .endm
 
 //for the begin of a 8x16 block, use some instructions to reset the register
 .macro SAD_SSD_BGD_16_RESET_8x8 arg0, arg1, arg2, arg3
-	vld1.8 {q0}, [\arg0], \arg2 //load cur_row
+    vld1.8 {q0}, [\arg0], \arg2 //load cur_row
 
-	vpaddl.u8 q3, q0	//add cur_row together
-	vpaddl.u8 q4, q1	//add ref_row together
+    vpaddl.u8 q3, q0    //add cur_row together
+    vpaddl.u8 q4, q1    //add ref_row together
 
-	vabd.u8 q2, q0, q1	//abs_diff
+    vabd.u8 q2, q0, q1  //abs_diff
 
-	vmov q5,q2         //calculate max and avoid reset to zero, l_mad for 16 bytes reset for every 8x16
+    vmov q5,q2         //calculate max and avoid reset to zero, l_mad for 16 bytes reset for every 8x16
 
-	vpaddl.u8 \arg3, q2							//l_sad for 16 bytes reset for every 8x16
+    vpaddl.u8 \arg3, q2                         //l_sad for 16 bytes reset for every 8x16
 
 
-	SSD_MUL_SUM_16BYTES d4,d5, q8, q11			//q8 for l_sqiff	reset for every 16x16
+    SSD_MUL_SUM_16BYTES d4,d5, q8, q11          //q8 for l_sqiff    reset for every 16x16
 
-	vld1.8 {q1}, [\arg1], \arg2 //load ref_row
+    vld1.8 {q1}, [\arg1], \arg2 //load ref_row
 
-	vpadal.u8 q9, q0							//q9 for l_sum		reset for every 16x16
+    vpadal.u8 q9, q0                            //q9 for l_sum      reset for every 16x16
 
-	SSD_MUL_SUM_16BYTES d0,d1, q10, q11			//q10 for lsqsum	reset for every 16x16
+    SSD_MUL_SUM_16BYTES d0,d1, q10, q11         //q10 for lsqsum    reset for every 16x16
 .endm
 
 //for the begin of a 16x16 block, use some instructions to reset the register
 .macro SAD_SSD_BGD_16_RESET_16x16 arg0, arg1, arg2, arg3
-	vld1.8 {q0}, [\arg0], \arg2 //load cur_row
-	vld1.8 {q1}, [\arg1], \arg2 //load ref_row
+    vld1.8 {q0}, [\arg0], \arg2 //load cur_row
+    vld1.8 {q1}, [\arg1], \arg2 //load ref_row
 
-	vpaddl.u8 q3, q0	//add cur_row together
-	vpaddl.u8 q4, q1	//add ref_row together
+    vpaddl.u8 q3, q0    //add cur_row together
+    vpaddl.u8 q4, q1    //add ref_row together
 
-	vabd.u8 q2, q0, q1	//abs_diff
+    vabd.u8 q2, q0, q1  //abs_diff
 
-	vmov q5,q2         //calculate max and avoid reset to zero, l_mad for 16 bytes reset for every 8x16
+    vmov q5,q2         //calculate max and avoid reset to zero, l_mad for 16 bytes reset for every 8x16
 
-	vpaddl.u8 \arg3, q2							//l_sad for 16 bytes reset for every 8x16
+    vpaddl.u8 \arg3, q2                         //l_sad for 16 bytes reset for every 8x16
 
-	SSD_MUL_SUM_16BYTES_RESET d4,d5,q8, q11			//q8 for l_sqiff	reset for every 16x16
+    SSD_MUL_SUM_16BYTES_RESET d4,d5,q8, q11         //q8 for l_sqiff    reset for every 16x16
 
-	vld1.8 {q1}, [\arg1], \arg2 //load ref_row
+    vld1.8 {q1}, [\arg1], \arg2 //load ref_row
 
-	vpaddl.u8 q9, q0								//q9 for l_sum		reset for every 16x16
+    vpaddl.u8 q9, q0                                //q9 for l_sum      reset for every 16x16
 
-	SSD_MUL_SUM_16BYTES_RESET d0,d1,q10,q11			//q10 for lsqsum	reset for every 16x16
+    SSD_MUL_SUM_16BYTES_RESET d0,d1,q10,q11         //q10 for lsqsum    reset for every 16x16
 .endm
 
 //for each 8x16 block
 .macro SAD_SSD_BGD_CALC_8x16 arg0, arg1, arg2
 
-	vpmax.u8 d10, d10, d11 //4 numbers
-	vpmax.u8 d10, d10, d10 //2 numbers
-	vpmax.u8 d10, d10, d10 //1 number1
+    vpmax.u8 d10, d10, d11 //4 numbers
+    vpmax.u8 d10, d10, d10 //2 numbers
+    vpmax.u8 d10, d10, d10 //1 number1
 
-	vmov \arg0, d10			//d26 d27 keeps the l_mad
+    vmov \arg0, d10         //d26 d27 keeps the l_mad
 
-	//p_sd8x8
-	vpaddl.u16 q3, q3
-	vpaddl.u16 q4, q4
+    //p_sd8x8
+    vpaddl.u16 q3, q3
+    vpaddl.u16 q4, q4
 
-	vsub.i32 \arg1, q3, q4
-	vpaddl.u32 \arg1, \arg1
+    vsub.i32 \arg1, q3, q4
+    vpaddl.u32 \arg1, \arg1
 
-	//psad8x8
-	vpaddl.u16 \arg2, \arg2
-	vpaddl.u32 \arg2, \arg2
+    //psad8x8
+    vpaddl.u16 \arg2, \arg2
+    vpaddl.u32 \arg2, \arg2
 
-	//psadframe
-	vadd.i32 q12, \arg2
+    //psadframe
+    vadd.i32 q12, \arg2
 .endm
 
 .macro SAD_SSD_BGD_16x16 arg0, arg1, arg2
-	//for one 8x16
-	SAD_SSD_BGD_16_RESET_16x16 \arg0, \arg1, \arg2, q6
-	SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
-	SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
-	SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
-	SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
-	SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
-	SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
-	SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
+    //for one 8x16
+    SAD_SSD_BGD_16_RESET_16x16 \arg0, \arg1, \arg2, q6
+    SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
+    SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
+    SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
+    SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
+    SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
+    SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
+    SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
 
-	SAD_SSD_BGD_CALC_8x16 d26, q14, q6
+    SAD_SSD_BGD_CALC_8x16 d26, q14, q6
 
-	//for another 8x16
-	SAD_SSD_BGD_16_RESET_8x8 \arg0, \arg1, \arg2, q7
-	SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
-	SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
-	SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
-	SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
-	SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
-	SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
-	SAD_SSD_BGD_16_end \arg0, \arg2, q7
+    //for another 8x16
+    SAD_SSD_BGD_16_RESET_8x8 \arg0, \arg1, \arg2, q7
+    SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
+    SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
+    SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
+    SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
+    SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
+    SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
+    SAD_SSD_BGD_16_end \arg0, \arg2, q7
 
-	SAD_SSD_BGD_CALC_8x16 d27, q15, q7
+    SAD_SSD_BGD_CALC_8x16 d27, q15, q7
 .endm
 
-.macro  SSD_SAD_SD_MAD_PADDL arg0, arg1, arg2
-	vpaddl.s16 \arg0, \arg0
-	vpaddl.s32 \arg0, \arg0
-	vadd.i32 \arg1, \arg1, \arg2
+.macro SSD_SAD_SD_MAD_PADDL arg0, arg1, arg2
+    vpaddl.s16 \arg0, \arg0
+    vpaddl.s32 \arg0, \arg0
+    vadd.i32 \arg1, \arg1, \arg2
 .endm
 #endif
 
 
 WELS_ASM_FUNC_BEGIN VAACalcSadSsdBgd_neon
-	stmdb sp!, {r0-r12, r14}
-	vpush {q4-q7}
+    stmdb sp!, {r0-r12, r14}
+    vpush {q4-q7}
 
-	ldr r4, [sp, #120] //r4 keeps the pic_stride
+    ldr r4, [sp, #120] //r4 keeps the pic_stride
 
-	sub r5, r4, #1
-	lsl r5, r5, #4 //r5 keeps the little step
+    sub r5, r4, #1
+    lsl r5, r5, #4 //r5 keeps the little step
 
-	lsl r6, r4, #4
-	sub r6, r2, r6	//r6 keeps the big step
+    lsl r6, r4, #4
+    sub r6, r2, r6  //r6 keeps the big step
 
 
-	ldr r8, [sp, #128]//psad8x8
-	ldr r9, [sp, #132]//psum16x16
-	ldr r10, [sp, #136]//psqsum16x16
-	ldr r11, [sp, #140]//psqdiff16x16
-	ldr r12, [sp, #144]//p_sd8x8
-	ldr r14, [sp, #148]//p_mad8x8
+    ldr r8, [sp, #128]//psad8x8
+    ldr r9, [sp, #132]//psum16x16
+    ldr r10, [sp, #136]//psqsum16x16
+    ldr r11, [sp, #140]//psqdiff16x16
+    ldr r12, [sp, #144]//p_sd8x8
+    ldr r14, [sp, #148]//p_mad8x8
 
-	vmov.i8 q12, #0
+    vmov.i8 q12, #0
 
 vaa_calc_sad_ssd_bgd_height_loop:
 
@@ -660,7 +660,7 @@
     mov r7, r2
 vaa_calc_sad_ssd_bgd_width_loop:
 
-    //l_sd q14&q15, l_mad q13, l_sad q6 & q7, l_sqdiff	q8, l_sum q9, l_sqsum q10
+    //l_sd q14&q15, l_mad q13, l_sad q6 & q7, l_sqdiff  q8, l_sum q9, l_sqsum q10
     SAD_SSD_BGD_16x16 r0,r1,r4
 
     //psad8x8
@@ -694,20 +694,20 @@
 
     bne vaa_calc_sad_ssd_bgd_width_loop
 
-    sub r0, r0, r6		//jump to next 16 x width
-    sub r1, r1, r6		//jump to next 16 x width
+    sub r0, r0, r6      //jump to next 16 x width
+    sub r1, r1, r6      //jump to next 16 x width
 
     subs r3, #16
 bne vaa_calc_sad_ssd_bgd_height_loop
 
-	//psadframe
-	ldr r7, [sp, #124]//psadframe
+    //psadframe
+    ldr r7, [sp, #124]//psadframe
 
-	vadd.i32 d24, d24, d25
-	vst1.32 {d24[0]}, [r7]
+    vadd.i32 d24, d24, d25
+    vst1.32 {d24[0]}, [r7]
 
-	vpop {q4-q7}
-	ldmia sp!, {r0-r12, r14}
+    vpop {q4-q7}
+    ldmia sp!, {r0-r12, r14}
 
 WELS_ASM_FUNC_END
 
@@ -714,223 +714,223 @@
 
 #ifdef __APPLE__
 .macro SAD_VAR_16
-	vld1.8 {q0}, [$0], $2 //load cur_row
+    vld1.8 {q0}, [$0], $2 //load cur_row
 
-	vpadal.u8 q3, q0	//add cur_row together
-	vpadal.u8 q4, q1	//add ref_row together
+    vpadal.u8 q3, q0    //add cur_row together
+    vpadal.u8 q4, q1    //add ref_row together
 
-	vabd.u8 q2, q0, q1	//abs_diff
+    vabd.u8 q2, q0, q1  //abs_diff
 
-	vpadal.u8 $3, q2							//l_sad for 16 bytes reset for every 8x16
+    vpadal.u8 $3, q2                            //l_sad for 16 bytes reset for every 8x16
 
-	vld1.8 {q1}, [$1], $2
+    vld1.8 {q1}, [$1], $2
 
-	vpadal.u8 q9, q0							//q9 for l_sum		reset for every 16x16
+    vpadal.u8 q9, q0                            //q9 for l_sum      reset for every 16x16
 
-	SSD_MUL_SUM_16BYTES d0,d1, q10, q11			//q10 for lsqsum	reset for every 16x16
+    SSD_MUL_SUM_16BYTES d0,d1, q10, q11         //q10 for lsqsum    reset for every 16x16
 .endm
 
 .macro SAD_VAR_16_END
-	vld1.8 {q0}, [$0], $1 //load cur_row
+    vld1.8 {q0}, [$0], $1 //load cur_row
 
-	vpadal.u8 q3, q0	//add cur_row together
-	vpadal.u8 q4, q1	//add ref_row together
+    vpadal.u8 q3, q0    //add cur_row together
+    vpadal.u8 q4, q1    //add ref_row together
 
-	vabd.u8 q2, q0, q1	//abs_diff
+    vabd.u8 q2, q0, q1  //abs_diff
 
-	vpadal.u8 $2, q2							//l_sad for 16 bytes reset for every 8x16
+    vpadal.u8 $2, q2                            //l_sad for 16 bytes reset for every 8x16
 
-	vpadal.u8 q9, q0							//q9 for l_sum		reset for every 16x16
+    vpadal.u8 q9, q0                            //q9 for l_sum      reset for every 16x16
 
-	SSD_MUL_SUM_16BYTES d0,d1, q10, q11			//q10 for lsqsum	reset for every 16x16
+    SSD_MUL_SUM_16BYTES d0,d1, q10, q11         //q10 for lsqsum    reset for every 16x16
 .endm
 
 .macro SAD_VAR_16_RESET_16x16
-	vld1.8 {q0}, [$0], $2 //load cur_row
-	vld1.8 {q1}, [$1], $2
+    vld1.8 {q0}, [$0], $2 //load cur_row
+    vld1.8 {q1}, [$1], $2
 
-	vpaddl.u8 q3, q0	//add cur_row together
-	vpaddl.u8 q4, q1	//add ref_row together
+    vpaddl.u8 q3, q0    //add cur_row together
+    vpaddl.u8 q4, q1    //add ref_row together
 
-	vabd.u8 q2, q0, q1	//abs_diff
+    vabd.u8 q2, q0, q1  //abs_diff
 
-	vpaddl.u8 $3, q2							//l_sad for 16 bytes reset for every 8x16
+    vpaddl.u8 $3, q2                            //l_sad for 16 bytes reset for every 8x16
 
-	vld1.8 {q1}, [$1], $2
+    vld1.8 {q1}, [$1], $2
 
-	vpaddl.u8 q9, q0							//q9 for l_sum		reset for every 16x16
+    vpaddl.u8 q9, q0                            //q9 for l_sum      reset for every 16x16
 
-	SSD_MUL_SUM_16BYTES_RESET d0,d1, q10, q11
+    SSD_MUL_SUM_16BYTES_RESET d0,d1, q10, q11
 .endm
 
 .macro SAD_VAR_16_RESET_8x8
-	vld1.8 {q0}, [$0], $2 //load cur_row
+    vld1.8 {q0}, [$0], $2 //load cur_row
 
-	vpaddl.u8 q3, q0	//add cur_row together
-	vpaddl.u8 q4, q1	//add ref_row together
+    vpaddl.u8 q3, q0    //add cur_row together
+    vpaddl.u8 q4, q1    //add ref_row together
 
-	vabd.u8 q2, q0, q1	//abs_diff
+    vabd.u8 q2, q0, q1  //abs_diff
 
-	vpaddl.u8 $3, q2							//l_sad for 16 bytes reset for every 8x16
+    vpaddl.u8 $3, q2                            //l_sad for 16 bytes reset for every 8x16
 
-	vld1.8 {q1}, [$1], $2
+    vld1.8 {q1}, [$1], $2
 
-	vpadal.u8 q9, q0							//q9 for l_sum		reset for every 16x16
+    vpadal.u8 q9, q0                            //q9 for l_sum      reset for every 16x16
 
-	SSD_MUL_SUM_16BYTES d0,d1, q10, q11			//q10 for lsqsum	reset for every 16x16
+    SSD_MUL_SUM_16BYTES d0,d1, q10, q11         //q10 for lsqsum    reset for every 16x16
 .endm
 
 .macro SAD_VAR_16x16
-	//for one 8x16
-	SAD_VAR_16_RESET_16x16 $0, $1, $2, q6
-	SAD_VAR_16 $0, $1, $2, q6
-	SAD_VAR_16 $0, $1, $2, q6
-	SAD_VAR_16 $0, $1, $2, q6
-	SAD_VAR_16 $0, $1, $2, q6
-	SAD_VAR_16 $0, $1, $2, q6
-	SAD_VAR_16 $0, $1, $2, q6
-	SAD_VAR_16 $0, $1, $2, q6
+    //for one 8x16
+    SAD_VAR_16_RESET_16x16 $0, $1, $2, q6
+    SAD_VAR_16 $0, $1, $2, q6
+    SAD_VAR_16 $0, $1, $2, q6
+    SAD_VAR_16 $0, $1, $2, q6
+    SAD_VAR_16 $0, $1, $2, q6
+    SAD_VAR_16 $0, $1, $2, q6
+    SAD_VAR_16 $0, $1, $2, q6
+    SAD_VAR_16 $0, $1, $2, q6
 
-	vpaddl.u16 q6, q6
-	vpaddl.u32 q6, q6
-	vadd.i32 q12, q6
+    vpaddl.u16 q6, q6
+    vpaddl.u32 q6, q6
+    vadd.i32 q12, q6
 
-	//for another 8x16
-	SAD_VAR_16_RESET_8x8 $0, $1, $2, q7
-	SAD_VAR_16 $0, $1, $2, q7
-	SAD_VAR_16 $0, $1, $2, q7
-	SAD_VAR_16 $0, $1, $2, q7
-	SAD_VAR_16 $0, $1, $2, q7
-	SAD_VAR_16 $0, $1, $2, q7
-	SAD_VAR_16 $0, $1, $2, q7
-	SAD_VAR_16_END $0, $2, q7
+    //for another 8x16
+    SAD_VAR_16_RESET_8x8 $0, $1, $2, q7
+    SAD_VAR_16 $0, $1, $2, q7
+    SAD_VAR_16 $0, $1, $2, q7
+    SAD_VAR_16 $0, $1, $2, q7
+    SAD_VAR_16 $0, $1, $2, q7
+    SAD_VAR_16 $0, $1, $2, q7
+    SAD_VAR_16 $0, $1, $2, q7
+    SAD_VAR_16_END $0, $2, q7
 
-	vpaddl.u16 q7, q7
-	vpaddl.u32 q7, q7
+    vpaddl.u16 q7, q7
+    vpaddl.u32 q7, q7
 
-	vadd.i32 q12, q7
+    vadd.i32 q12, q7
 .endm
 #else
 .macro SAD_VAR_16 arg0, arg1, arg2, arg3
-	vld1.8 {q0}, [\arg0], \arg2 //load cur_row
+    vld1.8 {q0}, [\arg0], \arg2 //load cur_row
 
-	vpadal.u8 q3, q0	//add cur_row together
-	vpadal.u8 q4, q1	//add ref_row together
+    vpadal.u8 q3, q0    //add cur_row together
+    vpadal.u8 q4, q1    //add ref_row together
 
-	vabd.u8 q2, q0, q1	//abs_diff
+    vabd.u8 q2, q0, q1  //abs_diff
 
-	vpadal.u8 \arg3, q2							//l_sad for 16 bytes reset for every 8x16
+    vpadal.u8 \arg3, q2                         //l_sad for 16 bytes reset for every 8x16
 
-	vld1.8 {q1}, [\arg1], \arg2
+    vld1.8 {q1}, [\arg1], \arg2
 
-	vpadal.u8 q9, q0							//q9 for l_sum		reset for every 16x16
+    vpadal.u8 q9, q0                            //q9 for l_sum      reset for every 16x16
 
-	SSD_MUL_SUM_16BYTES d0,d1, q10, q11			//q10 for lsqsum	reset for every 16x16
+    SSD_MUL_SUM_16BYTES d0,d1, q10, q11         //q10 for lsqsum    reset for every 16x16
 .endm
 
 .macro SAD_VAR_16_END arg0, arg1, arg2
-	vld1.8 {q0}, [\arg0], \arg1 //load cur_row
+    vld1.8 {q0}, [\arg0], \arg1 //load cur_row
 
-	vpadal.u8 q3, q0	//add cur_row together
-	vpadal.u8 q4, q1	//add ref_row together
+    vpadal.u8 q3, q0    //add cur_row together
+    vpadal.u8 q4, q1    //add ref_row together
 
-	vabd.u8 q2, q0, q1	//abs_diff
+    vabd.u8 q2, q0, q1  //abs_diff
 
-	vpadal.u8 \arg2, q2							//l_sad for 16 bytes reset for every 8x16
+    vpadal.u8 \arg2, q2                         //l_sad for 16 bytes reset for every 8x16
 
-	vpadal.u8 q9, q0							//q9 for l_sum		reset for every 16x16
+    vpadal.u8 q9, q0                            //q9 for l_sum      reset for every 16x16
 
-	SSD_MUL_SUM_16BYTES d0,d1, q10, q11			//q10 for lsqsum	reset for every 16x16
+    SSD_MUL_SUM_16BYTES d0,d1, q10, q11         //q10 for lsqsum    reset for every 16x16
 .endm
 
 
 .macro SAD_VAR_16_RESET_16x16 arg0, arg1, arg2, arg3
-	vld1.8 {q0}, [\arg0], \arg2 //load cur_row
-	vld1.8 {q1}, [\arg1], \arg2
+    vld1.8 {q0}, [\arg0], \arg2 //load cur_row
+    vld1.8 {q1}, [\arg1], \arg2
 
-	vpaddl.u8 q3, q0	//add cur_row together
-	vpaddl.u8 q4, q1	//add ref_row together
+    vpaddl.u8 q3, q0    //add cur_row together
+    vpaddl.u8 q4, q1    //add ref_row together
 
-	vabd.u8 q2, q0, q1	//abs_diff
+    vabd.u8 q2, q0, q1  //abs_diff
 
-	vpaddl.u8 \arg3, q2							//l_sad for 16 bytes reset for every 8x16
+    vpaddl.u8 \arg3, q2                         //l_sad for 16 bytes reset for every 8x16
 
-	vld1.8 {q1}, [\arg1], \arg2
+    vld1.8 {q1}, [\arg1], \arg2
 
-	vpaddl.u8 q9, q0							//q9 for l_sum		reset for every 16x16
+    vpaddl.u8 q9, q0                            //q9 for l_sum      reset for every 16x16
 
-	SSD_MUL_SUM_16BYTES_RESET d0,d1, q10, q11
+    SSD_MUL_SUM_16BYTES_RESET d0,d1, q10, q11
 .endm
 
 .macro SAD_VAR_16_RESET_8x8 arg0, arg1, arg2, arg3
-	vld1.8 {q0}, [\arg0], \arg2 //load cur_row
+    vld1.8 {q0}, [\arg0], \arg2 //load cur_row
 
-	vpaddl.u8 q3, q0	//add cur_row together
-	vpaddl.u8 q4, q1	//add ref_row together
+    vpaddl.u8 q3, q0    //add cur_row together
+    vpaddl.u8 q4, q1    //add ref_row together
 
-	vabd.u8 q2, q0, q1	//abs_diff
+    vabd.u8 q2, q0, q1  //abs_diff
 
-	vpaddl.u8 \arg3, q2							//l_sad for 16 bytes reset for every 8x16
+    vpaddl.u8 \arg3, q2                         //l_sad for 16 bytes reset for every 8x16
 
-	vld1.8 {q1}, [\arg1], \arg2
+    vld1.8 {q1}, [\arg1], \arg2
 
-	vpadal.u8 q9, q0							//q9 for l_sum		reset for every 16x16
+    vpadal.u8 q9, q0                            //q9 for l_sum      reset for every 16x16
 
-	SSD_MUL_SUM_16BYTES d0,d1, q10, q11			//q10 for lsqsum	reset for every 16x16
+    SSD_MUL_SUM_16BYTES d0,d1, q10, q11         //q10 for lsqsum    reset for every 16x16
 .endm
 
 .macro SAD_VAR_16x16 arg0, arg1, arg2
-	//for one 8x16
-	SAD_VAR_16_RESET_16x16 \arg0, \arg1, \arg2, q6
-	SAD_VAR_16 \arg0, \arg1, \arg2, q6
-	SAD_VAR_16 \arg0, \arg1, \arg2, q6
-	SAD_VAR_16 \arg0, \arg1, \arg2, q6
-	SAD_VAR_16 \arg0, \arg1, \arg2, q6
-	SAD_VAR_16 \arg0, \arg1, \arg2, q6
-	SAD_VAR_16 \arg0, \arg1, \arg2, q6
-	SAD_VAR_16 \arg0, \arg1, \arg2, q6
+    //for one 8x16
+    SAD_VAR_16_RESET_16x16 \arg0, \arg1, \arg2, q6
+    SAD_VAR_16 \arg0, \arg1, \arg2, q6
+    SAD_VAR_16 \arg0, \arg1, \arg2, q6
+    SAD_VAR_16 \arg0, \arg1, \arg2, q6
+    SAD_VAR_16 \arg0, \arg1, \arg2, q6
+    SAD_VAR_16 \arg0, \arg1, \arg2, q6
+    SAD_VAR_16 \arg0, \arg1, \arg2, q6
+    SAD_VAR_16 \arg0, \arg1, \arg2, q6
 
-	vpaddl.u16 q6, q6
-	vpaddl.u32 q6, q6
-	vadd.i32 q12, q6
+    vpaddl.u16 q6, q6
+    vpaddl.u32 q6, q6
+    vadd.i32 q12, q6
 
-	//for another 8x16
-	SAD_VAR_16_RESET_8x8 \arg0, \arg1, \arg2, q7
-	SAD_VAR_16 \arg0, \arg1, \arg2, q7
-	SAD_VAR_16 \arg0, \arg1, \arg2, q7
-	SAD_VAR_16 \arg0, \arg1, \arg2, q7
-	SAD_VAR_16 \arg0, \arg1, \arg2, q7
-	SAD_VAR_16 \arg0, \arg1, \arg2, q7
-	SAD_VAR_16 \arg0, \arg1, \arg2, q7
-	SAD_VAR_16_END \arg0, \arg2, q7
+    //for another 8x16
+    SAD_VAR_16_RESET_8x8 \arg0, \arg1, \arg2, q7
+    SAD_VAR_16 \arg0, \arg1, \arg2, q7
+    SAD_VAR_16 \arg0, \arg1, \arg2, q7
+    SAD_VAR_16 \arg0, \arg1, \arg2, q7
+    SAD_VAR_16 \arg0, \arg1, \arg2, q7
+    SAD_VAR_16 \arg0, \arg1, \arg2, q7
+    SAD_VAR_16 \arg0, \arg1, \arg2, q7
+    SAD_VAR_16_END \arg0, \arg2, q7
 
-	vpaddl.u16 q7, q7
-	vpaddl.u32 q7, q7
+    vpaddl.u16 q7, q7
+    vpaddl.u32 q7, q7
 
-	vadd.i32 q12, q7
+    vadd.i32 q12, q7
 .endm
 #endif
 
 
 WELS_ASM_FUNC_BEGIN VAACalcSadVar_neon
-	stmdb sp!, {r4-r11}
-	vpush {q4}
-	vpush {q6-q7}
+    stmdb sp!, {r4-r11}
+    vpush {q4}
+    vpush {q6-q7}
 
-	ldr r4, [sp, #80] //r4 keeps the pic_stride
+    ldr r4, [sp, #80] //r4 keeps the pic_stride
 
-	sub r5, r4, #1
-	lsl r5, r5, #4 //r5 keeps the little step
+    sub r5, r4, #1
+    lsl r5, r5, #4 //r5 keeps the little step
 
-	lsl r6, r4, #4
-	sub r6, r2, r6	//r6 keeps the big step
+    lsl r6, r4, #4
+    sub r6, r2, r6  //r6 keeps the big step
 
-	ldr r7,		[sp, #84]	//psadframe
-	ldr r8,		[sp, #88]	//psad8x8
-	ldr r9,		[sp, #92]	//psum16x16
-	ldr r10,	[sp, #96]	//psqsum16x16
+    ldr r7,     [sp, #84]   //psadframe
+    ldr r8,     [sp, #88]   //psad8x8
+    ldr r9,     [sp, #92]   //psum16x16
+    ldr r10,    [sp, #96]   //psqsum16x16
 
-	vmov.i8 q12, #0
+    vmov.i8 q12, #0
 vaa_calc_sad_var_height_loop:
 
     mov r11, r2
@@ -956,154 +956,154 @@
 
     bne vaa_calc_sad_var_width_loop
 
-    sub r0, r0, r6		//jump to next 16 x width
-    sub r1, r1, r6		//jump to next 16 x width
+    sub r0, r0, r6      //jump to next 16 x width
+    sub r1, r1, r6      //jump to next 16 x width
 
     subs r3, #16
 bne vaa_calc_sad_var_height_loop
 
-	vadd.i32 d24, d24, d25
-	vst1.32 {d24[0]}, [r7]
+    vadd.i32 d24, d24, d25
+    vst1.32 {d24[0]}, [r7]
 
-	vpop {q6-q7}
-	vpop {q4}
-	ldmia sp!, {r4-r11}
+    vpop {q6-q7}
+    vpop {q4}
+    ldmia sp!, {r4-r11}
 WELS_ASM_FUNC_END
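
Editor's note: as a reading aid for the reformatted NEON above, here is a rough scalar sketch of what VAACalcSadVar_neon accumulates, inferred from the register comments in the diff (q6/q7 per-8x8 SAD, q9 row sum, q10 squared sum, q12 frame SAD) and the stack comments (psadframe, psad8x8, psum16x16, psqsum16x16). Names, output layout and rounding are assumptions; this is not the project's scalar fallback.

    /* Sketch only: per 16x16 macroblock, four 8x8 SADs plus the sum and squared
     * sum of the current-frame pixels, and a whole-frame SAD total. */
    #include <stdint.h>
    static void CalcSadVar_ref(const uint8_t* cur, const uint8_t* ref,
                               int32_t width, int32_t height, int32_t stride,
                               int32_t* psadframe, int32_t* psad8x8,
                               int32_t* psum16x16, int32_t* psqsum16x16) {
        int32_t frame_sad = 0;
        for (int32_t my = 0; my < height / 16; my++) {
            for (int32_t mx = 0; mx < width / 16; mx++) {
                int32_t sum = 0, sqsum = 0;
                for (int32_t b = 0; b < 4; b++) {        /* four 8x8 blocks of the MB */
                    int32_t ox = mx * 16 + (b & 1) * 8;
                    int32_t oy = my * 16 + (b >> 1) * 8;
                    int32_t sad = 0;
                    for (int32_t y = 0; y < 8; y++)
                        for (int32_t x = 0; x < 8; x++) {
                            int32_t c = cur[(oy + y) * stride + ox + x];
                            int32_t r = ref[(oy + y) * stride + ox + x];
                            sad   += c > r ? c - r : r - c;
                            sum   += c;
                            sqsum += c * c;
                        }
                    *psad8x8++ = sad;
                    frame_sad += sad;
                }
                *psum16x16++   = sum;
                *psqsum16x16++ = sqsum;
            }
        }
        *psadframe = frame_sad;
    }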
 
 
 #ifdef __APPLE__
 .macro SAD_SSD_16
-	SAD_VAR_16 $0, $1, $2, $3
+    SAD_VAR_16 $0, $1, $2, $3
 
-	SSD_MUL_SUM_16BYTES d4,d5,q8, q11
+    SSD_MUL_SUM_16BYTES d4,d5,q8, q11
 .endm
 
 .macro SAD_SSD_16_END
-	SAD_VAR_16_END $0, $1, $2
+    SAD_VAR_16_END $0, $1, $2
 
-	SSD_MUL_SUM_16BYTES d4,d5,q8, q11			//q8 for l_sqiff	reset for every 16x16
+    SSD_MUL_SUM_16BYTES d4,d5,q8, q11           //q8 for l_sqiff    reset for every 16x16
 .endm
 
 .macro SAD_SSD_16_RESET_16x16
-	SAD_VAR_16_RESET_16x16 $0, $1, $2, $3
+    SAD_VAR_16_RESET_16x16 $0, $1, $2, $3
 
-	SSD_MUL_SUM_16BYTES_RESET d4,d5,q8, q11			//q8 for l_sqiff	reset for every 16x16
+    SSD_MUL_SUM_16BYTES_RESET d4,d5,q8, q11         //q8 for l_sqiff    reset for every 16x16
 .endm
 
 .macro SAD_SSD_16_RESET_8x8
-	SAD_VAR_16_RESET_8x8 $0, $1, $2, $3
+    SAD_VAR_16_RESET_8x8 $0, $1, $2, $3
 
-	SSD_MUL_SUM_16BYTES d4,d5,q8, q11			//q8 for l_sqiff	reset for every 16x16
+    SSD_MUL_SUM_16BYTES d4,d5,q8, q11           //q8 for l_sqiff    reset for every 16x16
 .endm
 
 .macro SAD_SSD_16x16
-	//for one 8x16
-	SAD_SSD_16_RESET_16x16 $0, $1, $2, q6
-	SAD_SSD_16 $0, $1, $2, q6
-	SAD_SSD_16 $0, $1, $2, q6
-	SAD_SSD_16 $0, $1, $2, q6
-	SAD_SSD_16 $0, $1, $2, q6
-	SAD_SSD_16 $0, $1, $2, q6
-	SAD_SSD_16 $0, $1, $2, q6
-	SAD_SSD_16 $0, $1, $2, q6
+    //for one 8x16
+    SAD_SSD_16_RESET_16x16 $0, $1, $2, q6
+    SAD_SSD_16 $0, $1, $2, q6
+    SAD_SSD_16 $0, $1, $2, q6
+    SAD_SSD_16 $0, $1, $2, q6
+    SAD_SSD_16 $0, $1, $2, q6
+    SAD_SSD_16 $0, $1, $2, q6
+    SAD_SSD_16 $0, $1, $2, q6
+    SAD_SSD_16 $0, $1, $2, q6
 
-	vpaddl.u16 q6, q6
-	vpaddl.u32 q6, q6
-	vadd.i32 q12, q6
+    vpaddl.u16 q6, q6
+    vpaddl.u32 q6, q6
+    vadd.i32 q12, q6
 
-	//for another 8x16
-	SAD_SSD_16_RESET_8x8 $0, $1, $2, q7
-	SAD_SSD_16 $0, $1, $2, q7
-	SAD_SSD_16 $0, $1, $2, q7
-	SAD_SSD_16 $0, $1, $2, q7
-	SAD_SSD_16 $0, $1, $2, q7
-	SAD_SSD_16 $0, $1, $2, q7
-	SAD_SSD_16 $0, $1, $2, q7
-	SAD_SSD_16_END $0, $2, q7
+    //for another 8x16
+    SAD_SSD_16_RESET_8x8 $0, $1, $2, q7
+    SAD_SSD_16 $0, $1, $2, q7
+    SAD_SSD_16 $0, $1, $2, q7
+    SAD_SSD_16 $0, $1, $2, q7
+    SAD_SSD_16 $0, $1, $2, q7
+    SAD_SSD_16 $0, $1, $2, q7
+    SAD_SSD_16 $0, $1, $2, q7
+    SAD_SSD_16_END $0, $2, q7
 
-	vpaddl.u16 q7, q7
-	vpaddl.u32 q7, q7
+    vpaddl.u16 q7, q7
+    vpaddl.u32 q7, q7
 
-	vadd.i32 q12, q7
+    vadd.i32 q12, q7
 .endm
 #else
 .macro SAD_SSD_16 arg0, arg1, arg2, arg3
-	SAD_VAR_16 \arg0, \arg1, \arg2, \arg3
+    SAD_VAR_16 \arg0, \arg1, \arg2, \arg3
 
-	SSD_MUL_SUM_16BYTES d4,d5,q8, q11
+    SSD_MUL_SUM_16BYTES d4,d5,q8, q11
 .endm
 
 .macro SAD_SSD_16_END arg0, arg1, arg2
-	SAD_VAR_16_END \arg0, \arg1, \arg2
+    SAD_VAR_16_END \arg0, \arg1, \arg2
 
-	SSD_MUL_SUM_16BYTES d4,d5,q8, q11			//q8 for l_sqiff	reset for every 16x16
+    SSD_MUL_SUM_16BYTES d4,d5,q8, q11           //q8 for l_sqiff    reset for every 16x16
 .endm
 
 .macro SAD_SSD_16_RESET_16x16 arg0, arg1, arg2, arg3
-	SAD_VAR_16_RESET_16x16 \arg0, \arg1, \arg2, \arg3
+    SAD_VAR_16_RESET_16x16 \arg0, \arg1, \arg2, \arg3
 
-	SSD_MUL_SUM_16BYTES_RESET d4,d5,q8, q11			//q8 for l_sqiff	reset for every 16x16
+    SSD_MUL_SUM_16BYTES_RESET d4,d5,q8, q11         //q8 for l_sqiff    reset for every 16x16
 .endm
 
 .macro SAD_SSD_16_RESET_8x8 arg0, arg1, arg2, arg3
-	SAD_VAR_16_RESET_8x8 \arg0, \arg1, \arg2, \arg3
+    SAD_VAR_16_RESET_8x8 \arg0, \arg1, \arg2, \arg3
 
-	SSD_MUL_SUM_16BYTES d4,d5,q8, q11			//q8 for l_sqiff	reset for every 16x16
+    SSD_MUL_SUM_16BYTES d4,d5,q8, q11           //q8 for l_sqiff    reset for every 16x16
 .endm
 
 .macro SAD_SSD_16x16 arg0, arg1, arg2
-	//for one 8x16
-	SAD_SSD_16_RESET_16x16 \arg0, \arg1, \arg2, q6
-	SAD_SSD_16 \arg0, \arg1, \arg2, q6
-	SAD_SSD_16 \arg0, \arg1, \arg2, q6
-	SAD_SSD_16 \arg0, \arg1, \arg2, q6
-	SAD_SSD_16 \arg0, \arg1, \arg2, q6
-	SAD_SSD_16 \arg0, \arg1, \arg2, q6
-	SAD_SSD_16 \arg0, \arg1, \arg2, q6
-	SAD_SSD_16 \arg0, \arg1, \arg2, q6
+    //for one 8x16
+    SAD_SSD_16_RESET_16x16 \arg0, \arg1, \arg2, q6
+    SAD_SSD_16 \arg0, \arg1, \arg2, q6
+    SAD_SSD_16 \arg0, \arg1, \arg2, q6
+    SAD_SSD_16 \arg0, \arg1, \arg2, q6
+    SAD_SSD_16 \arg0, \arg1, \arg2, q6
+    SAD_SSD_16 \arg0, \arg1, \arg2, q6
+    SAD_SSD_16 \arg0, \arg1, \arg2, q6
+    SAD_SSD_16 \arg0, \arg1, \arg2, q6
 
-	vpaddl.u16 q6, q6
-	vpaddl.u32 q6, q6
-	vadd.i32 q12, q6
+    vpaddl.u16 q6, q6
+    vpaddl.u32 q6, q6
+    vadd.i32 q12, q6
 
-	//for another 8x16
-	SAD_SSD_16_RESET_8x8 \arg0, \arg1, \arg2, q7
-	SAD_SSD_16 \arg0, \arg1, \arg2, q7
-	SAD_SSD_16 \arg0, \arg1, \arg2, q7
-	SAD_SSD_16 \arg0, \arg1, \arg2, q7
-	SAD_SSD_16 \arg0, \arg1, \arg2, q7
-	SAD_SSD_16 \arg0, \arg1, \arg2, q7
-	SAD_SSD_16 \arg0, \arg1, \arg2, q7
-	SAD_SSD_16_END \arg0, \arg2, q7
+    //for another 8x16
+    SAD_SSD_16_RESET_8x8 \arg0, \arg1, \arg2, q7
+    SAD_SSD_16 \arg0, \arg1, \arg2, q7
+    SAD_SSD_16 \arg0, \arg1, \arg2, q7
+    SAD_SSD_16 \arg0, \arg1, \arg2, q7
+    SAD_SSD_16 \arg0, \arg1, \arg2, q7
+    SAD_SSD_16 \arg0, \arg1, \arg2, q7
+    SAD_SSD_16 \arg0, \arg1, \arg2, q7
+    SAD_SSD_16_END \arg0, \arg2, q7
 
-	vpaddl.u16 q7, q7
-	vpaddl.u32 q7, q7
+    vpaddl.u16 q7, q7
+    vpaddl.u32 q7, q7
 
-	vadd.i32 q12, q7
+    vadd.i32 q12, q7
 .endm
 #endif
 
 
 WELS_ASM_FUNC_BEGIN VAACalcSadSsd_neon
-	stmdb sp!, {r4-r12}
-	vpush {q4}
-	vpush {q6-q7}
+    stmdb sp!, {r4-r12}
+    vpush {q4}
+    vpush {q6-q7}
 
-	ldr r4, [sp, #84] //r4 keeps the pic_stride
+    ldr r4, [sp, #84] //r4 keeps the pic_stride
 
-	sub r5, r4, #1
-	lsl r5, r5, #4 //r5 keeps the little step
+    sub r5, r4, #1
+    lsl r5, r5, #4 //r5 keeps the little step
 
-	lsl r6, r4, #4
-	sub r6, r2, r6	//r6 keeps the big step
+    lsl r6, r4, #4
+    sub r6, r2, r6  //r6 keeps the big step
 
-	ldr r7,		[sp, #88]	//psadframe
-	ldr r8,		[sp, #92]	//psad8x8
-	ldr r9,		[sp, #96]	//psum16x16
-	ldr r10,	[sp, #100]	//psqsum16x16
-	ldr r11,	[sp, #104]	//psqdiff16x16
+    ldr r7,     [sp, #88]   //psadframe
+    ldr r8,     [sp, #92]   //psad8x8
+    ldr r9,     [sp, #96]   //psum16x16
+    ldr r10,    [sp, #100]  //psqsum16x16
+    ldr r11,    [sp, #104]  //psqdiff16x16
 
-	vmov.i8 q12, #0
+    vmov.i8 q12, #0
 vaa_calc_sad_ssd_height_loop:
 
     mov r12, r2
@@ -1136,18 +1136,18 @@
 
     bne vaa_calc_sad_ssd_width_loop
 
-    sub r0, r0, r6		//jump to next 16 x width
-    sub r1, r1, r6		//jump to next 16 x width
+    sub r0, r0, r6      //jump to next 16 x width
+    sub r1, r1, r6      //jump to next 16 x width
 
     subs r3, #16
-	bne vaa_calc_sad_ssd_height_loop
+    bne vaa_calc_sad_ssd_height_loop
 
-	vadd.i32 d24, d24, d25
-	vst1.32 {d24[0]}, [r7]
+    vadd.i32 d24, d24, d25
+    vst1.32 {d24[0]}, [r7]
 
-	vpop {q6-q7}
-	vpop {q4}
-	ldmia sp!, {r4-r12}
+    vpop {q6-q7}
+    vpop {q4}
+    ldmia sp!, {r4-r12}
 WELS_ASM_FUNC_END
 
 #endif
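
Editor's note: relative to the CalcSadVar_ref sketch above, the SadSsd variant additionally keeps a per-16x16 sum of squared differences (the q8 "l_sqiff" register in the comments) and stores it through the psqdiff16x16 pointer loaded from the stack. A minimal sketch of that extra term, with an assumed function name:

    /* Sketch only: the additional squared-difference accumulation per 16x16 block. */
    #include <stdint.h>
    static int32_t Sqdiff16x16_ref(const uint8_t* cur, const uint8_t* ref,
                                   int32_t stride) {
        int32_t sqdiff = 0;
        for (int32_t y = 0; y < 16; y++)
            for (int32_t x = 0; x < 16; x++) {
                int32_t d = (int32_t)cur[y * stride + x] - ref[y * stride + x];
                sqdiff += d * d;
            }
        return sqdiff;
    }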
--- a/codec/processing/src/x86/denoisefilter.asm
+++ b/codec/processing/src/x86/denoisefilter.asm
@@ -56,217 +56,217 @@
 ;***********************************************************************
 SECTION .text
 
-%macro	WEIGHT_LINE	9
-		movq		%2,	%9
-		punpcklbw	%2,	%7
-		movdqa		%8,	%2
+%macro WEIGHT_LINE  9
+    movq        %2, %9
+    punpcklbw   %2, %7
+    movdqa      %8, %2
 
-		movdqa		%1,	%6
-		psubusb		%1,	%8
-		psubusb		%8,	%6
-		por			%8,	%1		; ABS(curPixel - centerPixel);
+    movdqa      %1, %6
+    psubusb     %1, %8
+    psubusb     %8, %6
+    por         %8, %1      ; ABS(curPixel - centerPixel);
 
-		movdqa		%1,	%3
-		psubusb		%1,	%8
+    movdqa      %1, %3
+    psubusb     %1, %8
 
-		pmullw		%1,	%1
-		psrlw		%1,	5
-		pmullw		%2,	%1
-		paddusw		%4,	%1
-		paddusw		%5,	%2
+    pmullw      %1, %1
+    psrlw       %1, 5
+    pmullw      %2, %1
+    paddusw     %4, %1
+    paddusw     %5, %2
 %endmacro
 
-%macro	WEIGHT_LINE1_UV	4
-		movdqa		%2,	%1
-		punpcklbw	%2,	%4
-		paddw		%3,	%2
+%macro WEIGHT_LINE1_UV  4
+    movdqa      %2, %1
+    punpcklbw   %2, %4
+    paddw       %3, %2
 
-		movdqa		%2,	%1
-		psrldq		%2,	1
-		punpcklbw	%2,	%4
-		paddw		%3,	%2
+    movdqa      %2, %1
+    psrldq      %2, 1
+    punpcklbw   %2, %4
+    paddw       %3, %2
 
-		movdqa		%2,	%1
-		psrldq		%2,	2
-		punpcklbw	%2,	%4
-		psllw		%2,	1
-		paddw		%3,	%2
+    movdqa      %2, %1
+    psrldq      %2, 2
+    punpcklbw   %2, %4
+    psllw       %2, 1
+    paddw       %3, %2
 
-		movdqa		%2,	%1
-		psrldq		%2,	3
-		punpcklbw	%2,	%4
-		paddw		%3,	%2
+    movdqa      %2, %1
+    psrldq      %2, 3
+    punpcklbw   %2, %4
+    paddw       %3, %2
 
-		movdqa		%2,	%1
-		psrldq		%2,	4
-		punpcklbw	%2,	%4
-		paddw		%3,	%2
+    movdqa      %2, %1
+    psrldq      %2, 4
+    punpcklbw   %2, %4
+    paddw       %3, %2
 %endmacro
 
-%macro	WEIGHT_LINE2_UV	4
-		movdqa		%2,	%1
-		punpcklbw	%2,	%4
-		paddw		%3,	%2
+%macro WEIGHT_LINE2_UV  4
+    movdqa      %2, %1
+    punpcklbw   %2, %4
+    paddw       %3, %2
 
-		movdqa		%2,	%1
-		psrldq		%2,	1
-		punpcklbw	%2,	%4
-		psllw		%2,	1
-		paddw		%3,	%2
+    movdqa      %2, %1
+    psrldq      %2, 1
+    punpcklbw   %2, %4
+    psllw       %2, 1
+    paddw       %3, %2
 
-		movdqa		%2,	%1
-		psrldq		%2,	2
-		punpcklbw	%2,	%4
-		psllw		%2,	2
-		paddw		%3,	%2
+    movdqa      %2, %1
+    psrldq      %2, 2
+    punpcklbw   %2, %4
+    psllw       %2, 2
+    paddw       %3, %2
 
-		movdqa		%2,	%1
-		psrldq		%2,	3
-		punpcklbw	%2,	%4
-		psllw		%2,	1
-		paddw		%3,	%2
+    movdqa      %2, %1
+    psrldq      %2, 3
+    punpcklbw   %2, %4
+    psllw       %2, 1
+    paddw       %3, %2
 
-		movdqa		%2,	%1
-		psrldq		%2,	4
-		punpcklbw	%2,	%4
-		paddw		%3,	%2
+    movdqa      %2, %1
+    psrldq      %2, 4
+    punpcklbw   %2, %4
+    paddw       %3, %2
 %endmacro
 
-%macro	WEIGHT_LINE3_UV	4
-		movdqa		%2,	%1
-		punpcklbw	%2,	%4
-		psllw		%2,	1
-		paddw		%3,	%2
+%macro WEIGHT_LINE3_UV  4
+    movdqa      %2, %1
+    punpcklbw   %2, %4
+    psllw       %2, 1
+    paddw       %3, %2
 
-		movdqa		%2,	%1
-		psrldq		%2,	1
-		punpcklbw	%2,	%4
-		psllw		%2,	2
-		paddw		%3,	%2
+    movdqa      %2, %1
+    psrldq      %2, 1
+    punpcklbw   %2, %4
+    psllw       %2, 2
+    paddw       %3, %2
 
-		movdqa		%2,	%1
-		psrldq		%2,	2
-		punpcklbw	%2,	%4
-		pmullw		%2,	[sse2_20]
-		paddw		%3,	%2
+    movdqa      %2, %1
+    psrldq      %2, 2
+    punpcklbw   %2, %4
+    pmullw      %2, [sse2_20]
+    paddw       %3, %2
 
-		movdqa		%2,	%1
-		psrldq		%2,	3
-		punpcklbw	%2,	%4
-		psllw		%2,	2
-		paddw		%3,	%2
+    movdqa      %2, %1
+    psrldq      %2, 3
+    punpcklbw   %2, %4
+    psllw       %2, 2
+    paddw       %3, %2
 
-		movdqa		%2,	%1
-		psrldq		%2,	4
-		punpcklbw	%2,	%4
-		psllw		%2,	1
-		paddw		%3,	%2
+    movdqa      %2, %1
+    psrldq      %2, 4
+    punpcklbw   %2, %4
+    psllw       %2, 1
+    paddw       %3, %2
 %endmacro
 
 ;***********************************************************************
 ;  BilateralLumaFilter8_sse2(uint8_t *pixels, int stride);
 ;***********************************************************************
-;	1	2	3
-;	4	0	5
-;	6	7	8
-;	0:	the center point
+;   1   2   3
+;   4   0   5
+;   6   7   8
+;   0:  the center point
 
 WELS_EXTERN BilateralLumaFilter8_sse2
 
-        push r3
-        %assign push_num 1
-        LOAD_2_PARA
-        PUSH_XMM 8
+    push r3
+    %assign push_num 1
+    LOAD_2_PARA
+    PUSH_XMM 8
 
-		pxor		xmm7,	xmm7
+    pxor        xmm7,   xmm7
 
-		mov         r3,     r0
+    mov         r3,     r0
 
-		movq        xmm6,   [r0]
-		punpcklbw	xmm6,	xmm7
-		movdqa		xmm3,	[sse2_32]
-		pxor		xmm4,	xmm4		; nTotWeight
-		pxor		xmm5,	xmm5		; nSum
+    movq        xmm6,   [r0]
+    punpcklbw   xmm6,   xmm7
+    movdqa      xmm3,   [sse2_32]
+    pxor        xmm4,   xmm4        ; nTotWeight
+    pxor        xmm5,   xmm5        ; nSum
 
-        dec         r0
-		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [r0]			; pixel 4
-		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [r0 + 2]		; pixel 5
+    dec         r0
+    WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [r0]           ; pixel 4
+    WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [r0 + 2]       ; pixel 5
 
-		sub			r0,	r1
-		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [r0]			; pixel 1
-		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [r0 + 1]		; pixel 2
-		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [r0 + 2]		; pixel 3
+    sub         r0, r1
+    WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [r0]           ; pixel 1
+    WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [r0 + 1]       ; pixel 2
+    WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [r0 + 2]       ; pixel 3
 
-		lea			r0,	[r0 + r1 * 2]
-		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [r0]			; pixel 6
-		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [r0 + 1]		; pixel 7
-		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [r0 + 2]		; pixel 8
+    lea         r0, [r0 + r1 * 2]
+    WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [r0]           ; pixel 6
+    WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [r0 + 1]       ; pixel 7
+    WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [r0 + 2]       ; pixel 8
 
-		pcmpeqw		xmm0,	xmm0
-		psrlw		xmm0,	15
-		psllw		xmm0,	8
-		psubusw		xmm0,	xmm4
-		pmullw		xmm0,	xmm6
-		paddusw		xmm5,	xmm0
-		psrlw		xmm5,	8
-		packuswb	xmm5,	xmm5
-		movq		[r3],	xmm5
+    pcmpeqw     xmm0,   xmm0
+    psrlw       xmm0,   15
+    psllw       xmm0,   8
+    psubusw     xmm0,   xmm4
+    pmullw      xmm0,   xmm6
+    paddusw     xmm5,   xmm0
+    psrlw       xmm5,   8
+    packuswb    xmm5,   xmm5
+    movq        [r3],   xmm5
 
 
-		POP_XMM
-		pop r3
-		%assign push_num 0
+    POP_XMM
+    pop r3
+    %assign push_num 0
 
-		ret
+    ret
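
Editor's note: a scalar sketch of one output pixel of BilateralLumaFilter8_sse2 may help when reading the reformatted WEIGHT_LINE macro. Following the math above, each of the 8 neighbours (layout per the 1..8/0 comment) gets weight = (max(32 - |p - center|, 0))^2 >> 5, and the result is (center * (256 - totWeight) + sum(weight * p)) >> 8. The word-level saturation of the SSE2 code and its 8-pixels-at-a-time operation are not reproduced; this is a reading aid, not the project's reference implementation.

    #include <stdint.h>
    #include <stdlib.h>
    static uint8_t BilateralLuma_ref(const uint8_t* pixels, int stride) {
        int c = pixels[0], tot_weight = 0, sum = 0;
        const int off[8] = { -stride - 1, -stride, -stride + 1, -1, 1,
                             stride - 1, stride, stride + 1 };
        for (int i = 0; i < 8; i++) {
            int p = pixels[off[i]];
            int diff = abs(p - c);
            int w = diff >= 32 ? 0 : ((32 - diff) * (32 - diff)) >> 5;
            tot_weight += w;                      /* nTotWeight */
            sum        += w * p;                  /* nSum */
        }
        return (uint8_t)((c * (256 - tot_weight) + sum) >> 8);
    }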
 
 ;***********************************************************************
-; void		WaverageChromaFilter8_sse2(uint8_t *pixels, int stride);
+; void      WaverageChromaFilter8_sse2(uint8_t *pixels, int stride);
 ;***********************************************************************
 ;5x5 filter:
-;1	1	2	1	1
-;1	2	4	2	1
-;2	4	20	4	2
-;1	2	4	2	1
-;1	1	2	1	1
+;1  1   2   1   1
+;1  2   4   2   1
+;2  4   20  4   2
+;1  2   4   2   1
+;1  1   2   1   1
 
 WELS_EXTERN WaverageChromaFilter8_sse2
 
-        push r3
+    push r3
 
-        %assign push_num 1
+    %assign push_num 1
 
-        LOAD_2_PARA
+    LOAD_2_PARA
 
-        mov		r3,	r1
-		add		r3,	r3
-		sub		r0,	r3			; pixels - 2 * stride
-		sub		r0,	2
+    mov     r3, r1
+    add     r3, r3
+    sub     r0, r3          ; pixels - 2 * stride
+    sub     r0, 2
 
-		pxor	xmm0,	xmm0
-		pxor	xmm3,	xmm3
+    pxor    xmm0,   xmm0
+    pxor    xmm3,   xmm3
 
-		movdqu		xmm1,	[r0]
-		WEIGHT_LINE1_UV	xmm1,	xmm2,	xmm3,	xmm0
+    movdqu      xmm1,   [r0]
+    WEIGHT_LINE1_UV xmm1,   xmm2,   xmm3,   xmm0
 
-		movdqu		xmm1,	[r0 + r1]
-		WEIGHT_LINE2_UV	xmm1,	xmm2,	xmm3,	xmm0
+    movdqu      xmm1,   [r0 + r1]
+    WEIGHT_LINE2_UV xmm1,   xmm2,   xmm3,   xmm0
 
-		add		r0,	r3
-		movdqu		xmm1,	[r0]
-		WEIGHT_LINE3_UV	xmm1,	xmm2,	xmm3,	xmm0
+    add     r0, r3
+    movdqu      xmm1,   [r0]
+    WEIGHT_LINE3_UV xmm1,   xmm2,   xmm3,   xmm0
 
-		movdqu		xmm1,	[r0 + r1]
-		WEIGHT_LINE2_UV	xmm1,	xmm2,	xmm3,	xmm0
+    movdqu      xmm1,   [r0 + r1]
+    WEIGHT_LINE2_UV xmm1,   xmm2,   xmm3,   xmm0
 
-		movdqu		xmm1,	[r0 + r1 * 2]
-		WEIGHT_LINE1_UV	xmm1,	xmm2,	xmm3,	xmm0
+    movdqu      xmm1,   [r0 + r1 * 2]
+    WEIGHT_LINE1_UV xmm1,   xmm2,   xmm3,   xmm0
 
-		psrlw		xmm3,		6
-		packuswb	xmm3,		xmm3
-		movq		[r0 + 2],		xmm3
+    psrlw       xmm3,       6
+    packuswb    xmm3,       xmm3
+    movq        [r0 + 2],       xmm3
 
 
-        pop r3
+    pop r3
 
-        %assign push_num 0
-		ret
+    %assign push_num 0
+    ret
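
Editor's note: WaverageChromaFilter8_sse2 is a 5x5 weighted average with the weight table given in the comment block above; the weights sum to 64, which matches the final psrlw by 6. A scalar sketch for a single center pixel (the SSE2 path handles 8 chroma pixels per call and truncates rather than rounds, which the sketch mirrors):

    #include <stdint.h>
    static uint8_t WaverageChroma_ref(const uint8_t* pixels, int stride) {
        static const int kW[5][5] = {
            { 1, 1,  2, 1, 1 },
            { 1, 2,  4, 2, 1 },
            { 2, 4, 20, 4, 2 },
            { 1, 2,  4, 2, 1 },
            { 1, 1,  2, 1, 1 },
        };
        int sum = 0;
        for (int dy = -2; dy <= 2; dy++)
            for (int dx = -2; dx <= 2; dx++)
                sum += kW[dy + 2][dx + 2] * pixels[dy * stride + dx];
        return (uint8_t)(sum >> 6);               /* weights sum to 64 */
    }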
--- a/codec/processing/src/x86/downsample_bilinear.asm
+++ b/codec/processing/src/x86/downsample_bilinear.asm
@@ -29,13 +29,13 @@
 ;*     POSSIBILITY OF SUCH DAMAGE.
 ;*
 ;*
-;*	upsampling.asm
+;*  upsampling.asm
 ;*
 ;*  Abstract
-;*		SIMD for pixel domain down sampling
+;*      SIMD for pixel domain down sampling
 ;*
 ;*  History
-;*		10/22/2009	Created
+;*      10/22/2009  Created
 ;*
 ;*************************************************************************/
 %include "asm_inc.asm"
@@ -61,9 +61,9 @@
 
 ALIGN 16
 shufb_mask_low:
-	db 00h, 80h, 02h, 80h, 04h, 80h, 06h, 80h, 08h, 80h, 0ah, 80h, 0ch, 80h, 0eh, 80h
+    db 00h, 80h, 02h, 80h, 04h, 80h, 06h, 80h, 08h, 80h, 0ah, 80h, 0ch, 80h, 0eh, 80h
 shufb_mask_high:
-	db 01h, 80h, 03h, 80h, 05h, 80h, 07h, 80h, 09h, 80h, 0bh, 80h, 0dh, 80h, 0fh, 80h
+    db 01h, 80h, 03h, 80h, 05h, 80h, 07h, 80h, 09h, 80h, 0bh, 80h, 0dh, 80h, 0fh, 80h
 
 
 ;***********************************************************************
@@ -73,737 +73,737 @@
 SECTION .text
 
 ;***********************************************************************
-;	void DyadicBilinearDownsamplerWidthx32_sse(	unsigned char* pDst, const int iDstStride,
-;					unsigned char* pSrc, const int iSrcStride,
-;					const int iSrcWidth, const int iSrcHeight );
+;   void DyadicBilinearDownsamplerWidthx32_sse( unsigned char* pDst, const int iDstStride,
+;                   unsigned char* pSrc, const int iSrcStride,
+;                   const int iSrcWidth, const int iSrcHeight );
 ;***********************************************************************
 WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
+    push ebx
+    push edx
+    push esi
+    push edi
+    push ebp
 
-	mov edi, [esp+24]	; pDst
-	mov edx, [esp+28]	; iDstStride
-	mov esi, [esp+32]	; pSrc
-	mov ecx, [esp+36]	; iSrcStride
-	mov ebp, [esp+44]	; iSrcHeight
+    mov edi, [esp+24]   ; pDst
+    mov edx, [esp+28]   ; iDstStride
+    mov esi, [esp+32]   ; pSrc
+    mov ecx, [esp+36]   ; iSrcStride
+    mov ebp, [esp+44]   ; iSrcHeight
 
-	sar ebp, $01			; iSrcHeight >> 1
+    sar ebp, $01            ; iSrcHeight >> 1
 
 .yloops:
-	mov eax, [esp+40]	; iSrcWidth
-	sar eax, $01			; iSrcWidth >> 1
-	mov ebx, eax		; iDstWidth restored at ebx
-	sar eax, $04			; (iSrcWidth >> 1) / 16		; loop count = num_of_mb
-	neg ebx				; - (iSrcWidth >> 1)
-	; each loop = source bandwidth: 32 bytes
+    mov eax, [esp+40]   ; iSrcWidth
+    sar eax, $01            ; iSrcWidth >> 1
+    mov ebx, eax        ; iDstWidth restored at ebx
+    sar eax, $04            ; (iSrcWidth >> 1) / 16     ; loop count = num_of_mb
+    neg ebx             ; - (iSrcWidth >> 1)
+    ; each loop = source bandwidth: 32 bytes
 .xloops:
-	; 1st part horizonal loop: x16 bytes
-	;               mem  hi<-       ->lo
-	;1st Line Src:	mm0: d D c C b B a A	mm1: h H g G f F e E
-	;2nd Line Src:	mm2: l L k K j J i I   	mm3: p P o O n N m M
-	;=> target:
-	;: H G F E D C B A, P O N M L K J I
-	;: h g f e d c b a, p o n m l k j i
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-	movq mm0, [esi]			; 1st pSrc line
-	movq mm1, [esi+8]		; 1st pSrc line + 8
-	movq mm2, [esi+ecx]		; 2nd pSrc line
-	movq mm3, [esi+ecx+8]	; 2nd pSrc line + 8
+    ; 1st part horizonal loop: x16 bytes
+    ;               mem  hi<-       ->lo
+    ;1st Line Src:  mm0: d D c C b B a A    mm1: h H g G f F e E
+    ;2nd Line Src:  mm2: l L k K j J i I    mm3: p P o O n N m M
+    ;=> target:
+    ;: H G F E D C B A, P O N M L K J I
+    ;: h g f e d c b a, p o n m l k j i
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+    movq mm0, [esi]         ; 1st pSrc line
+    movq mm1, [esi+8]       ; 1st pSrc line + 8
+    movq mm2, [esi+ecx]     ; 2nd pSrc line
+    movq mm3, [esi+ecx+8]   ; 2nd pSrc line + 8
 
-	; to handle mm0, mm1, mm2, mm3
-	pshufw mm4, mm0, 0d8h	; d D b B c C a A ; 11011000 B
-	pshufw mm5, mm4, 04eh	; c C a A d D b B ; 01001110 B
-	punpcklbw mm4, mm5		; d c D C b a B A
-	pshufw mm4, mm4, 0d8h  	; d c b a D C B A ; 11011000 B: mm4
+    ; to handle mm0, mm1, mm2, mm3
+    pshufw mm4, mm0, 0d8h   ; d D b B c C a A ; 11011000 B
+    pshufw mm5, mm4, 04eh   ; c C a A d D b B ; 01001110 B
+    punpcklbw mm4, mm5      ; d c D C b a B A
+    pshufw mm4, mm4, 0d8h   ; d c b a D C B A ; 11011000 B: mm4
 
-	pshufw mm5, mm1, 0d8h	; h H f F g G e E ; 11011000 B
-	pshufw mm6, mm5, 04eh	; g G e E h H f F ; 01001110 B
-	punpcklbw mm5, mm6		; h g H G f e F E
-	pshufw mm5, mm5, 0d8h  	; h g f e H G F E ; 11011000 B: mm5
+    pshufw mm5, mm1, 0d8h   ; h H f F g G e E ; 11011000 B
+    pshufw mm6, mm5, 04eh   ; g G e E h H f F ; 01001110 B
+    punpcklbw mm5, mm6      ; h g H G f e F E
+    pshufw mm5, mm5, 0d8h   ; h g f e H G F E ; 11011000 B: mm5
 
-	pshufw mm6, mm2, 0d8h	; l L j J k K i I ; 11011000 B
-	pshufw mm7, mm6, 04eh	; k K i I l L j J ; 01001110 B
-	punpcklbw mm6, mm7		; l k L K j i J I
-	pshufw mm6, mm6, 0d8h  	; l k j i L K J I ; 11011000 B: mm6
+    pshufw mm6, mm2, 0d8h   ; l L j J k K i I ; 11011000 B
+    pshufw mm7, mm6, 04eh   ; k K i I l L j J ; 01001110 B
+    punpcklbw mm6, mm7      ; l k L K j i J I
+    pshufw mm6, mm6, 0d8h   ; l k j i L K J I ; 11011000 B: mm6
 
-	pshufw mm7, mm3, 0d8h	; p P n N o O m M ; 11011000 B
-	pshufw mm0, mm7, 04eh	; o O m M p P n N ; 01001110 B
-	punpcklbw mm7, mm0 		; p o P O n m N M
-	pshufw mm7, mm7, 0d8h  	; p o n m P O N M ; 11011000 B: mm7
+    pshufw mm7, mm3, 0d8h   ; p P n N o O m M ; 11011000 B
+    pshufw mm0, mm7, 04eh   ; o O m M p P n N ; 01001110 B
+    punpcklbw mm7, mm0      ; p o P O n m N M
+    pshufw mm7, mm7, 0d8h   ; p o n m P O N M ; 11011000 B: mm7
 
-	; to handle mm4, mm5, mm6, mm7
-	movq mm0, mm4		;
-	punpckldq mm0, mm5 	; H G F E D C B A
-	punpckhdq mm4, mm5 	; h g f e d c b a
+    ; to handle mm4, mm5, mm6, mm7
+    movq mm0, mm4       ;
+    punpckldq mm0, mm5  ; H G F E D C B A
+    punpckhdq mm4, mm5  ; h g f e d c b a
 
-	movq mm1, mm6
-	punpckldq mm1, mm7 	; P O N M L K J I
-	punpckhdq mm6, mm7 	; p o n m l k j i
+    movq mm1, mm6
+    punpckldq mm1, mm7  ; P O N M L K J I
+    punpckhdq mm6, mm7  ; p o n m l k j i
 
-	; avg within MB horizon width (16 x 2 lines)
-	pavgb mm0, mm4		; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
-	pavgb mm1, mm6		; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
-	pavgb mm0, mm1		; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once
+    ; avg within MB horizon width (16 x 2 lines)
+    pavgb mm0, mm4      ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
+    pavgb mm1, mm6      ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
+    pavgb mm0, mm1      ; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once
 
-	; 2nd part horizonal loop: x16 bytes
-	;               mem  hi<-       ->lo
-	;1st Line Src:	mm0: d D c C b B a A	mm1: h H g G f F e E
-	;2nd Line Src:	mm2: l L k K j J i I   	mm3: p P o O n N m M
-	;=> target:
-	;: H G F E D C B A, P O N M L K J I
-	;: h g f e d c b a, p o n m l k j i
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-	movq mm1, [esi+16]		; 1st pSrc line + 16
-	movq mm2, [esi+24]		; 1st pSrc line + 24
-	movq mm3, [esi+ecx+16]	; 2nd pSrc line + 16
-	movq mm4, [esi+ecx+24]	; 2nd pSrc line + 24
+    ; 2nd part horizonal loop: x16 bytes
+    ;               mem  hi<-       ->lo
+    ;1st Line Src:  mm0: d D c C b B a A    mm1: h H g G f F e E
+    ;2nd Line Src:  mm2: l L k K j J i I    mm3: p P o O n N m M
+    ;=> target:
+    ;: H G F E D C B A, P O N M L K J I
+    ;: h g f e d c b a, p o n m l k j i
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+    movq mm1, [esi+16]      ; 1st pSrc line + 16
+    movq mm2, [esi+24]      ; 1st pSrc line + 24
+    movq mm3, [esi+ecx+16]  ; 2nd pSrc line + 16
+    movq mm4, [esi+ecx+24]  ; 2nd pSrc line + 24
 
-	; to handle mm1, mm2, mm3, mm4
-	pshufw mm5, mm1, 0d8h	; d D b B c C a A ; 11011000 B
-	pshufw mm6, mm5, 04eh	; c C a A d D b B ; 01001110 B
-	punpcklbw mm5, mm6		; d c D C b a B A
-	pshufw mm5, mm5, 0d8h  	; d c b a D C B A ; 11011000 B: mm5
+    ; to handle mm1, mm2, mm3, mm4
+    pshufw mm5, mm1, 0d8h   ; d D b B c C a A ; 11011000 B
+    pshufw mm6, mm5, 04eh   ; c C a A d D b B ; 01001110 B
+    punpcklbw mm5, mm6      ; d c D C b a B A
+    pshufw mm5, mm5, 0d8h   ; d c b a D C B A ; 11011000 B: mm5
 
-	pshufw mm6, mm2, 0d8h	; h H f F g G e E ; 11011000 B
-	pshufw mm7, mm6, 04eh	; g G e E h H f F ; 01001110 B
-	punpcklbw mm6, mm7		; h g H G f e F E
-	pshufw mm6, mm6, 0d8h  	; h g f e H G F E ; 11011000 B: mm6
+    pshufw mm6, mm2, 0d8h   ; h H f F g G e E ; 11011000 B
+    pshufw mm7, mm6, 04eh   ; g G e E h H f F ; 01001110 B
+    punpcklbw mm6, mm7      ; h g H G f e F E
+    pshufw mm6, mm6, 0d8h   ; h g f e H G F E ; 11011000 B: mm6
 
-	pshufw mm7, mm3, 0d8h	; l L j J k K i I ; 11011000 B
-	pshufw mm1, mm7, 04eh	; k K i I l L j J ; 01001110 B
-	punpcklbw mm7, mm1		; l k L K j i J I
-	pshufw mm7, mm7, 0d8h  	; l k j i L K J I ; 11011000 B: mm7
+    pshufw mm7, mm3, 0d8h   ; l L j J k K i I ; 11011000 B
+    pshufw mm1, mm7, 04eh   ; k K i I l L j J ; 01001110 B
+    punpcklbw mm7, mm1      ; l k L K j i J I
+    pshufw mm7, mm7, 0d8h   ; l k j i L K J I ; 11011000 B: mm7
 
-	pshufw mm1, mm4, 0d8h	; p P n N o O m M ; 11011000 B
-	pshufw mm2, mm1, 04eh	; o O m M p P n N ; 01001110 B
-	punpcklbw mm1, mm2 		; p o P O n m N M
-	pshufw mm1, mm1, 0d8h  	; p o n m P O N M ; 11011000 B: mm1
+    pshufw mm1, mm4, 0d8h   ; p P n N o O m M ; 11011000 B
+    pshufw mm2, mm1, 04eh   ; o O m M p P n N ; 01001110 B
+    punpcklbw mm1, mm2      ; p o P O n m N M
+    pshufw mm1, mm1, 0d8h   ; p o n m P O N M ; 11011000 B: mm1
 
-	; to handle mm5, mm6, mm7, mm1
-	movq mm2, mm5
-	punpckldq mm2, mm6 	; H G F E D C B A
-	punpckhdq mm5, mm6 	; h g f e d c b a
+    ; to handle mm5, mm6, mm7, mm1
+    movq mm2, mm5
+    punpckldq mm2, mm6  ; H G F E D C B A
+    punpckhdq mm5, mm6  ; h g f e d c b a
 
-	movq mm3, mm7
-	punpckldq mm3, mm1 	; P O N M L K J I
-	punpckhdq mm7, mm1 	; p o n m l k j i
+    movq mm3, mm7
+    punpckldq mm3, mm1  ; P O N M L K J I
+    punpckhdq mm7, mm1  ; p o n m l k j i
 
-	; avg within MB horizon width (16 x 2 lines)
-	pavgb mm2, mm5		; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
-	pavgb mm3, mm7		; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
-	pavgb mm2, mm3		; (temp_row1+temp_row2+1)>>1, done in another 2nd horizonal part
+    ; avg within MB horizon width (16 x 2 lines)
+    pavgb mm2, mm5      ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
+    pavgb mm3, mm7      ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
+    pavgb mm2, mm3      ; (temp_row1+temp_row2+1)>>1, done in another 2nd horizonal part
 
-	movq [edi  ], mm0
-	movq [edi+8], mm2
+    movq [edi  ], mm0
+    movq [edi+8], mm2
 
-	; next SMB
-	lea esi, [esi+32]
-	lea edi, [edi+16]
+    ; next SMB
+    lea esi, [esi+32]
+    lea edi, [edi+16]
 
-	dec eax
-	jg near .xloops
+    dec eax
+    jg near .xloops
 
-	; next line
-	lea esi, [esi+2*ecx]	; next end of lines
-	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
-	lea edi, [edi+edx]
-	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
+    ; next line
+    lea esi, [esi+2*ecx]    ; next end of lines
+    lea esi, [esi+2*ebx]    ; reset to base 0 [- 2 * iDstWidth]
+    lea edi, [edi+edx]
+    lea edi, [edi+ebx]      ; reset to base 0 [- iDstWidth]
 
-	dec ebp
-	jg near .yloops
+    dec ebp
+    jg near .yloops
 
-	WELSEMMS
-	pop ebp
-	pop	edi
-	pop esi
-	pop edx
-	pop ebx
-	ret
+    WELSEMMS
+    pop ebp
+    pop edi
+    pop esi
+    pop edx
+    pop ebx
+    ret
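
Editor's note: the DyadicBilinearDownsampler* routines in this file halve both dimensions, producing each output pixel from a 2x2 source block. The SIMD code averages horizontal pairs with pavgb and then averages the two row results, so it rounds at each stage; the single-rounding scalar form below is the usual reference and can differ from the SIMD result by 1 in the low bit. Function and parameter names follow the comment header, not necessarily any C fallback in the tree.

    #include <stdint.h>
    static void DyadicBilinearDownsampler_ref(uint8_t* pDst, int iDstStride,
                                              const uint8_t* pSrc, int iSrcStride,
                                              int iSrcWidth, int iSrcHeight) {
        for (int y = 0; y < iSrcHeight / 2; y++) {
            const uint8_t* s0 = pSrc + (2 * y) * iSrcStride;
            const uint8_t* s1 = s0 + iSrcStride;
            uint8_t* d = pDst + y * iDstStride;
            for (int x = 0; x < iSrcWidth / 2; x++)
                d[x] = (uint8_t)((s0[2 * x] + s0[2 * x + 1] +
                                  s1[2 * x] + s1[2 * x + 1] + 2) >> 2);
        }
    }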
 
 ;***********************************************************************
-;	void DyadicBilinearDownsamplerWidthx16_sse( unsigned char* pDst, const int iDstStride,
-;					  unsigned char* pSrc, const int iSrcStride,
-;					  const int iSrcWidth, const int iSrcHeight );
+;   void DyadicBilinearDownsamplerWidthx16_sse( unsigned char* pDst, const int iDstStride,
+;                     unsigned char* pSrc, const int iSrcStride,
+;                     const int iSrcWidth, const int iSrcHeight );
 ;***********************************************************************
 WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
+    push ebx
+    push edx
+    push esi
+    push edi
+    push ebp
 
-	mov edi, [esp+24]	; pDst
-	mov edx, [esp+28]	; iDstStride
-	mov esi, [esp+32]	; pSrc
-	mov ecx, [esp+36]	; iSrcStride
-	mov ebp, [esp+44]	; iSrcHeight
+    mov edi, [esp+24]   ; pDst
+    mov edx, [esp+28]   ; iDstStride
+    mov esi, [esp+32]   ; pSrc
+    mov ecx, [esp+36]   ; iSrcStride
+    mov ebp, [esp+44]   ; iSrcHeight
 
-	sar ebp, $01		; iSrcHeight >> 1
+    sar ebp, $01        ; iSrcHeight >> 1
 
 .yloops:
-	mov eax, [esp+40]	; iSrcWidth
-	sar eax, $01		; iSrcWidth >> 1
-	mov ebx, eax		; iDstWidth restored at ebx
-	sar eax, $03		; (iSrcWidth >> 1) / 8		; loop count = num_of_mb
-	neg ebx			; - (iSrcWidth >> 1)
-	; each loop = source bandwidth: 16 bytes
+    mov eax, [esp+40]   ; iSrcWidth
+    sar eax, $01        ; iSrcWidth >> 1
+    mov ebx, eax        ; iDstWidth restored at ebx
+    sar eax, $03        ; (iSrcWidth >> 1) / 8      ; loop count = num_of_mb
+    neg ebx         ; - (iSrcWidth >> 1)
+    ; each loop = source bandwidth: 16 bytes
 .xloops:
-	; 1st part horizonal loop: x16 bytes
-	;               mem  hi<-       ->lo
-	;1st Line Src:	mm0: d D c C b B a A	mm1: h H g G f F e E
-	;2nd Line Src:	mm2: l L k K j J i I   	mm3: p P o O n N m M
-	;=> target:
-	;: H G F E D C B A, P O N M L K J I
-	;: h g f e d c b a, p o n m l k j i
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-	movq mm0, [esi]			; 1st pSrc line
-	movq mm1, [esi+8]		; 1st pSrc line + 8
-	movq mm2, [esi+ecx]		; 2nd pSrc line
-	movq mm3, [esi+ecx+8]	; 2nd pSrc line + 8
+    ; 1st part horizonal loop: x16 bytes
+    ;               mem  hi<-       ->lo
+    ;1st Line Src:  mm0: d D c C b B a A    mm1: h H g G f F e E
+    ;2nd Line Src:  mm2: l L k K j J i I    mm3: p P o O n N m M
+    ;=> target:
+    ;: H G F E D C B A, P O N M L K J I
+    ;: h g f e d c b a, p o n m l k j i
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+    movq mm0, [esi]         ; 1st pSrc line
+    movq mm1, [esi+8]       ; 1st pSrc line + 8
+    movq mm2, [esi+ecx]     ; 2nd pSrc line
+    movq mm3, [esi+ecx+8]   ; 2nd pSrc line + 8
 
-	; to handle mm0, mm1, mm2, mm3
-	pshufw mm4, mm0, 0d8h	; d D b B c C a A ; 11011000 B
-	pshufw mm5, mm4, 04eh	; c C a A d D b B ; 01001110 B
-	punpcklbw mm4, mm5		; d c D C b a B A
-	pshufw mm4, mm4, 0d8h  	; d c b a D C B A ; 11011000 B: mm4
+    ; to handle mm0, mm1, mm2, mm3
+    pshufw mm4, mm0, 0d8h   ; d D b B c C a A ; 11011000 B
+    pshufw mm5, mm4, 04eh   ; c C a A d D b B ; 01001110 B
+    punpcklbw mm4, mm5      ; d c D C b a B A
+    pshufw mm4, mm4, 0d8h   ; d c b a D C B A ; 11011000 B: mm4
 
-	pshufw mm5, mm1, 0d8h	; h H f F g G e E ; 11011000 B
-	pshufw mm6, mm5, 04eh	; g G e E h H f F ; 01001110 B
-	punpcklbw mm5, mm6		; h g H G f e F E
-	pshufw mm5, mm5, 0d8h  	; h g f e H G F E ; 11011000 B: mm5
+    pshufw mm5, mm1, 0d8h   ; h H f F g G e E ; 11011000 B
+    pshufw mm6, mm5, 04eh   ; g G e E h H f F ; 01001110 B
+    punpcklbw mm5, mm6      ; h g H G f e F E
+    pshufw mm5, mm5, 0d8h   ; h g f e H G F E ; 11011000 B: mm5
 
-	pshufw mm6, mm2, 0d8h	; l L j J k K i I ; 11011000 B
-	pshufw mm7, mm6, 04eh	; k K i I l L j J ; 01001110 B
-	punpcklbw mm6, mm7		; l k L K j i J I
-	pshufw mm6, mm6, 0d8h  	; l k j i L K J I ; 11011000 B: mm6
+    pshufw mm6, mm2, 0d8h   ; l L j J k K i I ; 11011000 B
+    pshufw mm7, mm6, 04eh   ; k K i I l L j J ; 01001110 B
+    punpcklbw mm6, mm7      ; l k L K j i J I
+    pshufw mm6, mm6, 0d8h   ; l k j i L K J I ; 11011000 B: mm6
 
-	pshufw mm7, mm3, 0d8h	; p P n N o O m M ; 11011000 B
-	pshufw mm0, mm7, 04eh	; o O m M p P n N ; 01001110 B
-	punpcklbw mm7, mm0 		; p o P O n m N M
-	pshufw mm7, mm7, 0d8h  	; p o n m P O N M ; 11011000 B: mm7
+    pshufw mm7, mm3, 0d8h   ; p P n N o O m M ; 11011000 B
+    pshufw mm0, mm7, 04eh   ; o O m M p P n N ; 01001110 B
+    punpcklbw mm7, mm0      ; p o P O n m N M
+    pshufw mm7, mm7, 0d8h   ; p o n m P O N M ; 11011000 B: mm7
 
-	; to handle mm4, mm5, mm6, mm7
-	movq mm0, mm4		;
-	punpckldq mm0, mm5 	; H G F E D C B A
-	punpckhdq mm4, mm5 	; h g f e d c b a
+    ; to handle mm4, mm5, mm6, mm7
+    movq mm0, mm4       ;
+    punpckldq mm0, mm5  ; H G F E D C B A
+    punpckhdq mm4, mm5  ; h g f e d c b a
 
-	movq mm1, mm6
-	punpckldq mm1, mm7 	; P O N M L K J I
-	punpckhdq mm6, mm7 	; p o n m l k j i
+    movq mm1, mm6
+    punpckldq mm1, mm7  ; P O N M L K J I
+    punpckhdq mm6, mm7  ; p o n m l k j i
 
-	; avg within MB horizon width (16 x 2 lines)
-	pavgb mm0, mm4		; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
-	pavgb mm1, mm6		; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
-	pavgb mm0, mm1		; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once
+    ; avg within MB horizon width (16 x 2 lines)
+    pavgb mm0, mm4      ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
+    pavgb mm1, mm6      ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
+    pavgb mm0, mm1      ; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once
 
-	movq [edi  ], mm0
+    movq [edi  ], mm0
 
-	; next SMB
-	lea esi, [esi+16]
-	lea edi, [edi+8]
+    ; next SMB
+    lea esi, [esi+16]
+    lea edi, [edi+8]
 
-	dec eax
-	jg near .xloops
+    dec eax
+    jg near .xloops
 
-	; next line
-	lea esi, [esi+2*ecx]	; next end of lines
-	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
-	lea edi, [edi+edx]
-	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
+    ; next line
+    lea esi, [esi+2*ecx]    ; next end of lines
+    lea esi, [esi+2*ebx]    ; reset to base 0 [- 2 * iDstWidth]
+    lea edi, [edi+edx]
+    lea edi, [edi+ebx]      ; reset to base 0 [- iDstWidth]
 
-	dec ebp
-	jg near .yloops
+    dec ebp
+    jg near .yloops
 
-	WELSEMMS
-	pop ebp
-	pop edi
-	pop esi
-	pop edx
-	pop ebx
-	ret
+    WELSEMMS
+    pop ebp
+    pop edi
+    pop esi
+    pop edx
+    pop ebx
+    ret
 
 ;***********************************************************************
-;	void DyadicBilinearDownsamplerWidthx8_sse( unsigned char* pDst, const int iDstStride,
-;					  unsigned char* pSrc, const int iSrcStride,
-;					  const int iSrcWidth, const int iSrcHeight );
+;   void DyadicBilinearDownsamplerWidthx8_sse( unsigned char* pDst, const int iDstStride,
+;                     unsigned char* pSrc, const int iSrcStride,
+;                     const int iSrcWidth, const int iSrcHeight );
 ;***********************************************************************
 WELS_EXTERN DyadicBilinearDownsamplerWidthx8_sse
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
+    push ebx
+    push edx
+    push esi
+    push edi
+    push ebp
 
-	mov edi, [esp+24]	; pDst
-	mov edx, [esp+28]	; iDstStride
-	mov esi, [esp+32]	; pSrc
-	mov ecx, [esp+36]	; iSrcStride
-	mov ebp, [esp+44]	; iSrcHeight
+    mov edi, [esp+24]   ; pDst
+    mov edx, [esp+28]   ; iDstStride
+    mov esi, [esp+32]   ; pSrc
+    mov ecx, [esp+36]   ; iSrcStride
+    mov ebp, [esp+44]   ; iSrcHeight
 
-	sar ebp, $01		; iSrcHeight >> 1
+    sar ebp, $01        ; iSrcHeight >> 1
 
 .yloops:
-	mov eax, [esp+40]	; iSrcWidth
-	sar eax, $01		; iSrcWidth >> 1
-	mov ebx, eax		; iDstWidth restored at ebx
-	sar eax, $02		; (iSrcWidth >> 1) / 4		; loop count = num_of_mb
-	neg ebx			; - (iSrcWidth >> 1)
-	; each loop = source bandwidth: 8 bytes
+    mov eax, [esp+40]   ; iSrcWidth
+    sar eax, $01        ; iSrcWidth >> 1
+    mov ebx, eax        ; iDstWidth restored at ebx
+    sar eax, $02        ; (iSrcWidth >> 1) / 4      ; loop count = num_of_mb
+    neg ebx         ; - (iSrcWidth >> 1)
+    ; each loop = source bandwidth: 8 bytes
 .xloops:
-	; 1st part horizonal loop: x8 bytes
-	;               mem  hi<-       ->lo
-	;1st Line Src:	mm0: d D c C b B a A
-	;2nd Line Src:	mm1: h H g G f F e E
-	;=> target:
-	;: H G F E D C B A
-	;: h g f e d c b a
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-	movq mm0, [esi]			; 1st pSrc line
-	movq mm1, [esi+ecx]		; 2nd pSrc line
+    ; 1st part horizonal loop: x8 bytes
+    ;               mem  hi<-       ->lo
+    ;1st Line Src:  mm0: d D c C b B a A
+    ;2nd Line Src:  mm1: h H g G f F e E
+    ;=> target:
+    ;: H G F E D C B A
+    ;: h g f e d c b a
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+    movq mm0, [esi]         ; 1st pSrc line
+    movq mm1, [esi+ecx]     ; 2nd pSrc line
 
-	; to handle mm0, mm1, mm2, mm3
-	pshufw mm2, mm0, 0d8h	; d D b B c C a A ; 11011000 B
-	pshufw mm3, mm2, 04eh	; c C a A d D b B ; 01001110 B
-	punpcklbw mm2, mm3		; d c D C b a B A
-	pshufw mm2, mm2, 0d8h  	; d c b a D C B A ; 11011000 B: mm4
+    ; to handle mm0, mm1, mm2, mm3
+    pshufw mm2, mm0, 0d8h   ; d D b B c C a A ; 11011000 B
+    pshufw mm3, mm2, 04eh   ; c C a A d D b B ; 01001110 B
+    punpcklbw mm2, mm3      ; d c D C b a B A
+    pshufw mm2, mm2, 0d8h   ; d c b a D C B A ; 11011000 B: mm4
 
-	pshufw mm4, mm1, 0d8h	; h H f F g G e E ; 11011000 B
-	pshufw mm5, mm4, 04eh	; g G e E h H f F ; 01001110 B
-	punpcklbw mm4, mm5		; h g H G f e F E
-	pshufw mm4, mm4, 0d8h  	; h g f e H G F E ; 11011000 B: mm5
+    pshufw mm4, mm1, 0d8h   ; h H f F g G e E ; 11011000 B
+    pshufw mm5, mm4, 04eh   ; g G e E h H f F ; 01001110 B
+    punpcklbw mm4, mm5      ; h g H G f e F E
+    pshufw mm4, mm4, 0d8h   ; h g f e H G F E ; 11011000 B: mm5
 
-	; to handle mm2, mm4
-	movq mm0, mm2		;
-	punpckldq mm0, mm4 	; H G F E D C B A
-	punpckhdq mm2, mm4 	; h g f e d c b a
+    ; to handle mm2, mm4
+    movq mm0, mm2       ;
+    punpckldq mm0, mm4  ; H G F E D C B A
+    punpckhdq mm2, mm4  ; h g f e d c b a
 
-	; avg within MB horizon width (16 x 2 lines)
-	pavgb mm0, mm2		; (H+h+1)>>1, .., (A+a+1)>>1, temp_row1, 2
-	pshufw mm1, mm0, 04eh	; 01001110 B
-	pavgb mm0, mm1		; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once
+    ; avg within MB horizon width (16 x 2 lines)
+    pavgb mm0, mm2      ; (H+h+1)>>1, .., (A+a+1)>>1, temp_row1, 2
+    pshufw mm1, mm0, 04eh   ; 01001110 B
+    pavgb mm0, mm1      ; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once
 
-	movd [edi],	mm0
+    movd [edi], mm0
 
-	; next unit
-	lea esi, [esi+8]
-	lea edi, [edi+4]
+    ; next unit
+    lea esi, [esi+8]
+    lea edi, [edi+4]
 
-	dec eax
-	jg near .xloops
+    dec eax
+    jg near .xloops
 
-	; next line
-	lea esi, [esi+2*ecx]	; next end of lines
-	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
-	lea edi, [edi+edx]
-	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
+    ; next line
+    lea esi, [esi+2*ecx]    ; next end of lines
+    lea esi, [esi+2*ebx]    ; reset to base 0 [- 2 * iDstWidth]
+    lea edi, [edi+edx]
+    lea edi, [edi+ebx]      ; reset to base 0 [- iDstWidth]
 
-	dec ebp
-	jg near .yloops
+    dec ebp
+    jg near .yloops
 
-	WELSEMMS
-	pop ebp
-	pop edi
-	pop esi
-	pop edx
-	pop ebx
-	ret
+    WELSEMMS
+    pop ebp
+    pop edi
+    pop esi
+    pop edx
+    pop ebx
+    ret
 
 
 
 ; got about 50% improvement over DyadicBilinearDownsamplerWidthx32_sse
 ;***********************************************************************
-;	void DyadicBilinearDownsamplerWidthx32_ssse3(	unsigned char* pDst, const int iDstStride,
-;					unsigned char* pSrc, const int iSrcStride,
-;					const int iSrcWidth, const int iSrcHeight );
+;   void DyadicBilinearDownsamplerWidthx32_ssse3(   unsigned char* pDst, const int iDstStride,
+;                   unsigned char* pSrc, const int iSrcStride,
+;                   const int iSrcWidth, const int iSrcHeight );
 ;***********************************************************************
 WELS_EXTERN DyadicBilinearDownsamplerWidthx32_ssse3
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
+    push ebx
+    push edx
+    push esi
+    push edi
+    push ebp
 
-	mov edi, [esp+24]	; pDst
-	mov edx, [esp+28]	; iDstStride
-	mov esi, [esp+32]	; pSrc
-	mov ecx, [esp+36]	; iSrcStride
-	mov ebp, [esp+44]	; iSrcHeight
+    mov edi, [esp+24]   ; pDst
+    mov edx, [esp+28]   ; iDstStride
+    mov esi, [esp+32]   ; pSrc
+    mov ecx, [esp+36]   ; iSrcStride
+    mov ebp, [esp+44]   ; iSrcHeight
 
-	sar ebp, $01			; iSrcHeight >> 1
+    sar ebp, $01            ; iSrcHeight >> 1
 
-	movdqa xmm7, [shufb_mask_low]	; mask low
-	movdqa xmm6, [shufb_mask_high]	; mask high
+    movdqa xmm7, [shufb_mask_low]   ; mask low
+    movdqa xmm6, [shufb_mask_high]  ; mask high
 
 .yloops:
-	mov eax, [esp+40]	; iSrcWidth
-	sar eax, $01			; iSrcWidth >> 1
-	mov ebx, eax		; iDstWidth restored at ebx
-	sar eax, $04			; (iSrcWidth >> 1) / 16		; loop count = num_of_mb
-	neg ebx				; - (iSrcWidth >> 1)
-	; each loop = source bandwidth: 32 bytes
+    mov eax, [esp+40]   ; iSrcWidth
+    sar eax, $01            ; iSrcWidth >> 1
+    mov ebx, eax        ; iDstWidth restored at ebx
+    sar eax, $04            ; (iSrcWidth >> 1) / 16     ; loop count = num_of_mb
+    neg ebx             ; - (iSrcWidth >> 1)
+    ; each loop = source bandwidth: 32 bytes
 .xloops:
-	; 1st part horizonal loop: x16 bytes
-	;               mem  hi<-       ->lo
-	;1st Line Src:	xmm0: h H g G f F e E d D c C b B a A
-	;				xmm1: p P o O n N m M l L k K j J i I
-	;2nd Line Src:	xmm2: h H g G f F e E d D c C b B a A
-	;				xmm3: p P o O n N m M l L k K j J i I
-	;=> target:
-	;: P O N M L K J I H G F E D C B A
-	;: p o n m l k j i h g f e d c b a
-	;: P ..                          A
-	;: p ..                          a
+    ; 1st part horizonal loop: x16 bytes
+    ;               mem  hi<-       ->lo
+    ;1st Line Src:  xmm0: h H g G f F e E d D c C b B a A
+    ;               xmm1: p P o O n N m M l L k K j J i I
+    ;2nd Line Src:  xmm2: h H g G f F e E d D c C b B a A
+    ;               xmm3: p P o O n N m M l L k K j J i I
+    ;=> target:
+    ;: P O N M L K J I H G F E D C B A
+    ;: p o n m l k j i h g f e d c b a
+    ;: P ..                          A
+    ;: p ..                          a
 
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-	movdqa xmm0, [esi]			; 1st_src_line
-	movdqa xmm1, [esi+16]		; 1st_src_line + 16
-	movdqa xmm2, [esi+ecx]		; 2nd_src_line
-	movdqa xmm3, [esi+ecx+16]	; 2nd_src_line + 16
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+    movdqa xmm0, [esi]          ; 1st_src_line
+    movdqa xmm1, [esi+16]       ; 1st_src_line + 16
+    movdqa xmm2, [esi+ecx]      ; 2nd_src_line
+    movdqa xmm3, [esi+ecx+16]   ; 2nd_src_line + 16
 
-	; packing & avg
-	movdqa xmm4, xmm0			; h H g G f F e E d D c C b B a A
-	pshufb xmm0, xmm7			; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
-	pshufb xmm4, xmm6			; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-	; another implementation for xmm4 high bits
-;	psubb xmm4, xmm0			; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-;	psrlw xmm4, 8				; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-	pavgb xmm0, xmm4
+    ; packing & avg
+    movdqa xmm4, xmm0           ; h H g G f F e E d D c C b B a A
+    pshufb xmm0, xmm7           ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
+    pshufb xmm4, xmm6           ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+    ; another implementation for xmm4 high bits
+;   psubb xmm4, xmm0            ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
+;   psrlw xmm4, 8               ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+    pavgb xmm0, xmm4
 
-	movdqa xmm5, xmm1
-	pshufb xmm1, xmm7
-	pshufb xmm5, xmm6
-;	psubb xmm5, xmm1
-;	psrlw xmm5, 8
-	pavgb xmm1, xmm5
+    movdqa xmm5, xmm1
+    pshufb xmm1, xmm7
+    pshufb xmm5, xmm6
+;   psubb xmm5, xmm1
+;   psrlw xmm5, 8
+    pavgb xmm1, xmm5
 
-	movdqa xmm4, xmm2
-	pshufb xmm2, xmm7
-	pshufb xmm4, xmm6
-;	psubb xmm4, xmm2
-;	psrlw xmm4, 8
-	pavgb xmm2, xmm4
+    movdqa xmm4, xmm2
+    pshufb xmm2, xmm7
+    pshufb xmm4, xmm6
+;   psubb xmm4, xmm2
+;   psrlw xmm4, 8
+    pavgb xmm2, xmm4
 
-	movdqa xmm5, xmm3
-	pshufb xmm3, xmm7
-	pshufb xmm5, xmm6
-;	psubb xmm5, xmm3
-;	psrlw xmm5, 8
-	pavgb xmm3, xmm5
+    movdqa xmm5, xmm3
+    pshufb xmm3, xmm7
+    pshufb xmm5, xmm6
+;   psubb xmm5, xmm3
+;   psrlw xmm5, 8
+    pavgb xmm3, xmm5
 
-	packuswb xmm0, xmm1
-	packuswb xmm2, xmm3
-	pavgb xmm0, xmm2
+    packuswb xmm0, xmm1
+    packuswb xmm2, xmm3
+    pavgb xmm0, xmm2
 
-	; write pDst
-	movdqa [edi], xmm0
+    ; write pDst
+    movdqa [edi], xmm0
 
-	; next SMB
-	lea esi, [esi+32]
-	lea edi, [edi+16]
+    ; next SMB
+    lea esi, [esi+32]
+    lea edi, [edi+16]
 
-	dec eax
-	jg near .xloops
+    dec eax
+    jg near .xloops
 
-	; next line
-	lea esi, [esi+2*ecx]	; next end of lines
-	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
-	lea edi, [edi+edx]
-	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
+    ; next line
+    lea esi, [esi+2*ecx]    ; next end of lines
+    lea esi, [esi+2*ebx]    ; reset to base 0 [- 2 * iDstWidth]
+    lea edi, [edi+edx]
+    lea edi, [edi+ebx]      ; reset to base 0 [- iDstWidth]
 
-	dec ebp
-	jg near .yloops
+    dec ebp
+    jg near .yloops
 
-	pop ebp
-	pop	edi
-	pop esi
-	pop edx
-	pop ebx
-	ret
+    pop ebp
+    pop edi
+    pop esi
+    pop edx
+    pop ebx
+    ret
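
Editor's note: the SSSE3 variants reach the same result as the pshufw/punpck sequences above, but split each source line into its even- and odd-indexed bytes with pshufb and the shufb_mask_low/high tables before averaging; the commented-out psubb/psrlw pair is an alternative way to isolate the odd bytes. A scalar view of that deinterleave-and-average step (assumed name):

    #include <stdint.h>
    static void HorizontalPairAvg_ref(const uint8_t* src, uint8_t* dst, int pairs) {
        for (int i = 0; i < pairs; i++)
            dst[i] = (uint8_t)((src[2 * i] + src[2 * i + 1] + 1) >> 1);  /* as pavgb */
    }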
 
 ;***********************************************************************
-;	void DyadicBilinearDownsamplerWidthx16_ssse3( unsigned char* pDst, const int iDstStride,
-;					  unsigned char* pSrc, const int iSrcStride,
-;					  const int iSrcWidth, const int iSrcHeight );
+;   void DyadicBilinearDownsamplerWidthx16_ssse3( unsigned char* pDst, const int iDstStride,
+;                     unsigned char* pSrc, const int iSrcStride,
+;                     const int iSrcWidth, const int iSrcHeight );
 ;***********************************************************************
 WELS_EXTERN DyadicBilinearDownsamplerWidthx16_ssse3
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
+    push ebx
+    push edx
+    push esi
+    push edi
+    push ebp
 
-	mov edi, [esp+24]	; pDst
-	mov edx, [esp+28]	; iDstStride
-	mov esi, [esp+32]	; pSrc
-	mov ecx, [esp+36]	; iSrcStride
-	mov ebp, [esp+44]	; iSrcHeight
+    mov edi, [esp+24]   ; pDst
+    mov edx, [esp+28]   ; iDstStride
+    mov esi, [esp+32]   ; pSrc
+    mov ecx, [esp+36]   ; iSrcStride
+    mov ebp, [esp+44]   ; iSrcHeight
 
-	sar ebp, $01		; iSrcHeight >> 1
-	movdqa xmm7, [shufb_mask_low]	; mask low
-	movdqa xmm6, [shufb_mask_high]	; mask high
+    sar ebp, $01        ; iSrcHeight >> 1
+    movdqa xmm7, [shufb_mask_low]   ; mask low
+    movdqa xmm6, [shufb_mask_high]  ; mask high
 
 .yloops:
-	mov eax, [esp+40]	; iSrcWidth
-	sar eax, $01		; iSrcWidth >> 1
-	mov ebx, eax		; iDstWidth restored at ebx
-	sar eax, $03		; (iSrcWidth >> 1) / 8		; loop count = num_of_mb
-	neg ebx			; - (iSrcWidth >> 1)
-	; each loop = source bandwidth: 16 bytes
+    mov eax, [esp+40]   ; iSrcWidth
+    sar eax, $01        ; iSrcWidth >> 1
+    mov ebx, eax        ; iDstWidth restored at ebx
+    sar eax, $03        ; (iSrcWidth >> 1) / 8      ; loop count = num_of_mb
+    neg ebx         ; - (iSrcWidth >> 1)
+    ; each loop = source bandwidth: 16 bytes
 .xloops:
-	; horizonal loop: x16 bytes by source
-	;               mem  hi<-       ->lo
-	;1st line pSrc:	xmm0: h H g G f F e E d D c C b B a A
-	;2nd line pSrc:  xmm1: p P o O n N m M l L k K j J i I
-	;=> target:
-	;: H G F E D C B A, P O N M L K J I
-	;: h g f e d c b a, p o n m l k j i
+    ; horizonal loop: x16 bytes by source
+    ;               mem  hi<-       ->lo
+    ;1st line pSrc: xmm0: h H g G f F e E d D c C b B a A
+    ;2nd line pSrc:  xmm1: p P o O n N m M l L k K j J i I
+    ;=> target:
+    ;: H G F E D C B A, P O N M L K J I
+    ;: h g f e d c b a, p o n m l k j i
 
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-	movdqa xmm0, [esi]			; 1st_src_line
-	movdqa xmm1, [esi+ecx]		; 2nd_src_line
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+    movdqa xmm0, [esi]          ; 1st_src_line
+    movdqa xmm1, [esi+ecx]      ; 2nd_src_line
 
-	; packing & avg
-	movdqa xmm2, xmm0			; h H g G f F e E d D c C b B a A
-	pshufb xmm0, xmm7			; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
-	pshufb xmm2, xmm6			; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-	; another implementation for xmm2 high bits
-;	psubb xmm2, xmm0			; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-;	psrlw xmm2, 8				; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-	pavgb xmm0, xmm2
+    ; packing & avg
+    movdqa xmm2, xmm0           ; h H g G f F e E d D c C b B a A
+    pshufb xmm0, xmm7           ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
+    pshufb xmm2, xmm6           ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+    ; another implementation for xmm2 high bits
+;   psubb xmm2, xmm0            ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
+;   psrlw xmm2, 8               ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+    pavgb xmm0, xmm2
 
-	movdqa xmm3, xmm1
-	pshufb xmm1, xmm7
-	pshufb xmm3, xmm6
-;	psubb xmm3, xmm1
-;	psrlw xmm3, 8
-	pavgb xmm1, xmm3
+    movdqa xmm3, xmm1
+    pshufb xmm1, xmm7
+    pshufb xmm3, xmm6
+;   psubb xmm3, xmm1
+;   psrlw xmm3, 8
+    pavgb xmm1, xmm3
 
-	pavgb xmm0, xmm1
-	packuswb xmm0, xmm1
+    pavgb xmm0, xmm1
+    packuswb xmm0, xmm1
 
-	; write pDst
-	movq [edi], xmm0
+    ; write pDst
+    movq [edi], xmm0
 
-	; next SMB
-	lea esi, [esi+16]
-	lea edi, [edi+8]
+    ; next SMB
+    lea esi, [esi+16]
+    lea edi, [edi+8]
 
-	dec eax
-	jg near .xloops
+    dec eax
+    jg near .xloops
 
-	; next line
-	lea esi, [esi+2*ecx]	; next end of lines
-	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
-	lea edi, [edi+edx]
-	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
+    ; next line
+    lea esi, [esi+2*ecx]    ; next end of lines
+    lea esi, [esi+2*ebx]    ; reset to base 0 [- 2 * iDstWidth]
+    lea edi, [edi+edx]
+    lea edi, [edi+ebx]      ; reset to base 0 [- iDstWidth]
 
-	dec ebp
-	jg near .yloops
+    dec ebp
+    jg near .yloops
 
-	pop ebp
-	pop edi
-	pop esi
-	pop edx
-	pop ebx
-	ret
+    pop ebp
+    pop edi
+    pop esi
+    pop edx
+    pop ebx
+    ret
 
 ; got about 65% improvement over DyadicBilinearDownsamplerWidthx32_sse
 ;***********************************************************************
-;	void DyadicBilinearDownsamplerWidthx32_sse4(	unsigned char* pDst, const int iDstStride,
-;					unsigned char* pSrc, const int iSrcStride,
-;					const int iSrcWidth, const int iSrcHeight );
+;   void DyadicBilinearDownsamplerWidthx32_sse4(    unsigned char* pDst, const int iDstStride,
+;                   unsigned char* pSrc, const int iSrcStride,
+;                   const int iSrcWidth, const int iSrcHeight );
 ;***********************************************************************
 WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse4
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
+    push ebx
+    push edx
+    push esi
+    push edi
+    push ebp
 
-	mov edi, [esp+24]	; pDst
-	mov edx, [esp+28]	; iDstStride
-	mov esi, [esp+32]	; pSrc
-	mov ecx, [esp+36]	; iSrcStride
-	mov ebp, [esp+44]	; iSrcHeight
+    mov edi, [esp+24]   ; pDst
+    mov edx, [esp+28]   ; iDstStride
+    mov esi, [esp+32]   ; pSrc
+    mov ecx, [esp+36]   ; iSrcStride
+    mov ebp, [esp+44]   ; iSrcHeight
 
-	sar ebp, $01			; iSrcHeight >> 1
+    sar ebp, $01            ; iSrcHeight >> 1
 
-	movdqa xmm7, [shufb_mask_low]	; mask low
-	movdqa xmm6, [shufb_mask_high]	; mask high
+    movdqa xmm7, [shufb_mask_low]   ; mask low
+    movdqa xmm6, [shufb_mask_high]  ; mask high
 
 .yloops:
-	mov eax, [esp+40]	; iSrcWidth
-	sar eax, $01			; iSrcWidth >> 1
-	mov ebx, eax		; iDstWidth restored at ebx
-	sar eax, $04			; (iSrcWidth >> 1) / 16		; loop count = num_of_mb
-	neg ebx				; - (iSrcWidth >> 1)
-	; each loop = source bandwidth: 32 bytes
+    mov eax, [esp+40]   ; iSrcWidth
+    sar eax, $01            ; iSrcWidth >> 1
+    mov ebx, eax        ; iDstWidth restored at ebx
+    sar eax, $04            ; (iSrcWidth >> 1) / 16     ; loop count = num_of_mb
+    neg ebx             ; - (iSrcWidth >> 1)
+    ; each loop = source bandwidth: 32 bytes
 .xloops:
-	; 1st part horizonal loop: x16 bytes
-	;               mem  hi<-       ->lo
-	;1st Line Src:	xmm0: h H g G f F e E d D c C b B a A
-	;				xmm1: p P o O n N m M l L k K j J i I
-	;2nd Line Src:	xmm2: h H g G f F e E d D c C b B a A
-	;				xmm3: p P o O n N m M l L k K j J i I
-	;=> target:
-	;: P O N M L K J I H G F E D C B A
-	;: p o n m l k j i h g f e d c b a
-	;: P ..                          A
-	;: p ..                          a
+    ; 1st part horizontal loop: x16 bytes
+    ;               mem  hi<-       ->lo
+    ;1st Line Src:  xmm0: h H g G f F e E d D c C b B a A
+    ;               xmm1: p P o O n N m M l L k K j J i I
+    ;2nd Line Src:  xmm2: h H g G f F e E d D c C b B a A
+    ;               xmm3: p P o O n N m M l L k K j J i I
+    ;=> target:
+    ;: P O N M L K J I H G F E D C B A
+    ;: p o n m l k j i h g f e d c b a
+    ;: P ..                          A
+    ;: p ..                          a
 
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-	movntdqa xmm0, [esi]			; 1st_src_line
-	movntdqa xmm1, [esi+16]		; 1st_src_line + 16
-	movntdqa xmm2, [esi+ecx]		; 2nd_src_line
-	movntdqa xmm3, [esi+ecx+16]	; 2nd_src_line + 16
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+    movntdqa xmm0, [esi]            ; 1st_src_line
+    movntdqa xmm1, [esi+16]     ; 1st_src_line + 16
+    movntdqa xmm2, [esi+ecx]        ; 2nd_src_line
+    movntdqa xmm3, [esi+ecx+16] ; 2nd_src_line + 16
 
-	; packing & avg
-	movdqa xmm4, xmm0			; h H g G f F e E d D c C b B a A
-	pshufb xmm0, xmm7			; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
-	pshufb xmm4, xmm6			; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-;	psubb xmm4, xmm0			; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-;	psrlw xmm4, 8				; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-	pavgb xmm0, xmm4
+    ; packing & avg
+    movdqa xmm4, xmm0           ; h H g G f F e E d D c C b B a A
+    pshufb xmm0, xmm7           ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
+    pshufb xmm4, xmm6           ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+;   psubb xmm4, xmm0            ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
+;   psrlw xmm4, 8               ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+    pavgb xmm0, xmm4
 
-	movdqa xmm5, xmm1
-	pshufb xmm1, xmm7
-	pshufb xmm5, xmm6
-;	psubb xmm5, xmm1
-;	psrlw xmm5, 8
-	pavgb xmm1, xmm5
+    movdqa xmm5, xmm1
+    pshufb xmm1, xmm7
+    pshufb xmm5, xmm6
+;   psubb xmm5, xmm1
+;   psrlw xmm5, 8
+    pavgb xmm1, xmm5
 
-	movdqa xmm4, xmm2
-	pshufb xmm2, xmm7
-	pshufb xmm4, xmm6
-;	psubb xmm4, xmm2
-;	psrlw xmm4, 8
-	pavgb xmm2, xmm4
+    movdqa xmm4, xmm2
+    pshufb xmm2, xmm7
+    pshufb xmm4, xmm6
+;   psubb xmm4, xmm2
+;   psrlw xmm4, 8
+    pavgb xmm2, xmm4
 
-	movdqa xmm5, xmm3
-	pshufb xmm3, xmm7
-	pshufb xmm5, xmm6
-;	psubb xmm5, xmm3
-;	psrlw xmm5, 8
-	pavgb xmm3, xmm5
+    movdqa xmm5, xmm3
+    pshufb xmm3, xmm7
+    pshufb xmm5, xmm6
+;   psubb xmm5, xmm3
+;   psrlw xmm5, 8
+    pavgb xmm3, xmm5
 
-	packuswb xmm0, xmm1
-	packuswb xmm2, xmm3
-	pavgb xmm0, xmm2
+    packuswb xmm0, xmm1
+    packuswb xmm2, xmm3
+    pavgb xmm0, xmm2
 
-	; write pDst
-	movdqa [edi], xmm0
+    ; write pDst
+    movdqa [edi], xmm0
 
-	; next SMB
-	lea esi, [esi+32]
-	lea edi, [edi+16]
+    ; next SMB
+    lea esi, [esi+32]
+    lea edi, [edi+16]
 
-	dec eax
-	jg near .xloops
+    dec eax
+    jg near .xloops
 
-	; next line
-	lea esi, [esi+2*ecx]	; next end of lines
-	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
-	lea edi, [edi+edx]
-	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
+    ; next line
+    lea esi, [esi+2*ecx]    ; next end of lines
+    lea esi, [esi+2*ebx]    ; reset to base 0 [- 2 * iDstWidth]
+    lea edi, [edi+edx]
+    lea edi, [edi+ebx]      ; reset to base 0 [- iDstWidth]
 
-	dec ebp
-	jg near .yloops
+    dec ebp
+    jg near .yloops
 
-	pop ebp
-	pop	edi
-	pop esi
-	pop edx
-	pop ebx
-	ret
+    pop ebp
+    pop edi
+    pop esi
+    pop edx
+    pop ebx
+    ret
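
The _sse4 variants differ from the SSSE3 path mainly in using movntdqa streaming loads, which require 16-byte-aligned sources and whose non-temporal hint mainly pays off when pSrc lives in write-combining memory. In intrinsics form the equivalent load would be (a sketch, assuming SSE4.1 and an aligned pointer; the helper name is hypothetical):

    #include <smmintrin.h>  /* SSE4.1 */
    #include <stdint.h>

    /* Intrinsic counterpart of the movntdqa loads used above.
     * pSrc must be 16-byte aligned. */
    static __m128i LoadRow16NT (uint8_t* pSrc) {
        return _mm_stream_load_si128 ((__m128i*) pSrc);
    }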
 
 ;***********************************************************************
-;	void DyadicBilinearDownsamplerWidthx16_sse4( unsigned char* pDst, const int iDstStride,
-;					  unsigned char* pSrc, const int iSrcStride,
-;					  const int iSrcWidth, const int iSrcHeight );
+;   void DyadicBilinearDownsamplerWidthx16_sse4( unsigned char* pDst, const int iDstStride,
+;                     unsigned char* pSrc, const int iSrcStride,
+;                     const int iSrcWidth, const int iSrcHeight );
 ;***********************************************************************
 WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse4
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
+    push ebx
+    push edx
+    push esi
+    push edi
+    push ebp
 
-	mov edi, [esp+24]	; pDst
-	mov edx, [esp+28]	; iDstStride
-	mov esi, [esp+32]	; pSrc
-	mov ecx, [esp+36]	; iSrcStride
-	mov ebp, [esp+44]	; iSrcHeight
+    mov edi, [esp+24]   ; pDst
+    mov edx, [esp+28]   ; iDstStride
+    mov esi, [esp+32]   ; pSrc
+    mov ecx, [esp+36]   ; iSrcStride
+    mov ebp, [esp+44]   ; iSrcHeight
 
-	sar ebp, $01		; iSrcHeight >> 1
-	movdqa xmm7, [shufb_mask_low]	; mask low
-	movdqa xmm6, [shufb_mask_high]	; mask high
+    sar ebp, $01        ; iSrcHeight >> 1
+    movdqa xmm7, [shufb_mask_low]   ; mask low
+    movdqa xmm6, [shufb_mask_high]  ; mask high
 
 .yloops:
-	mov eax, [esp+40]	; iSrcWidth
-	sar eax, $01		; iSrcWidth >> 1
-	mov ebx, eax		; iDstWidth restored at ebx
-	sar eax, $03		; (iSrcWidth >> 1) / 8		; loop count = num_of_mb
-	neg ebx			; - (iSrcWidth >> 1)
-	; each loop = source bandwidth: 16 bytes
+    mov eax, [esp+40]   ; iSrcWidth
+    sar eax, $01        ; iSrcWidth >> 1
+    mov ebx, eax        ; iDstWidth restored at ebx
+    sar eax, $03        ; (iSrcWidth >> 1) / 8      ; loop count = num_of_mb
+    neg ebx         ; - (iSrcWidth >> 1)
+    ; each loop = source bandwidth: 16 bytes
 .xloops:
-	; horizonal loop: x16 bytes by source
-	;               mem  hi<-       ->lo
-	;1st line pSrc:	xmm0: h H g G f F e E d D c C b B a A
-	;2nd line pSrc:  xmm1: p P o O n N m M l L k K j J i I
-	;=> target:
-	;: H G F E D C B A, P O N M L K J I
-	;: h g f e d c b a, p o n m l k j i
+    ; horizontal loop: x16 bytes by source
+    ;               mem  hi<-       ->lo
+    ;1st line pSrc: xmm0: h H g G f F e E d D c C b B a A
+    ;2nd line pSrc:  xmm1: p P o O n N m M l L k K j J i I
+    ;=> target:
+    ;: H G F E D C B A, P O N M L K J I
+    ;: h g f e d c b a, p o n m l k j i
 
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-	movntdqa xmm0, [esi]			; 1st_src_line
-	movntdqa xmm1, [esi+ecx]		; 2nd_src_line
+    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+    movntdqa xmm0, [esi]            ; 1st_src_line
+    movntdqa xmm1, [esi+ecx]        ; 2nd_src_line
 
-	; packing & avg
-	movdqa xmm2, xmm0			; h H g G f F e E d D c C b B a A
-	pshufb xmm0, xmm7			; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
-	pshufb xmm2, xmm6			; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-;	psubb xmm2, xmm0			; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-;	psrlw xmm2, 8				; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-	pavgb xmm0, xmm2
+    ; packing & avg
+    movdqa xmm2, xmm0           ; h H g G f F e E d D c C b B a A
+    pshufb xmm0, xmm7           ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
+    pshufb xmm2, xmm6           ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+;   psubb xmm2, xmm0            ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
+;   psrlw xmm2, 8               ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+    pavgb xmm0, xmm2
 
-	movdqa xmm3, xmm1
-	pshufb xmm1, xmm7
-	pshufb xmm3, xmm6
-;	psubb xmm3, xmm1
-;	psrlw xmm3, 8
-	pavgb xmm1, xmm3
+    movdqa xmm3, xmm1
+    pshufb xmm1, xmm7
+    pshufb xmm3, xmm6
+;   psubb xmm3, xmm1
+;   psrlw xmm3, 8
+    pavgb xmm1, xmm3
 
-	pavgb xmm0, xmm1
-	packuswb xmm0, xmm1
+    pavgb xmm0, xmm1
+    packuswb xmm0, xmm1
 
-	; write pDst
-	movq [edi], xmm0
+    ; write pDst
+    movq [edi], xmm0
 
-	; next SMB
-	lea esi, [esi+16]
-	lea edi, [edi+8]
+    ; next SMB
+    lea esi, [esi+16]
+    lea edi, [edi+8]
 
-	dec eax
-	jg near .xloops
+    dec eax
+    jg near .xloops
 
-	; next line
-	lea esi, [esi+2*ecx]	; next end of lines
-	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
-	lea edi, [edi+edx]
-	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
+    ; next line
+    lea esi, [esi+2*ecx]    ; next end of lines
+    lea esi, [esi+2*ebx]    ; reset to base 0 [- 2 * iDstWidth]
+    lea edi, [edi+edx]
+    lea edi, [edi+ebx]      ; reset to base 0 [- iDstWidth]
 
-	dec ebp
-	jg near .yloops
+    dec ebp
+    jg near .yloops
 
-	pop ebp
-	pop edi
-	pop esi
-	pop edx
-	pop ebx
-	ret
+    pop ebp
+    pop edi
+    pop esi
+    pop edx
+    pop ebx
+    ret
 
 
 
@@ -811,202 +811,202 @@
 
 ;**************************************************************************************************************
 ;int GeneralBilinearAccurateDownsampler_sse2(   unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
-;							unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,
+;                           unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,
 ;                           unsigned int uiScaleX, unsigned int uiScaleY );
 ;{
 ;**************************************************************************************************************
 
-WELS_EXTERN	GeneralBilinearAccurateDownsampler_sse2
-	push	ebp
-	push	esi
-	push	edi
-	push	ebx
-%define		pushsize	16
-%define		localsize	28
-%define		pDstData		esp + pushsize + localsize + 4
-%define		dwDstStride		esp + pushsize + localsize + 8
-%define		dwDstWidth		esp + pushsize + localsize + 12
-%define		dwDstHeight		esp + pushsize + localsize + 16
-%define		pSrcData		esp + pushsize + localsize + 20
-%define		dwSrcStride		esp + pushsize + localsize + 24
-%define		dwSrcWidth		esp + pushsize + localsize + 28
-%define		dwSrcHeight		esp + pushsize + localsize + 32
-%define		scale			esp + 0
-%define		uiScaleX			esp + pushsize + localsize + 36
-%define		uiScaleY			esp + pushsize + localsize + 40
-%define		tmpHeight		esp + 12
-%define		yInverse		esp + 16
-%define		xInverse		esp + 20
-%define		dstStep			esp + 24
-	sub		esp,			localsize
+WELS_EXTERN GeneralBilinearAccurateDownsampler_sse2
+    push    ebp
+    push    esi
+    push    edi
+    push    ebx
+%define     pushsize    16
+%define     localsize   28
+%define     pDstData        esp + pushsize + localsize + 4
+%define     dwDstStride     esp + pushsize + localsize + 8
+%define     dwDstWidth      esp + pushsize + localsize + 12
+%define     dwDstHeight     esp + pushsize + localsize + 16
+%define     pSrcData        esp + pushsize + localsize + 20
+%define     dwSrcStride     esp + pushsize + localsize + 24
+%define     dwSrcWidth      esp + pushsize + localsize + 28
+%define     dwSrcHeight     esp + pushsize + localsize + 32
+%define     scale           esp + 0
+%define     uiScaleX            esp + pushsize + localsize + 36
+%define     uiScaleY            esp + pushsize + localsize + 40
+%define     tmpHeight       esp + 12
+%define     yInverse        esp + 16
+%define     xInverse        esp + 20
+%define     dstStep         esp + 24
+    sub     esp,            localsize
 
-	pxor	xmm0,	xmm0
-	mov		edx,	32767
-	mov		eax,	[uiScaleX]
-	and		eax,	32767
-	mov		ebx,	eax
-	neg		ebx
-	and		ebx,	32767
-	movd	xmm1,		eax						; uinc(uiScaleX mod 32767)
-	movd	xmm2,		ebx						; -uinc
-	psllq	xmm1,		32
-	por		xmm1,		xmm2					; 0 0  uinc  -uinc   (dword)
-	pshufd	xmm7,		xmm1,	01000100b		; xmm7: uinc -uinc uinc -uinc
+    pxor    xmm0,   xmm0
+    mov     edx,    32767
+    mov     eax,    [uiScaleX]
+    and     eax,    32767
+    mov     ebx,    eax
+    neg     ebx
+    and     ebx,    32767
+    movd    xmm1,       eax                     ; uinc(uiScaleX mod 32767)
+    movd    xmm2,       ebx                     ; -uinc
+    psllq   xmm1,       32
+    por     xmm1,       xmm2                    ; 0 0  uinc  -uinc   (dword)
+    pshufd  xmm7,       xmm1,   01000100b       ; xmm7: uinc -uinc uinc -uinc
 
-	mov		eax,	[uiScaleY]
-	and		eax,	32767
-	mov		ebx,	eax
-	neg		ebx
-	and		ebx,	32767
-	movd	xmm6,		eax						; vinc(uiScaleY mod 32767)
-	movd	xmm2,		ebx						; -vinc
-	psllq	xmm6,		32
-	por		xmm6,		xmm2					; 0 0 vinc -vinc (dword)
-	pshufd	xmm6,		xmm6,	01010000b		; xmm6: vinc vinc -vinc -vinc
+    mov     eax,    [uiScaleY]
+    and     eax,    32767
+    mov     ebx,    eax
+    neg     ebx
+    and     ebx,    32767
+    movd    xmm6,       eax                     ; vinc(uiScaleY mod 32767)
+    movd    xmm2,       ebx                     ; -vinc
+    psllq   xmm6,       32
+    por     xmm6,       xmm2                    ; 0 0 vinc -vinc (dword)
+    pshufd  xmm6,       xmm6,   01010000b       ; xmm6: vinc vinc -vinc -vinc
 
-	mov		edx,		40003fffh
-	movd	xmm5,		edx
-	punpcklwd	xmm5,	xmm0					; 16384 16383
-	pshufd	xmm5,		xmm5,	01000100b		; xmm5: 16384 16383 16384 16383
+    mov     edx,        40003fffh
+    movd    xmm5,       edx
+    punpcklwd   xmm5,   xmm0                    ; 16384 16383
+    pshufd  xmm5,       xmm5,   01000100b       ; xmm5: 16384 16383 16384 16383
 
 
 DOWNSAMPLE:
 
-	mov		eax,			[dwDstHeight]
-	mov		edi,			[pDstData]
-	mov		edx,			[dwDstStride]
-	mov		ecx,			[dwDstWidth]
-	sub		edx,			ecx
-	mov		[dstStep],	edx				; stride - width
-	dec		eax
-	mov		[tmpHeight],	eax
-	mov		eax,			16384
-	mov		[yInverse],		eax
+    mov     eax,            [dwDstHeight]
+    mov     edi,            [pDstData]
+    mov     edx,            [dwDstStride]
+    mov     ecx,            [dwDstWidth]
+    sub     edx,            ecx
+    mov     [dstStep],  edx             ; stride - width
+    dec     eax
+    mov     [tmpHeight],    eax
+    mov     eax,            16384
+    mov     [yInverse],     eax
 
-	pshufd	xmm4,		xmm5,	01010000b	; initial v to 16384 16384 16383 16383
+    pshufd  xmm4,       xmm5,   01010000b   ; initial v to 16384 16384 16383 16383
 
 HEIGHT:
-	mov		eax,	[yInverse]
-	mov		esi,	[pSrcData]
-	shr		eax,	15
-	mul		dword [dwSrcStride]
-	add		esi,	eax					; get current row address
-	mov		ebp,	esi
-	add		ebp,	[dwSrcStride]
+    mov     eax,    [yInverse]
+    mov     esi,    [pSrcData]
+    shr     eax,    15
+    mul     dword [dwSrcStride]
+    add     esi,    eax                 ; get current row address
+    mov     ebp,    esi
+    add     ebp,    [dwSrcStride]
 
-	mov		eax,		16384
-	mov		[xInverse],		eax
-	mov		ecx,			[dwDstWidth]
-	dec		ecx
+    mov     eax,        16384
+    mov     [xInverse],     eax
+    mov     ecx,            [dwDstWidth]
+    dec     ecx
 
-	movdqa	xmm3,		xmm5			; initial u to 16384 16383 16384 16383
+    movdqa  xmm3,       xmm5            ; initial u to 16384 16383 16384 16383
 
 WIDTH:
-	mov		eax,		[xInverse]
-	shr		eax,		15
+    mov     eax,        [xInverse]
+    shr     eax,        15
 
-	movd	xmm1,		[esi+eax]		; xxxxxxba
-	movd	xmm2,		[ebp+eax]		; xxxxxxdc
-	pxor	xmm0,		xmm0
-	punpcklwd	xmm1,	xmm2			; xxxxdcba
-	punpcklbw	xmm1,	xmm0			; 0d0c0b0a
-	punpcklwd	xmm1,	xmm0			; 000d000c000b000a
+    movd    xmm1,       [esi+eax]       ; xxxxxxba
+    movd    xmm2,       [ebp+eax]       ; xxxxxxdc
+    pxor    xmm0,       xmm0
+    punpcklwd   xmm1,   xmm2            ; xxxxdcba
+    punpcklbw   xmm1,   xmm0            ; 0d0c0b0a
+    punpcklwd   xmm1,   xmm0            ; 000d000c000b000a
 
-	movdqa	xmm2,	xmm4	; xmm2:  vv(1-v)(1-v)  tmpv
-	pmaddwd	xmm2,	xmm3	; mul u(1-u)u(1-u) on xmm2
-	movdqa	xmm0,	xmm2
-	pmuludq	xmm2,	xmm1
-	psrlq	xmm0,	32
-	psrlq	xmm1,	32
-	pmuludq	xmm0,	xmm1
-	paddq	xmm2,	xmm0
-	pshufd	xmm1,	xmm2,	00001110b
-	paddq	xmm2,	xmm1
-	psrlq	xmm2,	29
+    movdqa  xmm2,   xmm4    ; xmm2:  vv(1-v)(1-v)  tmpv
+    pmaddwd xmm2,   xmm3    ; mul u(1-u)u(1-u) on xmm2
+    movdqa  xmm0,   xmm2
+    pmuludq xmm2,   xmm1
+    psrlq   xmm0,   32
+    psrlq   xmm1,   32
+    pmuludq xmm0,   xmm1
+    paddq   xmm2,   xmm0
+    pshufd  xmm1,   xmm2,   00001110b
+    paddq   xmm2,   xmm1
+    psrlq   xmm2,   29
 
-	movd	eax,	xmm2
-	inc		eax
-	shr		eax,	1
-	mov		[edi],	al
-	inc		edi
+    movd    eax,    xmm2
+    inc     eax
+    shr     eax,    1
+    mov     [edi],  al
+    inc     edi
 
-	mov		eax,		[uiScaleX]
-	add		[xInverse],	eax
+    mov     eax,        [uiScaleX]
+    add     [xInverse], eax
 
-	paddw	xmm3,		xmm7			; inc u
-	psllw	xmm3,		1
-	psrlw	xmm3,		1
+    paddw   xmm3,       xmm7            ; inc u
+    psllw   xmm3,       1
+    psrlw   xmm3,       1
 
-	loop	WIDTH
+    loop    WIDTH
 
 WIDTH_END:
-	mov		eax,		[xInverse]
-	shr		eax,		15
-	mov		cl,			[esi+eax]
-	mov		[edi],		cl
-	inc		edi
+    mov     eax,        [xInverse]
+    shr     eax,        15
+    mov     cl,         [esi+eax]
+    mov     [edi],      cl
+    inc     edi
 
-	mov		eax,		[uiScaleY]
-	add		[yInverse],	eax
-	add		edi,		[dstStep]
+    mov     eax,        [uiScaleY]
+    add     [yInverse], eax
+    add     edi,        [dstStep]
 
-	paddw	xmm4,	xmm6				; inc v
-	psllw	xmm4,	1
-	psrlw	xmm4,	1
+    paddw   xmm4,   xmm6                ; inc v
+    psllw   xmm4,   1
+    psrlw   xmm4,   1
 
-	dec		dword [tmpHeight]
-	jg		HEIGHT
+    dec     dword [tmpHeight]
+    jg      HEIGHT
 
 
 LAST_ROW:
-	mov		eax,	[yInverse]
-	mov		esi,	[pSrcData]
-	shr		eax,	15
-	mul		dword [dwSrcStride]
-	add		esi,	eax					; get current row address
+    mov     eax,    [yInverse]
+    mov     esi,    [pSrcData]
+    shr     eax,    15
+    mul     dword [dwSrcStride]
+    add     esi,    eax                 ; get current row address
 
-	mov		eax,		16384
-	mov		[xInverse],		eax
-	mov		ecx,			[dwDstWidth]
+    mov     eax,        16384
+    mov     [xInverse],     eax
+    mov     ecx,            [dwDstWidth]
 
 LAST_ROW_WIDTH:
-	mov		eax,		[xInverse]
-	shr		eax,		15
+    mov     eax,        [xInverse]
+    shr     eax,        15
 
-	mov		al,			[esi+eax]
-	mov		[edi],	al
-	inc		edi
+    mov     al,         [esi+eax]
+    mov     [edi],  al
+    inc     edi
 
-	mov		eax,		[uiScaleX]
-	add		[xInverse],	eax
+    mov     eax,        [uiScaleX]
+    add     [xInverse], eax
 
-	loop	LAST_ROW_WIDTH
+    loop    LAST_ROW_WIDTH
 
 LAST_ROW_END:
 
-	add		esp,			localsize
-	pop		ebx
-	pop		edi
-	pop		esi
-	pop		ebp
-%undef		pushsize
-%undef		localsize
-%undef		pSrcData
-%undef		dwSrcWidth
-%undef		dwSrcHeight
-%undef		dwSrcStride
-%undef		pDstData
-%undef		dwDstWidth
-%undef		dwDstHeight
-%undef		dwDstStride
-%undef		scale
-%undef		uiScaleX
-%undef		uiScaleY
-%undef		tmpHeight
-%undef		yInverse
-%undef		xInverse
-%undef		dstStep
-	ret
+    add     esp,            localsize
+    pop     ebx
+    pop     edi
+    pop     esi
+    pop     ebp
+%undef      pushsize
+%undef      localsize
+%undef      pSrcData
+%undef      dwSrcWidth
+%undef      dwSrcHeight
+%undef      dwSrcStride
+%undef      pDstData
+%undef      dwDstWidth
+%undef      dwDstHeight
+%undef      dwDstStride
+%undef      scale
+%undef      uiScaleX
+%undef      uiScaleY
+%undef      tmpHeight
+%undef      yInverse
+%undef      xInverse
+%undef      dstStep
+    ret
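
GeneralBilinearAccurateDownsampler_sse2 walks the destination grid with fixed-point coordinates carrying 15 fractional bits (xInverse/yInverse advance by uiScaleX/uiScaleY, and a shift by 15 yields the integer pixel index); the fractional parts weight a 2x2 source neighbourhood. Ignoring the exact SSE2 rounding sequence, a scalar sketch of the interpolation (hypothetical helper, not part of the source) is:

    #include <stdint.h>

    /* Scalar outline of the accurate bilinear sample: coordinates carry 15
     * fractional bits, and the fractions weight a 2x2 neighbourhood.  The
     * SSE2 code performs the same blend with its own rounding, so results
     * may differ slightly from this reference. */
    static uint8_t BilinearSampleQ15 (const uint8_t* pSrc, int iSrcStride,
                                      uint32_t xFix, uint32_t yFix) {
        uint32_t x = xFix >> 15, y = yFix >> 15;
        uint32_t u = xFix & 0x7fff, v = yFix & 0x7fff;   /* Q15 fractions */
        const uint8_t* p = pSrc + y * iSrcStride + x;
        uint64_t acc = (uint64_t) (32768 - u) * (32768 - v) * p[0]
                     + (uint64_t) u * (32768 - v) * p[1]
                     + (uint64_t) (32768 - u) * v * p[iSrcStride]
                     + (uint64_t) u * v * p[iSrcStride + 1];
        return (uint8_t) ((acc + (1u << 29)) >> 30);   /* weights sum to 2^30 */
    }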
 
 
 
@@ -1013,193 +1013,193 @@
 
 ;**************************************************************************************************************
 ;int GeneralBilinearFastDownsampler_sse2(   unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
-;				unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,
+;               unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,
 ;               unsigned int uiScaleX, unsigned int uiScaleY );
 ;{
 ;**************************************************************************************************************
 
-WELS_EXTERN	GeneralBilinearFastDownsampler_sse2
-	push	ebp
-	push	esi
-	push	edi
-	push	ebx
-%define		pushsize	16
-%define		localsize	28
-%define		pDstData		esp + pushsize + localsize + 4
-%define		dwDstStride		esp + pushsize + localsize + 8
-%define		dwDstWidth		esp + pushsize + localsize + 12
-%define		dwDstHeight		esp + pushsize + localsize + 16
-%define		pSrcData		esp + pushsize + localsize + 20
-%define		dwSrcStride		esp + pushsize + localsize + 24
-%define		dwSrcWidth		esp + pushsize + localsize + 28
-%define		dwSrcHeight		esp + pushsize + localsize + 32
-%define		scale			esp + 0
-%define		uiScaleX			esp + pushsize + localsize + 36
-%define		uiScaleY			esp + pushsize + localsize + 40
-%define		tmpHeight		esp + 12
-%define		yInverse		esp + 16
-%define		xInverse		esp + 20
-%define		dstStep			esp + 24
-	sub		esp,			localsize
+WELS_EXTERN GeneralBilinearFastDownsampler_sse2
+    push    ebp
+    push    esi
+    push    edi
+    push    ebx
+%define     pushsize    16
+%define     localsize   28
+%define     pDstData        esp + pushsize + localsize + 4
+%define     dwDstStride     esp + pushsize + localsize + 8
+%define     dwDstWidth      esp + pushsize + localsize + 12
+%define     dwDstHeight     esp + pushsize + localsize + 16
+%define     pSrcData        esp + pushsize + localsize + 20
+%define     dwSrcStride     esp + pushsize + localsize + 24
+%define     dwSrcWidth      esp + pushsize + localsize + 28
+%define     dwSrcHeight     esp + pushsize + localsize + 32
+%define     scale           esp + 0
+%define     uiScaleX            esp + pushsize + localsize + 36
+%define     uiScaleY            esp + pushsize + localsize + 40
+%define     tmpHeight       esp + 12
+%define     yInverse        esp + 16
+%define     xInverse        esp + 20
+%define     dstStep         esp + 24
+    sub     esp,            localsize
 
-	pxor	xmm0,	xmm0
-	mov		edx,	65535
-	mov		eax,	[uiScaleX]
-	and		eax,	edx
-	mov		ebx,	eax
-	neg		ebx
-	and		ebx,	65535
-	movd	xmm1,		eax						; uinc(uiScaleX mod 65536)
-	movd	xmm2,		ebx						; -uinc
-	psllq	xmm1,		32
-	por		xmm1,		xmm2					; 0 uinc 0 -uinc
-	pshuflw	xmm7,		xmm1,	10001000b		; xmm7: uinc -uinc uinc -uinc
+    pxor    xmm0,   xmm0
+    mov     edx,    65535
+    mov     eax,    [uiScaleX]
+    and     eax,    edx
+    mov     ebx,    eax
+    neg     ebx
+    and     ebx,    65535
+    movd    xmm1,       eax                     ; uinc(uiScaleX mod 65536)
+    movd    xmm2,       ebx                     ; -uinc
+    psllq   xmm1,       32
+    por     xmm1,       xmm2                    ; 0 uinc 0 -uinc
+    pshuflw xmm7,       xmm1,   10001000b       ; xmm7: uinc -uinc uinc -uinc
 
-	mov		eax,	[uiScaleY]
-	and		eax,	32767
-	mov		ebx,	eax
-	neg		ebx
-	and		ebx,	32767
-	movd	xmm6,		eax						; vinc(uiScaleY mod 32767)
-	movd	xmm2,		ebx						; -vinc
-	psllq	xmm6,		32
-	por		xmm6,		xmm2					; 0 vinc 0 -vinc
-	pshuflw	xmm6,		xmm6,	10100000b		; xmm6: vinc vinc -vinc -vinc
+    mov     eax,    [uiScaleY]
+    and     eax,    32767
+    mov     ebx,    eax
+    neg     ebx
+    and     ebx,    32767
+    movd    xmm6,       eax                     ; vinc(uiScaleY mod 32767)
+    movd    xmm2,       ebx                     ; -vinc
+    psllq   xmm6,       32
+    por     xmm6,       xmm2                    ; 0 vinc 0 -vinc
+    pshuflw xmm6,       xmm6,   10100000b       ; xmm6: vinc vinc -vinc -vinc
 
-	mov		edx,		80007fffh				; 32768 32767
-	movd	xmm5,		edx
-	pshuflw	xmm5,		xmm5,		01000100b	; 32768 32767 32768 32767
-	mov		ebx,		16384
+    mov     edx,        80007fffh               ; 32768 32767
+    movd    xmm5,       edx
+    pshuflw xmm5,       xmm5,       01000100b   ; 32768 32767 32768 32767
+    mov     ebx,        16384
 
 
 FAST_DOWNSAMPLE:
 
-	mov		eax,			[dwDstHeight]
-	mov		edi,			[pDstData]
-	mov		edx,			[dwDstStride]
-	mov		ecx,			[dwDstWidth]
-	sub		edx,			ecx
-	mov		[dstStep],	edx				; stride - width
-	dec		eax
-	mov		[tmpHeight],	eax
-	mov		eax,		16384
-	mov		[yInverse],		eax
+    mov     eax,            [dwDstHeight]
+    mov     edi,            [pDstData]
+    mov     edx,            [dwDstStride]
+    mov     ecx,            [dwDstWidth]
+    sub     edx,            ecx
+    mov     [dstStep],  edx             ; stride - width
+    dec     eax
+    mov     [tmpHeight],    eax
+    mov     eax,        16384
+    mov     [yInverse],     eax
 
-	pshuflw	xmm4,		xmm5,	01010000b
-	psrlw	xmm4,		1				; initial v to 16384 16384 16383 16383
+    pshuflw xmm4,       xmm5,   01010000b
+    psrlw   xmm4,       1               ; initial v to 16384 16384 16383 16383
 
 FAST_HEIGHT:
-	mov		eax,	[yInverse]
-	mov		esi,	[pSrcData]
-	shr		eax,	15
-	mul		dword [dwSrcStride]
-	add		esi,	eax					; get current row address
-	mov		ebp,	esi
-	add		ebp,	[dwSrcStride]
+    mov     eax,    [yInverse]
+    mov     esi,    [pSrcData]
+    shr     eax,    15
+    mul     dword [dwSrcStride]
+    add     esi,    eax                 ; get current row address
+    mov     ebp,    esi
+    add     ebp,    [dwSrcStride]
 
-	mov		eax,		32768
-	mov		[xInverse],		eax
-	mov		ecx,			[dwDstWidth]
-	dec		ecx
+    mov     eax,        32768
+    mov     [xInverse],     eax
+    mov     ecx,            [dwDstWidth]
+    dec     ecx
 
-	movdqa	xmm3,		xmm5			; initial u to 32768 32767 32768 32767
+    movdqa  xmm3,       xmm5            ; initial u to 32768 32767 32768 32767
 
 FAST_WIDTH:
-	mov		eax,		[xInverse]
-	shr		eax,		16
+    mov     eax,        [xInverse]
+    shr     eax,        16
 
-	movd	xmm1,		[esi+eax]		; xxxxxxba
-	movd	xmm2,		[ebp+eax]		; xxxxxxdc
-	punpcklwd	xmm1,	xmm2			; xxxxdcba
-	punpcklbw	xmm1,	xmm0			; 0d0c0b0a
+    movd    xmm1,       [esi+eax]       ; xxxxxxba
+    movd    xmm2,       [ebp+eax]       ; xxxxxxdc
+    punpcklwd   xmm1,   xmm2            ; xxxxdcba
+    punpcklbw   xmm1,   xmm0            ; 0d0c0b0a
 
-	movdqa	xmm2,	xmm4	; xmm2:  vv(1-v)(1-v)  tmpv
-	pmulhuw	xmm2,	xmm3	; mul u(1-u)u(1-u) on xmm2
-	pmaddwd		xmm2,	xmm1
-	pshufd	xmm1,	xmm2,	00000001b
-	paddd	xmm2,	xmm1
-	movd	xmm1,	ebx
-	paddd	xmm2,	xmm1
-	psrld	xmm2,	15
+    movdqa  xmm2,   xmm4    ; xmm2:  vv(1-v)(1-v)  tmpv
+    pmulhuw xmm2,   xmm3    ; mul u(1-u)u(1-u) on xmm2
+    pmaddwd     xmm2,   xmm1
+    pshufd  xmm1,   xmm2,   00000001b
+    paddd   xmm2,   xmm1
+    movd    xmm1,   ebx
+    paddd   xmm2,   xmm1
+    psrld   xmm2,   15
 
-	packuswb	xmm2,	xmm0
-	movd	eax,	xmm2
-	mov		[edi],	al
-	inc		edi
+    packuswb    xmm2,   xmm0
+    movd    eax,    xmm2
+    mov     [edi],  al
+    inc     edi
 
-	mov		eax,		[uiScaleX]
-	add		[xInverse],	eax
+    mov     eax,        [uiScaleX]
+    add     [xInverse], eax
 
-	paddw	xmm3,		xmm7			; inc u
+    paddw   xmm3,       xmm7            ; inc u
 
-	loop	FAST_WIDTH
+    loop    FAST_WIDTH
 
 FAST_WIDTH_END:
-	mov		eax,		[xInverse]
-	shr		eax,		16
-	mov		cl,			[esi+eax]
-	mov		[edi],		cl
-	inc		edi
+    mov     eax,        [xInverse]
+    shr     eax,        16
+    mov     cl,         [esi+eax]
+    mov     [edi],      cl
+    inc     edi
 
-	mov		eax,		[uiScaleY]
-	add		[yInverse],	eax
-	add		edi,		[dstStep]
+    mov     eax,        [uiScaleY]
+    add     [yInverse], eax
+    add     edi,        [dstStep]
 
-	paddw	xmm4,	xmm6				; inc v
-	psllw	xmm4,	1
-	psrlw	xmm4,	1
+    paddw   xmm4,   xmm6                ; inc v
+    psllw   xmm4,   1
+    psrlw   xmm4,   1
 
-	dec		dword [tmpHeight]
-	jg		FAST_HEIGHT
+    dec     dword [tmpHeight]
+    jg      FAST_HEIGHT
 
 
 FAST_LAST_ROW:
-	mov		eax,	[yInverse]
-	mov		esi,	[pSrcData]
-	shr		eax,	15
-	mul		dword [dwSrcStride]
-	add		esi,	eax					; get current row address
+    mov     eax,    [yInverse]
+    mov     esi,    [pSrcData]
+    shr     eax,    15
+    mul     dword [dwSrcStride]
+    add     esi,    eax                 ; get current row address
 
-	mov		eax,		32768
-	mov		[xInverse],		eax
-	mov		ecx,			[dwDstWidth]
+    mov     eax,        32768
+    mov     [xInverse],     eax
+    mov     ecx,            [dwDstWidth]
 
 FAST_LAST_ROW_WIDTH:
-	mov		eax,		[xInverse]
-	shr		eax,		16
+    mov     eax,        [xInverse]
+    shr     eax,        16
 
-	mov		al,			[esi+eax]
-	mov		[edi],	al
-	inc		edi
+    mov     al,         [esi+eax]
+    mov     [edi],  al
+    inc     edi
 
-	mov		eax,		[uiScaleX]
-	add		[xInverse],	eax
+    mov     eax,        [uiScaleX]
+    add     [xInverse], eax
 
-	loop	FAST_LAST_ROW_WIDTH
+    loop    FAST_LAST_ROW_WIDTH
 
 FAST_LAST_ROW_END:
 
-	add		esp,			localsize
-	pop		ebx
-	pop		edi
-	pop		esi
-	pop		ebp
-%undef		pushsize
-%undef		localsize
-%undef		pSrcData
-%undef		dwSrcWidth
-%undef		dwSrcHeight
-%undef		dwSrcStride
-%undef		pDstData
-%undef		dwDstWidth
-%undef		dwDstHeight
-%undef		dwDstStride
-%undef		scale
-%undef		uiScaleX
-%undef		uiScaleY
-%undef		tmpHeight
-%undef		yInverse
-%undef		xInverse
-%undef		dstStep
-	ret
+    add     esp,            localsize
+    pop     ebx
+    pop     edi
+    pop     esi
+    pop     ebp
+%undef      pushsize
+%undef      localsize
+%undef      pSrcData
+%undef      dwSrcWidth
+%undef      dwSrcHeight
+%undef      dwSrcStride
+%undef      pDstData
+%undef      dwDstWidth
+%undef      dwDstHeight
+%undef      dwDstStride
+%undef      scale
+%undef      uiScaleX
+%undef      uiScaleY
+%undef      tmpHeight
+%undef      yInverse
+%undef      xInverse
+%undef      dstStep
+    ret
 %endif
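
The fast variant below the accurate one trades precision for speed: its horizontal weights are kept in 16 bits and combined with pmulhuw, which keeps only the high half of each product, before the pmaddwd blend, whereas the accurate variant uses full 32-bit pmuludq products and rounds before the final shift. A rough scalar picture of the pmulhuw-style weight combination (an assumption-level sketch, not the exact pipeline):

    #include <stdint.h>

    /* pmulhuw-style combination: each u*v product keeps only its high
     * 16 bits, which is where the fast path loses a little precision. */
    static uint16_t CombineWeightsFast (uint16_t u, uint16_t v) {
        return (uint16_t) (((uint32_t) u * v) >> 16);
    }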
--- a/codec/processing/src/x86/vaa.asm
+++ b/codec/processing/src/x86/vaa.asm
@@ -48,192 +48,192 @@
 ; Macros and other preprocessor constants
 ;***********************************************************************
 %macro SUM_SQR_SSE2     3       ; dst, pSrc, zero
-  movdqa %1, %2
-  punpcklbw %1, %3
-  punpckhbw %2, %3
-  pmaddwd %1, %1
-  pmaddwd %2, %2
-  paddd %1, %2
-  pshufd %2, %1, 04Eh   ; 01001110 B
-  paddd %1, %2
-  pshufd %2, %1, 0B1h   ; 10110001 B
-  paddd %1, %2
+    movdqa %1, %2
+    punpcklbw %1, %3
+    punpckhbw %2, %3
+    pmaddwd %1, %1
+    pmaddwd %2, %2
+    paddd %1, %2
+    pshufd %2, %1, 04Eh   ; 01001110 B
+    paddd %1, %2
+    pshufd %2, %1, 0B1h   ; 10110001 B
+    paddd %1, %2
 %endmacro       ; END OF SUM_SQR_SSE2
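
SUM_SQR_SSE2 reduces 16 unsigned bytes to the sum of their squares: the bytes are widened to 16-bit lanes, pmaddwd squares and pair-sums them, and the pshufd/paddd steps fold the four dword partials so every lane ends up holding the total. The scalar equivalent (hypothetical helper name) is simply:

    #include <stdint.h>

    /* Scalar equivalent of SUM_SQR_SSE2: sum of squares of 16 bytes.
     * The maximum, 16 * 255 * 255, fits comfortably in 32 bits. */
    static uint32_t SumSqr16 (const uint8_t* p) {
        uint32_t s = 0;
        for (int i = 0; i < 16; ++i)
            s += (uint32_t) p[i] * p[i];
        return s;
    }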
 
 %macro WELS_SAD_16x2_SSE2  3 ;esi :%1 edi:%2 ebx:%3
-  movdqa        xmm1,   [%1]
-  movdqa        xmm2,   [%2]
-  movdqa        xmm3,   [%1+%3]
-  movdqa        xmm4,   [%2+%3]
-  psadbw        xmm1,   xmm2
-  psadbw        xmm3,   xmm4
-  paddd xmm6,   xmm1
-  paddd xmm6,   xmm3
-  lea           %1,     [%1+%3*2]
-  lea           %2,     [%2+%3*2]
+    movdqa        xmm1,   [%1]
+    movdqa        xmm2,   [%2]
+    movdqa        xmm3,   [%1+%3]
+    movdqa        xmm4,   [%2+%3]
+    psadbw        xmm1,   xmm2
+    psadbw        xmm3,   xmm4
+    paddd xmm6,   xmm1
+    paddd xmm6,   xmm3
+    lea           %1,     [%1+%3*2]
+    lea           %2,     [%2+%3*2]
 %endmacro
 
 ; by comparison, this outperforms the phaddw (SSSE3) approach
 %macro SUM_WORD_8x2_SSE2        2       ; dst(pSrc), tmp
-  ; @sum_8x2 begin
-  pshufd %2, %1, 04Eh   ; 01001110 B
-  paddw %1, %2
-  pshuflw %2, %1, 04Eh  ; 01001110 B
-  paddw %1, %2
-  pshuflw %2, %1, 0B1h  ; 10110001 B
-  paddw %1, %2
-  ; end of @sum_8x2
+    ; @sum_8x2 begin
+    pshufd %2, %1, 04Eh   ; 01001110 B
+    paddw %1, %2
+    pshuflw %2, %1, 04Eh  ; 01001110 B
+    paddw %1, %2
+    pshuflw %2, %1, 0B1h  ; 10110001 B
+    paddw %1, %2
+    ; end of @sum_8x2
 %endmacro       ; END of SUM_WORD_8x2_SSE2
 
-%macro  WELS_SAD_SUM_SQSUM_16x1_SSE2 3 ;esi:%1,edi:%2,ebx:%3
-  movdqa        xmm1,   [%1]
-  movdqa        xmm2,   [%2]
-  movdqa        xmm3,   xmm1
-  psadbw        xmm3,   xmm2
-  paddd         xmm6,   xmm3
+%macro WELS_SAD_SUM_SQSUM_16x1_SSE2 3 ;esi:%1,edi:%2,ebx:%3
+    movdqa        xmm1,   [%1]
+    movdqa        xmm2,   [%2]
+    movdqa        xmm3,   xmm1
+    psadbw        xmm3,   xmm2
+    paddd         xmm6,   xmm3
 
-  movdqa        xmm3,   xmm1
-  psadbw        xmm3,   xmm0
-  paddd         xmm5,   xmm3
+    movdqa        xmm3,   xmm1
+    psadbw        xmm3,   xmm0
+    paddd         xmm5,   xmm3
 
-  movdqa        xmm2,   xmm1
-  punpcklbw     xmm1,   xmm0
-  punpckhbw     xmm2,   xmm0
-  pmaddwd               xmm1,   xmm1
-  pmaddwd               xmm2,   xmm2
-  paddd         xmm4,   xmm1
-  paddd         xmm4,   xmm2
+    movdqa        xmm2,   xmm1
+    punpcklbw     xmm1,   xmm0
+    punpckhbw     xmm2,   xmm0
+    pmaddwd               xmm1,   xmm1
+    pmaddwd               xmm2,   xmm2
+    paddd         xmm4,   xmm1
+    paddd         xmm4,   xmm2
 
-  add           %1,     %3
-  add           %2,     %3
+    add           %1,     %3
+    add           %2,     %3
 %endmacro
 
-%macro  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 3 ;esi:%1 edi:%2 ebx:%3
-  movdqa        xmm1,   [%1]
-  movdqa        xmm2,   [%2]
-  movdqa        xmm3,   xmm1
-  psadbw        xmm3,   xmm2
-  paddd         xmm7,   xmm3    ; sad
+%macro WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 3 ;esi:%1 edi:%2 ebx:%3
+    movdqa        xmm1,   [%1]
+    movdqa        xmm2,   [%2]
+    movdqa        xmm3,   xmm1
+    psadbw        xmm3,   xmm2
+    paddd         xmm7,   xmm3    ; sad
 
-  movdqa        xmm3,   xmm1
-  pmaxub        xmm3,   xmm2
-  pminub        xmm2,   xmm1
-  psubb xmm3,   xmm2    ; diff
+    movdqa        xmm3,   xmm1
+    pmaxub        xmm3,   xmm2
+    pminub        xmm2,   xmm1
+    psubb xmm3,   xmm2    ; diff
 
-  movdqa        xmm2,   xmm1
-  psadbw        xmm2,   xmm0
-  paddd xmm6,   xmm2    ; sum
+    movdqa        xmm2,   xmm1
+    psadbw        xmm2,   xmm0
+    paddd xmm6,   xmm2    ; sum
 
-  movdqa                xmm2,   xmm1
-  punpcklbw     xmm1,   xmm0
-  punpckhbw     xmm2,   xmm0
-  pmaddwd               xmm1,   xmm1
-  pmaddwd               xmm2,   xmm2
-  paddd         xmm5,   xmm1
-  paddd         xmm5,   xmm2    ; sqsum
+    movdqa                xmm2,   xmm1
+    punpcklbw     xmm1,   xmm0
+    punpckhbw     xmm2,   xmm0
+    pmaddwd               xmm1,   xmm1
+    pmaddwd               xmm2,   xmm2
+    paddd         xmm5,   xmm1
+    paddd         xmm5,   xmm2    ; sqsum
 
-  movdqa                xmm1,   xmm3
-  punpcklbw     xmm1,   xmm0
-  punpckhbw     xmm3,   xmm0
-  pmaddwd               xmm1,   xmm1
-  pmaddwd               xmm3,   xmm3
-  paddd         xmm4,   xmm1
-  paddd         xmm4,   xmm3    ; sqdiff
+    movdqa                xmm1,   xmm3
+    punpcklbw     xmm1,   xmm0
+    punpckhbw     xmm3,   xmm0
+    pmaddwd               xmm1,   xmm1
+    pmaddwd               xmm3,   xmm3
+    paddd         xmm4,   xmm1
+    paddd         xmm4,   xmm3    ; sqdiff
 
-  add           %1,     %3
-  add           %2,     %3
+    add           %1,     %3
+    add           %2,     %3
 %endmacro
 
-%macro  WELS_SAD_SD_MAD_16x1_SSE2       7 ;esi:%5 edi:%6 ebx:%7
+%macro WELS_SAD_SD_MAD_16x1_SSE2       7 ;esi:%5 edi:%6 ebx:%7
 %define sad_reg                 %1
 %define sum_cur_reg             %2
 %define sum_ref_reg             %3
 %define mad_reg                 %4
-  movdqa        xmm1,           [%5]
-  movdqa        xmm2,           [%6]
-  movdqa        xmm3,           xmm1
-  psadbw        xmm3,           xmm0
-  paddd         sum_cur_reg,    xmm3    ; sum_cur
-  movdqa        xmm3,           xmm2
-  psadbw        xmm3,           xmm0
-  paddd sum_ref_reg,                    xmm3    ; sum_ref
+    movdqa        xmm1,           [%5]
+    movdqa        xmm2,           [%6]
+    movdqa        xmm3,           xmm1
+    psadbw        xmm3,           xmm0
+    paddd         sum_cur_reg,    xmm3    ; sum_cur
+    movdqa        xmm3,           xmm2
+    psadbw        xmm3,           xmm0
+    paddd sum_ref_reg,                    xmm3    ; sum_ref
 
-  movdqa        xmm3,           xmm1
-  pmaxub        xmm3,           xmm2
-  pminub        xmm2,           xmm1
-  psubb xmm3,           xmm2    ; abs diff
-  pmaxub        mad_reg,        xmm3    ; max abs diff
+    movdqa        xmm3,           xmm1
+    pmaxub        xmm3,           xmm2
+    pminub        xmm2,           xmm1
+    psubb xmm3,           xmm2    ; abs diff
+    pmaxub        mad_reg,        xmm3    ; max abs diff
 
-  psadbw        xmm3,           xmm0
-  paddd sad_reg,        xmm3    ; sad
+    psadbw        xmm3,           xmm0
+    paddd sad_reg,        xmm3    ; sad
 
-  add                   %5,             %7
-  add                   %6,             %7
+    add                   %5,             %7
+    add                   %6,             %7
 %endmacro
 
 
-%macro  WELS_MAX_REG_SSE2       1       ; xmm1, xmm2, xmm3 can be used
+%macro WELS_MAX_REG_SSE2       1       ; xmm1, xmm2, xmm3 can be used
 %define max_reg  %1
-  movdqa        xmm1,           max_reg
-  psrldq        xmm1,           4
-  pmaxub        max_reg,        xmm1
-  movdqa        xmm1,           max_reg
-  psrldq        xmm1,           2
-  pmaxub        max_reg,        xmm1
-  movdqa        xmm1,           max_reg
-  psrldq        xmm1,           1
-  pmaxub        max_reg,        xmm1
+    movdqa        xmm1,           max_reg
+    psrldq        xmm1,           4
+    pmaxub        max_reg,        xmm1
+    movdqa        xmm1,           max_reg
+    psrldq        xmm1,           2
+    pmaxub        max_reg,        xmm1
+    movdqa        xmm1,           max_reg
+    psrldq        xmm1,           1
+    pmaxub        max_reg,        xmm1
 %endmacro
 
-%macro  WELS_SAD_BGD_SQDIFF_16x1_SSE2   7 ;esi:%5 edi:%6 ebx:%7
+%macro WELS_SAD_BGD_SQDIFF_16x1_SSE2   7 ;esi:%5 edi:%6 ebx:%7
 %define sad_reg         %1
 %define sum_reg         %2
 %define mad_reg         %3
 %define sqdiff_reg      %4
-  movdqa                xmm1,           [%5]
-  movdqa                xmm2,           xmm1
-  movdqa                xmm3,           xmm1
-  punpcklbw     xmm2,           xmm0
-  punpckhbw     xmm3,           xmm0
-  pmaddwd               xmm2,           xmm2
-  pmaddwd               xmm3,           xmm3
-  paddd         xmm2,           xmm3
-  movdqa                xmm3,           xmm2
-  psllq         xmm2,           32
-  psrlq         xmm3,           32
-  psllq         xmm3,           32
-  paddd         xmm2,           xmm3
-  paddd         sad_reg,        xmm2            ; sqsum
+    movdqa                xmm1,           [%5]
+    movdqa                xmm2,           xmm1
+    movdqa                xmm3,           xmm1
+    punpcklbw     xmm2,           xmm0
+    punpckhbw     xmm3,           xmm0
+    pmaddwd               xmm2,           xmm2
+    pmaddwd               xmm3,           xmm3
+    paddd         xmm2,           xmm3
+    movdqa                xmm3,           xmm2
+    psllq         xmm2,           32
+    psrlq         xmm3,           32
+    psllq         xmm3,           32
+    paddd         xmm2,           xmm3
+    paddd         sad_reg,        xmm2            ; sqsum
 
-  movdqa        xmm2,           [%6]
-  movdqa        xmm3,           xmm1
-  psadbw        xmm3,           xmm0
-  paddd sum_reg,                        xmm3    ; sum_cur
-  movdqa        xmm3,           xmm2
-  psadbw        xmm3,           xmm0
-  pslldq        xmm3,           4
-  paddd sum_reg,                        xmm3    ; sum_ref
+    movdqa        xmm2,           [%6]
+    movdqa        xmm3,           xmm1
+    psadbw        xmm3,           xmm0
+    paddd sum_reg,                        xmm3    ; sum_cur
+    movdqa        xmm3,           xmm2
+    psadbw        xmm3,           xmm0
+    pslldq        xmm3,           4
+    paddd sum_reg,                        xmm3    ; sum_ref
 
-  movdqa        xmm3,           xmm1
-  pmaxub        xmm3,           xmm2
-  pminub        xmm2,           xmm1
-  psubb xmm3,           xmm2    ; abs diff
-  pmaxub        mad_reg,        xmm3    ; max abs diff
+    movdqa        xmm3,           xmm1
+    pmaxub        xmm3,           xmm2
+    pminub        xmm2,           xmm1
+    psubb xmm3,           xmm2    ; abs diff
+    pmaxub        mad_reg,        xmm3    ; max abs diff
 
-  movdqa        xmm1,           xmm3
-  psadbw        xmm3,           xmm0
-  paddd sad_reg,        xmm3    ; sad
+    movdqa        xmm1,           xmm3
+    psadbw        xmm3,           xmm0
+    paddd sad_reg,        xmm3    ; sad
 
-  movdqa                xmm3,   xmm1
-  punpcklbw     xmm1,   xmm0
-  punpckhbw     xmm3,   xmm0
-  pmaddwd               xmm1,   xmm1
-  pmaddwd               xmm3,   xmm3
-  paddd         sqdiff_reg,     xmm1
-  paddd         sqdiff_reg,     xmm3    ; sqdiff
+    movdqa                xmm3,   xmm1
+    punpcklbw     xmm1,   xmm0
+    punpckhbw     xmm3,   xmm0
+    pmaddwd               xmm1,   xmm1
+    pmaddwd               xmm3,   xmm3
+    paddd         sqdiff_reg,     xmm1
+    paddd         sqdiff_reg,     xmm3    ; sqdiff
 
-  add           %5,     %7
-  add           %6,     %7
+    add           %5,     %7
+    add           %6,     %7
 %endmacro
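
The WELS_SAD_* macros above gather, one 16-pixel row at a time, the block statistics the VAA (video analysis) pass consumes: SAD against the reference, current/reference sums, sums of squares, squared differences, and the maximum absolute difference (MAD). Per pixel the accumulation amounts to the following scalar sketch (struct and helper names are hypothetical):

    #include <stdint.h>
    #include <stdlib.h>

    /* Per-pixel statistics accumulated by the WELS_SAD_* macro family. */
    typedef struct {
        uint32_t sad;      /* sum of |cur - ref|   */
        uint32_t sum_cur;  /* sum of cur           */
        uint32_t sum_ref;  /* sum of ref           */
        uint32_t sqsum;    /* sum of cur * cur     */
        uint32_t sqdiff;   /* sum of (cur - ref)^2 */
        uint8_t  mad;      /* max |cur - ref|      */
    } BlockStats;

    static void AccumulatePixel (BlockStats* s, uint8_t cur, uint8_t ref) {
        uint32_t d = (uint32_t) abs ((int) cur - (int) ref);
        s->sad     += d;
        s->sum_cur += cur;
        s->sum_ref += ref;
        s->sqsum   += (uint32_t) cur * cur;
        s->sqdiff  += d * d;
        if (d > s->mad) s->mad = (uint8_t) d;
    }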
 
 
@@ -249,99 +249,99 @@
 ;   void SampleVariance16x16_sse2(      uint8_t * y_ref, int32_t y_ref_stride, uint8_t * y_src, int32_t y_src_stride,SMotionTextureUnit* pMotionTexture );
 ;***********************************************************************
 WELS_EXTERN SampleVariance16x16_sse2
-  push esi
-  push edi
-  push ebx
+    push esi
+    push edi
+    push ebx
 
-  sub esp, 16
-  %define SUM                   [esp]
-  %define SUM_CUR               [esp+4]
-  %define SQR                   [esp+8]
-  %define SQR_CUR               [esp+12]
-  %define PUSH_SIZE     28      ; 12 + 16
+    sub esp, 16
+    %define SUM                   [esp]
+    %define SUM_CUR               [esp+4]
+    %define SQR                   [esp+8]
+    %define SQR_CUR               [esp+12]
+    %define PUSH_SIZE     28      ; 12 + 16
 
-  mov edi, [esp+PUSH_SIZE+4]    ; y_ref
-  mov edx, [esp+PUSH_SIZE+8]    ; y_ref_stride
-  mov esi, [esp+PUSH_SIZE+12]   ; y_src
-  mov eax, [esp+PUSH_SIZE+16]   ; y_src_stride
-  mov ecx, 010h                         ; height = 16
+    mov edi, [esp+PUSH_SIZE+4]    ; y_ref
+    mov edx, [esp+PUSH_SIZE+8]    ; y_ref_stride
+    mov esi, [esp+PUSH_SIZE+12]   ; y_src
+    mov eax, [esp+PUSH_SIZE+16]   ; y_src_stride
+    mov ecx, 010h                         ; height = 16
 
-  pxor xmm7, xmm7
-  movdqu SUM, xmm7
+    pxor xmm7, xmm7
+    movdqu SUM, xmm7
 
 .hloops:
-  movdqa xmm0, [edi]            ; y_ref
-  movdqa xmm1, [esi]            ; y_src
-  movdqa xmm2, xmm0             ; store first for future process
-  movdqa xmm3, xmm1
-  ; sum += diff;
-  movdqa xmm4, xmm0
-  psadbw xmm4, xmm1             ; 2 parts, [0,..,15], [64,..,79]
-  ; to be continued for sum
-  pshufd xmm5, xmm4, 0C6h       ; 11000110 B
-  paddw xmm4, xmm5
-  movd ebx, xmm4
-  add SUM, ebx
+    movdqa xmm0, [edi]            ; y_ref
+    movdqa xmm1, [esi]            ; y_src
+    movdqa xmm2, xmm0             ; store first for future process
+    movdqa xmm3, xmm1
+    ; sum += diff;
+    movdqa xmm4, xmm0
+    psadbw xmm4, xmm1             ; 2 parts, [0,..,15], [64,..,79]
+    ; to be continued for sum
+    pshufd xmm5, xmm4, 0C6h       ; 11000110 B
+    paddw xmm4, xmm5
+    movd ebx, xmm4
+    add SUM, ebx
 
-  ; sqr += diff * diff;
-  pmaxub xmm0, xmm1
-  pminub xmm1, xmm2
-  psubb xmm0, xmm1                              ; diff
-  SUM_SQR_SSE2 xmm1, xmm0, xmm7 ; dst, pSrc, zero
-  movd ebx, xmm1
-  add SQR, ebx
+    ; sqr += diff * diff;
+    pmaxub xmm0, xmm1
+    pminub xmm1, xmm2
+    psubb xmm0, xmm1                              ; diff
+    SUM_SQR_SSE2 xmm1, xmm0, xmm7 ; dst, pSrc, zero
+    movd ebx, xmm1
+    add SQR, ebx
 
-  ; sum_cur += y_src[x];
-  movdqa xmm0, xmm3             ; cur_orig
-  movdqa xmm1, xmm0
-  punpcklbw xmm0, xmm7
-  punpckhbw xmm1, xmm7
-  paddw xmm0, xmm1              ; 8x2
-  SUM_WORD_8x2_SSE2 xmm0, xmm1
-  movd ebx, xmm0
-  and ebx, 0ffffh
-  add SUM_CUR, ebx
+    ; sum_cur += y_src[x];
+    movdqa xmm0, xmm3             ; cur_orig
+    movdqa xmm1, xmm0
+    punpcklbw xmm0, xmm7
+    punpckhbw xmm1, xmm7
+    paddw xmm0, xmm1              ; 8x2
+    SUM_WORD_8x2_SSE2 xmm0, xmm1
+    movd ebx, xmm0
+    and ebx, 0ffffh
+    add SUM_CUR, ebx
 
-  ; sqr_cur += y_src[x] * y_src[x];
-  SUM_SQR_SSE2 xmm0, xmm3, xmm7 ; dst, pSrc, zero
-  movd ebx, xmm0
-  add SQR_CUR, ebx
+    ; sqr_cur += y_src[x] * y_src[x];
+    SUM_SQR_SSE2 xmm0, xmm3, xmm7 ; dst, pSrc, zero
+    movd ebx, xmm0
+    add SQR_CUR, ebx
 
-  lea edi, [edi+edx]
-  lea esi, [esi+eax]
-  dec ecx
-  jnz near .hloops
+    lea edi, [edi+edx]
+    lea esi, [esi+eax]
+    dec ecx
+    jnz near .hloops
 
-  mov ebx, 0
-  mov bx, word SUM
-  sar ebx, 8
-  imul ebx, ebx
-  mov ecx, SQR
-  sar ecx, 8
-  sub ecx, ebx
-  mov edi, [esp+PUSH_SIZE+20]   ; pMotionTexture
-  mov [edi], cx                         ; to store uiMotionIndex
-  mov ebx, 0
-  mov bx, word SUM_CUR
-  sar ebx, 8
-  imul ebx, ebx
-  mov ecx, SQR_CUR
-  sar ecx, 8
-  sub ecx, ebx
-  mov [edi+2], cx                               ; to store uiTextureIndex
+    mov ebx, 0
+    mov bx, word SUM
+    sar ebx, 8
+    imul ebx, ebx
+    mov ecx, SQR
+    sar ecx, 8
+    sub ecx, ebx
+    mov edi, [esp+PUSH_SIZE+20]   ; pMotionTexture
+    mov [edi], cx                         ; to store uiMotionIndex
+    mov ebx, 0
+    mov bx, word SUM_CUR
+    sar ebx, 8
+    imul ebx, ebx
+    mov ecx, SQR_CUR
+    sar ecx, 8
+    sub ecx, ebx
+    mov [edi+2], cx                               ; to store uiTextureIndex
 
-  %undef SUM
-  %undef SUM_CUR
-  %undef SQR
-  %undef SQR_CUR
-  %undef PUSH_SIZE
+    %undef SUM
+    %undef SUM_CUR
+    %undef SQR
+    %undef SQR_CUR
+    %undef PUSH_SIZE
 
-  add esp, 16
-  pop ebx
-  pop edi
-  pop esi
+    add esp, 16
+    pop ebx
+    pop edi
+    pop esi
 
-  ret
+    ret
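
SampleVariance16x16_sse2 derives two per-macroblock measures: uiMotionIndex, the variance of the absolute difference between the 16x16 reference and source blocks, and uiTextureIndex, the variance of the source block itself, both computed as mean(x^2) - mean(x)^2 with the 256-pixel sums scaled down by a shift of 8. A scalar sketch under those assumptions (the struct here is a stand-in for SMotionTextureUnit, not its real definition):

    #include <stdint.h>
    #include <stdlib.h>

    typedef struct {
        uint16_t uiMotionIndex;   /* variance of |ref - src| over the block */
        uint16_t uiTextureIndex;  /* variance of src over the block         */
    } SMotionTextureUnitRef;

    static void SampleVariance16x16Ref (const uint8_t* ref, int ref_stride,
                                        const uint8_t* src, int src_stride,
                                        SMotionTextureUnitRef* out) {
        uint32_t sum = 0, sqr = 0, sum_cur = 0, sqr_cur = 0;
        for (int y = 0; y < 16; ++y) {
            for (int x = 0; x < 16; ++x) {
                uint32_t d = (uint32_t) abs ((int) ref[x] - (int) src[x]);
                sum     += d;        sqr     += d * d;
                sum_cur += src[x];   sqr_cur += (uint32_t) src[x] * src[x];
            }
            ref += ref_stride;
            src += src_stride;
        }
        /* mean(x^2) - mean(x)^2, with the 1/256 scaling done by >> 8 */
        out->uiMotionIndex  = (uint16_t) ((sqr >> 8)     - (sum >> 8)     * (sum >> 8));
        out->uiTextureIndex = (uint16_t) ((sqr_cur >> 8) - (sum_cur >> 8) * (sum_cur >> 8));
    }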
 
 
 
@@ -360,67 +360,67 @@
 %define         psadframe                       esp + pushsize + 24
 %define         psad8x8                         esp + pushsize + 28
 %define         pushsize        12
-  push  esi
-  push  edi
-  push  ebx
-  mov           esi,    [cur_data]
-  mov           edi,    [ref_data]
-  mov           ebx,    [iPicStride]
-  mov           edx,    [psad8x8]
-  mov           eax,    ebx
+    push  esi
+    push  edi
+    push  ebx
+    mov           esi,    [cur_data]
+    mov           edi,    [ref_data]
+    mov           ebx,    [iPicStride]
+    mov           edx,    [psad8x8]
+    mov           eax,    ebx
 
-  shr           dword [iPicWidth],      4                                       ; iPicWidth/16
-  shr           dword [iPicHeight],     4                                       ; iPicHeight/16
-  shl           eax,    4                                                               ; iPicStride*16
-  pxor  xmm0,   xmm0
-  pxor  xmm7,   xmm7            ; iFrameSad
+    shr           dword [iPicWidth],      4                                       ; iPicWidth/16
+    shr           dword [iPicHeight],     4                                       ; iPicHeight/16
+    shl           eax,    4                                                               ; iPicStride*16
+    pxor  xmm0,   xmm0
+    pxor  xmm7,   xmm7            ; iFrameSad
 height_loop:
-  mov           ecx,    dword [iPicWidth]
-  push  esi
-  push  edi
+    mov           ecx,    dword [iPicWidth]
+    push  esi
+    push  edi
 width_loop:
-  pxor  xmm6,   xmm6            ;
-  WELS_SAD_16x2_SSE2 esi,edi,ebx
-  WELS_SAD_16x2_SSE2 esi,edi,ebx
-  WELS_SAD_16x2_SSE2 esi,edi,ebx
-  WELS_SAD_16x2_SSE2 esi,edi,ebx
-  paddd xmm7,           xmm6
-  movd  [edx],          xmm6
-  psrldq        xmm6,           8
-  movd  [edx+4],        xmm6
+    pxor  xmm6,   xmm6            ;
+    WELS_SAD_16x2_SSE2 esi,edi,ebx
+    WELS_SAD_16x2_SSE2 esi,edi,ebx
+    WELS_SAD_16x2_SSE2 esi,edi,ebx
+    WELS_SAD_16x2_SSE2 esi,edi,ebx
+    paddd xmm7,           xmm6
+    movd  [edx],          xmm6
+    psrldq        xmm6,           8
+    movd  [edx+4],        xmm6
 
-  pxor  xmm6,   xmm6
-  WELS_SAD_16x2_SSE2 esi,edi,ebx
-  WELS_SAD_16x2_SSE2 esi,edi,ebx
-  WELS_SAD_16x2_SSE2 esi,edi,ebx
-  WELS_SAD_16x2_SSE2 esi,edi,ebx
-  paddd xmm7,           xmm6
-  movd  [edx+8],        xmm6
-  psrldq        xmm6,           8
-  movd  [edx+12],       xmm6
+    pxor  xmm6,   xmm6
+    WELS_SAD_16x2_SSE2 esi,edi,ebx
+    WELS_SAD_16x2_SSE2 esi,edi,ebx
+    WELS_SAD_16x2_SSE2 esi,edi,ebx
+    WELS_SAD_16x2_SSE2 esi,edi,ebx
+    paddd xmm7,           xmm6
+    movd  [edx+8],        xmm6
+    psrldq        xmm6,           8
+    movd  [edx+12],       xmm6
 
-  add           edx,    16
-  sub           esi,    eax
-  sub           edi,    eax
-  add           esi,    16
-  add           edi,    16
+    add           edx,    16
+    sub           esi,    eax
+    sub           edi,    eax
+    add           esi,    16
+    add           edi,    16
 
-  dec           ecx
-  jnz           width_loop
+    dec           ecx
+    jnz           width_loop
 
-  pop           edi
-  pop           esi
-  add           esi,    eax
-  add           edi,    eax
+    pop           edi
+    pop           esi
+    add           esi,    eax
+    add           edi,    eax
 
-  dec   dword [iPicHeight]
-  jnz           height_loop
+    dec   dword [iPicHeight]
+    jnz           height_loop
 
-  mov           edx,    [psadframe]
-  movdqa        xmm5,   xmm7
-  psrldq        xmm7,   8
-  paddd xmm7,   xmm5
-  movd  [edx],  xmm7
+    mov           edx,    [psadframe]
+    movdqa        xmm5,   xmm7
+    psrldq        xmm7,   8
+    paddd xmm7,   xmm5
+    movd  [edx],  xmm7
 
 %undef          cur_data
 %undef          ref_data
@@ -430,10 +430,10 @@
 %undef          psadframe
 %undef          psad8x8
 %undef          pushsize
-  pop           ebx
-  pop           edi
-  pop           esi
-  ret
+    pop           ebx
+    pop           edi
+    pop           esi
+    ret
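
The routine above walks the picture in 16x16 macroblocks, writing four 8x8 SADs per macroblock to psad8x8 (top-left, top-right, bottom-left, bottom-right, as laid down by the two psadbw halves) and accumulating the whole-frame SAD into psadframe. A scalar reference of that output layout (a sketch with a hypothetical name; the exported symbol is defined elsewhere in this file):

    #include <stdint.h>
    #include <stdlib.h>

    /* Scalar outline: for each 16x16 macroblock in raster order, emit the
     * SADs of its four 8x8 quarters and accumulate the frame-wide SAD. */
    static void CalcSadRef (const uint8_t* cur, const uint8_t* ref,
                            int pic_width, int pic_height, int pic_stride,
                            int32_t* psadframe, int32_t* psad8x8) {
        int32_t frame_sad = 0;
        for (int mb_y = 0; mb_y < pic_height / 16; ++mb_y) {
            for (int mb_x = 0; mb_x < pic_width / 16; ++mb_x) {
                for (int q = 0; q < 4; ++q) {           /* 8x8 quarter */
                    int qy = (q / 2) * 8, qx = (q % 2) * 8;
                    int32_t sad = 0;
                    for (int y = 0; y < 8; ++y)
                        for (int x = 0; x < 8; ++x) {
                            int off = (mb_y * 16 + qy + y) * pic_stride
                                    + mb_x * 16 + qx + x;
                            sad += abs ((int) cur[off] - (int) ref[off]);
                        }
                    *psad8x8++ = sad;
                    frame_sad += sad;
                }
            }
        }
        *psadframe = frame_sad;
    }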
 
 %else  ;64-bit
 
@@ -441,98 +441,98 @@
 ;   void SampleVariance16x16_sse2(      uint8_t * y_ref, int32_t y_ref_stride, uint8_t * y_src, int32_t y_src_stride,SMotionTextureUnit* pMotionTexture );
 ;***********************************************************************
 WELS_EXTERN SampleVariance16x16_sse2
-  %define SUM                   r10;[esp]
-  %define SUM_CUR               r11;[esp+4]
-  %define SQR                   r13;[esp+8]
-  %define SQR_CUR               r15;[esp+12]
+    %define SUM                   r10;[esp]
+    %define SUM_CUR               r11;[esp+4]
+    %define SQR                   r13;[esp+8]
+    %define SQR_CUR               r15;[esp+12]
 
-  push r12
-  push r13
-  push r14
-  push r15
-  %assign push_num 4
-  LOAD_5_PARA
-  PUSH_XMM 8
-  SIGN_EXTENSION r1,r1d
-  SIGN_EXTENSION r3,r3d
+    push r12
+    push r13
+    push r14
+    push r15
+    %assign push_num 4
+    LOAD_5_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r1,r1d
+    SIGN_EXTENSION r3,r3d
 
-  mov r12,010h
-  pxor xmm7, xmm7
-  movq SUM, xmm7
-  movq SUM_CUR,xmm7
-  movq SQR,xmm7
-  movq SQR_CUR,xmm7
+    mov r12,010h
+    pxor xmm7, xmm7
+    movq SUM, xmm7
+    movq SUM_CUR,xmm7
+    movq SQR,xmm7
+    movq SQR_CUR,xmm7
 
 .hloops:
-  mov r14,0
-  movdqa xmm0, [r0]             ; y_ref
-  movdqa xmm1, [r2]             ; y_src
-  movdqa xmm2, xmm0             ; store first for future process
-  movdqa xmm3, xmm1
-  ; sum += diff;
-  movdqa xmm4, xmm0
-  psadbw xmm4, xmm1             ; 2 parts, [0,..,15], [64,..,79]
-  ; to be continued for sum
-  pshufd xmm5, xmm4, 0C6h       ; 11000110 B
-  paddw xmm4, xmm5
-  movd r14d, xmm4
-  add SUM, r14
+    mov r14,0
+    movdqa xmm0, [r0]             ; y_ref
+    movdqa xmm1, [r2]             ; y_src
+    movdqa xmm2, xmm0             ; store first for future process
+    movdqa xmm3, xmm1
+    ; sum += diff;
+    movdqa xmm4, xmm0
+    psadbw xmm4, xmm1             ; 2 parts, [0,..,15], [64,..,79]
+    ; to be continued for sum
+    pshufd xmm5, xmm4, 0C6h       ; 11000110 B
+    paddw xmm4, xmm5
+    movd r14d, xmm4
+    add SUM, r14
 
-  ; sqr += diff * diff;
-  pmaxub xmm0, xmm1
-  pminub xmm1, xmm2
-  psubb xmm0, xmm1                              ; diff
-  SUM_SQR_SSE2 xmm1, xmm0, xmm7 ; dst, pSrc, zero
-  movd r14d, xmm1
-  add SQR, r14
+    ; sqr += diff * diff;
+    pmaxub xmm0, xmm1
+    pminub xmm1, xmm2
+    psubb xmm0, xmm1                              ; diff
+    SUM_SQR_SSE2 xmm1, xmm0, xmm7 ; dst, pSrc, zero
+    movd r14d, xmm1
+    add SQR, r14
 
-  ; sum_cur += y_src[x];
-  movdqa xmm0, xmm3             ; cur_orig
-  movdqa xmm1, xmm0
-  punpcklbw xmm0, xmm7
-  punpckhbw xmm1, xmm7
-  paddw xmm0, xmm1              ; 8x2
-  SUM_WORD_8x2_SSE2 xmm0, xmm1
-  movd r14d, xmm0
-  and r14, 0ffffh
-  add SUM_CUR, r14
+    ; sum_cur += y_src[x];
+    movdqa xmm0, xmm3             ; cur_orig
+    movdqa xmm1, xmm0
+    punpcklbw xmm0, xmm7
+    punpckhbw xmm1, xmm7
+    paddw xmm0, xmm1              ; 8x2
+    SUM_WORD_8x2_SSE2 xmm0, xmm1
+    movd r14d, xmm0
+    and r14, 0ffffh
+    add SUM_CUR, r14
 
-  ; sqr_cur += y_src[x] * y_src[x];
-  SUM_SQR_SSE2 xmm0, xmm3, xmm7 ; dst, pSrc, zero
-  movd r14d, xmm0
-  add SQR_CUR, r14
+    ; sqr_cur += y_src[x] * y_src[x];
+    SUM_SQR_SSE2 xmm0, xmm3, xmm7 ; dst, pSrc, zero
+    movd r14d, xmm0
+    add SQR_CUR, r14
 
-  lea r0, [r0+r1]
-  lea r2, [r2+r3]
-  dec r12
-  jnz near .hloops
+    lea r0, [r0+r1]
+    lea r2, [r2+r3]
+    dec r12
+    jnz near .hloops
 
-  mov r0, SUM
-  sar r0, 8
-  imul r0, r0
-  mov r1, SQR
-  sar r1, 8
-  sub r1, r0
-  mov [r4], r1w                         ; to store uiMotionIndex
-  mov r0, SUM_CUR
-  sar r0, 8
-  imul r0, r0
-  mov r1, SQR_CUR
-  sar r1, 8
-  sub r1, r0
-  mov [r4+2], r1w                               ; to store uiTextureIndex
+    mov r0, SUM
+    sar r0, 8
+    imul r0, r0
+    mov r1, SQR
+    sar r1, 8
+    sub r1, r0
+    mov [r4], r1w                         ; to store uiMotionIndex
+    mov r0, SUM_CUR
+    sar r0, 8
+    imul r0, r0
+    mov r1, SQR_CUR
+    sar r1, 8
+    sub r1, r0
+    mov [r4+2], r1w                               ; to store uiTextureIndex
 
-  POP_XMM
-  LOAD_5_PARA_POP
-  pop r15
-  pop r14
-  pop r13
-  pop r12
+    POP_XMM
+    LOAD_5_PARA_POP
+    pop r15
+    pop r14
+    pop r13
+    pop r12
 
 
-  %assign push_num 0
+    %assign push_num 0
 
-  ret
+    ret
 
 
 ;*************************************************************************************************************
@@ -550,69 +550,69 @@
 %define         psadframe                       r5
 %define         psad8x8                         r6
 
-  push r12
-  push r13
-  %assign push_num 2
-  LOAD_7_PARA
-  PUSH_XMM 8
-  SIGN_EXTENSION r2,r2d
-  SIGN_EXTENSION r3,r3d
-  SIGN_EXTENSION r4,r4d
+    push r12
+    push r13
+    %assign push_num 2
+    LOAD_7_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r2,r2d
+    SIGN_EXTENSION r3,r3d
+    SIGN_EXTENSION r4,r4d
 
-  mov   r12,r4
-  shr           r2,     4                                       ; iPicWidth/16
-  shr           r3,     4                                       ; iPicHeight/16
+    mov   r12,r4
+    shr           r2,     4                                       ; iPicWidth/16
+    shr           r3,     4                                       ; iPicHeight/16
 
-  shl           r12,    4                                                               ; iPicStride*16
-  pxor  xmm0,   xmm0
-  pxor  xmm7,   xmm7            ; iFrameSad
+    shl           r12,    4                                                               ; iPicStride*16
+    pxor  xmm0,   xmm0
+    pxor  xmm7,   xmm7            ; iFrameSad
 height_loop:
-  mov           r13,    r2
-  push  r0
-  push  r1
+    mov           r13,    r2
+    push  r0
+    push  r1
 width_loop:
-  pxor  xmm6,   xmm6
-  WELS_SAD_16x2_SSE2 r0,r1,r4
-  WELS_SAD_16x2_SSE2 r0,r1,r4
-  WELS_SAD_16x2_SSE2 r0,r1,r4
-  WELS_SAD_16x2_SSE2 r0,r1,r4
-  paddd xmm7,           xmm6
-  movd  [r6],           xmm6
-  psrldq        xmm6,           8
-  movd  [r6+4], xmm6
+    pxor  xmm6,   xmm6
+    WELS_SAD_16x2_SSE2 r0,r1,r4
+    WELS_SAD_16x2_SSE2 r0,r1,r4
+    WELS_SAD_16x2_SSE2 r0,r1,r4
+    WELS_SAD_16x2_SSE2 r0,r1,r4
+    paddd xmm7,           xmm6
+    movd  [r6],           xmm6
+    psrldq        xmm6,           8
+    movd  [r6+4], xmm6
 
-  pxor  xmm6,   xmm6
-  WELS_SAD_16x2_SSE2 r0,r1,r4
-  WELS_SAD_16x2_SSE2 r0,r1,r4
-  WELS_SAD_16x2_SSE2 r0,r1,r4
-  WELS_SAD_16x2_SSE2 r0,r1,r4
-  paddd xmm7,           xmm6
-  movd  [r6+8], xmm6
-  psrldq        xmm6,           8
-  movd  [r6+12],        xmm6
+    pxor  xmm6,   xmm6
+    WELS_SAD_16x2_SSE2 r0,r1,r4
+    WELS_SAD_16x2_SSE2 r0,r1,r4
+    WELS_SAD_16x2_SSE2 r0,r1,r4
+    WELS_SAD_16x2_SSE2 r0,r1,r4
+    paddd xmm7,           xmm6
+    movd  [r6+8], xmm6
+    psrldq        xmm6,           8
+    movd  [r6+12],        xmm6
 
-  add           r6,     16
-  sub           r0,     r12
-  sub           r1,     r12
-  add           r0,     16
-  add           r1,     16
+    add           r6,     16
+    sub           r0,     r12
+    sub           r1,     r12
+    add           r0,     16
+    add           r1,     16
 
-  dec           r13
-  jnz           width_loop
+    dec           r13
+    jnz           width_loop
 
-  pop           r1
-  pop           r0
-  add           r0,     r12
-  add           r1,     r12
+    pop           r1
+    pop           r0
+    add           r0,     r12
+    add           r1,     r12
 
-  dec   r3
-  jnz           height_loop
+    dec   r3
+    jnz           height_loop
 
-  ;mov          r13,    [psadframe]
-  movdqa        xmm5,   xmm7
-  psrldq        xmm7,   8
-  paddd xmm7,   xmm5
-  movd  [psadframe],    xmm7
+    ;mov          r13,    [psadframe]
+    movdqa        xmm5,   xmm7
+    psrldq        xmm7,   8
+    paddd xmm7,   xmm5
+    movd  [psadframe],    xmm7
 
 %undef          cur_data
 %undef          ref_data
@@ -622,12 +622,12 @@
 %undef          psadframe
 %undef          psad8x8
 %undef          pushsize
-  POP_XMM
-  LOAD_7_PARA_POP
-  pop r13
-  pop r12
-  %assign push_num 0
-  ret
+    POP_XMM
+    LOAD_7_PARA_POP
+    pop r13
+    pop r12
+    %assign push_num 0
+    ret
 
 %endif
 
@@ -653,103 +653,103 @@
 %define         tmp_esi                         esp + 0
 %define         tmp_edi                         esp + 4
 %define         pushsize                16
-  push  ebp
-  push  esi
-  push  edi
-  push  ebx
-  sub           esp,    localsize
-  mov           esi,    [cur_data]
-  mov           edi,    [ref_data]
-  mov           ebx,    [iPicStride]
-  mov           edx,    [psad8x8]
-  mov           eax,    ebx
+    push  ebp
+    push  esi
+    push  edi
+    push  ebx
+    sub           esp,    localsize
+    mov           esi,    [cur_data]
+    mov           edi,    [ref_data]
+    mov           ebx,    [iPicStride]
+    mov           edx,    [psad8x8]
+    mov           eax,    ebx
 
-  shr           dword [iPicWidth],      4                                       ; iPicWidth/16
-  shr           dword [iPicHeight],     4                                       ; iPicHeight/16
-  shl           eax,    4                                                       ; iPicStride*16
-  pxor  xmm0,   xmm0
-  pxor  xmm7,   xmm7            ; iFrameSad
+    shr           dword [iPicWidth],      4                                       ; iPicWidth/16
+    shr           dword [iPicHeight],     4                                       ; iPicHeight/16
+    shl           eax,    4                                                       ; iPicStride*16
+    pxor  xmm0,   xmm0
+    pxor  xmm7,   xmm7            ; iFrameSad
 var_height_loop:
-  mov           ecx,    dword [iPicWidth]
-  mov           [tmp_esi],      esi
-  mov           [tmp_edi],      edi
+    mov           ecx,    dword [iPicWidth]
+    mov           [tmp_esi],      esi
+    mov           [tmp_edi],      edi
 var_width_loop:
-  pxor  xmm6,   xmm6            ; hiQuad_loQuad pSad8x8
-  pxor  xmm5,   xmm5            ; pSum16x16
-  pxor  xmm4,   xmm4            ; sqsum_16x16
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
-  paddd xmm7,           xmm6
-  movd  [edx],          xmm6
-  psrldq        xmm6,           8
-  movd  [edx+4],        xmm6
+    pxor  xmm6,   xmm6            ; hiQuad_loQuad pSad8x8
+    pxor  xmm5,   xmm5            ; pSum16x16
+    pxor  xmm4,   xmm4            ; sqsum_16x16
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+    paddd xmm7,           xmm6
+    movd  [edx],          xmm6
+    psrldq        xmm6,           8
+    movd  [edx+4],        xmm6
 
-  pxor  xmm6,   xmm6
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
-  paddd xmm7,           xmm6
-  movd  [edx+8],        xmm6
-  psrldq        xmm6,           8
-  movd  [edx+12],       xmm6
+    pxor  xmm6,   xmm6
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+    paddd xmm7,           xmm6
+    movd  [edx+8],        xmm6
+    psrldq        xmm6,           8
+    movd  [edx+12],       xmm6
 
-  mov           ebp,    [psum16x16]
-  movdqa        xmm1,   xmm5
-  psrldq        xmm1,   8
-  paddd xmm5,   xmm1
-  movd  [ebp],  xmm5
-  add           dword [psum16x16], 4
+    mov           ebp,    [psum16x16]
+    movdqa        xmm1,   xmm5
+    psrldq        xmm1,   8
+    paddd xmm5,   xmm1
+    movd  [ebp],  xmm5
+    add           dword [psum16x16], 4
 
-  movdqa        xmm5,   xmm4
-  psrldq        xmm5,   8
-  paddd xmm4,   xmm5
-  movdqa        xmm3,   xmm4
-  psrldq        xmm3,   4
-  paddd xmm4,   xmm3
+    movdqa        xmm5,   xmm4
+    psrldq        xmm5,   8
+    paddd xmm4,   xmm5
+    movdqa        xmm3,   xmm4
+    psrldq        xmm3,   4
+    paddd xmm4,   xmm3
 
-  mov           ebp,    [psqsum16x16]
-  movd  [ebp],  xmm4
-  add           dword [psqsum16x16], 4
+    mov           ebp,    [psqsum16x16]
+    movd  [ebp],  xmm4
+    add           dword [psqsum16x16], 4
 
-  add           edx,    16
-  sub           esi,    eax
-  sub           edi,    eax
-  add           esi,    16
-  add           edi,    16
+    add           edx,    16
+    sub           esi,    eax
+    sub           edi,    eax
+    add           esi,    16
+    add           edi,    16
 
-  dec           ecx
-  jnz           var_width_loop
+    dec           ecx
+    jnz           var_width_loop
 
-  mov           esi,    [tmp_esi]
-  mov           edi,    [tmp_edi]
-  add           esi,    eax
-  add           edi,    eax
+    mov           esi,    [tmp_esi]
+    mov           edi,    [tmp_edi]
+    add           esi,    eax
+    add           edi,    eax
 
-  dec   dword [iPicHeight]
-  jnz           var_height_loop
+    dec   dword [iPicHeight]
+    jnz           var_height_loop
 
-  mov           edx,    [psadframe]
-  movdqa        xmm5,   xmm7
-  psrldq        xmm7,   8
-  paddd xmm7,   xmm5
-  movd  [edx],  xmm7
+    mov           edx,    [psadframe]
+    movdqa        xmm5,   xmm7
+    psrldq        xmm7,   8
+    paddd xmm7,   xmm5
+    movd  [edx],  xmm7
 
-  add           esp,    localsize
-  pop           ebx
-  pop           edi
-  pop           esi
-  pop           ebp
+    add           esp,    localsize
+    pop           ebx
+    pop           edi
+    pop           esi
+    pop           ebp
 %undef          cur_data
 %undef          ref_data
 %undef          iPicWidth
@@ -763,7 +763,7 @@
 %undef          tmp_edi
 %undef          pushsize
 %undef          localsize
-  ret
+    ret
 
 %else  ;64-bit
 
@@ -784,112 +784,112 @@
 %define         psum16x16                       arg8
 %define         psqsum16x16                 arg9
 
-  push r12
-  push r13
-  push r14
-  push r15
-  %assign push_num 4
-  PUSH_XMM 8
+    push r12
+    push r13
+    push r14
+    push r15
+    %assign push_num 4
+    PUSH_XMM 8
 
 %ifdef WIN64
-  mov r4, arg5  ;iPicStride
-  mov r5, arg6  ;psad8x8
+    mov r4, arg5  ;iPicStride
+    mov r5, arg6  ;psad8x8
 %endif
-  mov r14,arg7
-  SIGN_EXTENSION r2,r2d
-  SIGN_EXTENSION r3,r3d
-  SIGN_EXTENSION r4,r4d
+    mov r14,arg7
+    SIGN_EXTENSION r2,r2d
+    SIGN_EXTENSION r3,r3d
+    SIGN_EXTENSION r4,r4d
 
-  mov   r13,r4
-  shr   r2,4
-  shr   r3,4
+    mov   r13,r4
+    shr   r2,4
+    shr   r3,4
 
-  shl   r13,4   ; iPicStride*16
-  pxor  xmm0,   xmm0
-  pxor  xmm7,   xmm7            ; iFrameSad
+    shl   r13,4   ; iPicStride*16
+    pxor  xmm0,   xmm0
+    pxor  xmm7,   xmm7            ; iFrameSad
 var_height_loop:
-  push    r2
-  %assign push_num push_num+1
-  mov           r11,    r0
-  mov           r12,    r1
+    push    r2
+    %assign push_num push_num+1
+    mov           r11,    r0
+    mov           r12,    r1
 var_width_loop:
-  pxor  xmm6,   xmm6            ; hiQuad_loQuad pSad8x8
-  pxor  xmm5,   xmm5            ; pSum16x16
-  pxor  xmm4,   xmm4            ; sqsum_16x16
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
-  paddd xmm7,           xmm6
-  movd  [r14],          xmm6
-  psrldq        xmm6,           8
-  movd  [r14+4],        xmm6
+    pxor  xmm6,   xmm6            ; hiQuad_loQuad pSad8x8
+    pxor  xmm5,   xmm5            ; pSum16x16
+    pxor  xmm4,   xmm4            ; sqsum_16x16
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+    paddd xmm7,           xmm6
+    movd  [r14],          xmm6
+    psrldq        xmm6,           8
+    movd  [r14+4],        xmm6
 
-  pxor  xmm6,   xmm6
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
-  paddd   xmm7,           xmm6
-  movd    [r14+8],        xmm6
-  psrldq  xmm6,           8
-  movd    [r14+12],       xmm6
+    pxor  xmm6,   xmm6
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+    WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+    paddd   xmm7,           xmm6
+    movd    [r14+8],        xmm6
+    psrldq  xmm6,           8
+    movd    [r14+12],       xmm6
 
-  mov             r15,    psum16x16
-  movdqa  xmm1,   xmm5
-  psrldq  xmm1,   8
-  paddd   xmm5,   xmm1
-  movd    [r15],  xmm5
-  add             dword psum16x16, 4
+    mov             r15,    psum16x16
+    movdqa  xmm1,   xmm5
+    psrldq  xmm1,   8
+    paddd   xmm5,   xmm1
+    movd    [r15],  xmm5
+    add             dword psum16x16, 4
 
-  movdqa  xmm5,   xmm4
-  psrldq  xmm5,   8
-  paddd   xmm4,   xmm5
-  movdqa  xmm3,   xmm4
-  psrldq  xmm3,   4
-  paddd   xmm4,   xmm3
+    movdqa  xmm5,   xmm4
+    psrldq  xmm5,   8
+    paddd   xmm4,   xmm5
+    movdqa  xmm3,   xmm4
+    psrldq  xmm3,   4
+    paddd   xmm4,   xmm3
 
-  mov             r15,    psqsum16x16
-  movd    [r15],  xmm4
-  add             dword psqsum16x16, 4
+    mov             r15,    psqsum16x16
+    movd    [r15],  xmm4
+    add             dword psqsum16x16, 4
 
-  add             r14,16
-  sub             r0,     r13
-  sub             r1,     r13
-  add             r0,     16
-  add             r1,     16
+    add             r14,16
+    sub             r0,     r13
+    sub             r1,     r13
+    add             r0,     16
+    add             r1,     16
 
-  dec             r2
-  jnz             var_width_loop
+    dec             r2
+    jnz             var_width_loop
 
-  pop     r2
-  %assign push_num push_num-1
-  mov             r0,     r11
-  mov             r1,     r12
-  add             r0,     r13
-  add             r1,     r13
-  dec     r3
-  jnz             var_height_loop
+    pop     r2
+    %assign push_num push_num-1
+    mov             r0,     r11
+    mov             r1,     r12
+    add             r0,     r13
+    add             r1,     r13
+    dec     r3
+    jnz             var_height_loop
 
-  mov             r15,    psadframe
-  movdqa  xmm5,   xmm7
-  psrldq  xmm7,   8
-  paddd   xmm7,   xmm5
-  movd    [r15],  xmm7
+    mov             r15,    psadframe
+    movdqa  xmm5,   xmm7
+    psrldq  xmm7,   8
+    paddd   xmm7,   xmm5
+    movd    [r15],  xmm7
 
-  POP_XMM
-  pop r15
-  pop r14
-  pop r13
-  pop r12
+    POP_XMM
+    pop r15
+    pop r14
+    pop r13
+    pop r12
 %assign push_num 0
 %undef          cur_data
 %undef          ref_data
@@ -904,7 +904,7 @@
 %undef          tmp_edi
 %undef          pushsize
 %undef          localsize
-  ret
+    ret
 
 %endif
 
@@ -932,118 +932,118 @@
 %define         tmp_edi                         esp + 4
 %define         tmp_sadframe            esp + 8
 %define         pushsize                16
-  push    ebp
-  push    esi
-  push    edi
-  push    ebx
-  sub             esp,    localsize
+    push    ebp
+    push    esi
+    push    edi
+    push    ebx
+    sub             esp,    localsize
 
-  mov             ecx,    [iPicWidth]
-  mov             ecx,    [iPicHeight]
-  mov             esi,    [cur_data]
-  mov             edi,    [ref_data]
-  mov             ebx,    [iPicStride]
-  mov             edx,    [psad8x8]
-  mov             eax,    ebx
+    mov             ecx,    [iPicWidth]
+    mov             ecx,    [iPicHeight]
+    mov             esi,    [cur_data]
+    mov             edi,    [ref_data]
+    mov             ebx,    [iPicStride]
+    mov             edx,    [psad8x8]
+    mov             eax,    ebx
 
-  shr             dword [iPicWidth],      4                                       ; iPicWidth/16
-  shr             dword [iPicHeight],     4                                       ; iPicHeight/16
-  shl             eax,    4                                                       ; iPicStride*16
-  mov             ecx,    [iPicWidth]
-  mov             ecx,    [iPicHeight]
-  pxor    xmm0,   xmm0
-  movd    [tmp_sadframe], xmm0
+    shr             dword [iPicWidth],      4                                       ; iPicWidth/16
+    shr             dword [iPicHeight],     4                                       ; iPicHeight/16
+    shl             eax,    4                                                       ; iPicStride*16
+    mov             ecx,    [iPicWidth]
+    mov             ecx,    [iPicHeight]
+    pxor    xmm0,   xmm0
+    movd    [tmp_sadframe], xmm0
 sqdiff_height_loop:
-  mov             ecx,    dword [iPicWidth]
-  mov             [tmp_esi],      esi
-  mov             [tmp_edi],      edi
+    mov             ecx,    dword [iPicWidth]
+    mov             [tmp_esi],      esi
+    mov             [tmp_edi],      edi
 sqdiff_width_loop:
-  pxor    xmm7,   xmm7            ; hiQuad_loQuad pSad8x8
-  pxor    xmm6,   xmm6            ; pSum16x16
-  pxor    xmm5,   xmm5            ; sqsum_16x16  four dword
-  pxor    xmm4,   xmm4            ; sqdiff_16x16  four Dword
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
-  movdqa  xmm1,           xmm7
-  movd    [edx],          xmm7
-  psrldq  xmm7,           8
-  paddd   xmm1,           xmm7
-  movd    [edx+4],        xmm7
-  movd    ebp,            xmm1
-  add             [tmp_sadframe], ebp
+    pxor    xmm7,   xmm7            ; hiQuad_loQuad pSad8x8
+    pxor    xmm6,   xmm6            ; pSum16x16
+    pxor    xmm5,   xmm5            ; sqsum_16x16  four dword
+    pxor    xmm4,   xmm4            ; sqdiff_16x16  four Dword
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+    movdqa  xmm1,           xmm7
+    movd    [edx],          xmm7
+    psrldq  xmm7,           8
+    paddd   xmm1,           xmm7
+    movd    [edx+4],        xmm7
+    movd    ebp,            xmm1
+    add             [tmp_sadframe], ebp
 
-  pxor    xmm7,   xmm7
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
-  movdqa  xmm1,           xmm7
-  movd    [edx+8],        xmm7
-  psrldq  xmm7,           8
-  paddd   xmm1,           xmm7
-  movd    [edx+12],       xmm7
-  movd    ebp,            xmm1
-  add             [tmp_sadframe], ebp
+    pxor    xmm7,   xmm7
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+    movdqa  xmm1,           xmm7
+    movd    [edx+8],        xmm7
+    psrldq  xmm7,           8
+    paddd   xmm1,           xmm7
+    movd    [edx+12],       xmm7
+    movd    ebp,            xmm1
+    add             [tmp_sadframe], ebp
 
-  mov             ebp,    [psum16x16]
-  movdqa  xmm1,   xmm6
-  psrldq  xmm1,   8
-  paddd   xmm6,   xmm1
-  movd    [ebp],  xmm6
-  add             dword [psum16x16], 4
+    mov             ebp,    [psum16x16]
+    movdqa  xmm1,   xmm6
+    psrldq  xmm1,   8
+    paddd   xmm6,   xmm1
+    movd    [ebp],  xmm6
+    add             dword [psum16x16], 4
 
-  mov             ebp,    [psqsum16x16]
-  pshufd  xmm6,   xmm5,   14 ;00001110
-  paddd   xmm6,   xmm5
-  pshufd  xmm5,   xmm6,   1  ;00000001
-  paddd   xmm5,   xmm6
-  movd    [ebp],  xmm5
-  add             dword [psqsum16x16], 4
+    mov             ebp,    [psqsum16x16]
+    pshufd  xmm6,   xmm5,   14 ;00001110
+    paddd   xmm6,   xmm5
+    pshufd  xmm5,   xmm6,   1  ;00000001
+    paddd   xmm5,   xmm6
+    movd    [ebp],  xmm5
+    add             dword [psqsum16x16], 4
 
-  mov             ebp,    [psqdiff16x16]
-  pshufd  xmm5,   xmm4,   14      ; 00001110
-  paddd   xmm5,   xmm4
-  pshufd  xmm4,   xmm5,   1       ; 00000001
-  paddd   xmm4,   xmm5
-  movd    [ebp],  xmm4
-  add             dword   [psqdiff16x16], 4
+    mov             ebp,    [psqdiff16x16]
+    pshufd  xmm5,   xmm4,   14      ; 00001110
+    paddd   xmm5,   xmm4
+    pshufd  xmm4,   xmm5,   1       ; 00000001
+    paddd   xmm4,   xmm5
+    movd    [ebp],  xmm4
+    add             dword   [psqdiff16x16], 4
 
-  add             edx,    16
-  sub             esi,    eax
-  sub             edi,    eax
-  add             esi,    16
-  add             edi,    16
+    add             edx,    16
+    sub             esi,    eax
+    sub             edi,    eax
+    add             esi,    16
+    add             edi,    16
 
-  dec             ecx
-  jnz             sqdiff_width_loop
+    dec             ecx
+    jnz             sqdiff_width_loop
 
-  mov             esi,    [tmp_esi]
-  mov             edi,    [tmp_edi]
-  add             esi,    eax
-  add             edi,    eax
+    mov             esi,    [tmp_esi]
+    mov             edi,    [tmp_edi]
+    add             esi,    eax
+    add             edi,    eax
 
-  dec     dword [iPicHeight]
-  jnz             sqdiff_height_loop
+    dec     dword [iPicHeight]
+    jnz             sqdiff_height_loop
 
-  mov             ebx,    [tmp_sadframe]
-  mov             eax,    [psadframe]
-  mov             [eax],  ebx
+    mov             ebx,    [tmp_sadframe]
+    mov             eax,    [psadframe]
+    mov             [eax],  ebx
 
-  add             esp,    localsize
-  pop             ebx
-  pop             edi
-  pop             esi
-  pop             ebp
+    add             esp,    localsize
+    pop             ebx
+    pop             edi
+    pop             esi
+    pop             ebp
 %undef          cur_data
 %undef          ref_data
 %undef          iPicWidth
@@ -1059,7 +1059,7 @@
 %undef          tmp_sadframe
 %undef          pushsize
 %undef          localsize
-  ret
+    ret
 
 %else
 
@@ -1083,128 +1083,128 @@
 %define         psqsum16x16                     arg9;
 %define         psqdiff16x16                    arg10
 
-  push r12
-  push r13
-  push r14
-  push r15
-  %assign push_num 4
-  PUSH_XMM 10
+    push r12
+    push r13
+    push r14
+    push r15
+    %assign push_num 4
+    PUSH_XMM 10
 
 %ifdef WIN64
-  mov r4,arg5
+    mov r4,arg5
 %endif
-  mov r14,arg7
-  SIGN_EXTENSION r2,r2d
-  SIGN_EXTENSION r3,r3d
-  SIGN_EXTENSION r4,r4d
+    mov r14,arg7
+    SIGN_EXTENSION r2,r2d
+    SIGN_EXTENSION r3,r3d
+    SIGN_EXTENSION r4,r4d
 
-  mov        r13,r4
-  shr     r2,4   ; iPicWidth/16
-  shr     r3,4   ; iPicHeight/16
-  shl     r13,4   ; iPicStride*16
-  pxor    xmm0,   xmm0
-  pxor  xmm8, xmm8  ;framesad
-  pxor  xmm9, xmm9
+    mov        r13,r4
+    shr     r2,4   ; iPicWidth/16
+    shr     r3,4   ; iPicHeight/16
+    shl     r13,4   ; iPicStride*16
+    pxor    xmm0,   xmm0
+    pxor  xmm8, xmm8  ;framesad
+    pxor  xmm9, xmm9
 sqdiff_height_loop:
-  ;mov            ecx,    dword [iPicWidth]
-  ;mov      r14,r2
-  push r2
-  %assign push_num push_num +1
-  mov             r10,    r0
-  mov             r11,    r1
+    ;mov            ecx,    dword [iPicWidth]
+    ;mov      r14,r2
+    push r2
+    %assign push_num push_num +1
+    mov             r10,    r0
+    mov             r11,    r1
 sqdiff_width_loop:
-  pxor    xmm7,   xmm7            ; hiQuad_loQuad pSad8x8
-  pxor    xmm6,   xmm6            ; pSum16x16
-  pxor    xmm5,   xmm5            ; sqsum_16x16  four dword
-  pxor    xmm4,   xmm4            ; sqdiff_16x16  four Dword
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
-  movdqa  xmm1,           xmm7
-  movd    [r14],          xmm7
-  psrldq  xmm7,           8
-  paddd   xmm1,           xmm7
-  movd    [r14+4],        xmm7
-  movd    r15d,           xmm1
-  movd  xmm9, r15d
-  paddd xmm8,xmm9
+    pxor    xmm7,   xmm7            ; hiQuad_loQuad pSad8x8
+    pxor    xmm6,   xmm6            ; pSum16x16
+    pxor    xmm5,   xmm5            ; sqsum_16x16  four dword
+    pxor    xmm4,   xmm4            ; sqdiff_16x16  four Dword
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+    movdqa  xmm1,           xmm7
+    movd    [r14],          xmm7
+    psrldq  xmm7,           8
+    paddd   xmm1,           xmm7
+    movd    [r14+4],        xmm7
+    movd    r15d,           xmm1
+    movd  xmm9, r15d
+    paddd xmm8,xmm9
 
 
-  pxor    xmm7,   xmm7
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
-  WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
-  movdqa  xmm1,           xmm7
-  movd    [r14+8],        xmm7
-  psrldq  xmm7,           8
-  paddd   xmm1,           xmm7
-  movd    [r14+12],       xmm7
-  movd    r15d,           xmm1
-  movd  xmm9, r15d
-  paddd xmm8,xmm9
+    pxor    xmm7,   xmm7
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+    WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+    movdqa  xmm1,           xmm7
+    movd    [r14+8],        xmm7
+    psrldq  xmm7,           8
+    paddd   xmm1,           xmm7
+    movd    [r14+12],       xmm7
+    movd    r15d,           xmm1
+    movd  xmm9, r15d
+    paddd xmm8,xmm9
 
-  mov             r15,    psum16x16
-  movdqa  xmm1,   xmm6
-  psrldq  xmm1,   8
-  paddd   xmm6,   xmm1
-  movd    [r15],  xmm6
-  add             dword psum16x16, 4
+    mov             r15,    psum16x16
+    movdqa  xmm1,   xmm6
+    psrldq  xmm1,   8
+    paddd   xmm6,   xmm1
+    movd    [r15],  xmm6
+    add             dword psum16x16, 4
 
-  mov             r15,    psqsum16x16
-  pshufd  xmm6,   xmm5,   14 ;00001110
-  paddd   xmm6,   xmm5
-  pshufd  xmm5,   xmm6,   1  ;00000001
-  paddd   xmm5,   xmm6
-  movd    [r15],  xmm5
-  add             dword psqsum16x16, 4
+    mov             r15,    psqsum16x16
+    pshufd  xmm6,   xmm5,   14 ;00001110
+    paddd   xmm6,   xmm5
+    pshufd  xmm5,   xmm6,   1  ;00000001
+    paddd   xmm5,   xmm6
+    movd    [r15],  xmm5
+    add             dword psqsum16x16, 4
 
-  mov             r15,    psqdiff16x16
-  pshufd  xmm5,   xmm4,   14      ; 00001110
-  paddd   xmm5,   xmm4
-  pshufd  xmm4,   xmm5,   1       ; 00000001
-  paddd   xmm4,   xmm5
-  movd    [r15],  xmm4
-  add             dword   psqdiff16x16,   4
+    mov             r15,    psqdiff16x16
+    pshufd  xmm5,   xmm4,   14      ; 00001110
+    paddd   xmm5,   xmm4
+    pshufd  xmm4,   xmm5,   1       ; 00000001
+    paddd   xmm4,   xmm5
+    movd    [r15],  xmm4
+    add             dword   psqdiff16x16,   4
 
-  add             r14,16
-  sub             r0,     r13
-  sub             r1,     r13
-  add             r0,     16
-  add             r1,     16
+    add             r14,16
+    sub             r0,     r13
+    sub             r1,     r13
+    add             r0,     16
+    add             r1,     16
 
-  dec             r2
-  jnz             sqdiff_width_loop
+    dec             r2
+    jnz             sqdiff_width_loop
 
-  pop r2
-  %assign push_num push_num -1
+    pop r2
+    %assign push_num push_num -1
 
-  mov             r0,     r10
-  mov             r1,     r11
-  add             r0,     r13
-  add             r1,     r13
+    mov             r0,     r10
+    mov             r1,     r11
+    add             r0,     r13
+    add             r1,     r13
 
-  dec     r3
-  jnz             sqdiff_height_loop
+    dec     r3
+    jnz             sqdiff_height_loop
 
-  mov             r13,    psadframe
-  movd    [r13],  xmm8
+    mov             r13,    psadframe
+    movd    [r13],  xmm8
 
-  POP_XMM
-  pop r15
-  pop r14
-  pop r13
-  pop r12
-  %assign push_num 0
+    POP_XMM
+    pop r15
+    pop r14
+    pop r13
+    pop r12
+    %assign push_num 0
 
 %undef          cur_data
 %undef          ref_data
@@ -1221,7 +1221,7 @@
 %undef          tmp_sadframe
 %undef          pushsize
 %undef          localsize
-  ret
+    ret
 
 
 
@@ -1249,145 +1249,145 @@
 %define         tmp_edi                         esp + 4
 %define         tmp_ecx                         esp + 8
 %define         pushsize                16
-  push    ebp
-  push    esi
-  push    edi
-  push    ebx
-  sub             esp,    localsize
-  mov             esi,    [cur_data]
-  mov             edi,    [ref_data]
-  mov             ebx,    [iPicStride]
-  mov             eax,    ebx
+    push    ebp
+    push    esi
+    push    edi
+    push    ebx
+    sub             esp,    localsize
+    mov             esi,    [cur_data]
+    mov             edi,    [ref_data]
+    mov             ebx,    [iPicStride]
+    mov             eax,    ebx
 
-  shr             dword [iPicWidth],      4                                       ; iPicWidth/16
-  shr             dword [iPicHeight],     4                                       ; iPicHeight/16
-  shl             eax,    4                                                       ; iPicStride*16
-  xor             ebp,    ebp
-  pxor    xmm0,   xmm0
+    shr             dword [iPicWidth],      4                                       ; iPicWidth/16
+    shr             dword [iPicHeight],     4                                       ; iPicHeight/16
+    shl             eax,    4                                                       ; iPicStride*16
+    xor             ebp,    ebp
+    pxor    xmm0,   xmm0
 bgd_height_loop:
-  mov             ecx,    dword [iPicWidth]
-  mov             [tmp_esi],      esi
-  mov             [tmp_edi],      edi
+    mov             ecx,    dword [iPicWidth]
+    mov             [tmp_esi],      esi
+    mov             [tmp_edi],      edi
 bgd_width_loop:
-  pxor    xmm7,   xmm7            ; pSad8x8
-  pxor    xmm6,   xmm6            ; sum_cur_8x8
-  pxor    xmm5,   xmm5            ; sum_ref_8x8
-  pxor    xmm4,   xmm4            ; pMad8x8
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
+    pxor    xmm7,   xmm7            ; pSad8x8
+    pxor    xmm6,   xmm6            ; sum_cur_8x8
+    pxor    xmm5,   xmm5            ; sum_ref_8x8
+    pxor    xmm4,   xmm4            ; pMad8x8
+    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
+    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
+    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
+    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
+    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
+    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
+    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
+    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
 
 
-  mov                     edx,            [p_mad8x8]
-  WELS_MAX_REG_SSE2       xmm4
+    mov                     edx,            [p_mad8x8]
+    WELS_MAX_REG_SSE2       xmm4
 
-  ;movdqa         xmm1,   xmm4
-  ;punpcklbw      xmm1,   xmm0
-  ;punpcklwd      xmm1,   xmm0
-  ;movd           [edx],  xmm1
-  ;punpckhbw      xmm4,   xmm0
-  ;punpcklwd      xmm4,   xmm0
-  ;movd           [edx+4],        xmm4
-  ;add                    edx,            8
-  ;mov                    [p_mad8x8],     edx
-  mov                     [tmp_ecx],      ecx
-  movhlps         xmm1,   xmm4
-  movd            ecx,    xmm4
-  mov                     [edx],  cl
-  movd            ecx,    xmm1
-  mov                     [edx+1],cl
-  add                     edx,    2
-  mov                     [p_mad8x8],     edx
+    ;movdqa         xmm1,   xmm4
+    ;punpcklbw      xmm1,   xmm0
+    ;punpcklwd      xmm1,   xmm0
+    ;movd           [edx],  xmm1
+    ;punpckhbw      xmm4,   xmm0
+    ;punpcklwd      xmm4,   xmm0
+    ;movd           [edx+4],        xmm4
+    ;add                    edx,            8
+    ;mov                    [p_mad8x8],     edx
+    mov                     [tmp_ecx],      ecx
+    movhlps         xmm1,   xmm4
+    movd            ecx,    xmm4
+    mov                     [edx],  cl
+    movd            ecx,    xmm1
+    mov                     [edx+1],cl
+    add                     edx,    2
+    mov                     [p_mad8x8],     edx
 
 
-  pslldq          xmm7,   4
-  pslldq          xmm6,   4
-  pslldq          xmm5,   4
+    pslldq          xmm7,   4
+    pslldq          xmm6,   4
+    pslldq          xmm5,   4
 
 
-  pxor    xmm4,   xmm4            ; pMad8x8
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
+    pxor    xmm4,   xmm4            ; pMad8x8
+    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
+    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
+    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
+    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
+    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
+    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
+    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
+    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,esi ,edi, ebx
 
-  mov                     edx,            [p_mad8x8]
-  WELS_MAX_REG_SSE2       xmm4
+    mov                     edx,            [p_mad8x8]
+    WELS_MAX_REG_SSE2       xmm4
 
-  ;movdqa         xmm1,   xmm4
-  ;punpcklbw      xmm1,   xmm0
-  ;punpcklwd      xmm1,   xmm0
-  ;movd           [edx],  xmm1
-  ;punpckhbw      xmm4,   xmm0
-  ;punpcklwd      xmm4,   xmm0
-  ;movd           [edx+4],        xmm4
-  ;add                    edx,            8
-  ;mov                    [p_mad8x8],     edx
-  movhlps         xmm1,   xmm4
-  movd            ecx,    xmm4
-  mov                     [edx],  cl
-  movd            ecx,    xmm1
-  mov                     [edx+1],cl
-  add                     edx,    2
-  mov                     [p_mad8x8],     edx
+    ;movdqa         xmm1,   xmm4
+    ;punpcklbw      xmm1,   xmm0
+    ;punpcklwd      xmm1,   xmm0
+    ;movd           [edx],  xmm1
+    ;punpckhbw      xmm4,   xmm0
+    ;punpcklwd      xmm4,   xmm0
+    ;movd           [edx+4],        xmm4
+    ;add                    edx,            8
+    ;mov                    [p_mad8x8],     edx
+    movhlps         xmm1,   xmm4
+    movd            ecx,    xmm4
+    mov                     [edx],  cl
+    movd            ecx,    xmm1
+    mov                     [edx+1],cl
+    add                     edx,    2
+    mov                     [p_mad8x8],     edx
 
-  ; data in xmm7, xmm6, xmm5:  D1 D3 D0 D2
+    ; data in xmm7, xmm6, xmm5:  D1 D3 D0 D2
 
-  mov             edx,    [psad8x8]
-  pshufd  xmm1,   xmm7,   10001101b               ; D3 D2 D1 D0
-  movdqa  [edx],  xmm1
-  add             edx,    16
-  mov             [psad8x8],      edx                                     ; sad8x8
+    mov             edx,    [psad8x8]
+    pshufd  xmm1,   xmm7,   10001101b               ; D3 D2 D1 D0
+    movdqa  [edx],  xmm1
+    add             edx,    16
+    mov             [psad8x8],      edx                                     ; sad8x8
 
-  paddd   xmm1,   xmm7                                    ; D1+3 D3+2 D0+1 D2+0
-  pshufd  xmm2,   xmm1,   00000011b
-  paddd   xmm1,   xmm2
-  movd    edx,    xmm1
-  add             ebp,    edx                                             ; sad frame
+    paddd   xmm1,   xmm7                                    ; D1+3 D3+2 D0+1 D2+0
+    pshufd  xmm2,   xmm1,   00000011b
+    paddd   xmm1,   xmm2
+    movd    edx,    xmm1
+    add             ebp,    edx                                             ; sad frame
 
-  mov             edx,    [p_sd8x8]
-  psubd   xmm6,   xmm5
-  pshufd  xmm1,   xmm6,   10001101b
-  movdqa  [edx],  xmm1
-  add             edx,    16
-  mov             [p_sd8x8],      edx
+    mov             edx,    [p_sd8x8]
+    psubd   xmm6,   xmm5
+    pshufd  xmm1,   xmm6,   10001101b
+    movdqa  [edx],  xmm1
+    add             edx,    16
+    mov             [p_sd8x8],      edx
 
 
-  add             edx,    16
-  sub             esi,    eax
-  sub             edi,    eax
-  add             esi,    16
-  add             edi,    16
+    add             edx,    16
+    sub             esi,    eax
+    sub             edi,    eax
+    add             esi,    16
+    add             edi,    16
 
-  mov             ecx,    [tmp_ecx]
-  dec             ecx
-  jnz             bgd_width_loop
+    mov             ecx,    [tmp_ecx]
+    dec             ecx
+    jnz             bgd_width_loop
 
-  mov             esi,    [tmp_esi]
-  mov             edi,    [tmp_edi]
-  add             esi,    eax
-  add             edi,    eax
+    mov             esi,    [tmp_esi]
+    mov             edi,    [tmp_edi]
+    add             esi,    eax
+    add             edi,    eax
 
-  dec             dword [iPicHeight]
-  jnz             bgd_height_loop
+    dec             dword [iPicHeight]
+    jnz             bgd_height_loop
 
-  mov             edx,    [psadframe]
-  mov             [edx],  ebp
+    mov             edx,    [psadframe]
+    mov             [edx],  ebp
 
-  add             esp,    localsize
-  pop             ebx
-  pop             edi
-  pop             esi
-  pop             ebp
+    add             esp,    localsize
+    pop             ebx
+    pop             edi
+    pop             esi
+    pop             ebp
 %undef          cur_data
 %undef          ref_data
 %undef          iPicWidth
@@ -1401,7 +1401,7 @@
 %undef          tmp_edi
 %undef          pushsize
 %undef          localsize
-  ret
+    ret
 
 
 
@@ -1431,190 +1431,190 @@
 %define         tmp_sadframe            esp + 8
 %define         tmp_ecx                         esp + 12
 %define         pushsize                16
-  push    ebp
-  push    esi
-  push    edi
-  push    ebx
-  sub             esp,    localsize
-  mov             esi,    [cur_data]
-  mov             edi,    [ref_data]
-  mov             ebx,    [iPicStride]
-  mov             eax,    ebx
+    push    ebp
+    push    esi
+    push    edi
+    push    ebx
+    sub             esp,    localsize
+    mov             esi,    [cur_data]
+    mov             edi,    [ref_data]
+    mov             ebx,    [iPicStride]
+    mov             eax,    ebx
 
-  shr             dword [iPicWidth],      4                                       ; iPicWidth/16
-  shr             dword [iPicHeight],     4                                       ; iPicHeight/16
-  shl             eax,    4                                                       ; iPicStride*16
-  pxor    xmm0,   xmm0
-  movd    [tmp_sadframe], xmm0
+    shr             dword [iPicWidth],      4                                       ; iPicWidth/16
+    shr             dword [iPicHeight],     4                                       ; iPicHeight/16
+    shl             eax,    4                                                       ; iPicStride*16
+    pxor    xmm0,   xmm0
+    movd    [tmp_sadframe], xmm0
 sqdiff_bgd_height_loop:
-  mov             ecx,    dword [iPicWidth]
-  mov             [tmp_esi],      esi
-  mov             [tmp_edi],      edi
+    mov             ecx,    dword [iPicWidth]
+    mov             [tmp_esi],      esi
+    mov             [tmp_edi],      edi
 sqdiff_bgd_width_loop:
-  pxor    xmm7,   xmm7            ; pSad8x8 interleaves sqsum16x16:  sqsum1 sad1 sqsum0 sad0
-  pxor    xmm6,   xmm6            ; sum_8x8 interleaves cur and pRef in Dword,  Sref1 Scur1 Sref0 Scur0
-  pxor    xmm5,   xmm5            ; pMad8x8
-  pxor    xmm4,   xmm4            ; sqdiff_16x16  four Dword
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
+    pxor    xmm7,   xmm7            ; pSad8x8 interleaves sqsum16x16:  sqsum1 sad1 sqsum0 sad0
+    pxor    xmm6,   xmm6            ; sum_8x8 interleaves cur and pRef in Dword,  Sref1 Scur1 Sref0 Scur0
+    pxor    xmm5,   xmm5            ; pMad8x8
+    pxor    xmm4,   xmm4            ; sqdiff_16x16  four Dword
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
 
-  mov             edx,            [psad8x8]
-  movdqa  xmm2,           xmm7
-  pshufd  xmm1,           xmm2,           00001110b
-  movd    [edx],          xmm2
-  movd    [edx+4],        xmm1
-  add             edx,            8
-  mov             [psad8x8],      edx                     ; sad8x8
+    mov             edx,            [psad8x8]
+    movdqa  xmm2,           xmm7
+    pshufd  xmm1,           xmm2,           00001110b
+    movd    [edx],          xmm2
+    movd    [edx+4],        xmm1
+    add             edx,            8
+    mov             [psad8x8],      edx                     ; sad8x8
 
-  paddd   xmm1,                           xmm2
-  movd    edx,                            xmm1
-  add             [tmp_sadframe],         edx                     ; iFrameSad
+    paddd   xmm1,                           xmm2
+    movd    edx,                            xmm1
+    add             [tmp_sadframe],         edx                     ; iFrameSad
 
-  mov             edx,            [psum16x16]
-  movdqa  xmm1,           xmm6
-  pshufd  xmm2,           xmm1,           00001110b
-  paddd   xmm1,           xmm2
-  movd    [edx],          xmm1                            ; sum
+    mov             edx,            [psum16x16]
+    movdqa  xmm1,           xmm6
+    pshufd  xmm2,           xmm1,           00001110b
+    paddd   xmm1,           xmm2
+    movd    [edx],          xmm1                            ; sum
 
-  mov             edx,            [p_sd8x8]
-  pshufd  xmm1,           xmm6,           11110101b                       ; Sref1 Sref1 Sref0 Sref0
-  psubd   xmm6,           xmm1            ; 00 diff1 00 diff0
-  pshufd  xmm1,           xmm6,           00001000b                       ;  xx xx diff1 diff0
-  movq    [edx],          xmm1
-  add             edx,            8
-  mov             [p_sd8x8],      edx
+    mov             edx,            [p_sd8x8]
+    pshufd  xmm1,           xmm6,           11110101b                       ; Sref1 Sref1 Sref0 Sref0
+    psubd   xmm6,           xmm1            ; 00 diff1 00 diff0
+    pshufd  xmm1,           xmm6,           00001000b                       ;  xx xx diff1 diff0
+    movq    [edx],          xmm1
+    add             edx,            8
+    mov             [p_sd8x8],      edx
 
-  mov                     edx,            [p_mad8x8]
-  WELS_MAX_REG_SSE2       xmm5
-  ;movdqa         xmm1,   xmm5
-  ;punpcklbw      xmm1,   xmm0
-  ;punpcklwd      xmm1,   xmm0
-  ;movd           [edx],  xmm1
-  ;punpckhbw      xmm5,   xmm0
-  ;punpcklwd      xmm5,   xmm0
-  ;movd           [edx+4],        xmm5
-  ;add                    edx,            8
-  ;mov                    [p_mad8x8],     edx
-  mov                     [tmp_ecx],      ecx
-  movhlps         xmm1,   xmm5
-  movd            ecx,    xmm5
-  mov                     [edx],  cl
-  movd            ecx,    xmm1
-  mov                     [edx+1],cl
-  add                     edx,    2
-  mov                     [p_mad8x8],     edx
+    mov                     edx,            [p_mad8x8]
+    WELS_MAX_REG_SSE2       xmm5
+    ;movdqa         xmm1,   xmm5
+    ;punpcklbw      xmm1,   xmm0
+    ;punpcklwd      xmm1,   xmm0
+    ;movd           [edx],  xmm1
+    ;punpckhbw      xmm5,   xmm0
+    ;punpcklwd      xmm5,   xmm0
+    ;movd           [edx+4],        xmm5
+    ;add                    edx,            8
+    ;mov                    [p_mad8x8],     edx
+    mov                     [tmp_ecx],      ecx
+    movhlps         xmm1,   xmm5
+    movd            ecx,    xmm5
+    mov                     [edx],  cl
+    movd            ecx,    xmm1
+    mov                     [edx+1],cl
+    add                     edx,    2
+    mov                     [p_mad8x8],     edx
 
-  psrlq   xmm7,   32
-  psllq   xmm7,   32                      ; clear sad
-  pxor    xmm6,   xmm6            ; sum_8x8 interleaves cur and pRef in Dword,  Sref1 Scur1 Sref0 Scur0
-  pxor    xmm5,   xmm5            ; pMad8x8
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
+    psrlq   xmm7,   32
+    psllq   xmm7,   32                      ; clear sad
+    pxor    xmm6,   xmm6            ; sum_8x8 interleaves cur and pRef in Dword,  Sref1 Scur1 Sref0 Scur0
+    pxor    xmm5,   xmm5            ; pMad8x8
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, esi , edi , ebx
 
-  mov             edx,            [psad8x8]
-  movdqa  xmm2,           xmm7
-  pshufd  xmm1,           xmm2,           00001110b
-  movd    [edx],          xmm2
-  movd    [edx+4],        xmm1
-  add             edx,            8
-  mov             [psad8x8],      edx                     ; sad8x8
+    mov             edx,            [psad8x8]
+    movdqa  xmm2,           xmm7
+    pshufd  xmm1,           xmm2,           00001110b
+    movd    [edx],          xmm2
+    movd    [edx+4],        xmm1
+    add             edx,            8
+    mov             [psad8x8],      edx                     ; sad8x8
 
-  paddd   xmm1,                           xmm2
-  movd    edx,                            xmm1
-  add             [tmp_sadframe],         edx                     ; iFrameSad
+    paddd   xmm1,                           xmm2
+    movd    edx,                            xmm1
+    add             [tmp_sadframe],         edx                     ; iFrameSad
 
-  mov             edx,                    [psum16x16]
-  movdqa  xmm1,                   xmm6
-  pshufd  xmm2,                   xmm1,           00001110b
-  paddd   xmm1,                   xmm2
-  movd    ebp,                    xmm1                            ; sum
-  add             [edx],                  ebp
-  add             edx,                    4
-  mov             [psum16x16],    edx
+    mov             edx,                    [psum16x16]
+    movdqa  xmm1,                   xmm6
+    pshufd  xmm2,                   xmm1,           00001110b
+    paddd   xmm1,                   xmm2
+    movd    ebp,                    xmm1                            ; sum
+    add             [edx],                  ebp
+    add             edx,                    4
+    mov             [psum16x16],    edx
 
-  mov             edx,                    [psqsum16x16]
-  psrlq   xmm7,                   32
-  pshufd  xmm2,                   xmm7,           00001110b
-  paddd   xmm2,                   xmm7
-  movd    [edx],                  xmm2                            ; sqsum
-  add             edx,                    4
-  mov             [psqsum16x16],  edx
+    mov             edx,                    [psqsum16x16]
+    psrlq   xmm7,                   32
+    pshufd  xmm2,                   xmm7,           00001110b
+    paddd   xmm2,                   xmm7
+    movd    [edx],                  xmm2                            ; sqsum
+    add             edx,                    4
+    mov             [psqsum16x16],  edx
 
-  mov             edx,            [p_sd8x8]
-  pshufd  xmm1,           xmm6,           11110101b                       ; Sref1 Sref1 Sref0 Sref0
-  psubd   xmm6,           xmm1            ; 00 diff1 00 diff0
-  pshufd  xmm1,           xmm6,           00001000b                       ;  xx xx diff1 diff0
-  movq    [edx],          xmm1
-  add             edx,            8
-  mov             [p_sd8x8],      edx
+    mov             edx,            [p_sd8x8]
+    pshufd  xmm1,           xmm6,           11110101b                       ; Sref1 Sref1 Sref0 Sref0
+    psubd   xmm6,           xmm1            ; 00 diff1 00 diff0
+    pshufd  xmm1,           xmm6,           00001000b                       ;  xx xx diff1 diff0
+    movq    [edx],          xmm1
+    add             edx,            8
+    mov             [p_sd8x8],      edx
 
-  mov             edx,            [p_mad8x8]
-  WELS_MAX_REG_SSE2       xmm5
-  ;movdqa         xmm1,   xmm5
-  ;punpcklbw      xmm1,   xmm0
-  ;punpcklwd      xmm1,   xmm0
-  ;movd           [edx],  xmm1
-  ;punpckhbw      xmm5,   xmm0
-  ;punpcklwd      xmm5,   xmm0
-  ;movd           [edx+4],        xmm5
-  ;add                    edx,            8
-  ;mov                    [p_mad8x8],     edx
-  movhlps         xmm1,   xmm5
-  movd            ecx,    xmm5
-  mov                     [edx],  cl
-  movd            ecx,    xmm1
-  mov                     [edx+1],cl
-  add                     edx,    2
-  mov                     [p_mad8x8],     edx
+    mov             edx,            [p_mad8x8]
+    WELS_MAX_REG_SSE2       xmm5
+    ;movdqa         xmm1,   xmm5
+    ;punpcklbw      xmm1,   xmm0
+    ;punpcklwd      xmm1,   xmm0
+    ;movd           [edx],  xmm1
+    ;punpckhbw      xmm5,   xmm0
+    ;punpcklwd      xmm5,   xmm0
+    ;movd           [edx+4],        xmm5
+    ;add                    edx,            8
+    ;mov                    [p_mad8x8],     edx
+    movhlps         xmm1,   xmm5
+    movd            ecx,    xmm5
+    mov                     [edx],  cl
+    movd            ecx,    xmm1
+    mov                     [edx+1],cl
+    add                     edx,    2
+    mov                     [p_mad8x8],     edx
 
-  mov             edx,            [psqdiff16x16]
-  pshufd  xmm1,           xmm4,           00001110b
-  paddd   xmm4,           xmm1
-  pshufd  xmm1,           xmm4,           00000001b
-  paddd   xmm4,           xmm1
-  movd    [edx],          xmm4
-  add             edx,            4
-  mov             [psqdiff16x16], edx
+    mov             edx,            [psqdiff16x16]
+    pshufd  xmm1,           xmm4,           00001110b
+    paddd   xmm4,           xmm1
+    pshufd  xmm1,           xmm4,           00000001b
+    paddd   xmm4,           xmm1
+    movd    [edx],          xmm4
+    add             edx,            4
+    mov             [psqdiff16x16], edx
 
-  add             edx,    16
-  sub             esi,    eax
-  sub             edi,    eax
-  add             esi,    16
-  add             edi,    16
+    add             edx,    16
+    sub             esi,    eax
+    sub             edi,    eax
+    add             esi,    16
+    add             edi,    16
 
-  mov             ecx,    [tmp_ecx]
-  dec             ecx
-  jnz             sqdiff_bgd_width_loop
+    mov             ecx,    [tmp_ecx]
+    dec             ecx
+    jnz             sqdiff_bgd_width_loop
 
-  mov             esi,    [tmp_esi]
-  mov             edi,    [tmp_edi]
-  add             esi,    eax
-  add             edi,    eax
+    mov             esi,    [tmp_esi]
+    mov             edi,    [tmp_edi]
+    add             esi,    eax
+    add             edi,    eax
 
-  dec     dword [iPicHeight]
-  jnz             sqdiff_bgd_height_loop
+    dec     dword [iPicHeight]
+    jnz             sqdiff_bgd_height_loop
 
-  mov             edx,    [psadframe]
-  mov             ebp,    [tmp_sadframe]
-  mov             [edx],  ebp
+    mov             edx,    [psadframe]
+    mov             ebp,    [tmp_sadframe]
+    mov             [edx],  ebp
 
-  add             esp,    localsize
-  pop             ebx
-  pop             edi
-  pop             esi
-  pop             ebp
+    add             esp,    localsize
+    pop             ebx
+    pop             edi
+    pop             esi
+    pop             ebp
 %undef          cur_data
 %undef          ref_data
 %undef          iPicWidth
@@ -1631,7 +1631,7 @@
 %undef          tmp_edi
 %undef          pushsize
 %undef          localsize
-   ret
+    ret
 %else
 
 ;*************************************************************************************************************
@@ -1651,142 +1651,142 @@
 %define         p_sd8x8                         arg8;
 %define         p_mad8x8                        arg9;
 
-  push r12
-  push r13
-  push r14
-  push r15
+    push r12
+    push r13
+    push r14
+    push r15
 %assign push_num 4
-  PUSH_XMM 10
+    PUSH_XMM 10
 %ifdef WIN64
-  mov r4,arg5
-  ;  mov r5,arg6
+    mov r4,arg5
+    ;  mov r5,arg6
 %endif
-  mov r14,arg7
-  SIGN_EXTENSION r2,r2d
-  SIGN_EXTENSION r3,r3d
-  SIGN_EXTENSION r4,r4d
+    mov r14,arg7
+    SIGN_EXTENSION r2,r2d
+    SIGN_EXTENSION r3,r3d
+    SIGN_EXTENSION r4,r4d
 
 
-  mov     r13,r4
-  mov     r15,r0
-  shr     r2,4
-  shr     r3,4
-  shl     r13,4
-  pxor    xmm0,   xmm0
-  pxor    xmm8,   xmm8
-  pxor    xmm9,   xmm9
+    mov     r13,r4
+    mov     r15,r0
+    shr     r2,4
+    shr     r3,4
+    shl     r13,4
+    pxor    xmm0,   xmm0
+    pxor    xmm8,   xmm8
+    pxor    xmm9,   xmm9
 bgd_height_loop:
-  ;mov            ecx,    dword [iPicWidth]
-  push r2
-  %assign push_num push_num+1
-  mov             r10,    r15
-  mov             r11,    r1
+    ;mov            ecx,    dword [iPicWidth]
+    push r2
+    %assign push_num push_num+1
+    mov             r10,    r15
+    mov             r11,    r1
 bgd_width_loop:
-  pxor    xmm7,   xmm7            ; pSad8x8
-  pxor    xmm6,   xmm6            ; sum_cur_8x8
-  pxor    xmm5,   xmm5            ; sum_ref_8x8
-  pxor    xmm4,   xmm4            ; pMad8x8
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
+    pxor    xmm7,   xmm7            ; pSad8x8
+    pxor    xmm6,   xmm6            ; sum_cur_8x8
+    pxor    xmm5,   xmm5            ; sum_ref_8x8
+    pxor    xmm4,   xmm4            ; pMad8x8
+    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
+    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
+    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
+    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
+    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
+    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
+    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
+    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
 
 
-  mov                     r14,            p_mad8x8
-  WELS_MAX_REG_SSE2       xmm4
+    mov                     r14,            p_mad8x8
+    WELS_MAX_REG_SSE2       xmm4
 
-  ;mov                    [tmp_ecx],      ecx
-  movhlps         xmm1,   xmm4
-  movd            r0d,    xmm4
+    ;mov                    [tmp_ecx],      ecx
+    movhlps         xmm1,   xmm4
+    movd            r0d,    xmm4
 
 
-  mov                     [r14],  r0b
-  movd            r0d,    xmm1
-  mov                     [r14+1],r0b
-  add                     r14,    2
-  ;mov                     p_mad8x8,       r14
+    mov                     [r14],  r0b
+    movd            r0d,    xmm1
+    mov                     [r14+1],r0b
+    add                     r14,    2
+    ;mov                     p_mad8x8,       r14
 
 
-  pslldq          xmm7,   4
-  pslldq          xmm6,   4
-  pslldq          xmm5,   4
+    pslldq          xmm7,   4
+    pslldq          xmm6,   4
+    pslldq          xmm5,   4
 
 
-  pxor    xmm4,   xmm4            ; pMad8x8
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
-  WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
+    pxor    xmm4,   xmm4            ; pMad8x8
+    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
+    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
+    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
+    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
+    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
+    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
+    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
+    WELS_SAD_SD_MAD_16x1_SSE2       xmm7,   xmm6,   xmm5,   xmm4 ,r15 ,r1, r4
 
-  ;mov                     r14,            [p_mad8x8]
-  WELS_MAX_REG_SSE2       xmm4
+    ;mov                     r14,            [p_mad8x8]
+    WELS_MAX_REG_SSE2       xmm4
 
-  movhlps         xmm1,   xmm4
-  movd            r0d,    xmm4
-  mov                     [r14],  r0b
-  movd            r0d,    xmm1
-  mov                     [r14+1],r0b
-  add                     r14,    2
-  mov                     p_mad8x8,       r14
+    movhlps         xmm1,   xmm4
+    movd            r0d,    xmm4
+    mov                     [r14],  r0b
+    movd            r0d,    xmm1
+    mov                     [r14+1],r0b
+    add                     r14,    2
+    mov                     p_mad8x8,       r14
 
-  ; data in xmm7, xmm6, xmm5:  D1 D3 D0 D2
+    ; data in xmm7, xmm6, xmm5:  D1 D3 D0 D2
 
-  mov             r14,    psad8x8
-  pshufd  xmm1,   xmm7,   10001101b               ; D3 D2 D1 D0
-  movdqa  [r14],  xmm1
-  add             r14,    16
-  mov             psad8x8,        r14                                     ; sad8x8
+    mov             r14,    psad8x8
+    pshufd  xmm1,   xmm7,   10001101b               ; D3 D2 D1 D0
+    movdqa  [r14],  xmm1
+    add             r14,    16
+    mov             psad8x8,        r14                                     ; sad8x8
 
-  paddd   xmm1,   xmm7                                    ; D1+3 D3+2 D0+1 D2+0
-  pshufd  xmm2,   xmm1,   00000011b
-  paddd   xmm1,   xmm2
-  movd    r14d,   xmm1
-  movd    xmm9, r14d
-  paddd   xmm8,   xmm9                                            ; sad frame
+    paddd   xmm1,   xmm7                                    ; D1+3 D3+2 D0+1 D2+0
+    pshufd  xmm2,   xmm1,   00000011b
+    paddd   xmm1,   xmm2
+    movd    r14d,   xmm1
+    movd    xmm9, r14d
+    paddd   xmm8,   xmm9                                            ; sad frame
 
-  mov             r14,    p_sd8x8
-  psubd   xmm6,   xmm5
-  pshufd  xmm1,   xmm6,   10001101b
-  movdqa  [r14],  xmm1
-  add             r14,    16
-  mov             p_sd8x8,        r14
+    mov             r14,    p_sd8x8
+    psubd   xmm6,   xmm5
+    pshufd  xmm1,   xmm6,   10001101b
+    movdqa  [r14],  xmm1
+    add             r14,    16
+    mov             p_sd8x8,        r14
 
 
-  ;add            edx,    16
-  sub             r15,    r13
-  sub             r1,     r13
-  add             r15,    16
-  add             r1,     16
+    ;add            edx,    16
+    sub             r15,    r13
+    sub             r1,     r13
+    add             r15,    16
+    add             r1,     16
 
 
-  dec             r2
-  jnz             bgd_width_loop
-  pop     r2
+    dec             r2
+    jnz             bgd_width_loop
+    pop     r2
 %assign push_num push_num-1
-  mov             r15,    r10
-  mov             r1,     r11
-  add             r15,    r13
-  add             r1,     r13
+    mov             r15,    r10
+    mov             r1,     r11
+    add             r15,    r13
+    add             r1,     r13
 
-  dec             r3
-  jnz             bgd_height_loop
+    dec             r3
+    jnz             bgd_height_loop
 
-  mov             r13,    psadframe
-  movd    [r13],  xmm8
+    mov             r13,    psadframe
+    movd    [r13],  xmm8
 
-  POP_XMM
-  pop r15
-  pop r14
-  pop r13
-  pop r12
+    POP_XMM
+    pop r15
+    pop r14
+    pop r13
+    pop r12
 %assign push_num 0
 %undef          cur_data
 %undef          ref_data
@@ -1801,7 +1801,7 @@
 %undef          tmp_edi
 %undef          pushsize
 %undef          localsize
-  ret
+    ret
 
 
 
@@ -1826,189 +1826,189 @@
 %define         p_sd8x8                         arg11
 %define         p_mad8x8                        arg12
 
-  push r12
-  push r13
-  push r14
-  push r15
+    push r12
+    push r13
+    push r14
+    push r15
 %assign push_num 4
-  PUSH_XMM 10
+    PUSH_XMM 10
 %ifdef WIN64
-  mov r4,arg5
-  ;mov r5,arg6
+    mov r4,arg5
+    ;mov r5,arg6
 %endif
-  SIGN_EXTENSION r2,r2d
-  SIGN_EXTENSION r3,r3d
-  SIGN_EXTENSION r4,r4d
+    SIGN_EXTENSION r2,r2d
+    SIGN_EXTENSION r3,r3d
+    SIGN_EXTENSION r4,r4d
 
-  mov     r13,r4
-  shr             r2,     4                                       ; iPicWidth/16
-  shr             r3,     4                                       ; iPicHeight/16
-  shl             r13,    4                                                       ; iPicStride*16
-  pxor    xmm0,   xmm0
-  pxor    xmm8,   xmm8
-  pxor    xmm9,   xmm9
+    mov     r13,r4
+    shr             r2,     4                                       ; iPicWidth/16
+    shr             r3,     4                                       ; iPicHeight/16
+    shl             r13,    4                                                       ; iPicStride*16
+    pxor    xmm0,   xmm0
+    pxor    xmm8,   xmm8
+    pxor    xmm9,   xmm9
 
 
 sqdiff_bgd_height_loop:
-  mov             r10,    r0
-  mov             r11,    r1
-  push r2
+    mov             r10,    r0
+    mov             r11,    r1
+    push r2
 %assign push_num push_num+1
 sqdiff_bgd_width_loop:
 
-  pxor    xmm7,   xmm7            ; pSad8x8 interleaves sqsum16x16:  sqsum1 sad1 sqsum0 sad0
-  pxor    xmm6,   xmm6            ; sum_8x8 interleaves cur and pRef in Dword,  Sref1 Scur1 Sref0 Scur0
-  pxor    xmm5,   xmm5            ; pMad8x8
-  pxor    xmm4,   xmm4            ; sqdiff_16x16  four Dword
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
+    pxor    xmm7,   xmm7            ; pSad8x8 interleaves sqsum16x16:  sqsum1 sad1 sqsum0 sad0
+    pxor    xmm6,   xmm6            ; sum_8x8 interleaves cur and pRef in Dword,  Sref1 Scur1 Sref0 Scur0
+    pxor    xmm5,   xmm5            ; pMad8x8
+    pxor    xmm4,   xmm4            ; sqdiff_16x16  four Dword
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
 
-  mov             r14,            psad8x8
-  movdqa  xmm2,           xmm7
-  pshufd  xmm1,           xmm2,           00001110b
-  movd    [r14],          xmm2
-  movd    [r14+4],        xmm1
-  add             r14,            8
-  mov             psad8x8,        r14                     ; sad8x8
+    mov             r14,            psad8x8
+    movdqa  xmm2,           xmm7
+    pshufd  xmm1,           xmm2,           00001110b
+    movd    [r14],          xmm2
+    movd    [r14+4],        xmm1
+    add             r14,            8
+    mov             psad8x8,        r14                     ; sad8x8
 
-  paddd   xmm1,                           xmm2
-  movd    r14d,                           xmm1
-  movd    xmm9,r14d
-  paddd           xmm8,           xmm9                    ; iFrameSad
+    paddd   xmm1,                           xmm2
+    movd    r14d,                           xmm1
+    movd    xmm9,r14d
+    paddd           xmm8,           xmm9                    ; iFrameSad
 
-  mov             r14,            psum16x16
-  movdqa  xmm1,           xmm6
-  pshufd  xmm2,           xmm1,           00001110b
-  paddd   xmm1,           xmm2
-  movd    [r14],          xmm1                            ; sum
+    mov             r14,            psum16x16
+    movdqa  xmm1,           xmm6
+    pshufd  xmm2,           xmm1,           00001110b
+    paddd   xmm1,           xmm2
+    movd    [r14],          xmm1                            ; sum
 
-  mov             r14,            p_sd8x8
-  pshufd  xmm1,           xmm6,           11110101b                       ; Sref1 Sref1 Sref0 Sref0
-  psubd   xmm6,           xmm1            ; 00 diff1 00 diff0
-  pshufd  xmm1,           xmm6,           00001000b                       ;  xx xx diff1 diff0
-  movq    [r14],          xmm1
-  add             r14,            8
-  mov             p_sd8x8,        r14
+    mov             r14,            p_sd8x8
+    pshufd  xmm1,           xmm6,           11110101b                       ; Sref1 Sref1 Sref0 Sref0
+    psubd   xmm6,           xmm1            ; 00 diff1 00 diff0
+    pshufd  xmm1,           xmm6,           00001000b                       ;  xx xx diff1 diff0
+    movq    [r14],          xmm1
+    add             r14,            8
+    mov             p_sd8x8,        r14
 
-  mov                     r14,            p_mad8x8
-  WELS_MAX_REG_SSE2       xmm5
+    mov                     r14,            p_mad8x8
+    WELS_MAX_REG_SSE2       xmm5
 
-  movhlps         xmm1,   xmm5
-  push r0
-  movd            r0d,    xmm5
-  mov                     [r14],  r0b
-  movd            r0d,    xmm1
-  mov                     [r14+1],r0b
-  pop r0
-  add                     r14,    2
-  mov                     p_mad8x8,       r14
+    movhlps         xmm1,   xmm5
+    push r0
+    movd            r0d,    xmm5
+    mov                     [r14],  r0b
+    movd            r0d,    xmm1
+    mov                     [r14+1],r0b
+    pop r0
+    add                     r14,    2
+    mov                     p_mad8x8,       r14
 
-  psrlq   xmm7,   32
-  psllq   xmm7,   32                      ; clear sad
-  pxor    xmm6,   xmm6            ; sum_8x8 interleaves cur and pRef in Dword,  Sref1 Scur1 Sref0 Scur0
-  pxor    xmm5,   xmm5            ; pMad8x8
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
-  WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
+    psrlq   xmm7,   32
+    psllq   xmm7,   32                      ; clear sad
+    pxor    xmm6,   xmm6            ; sum_8x8 interleaves cur and pRef in Dword,  Sref1 Scur1 Sref0 Scur0
+    pxor    xmm5,   xmm5            ; pMad8x8
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
+    WELS_SAD_BGD_SQDIFF_16x1_SSE2   xmm7,   xmm6,   xmm5,   xmm4, r0 , r1 , r4
 
-  mov             r14,            psad8x8
-  movdqa  xmm2,           xmm7
-  pshufd  xmm1,           xmm2,           00001110b
-  movd    [r14],          xmm2
-  movd    [r14+4],        xmm1
-  add             r14,            8
-  mov             psad8x8,        r14                     ; sad8x8
+    mov             r14,            psad8x8
+    movdqa  xmm2,           xmm7
+    pshufd  xmm1,           xmm2,           00001110b
+    movd    [r14],          xmm2
+    movd    [r14+4],        xmm1
+    add             r14,            8
+    mov             psad8x8,        r14                     ; sad8x8
 
-  paddd   xmm1,                           xmm2
-  movd    r14d,                           xmm1
-  movd    xmm9, r14d
-  paddd   xmm8,           xmm9            ; iFrameSad
+    paddd   xmm1,                           xmm2
+    movd    r14d,                           xmm1
+    movd    xmm9, r14d
+    paddd   xmm8,           xmm9            ; iFrameSad
 
-  mov             r14,                    psum16x16
-  movdqa  xmm1,                   xmm6
-  pshufd  xmm2,                   xmm1,           00001110b
-  paddd   xmm1,                   xmm2
-  movd    r15d,                   xmm1                            ; sum
-  add             [r14],                  r15d
-  add             r14,                    4
-  mov             psum16x16,      r14
+    mov             r14,                    psum16x16
+    movdqa  xmm1,                   xmm6
+    pshufd  xmm2,                   xmm1,           00001110b
+    paddd   xmm1,                   xmm2
+    movd    r15d,                   xmm1                            ; sum
+    add             [r14],                  r15d
+    add             r14,                    4
+    mov             psum16x16,      r14
 
-  mov             r14,                    psqsum16x16
-  psrlq   xmm7,                   32
-  pshufd  xmm2,                   xmm7,           00001110b
-  paddd   xmm2,                   xmm7
-  movd    [r14],                  xmm2                            ; sqsum
-  add             r14,                    4
-  mov             psqsum16x16,    r14
+    mov             r14,                    psqsum16x16
+    psrlq   xmm7,                   32
+    pshufd  xmm2,                   xmm7,           00001110b
+    paddd   xmm2,                   xmm7
+    movd    [r14],                  xmm2                            ; sqsum
+    add             r14,                    4
+    mov             psqsum16x16,    r14
 
-  mov             r14,            p_sd8x8
-  pshufd  xmm1,           xmm6,           11110101b                       ; Sref1 Sref1 Sref0 Sref0
-  psubd   xmm6,           xmm1            ; 00 diff1 00 diff0
-  pshufd  xmm1,           xmm6,           00001000b                       ;  xx xx diff1 diff0
-  movq    [r14],          xmm1
-  add             r14,            8
-  mov             p_sd8x8,        r14
+    mov             r14,            p_sd8x8
+    pshufd  xmm1,           xmm6,           11110101b                       ; Sref1 Sref1 Sref0 Sref0
+    psubd   xmm6,           xmm1            ; 00 diff1 00 diff0
+    pshufd  xmm1,           xmm6,           00001000b                       ;  xx xx diff1 diff0
+    movq    [r14],          xmm1
+    add             r14,            8
+    mov             p_sd8x8,        r14
 
-  mov             r14,            p_mad8x8
-  WELS_MAX_REG_SSE2       xmm5
+    mov             r14,            p_mad8x8
+    WELS_MAX_REG_SSE2       xmm5
 
 
-  movhlps         xmm1,   xmm5
-  push r0
-  movd            r0d,    xmm5
-  mov                     [r14],  r0b
-  movd            r0d,    xmm1
-  mov                     [r14+1],r0b
-  pop r0
-  add                     r14,    2
-  mov                     p_mad8x8,       r14
+    movhlps         xmm1,   xmm5
+    push r0
+    movd            r0d,    xmm5
+    mov                     [r14],  r0b
+    movd            r0d,    xmm1
+    mov                     [r14+1],r0b
+    pop r0
+    add                     r14,    2
+    mov                     p_mad8x8,       r14
 
-  mov             r14,            psqdiff16x16
-  pshufd  xmm1,           xmm4,           00001110b
-  paddd   xmm4,           xmm1
-  pshufd  xmm1,           xmm4,           00000001b
-  paddd   xmm4,           xmm1
-  movd    [r14],          xmm4
-  add             r14,            4
-  mov             psqdiff16x16,   r14
+    mov             r14,            psqdiff16x16
+    pshufd  xmm1,           xmm4,           00001110b
+    paddd   xmm4,           xmm1
+    pshufd  xmm1,           xmm4,           00000001b
+    paddd   xmm4,           xmm1
+    movd    [r14],          xmm4
+    add             r14,            4
+    mov             psqdiff16x16,   r14
 
-  add             r14,    16
-  sub             r0,     r13
-  sub             r1,     r13
-  add             r0,     16
-  add             r1,     16
+    add             r14,    16
+    sub             r0,     r13
+    sub             r1,     r13
+    add             r0,     16
+    add             r1,     16
 
-  dec             r2
-  jnz             sqdiff_bgd_width_loop
-  pop r2
-  %assign push_num push_num-1
-  mov             r0,     r10
-  mov             r1,     r11
-  add             r0,     r13
-  add             r1,     r13
+    dec             r2
+    jnz             sqdiff_bgd_width_loop
+    pop r2
+    %assign push_num push_num-1
+    mov             r0,     r10
+    mov             r1,     r11
+    add             r0,     r13
+    add             r1,     r13
 
-  dec     r3
-  jnz             sqdiff_bgd_height_loop
+    dec     r3
+    jnz             sqdiff_bgd_height_loop
 
-  mov             r14,    psadframe
-  movd    [r14],  xmm8
+    mov             r14,    psadframe
+    movd    [r14],  xmm8
 
-  POP_XMM
-  pop r15
-  pop r14
-  pop r13
-  pop r12
+    POP_XMM
+    pop r15
+    pop r14
+    pop r13
+    pop r12
 %assign push_num 0
 %undef          cur_data
 %undef          ref_data
@@ -2026,5 +2026,5 @@
 %undef          tmp_edi
 %undef          pushsize
 %undef          localsize
-  ret
+    ret
 %endif
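
Note (not part of the patch): the hunks above only re-indent the SSE2 background-detection kernels; they do not change behaviour. For orientation, the following is a rough scalar sketch of what those loops appear to accumulate per 16x16 macroblock, inferred from the operand comments in the assembly (pSad8x8, p_sd8x8, p_mad8x8, psum16x16, psqsum16x16, psqdiff16x16, psadframe). The function name, parameter names, and output layout below are assumptions for illustration only, not the project's reference implementation.

    #include <stdint.h>
    #include <stdlib.h>

    /* Scalar sketch: per 16x16 macroblock, accumulate per-8x8 SAD, signed sum
     * difference (SD), and max absolute difference (MAD), plus whole-block
     * sum, squared sum, and squared difference, and a running frame SAD.
     * The caller is assumed to zero *sadframe once per frame. */
    static void vaa_bgd_sqdiff_16x16_sketch(const uint8_t *cur, const uint8_t *ref,
                                            int stride,
                                            int32_t sad8x8[4], int32_t sd8x8[4],
                                            uint8_t mad8x8[4],
                                            int32_t *sum16x16, int32_t *sqsum16x16,
                                            int32_t *sqdiff16x16, int32_t *sadframe)
    {
        *sum16x16 = *sqsum16x16 = *sqdiff16x16 = 0;
        for (int b = 0; b < 4; b++) {                  /* four 8x8 sub-blocks    */
            const int y0 = (b >> 1) * 8, x0 = (b & 1) * 8;
            int32_t sad = 0, scur = 0, sref = 0;
            int mad = 0;
            for (int y = 0; y < 8; y++) {
                const uint8_t *c = cur + (y0 + y) * stride + x0;
                const uint8_t *r = ref + (y0 + y) * stride + x0;
                for (int x = 0; x < 8; x++) {
                    const int d  = c[x] - r[x];
                    const int ad = abs(d);
                    sad  += ad;                        /* pSad8x8                */
                    if (ad > mad) mad = ad;            /* pMad8x8                */
                    scur += c[x];
                    sref += r[x];
                    *sqsum16x16  += c[x] * c[x];       /* psqsum16x16            */
                    *sqdiff16x16 += d * d;             /* psqdiff16x16           */
                }
            }
            sad8x8[b] = sad;
            sd8x8[b]  = scur - sref;                   /* p_sd8x8 = Scur - Sref  */
            mad8x8[b] = (uint8_t)mad;
            *sum16x16 += scur;                         /* psum16x16              */
            *sadframe += sad;                          /* psadframe (running)    */
        }
    }

The SIMD code reaches the same totals by processing one 16-pixel row per macro invocation and keeping the left/right 8x8 partial sums in separate lanes, which is why the stores above shuffle and split registers before writing the per-8x8 outputs.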