shithub: openh264

Download patch

ref: 37e1c293e51b40f24f0a07c21cdfb9bf8d2ae451
parent: 7313ecdbd058561b0188785cdc8786cbd63207ce
parent: 31a93de513674bb1a166490ec8d824cadc0c4a97
author: huili2 <[email protected]>
date: Wed Mar 19 07:11:22 EDT 2014

Merge pull request #540 from licaiguo/reorgcommon-pr

reorganize common to inc/src/x86/arm

--- a/Makefile
+++ b/Makefile
@@ -57,9 +57,7 @@
 endif
 
 
-INCLUDES = -Icodec/api/svc -Icodec/common
-#ASM_INCLUDES = -Iprocessing/src/asm/
-ASM_INCLUDES = -Icodec/common/
+INCLUDES = -Icodec/api/svc -Icodec/common/inc
 
 DECODER_INCLUDES = \
     -Icodec/decoder/core/inc \
@@ -81,7 +79,7 @@
 CODEC_UNITTEST_INCLUDES += \
     -Igtest/include \
     -Icodec/processing/interface \
-    -Icodec/common \
+    -Icodec/common/inc \
     -Icodec/encoder/core/inc
 
 H264DEC_INCLUDES = $(DECODER_INCLUDES) -Icodec/console/dec/inc
--- a/build/platform-arch.mk
+++ b/build/platform-arch.mk
@@ -4,6 +4,7 @@
 ifneq ($(filter-out arm64, $(filter arm%, $(ARCH))),)
 ifeq ($(USE_ASM), Yes)
 ASM_ARCH = arm
+ASMFLAGS += -Icodec/common/arm/
 CFLAGS += -DHAVE_NEON
 endif
 endif
--- a/build/platform-x86-common.mk
+++ b/build/platform-x86-common.mk
@@ -1,5 +1,6 @@
 CFLAGS_M32=-m32
 CFLAGS_M64=-m64
+ASM_INCLUDES = -Icodec/common/x86/
 ifeq (, $(ENABLE64BIT))
 ifeq ($(ARCH), x86_64)
 ENABLE64BIT=Yes
--- a/codec/build/android/dec/jni/welsdecdemo.mk
+++ b/codec/build/android/dec/jni/welsdecdemo.mk
@@ -24,7 +24,7 @@
             $(CONSOLE_DEC_PATH)/src/h264dec.cpp \
             $(CONSOLE_DEC_PATH)/src/read_config.cpp \
             $(CONSOLE_DEC_PATH)/src/d3d9_utils.cpp \
-            $(CODEC_PATH)/common/logging.cpp \
+            $(CODEC_PATH)/common/src/logging.cpp \
             myjni.cpp
 #
 # Header Includes
@@ -32,7 +32,7 @@
 LOCAL_C_INCLUDES := \
             $(LOCAL_PATH)/../../../../api/svc \
             $(LOCAL_PATH)/../../../../console/dec/inc \
-            $(LOCAL_PATH)/../../../../common
+            $(LOCAL_PATH)/../../../../common/inc
 #
 # Compile Flags and Link Libraries
 #
--- a/codec/build/android/enc/jni/welsencdemo.mk
+++ b/codec/build/android/enc/jni/welsencdemo.mk
@@ -23,7 +23,7 @@
 LOCAL_SRC_FILES := \
             $(CONSOLE_ENC_PATH)/src/welsenc.cpp \
             $(CONSOLE_ENC_PATH)/src/read_config.cpp \
-            $(CODEC_PATH)/common/logging.cpp \
+            $(CODEC_PATH)/common/src/logging.cpp \
             myjni.cpp
 
 #
@@ -34,7 +34,7 @@
             $(LOCAL_PATH)/../../../../console/enc/inc \
             $(LOCAL_PATH)/../../../../encoder/core/inc \
             $(LOCAL_PATH)/../../../../processing/interface \
-            $(LOCAL_PATH)/../../../../common
+            $(LOCAL_PATH)/../../../../common/inc
 
   
 #
--- a/codec/build/win32/dec/WelsDecCore.vcproj
+++ b/codec/build/win32/dec/WelsDecCore.vcproj
@@ -53,7 +53,7 @@
 				Name="VCCLCompilerTool"
 				Optimization="2"
 				InlineFunctionExpansion="1"
-				AdditionalIncludeDirectories="..\..\..\decoder\core\inc;..\..\..\common;..\..\..\api\svc;..\..\..\hwDecoder\core\inc;..\..\..\hwDecoder\dxva\inc"
+				AdditionalIncludeDirectories="..\..\..\decoder\core\inc;..\..\..\common\inc;..\..\..\api\svc;..\..\..\hwDecoder\core\inc;..\..\..\hwDecoder\dxva\inc"
 				PreprocessorDefinitions="WIN32;NDEBUG;_LIB;X86_ASM;HAVE_CACHE_LINE_ALIGN"
 				StringPooling="true"
 				RuntimeLibrary="2"
@@ -133,7 +133,7 @@
 				Name="VCCLCompilerTool"
 				Optimization="2"
 				InlineFunctionExpansion="1"
-				AdditionalIncludeDirectories="..\..\..\decoder\core\inc;..\..\..\common;..\..\..\api\svc;..\..\..\hwDecoder\core\inc;..\..\..\hwDecoder\dxva\inc"
+				AdditionalIncludeDirectories="..\..\..\decoder\core\inc;..\..\..\common\inc;..\..\..\api\svc;..\..\..\hwDecoder\core\inc;..\..\..\hwDecoder\dxva\inc"
 				PreprocessorDefinitions="WIN64;NDEBUG;_LIB;HAVE_CACHE_LINE_ALIGN;X86_ASM"
 				StringPooling="true"
 				RuntimeLibrary="2"
@@ -211,7 +211,7 @@
 			<Tool
 				Name="VCCLCompilerTool"
 				Optimization="0"
-				AdditionalIncludeDirectories="..\..\..\decoder\core\inc;..\..\..\common;..\..\..\api\svc;..\..\..\hwDecoder\core\inc;..\..\..\hwDecoder\dxva\inc"
+				AdditionalIncludeDirectories="..\..\..\decoder\core\inc;..\..\..\common\inc;..\..\..\api\svc;..\..\..\hwDecoder\core\inc;..\..\..\hwDecoder\dxva\inc"
 				PreprocessorDefinitions="WIN32;_DEBUG;_LIB;X86_ASM;HAVE_CACHE_LINE_ALIGN"
 				MinimalRebuild="true"
 				BasicRuntimeChecks="3"
@@ -290,7 +290,7 @@
 			<Tool
 				Name="VCCLCompilerTool"
 				Optimization="0"
-				AdditionalIncludeDirectories="..\..\..\decoder\core\inc;..\..\..\common;..\..\..\api\svc;..\..\..\hwDecoder\core\inc;..\..\..\hwDecoder\dxva\inc"
+				AdditionalIncludeDirectories="..\..\..\decoder\core\inc;..\..\..\common\inc;..\..\..\api\svc;..\..\..\hwDecoder\core\inc;..\..\..\hwDecoder\dxva\inc"
 				PreprocessorDefinitions="WIN64;_DEBUG;_LIB;HAVE_CACHE_LINE_ALIGN;X86_ASM"
 				MinimalRebuild="true"
 				BasicRuntimeChecks="3"
@@ -356,7 +356,7 @@
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
@@ -365,7 +365,7 @@
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
@@ -374,7 +374,7 @@
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
@@ -383,13 +383,13 @@
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
 				</File>
 				<File
-					RelativePath="..\..\..\common\cpuid.asm"
+					RelativePath="..\..\..\common\x86\cpuid.asm"
 					>
 					<FileConfiguration
 						Name="Release|Win32"
@@ -396,7 +396,7 @@
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
@@ -405,7 +405,7 @@
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
@@ -414,7 +414,7 @@
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
@@ -423,7 +423,7 @@
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
@@ -436,7 +436,7 @@
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
@@ -445,7 +445,7 @@
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
@@ -454,7 +454,7 @@
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
@@ -463,13 +463,13 @@
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
 				</File>
 				<File
-					RelativePath="..\..\..\common\deblock.asm"
+					RelativePath="..\..\..\common\x86\deblock.asm"
 					>
 					<FileConfiguration
 						Name="Release|Win32"
@@ -476,7 +476,7 @@
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
@@ -485,7 +485,7 @@
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
@@ -494,7 +494,7 @@
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
@@ -503,13 +503,13 @@
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
 				</File>
 				<File
-					RelativePath="..\..\..\common\expand_picture.asm"
+					RelativePath="..\..\..\common\x86\expand_picture.asm"
 					>
 					<FileConfiguration
 						Name="Release|Win32"
@@ -516,7 +516,7 @@
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
@@ -525,7 +525,7 @@
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
@@ -534,7 +534,7 @@
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
@@ -543,7 +543,7 @@
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
@@ -556,7 +556,7 @@
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
@@ -565,7 +565,7 @@
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
@@ -574,7 +574,7 @@
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
@@ -583,13 +583,13 @@
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
 				</File>
 				<File
-					RelativePath="..\..\..\common\mb_copy.asm"
+					RelativePath="..\..\..\common\x86\mb_copy.asm"
 					>
 					<FileConfiguration
 						Name="Release|Win32"
@@ -596,7 +596,7 @@
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
@@ -605,7 +605,7 @@
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
@@ -614,7 +614,7 @@
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
@@ -623,13 +623,13 @@
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
 				</File>
 				<File
-					RelativePath="..\..\..\common\mc_chroma.asm"
+					RelativePath="..\..\..\common\x86\mc_chroma.asm"
 					>
 					<FileConfiguration
 						Name="Release|Win32"
@@ -636,7 +636,7 @@
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
@@ -645,7 +645,7 @@
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
@@ -654,7 +654,7 @@
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
@@ -663,13 +663,13 @@
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
 				</File>
 				<File
-					RelativePath="..\..\..\common\mc_luma.asm"
+					RelativePath="..\..\..\common\x86\mc_luma.asm"
 					>
 					<FileConfiguration
 						Name="Release|Win32"
@@ -676,7 +676,7 @@
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
@@ -685,7 +685,7 @@
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
@@ -694,7 +694,7 @@
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
@@ -703,7 +703,7 @@
 						>
 						<Tool
 							Name="VCCustomBuildTool"
-							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+							CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 							Outputs="$(IntDir)\$(InputName).obj"
 						/>
 					</FileConfiguration>
@@ -726,11 +726,11 @@
 					>
 				</File>
 				<File
-					RelativePath="..\..\..\common\cpu.h"
+					RelativePath="..\..\..\common\inc\cpu.h"
 					>
 				</File>
 				<File
-					RelativePath="..\..\..\common\cpu_core.h"
+					RelativePath="..\..\..\common\inc\cpu_core.h"
 					>
 				</File>
 				<File
@@ -738,7 +738,7 @@
 					>
 				</File>
 				<File
-					RelativePath="..\..\..\common\deblocking_common.h"
+					RelativePath="..\..\..\common\inc\deblocking_common.h"
 					>
 				</File>
 				<File
@@ -778,7 +778,7 @@
 					>
 				</File>
 				<File
-					RelativePath="..\..\..\common\expand_picture_common.h"
+					RelativePath="..\..\..\common\inc\expand_picture_common.h"
 					>
 				</File>
 				<File
@@ -790,11 +790,11 @@
 					>
 				</File>
 				<File
-					RelativePath="..\..\..\common\ls_defines.h"
+					RelativePath="..\..\..\common\inc\ls_defines.h"
 					>
 				</File>
 				<File
-					RelativePath="..\..\..\common\macros.h"
+					RelativePath="..\..\..\common\inc\macros.h"
 					>
 				</File>
 				<File
@@ -810,11 +810,11 @@
 					>
 				</File>
 				<File
-					RelativePath="..\..\..\common\mc_common.h"
+					RelativePath="..\..\..\common\inc\mc_common.h"
 					>
 				</File>
 				<File
-					RelativePath="..\..\..\common\measure_time.h"
+					RelativePath="..\..\..\common\inc\measure_time.h"
 					>
 				</File>
 				<File
@@ -862,7 +862,7 @@
 					>
 				</File>
 				<File
-					RelativePath="..\..\..\common\typedefs.h"
+					RelativePath="..\..\..\common\inc\typedefs.h"
 					>
 				</File>
 				<File
@@ -895,11 +895,11 @@
 					>
 				</File>
 				<File
-					RelativePath="..\..\..\common\cpu.cpp"
+					RelativePath="..\..\..\common\src\cpu.cpp"
 					>
 				</File>
 				<File
-					RelativePath="..\..\..\common\crt_util_safe_x.cpp"
+					RelativePath="..\..\..\common\src\crt_util_safe_x.cpp"
 					>
 				</File>
 				<File
@@ -907,7 +907,7 @@
 					>
 				</File>
 				<File
-					RelativePath="..\..\..\common\deblocking_common.cpp"
+					RelativePath="..\..\..\common\src\deblocking_common.cpp"
 					>
 				</File>
 				<File
@@ -943,7 +943,7 @@
 					>
 				</File>
 				<File
-					RelativePath="..\..\..\common\logging.cpp"
+					RelativePath="..\..\..\common\src\logging.cpp"
 					>
 				</File>
 				<File
--- a/codec/build/win32/dec/WelsDecPlus.vcproj
+++ b/codec/build/win32/dec/WelsDecPlus.vcproj
@@ -53,7 +53,7 @@
 				Name="VCCLCompilerTool"
 				Optimization="2"
 				InlineFunctionExpansion="1"
-				AdditionalIncludeDirectories="..\..\..\decoder\plus\inc;..\..\..\decoder\core\inc;..\..\..\api\svc;..\..\..\common;..\..\..\hwDecoder\plus\inc;..\..\..\hwDecoder\core\inc"
+				AdditionalIncludeDirectories="..\..\..\decoder\plus\inc;..\..\..\decoder\core\inc;..\..\..\api\svc;..\..\..\common\inc;..\..\..\hwDecoder\plus\inc;..\..\..\hwDecoder\core\inc"
 				PreprocessorDefinitions="WIN32;NDEBUG;_WINDOWS;_USRDLL;WELSDECPLUS_EXPORTS;HAVE_CACHE_LINE_ALIGN;X86_ASM"
 				StringPooling="true"
 				RuntimeLibrary="2"
@@ -153,7 +153,7 @@
 				Name="VCCLCompilerTool"
 				Optimization="2"
 				InlineFunctionExpansion="1"
-				AdditionalIncludeDirectories="..\..\..\decoder\plus\inc;..\..\..\decoder\core\inc;..\..\..\api\svc;..\..\..\common;..\..\..\hwDecoder\plus\inc;..\..\..\hwDecoder\core\inc"
+				AdditionalIncludeDirectories="..\..\..\decoder\plus\inc;..\..\..\decoder\core\inc;..\..\..\api\svc;..\..\..\common\inc;..\..\..\hwDecoder\plus\inc;..\..\..\hwDecoder\core\inc"
 				PreprocessorDefinitions="WIN64;NDEBUG;_WINDOWS;_USRDLL;WELSDECPLUS_EXPORTS;HAVE_CACHE_LINE_ALIGN;X86_ASM"
 				StringPooling="true"
 				RuntimeLibrary="2"
@@ -252,7 +252,7 @@
 			<Tool
 				Name="VCCLCompilerTool"
 				Optimization="0"
-				AdditionalIncludeDirectories="..\..\..\decoder\plus\inc;..\..\..\decoder\core\inc;..\..\..\api\svc;..\..\..\common;..\..\..\hwDecoder\plus\inc;..\..\..\hwDecoder\core\inc"
+				AdditionalIncludeDirectories="..\..\..\decoder\plus\inc;..\..\..\decoder\core\inc;..\..\..\api\svc;..\..\..\common\inc;..\..\..\hwDecoder\plus\inc;..\..\..\hwDecoder\core\inc"
 				PreprocessorDefinitions="WIN32;_DEBUG;_WINDOWS;_USRDLL;WELSDECPLUS_EXPORTS;HAVE_CACHE_LINE_ALIGN;X86_ASM"
 				MinimalRebuild="true"
 				BasicRuntimeChecks="3"
@@ -349,7 +349,7 @@
 			<Tool
 				Name="VCCLCompilerTool"
 				Optimization="0"
-				AdditionalIncludeDirectories="..\..\..\decoder\plus\inc;..\..\..\decoder\core\inc;..\..\..\api\svc;..\..\..\common;..\..\..\hwDecoder\plus\inc;..\..\..\hwDecoder\core\inc"
+				AdditionalIncludeDirectories="..\..\..\decoder\plus\inc;..\..\..\decoder\core\inc;..\..\..\api\svc;..\..\..\common\inc;..\..\..\hwDecoder\plus\inc;..\..\..\hwDecoder\core\inc"
 				PreprocessorDefinitions="WIN64;_DEBUG;_WINDOWS;_USRDLL;WELSDECPLUS_EXPORTS;HAVE_CACHE_LINE_ALIGN;X86_ASM"
 				MinimalRebuild="true"
 				BasicRuntimeChecks="3"
--- a/codec/build/win32/dec/decConsole.vcproj
+++ b/codec/build/win32/dec/decConsole.vcproj
@@ -49,7 +49,7 @@
 				Name="VCCLCompilerTool"
 				Optimization="2"
 				InlineFunctionExpansion="1"
-				AdditionalIncludeDirectories="..\..\..\console\dec\inc,..\..\..\api\svc,..\..\..\common,..\..\..\encoder\core\inc"
+				AdditionalIncludeDirectories="..\..\..\console\dec\inc,..\..\..\api\svc,..\..\..\common\inc,..\..\..\encoder\core\inc"
 				PreprocessorDefinitions="WIN32;NDEBUG;_CONSOLE"
 				StringPooling="true"
 				RuntimeLibrary="2"
@@ -142,7 +142,7 @@
 				Name="VCCLCompilerTool"
 				Optimization="2"
 				InlineFunctionExpansion="1"
-				AdditionalIncludeDirectories="..\..\..\console\dec\inc,..\..\..\api\svc,..\..\..\common,..\..\..\encoder\core\inc"
+				AdditionalIncludeDirectories="..\..\..\console\dec\inc,..\..\..\api\svc,..\..\..\common\inc,..\..\..\encoder\core\inc"
 				PreprocessorDefinitions="WIN64;NDEBUG;_CONSOLE"
 				StringPooling="true"
 				RuntimeLibrary="2"
@@ -233,7 +233,7 @@
 			<Tool
 				Name="VCCLCompilerTool"
 				Optimization="0"
-				AdditionalIncludeDirectories="..\..\..\console\dec\inc,..\..\..\api\svc,..\..\..\common,..\..\..\decoder\core\inc,..\..\..\encoder\core\inc"
+				AdditionalIncludeDirectories="..\..\..\console\dec\inc,..\..\..\api\svc,..\..\..\common\inc,..\..\..\decoder\core\inc,..\..\..\encoder\core\inc"
 				PreprocessorDefinitions="WIN32;_DEBUG;_CONSOLE"
 				MinimalRebuild="true"
 				BasicRuntimeChecks="3"
@@ -327,7 +327,7 @@
 			<Tool
 				Name="VCCLCompilerTool"
 				Optimization="0"
-				AdditionalIncludeDirectories="..\..\..\console\dec\inc,..\..\..\api\svc,..\..\..\common,..\..\..\decoder\core\inc,..\..\..\encoder\core\inc"
+				AdditionalIncludeDirectories="..\..\..\console\dec\inc,..\..\..\api\svc,..\..\..\common\inc,..\..\..\decoder\core\inc,..\..\..\encoder\core\inc"
 				PreprocessorDefinitions="WIN64;_DEBUG;_CONSOLE"
 				MinimalRebuild="true"
 				BasicRuntimeChecks="3"
@@ -443,7 +443,7 @@
 				</FileConfiguration>
 			</File>
 			<File
-				RelativePath="..\..\..\common\logging.cpp"
+				RelativePath="..\..\..\common\src\logging.cpp"
 				>
 			</File>
 			<File
--- a/codec/build/win32/enc/WelsEncCore.vcproj
+++ b/codec/build/win32/enc/WelsEncCore.vcproj
@@ -52,7 +52,7 @@
 			<Tool
 				Name="VCCLCompilerTool"
 				Optimization="0"
-				AdditionalIncludeDirectories="..\..\..\common;..\..\..\encoder\core\inc,..\..\..\api\svc;..\..\..\processing\interface"
+				AdditionalIncludeDirectories="..\..\..\common\inc;..\..\..\encoder\core\inc,..\..\..\api\svc;..\..\..\processing\interface"
 				PreprocessorDefinitions="WIN32;_DEBUG;_LIB;X86_ASM;MT_ENABLED"
 				MinimalRebuild="true"
 				BasicRuntimeChecks="3"
@@ -132,7 +132,7 @@
 			<Tool
 				Name="VCCLCompilerTool"
 				Optimization="0"
-				AdditionalIncludeDirectories="..\..\..\common;..\..\..\encoder\core\inc,..\..\..\api\svc;..\..\..\processing\interface"
+				AdditionalIncludeDirectories="..\..\..\common\inc;..\..\..\encoder\core\inc,..\..\..\api\svc;..\..\..\processing\interface"
 				PreprocessorDefinitions="WIN64;_DEBUG;_LIB;X86_ASM;MT_ENABLED"
 				MinimalRebuild="true"
 				BasicRuntimeChecks="3"
@@ -214,7 +214,7 @@
 				InlineFunctionExpansion="2"
 				FavorSizeOrSpeed="1"
 				WholeProgramOptimization="true"
-				AdditionalIncludeDirectories="..\..\..\common;..\..\..\encoder\core\inc,..\..\..\api\svc;..\..\..\processing\interface"
+				AdditionalIncludeDirectories="..\..\..\common\inc;..\..\..\encoder\core\inc,..\..\..\api\svc;..\..\..\processing\interface"
 				PreprocessorDefinitions="WIN32;NDEBUG;_LIB;X86_ASM;MT_ENABLED;"
 				StringPooling="true"
 				RuntimeLibrary="2"
@@ -298,7 +298,7 @@
 				InlineFunctionExpansion="2"
 				FavorSizeOrSpeed="1"
 				WholeProgramOptimization="true"
-				AdditionalIncludeDirectories="..\..\..\common;..\..\..\encoder\core\inc,..\..\..\api\svc;..\..\..\processing\interface"
+				AdditionalIncludeDirectories="..\..\..\common\inc;..\..\..\encoder\core\inc,..\..\..\api\svc;..\..\..\processing\interface"
 				PreprocessorDefinitions="WIN64;NDEBUG;_LIB;MT_ENABLED;X86_ASM"
 				StringPooling="true"
 				RuntimeLibrary="2"
@@ -396,11 +396,11 @@
 				</FileConfiguration>
 			</File>
 			<File
-				RelativePath="..\..\..\common\cpu.cpp"
+				RelativePath="..\..\..\common\src\cpu.cpp"
 				>
 			</File>
 			<File
-				RelativePath="..\..\..\common\crt_util_safe_x.cpp"
+				RelativePath="..\..\..\common\src\crt_util_safe_x.cpp"
 				>
 			</File>
 			<File
@@ -444,7 +444,7 @@
 				</FileConfiguration>
 			</File>
 			<File
-				RelativePath="..\..\..\common\deblocking_common.cpp"
+				RelativePath="..\..\..\common\src\deblocking_common.cpp"
 				>
 			</File>
 			<File
@@ -728,7 +728,7 @@
 				</FileConfiguration>
 			</File>
 			<File
-				RelativePath="..\..\..\common\logging.cpp"
+				RelativePath="..\..\..\common\src\logging.cpp"
 				>
 			</File>
 			<File
@@ -1424,7 +1424,7 @@
 				</FileConfiguration>
 			</File>
 			<File
-				RelativePath="..\..\..\common\WelsThreadLib.cpp"
+				RelativePath="..\..\..\common\src\WelsThreadLib.cpp"
 				>
 			</File>
 		</Filter>
@@ -1445,11 +1445,11 @@
 				>
 			</File>
 			<File
-				RelativePath="..\..\..\common\cpu.h"
+				RelativePath="..\..\..\common\inc\cpu.h"
 				>
 			</File>
 			<File
-				RelativePath="..\..\..\common\cpu_core.h"
+				RelativePath="..\..\..\common\inc\cpu_core.h"
 				>
 			</File>
 			<File
@@ -1457,7 +1457,7 @@
 				>
 			</File>
 			<File
-				RelativePath="..\..\..\common\deblocking_common.h"
+				RelativePath="..\..\..\common\inc\deblocking_common.h"
 				>
 			</File>
 			<File
@@ -1485,7 +1485,7 @@
 				>
 			</File>
 			<File
-				RelativePath="..\..\..\common\expand_picture_common.h"
+				RelativePath="..\..\..\common\inc\expand_picture_common.h"
 				>
 			</File>
 			<File
@@ -1497,11 +1497,11 @@
 				>
 			</File>
 			<File
-				RelativePath="..\..\..\common\ls_defines.h"
+				RelativePath="..\..\..\common\inc\ls_defines.h"
 				>
 			</File>
 			<File
-				RelativePath="..\..\..\common\macros.h"
+				RelativePath="..\..\..\common\inc\macros.h"
 				>
 			</File>
 			<File
@@ -1513,7 +1513,7 @@
 				>
 			</File>
 			<File
-				RelativePath="..\..\..\common\mc_common.h"
+				RelativePath="..\..\..\common\inc\mc_common.h"
 				>
 			</File>
 			<File
@@ -1521,7 +1521,7 @@
 				>
 			</File>
 			<File
-				RelativePath="..\..\..\common\measure_time.h"
+				RelativePath="..\..\..\common\inc\measure_time.h"
 				>
 			</File>
 			<File
@@ -1637,7 +1637,7 @@
 				>
 			</File>
 			<File
-				RelativePath="..\..\..\common\typedefs.h"
+				RelativePath="..\..\..\common\inc\typedefs.h"
 				>
 			</File>
 			<File
@@ -1661,7 +1661,7 @@
 				>
 			</File>
 			<File
-				RelativePath="..\..\..\common\WelsThreadLib.h"
+				RelativePath="..\..\..\common\inc\WelsThreadLib.h"
 				>
 			</File>
 		</Filter>
@@ -1677,7 +1677,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -1686,7 +1686,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -1695,7 +1695,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -1704,13 +1704,13 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
 			</File>
 			<File
-				RelativePath="..\..\..\common\cpuid.asm"
+				RelativePath="..\..\..\common\x86\cpuid.asm"
 				>
 				<FileConfiguration
 					Name="Debug|Win32"
@@ -1726,7 +1726,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -1735,7 +1735,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -1744,7 +1744,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -1757,7 +1757,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -1766,7 +1766,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -1775,7 +1775,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -1784,13 +1784,13 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
 			</File>
 			<File
-				RelativePath="..\..\..\common\deblock.asm"
+				RelativePath="..\..\..\common\x86\deblock.asm"
 				>
 				<FileConfiguration
 					Name="Debug|Win32"
@@ -1797,7 +1797,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -1806,7 +1806,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -1815,7 +1815,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -1824,13 +1824,13 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
 			</File>
 			<File
-				RelativePath="..\..\..\common\expand_picture.asm"
+				RelativePath="..\..\..\common\x86\expand_picture.asm"
 				>
 				<FileConfiguration
 					Name="Debug|Win32"
@@ -1837,7 +1837,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -1846,7 +1846,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -1855,7 +1855,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -1864,7 +1864,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -1877,7 +1877,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -1886,7 +1886,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -1895,7 +1895,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -1904,13 +1904,13 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
 			</File>
 			<File
-				RelativePath="..\..\..\common\mb_copy.asm"
+				RelativePath="..\..\..\common\x86\mb_copy.asm"
 				>
 				<FileConfiguration
 					Name="Debug|Win32"
@@ -1917,7 +1917,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -1926,7 +1926,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -1935,7 +1935,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -1944,13 +1944,13 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
 			</File>
 			<File
-				RelativePath="..\..\..\common\mc_chroma.asm"
+				RelativePath="..\..\..\common\x86\mc_chroma.asm"
 				>
 				<FileConfiguration
 					Name="Debug|Win32"
@@ -1957,7 +1957,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -1966,7 +1966,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -1975,7 +1975,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -1984,13 +1984,13 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
 			</File>
 			<File
-				RelativePath="..\..\..\common\mc_luma.asm"
+				RelativePath="..\..\..\common\x86\mc_luma.asm"
 				>
 				<FileConfiguration
 					Name="Debug|Win32"
@@ -1997,7 +1997,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -2006,7 +2006,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -2015,7 +2015,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -2024,7 +2024,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -2037,7 +2037,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -2046,7 +2046,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -2055,7 +2055,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -2064,7 +2064,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -2077,7 +2077,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -2086,7 +2086,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -2095,7 +2095,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -2104,13 +2104,13 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
 			</File>
 			<File
-				RelativePath="..\..\..\common\satd_sad.asm"
+				RelativePath="..\..\..\common\x86\satd_sad.asm"
 				>
 				<FileConfiguration
 					Name="Debug|Win32"
@@ -2117,7 +2117,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -2126,7 +2126,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -2135,7 +2135,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -2144,7 +2144,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -2157,7 +2157,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -2166,7 +2166,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -2175,7 +2175,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -2184,13 +2184,13 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
 			</File>
 			<File
-				RelativePath="..\..\..\common\vaa.asm"
+				RelativePath="..\..\..\common\x86\vaa.asm"
 				>
 				<FileConfiguration
 					Name="Debug|Win32"
@@ -2197,7 +2197,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX  -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX  -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -2206,7 +2206,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -2215,7 +2215,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX  -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX  -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -2224,7 +2224,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
--- a/codec/build/win32/enc/WelsEncPlus.vcproj
+++ b/codec/build/win32/enc/WelsEncPlus.vcproj
@@ -52,7 +52,7 @@
 			<Tool
 				Name="VCCLCompilerTool"
 				Optimization="0"
-				AdditionalIncludeDirectories="..\..\..\encoder\plus\inc;..\..\..\encoder\core\inc;..\..\..\api\svc;..\..\..\common;..\..\..\processing\interface"
+				AdditionalIncludeDirectories="..\..\..\encoder\plus\inc;..\..\..\encoder\core\inc;..\..\..\api\svc;..\..\..\common\inc;..\..\..\processing\interface"
 				PreprocessorDefinitions="WIN32;_DEBUG;_WINDOWS;_USRDLL;WELSENCPLUS_EXPORTS;MT_ENABLED;X86_ASM"
 				MinimalRebuild="true"
 				BasicRuntimeChecks="3"
@@ -151,7 +151,7 @@
 			<Tool
 				Name="VCCLCompilerTool"
 				Optimization="0"
-				AdditionalIncludeDirectories="..\..\..\encoder\plus\inc;..\..\..\encoder\core\inc;..\..\..\api\svc;..\..\..\common;..\..\..\processing\interface"
+				AdditionalIncludeDirectories="..\..\..\encoder\plus\inc;..\..\..\encoder\core\inc;..\..\..\api\svc;..\..\..\common\inc;..\..\..\processing\interface"
 				PreprocessorDefinitions="WIN64;_DEBUG;_WINDOWS;_USRDLL;WELSENCPLUS_EXPORTS;MT_ENABLED;X86_ASM"
 				MinimalRebuild="true"
 				BasicRuntimeChecks="3"
@@ -254,7 +254,7 @@
 				FavorSizeOrSpeed="1"
 				EnableFiberSafeOptimizations="true"
 				WholeProgramOptimization="true"
-				AdditionalIncludeDirectories="..\..\..\encoder\plus\inc;..\..\..\encoder\core\inc;..\..\..\api\svc;..\..\..\common;..\..\..\processing\interface"
+				AdditionalIncludeDirectories="..\..\..\encoder\plus\inc;..\..\..\encoder\core\inc;..\..\..\api\svc;..\..\..\common\inc;..\..\..\processing\interface"
 				PreprocessorDefinitions="WIN32;NDEBUG;_WINDOWS;_USRDLL;WELSENCPLUS_EXPORTS;MT_ENABLED;X86_ASM"
 				StringPooling="true"
 				RuntimeLibrary="2"
@@ -359,7 +359,7 @@
 				FavorSizeOrSpeed="1"
 				EnableFiberSafeOptimizations="true"
 				WholeProgramOptimization="true"
-				AdditionalIncludeDirectories="..\..\..\encoder\plus\inc;..\..\..\encoder\core\inc;..\..\..\api\svc;..\..\..\common;..\..\..\processing\interface"
+				AdditionalIncludeDirectories="..\..\..\encoder\plus\inc;..\..\..\encoder\core\inc;..\..\..\api\svc;..\..\..\common\inc;..\..\..\processing\interface"
 				PreprocessorDefinitions="WIN64;NDEBUG;_WINDOWS;_USRDLL;WELSENCPLUS_EXPORTS;MT_ENABLED;X86_ASM"
 				StringPooling="true"
 				RuntimeLibrary="2"
--- a/codec/build/win32/enc/encConsole.vcproj
+++ b/codec/build/win32/enc/encConsole.vcproj
@@ -48,7 +48,7 @@
 			<Tool
 				Name="VCCLCompilerTool"
 				Optimization="0"
-				AdditionalIncludeDirectories="..\..\..\console\enc\inc,..\..\..\api\svc,..\..\..\WelsThreadLib\api,..\..\..\encoder\core\inc,..\..\..\common,..\..\..\processing\interface"
+				AdditionalIncludeDirectories="..\..\..\console\enc\inc,..\..\..\api\svc,..\..\..\WelsThreadLib\api,..\..\..\encoder\core\inc,..\..\..\common\inc,..\..\..\processing\interface"
 				PreprocessorDefinitions="WIN32;_DEBUG;_CONSOLE;MT_ENABLED;"
 				MinimalRebuild="true"
 				BasicRuntimeChecks="3"
@@ -143,7 +143,7 @@
 				Name="VCCLCompilerTool"
 				Optimization="2"
 				InlineFunctionExpansion="1"
-				AdditionalIncludeDirectories="..\..\..\console\enc\inc,..\..\..\api\svc,..\..\..\WelsThreadLib\api,..\..\..\encoder\core\inc,..\..\..\common,..\..\..\processing\interface"
+				AdditionalIncludeDirectories="..\..\..\console\enc\inc,..\..\..\api\svc,..\..\..\WelsThreadLib\api,..\..\..\encoder\core\inc,..\..\..\common\inc,..\..\..\processing\interface"
 				PreprocessorDefinitions="WIN32;NDEBUG;_CONSOLE;X86_ASM;MT_ENABLED;"
 				StringPooling="true"
 				RuntimeLibrary="2"
@@ -238,7 +238,7 @@
 			<Tool
 				Name="VCCLCompilerTool"
 				Optimization="0"
-				AdditionalIncludeDirectories="..\..\..\console\enc\inc,..\..\..\api\svc,..\..\..\WelsThreadLib\api,..\..\..\encoder\core\inc,..\..\..\common,..\..\..\processing\interface"
+				AdditionalIncludeDirectories="..\..\..\console\enc\inc,..\..\..\api\svc,..\..\..\WelsThreadLib\api,..\..\..\encoder\core\inc,..\..\..\common\inc,..\..\..\processing\interface"
 				PreprocessorDefinitions="WIN64;_DEBUG;_CONSOLE;MT_ENABLED"
 				MinimalRebuild="true"
 				BasicRuntimeChecks="3"
@@ -334,7 +334,7 @@
 				Name="VCCLCompilerTool"
 				Optimization="2"
 				InlineFunctionExpansion="1"
-				AdditionalIncludeDirectories="..\..\..\console\enc\inc,..\..\..\api\svc,..\..\..\WelsThreadLib\api,..\..\..\encoder\core\inc,..\..\..\common,..\..\..\processing\interface"
+				AdditionalIncludeDirectories="..\..\..\console\enc\inc,..\..\..\api\svc,..\..\..\WelsThreadLib\api,..\..\..\encoder\core\inc,..\..\..\common\inc,..\..\..\processing\interface"
 				PreprocessorDefinitions="WIN64;NDEBUG;_CONSOLE;MT_ENABLED"
 				StringPooling="true"
 				RuntimeLibrary="2"
@@ -407,7 +407,7 @@
 			Filter="cpp;c;cxx;rc;def;r;odl;idl;hpj;bat"
 			>
 			<File
-				RelativePath="..\..\..\common\logging.cpp"
+				RelativePath="..\..\..\common\src\logging.cpp"
 				>
 			</File>
 			<File
--- a/codec/common/WelsThreadLib.cpp
+++ /dev/null
@@ -1,473 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2009-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * \file	WelsThreadLib.c
- *
- * \brief	Interfaces introduced in thread programming
- *
- * \date	11/17/2009 Created
- *
- *************************************************************************************
- */
-
-
-#ifdef LINUX
-#ifndef _GNU_SOURCE
-#define _GNU_SOURCE
-#endif
-#include <sched.h>
-#elif !defined(_WIN32)
-#include <sys/types.h>
-#include <sys/sysctl.h>
-#include <sys/param.h>
-#include <unistd.h>
-#ifdef __APPLE__
-#define HW_NCPU_NAME "hw.logicalcpu"
-#else
-#define HW_NCPU_NAME "hw.ncpu"
-#endif
-#endif
-#ifdef ANDROID_NDK
-#include <cpu-features.h>
-#endif
-
-#include "WelsThreadLib.h"
-#include <stdio.h>
-#include <stdlib.h>
-
-
-#ifdef  _WIN32
-
-#ifdef WINAPI_FAMILY
-#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
-#define InitializeCriticalSection(x) InitializeCriticalSectionEx(x, 0, 0)
-#endif
-#endif
-
-WELS_THREAD_ERROR_CODE    WelsMutexInit (WELS_MUTEX*    mutex) {
-  InitializeCriticalSection (mutex);
-
-  return WELS_THREAD_ERROR_OK;
-}
-
-WELS_THREAD_ERROR_CODE    WelsMutexLock (WELS_MUTEX*    mutex) {
-  EnterCriticalSection (mutex);
-
-  return WELS_THREAD_ERROR_OK;
-}
-
-WELS_THREAD_ERROR_CODE    WelsMutexUnlock (WELS_MUTEX* mutex) {
-  LeaveCriticalSection (mutex);
-
-  return WELS_THREAD_ERROR_OK;
-}
-
-WELS_THREAD_ERROR_CODE    WelsMutexDestroy (WELS_MUTEX* mutex) {
-  DeleteCriticalSection (mutex);
-
-  return WELS_THREAD_ERROR_OK;
-}
-
-#else /* _WIN32 */
-
-WELS_THREAD_ERROR_CODE    WelsMutexInit (WELS_MUTEX*    mutex) {
-  return pthread_mutex_init (mutex, NULL);
-}
-
-WELS_THREAD_ERROR_CODE    WelsMutexLock (WELS_MUTEX*    mutex) {
-  return pthread_mutex_lock (mutex);
-}
-
-WELS_THREAD_ERROR_CODE    WelsMutexUnlock (WELS_MUTEX* mutex) {
-  return pthread_mutex_unlock (mutex);
-}
-
-WELS_THREAD_ERROR_CODE    WelsMutexDestroy (WELS_MUTEX* mutex) {
-  return pthread_mutex_destroy (mutex);
-}
-
-#endif /* !_WIN32 */
-
-
-#ifdef MT_ENABLED
-
-#ifdef _WIN32
-
-void WelsSleep (uint32_t dwMilliseconds) {
-  Sleep (dwMilliseconds);
-}
-
-WELS_THREAD_ERROR_CODE    WelsEventOpen (WELS_EVENT* event, const char* event_name) {
-  WELS_EVENT   h = CreateEvent (NULL, FALSE, FALSE, NULL);
-
-  if (h == NULL) {
-    return WELS_THREAD_ERROR_GENERAL;
-  }
-  *event = h;
-  return WELS_THREAD_ERROR_OK;
-}
-
-WELS_THREAD_ERROR_CODE    WelsEventSignal (WELS_EVENT* event) {
-  if (SetEvent (*event)) {
-    return WELS_THREAD_ERROR_OK;
-  }
-  return WELS_THREAD_ERROR_GENERAL;
-}
-
-WELS_THREAD_ERROR_CODE    WelsEventWait (WELS_EVENT* event) {
-  return WaitForSingleObject (*event, INFINITE);
-}
-
-WELS_THREAD_ERROR_CODE    WelsEventWaitWithTimeOut (WELS_EVENT* event, uint32_t dwMilliseconds) {
-  return WaitForSingleObject (*event, dwMilliseconds);
-}
-
-WELS_THREAD_ERROR_CODE    WelsMultipleEventsWaitSingleBlocking (uint32_t nCount,
-    WELS_EVENT* event_list, WELS_EVENT* master_event) {
-  // Don't need/use the master event for anything, since windows has got WaitForMultipleObjects
-  return WaitForMultipleObjects (nCount, event_list, FALSE, INFINITE);
-}
-
-WELS_THREAD_ERROR_CODE    WelsMultipleEventsWaitAllBlocking (uint32_t nCount,
-    WELS_EVENT* event_list, WELS_EVENT* master_event) {
-  // Don't need/use the master event for anything, since windows has got WaitForMultipleObjects
-  return WaitForMultipleObjects (nCount, event_list, TRUE, INFINITE);
-}
-
-WELS_THREAD_ERROR_CODE    WelsEventClose (WELS_EVENT* event, const char* event_name) {
-  CloseHandle (*event);
-
-  *event = NULL;
-  return WELS_THREAD_ERROR_OK;
-}
-
-
-WELS_THREAD_ERROR_CODE    WelsThreadCreate (WELS_THREAD_HANDLE* thread,  LPWELS_THREAD_ROUTINE  routine,
-    void* arg, WELS_THREAD_ATTR attr) {
-  WELS_THREAD_HANDLE   h = CreateThread (NULL, 0, routine, arg, 0, NULL);
-
-  if (h == NULL) {
-    return WELS_THREAD_ERROR_GENERAL;
-  }
-  * thread = h;
-
-  return WELS_THREAD_ERROR_OK;
-}
-
-WELS_THREAD_ERROR_CODE    WelsThreadJoin (WELS_THREAD_HANDLE  thread) {
-  WaitForSingleObject (thread, INFINITE);
-  CloseHandle (thread);
-
-  return WELS_THREAD_ERROR_OK;
-}
-
-
-WELS_THREAD_HANDLE        WelsThreadSelf() {
-  return GetCurrentThread();
-}
-
-WELS_THREAD_ERROR_CODE    WelsQueryLogicalProcessInfo (WelsLogicalProcessInfo* pInfo) {
-  SYSTEM_INFO  si;
-
-  GetSystemInfo (&si);
-
-  pInfo->ProcessorCount = si.dwNumberOfProcessors;
-
-  return WELS_THREAD_ERROR_OK;
-}
-
-#else
-
-void WelsSleep (uint32_t dwMilliseconds) {
-  usleep (dwMilliseconds * 1000);	// microseconds
-}
-
-WELS_THREAD_ERROR_CODE    WelsThreadCreate (WELS_THREAD_HANDLE* thread,  LPWELS_THREAD_ROUTINE  routine,
-    void* arg, WELS_THREAD_ATTR attr) {
-  WELS_THREAD_ERROR_CODE err = 0;
-
-  pthread_attr_t at;
-  err = pthread_attr_init (&at);
-  if (err)
-    return err;
-#ifndef __ANDROID__
-  err = pthread_attr_setscope (&at, PTHREAD_SCOPE_SYSTEM);
-  if (err)
-    return err;
-  err = pthread_attr_setschedpolicy (&at, SCHED_FIFO);
-  if (err)
-    return err;
-#endif
-  err = pthread_create (thread, &at, routine, arg);
-
-  pthread_attr_destroy (&at);
-
-  return err;
-}
-
-WELS_THREAD_ERROR_CODE    WelsThreadJoin (WELS_THREAD_HANDLE  thread) {
-  return pthread_join (thread, NULL);
-}
-
-WELS_THREAD_HANDLE        WelsThreadSelf() {
-  return pthread_self();
-}
-
-// unnamed semaphores aren't supported on OS X
-
-WELS_THREAD_ERROR_CODE    WelsEventOpen (WELS_EVENT* p_event, const char* event_name) {
-#ifdef __APPLE__
-  if (p_event == NULL || event_name == NULL)
-    return WELS_THREAD_ERROR_GENERAL;
-  *p_event = sem_open (event_name, O_CREAT, (S_IRUSR | S_IWUSR)/*0600*/, 0);
-  if (*p_event == (sem_t*)SEM_FAILED) {
-    sem_unlink (event_name);
-    *p_event = NULL;
-    return WELS_THREAD_ERROR_GENERAL;
-  } else {
-    return WELS_THREAD_ERROR_OK;
-  }
-#else
-  WELS_EVENT event = (WELS_EVENT) malloc(sizeof(*event));
-  if (event == NULL)
-    return WELS_THREAD_ERROR_GENERAL;
-  WELS_THREAD_ERROR_CODE err = sem_init(event, 0, 0);
-  if (!err) {
-    *p_event = event;
-    return err;
-  }
-  free(event);
-  return err;
-#endif
-}
-WELS_THREAD_ERROR_CODE    WelsEventClose (WELS_EVENT* event, const char* event_name) {
-#ifdef __APPLE__
-  WELS_THREAD_ERROR_CODE err = sem_close (*event);	// match with sem_open
-  if (event_name)
-    sem_unlink (event_name);
-  return err;
-#else
-  WELS_THREAD_ERROR_CODE err = sem_destroy (*event);	// match with sem_init
-  free(*event);
-  return err;
-#endif
-}
-
-WELS_THREAD_ERROR_CODE   WelsEventSignal (WELS_EVENT* event) {
-  WELS_THREAD_ERROR_CODE err = 0;
-//	int32_t val = 0;
-//	sem_getvalue(event, &val);
-//	fprintf( stderr, "before signal it, val= %d..\n",val );
-  err = sem_post (*event);
-//	sem_getvalue(event, &val);
-//	fprintf( stderr, "after signal it, val= %d..\n",val );
-  return err;
-}
-
-WELS_THREAD_ERROR_CODE   WelsEventWait (WELS_EVENT* event) {
-  return sem_wait (*event);	// blocking until signaled
-}
-
-WELS_THREAD_ERROR_CODE    WelsEventWaitWithTimeOut (WELS_EVENT* event, uint32_t dwMilliseconds) {
-  if (dwMilliseconds != (uint32_t) - 1) {
-    return sem_wait (*event);
-  } else {
-#if defined(__APPLE__)
-    int32_t err = 0;
-    int32_t wait_count = 0;
-    do {
-      err = sem_trywait (*event);
-      if (WELS_THREAD_ERROR_OK == err)
-        break;// WELS_THREAD_ERROR_OK;
-      else if (wait_count > 0)
-        break;
-      usleep (dwMilliseconds * 1000);
-      ++ wait_count;
-    } while (1);
-    return err;
-#else
-    struct timespec ts;
-    struct timeval tv;
-
-    gettimeofday (&tv, 0);
-
-    ts.tv_nsec = tv.tv_usec * 1000 + dwMilliseconds * 1000000;
-    ts.tv_sec = tv.tv_sec + ts.tv_nsec / 1000000000;
-    ts.tv_nsec %= 1000000000;
-
-    return sem_timedwait (*event, &ts);
-#endif//__APPLE__
-  }
-}
-
-WELS_THREAD_ERROR_CODE    WelsMultipleEventsWaitSingleBlocking (uint32_t nCount,
-    WELS_EVENT* event_list, WELS_EVENT* master_event) {
-  uint32_t nIdx = 0;
-  uint32_t uiAccessTime = 2;	// 2 us once
-
-  if (nCount == 0)
-    return WELS_THREAD_ERROR_WAIT_FAILED;
-
-  if (master_event != NULL) {
-    // This design relies on the events actually being semaphores;
-    // if multiple events in the list have been signalled, the master
-    // event should have a similar count (events in windows can't keep
-    // track of the actual count, but the master event isn't needed there
-    // since it uses WaitForMultipleObjects).
-    int32_t err = sem_wait (*master_event);
-    if (err != WELS_THREAD_ERROR_OK)
-      return err;
-    uiAccessTime = 0; // no blocking, just quickly loop through all to find the one that was signalled
-  }
-
-  while (1) {
-    nIdx = 0;	// access each event by order
-    while (nIdx < nCount) {
-      int32_t err = 0;
-      int32_t wait_count = 0;
-
-      /*
-       * although such interface is not used in __GNUC__ like platform, to use
-       * pthread_cond_timedwait() might be better choice if need
-       */
-      do {
-        err = sem_trywait (event_list[nIdx]);
-        if (WELS_THREAD_ERROR_OK == err)
-          return WELS_THREAD_ERROR_WAIT_OBJECT_0 + nIdx;
-        else if (wait_count > 0 || uiAccessTime == 0)
-          break;
-        usleep (uiAccessTime);
-        ++ wait_count;
-      } while (1);
-      // we do need access next event next time
-      ++ nIdx;
-    }
-    usleep (1);	// switch to working threads
-    if (master_event != NULL) {
-      // A master event was used and was signalled, but none of the events in the
-      // list was found to be signalled, thus wait a little more when rechecking
-      // the list to avoid busylooping here.
-      // If we ever hit this codepath it's mostly a bug in the code that signals
-      // the events.
-      uiAccessTime = 2;
-    }
-  }
-
-  return WELS_THREAD_ERROR_WAIT_FAILED;
-}
-
-WELS_THREAD_ERROR_CODE    WelsMultipleEventsWaitAllBlocking (uint32_t nCount,
-    WELS_EVENT* event_list, WELS_EVENT* master_event) {
-  uint32_t nIdx = 0;
-  uint32_t uiCountSignals = 0;
-  uint32_t uiSignalFlag	= 0;	// UGLY: suppose maximal event number up to 32
-
-  if (nCount == 0 || nCount > (sizeof (uint32_t) << 3))
-    return WELS_THREAD_ERROR_WAIT_FAILED;
-
-  while (1) {
-    nIdx = 0;	// access each event by order
-    while (nIdx < nCount) {
-      const uint32_t kuiBitwiseFlag = (1 << nIdx);
-
-      if ((uiSignalFlag & kuiBitwiseFlag) != kuiBitwiseFlag) { // non-blocking mode
-        int32_t err = 0;
-//				fprintf( stderr, "sem_wait(): start to wait event %d..\n", nIdx );
-        if (master_event == NULL) {
-          err = sem_wait (event_list[nIdx]);
-        } else {
-          err = sem_wait (*master_event);
-          if (err == WELS_THREAD_ERROR_OK) {
-            err = sem_wait (event_list[nIdx]);
-            if (err != WELS_THREAD_ERROR_OK) {
-              // We successfully waited for the master event,
-              // but waiting for the individual event failed (e.g. EINTR?).
-              // Increase the master event count so that the next retry will
-              // work as intended.
-              sem_post (*master_event);
-            }
-          }
-        }
-//				fprintf( stderr, "sem_wait(): wait event %d result %d errno %d..\n", nIdx, err, errno );
-        if (WELS_THREAD_ERROR_OK == err) {
-//					int32_t val = 0;
-//					sem_getvalue(&event_list[nIdx], &val);
-//					fprintf( stderr, "after sem_timedwait(), event_list[%d] semaphore value= %d..\n", nIdx, val);
-
-          uiSignalFlag |= kuiBitwiseFlag;
-          ++ uiCountSignals;
-          if (uiCountSignals >= nCount) {
-            return WELS_THREAD_ERROR_OK;
-          }
-        }
-      }
-      // we do need access next event next time
-      ++ nIdx;
-    }
-  }
-
-  return WELS_THREAD_ERROR_WAIT_FAILED;
-}
-
-WELS_THREAD_ERROR_CODE    WelsQueryLogicalProcessInfo (WelsLogicalProcessInfo* pInfo) {
-#ifdef ANDROID_NDK
-  pInfo->ProcessorCount = android_getCpuCount();
-  return WELS_THREAD_ERROR_OK;
-#elif defined(LINUX)
-
-  cpu_set_t cpuset;
-
-  CPU_ZERO (&cpuset);
-
-  if (!sched_getaffinity (0, sizeof (cpuset), &cpuset))
-    pInfo->ProcessorCount = CPU_COUNT (&cpuset);
-  else
-    pInfo->ProcessorCount = 1;
-
-  return WELS_THREAD_ERROR_OK;
-
-#else
-
-  size_t len = sizeof (pInfo->ProcessorCount);
-
-  if (sysctlbyname (HW_NCPU_NAME, &pInfo->ProcessorCount, &len, NULL, 0) == -1)
-    pInfo->ProcessorCount = 1;
-
-  return WELS_THREAD_ERROR_OK;
-
-#endif//LINUX
-}
-
-#endif
-
-
-#endif // MT_ENABLED
-
--- a/codec/common/WelsThreadLib.h
+++ /dev/null
@@ -1,132 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2009-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * \file	WelsThreadLib.h
- *
- * \brief	Interfaces introduced in thread programming
- *
- * \date	11/17/2009 Created
- *
- *************************************************************************************
- */
-
-#ifndef   _WELS_THREAD_API_H_
-#define   _WELS_THREAD_API_H_
-
-#include "typedefs.h"
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-#if defined(_WIN32)
-
-#include <windows.h>
-
-typedef    HANDLE                    WELS_THREAD_HANDLE;
-typedef    LPTHREAD_START_ROUTINE    LPWELS_THREAD_ROUTINE;
-
-typedef    CRITICAL_SECTION          WELS_MUTEX;
-typedef    HANDLE                    WELS_EVENT;
-
-#define    WELS_THREAD_ROUTINE_TYPE         DWORD  WINAPI
-#define    WELS_THREAD_ROUTINE_RETURN(rc)   return (DWORD)rc;
-
-#else	// NON-WINDOWS
-
-#include <stdlib.h>
-#include <unistd.h>
-#include <string.h>
-#include <pthread.h>
-#include <semaphore.h>
-#include <signal.h>
-#include <errno.h>
-#include <time.h>
-#include <sys/time.h>
-
-#include <sys/stat.h>
-#include <fcntl.h>
-
-typedef   pthread_t    WELS_THREAD_HANDLE;
-typedef  void* (*LPWELS_THREAD_ROUTINE) (void*);
-
-typedef   pthread_mutex_t           WELS_MUTEX;
-typedef   sem_t*                    WELS_EVENT;
-
-#define   WELS_THREAD_ROUTINE_TYPE         void *
-#define   WELS_THREAD_ROUTINE_RETURN(rc)   return (void*)(intptr_t)rc;
-
-#endif//_WIN32
-
-typedef    int32_t        WELS_THREAD_ERROR_CODE;
-typedef    int32_t        WELS_THREAD_ATTR;
-
-typedef  struct _WelsLogicalProcessorInfo {
-  int32_t    ProcessorCount;
-} WelsLogicalProcessInfo;
-
-#define    WELS_THREAD_ERROR_OK					0
-#define    WELS_THREAD_ERROR_GENERAL			((uint32_t)(-1))
-#define    WELS_THREAD_ERROR_WAIT_OBJECT_0		0
-#define	   WELS_THREAD_ERROR_WAIT_TIMEOUT		((uint32_t)0x00000102L)
-#define	   WELS_THREAD_ERROR_WAIT_FAILED		WELS_THREAD_ERROR_GENERAL
-
-void WelsSleep (uint32_t dwMilliseconds);
-WELS_THREAD_ERROR_CODE    WelsMutexInit (WELS_MUTEX*    mutex);
-WELS_THREAD_ERROR_CODE    WelsMutexLock (WELS_MUTEX*    mutex);
-WELS_THREAD_ERROR_CODE    WelsMutexUnlock (WELS_MUTEX* mutex);
-WELS_THREAD_ERROR_CODE    WelsMutexDestroy (WELS_MUTEX* mutex);
-
-WELS_THREAD_ERROR_CODE    WelsEventOpen (WELS_EVENT* p_event, const char* event_name);
-WELS_THREAD_ERROR_CODE    WelsEventClose (WELS_EVENT* event, const char* event_name);
-WELS_THREAD_ERROR_CODE    WelsEventSignal (WELS_EVENT* event);
-WELS_THREAD_ERROR_CODE    WelsEventWait (WELS_EVENT* event);
-WELS_THREAD_ERROR_CODE    WelsEventWaitWithTimeOut (WELS_EVENT* event, uint32_t dwMilliseconds);
-WELS_THREAD_ERROR_CODE    WelsMultipleEventsWaitSingleBlocking (uint32_t nCount, WELS_EVENT* event_list,
-    WELS_EVENT* master_event = NULL);
-WELS_THREAD_ERROR_CODE    WelsMultipleEventsWaitAllBlocking (uint32_t nCount, WELS_EVENT* event_list,
-    WELS_EVENT* master_event = NULL);
-
-WELS_THREAD_ERROR_CODE    WelsThreadCreate (WELS_THREAD_HANDLE* thread,  LPWELS_THREAD_ROUTINE  routine,
-    void* arg, WELS_THREAD_ATTR attr);
-
-WELS_THREAD_ERROR_CODE    WelsThreadJoin (WELS_THREAD_HANDLE  thread);
-
-WELS_THREAD_HANDLE        WelsThreadSelf();
-
-WELS_THREAD_ERROR_CODE    WelsQueryLogicalProcessInfo (WelsLogicalProcessInfo* pInfo);
-
-
-#ifdef  __cplusplus
-}
-#endif
-
-#endif
--- /dev/null
+++ b/codec/common/arm/arm_arch_common_macro.S
@@ -1,0 +1,64 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifdef __APPLE__
+
+.macro WELS_ASM_FUNC_BEGIN
+.align 2
+.arm
+.globl _$0
+_$0:
+.endm
+
+.macro WELS_ASM_FUNC_END
+mov pc, lr
+.endm
+#else
+
+.syntax unified
+.section .note.GNU-stack,"",%progbits // Mark stack as non-executable
+.text
+
+.macro WELS_ASM_FUNC_BEGIN funcName
+.align 2
+.arm
+.global \funcName
+.type \funcName, %function
+.func \funcName
+\funcName:
+.endm
+
+.macro WELS_ASM_FUNC_END
+mov pc, lr
+.endfunc
+.endm
+#endif
--- /dev/null
+++ b/codec/common/arm/deblocking_neon.S
@@ -1,0 +1,1052 @@
+/*!
+* \copy
+*     Copyright (c)  2013, Cisco Systems
+*     All rights reserved.
+
+*     Redistribution and use in source and binary forms, with or without
+*     modification, are permitted provided that the following conditions
+*     are met:
+
+*        * Redistributions of source code must retain the above copyright
+*          notice, this list of conditions and the following disclaimer.
+
+*        * Redistributions in binary form must reproduce the above copyright
+*          notice, this list of conditions and the following disclaimer in
+*          the documentation and/or other materials provided with the
+*          distribution.
+
+*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+*     POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifdef HAVE_NEON
+.text
+
+#include "arm_arch_common_macro.S"
+
+#ifdef __APPLE__
+.macro	JMP_IF_128BITS_IS_ZERO
+    vorr.s16	$2, $0, $1
+    vmov		r3, r2, $2
+    orr			r3, r3, r2
+    cmp			r3, #0
+.endm
+
+.macro	MASK_MATRIX
+    vabd.u8	$6, $1, $2
+    vcgt.u8	$6, $4, $6
+
+    vabd.u8	$4, $0, $1
+    vclt.u8	$4, $4, $5
+    vand.u8	$6, $6, $4
+
+    vabd.u8	$4, $3, $2
+    vclt.u8	$4, $4, $5
+    vand.u8	$6, $6, $4
+.endm
+
+
+.macro	DIFF_LUMA_LT4_P1_Q1
+    vabd.u8	$9, $0, $2
+    vclt.u8	$9, $9, $4
+    vrhadd.u8	$8, $2, $3
+    vhadd.u8	$8, $0, $8
+    vsub.s8	$8, $8, $1
+    vmax.s8	$8, $8, $5
+    vmin.s8	$8, $8, $6
+    vand.s8	$8, $8, $9
+    vand.s8	$8, $8, $7
+    vadd.u8	$8, $1, $8
+    vabs.s8	$9, $9
+.endm
+
+.macro	DIFF_LUMA_LT4_P0_Q0
+    vsubl.u8	$5, $0, $3
+    vsubl.u8	$6, $2, $1
+    vshl.s16	$6, $6, #2
+    vadd.s16	$5, $5, $6
+    vrshrn.s16		$4, $5, #3
+.endm
+
+.macro	DIFF_LUMA_EQ4_P2P1P0
+    vaddl.u8	q4, $1, $2
+    vaddl.u8	q5, $3, $4
+    vadd.u16	q5, q4, q5
+
+    vaddl.u8	q4, $0, $1
+    vshl.u16	q4, q4, #1
+    vadd.u16	q4, q5, q4
+
+    vrshrn.u16		$0, q5, #2
+    vrshrn.u16		$7, q4, #3
+
+    vshl.u16	q5, q5, #1
+    vsubl.u8	q4, $5, $1
+    vadd.u16	q5, q4,q5
+
+    vaddl.u8	q4, $2, $5
+    vaddw.u8	q4, q4, $2
+    vaddw.u8	q4, q4, $3
+
+    vrshrn.u16		d10,q5, #3
+    vrshrn.u16		d8, q4, #2
+    vbsl.u8		$6, d10, d8
+.endm
+
+.macro	DIFF_LUMA_EQ4_MASK
+    vmov	$3, $2
+    vbsl.u8	$3, $0, $1
+.endm
+
+.macro	DIFF_CHROMA_EQ4_P0Q0
+    vaddl.u8	$4, $0, $3
+    vaddw.u8	$5, $4, $1
+    vaddw.u8	$6, $4, $2
+    vaddw.u8	$5, $5, $0
+
+    vaddw.u8	$6, $6, $3
+    vrshrn.u16		$7, $5, #2
+    vrshrn.u16		$8, $6, #2
+.endm
+
+.macro	LOAD_CHROMA_DATA_4
+    vld4.u8	{$0[$8],$1[$8],$2[$8],$3[$8]}, [r0], r2
+    vld4.u8	{$4[$8],$5[$8],$6[$8],$7[$8]}, [r1], r2
+.endm
+
+.macro	STORE_CHROMA_DATA_4
+    vst4.u8	{$0[$8],$1[$8],$2[$8],$3[$8]}, [r0], r2
+    vst4.u8	{$4[$8],$5[$8],$6[$8],$7[$8]}, [r1], r2
+.endm
+
+.macro	LOAD_LUMA_DATA_3
+    vld3.u8	{$0[$6],$1[$6],$2[$6]}, [r2], r1
+    vld3.u8	{$3[$6],$4[$6],$5[$6]}, [r0], r1
+.endm
+
+.macro	STORE_LUMA_DATA_4
+    vst4.u8	{$0[$4],$1[$4],$2[$4],$3[$4]}, [r0], r1
+    vst4.u8	{$0[$5],$1[$5],$2[$5],$3[$5]}, [r2], r1
+.endm
+
+.macro	LOAD_LUMA_DATA_4
+    vld4.u8	{$0[$8],$1[$8],$2[$8],$3[$8]}, [r3], r1
+    vld4.u8	{$4[$8],$5[$8],$6[$8],$7[$8]}, [r0], r1
+.endm
+
+.macro	STORE_LUMA_DATA_3
+    vst3.u8	{$0[$6],$1[$6],$2[$6]}, [r3], r1
+    vst3.u8	{$3[$6],$4[$6],$5[$6]}, [r0], r1
+.endm
+
+.macro	EXTRACT_DELTA_INTO_TWO_PART
+    vcge.s8	$1, $0, #0
+    vand	$1, $0, $1
+    vsub.s8	$0, $1, $0
+.endm
+#else
+.macro	JMP_IF_128BITS_IS_ZERO arg0, arg1, arg2
+    vorr.s16	\arg2, \arg0, \arg1
+    vmov		r3, r2, \arg2
+    orr			r3, r3, r2
+    cmp			r3, #0
+.endm
+
+.macro	MASK_MATRIX arg0, arg1, arg2, arg3, arg4, arg5, arg6
+    vabd.u8	\arg6, \arg1, \arg2
+    vcgt.u8	\arg6, \arg4, \arg6
+
+    vabd.u8	\arg4, \arg0, \arg1
+    vclt.u8	\arg4, \arg4, \arg5
+    vand.u8	\arg6, \arg6, \arg4
+
+    vabd.u8	\arg4, \arg3, \arg2
+    vclt.u8	\arg4, \arg4, \arg5
+    vand.u8	\arg6, \arg6, \arg4
+.endm
+
+.macro	DIFF_LUMA_LT4_P1_Q1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
+    vabd.u8	\arg9, \arg0, \arg2
+    vclt.u8	\arg9, \arg9, \arg4
+    vrhadd.u8	\arg8, \arg2, \arg3
+    vhadd.u8	\arg8, \arg0, \arg8
+    vsub.s8	\arg8, \arg8, \arg1
+    vmax.s8	\arg8, \arg8, \arg5
+    vmin.s8	\arg8, \arg8, \arg6
+    vand.s8	\arg8, \arg8, \arg9
+    vand.s8	\arg8, \arg8, \arg7
+    vadd.u8	\arg8, \arg1, \arg8
+    vabs.s8	\arg9, \arg9
+.endm
+
+.macro	DIFF_LUMA_LT4_P0_Q0 arg0, arg1, arg2, arg3, arg4, arg5, arg6
+    vsubl.u8	\arg5, \arg0, \arg3
+    vsubl.u8	\arg6, \arg2, \arg1
+    vshl.s16	\arg6, \arg6, #2
+    vadd.s16	\arg5, \arg5, \arg6
+    vrshrn.s16		\arg4, \arg5, #3
+.endm
+
+
+.macro	DIFF_LUMA_EQ4_P2P1P0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
+    vaddl.u8	q4, \arg1, \arg2
+    vaddl.u8	q5, \arg3, \arg4
+    vadd.u16	q5, q4, q5
+
+    vaddl.u8	q4, \arg0, \arg1
+    vshl.u16	q4, q4, #1
+    vadd.u16	q4, q5, q4
+
+    vrshrn.u16		\arg0, q5, #2
+    vrshrn.u16		\arg7, q4, #3
+
+    vshl.u16	q5, q5, #1
+    vsubl.u8	q4, \arg5, \arg1
+    vadd.u16	q5, q4,q5
+
+    vaddl.u8	q4, \arg2, \arg5
+    vaddw.u8	q4, q4, \arg2
+    vaddw.u8	q4, q4, \arg3
+
+    vrshrn.u16		d10,q5, #3
+    vrshrn.u16		d8, q4, #2
+    vbsl.u8		\arg6, d10, d8
+.endm
+
+.macro	DIFF_LUMA_EQ4_MASK arg0, arg1, arg2, arg3
+    vmov	\arg3, \arg2
+    vbsl.u8	\arg3, \arg0, \arg1
+.endm
+
+.macro	DIFF_CHROMA_EQ4_P0Q0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+    vaddl.u8	\arg4, \arg0, \arg3
+    vaddw.u8	\arg5, \arg4, \arg1
+    vaddw.u8	\arg6, \arg4, \arg2
+    vaddw.u8	\arg5, \arg5, \arg0
+    vaddw.u8	\arg6, \arg6, \arg3
+    vrshrn.u16		\arg7, \arg5, #2
+    vrshrn.u16		\arg8, \arg6, #2
+.endm
+
+.macro	LOAD_CHROMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+    vld4.u8	{\arg0[\arg8],\arg1[\arg8],\arg2[\arg8],\arg3[\arg8]}, [r0], r2
+    vld4.u8	{\arg4[\arg8],\arg5[\arg8],\arg6[\arg8],\arg7[\arg8]}, [r1], r2
+.endm
+
+.macro	STORE_CHROMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+    vst4.u8	{\arg0[\arg8],\arg1[\arg8],\arg2[\arg8],\arg3[\arg8]}, [r0], r2
+    vst4.u8	{\arg4[\arg8],\arg5[\arg8],\arg6[\arg8],\arg7[\arg8]}, [r1], r2
+.endm
+
+.macro	LOAD_LUMA_DATA_3 arg0, arg1, arg2, arg3, arg4, arg5, arg6
+    vld3.u8	{\arg0[\arg6],\arg1[\arg6],\arg2[\arg6]}, [r2], r1
+    vld3.u8	{\arg3[\arg6],\arg4[\arg6],\arg5[\arg6]}, [r0], r1
+.endm
+
+.macro	STORE_LUMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5
+    vst4.u8	{\arg0[\arg4],\arg1[\arg4],\arg2[\arg4],\arg3[\arg4]}, [r0], r1
+    vst4.u8	{\arg0[\arg5],\arg1[\arg5],\arg2[\arg5],\arg3[\arg5]}, [r2], r1
+.endm
+
+.macro	LOAD_LUMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+    vld4.u8	{\arg0[\arg8],\arg1[\arg8],\arg2[\arg8],\arg3[\arg8]}, [r3], r1
+    vld4.u8	{\arg4[\arg8],\arg5[\arg8],\arg6[\arg8],\arg7[\arg8]}, [r0], r1
+.endm
+
+.macro	STORE_LUMA_DATA_3 arg0, arg1, arg2, arg3, arg4, arg5, arg6
+    vst3.u8	{\arg0[\arg6],\arg1[\arg6],\arg2[\arg6]}, [r3], r1
+    vst3.u8	{\arg3[\arg6],\arg4[\arg6],\arg5[\arg6]}, [r0], r1
+.endm
+
+.macro	EXTRACT_DELTA_INTO_TWO_PART arg0, arg1
+    vcge.s8	\arg1, \arg0, #0
+    vand	\arg1, \arg0, \arg1
+    vsub.s8	\arg0, \arg1, \arg0
+.endm
+#endif
+
+WELS_ASM_FUNC_BEGIN DeblockLumaLt4V_neon
+    vpush	{q4-q7}
+    vdup.u8	q11, r2
+    vdup.u8	q9, r3
+
+    add			r2, r1, r1, lsl #1
+    sub			r2, r0, r2
+    vld1.u8	{q0}, [r2], r1
+    vld1.u8	{q3}, [r0], r1
+    vld1.u8	{q1}, [r2], r1
+    vld1.u8	{q4}, [r0], r1
+    vld1.u8	{q2}, [r2]
+    vld1.u8	{q5}, [r0]
+    sub			r2, r2, r1
+
+    ldr			r3, [sp, #64]
+    vld1.s8	{d31}, [r3]
+    vdup.s8	d28, d31[0]
+    vdup.s8	d30, d31[1]
+    vdup.s8	d29, d31[2]
+    vdup.s8	d31, d31[3]
+    vtrn.32	d28, d30
+    vtrn.32	d29, d31
+    vcge.s8	q10, q14, #0
+
+    MASK_MATRIX	q1, q2, q3, q4, q11, q9, q15
+    vand.u8	q10, q10, q15
+
+    veor		q15, q15
+    vsub.i8	q15,q15,q14
+
+    DIFF_LUMA_LT4_P1_Q1	q0, q1, q2, q3, q9, q15, q14, q10, q6, q12
+    vst1.u8	{q6}, [r2], r1
+
+    DIFF_LUMA_LT4_P1_Q1	q5, q4, q3, q2, q9, q15, q14, q10, q7, q13
+
+    vabs.s8	q12, q12
+    vabs.s8	q13, q13
+    vadd.u8	q14,q14,q12
+    vadd.u8	q14,q14,q13
+    veor		q15, q15
+    vsub.i8	q15,q15,q14
+
+    DIFF_LUMA_LT4_P0_Q0	d2, d4, d6, d8, d16, q12, q13
+    DIFF_LUMA_LT4_P0_Q0	d3, d5, d7, d9, d17, q12, q13
+    vmax.s8	q8, q8, q15
+    vmin.s8	q8, q8, q14
+    vand.s8	q8, q8, q10
+    EXTRACT_DELTA_INTO_TWO_PART	q8, q9
+    vqadd.u8	q2, q2, q9
+    vqsub.u8	q2, q2, q8
+    vst1.u8	{q2}, [r2], r1
+    vqsub.u8	q3, q3, q9
+    vqadd.u8	q3, q3, q8
+    vst1.u8	{q3}, [r2]	, r1
+    vst1.u8	{q7}, [r2]
+
+    vpop	{q4-q7}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN DeblockLumaEq4V_neon
+    vpush	{q4-q7}
+
+    vdup.u8	q5, r2
+    vdup.u8	q4, r3
+
+    sub			r3, r0, r1, lsl #2
+    vld1.u8	{q8},  [r3], r1
+    vld1.u8	{q12}, [r0], r1
+    vld1.u8	{q9},  [r3], r1
+    vld1.u8	{q13}, [r0], r1
+    vld1.u8	{q10}, [r3], r1
+    vld1.u8	{q14}, [r0], r1
+    vld1.u8	{q11}, [r3]
+    vld1.u8	{q15}, [r0]
+    sub			r3, r3, r1	, lsl #1
+
+    MASK_MATRIX	q10, q11, q12, q13, q5, q4, q6
+
+    mov			r2, r2, lsr #2
+    add			r2, r2, #2
+    vdup.u8	q5, r2
+    vabd.u8	q0, q11, q12
+    vclt.u8	q7, q0, q5
+
+    vabd.u8	q1, q9, q11
+    vclt.u8	q1, q1, q4
+    vand.s8	q1, q1, q7
+
+    vabd.u8	q2, q14,q12
+    vclt.u8	q2, q2, q4
+    vand.s8	q2, q2, q7
+    vand.u8	q7, q7, q6
+
+    vmov		q3, q1
+
+    DIFF_LUMA_EQ4_P2P1P0		d16, d18, d20, d22, d24, d26, d2, d0
+    DIFF_LUMA_EQ4_P2P1P0		d17, d19, d21, d23, d25, d27, d3, d1
+
+    vand.u8	q3, q7, q3
+    DIFF_LUMA_EQ4_MASK	q0, q9, q3, q4
+    vst1.u8	{q4}, [r3], r1
+    DIFF_LUMA_EQ4_MASK	q8,q10, q3, q4
+    vst1.u8	{q4}, [r3], r1
+    DIFF_LUMA_EQ4_MASK	q1,q11, q6, q4
+    vst1.u8	{q4}, [r3], r1
+
+    vmov		q0, q2
+    DIFF_LUMA_EQ4_P2P1P0		d30, d28, d26, d24, d22, d20, d4, d6
+    DIFF_LUMA_EQ4_P2P1P0		d31, d29, d27, d25, d23, d21, d5, d7
+
+    vand.u8	q0, q7, q0
+    DIFF_LUMA_EQ4_MASK	q2,  q12, q6, q4
+    vst1.u8	{q4}, [r3], r1
+    DIFF_LUMA_EQ4_MASK	q15, q13, q0, q4
+    vst1.u8	{q4}, [r3], r1
+    DIFF_LUMA_EQ4_MASK	q3,  q14, q0, q4
+    vst1.u8	{q4}, [r3], r1
+
+    vpop	{q4-q7}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN DeblockLumaLt4H_neon
+    vpush	{q4-q7}
+
+    vdup.u8	q11, r2
+    vdup.u8	q9, r3
+
+    sub			r2, r0, #3
+    LOAD_LUMA_DATA_3		d0, d1, d2, d6, d7, d8, 0
+    LOAD_LUMA_DATA_3		d0, d1, d2, d6, d7, d8, 1
+    LOAD_LUMA_DATA_3		d0, d1, d2, d6, d7, d8, 2
+    LOAD_LUMA_DATA_3		d0, d1, d2, d6, d7, d8, 3
+    LOAD_LUMA_DATA_3		d0, d1, d2, d6, d7, d8, 4
+    LOAD_LUMA_DATA_3		d0, d1, d2, d6, d7, d8, 5
+    LOAD_LUMA_DATA_3		d0, d1, d2, d6, d7, d8, 6
+    LOAD_LUMA_DATA_3		d0, d1, d2, d6, d7, d8, 7
+
+    LOAD_LUMA_DATA_3		d3, d4, d5, d9, d10, d11, 0
+    LOAD_LUMA_DATA_3		d3, d4, d5, d9, d10, d11, 1
+    LOAD_LUMA_DATA_3		d3, d4, d5, d9, d10, d11, 2
+    LOAD_LUMA_DATA_3		d3, d4, d5, d9, d10, d11, 3
+    LOAD_LUMA_DATA_3		d3, d4, d5, d9, d10, d11, 4
+    LOAD_LUMA_DATA_3		d3, d4, d5, d9, d10, d11, 5
+    LOAD_LUMA_DATA_3		d3, d4, d5, d9, d10, d11, 6
+    LOAD_LUMA_DATA_3		d3, d4, d5, d9, d10, d11, 7
+
+    vswp		d1, d2
+    vswp		d3, d4
+    vswp		d1, d4
+    vswp		d7, d8
+    vswp		d9, d10
+    vswp		d7, d10
+
+    sub			r0, r0, r1, lsl #4
+
+    ldr			r3, [sp, #64]
+    vld1.s8	{d31}, [r3]
+    vdup.s8	d28, d31[0]
+    vdup.s8	d30, d31[1]
+    vdup.s8	d29, d31[2]
+    vdup.s8	d31, d31[3]
+    vtrn.32	d28, d30
+    vtrn.32	d29, d31
+    vcge.s8	q10, q14, #0
+
+    MASK_MATRIX	q1, q2, q3, q4, q11, q9, q15
+    vand.u8	q10, q10, q15
+
+    veor		q15, q15
+    vsub.i8	q15,q15,q14
+
+    DIFF_LUMA_LT4_P1_Q1	q0, q1, q2, q3, q9, q15, q14, q10, q6, q12
+    DIFF_LUMA_LT4_P1_Q1	q5, q4, q3, q2, q9, q15, q14, q10, q7, q13
+
+    vabs.s8	q12, q12
+    vabs.s8	q13, q13
+    vadd.u8	q14,q14,q12
+    vadd.u8	q14,q14,q13
+    veor		q15, q15
+    vsub.i8	q15,q15,q14
+
+    DIFF_LUMA_LT4_P0_Q0	d2, d4, d6, d8, d16, q12, q13
+    DIFF_LUMA_LT4_P0_Q0	d3, d5, d7, d9, d17, q12, q13
+    vmax.s8	q8, q8, q15
+    vmin.s8	q8, q8, q14
+    vand.s8	q8, q8, q10
+    EXTRACT_DELTA_INTO_TWO_PART	q8, q9
+    vqadd.u8	q2, q2, q9
+    vqsub.u8	q2, q2, q8
+
+    vqsub.u8	q3, q3, q9
+    vqadd.u8	q3, q3, q8
+
+    sub		r0, #2
+    add		r2, r0, r1
+    lsl		r1, #1
+
+    vmov		q1, q6
+    vmov		q4, q7
+
+    vswp		q2, q3
+    vswp		d3, d6
+    vswp		d5, d8
+
+    STORE_LUMA_DATA_4		d2, d3, d4, d5, 0, 1
+    STORE_LUMA_DATA_4		d2, d3, d4, d5, 2, 3
+    STORE_LUMA_DATA_4		d2, d3, d4, d5, 4, 5
+    STORE_LUMA_DATA_4		d2, d3, d4, d5, 6, 7
+
+    STORE_LUMA_DATA_4		d6, d7, d8, d9, 0, 1
+    STORE_LUMA_DATA_4		d6, d7, d8, d9, 2, 3
+    STORE_LUMA_DATA_4		d6, d7, d8, d9, 4, 5
+    STORE_LUMA_DATA_4		d6, d7, d8, d9, 6, 7
+
+    vpop	{q4-q7}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN DeblockLumaEq4H_neon
+    vpush	{q4-q7}
+    vdup.u8	q5, r2
+    vdup.u8	q4, r3
+
+    sub			r3, r0, #4				//	pix -= 4
+    LOAD_LUMA_DATA_4		d16,d17,d18,d19,d24,d25,d26,d27,0
+    LOAD_LUMA_DATA_4		d16,d17,d18,d19,d24,d25,d26,d27,1
+    LOAD_LUMA_DATA_4		d16,d17,d18,d19,d24,d25,d26,d27,2
+    LOAD_LUMA_DATA_4		d16,d17,d18,d19,d24,d25,d26,d27,3
+    LOAD_LUMA_DATA_4		d16,d17,d18,d19,d24,d25,d26,d27,4
+    LOAD_LUMA_DATA_4		d16,d17,d18,d19,d24,d25,d26,d27,5
+    LOAD_LUMA_DATA_4		d16,d17,d18,d19,d24,d25,d26,d27,6
+    LOAD_LUMA_DATA_4		d16,d17,d18,d19,d24,d25,d26,d27,7
+
+    LOAD_LUMA_DATA_4		d20,d21,d22,d23,d28,d29,d30,d31,0
+    LOAD_LUMA_DATA_4		d20,d21,d22,d23,d28,d29,d30,d31,1
+    LOAD_LUMA_DATA_4		d20,d21,d22,d23,d28,d29,d30,d31,2
+    LOAD_LUMA_DATA_4		d20,d21,d22,d23,d28,d29,d30,d31,3
+    LOAD_LUMA_DATA_4		d20,d21,d22,d23,d28,d29,d30,d31,4
+    LOAD_LUMA_DATA_4		d20,d21,d22,d23,d28,d29,d30,d31,5
+    LOAD_LUMA_DATA_4		d20,d21,d22,d23,d28,d29,d30,d31,6
+    LOAD_LUMA_DATA_4		d20,d21,d22,d23,d28,d29,d30,d31,7
+
+    vswp		q9, q10
+    vswp		d17,d18
+    vswp		d21,d22
+    vswp		q13,q14
+    vswp		d25,d26
+    vswp		d29,d30
+    sub			r0, r0, r1	, lsl #4
+
+    MASK_MATRIX	q10, q11, q12, q13, q5, q4, q6
+
+    mov			r2, r2, lsr #2
+    add			r2, r2, #2
+    vdup.u8	q5, r2
+    vabd.u8	q0, q11, q12
+    vclt.u8	q7, q0, q5
+
+    vabd.u8	q1, q9, q11
+    vclt.u8	q1, q1, q4
+    vand.s8	q1, q1, q7
+
+    vabd.u8	q2, q14,q12
+    vclt.u8	q2, q2, q4
+    vand.s8	q2, q2, q7
+    vand.u8	q7, q7, q6
+
+    vmov		q3, q1
+
+    DIFF_LUMA_EQ4_P2P1P0		d16, d18, d20, d22, d24, d26, d2, d0
+    DIFF_LUMA_EQ4_P2P1P0		d17, d19, d21, d23, d25, d27, d3, d1
+
+    vand.u8	q3, q7, q3
+    DIFF_LUMA_EQ4_MASK	q0, q9, q3, q4
+    vmov		q9, q4
+    vbsl.u8	q3, q8, q10
+    DIFF_LUMA_EQ4_MASK	q1,q11, q6, q8
+
+    vand.u8	q7, q7, q2
+
+    DIFF_LUMA_EQ4_P2P1P0		d30, d28, d26, d24, d22, d20, d4, d0
+    DIFF_LUMA_EQ4_P2P1P0		d31, d29, d27, d25, d23, d21, d5, d1
+
+    vbsl.u8	q6, q2, q12
+    DIFF_LUMA_EQ4_MASK	q15, q13, q7, q4
+
+    vbsl.u8	q7, q0, q14
+
+    vmov		q5, q6
+    vmov		q2, q9
+    vmov		q6, q4
+    vmov		q4, q8
+
+    vswp	d8, d6
+    vswp	d5, d7
+    vswp	d5, d8
+    vswp	d14, d12
+    vswp	d11, d13
+    vswp	d11, d14
+
+    sub		r3, r0, #3
+    STORE_LUMA_DATA_3		d4,d5,d6,d10,d11,d12,0
+    STORE_LUMA_DATA_3		d4,d5,d6,d10,d11,d12,1
+    STORE_LUMA_DATA_3		d4,d5,d6,d10,d11,d12,2
+    STORE_LUMA_DATA_3		d4,d5,d6,d10,d11,d12,3
+    STORE_LUMA_DATA_3		d4,d5,d6,d10,d11,d12,4
+    STORE_LUMA_DATA_3		d4,d5,d6,d10,d11,d12,5
+    STORE_LUMA_DATA_3		d4,d5,d6,d10,d11,d12,6
+    STORE_LUMA_DATA_3		d4,d5,d6,d10,d11,d12,7
+
+    STORE_LUMA_DATA_3		d7,d8,d9,d13,d14,d15,0
+    STORE_LUMA_DATA_3		d7,d8,d9,d13,d14,d15,1
+    STORE_LUMA_DATA_3		d7,d8,d9,d13,d14,d15,2
+    STORE_LUMA_DATA_3		d7,d8,d9,d13,d14,d15,3
+    STORE_LUMA_DATA_3		d7,d8,d9,d13,d14,d15,4
+    STORE_LUMA_DATA_3		d7,d8,d9,d13,d14,d15,5
+    STORE_LUMA_DATA_3		d7,d8,d9,d13,d14,d15,6
+    STORE_LUMA_DATA_3		d7,d8,d9,d13,d14,d15,7
+
+    vpop	{q4-q7}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN DeblockChromaLt4V_neon
+    vdup.u8	q11, r3
+    ldr			r3, [sp, #0]
+
+    sub			r0, r0, r2	, lsl #1
+    sub			r1, r1, r2, lsl #1
+    vdup.u8	    q9, r3
+    ldr			r3, [sp, #4]
+
+    vld1.u8	{d0}, [r0], r2
+    vld1.u8	{d1}, [r1], r2
+    vld1.u8	{d2}, [r0], r2
+    vld1.u8	{d3}, [r1], r2
+    vld1.u8	{d4}, [r0], r2
+    vld1.u8	{d5}, [r1], r2
+    vld1.u8	{d6}, [r0]
+    vld1.u8	{d7}, [r1]
+
+    sub			r0, r0, r2, lsl #1
+    sub			r1, r1, r2, lsl #1
+
+    vld1.s8	{d31}, [r3]
+    vmovl.u8	q14,d31
+    vshl.u64	d29,d28,#8
+    vorr		d28,d29
+    vmov		d29, d28
+    veor		q15, q15
+    vsub.i8	q15,q15,q14
+
+    MASK_MATRIX	q0, q1, q2, q3, q11, q9, q10
+
+    DIFF_LUMA_LT4_P0_Q0	d0, d2, d4, d6, d16, q12, q13
+    DIFF_LUMA_LT4_P0_Q0	d1, d3, d5, d7, d17, q12, q13
+    vmax.s8	q8, q8, q15
+    vmin.s8	q8, q8, q14
+
+    vand.s8	q8, q8, q10
+    vcge.s8	q14, q14, #0
+    vand.s8	q8, q8, q14
+    EXTRACT_DELTA_INTO_TWO_PART	q8, q10
+    vqadd.u8	q1, q1, q10
+    vqsub.u8	q1, q1, q8
+    vst1.u8	{d2}, [r0], r2
+    vst1.u8	{d3}, [r1], r2
+    vqsub.u8	q2, q2, q10
+    vqadd.u8	q2, q2, q8
+    vst1.u8	{d4}, [r0]
+    vst1.u8	{d5}, [r1]
+
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN DeblockChromaEq4V_neon
+    vpush	{q4-q5}
+
+    vdup.u8	q11, r3
+    ldr			r3, [sp, #32]
+
+    sub			r0, r0, r2	, lsl #1
+    sub			r1, r1, r2, lsl #1
+    vdup.u8	q9, r3
+    vld1.u8	{d0}, [r0], r2		//	q0::p1
+    vld1.u8	{d1}, [r1], r2
+    vld1.u8	{d2}, [r0], r2		//	q1::p0
+    vld1.u8	{d3}, [r1], r2
+    vld1.u8	{d4}, [r0], r2		//	q2::q0
+    vld1.u8	{d5}, [r1], r2
+    vld1.u8	{d6}, [r0]				//	q3::q1
+    vld1.u8	{d7}, [r1]
+
+    sub			r0, r0, r2, lsl #1	//	pix = [-1*src_stride]
+    sub			r1, r1, r2, lsl #1
+
+    MASK_MATRIX	q0, q1, q2, q3, q11, q9, q10
+
+    vmov			q11, q10
+
+    DIFF_CHROMA_EQ4_P0Q0		d0, d2, d4, d6, q4, q5, q8, d30, d0		// Cb::p0' q0'
+    DIFF_CHROMA_EQ4_P0Q0		d1, d3, d5, d7, q12, q13, q14, d31, d1	// Cr::p0' q0'
+
+    vbsl.u8	q10, q15, q1
+    vst1.u8	{d20}, [r0], r2
+    vst1.u8	{d21}, [r1], r2
+
+    vbsl.u8	q11, q0, q2
+    vst1.u8	{d22}, [r0]
+    vst1.u8	{d23}, [r1]
+
+    vpop	{q4-q5}
+WELS_ASM_FUNC_END
+
+WELS_ASM_FUNC_BEGIN DeblockChromaLt4H_neon
+
+    vdup.u8	q11, r3
+    ldr			r3, [sp, #0]
+
+    sub			r0, r0, #2
+    vdup.u8	q9, r3
+    ldr			r3, [sp, #4]
+    sub			r1, r1, #2
+    vld1.s8	{d31}, [r3]
+
+    LOAD_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 0
+    LOAD_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 1
+    LOAD_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 2
+    LOAD_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 3
+    LOAD_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 4
+    LOAD_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 5
+    LOAD_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 6
+    LOAD_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 7
+    vswp		q1, q2
+    vswp		d1, d2
+    vswp		d6, d5
+
+    vmovl.u8	q14, d31
+    vshl.u64	d29,d28,#8
+    vorr		d28,d29
+    vmov		d29, d28
+    veor		q15, q15
+    vsub.i8	q15,q15,q14
+
+    MASK_MATRIX	q0, q1, q2, q3, q11, q9, q10
+
+    DIFF_LUMA_LT4_P0_Q0	d0, d2, d4, d6, d16, q12, q13
+    DIFF_LUMA_LT4_P0_Q0	d1, d3, d5, d7, d17, q12, q13
+    vmax.s8	q8, q8, q15
+    vmin.s8	q8, q8, q14
+
+    vand.s8	q8, q8, q10
+    vcge.s8	q14, q14, #0
+    vand.s8	q8, q8, q14
+    EXTRACT_DELTA_INTO_TWO_PART	q8, q10
+    vqadd.u8	q1, q1, q10
+    vqsub.u8	q1, q1, q8
+    vqsub.u8	q2, q2, q10
+    vqadd.u8	q2, q2, q8
+
+    sub			r0, r0, r2, lsl #3
+    sub			r1, r1, r2, lsl #3
+    vswp		d1, d2
+    vswp		d6, d5
+    vswp		q1, q2
+
+    STORE_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 0
+    STORE_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 1
+    STORE_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 2
+    STORE_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 3
+    STORE_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 4
+    STORE_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 5
+    STORE_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 6
+    STORE_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 7
+
+WELS_ASM_FUNC_END
+
+WELS_ASM_FUNC_BEGIN DeblockChromaEq4H_neon
+    vpush	{q4-q5}
+    vdup.u8	q11, r3
+    ldr			r3, [sp, #32]
+
+    sub			r0, r0, #2
+    sub			r1, r1, #2
+
+    LOAD_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 0
+    LOAD_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 1
+    LOAD_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 2
+    LOAD_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 3
+    LOAD_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 4
+    LOAD_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 5
+    LOAD_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 6
+    LOAD_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 7
+    vswp		q1, q2
+    vswp		d1, d2
+    vswp		d6, d5
+
+    vdup.u8	q9, r3
+    MASK_MATRIX	q0, q1, q2, q3, q11, q9, q10
+    vmov			q11, q10
+
+    DIFF_CHROMA_EQ4_P0Q0		d0, d2, d4, d6, q8, q9, q12, d8, d10
+    DIFF_CHROMA_EQ4_P0Q0		d1, d3, d5, d7, q13, q14, q15, d9, d11
+
+    vbsl.u8	q10, q4, q1
+    vbsl.u8	q11, q5, q2
+    sub			r0, r0, r2, lsl #3	//	pix: 0th row	[-2]
+    sub			r1, r1, r2, lsl #3
+
+    vmov		q1, q10
+    vmov		q2, q11
+    vswp		d1, d2
+    vswp		d6, d5
+    vswp		q1, q2
+    //	Cb:d0d1d2d3, Cr:d4d5d6d7
+    STORE_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 0
+    STORE_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 1
+    STORE_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 2
+    STORE_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 3
+    STORE_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 4
+    STORE_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 5
+    STORE_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 6
+    STORE_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 7
+
+    vpop	{q4-q5}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN WelsNonZeroCount_neon
+
+    vld1.64	{d0-d2}, [r0]
+
+    vceq.s8	q0, q0, #0
+    vceq.s8	d2, d2, #0
+    vmvn	q0, q0
+    vmvn	d2, d2
+    vabs.s8	q0, q0
+    vabs.s8	d2, d2
+
+    vst1.64	{d0-d2}, [r0]
+WELS_ASM_FUNC_END
+
+#ifdef __APPLE__
+.macro BS_NZC_CHECK
+    vld1.8   {d0,d1}, [$0]
+    /* Arrenge the input data --- TOP */
+	ands     r6, $1, #2
+	beq      bs_nzc_check_jump0
+
+    sub      r6, $0, $2, lsl #4
+	sub      r6, $2, lsl #3
+    add      r6, #12
+    vld1.32  d3[1], [r6]
+
+bs_nzc_check_jump0:
+    vext.8   q1, q1, q0, #12
+	vadd.u8  $3, q0, q1
+
+
+    /* Arrenge the input data --- LEFT */
+	ands     r6, $1, #1
+	beq      bs_nzc_check_jump1
+
+    sub      r6, $0, #21
+	add      r7, r6, #4
+    vld1.8   d3[4], [r6]
+	add      r6, r7, #4
+    vld1.8   d3[5], [r7]
+	add      r7, r6, #4
+    vld1.8   d3[6], [r6]
+    vld1.8   d3[7], [r7]
+
+bs_nzc_check_jump1:
+	vzip.8   d0, d1
+	vzip.8   d0, d1
+    vext.8   q1, q1, q0, #12
+	vadd.u8  $4, q0, q1
+.endm
+
+.macro BS_COMPARE_MV //in: $0,$1(const),$2(const),$3(const),$4(const); out:$5, $6
+    mov       r6, #4
+    vabd.s16  q8, $0, $1
+    vabd.s16  q9, $1, $2
+	vdup.s16  $0, r6
+    vabd.s16  q10, $2, $3
+    vabd.s16  q11, $3, $4
+
+    vcge.s16  q8, $0
+    vcge.s16  q9, $0
+    vcge.s16  q10, $0
+    vcge.s16  q11, $0
+
+	vpadd.i16 d16, d16, d17
+    vpadd.i16 d17, d18, d19
+    vpadd.i16 d18, d20, d21
+    vpadd.i16 d19, d22, d23
+
+    vaddhn.i16  $5, q8, q8
+    vaddhn.i16  $6, q9, q9
+.endm
+
+.macro BS_MV_CHECK
+    vldm   $0, {q0,q1,q2,q3}
+
+    /* Arrenge the input data --- TOP */
+	ands     r6, $1, #2
+	beq      bs_mv_check_jump0
+
+    sub      r6, $0, $2, lsl #6
+    add      r6, #48
+    vld1.8   {d8, d9}, [r6]
+
+bs_mv_check_jump0:
+    BS_COMPARE_MV  q4, q0, q1, q2, q3, $3, $4
+
+    /* Arrenge the input data --- LEFT */
+	ands     r6, $1, #1
+	beq      bs_mv_check_jump1
+
+    sub      r6, $0, #52
+    add      r7, r6, #16
+	vld1.32   d8[0], [r6]
+	add      r6, r7, #16
+    vld1.32   d8[1], [r7]
+	add      r7, r6, #16
+    vld1.32   d9[0], [r6]
+    vld1.32   d9[1], [r7]
+
+bs_mv_check_jump1:
+	vzip.32   q0, q2
+	vzip.32   q1, q3
+	vzip.32   q0, q1
+    vzip.32   q2, q3
+    BS_COMPARE_MV  q4, q0, q1, q2, q3, $5, $6
+.endm
+#else
+.macro BS_NZC_CHECK  arg0, arg1, arg2, arg3, arg4
+    vld1.8   {d0,d1}, [\arg0]
+    /* Arrenge the input data --- TOP */
+    ands     r6, \arg1, #2
+    beq      bs_nzc_check_jump0
+
+    sub      r6, \arg0, \arg2, lsl #4
+    sub      r6, r6, \arg2, lsl #3
+    add      r6, #12
+    vld1.32  d3[1], [r6]
+
+bs_nzc_check_jump0:
+    vext.8   q1, q1, q0, #12
+    vadd.u8  \arg3, q0, q1
+
+
+    /* Arrenge the input data --- LEFT */
+    ands     r6, \arg1, #1
+    beq      bs_nzc_check_jump1
+
+    sub      r6, \arg0, #21
+    add      r7, r6, #4
+    vld1.8   d3[4], [r6]
+    add      r6, r7, #4
+    vld1.8   d3[5], [r7]
+    add      r7, r6, #4
+    vld1.8   d3[6], [r6]
+    vld1.8   d3[7], [r7]
+
+bs_nzc_check_jump1:
+    vzip.8   d0, d1
+    vzip.8   d0, d1
+    vext.8   q1, q1, q0, #12
+    vadd.u8  \arg4, q0, q1
+.endm
+
+.macro BS_COMPARE_MV  arg0, arg1, arg2, arg3, arg4, arg5, arg6 //in: $0,$1(const),$2(const),$3(const),$4(const); out:$5, $6
+    mov       r6, #4
+    vabd.s16  q8, \arg0, \arg1
+    vabd.s16  q9, \arg1, \arg2
+    vdup.s16  \arg0, r6
+    vabd.s16  q10, \arg2, \arg3
+    vabd.s16  q11, \arg3, \arg4
+
+    vcge.s16  q8, \arg0
+    vcge.s16  q9, \arg0
+    vcge.s16  q10, \arg0
+    vcge.s16  q11, \arg0
+
+    vpadd.i16 d16, d16, d17
+    vpadd.i16 d17, d18, d19
+    vpadd.i16 d18, d20, d21
+    vpadd.i16 d19, d22, d23
+
+    vaddhn.i16  \arg5, q8, q8
+    vaddhn.i16  \arg6, q9, q9
+.endm
+
+.macro BS_MV_CHECK  arg0, arg1, arg2, arg3, arg4, arg5, arg6
+    vldm   \arg0, {q0,q1,q2,q3}
+
+    /* Arrenge the input data --- TOP */
+    ands     r6, \arg1, #2
+    beq      bs_mv_check_jump0
+
+    sub      r6, \arg0, \arg2, lsl #6
+    add      r6, #48
+    vld1.8   {d8, d9}, [r6]
+
+bs_mv_check_jump0:
+    BS_COMPARE_MV  q4, q0, q1, q2, q3, \arg3, \arg4
+
+    /* Arrenge the input data --- LEFT */
+    ands     r6, \arg1, #1
+    beq      bs_mv_check_jump1
+
+    sub      r6, \arg0, #52
+    add      r7, r6, #16
+    vld1.32   d8[0], [r6]
+    add      r6, r7, #16
+    vld1.32   d8[1], [r7]
+    add      r7, r6, #16
+    vld1.32   d9[0], [r6]
+    vld1.32   d9[1], [r7]
+
+bs_mv_check_jump1:
+    vzip.32   q0, q2
+    vzip.32   q1, q3
+    vzip.32   q0, q1
+    vzip.32   q2, q3
+    BS_COMPARE_MV  q4, q0, q1, q2, q3, \arg5, \arg6
+.endm
+#endif
+
+
+WELS_ASM_FUNC_BEGIN DeblockingBSCalcEnc_neon
+
+	stmdb sp!, {r5-r7}
+	vpush {q4}
+
+	ldr  r5, [sp, #28]	//Save BS to r5
+
+	/* Checking the nzc status */
+	BS_NZC_CHECK r0, r2, r3, q14, q15 //q14,q15 save the nzc status
+
+	/* For checking bS[I] = 2 */
+	mov      r6, #2
+	vcgt.s8  q14, q14, #0
+	vdup.u8  q0, r6
+	vcgt.s8  q15, q15, #0
+
+	vand.u8  q14, q14, q0 //q14 save the nzc check result all the time --- for dir is top
+	vand.u8  q15, q15, q0 //q15 save the nzc check result all the time --- for dir is left
+
+	/* Checking the mv status*/
+	BS_MV_CHECK r1, r2, r3, d24, d25, d26, d27//q12, q13 save the mv status
+
+	/* For checking bS[I] = 1 */
+    mov      r6, #1
+	vdup.u8  q0, r6
+
+	vand.u8  q12, q12, q0 //q12 save the nzc check result all the time --- for dir is top
+	vand.u8  q13, q13, q0 //q13 save the nzc check result all the time --- for dir is left
+
+
+	/* Check bS[I] is '1' or '2' */
+	vmax.u8 q1, q12, q14
+	vmax.u8 q0, q13, q15
+
+	//vstm r5, {q0, q1}
+    vst1.32 {q0, q1}, [r5]
+	vpop {q4}
+	ldmia sp!, {r5-r7}
+WELS_ASM_FUNC_END
+#endif
--- /dev/null
+++ b/codec/common/arm/expand_picture_neon.S
@@ -1,0 +1,137 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifdef HAVE_NEON
+.text
+#include "arm_arch_common_macro.S"
+
+
+WELS_ASM_FUNC_BEGIN ExpandPictureLuma_neon
+    stmdb sp!, {r4-r8}
+	//Save the dst
+	mov r7, r0
+	mov r8, r3
+
+	add r4, r7, r2
+	sub r4, #1
+    //For the left and right expand
+_expand_picture_luma_loop2:
+	sub r5, r7, #32
+	add r6, r4, #1
+
+	vld1.8 {d0[], d1[]}, [r7], r1
+	vld1.8 {d2[], d3[]}, [r4], r1
+
+	vst1.8 {q0}, [r5]!
+	vst1.8 {q0}, [r5]
+	vst1.8 {q1}, [r6]!
+	vst1.8 {q1}, [r6]
+	subs r8, #1
+	bne	_expand_picture_luma_loop2
+
+	//for the top and bottom expand
+	add r2, #64
+	sub r0, #32
+	mla r4, r1, r3, r0
+	sub r4, r1
+_expand_picture_luma_loop0:
+	mov r5, #32
+    mls r5, r5, r1, r0
+	add r6, r4, r1
+	vld1.8 {q0}, [r0]!
+	vld1.8 {q1}, [r4]!
+
+	mov r8, #32
+_expand_picture_luma_loop1:
+	vst1.8 {q0}, [r5], r1
+	vst1.8 {q1}, [r6], r1
+	subs r8, #1
+    bne _expand_picture_luma_loop1
+
+	subs r2, #16
+	bne	_expand_picture_luma_loop0
+
+    //vldreq.32 d0, [r0]
+
+	ldmia sp!, {r4-r8}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN ExpandPictureChroma_neon
+    stmdb sp!, {r4-r8}
+	//Save the dst
+	mov r7, r0
+	mov r8, r3
+
+	add r4, r7, r2
+	sub r4, #1
+    //For the left and right expand
+_expand_picture_chroma_loop2:
+	sub r5, r7, #16
+	add r6, r4, #1
+
+	vld1.8 {d0[], d1[]}, [r7], r1
+	vld1.8 {d2[], d3[]}, [r4], r1
+
+	vst1.8 {q0}, [r5]
+	vst1.8 {q1}, [r6]
+	subs r8, #1
+	bne	_expand_picture_chroma_loop2
+
+	//for the top and bottom expand
+	add r2, #32
+	sub r0, #16
+	mla r4, r1, r3, r0
+	sub r4, r1
+_expand_picture_chroma_loop0:
+	mov r5, #16
+    mls r5, r5, r1, r0
+	add r6, r4, r1
+	vld1.8 {q0}, [r0]!
+	vld1.8 {q1}, [r4]!
+
+	mov r8, #16
+_expand_picture_chroma_loop1:
+	vst1.8 {q0}, [r5], r1
+	vst1.8 {q1}, [r6], r1
+	subs r8, #1
+    bne _expand_picture_chroma_loop1
+
+	subs r2, #16
+	bne	_expand_picture_chroma_loop0
+
+    //vldreq.32 d0, [r0]
+
+	ldmia sp!, {r4-r8}
+WELS_ASM_FUNC_END
+
+#endif
--- /dev/null
+++ b/codec/common/arm/mc_neon.S
@@ -1,0 +1,2210 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifdef  HAVE_NEON
+.text
+#include "arm_arch_common_macro.S"
+
+#ifdef __APPLE__
+.macro	AVERAGE_TWO_8BITS
+//	{	// input:dst_d, src_d A and B; working: q13
+    vaddl.u8	q13, $2, $1
+    vrshrn.u16		$0, q13, #1
+//	}
+.endm
+
+.macro	FILTER_6TAG_8BITS
+//	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
+    vaddl.u8	q12, $0, $5	//q12=src[-2]+src[3]
+    vaddl.u8	q13, $2, $3	//src[0]+src[1]
+    vmla.u16	q12, q13, $7	//q12 += 20*(src[0]+src[1]), 2 cycles
+    vaddl.u8	q13, $1, $4	//src[-1]+src[2]
+    vmls.s16	q12, q13, $8	//q12 -= 5*(src[-1]+src[2]), 2 cycles
+    vqrshrun.s16		$6, q12, #5
+//	}
+.endm
+
+.macro	FILTER_SINGLE_TAG_8BITS		// when width=17/9, used
+//	{	// input: src_d{Y[0][1][2][3][4][5]X, the even of working_q2},
+    vrev64.8	$2, $0				// X[5][4][3][2][1][0]O
+    vaddl.u8	$3, $0, $2			// each 16bits, *[50][41][32][23][14][05]*
+    vmul.s16	$0, $2, $1			// 0+1*[50]-5*[41]+20[32]
+    vpadd.s16	$0, $0, $0
+    vpadd.s16	$0, $0, $0
+    vqrshrun.s16	$0, $4, #5
+//	}
+.endm
+
+.macro	FILTER_6TAG_8BITS_AVERAGE_WITH_0
+//	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
+    vaddl.u8	q12, $0, $5	//q12=src[-2]+src[3]
+    vaddl.u8	q13, $2, $3	//src[0]+src[1]
+    vmla.u16	q12, q13, $7	//q12 += 20*(src[0]+src[1]), 2 cycles
+    vaddl.u8	q13, $1, $4	//src[-1]+src[2]
+    vmls.s16	q12, q13, $8	//q12 -= 5*(src[-1]+src[2]), 2 cycles
+    vqrshrun.s16		$6, q12, #5
+    vaddl.u8	q13, $2, $6
+    vrshrn.u16		$6, q13, #1
+//	}
+.endm
+
+.macro	FILTER_6TAG_8BITS_AVERAGE_WITH_1
+//	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
+    vaddl.u8	q12, $0, $5	//q12=src[-2]+src[3]
+    vaddl.u8	q13, $2, $3	//src[0]+src[1]
+    vmla.u16	q12, q13, $7	//q12 += 20*(src[0]+src[1]), 2 cycles
+    vaddl.u8	q13, $1, $4	//src[-1]+src[2]
+    vmls.s16	q12, q13, $8	//q12 -= 5*(src[-1]+src[2]), 2 cycles
+    vqrshrun.s16		$6, q12, #5
+    vaddl.u8	q13, $3, $6
+    vrshrn.u16		$6, q13, #1
+//	}
+.endm
+
+.macro	FILTER_6TAG_8BITS_TO_16BITS
+//	{	// input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:q13
+    vaddl.u8	$6, $0, $5		//dst_q=src[-2]+src[3]
+    vaddl.u8	q13, $2, $3	//src[0]+src[1]
+    vmla.u16	$6, q13, $7	//dst_q += 20*(src[0]+src[1]), 2 cycles
+    vaddl.u8	q13, $1, $4	//src[-1]+src[2]
+    vmls.s16	$6, q13, $8	//dst_q -= 5*(src[-1]+src[2]), 2 cycles
+//	}
+.endm
+
+.macro	FILTER_3_IN_16BITS_TO_8BITS
+//	{	// input:a, b, c, dst_d;
+    vsub.s16	$0, $0, $1			//a-b
+    vshr.s16	$0, $0, #2			//(a-b)/4
+    vsub.s16	$0, $0, $1			//(a-b)/4-b
+    vadd.s16	$0, $0, $2			//(a-b)/4-b+c
+    vshr.s16	$0, $0, #2			//((a-b)/4-b+c)/4
+    vadd.s16	$0, $0, $2			//((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+    vqrshrun.s16	$3, $0, #6		//(+32)>>6
+//	}
+.endm
+
+.macro	UNPACK_2_16BITS_TO_ABC
+//	{	// input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;
+    vext.16	$4, $0, $1, #2		//src[0]
+    vext.16	$3, $0, $1, #3		//src[1]
+    vadd.s16	$4, $3					//c=src[0]+src[1]
+
+    vext.16	$3, $0, $1, #1		//src[-1]
+    vext.16	$2, $0, $1, #4		//src[2]
+    vadd.s16	$3, $2					//b=src[-1]+src[2]
+
+    vext.16	$2, $0, $1, #5		//src[3]
+    vadd.s16	$2, $0					//a=src[-2]+src[3]
+//	}
+.endm
+
+.macro	UNPACK_1_IN_8x16BITS_TO_8BITS
+//	{	// each 16bits; input: d_dst, d_src[0:3] (even), d_src[4:5]+%% (odd)
+    vext.16	$3, $3, $3, #7	// 0x????, [0][1][2][3][4][5],
+    vrev64.16	$1, $1
+    vadd.u16	$2, $1				// C[2+3],B[1+4],A[0+5],
+    vshr.s64	$1, $2, #16
+    vshr.s64	$0, $2, #32		// Output: C $2, B $1, A $0
+
+    vsub.s16	$0, $0, $1			//a-b
+    vshr.s16	$0, $0, #2			//(a-b)/4
+    vsub.s16	$0, $0, $1			//(a-b)/4-b
+    vadd.s16	$0, $0, $2			//(a-b)/4-b+c
+    vshr.s16	$0, $0, #2			//((a-b)/4-b+c)/4
+    vadd.s16	$1, $0, $2			//((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+    vqrshrun.s16	$0, $3, #6		//(+32)>>6
+//	}
+.endm
+#else
+.macro	AVERAGE_TWO_8BITS arg0, arg1, arg2
+//	{	// input:dst_d, src_d A and B; working: q13
+    vaddl.u8	q13, \arg2, \arg1
+    vrshrn.u16		\arg0, q13, #1
+//	}
+.endm
+
+.macro	FILTER_6TAG_8BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+//	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
+    vaddl.u8	q12, \arg0, \arg5	//q12=src[-2]+src[3]
+    vaddl.u8	q13, \arg2, \arg3	//src[0]+src[1]
+    vmla.u16	q12, q13, \arg7	//q12 += 20*(src[0]+src[1]), 2 cycles
+    vaddl.u8	q13, \arg1, \arg4	//src[-1]+src[2]
+    vmls.s16	q12, q13, \arg8	//q12 -= 5*(src[-1]+src[2]), 2 cycles
+    vqrshrun.s16		\arg6, q12, #5
+//	}
+.endm
+
+.macro	FILTER_SINGLE_TAG_8BITS arg0, arg1,arg2, arg3, arg4,arg5		// when width=17/9, used
+//	{	// input: src_d{Y[0][1][2][3][4][5]X, the even of working_q2}
+    vrev64.8	\arg2, \arg0				// X[5][4][3][2][1][0]O
+    vaddl.u8	\arg3, \arg0, \arg2			// each 16bits, *[50][41][32][23][14][05]*
+    vmul.s16	\arg0, \arg2, \arg1			// 0+1*[50]-5*[41]+20[32]
+    vpadd.s16	\arg0, \arg0, \arg0
+    vpadd.s16	\arg0, \arg0, \arg0
+    vqrshrun.s16	\arg0, \arg4, #5
+//	}
+.endm
+
+.macro	FILTER_6TAG_8BITS_AVERAGE_WITH_0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+//	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
+    vaddl.u8	q12, \arg0, \arg5	//q12=src[-2]+src[3]
+    vaddl.u8	q13, \arg2, \arg3	//src[0]+src[1]
+    vmla.u16	q12, q13, \arg7	//q12 += 20*(src[0]+src[1]), 2 cycles
+    vaddl.u8	q13, \arg1, \arg4	//src[-1]+src[2]
+    vmls.s16	q12, q13, \arg8	//q12 -= 5*(src[-1]+src[2]), 2 cycles
+    vqrshrun.s16		\arg6, q12, #5
+    vaddl.u8	q13, \arg2, \arg6
+    vrshrn.u16		\arg6, q13, #1
+//	}
+.endm
+
+.macro	FILTER_6TAG_8BITS_AVERAGE_WITH_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+//	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
+    vaddl.u8	q12, \arg0, \arg5	//q12=src[-2]+src[3]
+    vaddl.u8	q13, \arg2, \arg3	//src[0]+src[1]
+    vmla.u16	q12, q13, \arg7	//q12 += 20*(src[0]+src[1]), 2 cycles
+    vaddl.u8	q13, \arg1, \arg4	//src[-1]+src[2]
+    vmls.s16	q12, q13, \arg8	//q12 -= 5*(src[-1]+src[2]), 2 cycles
+    vqrshrun.s16		\arg6, q12, #5
+    vaddl.u8	q13, \arg3, \arg6
+    vrshrn.u16		\arg6, q13, #1
+//	}
+.endm
+
+.macro	FILTER_6TAG_8BITS_TO_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
+//	{	// input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:q13
+    vaddl.u8	\arg6, \arg0, \arg5		//dst_q=src[-2]+src[3]
+    vaddl.u8	q13, \arg2, \arg3	//src[0]+src[1]
+    vmla.u16	\arg6, q13, \arg7	//dst_q += 20*(src[0]+src[1]), 2 cycles
+    vaddl.u8	q13, \arg1, \arg4	//src[-1]+src[2]
+    vmls.s16	\arg6, q13, \arg8	//dst_q -= 5*(src[-1]+src[2]), 2 cycles
+//	}
+.endm
+
+.macro	FILTER_3_IN_16BITS_TO_8BITS arg0, arg1, arg2, arg3
+//	{	// input:a, b, c, dst_d;
+    vsub.s16	\arg0, \arg0, \arg1			//a-b
+    vshr.s16	\arg0, \arg0, #2			//(a-b)/4
+    vsub.s16	\arg0, \arg0, \arg1			//(a-b)/4-b
+    vadd.s16	\arg0, \arg0, \arg2			//(a-b)/4-b+c
+    vshr.s16	\arg0, \arg0, #2			//((a-b)/4-b+c)/4
+    vadd.s16	\arg0, \arg0, \arg2			//((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+    vqrshrun.s16	\arg3, \arg0, #6		//(+32)>>6
+//	}
+.endm
+
+.macro	UNPACK_2_16BITS_TO_ABC arg0, arg1, arg2, arg3, arg4
+//	{	// input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;
+    vext.16	\arg4, \arg0, \arg1, #2		//src[0]
+    vext.16	\arg3, \arg0, \arg1, #3		//src[1]
+    vadd.s16	\arg4, \arg3					//c=src[0]+src[1]
+
+    vext.16	\arg3, \arg0, \arg1, #1		//src[-1]
+    vext.16	\arg2, \arg0, \arg1, #4		//src[2]
+    vadd.s16	\arg3,\arg2					//b=src[-1]+src[2]
+
+    vext.16	\arg2, \arg0, \arg1, #5		//src[3]
+    vadd.s16	\arg2, \arg0					//a=src[-2]+src[3]
+//	}
+.endm
+
+// Single-pixel variant: folds one 6-element 16-bit window into a/b/c by
+// reversing the odd half and pairing symmetric taps (C=src[2]+src[3],
+// B=src[1]+src[4], A=src[0]+src[5]), then applies the same (a-5b+20c)/16
+// arithmetic as FILTER_3_IN_16BITS_TO_8BITS.
+// NOTE(review): the last two lines write the combined sum into \arg1 but
+// narrow from \arg3 — ordering differs from FILTER_3_IN_16BITS_TO_8BITS and
+// appears to rely on \arg1/\arg3 aliasing at the call site; confirm against
+// the (not shown) callers before touching.
+.macro	UNPACK_1_IN_8x16BITS_TO_8BITS arg0, arg1,arg2, arg3
+//	{	// each 16bits; input: d_dst, d_src[0:3] (even), d_src[4:5]+%% (odd)
+    vext.16	\arg3, \arg3, \arg3, #7	// 0x????, [0][1][2][3][4][5]
+    vrev64.16	\arg1, \arg1
+    vadd.u16	\arg2, \arg1				// C[2+3],B[1+4],A[0+5]
+    vshr.s64	\arg1, \arg2, #16
+    vshr.s64	\arg0, \arg2, #32		// Output: C \arg2, B \arg1, A \arg0
+
+    vsub.s16	\arg0, \arg0, \arg1			//a-b
+    vshr.s16	\arg0, \arg0, #2			//(a-b)/4
+    vsub.s16	\arg0, \arg0, \arg1			//(a-b)/4-b
+    vadd.s16	\arg0, \arg0, \arg2			//(a-b)/4-b+c
+    vshr.s16	\arg0, \arg0, #2			//((a-b)/4-b+c)/4
+    vadd.s16	\arg1, \arg0, \arg2			//((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+    vqrshrun.s16	\arg0, \arg3, #6		//(+32)>>6
+//	}
+.endm
+#endif
+
+// Horizontal-only 6-tap luma MC (half-pel '20' position), width 16.
+// AAPCS args: r0=src, r1=src_stride, r2=dst, r3=dst_stride; 5th arg on the
+// stack is the row count (presumably iHeight), loaded into r4 and decremented
+// once per row. q14/q15 hold the filter multipliers 20 and 5.
+WELS_ASM_FUNC_BEGIN McHorVer20WidthEq16_neon
+	push		{r4}
+	ldr			r4, [sp, #4]
+
+	sub			r0, #2
+	vmov.u16	q14, #0x0014				// 20
+	vshr.u16	q15, q14, #2				// 5
+
+w16_h_mc_luma_loop:
+	vld1.u8	{d0,d1,d2}, [r0], r1	//only use 21(16+5); q0=src[-2]
+	pld			[r0]
+	pld			[r0, #16]
+
+	vext.8		q2, q0, q1, #1		//q2=src[-1]
+	vext.8		q3, q0, q1, #2		//q3=src[0]
+	vext.8		q8, q0, q1, #3		//q8=src[1]
+	vext.8		q9, q0, q1, #4		//q9=src[2]
+	vext.8		q10, q0, q1, #5		//q10=src[3]
+
+	FILTER_6TAG_8BITS 	d0, d4, d6, d16, d18, d20, d2, q14, q15
+
+	FILTER_6TAG_8BITS 	d1, d5, d7, d17, d19, d21, d3, q14, q15
+
+	sub		r4, #1
+	vst1.u8	{d2, d3}, [r2], r3		//write 16Byte
+
+	cmp		r4, #0
+	bne		w16_h_mc_luma_loop
+	pop		{r4}
+WELS_ASM_FUNC_END
+
+
+// Same horizontal half-pel filter, width 8; one row per iteration.
+WELS_ASM_FUNC_BEGIN McHorVer20WidthEq8_neon
+	push		{r4}
+	ldr			r4, [sp, #4]
+
+	sub			r0, #2
+	vmov.u16	q14, #0x0014				// 20
+	vshr.u16	q15, q14, #2				// 5
+
+w8_h_mc_luma_loop:
+	vld1.u8	{d0,d1}, [r0], r1	//only use 13(8+5); q0=src[-2]
+	pld			[r0]
+
+	vext.8		d2, d0, d1, #1		//d2=src[-1]
+	vext.8		d3, d0, d1, #2		//d3=src[0]
+	vext.8		d4, d0, d1, #3		//d4=src[1]
+	vext.8		d5, d0, d1, #4		//d5=src[2]
+	vext.8		d6, d0, d1, #5		//d6=src[3]
+
+	FILTER_6TAG_8BITS 	d0, d2, d3, d4, d5, d6, d1, q14, q15
+
+	sub		r4, #1
+	vst1.u8	{d1}, [r2], r3
+
+	cmp		r4, #0
+	bne		w8_h_mc_luma_loop
+	pop		{r4}
+WELS_ASM_FUNC_END
+
+
+// Same horizontal half-pel filter, width 4; two rows per iteration, the two
+// rows are transposed/interleaved so one FILTER_6TAG_8BITS call produces both
+// 4-byte outputs (written with scalar str).
+WELS_ASM_FUNC_BEGIN McHorVer20WidthEq4_neon
+	push		{r4, r5, r6}
+	ldr			r6, [sp, #12]
+
+	sub			r0, #2
+	vmov.u16	q14, #0x0014				// 20
+	vshr.u16	q15, q14, #2				// 5
+
+w4_h_mc_luma_loop:
+	vld1.u8	{d0, d1}, [r0], r1	//only use 9(4+5);d0: 1st row src[-2:5]
+	pld			[r0]
+	vld1.u8	{d2, d3}, [r0], r1	//d2: 2nd row src[-2:5]
+	pld			[r0]
+
+	vext.8		d4, d0, d1, #1		//d4: 1st row src[-1:6]
+	vext.8		d5, d2, d3, #1		//d5: 2nd row src[-1:6]
+	vext.8		q3, q2, q2, #1		//src[0:6 *]
+	vext.8		q8, q2, q2, #2		//src[1:6 * *]
+
+	vtrn.32	q3, q8					//q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4]
+	vtrn.32	d6, d7					//d6:[0:3]; d7[1:4]
+	vtrn.32		d0, d2				//d0:[-2:1]; d2[2:5]
+	vtrn.32		d4, d5				//d4:[-1:2]; d5[3:6]
+
+	FILTER_6TAG_8BITS 	d0, d4, d6, d7, d2, d5, d1, q14, q15
+
+	vmov		r4, r5, d1
+	str	r4, [r2], r3
+	str	r5, [r2], r3
+
+	sub		r6, #2
+	cmp		r6, #0
+	bne		w4_h_mc_luma_loop
+
+	pop		{r4, r5, r6}
+WELS_ASM_FUNC_END
+
+
+// Horizontal quarter-pel '10' position, width 16. Identical structure to
+// McHorVer20WidthEq16_neon, but uses FILTER_6TAG_8BITS_AVERAGE_WITH_0
+// (defined earlier in the file, not shown in this chunk; per its name it
+// additionally averages the half-pel result with src[0] — TODO confirm
+// against the macro definition).
+WELS_ASM_FUNC_BEGIN McHorVer10WidthEq16_neon
+	push		{r4}
+	ldr			r4, [sp, #4]
+
+	sub			r0, #2
+	vmov.u16	q14, #0x0014				// 20
+	vshr.u16	q15, q14, #2				// 5
+
+w16_xy_10_mc_luma_loop:
+	vld1.u8	{d0,d1,d2}, [r0], r1	//only use 21(16+5); q0=src[-2]
+	pld			[r0]
+	pld			[r0, #16]
+
+	vext.8		q2, q0, q1, #1		//q2=src[-1]
+	vext.8		q3, q0, q1, #2		//q3=src[0]
+	vext.8		q8, q0, q1, #3		//q8=src[1]
+	vext.8		q9, q0, q1, #4		//q9=src[2]
+	vext.8		q10, q0, q1, #5		//q10=src[3]
+
+	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d0, d4, d6, d16, d18, d20, d2, q14, q15
+
+	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d1, d5, d7, d17, d19, d21, d3, q14, q15
+
+	sub		r4, #1
+	vst1.u8	{d2, d3}, [r2], r3		//write 16Byte
+
+	cmp		r4, #0
+	bne		w16_xy_10_mc_luma_loop
+	pop		{r4}
+WELS_ASM_FUNC_END
+
+
+// Quarter-pel '10' position, width 8.
+WELS_ASM_FUNC_BEGIN McHorVer10WidthEq8_neon
+	push		{r4}
+	ldr			r4, [sp, #4]
+
+	sub			r0, #2
+	vmov.u16	q14, #0x0014				// 20
+	vshr.u16	q15, q14, #2				// 5
+
+w8_xy_10_mc_luma_loop:
+	vld1.u8	{d0,d1}, [r0], r1	//only use 13(8+5); q0=src[-2]
+	pld			[r0]
+
+	vext.8		d2, d0, d1, #1		//d2=src[-1]
+	vext.8		d3, d0, d1, #2		//d3=src[0]
+	vext.8		d4, d0, d1, #3		//d4=src[1]
+	vext.8		d5, d0, d1, #4		//d5=src[2]
+	vext.8		d6, d0, d1, #5		//d6=src[3]
+
+	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d0, d2, d3, d4, d5, d6, d1, q14, q15
+
+	sub		r4, #1
+	vst1.u8	{d1}, [r2], r3
+
+	cmp		r4, #0
+	bne		w8_xy_10_mc_luma_loop
+	pop		{r4}
+WELS_ASM_FUNC_END
+
+
+// Quarter-pel '10' position, width 4; two rows per iteration (same row
+// interleave trick as McHorVer20WidthEq4_neon).
+WELS_ASM_FUNC_BEGIN McHorVer10WidthEq4_neon
+	push		{r4, r5, r6}
+	ldr			r6, [sp, #12]
+
+	sub			r0, #2
+	vmov.u16	q14, #0x0014				// 20
+	vshr.u16	q15, q14, #2				// 5
+
+w4_xy_10_mc_luma_loop:
+	vld1.u8	{d0, d1}, [r0], r1	//only use 9(4+5);d0: 1st row src[-2:5]
+	pld			[r0]
+	vld1.u8	{d2, d3}, [r0], r1	//d2: 2nd row src[-2:5]
+	pld			[r0]
+
+	vext.8		d4, d0, d1, #1		//d4: 1st row src[-1:6]
+	vext.8		d5, d2, d3, #1		//d5: 2nd row src[-1:6]
+	vext.8		q3, q2, q2, #1		//src[0:6 *]
+	vext.8		q8, q2, q2, #2		//src[1:6 * *]
+
+	vtrn.32	q3, q8					//q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4]
+	vtrn.32	d6, d7					//d6:[0:3]; d7[1:4]
+	vtrn.32		d0, d2				//d0:[-2:1]; d2[2:5]
+	vtrn.32		d4, d5				//d4:[-1:2]; d5[3:6]
+
+	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d0, d4, d6, d7, d2, d5, d1, q14, q15
+
+	vmov		r4, r5, d1
+	str	r4, [r2], r3
+	str	r5, [r2], r3
+
+	sub		r6, #2
+	cmp		r6, #0
+	bne		w4_xy_10_mc_luma_loop
+
+	pop		{r4, r5, r6}
+WELS_ASM_FUNC_END
+
+
+// Horizontal quarter-pel '30' position, width 16. Mirror of the '10' family:
+// uses FILTER_6TAG_8BITS_AVERAGE_WITH_1 (defined earlier, not shown here;
+// per its name averages the half-pel result with src[1] — TODO confirm).
+WELS_ASM_FUNC_BEGIN McHorVer30WidthEq16_neon
+	push		{r4}
+	ldr			r4, [sp, #4]
+
+	sub			r0, #2
+	vmov.u16	q14, #0x0014				// 20
+	vshr.u16	q15, q14, #2				// 5
+
+w16_xy_30_mc_luma_loop:
+	vld1.u8	{d0,d1,d2}, [r0], r1	//only use 21(16+5); q0=src[-2]
+	pld			[r0]
+	pld			[r0, #16]
+
+	vext.8		q2, q0, q1, #1		//q2=src[-1]
+	vext.8		q3, q0, q1, #2		//q3=src[0]
+	vext.8		q8, q0, q1, #3		//q8=src[1]
+	vext.8		q9, q0, q1, #4		//q9=src[2]
+	vext.8		q10, q0, q1, #5		//q10=src[3]
+
+	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d0, d4, d6, d16, d18, d20, d2, q14, q15
+
+	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d1, d5, d7, d17, d19, d21, d3, q14, q15
+
+	sub		r4, #1
+	vst1.u8	{d2, d3}, [r2], r3		//write 16Byte
+
+	cmp		r4, #0
+	bne		w16_xy_30_mc_luma_loop
+	pop		{r4}
+WELS_ASM_FUNC_END
+
+
+// Quarter-pel '30' position, width 8.
+WELS_ASM_FUNC_BEGIN McHorVer30WidthEq8_neon
+	push		{r4}
+	ldr			r4, [sp, #4]
+
+	sub			r0, #2
+	vmov.u16	q14, #0x0014				// 20
+	vshr.u16	q15, q14, #2				// 5
+
+w8_xy_30_mc_luma_loop:
+	vld1.u8	{d0,d1}, [r0], r1	//only use 13(8+5); q0=src[-2]
+	pld			[r0]
+
+	vext.8		d2, d0, d1, #1		//d2=src[-1]
+	vext.8		d3, d0, d1, #2		//d3=src[0]
+	vext.8		d4, d0, d1, #3		//d4=src[1]
+	vext.8		d5, d0, d1, #4		//d5=src[2]
+	vext.8		d6, d0, d1, #5		//d6=src[3]
+
+	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d0, d2, d3, d4, d5, d6, d1, q14, q15
+
+	sub		r4, #1
+	vst1.u8	{d1}, [r2], r3
+
+	cmp		r4, #0
+	bne		w8_xy_30_mc_luma_loop
+	pop		{r4}
+WELS_ASM_FUNC_END
+
+
+// Quarter-pel '30' position, width 4; two rows per iteration.
+WELS_ASM_FUNC_BEGIN McHorVer30WidthEq4_neon
+	push		{r4, r5, r6}
+	ldr			r6, [sp, #12]
+
+	sub			r0, #2
+	vmov.u16	q14, #0x0014				// 20
+	vshr.u16	q15, q14, #2				// 5
+
+w4_xy_30_mc_luma_loop:
+	vld1.u8	{d0, d1}, [r0], r1	//only use 9(4+5);d0: 1st row src[-2:5]
+	pld			[r0]
+	vld1.u8	{d2, d3}, [r0], r1	//d2: 2nd row src[-2:5]
+	pld			[r0]
+
+	vext.8		d4, d0, d1, #1		//d4: 1st row src[-1:6]
+	vext.8		d5, d2, d3, #1		//d5: 2nd row src[-1:6]
+	vext.8		q3, q2, q2, #1		//src[0:6 *]
+	vext.8		q8, q2, q2, #2		//src[1:6 * *]
+
+	vtrn.32	q3, q8					//q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4]
+	vtrn.32	d6, d7					//d6:[0:3]; d7[1:4]
+	vtrn.32		d0, d2				//d0:[-2:1]; d2[2:5]
+	vtrn.32		d4, d5				//d4:[-1:2]; d5[3:6]
+
+	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d0, d4, d6, d7, d2, d5, d1, q14, q15
+
+	vmov		r4, r5, d1
+	str	r4, [r2], r3
+	str	r5, [r2], r3
+
+	sub		r6, #2
+	cmp		r6, #0
+	bne		w4_xy_30_mc_luma_loop
+
+	pop		{r4, r5, r6}
+WELS_ASM_FUNC_END
+
+
+// Vertical quarter-pel '01' position, width 16. Primes a 5-row sliding window
+// (q0..q3,q8) from src[-2..2], then the software-pipelined loop produces 8
+// output rows per iteration, rotating new rows into the window as it goes.
+// Uses FILTER_6TAG_8BITS_AVERAGE_WITH_0 (averages with the src[0] row —
+// TODO confirm against the macro, defined earlier in the file).
+WELS_ASM_FUNC_BEGIN McHorVer01WidthEq16_neon
+	push		{r4}
+	ldr			r4, [sp, #4]
+
+	sub			r0, r0, r1, lsl #1		//src[-2*src_stride]
+	pld			[r0]
+	pld			[r0, r1]
+	vmov.u16	q14, #0x0014			// 20
+	vld1.u8	{q0}, [r0], r1		//q0=src[-2]
+	vld1.u8	{q1}, [r0], r1		//q1=src[-1]
+
+	pld			[r0]
+	pld			[r0, r1]
+	vshr.u16	q15, q14, #2			// 5
+	vld1.u8	{q2}, [r0], r1		//q2=src[0]
+	vld1.u8	{q3}, [r0], r1		//q3=src[1]
+	vld1.u8	{q8}, [r0], r1		//q8=src[2]
+
+w16_xy_01_luma_loop:
+
+	vld1.u8	{q9}, [r0], r1		//q9=src[3]
+
+	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d0, d2, d4, d6, d16, d18, d20, q14, q15
+	pld			[r0]
+	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d1, d3, d5, d7, d17, d19, d21, q14, q15
+	vld1.u8	{q0}, [r0], r1		//read 2nd row
+	vst1.u8	{q10}, [r2], r3			//write 1st 16Byte
+
+	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d2, d4, d6, d16, d18, d0, d20, q14, q15
+	pld			[r0]
+	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d3, d5, d7, d17, d19, d1, d21, q14, q15
+	vld1.u8	{q1}, [r0], r1		//read 3rd row
+	vst1.u8	{q10}, [r2], r3			//write 2nd 16Byte
+
+	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d4, d6, d16, d18, d0, d2, d20, q14, q15
+	pld			[r0]
+	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d5, d7, d17, d19, d1, d3, d21, q14, q15
+	vld1.u8	{q2}, [r0], r1		//read 4th row
+	vst1.u8	{q10}, [r2], r3			//write 3rd 16Byte
+
+	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d6, d16, d18, d0, d2, d4, d20, q14, q15
+	pld			[r0]
+	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d7, d17, d19, d1, d3, d5, d21, q14, q15
+	vld1.u8	{q3}, [r0], r1		//read 5th row
+	vst1.u8	{q10}, [r2], r3			//write 4th 16Byte
+
+	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d16, d18, d0, d2, d4, d6, d20, q14, q15
+	pld			[r0]
+	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d17, d19, d1, d3, d5, d7, d21, q14, q15
+	vld1.u8	{q8}, [r0], r1		//read 6th row
+	vst1.u8	{q10}, [r2], r3			//write 5th 16Byte
+
+	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d18, d0, d2, d4, d6, d16, d20, q14, q15
+	pld			[r0]
+	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d19, d1, d3, d5, d7, d17, d21, q14, q15
+	vld1.u8	{q9}, [r0], r1		//read 7th row
+	vst1.u8	{q10}, [r2], r3			//write 6th 16Byte
+
+	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d0, d2, d4, d6, d16, d18, d20, q14, q15
+	pld			[r0]
+	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d1, d3, d5, d7, d17, d19, d21, q14, q15
+	vld1.u8	{q0}, [r0], r1		//read 8th row
+	vst1.u8	{q10}, [r2], r3			//write 7th 16Byte
+
+	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d2, d4, d6, d16, d18, d0, d20, q14, q15
+	pld			[r0]
+	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d3, d5, d7, d17, d19, d1, d21, q14, q15
+	vst1.u8	{q10}, [r2], r3			//write 8th 16Byte
+
+	//q2, q3, q8, q9, q0 --> q0~q8 (rotate the 5-row window for next iteration)
+	vswp	q0, q8
+	vswp	q0, q2
+	vmov	q1, q3
+	vmov	q3, q9						//q0~q4
+
+	sub		r4, #8
+	cmp		r4, #0
+	bne		w16_xy_01_luma_loop
+	pop		{r4}
+WELS_ASM_FUNC_END
+
+
+// Vertical quarter-pel '01' position, width 8; 4 output rows per iteration
+// with a d0..d5 sliding window.
+WELS_ASM_FUNC_BEGIN McHorVer01WidthEq8_neon
+	push		{r4}
+	ldr			r4, [sp, #4]
+
+	sub			r0, r0, r1, lsl #1		//src[-2*src_stride]
+	pld			[r0]
+	pld			[r0, r1]
+	vmov.u16	q14, #0x0014			// 20
+	vld1.u8	{d0}, [r0], r1		//d0=src[-2]
+	vld1.u8	{d1}, [r0], r1		//d1=src[-1]
+
+	pld			[r0]
+	pld			[r0, r1]
+	vshr.u16	q15, q14, #2			// 5
+	vld1.u8	{d2}, [r0], r1		//d2=src[0]
+	vld1.u8	{d3}, [r0], r1		//d3=src[1]
+
+	vld1.u8	{d4}, [r0], r1		//d4=src[2]
+	vld1.u8	{d5}, [r0], r1		//d5=src[3]
+
+w8_xy_01_mc_luma_loop:
+
+	pld			[r0]
+	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d0, d1, d2, d3, d4, d5, d16, q14, q15
+	vld1.u8	{d0}, [r0], r1		//read 2nd row
+	vst1.u8	{d16}, [r2], r3		//write 1st 8Byte
+
+	pld			[r0]
+	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d1, d2, d3, d4, d5, d0, d16, q14, q15
+	vld1.u8	{d1}, [r0], r1		//read 3rd row
+	vst1.u8	{d16}, [r2], r3		//write 2nd 8Byte
+
+	pld			[r0]
+	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d2, d3, d4, d5, d0, d1, d16, q14, q15
+	vld1.u8	{d2}, [r0], r1		//read 4th row
+	vst1.u8	{d16}, [r2], r3		//write 3rd 8Byte
+
+	pld			[r0]
+	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d3, d4, d5, d0, d1, d2, d16, q14, q15
+	vld1.u8	{d3}, [r0], r1		//read 5th row
+	vst1.u8	{d16}, [r2], r3		//write 4th 8Byte
+
+	//d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
+	vswp	q0, q2
+	vswp	q1, q2
+
+	sub		r4, #4
+	cmp		r4, #0
+	bne		w8_xy_01_mc_luma_loop
+
+	pop		{r4}
+WELS_ASM_FUNC_END
+
+
+// Vertical quarter-pel '01' position, width 4; rows are loaded 4 bytes at a
+// time via scalar ldr and packed in pairs into d registers, so each
+// FILTER_6TAG call produces two 4-byte output rows. r7 is repurposed as the
+// row counter after the window is primed.
+WELS_ASM_FUNC_BEGIN McHorVer01WidthEq4_neon
+	push		{r4, r5, r6, r7}
+	sub			r0, r0, r1, lsl #1		//src[-2*src_stride]
+	pld			[r0]
+	pld			[r0, r1]
+	vmov.u16	q14, #0x0014			// 20
+	ldr		r4, [r0], r1		//r4=src[-2]
+	ldr		r5, [r0], r1		//r5=src[-1]
+
+	pld			[r0]
+	pld			[r0, r1]
+	vshr.u16	q15, q14, #2			// 5
+	ldr		r6, [r0], r1		//r6=src[0]
+	ldr		r7, [r0], r1		//r7=src[1]
+
+	vmov		d0, r4, r5
+	vmov		d1, r5, r6
+	vmov		d2, r6, r7
+
+	ldr		r4, [r0], r1		//r4=src[2]
+	vmov		d3, r7, r4
+	ldr			r7, [sp, #16]
+
+w4_xy_01_mc_luma_loop:
+
+//	pld			[r0]
+	//using reserving r4
+	ldr		r5, [r0], r1		//r5=src[3]
+	ldr		r6, [r0], r1		//r6=src[0]
+	vmov		d4, r4, r5
+	vmov		d5, r5, r6			//reserved r6
+
+	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d0, d1, d2, d3, d4, d5, d16, q14, q15
+	vmov		r4, r5, d16
+	str	r4, [r2], r3			//write 1st 4Byte
+	str	r5, [r2], r3			//write 2nd 4Byte
+
+	ldr		r5, [r0], r1		//r5=src[1]
+	ldr		r4, [r0], r1		//r4=src[2]
+	vmov		d0, r6, r5
+	vmov		d1, r5, r4			//reserved r4
+
+	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d2, d3, d4, d5, d0, d1, d16, q14, q15
+	vmov		r5, r6, d16
+	str	r5, [r2], r3			//write 3rd 4Byte
+	str	r6, [r2], r3			//write 4th 4Byte
+
+	//d4, d5, d0, d1 --> d0, d1, d2, d3
+	vmov	q1, q0
+	vmov	q0, q2
+
+	sub		r7, #4
+	cmp		r7, #0
+	bne		w4_xy_01_mc_luma_loop
+
+	pop		{r4, r5, r6, r7}
+WELS_ASM_FUNC_END
+
+
+// Vertical quarter-pel '03' position, width 16. Structurally identical to
+// McHorVer01WidthEq16_neon but averages with the other neighbouring row via
+// FILTER_6TAG_8BITS_AVERAGE_WITH_1 (defined earlier, not shown — TODO confirm).
+WELS_ASM_FUNC_BEGIN McHorVer03WidthEq16_neon
+	push		{r4}
+	ldr			r4, [sp, #4]
+
+	sub			r0, r0, r1, lsl #1		//src[-2*src_stride]
+	pld			[r0]
+	pld			[r0, r1]
+	vmov.u16	q14, #0x0014			// 20
+	vld1.u8	{q0}, [r0], r1		//q0=src[-2]
+	vld1.u8	{q1}, [r0], r1		//q1=src[-1]
+
+	pld			[r0]
+	pld			[r0, r1]
+	vshr.u16	q15, q14, #2			// 5
+	vld1.u8	{q2}, [r0], r1		//q2=src[0]
+	vld1.u8	{q3}, [r0], r1		//q3=src[1]
+	vld1.u8	{q8}, [r0], r1		//q8=src[2]
+
+w16_xy_03_luma_loop:
+
+	vld1.u8	{q9}, [r0], r1		//q9=src[3]
+
+	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d0, d2, d4, d6, d16, d18, d20, q14, q15
+	pld			[r0]
+	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d1, d3, d5, d7, d17, d19, d21, q14, q15
+	vld1.u8	{q0}, [r0], r1		//read 2nd row
+	vst1.u8	{q10}, [r2], r3			//write 1st 16Byte
+
+	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d2, d4, d6, d16, d18, d0, d20, q14, q15
+	pld			[r0]
+	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d3, d5, d7, d17, d19, d1, d21, q14, q15
+	vld1.u8	{q1}, [r0], r1		//read 3rd row
+	vst1.u8	{q10}, [r2], r3			//write 2nd 16Byte
+
+	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d4, d6, d16, d18, d0, d2, d20, q14, q15
+	pld			[r0]
+	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d5, d7, d17, d19, d1, d3, d21, q14, q15
+	vld1.u8	{q2}, [r0], r1		//read 4th row
+	vst1.u8	{q10}, [r2], r3			//write 3rd 16Byte
+
+	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d6, d16, d18, d0, d2, d4, d20, q14, q15
+	pld			[r0]
+	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d7, d17, d19, d1, d3, d5, d21, q14, q15
+	vld1.u8	{q3}, [r0], r1		//read 5th row
+	vst1.u8	{q10}, [r2], r3			//write 4th 16Byte
+
+	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d16, d18, d0, d2, d4, d6, d20, q14, q15
+	pld			[r0]
+	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d17, d19, d1, d3, d5, d7, d21, q14, q15
+	vld1.u8	{q8}, [r0], r1		//read 6th row
+	vst1.u8	{q10}, [r2], r3			//write 5th 16Byte
+
+	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d18, d0, d2, d4, d6, d16, d20, q14, q15
+	pld			[r0]
+	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d19, d1, d3, d5, d7, d17, d21, q14, q15
+	vld1.u8	{q9}, [r0], r1		//read 7th row
+	vst1.u8	{q10}, [r2], r3			//write 6th 16Byte
+
+	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d0, d2, d4, d6, d16, d18, d20, q14, q15
+	pld			[r0]
+	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d1, d3, d5, d7, d17, d19, d21, q14, q15
+	vld1.u8	{q0}, [r0], r1		//read 8th row
+	vst1.u8	{q10}, [r2], r3			//write 7th 16Byte
+
+	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d2, d4, d6, d16, d18, d0, d20, q14, q15
+	pld			[r0]
+	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d3, d5, d7, d17, d19, d1, d21, q14, q15
+	vst1.u8	{q10}, [r2], r3			//write 8th 16Byte
+
+	//q2, q3, q8, q9, q0 --> q0~q8
+	vswp	q0, q8
+	vswp	q0, q2
+	vmov	q1, q3
+	vmov	q3, q9						//q0~q8
+
+	sub		r4, #8
+	cmp		r4, #0
+	bne		w16_xy_03_luma_loop
+	pop		{r4}
+WELS_ASM_FUNC_END
+
+
+// Vertical quarter-pel '03' position, width 8; 4 rows per iteration.
+WELS_ASM_FUNC_BEGIN McHorVer03WidthEq8_neon
+	push		{r4}
+	ldr			r4, [sp, #4]
+
+	sub			r0, r0, r1, lsl #1		//src[-2*src_stride]
+	pld			[r0]
+	pld			[r0, r1]
+	vmov.u16	q14, #0x0014			// 20
+	vld1.u8	{d0}, [r0], r1		//d0=src[-2]
+	vld1.u8	{d1}, [r0], r1		//d1=src[-1]
+
+	pld			[r0]
+	pld			[r0, r1]
+	vshr.u16	q15, q14, #2			// 5
+	vld1.u8	{d2}, [r0], r1		//d2=src[0]
+	vld1.u8	{d3}, [r0], r1		//d3=src[1]
+
+	vld1.u8	{d4}, [r0], r1		//d4=src[2]
+	vld1.u8	{d5}, [r0], r1		//d5=src[3]
+
+w8_xy_03_mc_luma_loop:
+
+	pld			[r0]
+	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d0, d1, d2, d3, d4, d5, d16, q14, q15
+	vld1.u8	{d0}, [r0], r1		//read 2nd row
+	vst1.u8	{d16}, [r2], r3		//write 1st 8Byte
+
+	pld			[r0]
+	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d1, d2, d3, d4, d5, d0, d16, q14, q15
+	vld1.u8	{d1}, [r0], r1		//read 3rd row
+	vst1.u8	{d16}, [r2], r3		//write 2nd 8Byte
+
+	pld			[r0]
+	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d2, d3, d4, d5, d0, d1, d16, q14, q15
+	vld1.u8	{d2}, [r0], r1		//read 4th row
+	vst1.u8	{d16}, [r2], r3		//write 3rd 8Byte
+
+	pld			[r0]
+	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d3, d4, d5, d0, d1, d2, d16, q14, q15
+	vld1.u8	{d3}, [r0], r1		//read 5th row
+	vst1.u8	{d16}, [r2], r3		//write 4th 8Byte
+
+	//d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
+	vswp	q0, q2
+	vswp	q1, q2
+
+	sub		r4, #4
+	cmp		r4, #0
+	bne		w8_xy_03_mc_luma_loop
+
+	pop		{r4}
+WELS_ASM_FUNC_END
+
+
+// Vertical quarter-pel '03' position, width 4; scalar 4-byte row loads packed
+// in pairs, two output rows per FILTER_6TAG call (see McHorVer01WidthEq4_neon).
+WELS_ASM_FUNC_BEGIN McHorVer03WidthEq4_neon
+	push		{r4, r5, r6, r7}
+	sub			r0, r0, r1, lsl #1		//src[-2*src_stride]
+	pld			[r0]
+	pld			[r0, r1]
+	vmov.u16	q14, #0x0014			// 20
+	ldr		r4, [r0], r1		//r4=src[-2]
+	ldr		r5, [r0], r1		//r5=src[-1]
+
+	pld			[r0]
+	pld			[r0, r1]
+	vshr.u16	q15, q14, #2			// 5
+	ldr		r6, [r0], r1		//r6=src[0]
+	ldr		r7, [r0], r1		//r7=src[1]
+
+	vmov		d0, r4, r5
+	vmov		d1, r5, r6
+	vmov		d2, r6, r7
+
+	ldr		r4, [r0], r1		//r4=src[2]
+	vmov		d3, r7, r4
+	ldr			r7, [sp, #16]
+
+w4_xy_03_mc_luma_loop:
+
+//	pld			[r0]
+	//using reserving r4
+	ldr		r5, [r0], r1		//r5=src[3]
+	ldr		r6, [r0], r1		//r6=src[0]
+	vmov		d4, r4, r5
+	vmov		d5, r5, r6			//reserved r6
+
+	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d0, d1, d2, d3, d4, d5, d16, q14, q15
+	vmov		r4, r5, d16
+	str	r4, [r2], r3			//write 1st 4Byte
+	str	r5, [r2], r3			//write 2nd 4Byte
+
+	ldr		r5, [r0], r1		//r5=src[1]
+	ldr		r4, [r0], r1		//r4=src[2]
+	vmov		d0, r6, r5
+	vmov		d1, r5, r4			//reserved r4
+
+	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d2, d3, d4, d5, d0, d1, d16, q14, q15
+	vmov		r5, r6, d16
+	str	r5, [r2], r3			//write 3rd 4Byte
+	str	r6, [r2], r3			//write 4th 4Byte
+
+	//d4, d5, d0, d1 --> d0, d1, d2, d3
+	vmov	q1, q0
+	vmov	q0, q2
+
+	sub		r7, #4
+	cmp		r7, #0
+	bne		w4_xy_03_mc_luma_loop
+
+	pop		{r4, r5, r6, r7}
+WELS_ASM_FUNC_END
+
+
+// Vertical half-pel '02' position, width 16: plain 6-tap FILTER_6TAG_8BITS
+// (no neighbour averaging). Same 5-row window / 8-rows-per-iteration pipeline
+// as the '01'/'03' variants.
+WELS_ASM_FUNC_BEGIN McHorVer02WidthEq16_neon
+	push		{r4}
+	ldr			r4, [sp, #4]
+
+	sub			r0, r0, r1, lsl #1		//src[-2*src_stride]
+	pld			[r0]
+	pld			[r0, r1]
+	vmov.u16	q14, #0x0014			// 20
+	vld1.u8	{q0}, [r0], r1		//q0=src[-2]
+	vld1.u8	{q1}, [r0], r1		//q1=src[-1]
+
+	pld			[r0]
+	pld			[r0, r1]
+	vshr.u16	q15, q14, #2			// 5
+	vld1.u8	{q2}, [r0], r1		//q2=src[0]
+	vld1.u8	{q3}, [r0], r1		//q3=src[1]
+	vld1.u8	{q8}, [r0], r1		//q8=src[2]
+
+w16_v_mc_luma_loop:
+
+	vld1.u8	{q9}, [r0], r1		//q9=src[3]
+
+	FILTER_6TAG_8BITS 	d0, d2, d4, d6, d16, d18, d20, q14, q15
+	pld			[r0]
+	FILTER_6TAG_8BITS 	d1, d3, d5, d7, d17, d19, d21, q14, q15
+	vld1.u8	{q0}, [r0], r1		//read 2nd row
+	vst1.u8	{q10}, [r2], r3			//write 1st 16Byte
+
+	FILTER_6TAG_8BITS 	d2, d4, d6, d16, d18, d0, d20, q14, q15
+	pld			[r0]
+	FILTER_6TAG_8BITS 	d3, d5, d7, d17, d19, d1, d21, q14, q15
+	vld1.u8	{q1}, [r0], r1		//read 3rd row
+	vst1.u8	{q10}, [r2], r3			//write 2nd 16Byte
+
+	FILTER_6TAG_8BITS 	d4, d6, d16, d18, d0, d2, d20, q14, q15
+	pld			[r0]
+	FILTER_6TAG_8BITS 	d5, d7, d17, d19, d1, d3, d21, q14, q15
+	vld1.u8	{q2}, [r0], r1		//read 4th row
+	vst1.u8	{q10}, [r2], r3			//write 3rd 16Byte
+
+	FILTER_6TAG_8BITS 	d6, d16, d18, d0, d2, d4, d20, q14, q15
+	pld			[r0]
+	FILTER_6TAG_8BITS 	d7, d17, d19, d1, d3, d5, d21, q14, q15
+	vld1.u8	{q3}, [r0], r1		//read 5th row
+	vst1.u8	{q10}, [r2], r3			//write 4th 16Byte
+
+	FILTER_6TAG_8BITS 	d16, d18, d0, d2, d4, d6, d20, q14, q15
+	pld			[r0]
+	FILTER_6TAG_8BITS 	d17, d19, d1, d3, d5, d7, d21, q14, q15
+	vld1.u8	{q8}, [r0], r1		//read 6th row
+	vst1.u8	{q10}, [r2], r3			//write 5th 16Byte
+
+	FILTER_6TAG_8BITS 	d18, d0, d2, d4, d6, d16, d20, q14, q15
+	pld			[r0]
+	FILTER_6TAG_8BITS 	d19, d1, d3, d5, d7, d17, d21, q14, q15
+	vld1.u8	{q9}, [r0], r1		//read 7th row
+	vst1.u8	{q10}, [r2], r3			//write 6th 16Byte
+
+	FILTER_6TAG_8BITS 	d0, d2, d4, d6, d16, d18, d20, q14, q15
+	pld			[r0]
+	FILTER_6TAG_8BITS 	d1, d3, d5, d7, d17, d19, d21, q14, q15
+	vld1.u8	{q0}, [r0], r1		//read 8th row
+	vst1.u8	{q10}, [r2], r3			//write 7th 16Byte
+
+	FILTER_6TAG_8BITS 	d2, d4, d6, d16, d18, d0, d20, q14, q15
+	pld			[r0]
+	FILTER_6TAG_8BITS 	d3, d5, d7, d17, d19, d1, d21, q14, q15
+	vst1.u8	{q10}, [r2], r3			//write 8th 16Byte
+
+	//q2, q3, q8, q9, q0 --> q0~q8
+	vswp	q0, q8
+	vswp	q0, q2
+	vmov	q1, q3
+	vmov	q3, q9						//q0~q8
+
+	sub		r4, #8
+	cmp		r4, #0
+	bne		w16_v_mc_luma_loop
+	pop		{r4}
+WELS_ASM_FUNC_END
+
+
+// Vertical half-pel '02' position, width 8; 4 rows per iteration.
+WELS_ASM_FUNC_BEGIN McHorVer02WidthEq8_neon
+	push		{r4}
+	ldr			r4, [sp, #4]
+
+	sub			r0, r0, r1, lsl #1		//src[-2*src_stride]
+	pld			[r0]
+	pld			[r0, r1]
+	vmov.u16	q14, #0x0014			// 20
+	vld1.u8	{d0}, [r0], r1		//d0=src[-2]
+	vld1.u8	{d1}, [r0], r1		//d1=src[-1]
+
+	pld			[r0]
+	pld			[r0, r1]
+	vshr.u16	q15, q14, #2			// 5
+	vld1.u8	{d2}, [r0], r1		//d2=src[0]
+	vld1.u8	{d3}, [r0], r1		//d3=src[1]
+
+	vld1.u8	{d4}, [r0], r1		//d4=src[2]
+	vld1.u8	{d5}, [r0], r1		//d5=src[3]
+
+w8_v_mc_luma_loop:
+
+	pld			[r0]
+	FILTER_6TAG_8BITS 	d0, d1, d2, d3, d4, d5, d16, q14, q15
+	vld1.u8	{d0}, [r0], r1		//read 2nd row
+	vst1.u8	{d16}, [r2], r3		//write 1st 8Byte
+
+	pld			[r0]
+	FILTER_6TAG_8BITS 	d1, d2, d3, d4, d5, d0, d16, q14, q15
+	vld1.u8	{d1}, [r0], r1		//read 3rd row
+	vst1.u8	{d16}, [r2], r3		//write 2nd 8Byte
+
+	pld			[r0]
+	FILTER_6TAG_8BITS 	d2, d3, d4, d5, d0, d1, d16, q14, q15
+	vld1.u8	{d2}, [r0], r1		//read 4th row
+	vst1.u8	{d16}, [r2], r3		//write 3rd 8Byte
+
+	pld			[r0]
+	FILTER_6TAG_8BITS 	d3, d4, d5, d0, d1, d2, d16, q14, q15
+	vld1.u8	{d3}, [r0], r1		//read 5th row
+	vst1.u8	{d16}, [r2], r3		//write 4th 8Byte
+
+	//d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
+	vswp	q0, q2
+	vswp	q1, q2
+
+	sub		r4, #4
+	cmp		r4, #0
+	bne		w8_v_mc_luma_loop
+
+	pop		{r4}
+WELS_ASM_FUNC_END
+
+
+// Vertical half-pel '02' position, width 4; scalar 4-byte row loads packed in
+// pairs, two output rows per FILTER_6TAG call.
+WELS_ASM_FUNC_BEGIN McHorVer02WidthEq4_neon
+	push		{r4, r5, r6, r7}
+	sub			r0, r0, r1, lsl #1		//src[-2*src_stride]
+	pld			[r0]
+	pld			[r0, r1]
+	vmov.u16	q14, #0x0014			// 20
+	ldr		r4, [r0], r1		//r4=src[-2]
+	ldr		r5, [r0], r1		//r5=src[-1]
+
+	pld			[r0]
+	pld			[r0, r1]
+	vshr.u16	q15, q14, #2			// 5
+	ldr		r6, [r0], r1		//r6=src[0]
+	ldr		r7, [r0], r1		//r7=src[1]
+
+	vmov		d0, r4, r5
+	vmov		d1, r5, r6
+	vmov		d2, r6, r7
+
+	ldr		r4, [r0], r1		//r4=src[2]
+	vmov		d3, r7, r4
+	ldr			r7, [sp, #16]
+
+w4_v_mc_luma_loop:
+
+//	pld			[r0]
+	//using reserving r4
+	ldr		r5, [r0], r1		//r5=src[3]
+	ldr		r6, [r0], r1		//r6=src[0]
+	vmov		d4, r4, r5
+	vmov		d5, r5, r6			//reserved r6
+
+	FILTER_6TAG_8BITS 	d0, d1, d2, d3, d4, d5, d16, q14, q15
+	vmov		r4, r5, d16
+	str	r4, [r2], r3			//write 1st 4Byte
+	str	r5, [r2], r3			//write 2nd 4Byte
+
+	ldr		r5, [r0], r1		//r5=src[1]
+	ldr		r4, [r0], r1		//r4=src[2]
+	vmov		d0, r6, r5
+	vmov		d1, r5, r4			//reserved r4
+
+	FILTER_6TAG_8BITS 	d2, d3, d4, d5, d0, d1, d16, q14, q15
+	vmov		r5, r6, d16
+	str	r5, [r2], r3			//write 3rd 4Byte
+	str	r6, [r2], r3			//write 4th 4Byte
+
+	//d4, d5, d0, d1 --> d0, d1, d2, d3
+	vmov	q1, q0
+	vmov	q0, q2
+
+	sub		r7, #4
+	cmp		r7, #0
+	bne		w4_v_mc_luma_loop
+
+	pop		{r4, r5, r6, r7}
+WELS_ASM_FUNC_END
+
+
+// Centre half-pel '22' position, width 16: vertical 6-tap into 16-bit
+// intermediates (FILTER_6TAG_8BITS_TO_16BITS), then horizontal 6-tap over
+// those (UNPACK_2_16BITS_TO_ABC + FILTER_3_IN_16BITS_TO_8BITS). Needs
+// callee-saved q4-q7, hence the vpush; 5th arg is read at [sp,#68]
+// (= 4 bytes for r4 + 64 bytes for q4-q7).
+WELS_ASM_FUNC_BEGIN McHorVer22WidthEq16_neon
+	push		{r4}
+	vpush		{q4-q7}
+	ldr			r4, [sp, #68]
+
+	sub			r0, #2					//src[-2]
+	sub			r0, r0, r1, lsl #1		//src[-2*src_stride-2]
+	pld			[r0]
+	pld			[r0, r1]
+
+	vmov.u16	q14, #0x0014			// 20
+	vld1.u8	{d0-d2}, [r0], r1		//use 21(16+5), =src[-2]
+	vld1.u8	{d3-d5}, [r0], r1		//use 21(16+5), =src[-1]
+
+	pld			[r0]
+	pld			[r0, r1]
+	vshr.u16	q15, q14, #2			// 5
+
+	vld1.u8	{d6-d8}, [r0], r1		//use 21(16+5), =src[0]
+	vld1.u8	{d9-d11}, [r0], r1	//use 21(16+5), =src[1]
+	pld			[r0]
+	pld			[r0, r1]
+	vld1.u8	{d12-d14}, [r0], r1	//use 21(16+5), =src[2]
+
+w16_hv_mc_luma_loop:
+
+	vld1.u8	{d15-d17}, [r0], r1	//use 21(16+5), =src[3]
+	//the 1st row
+	pld			[r0]
+	// vertical filtered into q9/q10
+	FILTER_6TAG_8BITS_TO_16BITS 	d0, d3, d6, d9, d12, d15, q9, q14, q15	// 8 avail
+	FILTER_6TAG_8BITS_TO_16BITS 	d1, d4, d7,d10, d13, d16,q10, q14, q15	// 8 avail
+	// horizon filtered
+	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
+	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0	//output to q0[0]
+
+	// vertical filtered into q10/q11
+	FILTER_6TAG_8BITS_TO_16BITS 	d2, d5, d8,d11, d14, d17,q11, q14, q15	// only 5 avail
+	// horizon filtered
+	UNPACK_2_16BITS_TO_ABC	q10, q11, q9, q12, q13
+	FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1	//output to q0[1]
+	vst1.u8	{q0}, [r2], r3		//write 16Byte
+
+
+	vld1.u8	{d0-d2}, [r0], r1		//read 2nd row
+	//the 2nd row
+	pld			[r0]
+	// vertical filtered into q9/q10
+	FILTER_6TAG_8BITS_TO_16BITS 	d3, d6, d9, d12, d15, d0, q9, q14, q15	// 8 avail
+	FILTER_6TAG_8BITS_TO_16BITS 	d4, d7,d10, d13, d16, d1,q10, q14, q15	// 8 avail
+	// horizon filtered
+	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
+	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d3	//output to d3
+
+	// vertical filtered into q10/q11
+	FILTER_6TAG_8BITS_TO_16BITS 	d5, d8,d11, d14, d17, d2,q11, q14, q15	// only 5 avail
+	// horizon filtered
+	UNPACK_2_16BITS_TO_ABC	q10, q11, q9, q12, q13
+	FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d4	//output to d4
+
+	vst1.u8	{d3, d4}, [r2], r3		//write 16Byte
+
+	vld1.u8	{d3-d5}, [r0], r1		//read 3rd row
+	//the 3rd row
+	pld			[r0]
+	// vertical filtered into q9/q10
+	FILTER_6TAG_8BITS_TO_16BITS 	d6, d9, d12, d15, d0, d3, q9, q14, q15	// 8 avail
+	FILTER_6TAG_8BITS_TO_16BITS 	d7,d10, d13, d16, d1, d4,q10, q14, q15	// 8 avail
+	// horizon filtered
+	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
+	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d6	//output to d6
+
+	// vertical filtered into q10/q11
+	FILTER_6TAG_8BITS_TO_16BITS 	d8,d11, d14, d17, d2, d5,q11, q14, q15	// only 5 avail
+	// horizon filtered
+	UNPACK_2_16BITS_TO_ABC	q10, q11, q9, q12, q13
+	FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d7	//output to d7
+	vst1.u8	{d6, d7}, [r2], r3		//write 16Byte
+
+	vld1.u8	{d6-d8}, [r0], r1		//read 4th row
+	//the 4th row
+	pld			[r0]
+	// vertical filtered into q9/q10
+	FILTER_6TAG_8BITS_TO_16BITS 	 d9, d12, d15, d0, d3, d6, q9, q14, q15	// 8 avail
+	FILTER_6TAG_8BITS_TO_16BITS		d10, d13, d16, d1, d4, d7,q10, q14, q15	// 8 avail
+	// horizon filtered
+	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
+	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d9	//output to d9
+	// vertical filtered into q10/q11
+	FILTER_6TAG_8BITS_TO_16BITS 	d11, d14, d17, d2, d5, d8,q11, q14, q15	// only 5 avail
+	// horizon filtered
+	UNPACK_2_16BITS_TO_ABC	q10, q11, q9, q12, q13
+	FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d10	//output to d10
+	vst1.u8	{d9, d10}, [r2], r3		//write 16Byte
+
+	//d12~d17(q6~q8), d0~d8(q0~q3+d8), --> d0~d14
+	vswp	q0, q6
+	vswp	q6, q3
+	vmov	q5, q2
+	vmov	q2, q8
+
+	vmov	d20,d8
+	vmov	q4, q1
+	vmov	q1, q7
+	vmov	d14,d20
+
+	sub		r4, #4
+	cmp		r4, #0
+	bne		w16_hv_mc_luma_loop
+	vpop		{q4-q7}
+	pop		{r4}
+WELS_ASM_FUNC_END
+
+
+// Centre half-pel '22' position, width 8: same vertical-then-horizontal
+// two-stage filter; only q4 needs saving, 5th arg at [sp,#20] (r4 + q4).
+WELS_ASM_FUNC_BEGIN McHorVer22WidthEq8_neon
+	push		{r4}
+	vpush		{q4}
+	ldr			r4, [sp, #20]
+
+	sub			r0, #2				//src[-2]
+	sub			r0, r0, r1, lsl #1	//src[-2*src_stride-2]
+	pld			[r0]
+	pld			[r0, r1]
+
+	vmov.u16	q14, #0x0014		// 20
+	vld1.u8	{q0}, [r0], r1	//use 13(8+5), =src[-2]
+	vld1.u8	{q1}, [r0], r1	//use 13(8+5), =src[-1]
+
+	pld			[r0]
+	pld			[r0, r1]
+	vshr.u16	q15, q14, #2		// 5
+
+	vld1.u8	{q2}, [r0], r1	//use 13(8+5), =src[0]
+	vld1.u8	{q3}, [r0], r1	//use 13(8+5), =src[1]
+	pld			[r0]
+	pld			[r0, r1]
+	vld1.u8	{q4}, [r0], r1	//use 13(8+5), =src[2]
+
+w8_hv_mc_luma_loop:
+
+	vld1.u8	{q8}, [r0], r1	//use 13(8+5), =src[3]
+	//the 1st row
+	pld			[r0]
+	// vertical filtered into q9/q10
+	FILTER_6TAG_8BITS_TO_16BITS 	d0, d2, d4, d6, d8, d16, q9, q14, q15	// 8 avail
+	FILTER_6TAG_8BITS_TO_16BITS 	d1, d3, d5, d7, d9, d17, q10, q14, q15	// 5 avail
+	// horizon filtered
+	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
+	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18	//output to q9[0]
+	vst1.u8	d18, [r2], r3			//write 8Byte
+
+	vld1.u8	{q0}, [r0], r1		//read 2nd row
+	//the 2nd row
+	pld			[r0]
+	// vertical filtered into q9/q10
+	FILTER_6TAG_8BITS_TO_16BITS 	d2, d4, d6, d8, d16, d0, q9, q14, q15	// 8 avail
+	FILTER_6TAG_8BITS_TO_16BITS 	d3, d5, d7, d9, d17, d1, q10, q14, q15	// 5 avail
+	// horizon filtered
+	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
+	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18	//output to q9[0]
+	vst1.u8	d18, [r2], r3		//write 8Byte
+
+	vld1.u8	{q1}, [r0], r1		//read 3rd row
+	//the 3rd row
+	pld			[r0]
+	// vertical filtered into q9/q10
+	FILTER_6TAG_8BITS_TO_16BITS 	d4, d6, d8, d16, d0, d2, q9, q14, q15	// 8 avail
+	FILTER_6TAG_8BITS_TO_16BITS 	d5, d7, d9, d17, d1, d3, q10, q14, q15	// 5 avail
+	// horizon filtered
+	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
+	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18	//output to q9[0]
+	vst1.u8	d18, [r2], r3			//write 8Byte
+
+	vld1.u8	{q2}, [r0], r1		//read 4th row
+	//the 4th row
+	pld			[r0]
+	// vertical filtered into q9/q10
+	FILTER_6TAG_8BITS_TO_16BITS 	d6, d8, d16, d0, d2, d4, q9, q14, q15	// 8 avail
+	FILTER_6TAG_8BITS_TO_16BITS 	d7, d9, d17, d1, d3, d5, q10, q14, q15	// 5 avail
+	// horizon filtered
+	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
+	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18	//output to q9[0]
+	vst1.u8	d18, [r2], r3			//write 8Byte
+
+	//q4~q5, q0~q2, --> q0~q4
+	vswp	q0, q4
+	vswp	q2, q4
+	vmov	q3, q1
+	vmov	q1, q8
+
+	sub		r4, #4
+	cmp		r4, #0
+	bne		w8_hv_mc_luma_loop
+	vpop		{q4}
+	pop		{r4}
+WELS_ASM_FUNC_END
+
+
+// Centre half-pel '22' position, width 4: two-stage filter on row pairs; q4-q7
+// saved, 5th arg at [sp,#76] (3 gp regs + 64 bytes of q regs).
+WELS_ASM_FUNC_BEGIN McHorVer22WidthEq4_neon
+	push		{r4 ,r5, r6}
+	vpush		{q4-q7}
+	ldr			r6, [sp, #76]
+
+	sub			r0, #2				//src[-2]
+	sub			r0, r0, r1, lsl #1	//src[-2*src_stride-2]
+	pld			[r0]
+	pld			[r0, r1]
+
+	vmov.u16	q14, #0x0014		// 20
+	vld1.u8	{q0}, [r0], r1	//use 9(4+5), =src[-2]
+	vld1.u8	{q1}, [r0], r1	//use 9(4+5), =src[-1]
+
+	pld			[r0]
+	pld			[r0, r1]
+	vshr.u16	q15, q14, #2		// 5
+
+	vld1.u8	{q2}, [r0], r1	//use 9(4+5), =src[0]
+	vld1.u8	{q3}, [r0], r1	//use 9(4+5), =src[1]
+	pld			[r0]
+	pld			[r0, r1]
+	vld1.u8	{q4}, [r0], r1	//use 9(4+5), =src[2]
+
+w4_hv_mc_luma_loop:
+
+	vld1.u8	{q5}, [r0], r1	//use 9(4+5), =src[3]
+	vld1.u8	{q6}, [r0], r1	//use 9(4+5), =src[4]
+
+	//the 1st&2nd row
+	pld			[r0]
+	pld			[r0, r1]
+	// vertical filtered
+	FILTER_6TAG_8BITS_TO_16BITS 	d0, d2, d4, d6, d8, d10, q7, q14, q15	// 8 avail
+	FILTER_6TAG_8BITS_TO_16BITS 	d1, d3, d5, d7, d9, d11, q8, q14, q15	// 1 avail
+
+	FILTER_6TAG_8BITS_TO_16BITS 	d2, d4, d6, d8,d10, d12, q9, q14, q15	// 8 avail
+	FILTER_6TAG_8BITS_TO_16BITS 	d3, d5, d7, d9,d11, d13,q10, q14, q15	// 1 avail
+	// horizon filtered
+	UNPACK_2_16BITS_TO_ABC	q7, q8, q11, q12, q13	//4 avail
+	UNPACK_2_16BITS_TO_ABC	q9,q10, q0, q7, q8		//4 avail
+
+	vmov	d23, d0
+	vmov	d25, d14
+	vmov	d27, d16
+
+	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d22	//output to q11[0]
+	vmov		r4 ,r5, d22
+	str		r4, [r2], r3				//write 4Byte
+	str		r5, [r2], r3				//write 4Byte
+
+	//the 3rd&4th row
+	vld1.u8	{q0}, [r0], r1	//use 9(4+5), =src[3]
+	vld1.u8	{q1}, [r0], r1	//use 9(4+5), =src[4]
+	pld			[r0]
+	pld			[r0, r1]
+	// vertical filtered
+	FILTER_6TAG_8BITS_TO_16BITS 	d4, d6, d8, d10, d12, d0, q7, q14, q15	// 8 avail
+	FILTER_6TAG_8BITS_TO_16BITS 	d5, d7, d9, d11, d13, d1, q8, q14, q15	// 1 avail
+
+	FILTER_6TAG_8BITS_TO_16BITS 	d6, d8,d10, d12, d0, d2, q9, q14, q15	// 8 avail
+	FILTER_6TAG_8BITS_TO_16BITS 	d7, d9,d11, d13, d1, d3,q10, q14, q15	// 1 avail
+	// horizon filtered
+	UNPACK_2_16BITS_TO_ABC	q7, q8, q11, q12, q13	//4 avail
+	UNPACK_2_16BITS_TO_ABC	q9,q10, q2, q7, q8		//4 avail
+
+	vmov	d23, d4
+	vmov	d25, d14
+	vmov	d27, d16
+
+	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d22	//output to q11[0]
+	vmov		r4 ,r5, d22
+	str		r4, [r2], r3				//write 4Byte
+	str		r5, [r2], r3				//write 4Byte
+
+	//q4~q6, q0~q1, --> q0~q4
+	vswp	q4, q0
+	vmov	q3, q4
+	vmov	q4, q1
+	vmov	q1, q5
+	vmov	q2, q6
+
+	sub		r6, #4
+	cmp		r6, #0
+	bne		w4_hv_mc_luma_loop
+
+	vpop		{q4-q7}
+	pop		{r4, r5, r6}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McCopyWidthEq16_neon
+	push		{r4}
+	ldr			r4, [sp, #4]
+w16_copy_loop:
+	vld1.u8		{q0}, [r0], r1
+	sub			r4, #2
+	vld1.u8		{q1}, [r0], r1
+	vst1.u8		{q0}, [r2], r3
+	cmp			r4, #0
+	vst1.u8		{q1}, [r2], r3
+	bne			w16_copy_loop
+
+	pop		{r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McCopyWidthEq8_neon
+	push		{r4}
+	ldr			r4, [sp, #4]
+w8_copy_loop:
+	vld1.u8		{d0}, [r0], r1
+	vld1.u8		{d1}, [r0], r1
+	vst1.u8		{d0}, [r2], r3
+	vst1.u8		{d1}, [r2], r3
+	sub			r4, #2
+	cmp			r4, #0
+	bne			w8_copy_loop
+
+	pop		{r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McCopyWidthEq4_neon
+	push		{r4, r5, r6}
+	ldr			r4, [sp, #12]
+w4_copy_loop:
+	ldr		r5, [r0], r1
+	ldr		r6, [r0], r1
+	str		r5, [r2], r3
+	str		r6, [r2], r3
+
+	sub			r4, #2
+	cmp			r4, #0
+	bne			w4_copy_loop
+
+	pop		{r4, r5, r6}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN PixelAvgWidthEq16_neon
+	push		{r4}
+	ldr			r4, [sp, #4]
+w16_pix_avg_loop:
+	vld1.u8		{q0}, [r2]!
+	vld1.u8		{q1}, [r3]!
+	vld1.u8		{q2}, [r2]!
+	vld1.u8		{q3}, [r3]!
+
+	vld1.u8		{q8}, [r2]!
+	vld1.u8		{q9}, [r3]!
+	vld1.u8		{q10}, [r2]!
+	vld1.u8		{q11}, [r3]!
+
+	AVERAGE_TWO_8BITS		d0, d0, d2
+	AVERAGE_TWO_8BITS		d1, d1, d3
+	vst1.u8		{q0}, [r0], r1
+
+	AVERAGE_TWO_8BITS		d4, d4, d6
+	AVERAGE_TWO_8BITS		d5, d5, d7
+	vst1.u8		{q2}, [r0], r1
+
+	AVERAGE_TWO_8BITS		d16, d16, d18
+	AVERAGE_TWO_8BITS		d17, d17, d19
+	vst1.u8		{q8}, [r0], r1
+
+	AVERAGE_TWO_8BITS		d20, d20, d22
+	AVERAGE_TWO_8BITS		d21, d21, d23
+	vst1.u8		{q10}, [r0], r1
+
+	sub			r4, #4
+	cmp			r4, #0
+	bne			w16_pix_avg_loop
+
+	pop		{r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN PixelAvgWidthEq8_neon
+	push		{r4, r5}
+	ldr			r4, [sp, #8]
+	mov			r5, #16
+w8_pix_avg_loop:
+
+	vld1.u8		{d0}, [r2], r5
+	vld1.u8		{d2}, [r3], r5
+	vld1.u8		{d1}, [r2], r5
+	vld1.u8		{d3}, [r3], r5
+
+	AVERAGE_TWO_8BITS		d0, d0, d2
+	AVERAGE_TWO_8BITS		d1, d1, d3
+	vst1.u8		{d0}, [r0], r1
+	vst1.u8		{d1}, [r0], r1
+
+	vld1.u8		{d4}, [r2], r5
+	vld1.u8		{d6}, [r3], r5
+	vld1.u8		{d5}, [r2], r5
+	vld1.u8		{d7}, [r3], r5
+
+	AVERAGE_TWO_8BITS		d4, d4, d6
+	AVERAGE_TWO_8BITS		d5, d5, d7
+	vst1.u8		{d4}, [r0], r1
+	vst1.u8		{d5}, [r0], r1
+
+	sub			r4, #4
+	cmp			r4, #0
+	bne			w8_pix_avg_loop
+
+	pop		{r4, r5}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN PixelAvgWidthEq4_neon
+	push		{r4-r8}
+	ldr			r4, [sp, #20]
+w4_pix_avg_loop:
+
+	ldr		r5, [r2]
+	ldr		r6, [r2, #16]
+	ldr		r7, [r3]
+	ldr		r8, [r3, #16]
+	add		r2, #32
+	add		r3, #32
+
+	vmov		d0, r5, r6
+	vmov		d1, r7, r8
+	AVERAGE_TWO_8BITS		d0, d0, d1
+	vmov		r5, r6, d0
+
+	str		r5, [r0], r1
+	str		r6, [r0], r1
+
+	sub			r4, #2
+	cmp			r4, #0
+	bne			w4_pix_avg_loop
+
+	pop		{r4-r8}
+WELS_ASM_FUNC_END
+
+WELS_ASM_FUNC_BEGIN McChromaWidthEq8_neon
+	push		{r4, r5}
+	ldr			r4, [sp, #8]
+	ldr			r5, [sp, #12]
+//	normal case: {cA*src[x]  + cB*src[x+1]} + {cC*src[x+stride] + cD*srcp[x+stride+1]}
+//	we can opti it by adding vert only/ hori only cases, to be continue
+	vld1.u8	{d31}, [r4]		//load A/B/C/D
+	vld1.u8		{q0}, [r0], r1	//src[x]
+
+	vdup.u8	d28, d31[0]			//A
+	vdup.u8	d29, d31[1]			//B
+	vdup.u8	d30, d31[2]			//C
+	vdup.u8	d31, d31[3]			//D
+
+	vext.u8		d1, d0, d1, #1		//src[x+1]
+
+w8_mc_chroma_loop:	// each two pxl row
+	vld1.u8		{q1}, [r0], r1	//src[x+stride]
+	vld1.u8		{q2}, [r0], r1	//src[x+2*stride]
+	vext.u8		d3, d2, d3, #1		//src[x+stride+1]
+	vext.u8		d5, d4, d5, #1		//src[x+2*stride+1]
+
+	vmull.u8		q3, d0, d28			//(src[x] * A)
+	vmlal.u8		q3, d1, d29			//+=(src[x+1] * B)
+	vmlal.u8		q3, d2, d30			//+=(src[x+stride] * C)
+	vmlal.u8		q3, d3, d31			//+=(src[x+stride+1] * D)
+	vrshrn.u16		d6, q3, #6
+	vst1.u8	d6, [r2], r3
+
+	vmull.u8		q3, d2, d28			//(src[x] * A)
+	vmlal.u8		q3, d3, d29			//+=(src[x+1] * B)
+	vmlal.u8		q3, d4, d30			//+=(src[x+stride] * C)
+	vmlal.u8		q3, d5, d31			//+=(src[x+stride+1] * D)
+	vrshrn.u16		d6, q3, #6
+	vst1.u8	d6, [r2], r3
+
+	vmov		q0, q2
+	sub			r5, #2
+	cmp			r5, #0
+	bne			w8_mc_chroma_loop
+
+	pop		{r4, r5}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McChromaWidthEq4_neon
+
+	push		{r4, r5, r6}
+	ldr			r4, [sp, #12]
+	ldr			r6, [sp, #16]
+//	normal case: {cA*src[x]  + cB*src[x+1]} + {cC*src[x+stride] + cD*srcp[x+stride+1]}
+//	we can opti it by adding vert only/ hori only cases, to be continue
+	vld1.u8	{d31}, [r4]		//load A/B/C/D
+
+	vdup.u8	d28, d31[0]			//A
+	vdup.u8	d29, d31[1]			//B
+	vdup.u8	d30, d31[2]			//C
+	vdup.u8	d31, d31[3]			//D
+
+w4_mc_chroma_loop:	// each two pxl row
+	vld1.u8		{d0}, [r0], r1	//a::src[x]
+	vld1.u8		{d2}, [r0], r1	//b::src[x+stride]
+	vld1.u8		{d4}, [r0]			//c::src[x+2*stride]
+
+	vshr.u64		d1, d0, #8
+	vshr.u64		d3, d2, #8
+	vshr.u64		d5, d4, #8
+
+	vmov			q3, q1				//b::[0:7]+b::[1~8]
+	vtrn.32		q0, q1				//d0{a::[0:3]+b::[0:3]}; d1{a::[1:4]+b::[1:4]}
+	vtrn.32		q3, q2				//d6{b::[0:3]+c::[0:3]}; d7{b::[1:4]+c::[1:4]}
+
+	vmull.u8		q1, d0, d28			//(src[x] * A)
+	vmlal.u8		q1, d1, d29			//+=(src[x+1] * B)
+	vmlal.u8		q1, d6, d30			//+=(src[x+stride] * C)
+	vmlal.u8		q1, d7, d31			//+=(src[x+stride+1] * D)
+
+	vrshrn.u16		d2, q1, #6
+	vmov		r4, r5, d2
+	str	r4, [r2], r3
+	str	r5, [r2], r3
+
+	sub			r6, #2
+	cmp			r6, #0
+	bne			w4_mc_chroma_loop
+
+	pop		{r4, r5, r6}
+WELS_ASM_FUNC_END
+
+WELS_ASM_FUNC_BEGIN McHorVer20Width17_neon
+	push		{r4-r5}
+	mov			r4, #20
+	mov			r5, #1
+	sub			r4, r4, r4, lsl #(16-2)
+	lsl			r5, #16
+	ror			r4, #16
+	vmov		d3, r5, r4					// 0x0014FFFB00010000
+
+	sub			r3, #16
+	ldr			r4, [sp, #8]
+
+	sub			r0, #2
+	vmov.u16	q14, #0x0014				// 20
+	vshr.u16	q15, q14, #2				// 5
+
+w17_h_mc_luma_loop:
+	vld1.u8	{d0,d1,d2}, [r0], r1	//only use 22(17+5); q0=src[-2]
+
+	vext.8		q2, q0, q1, #1		//q2=src[-1]
+	vext.8		q3, q0, q1, #2		//q3=src[0]
+	vext.8		q8, q0, q1, #3		//q8=src[1]
+	vext.8		q9, q0, q1, #4		//q9=src[2]
+	vext.8		q10, q0, q1, #5		//q10=src[3]
+
+	FILTER_6TAG_8BITS 	d0, d4, d6, d16, d18, d20, d22, q14, q15
+
+	FILTER_6TAG_8BITS 	d1, d5, d7, d17, d19, d21, d23, q14, q15
+
+	vst1.u8	{d22, d23}, [r2]!		//write [0:15] Byte
+
+	vsli.64	d2, d2, #8				// [0][1][2][3][4][5]XO-->O[0][1][2][3][4][5]X
+	FILTER_SINGLE_TAG_8BITS	d2, d3, d22, q11, q1
+
+	vst1.u8	{d2[0]}, [r2], r3		//write 16th Byte
+
+	sub		r4, #1
+	cmp		r4, #0
+	bne		w17_h_mc_luma_loop
+	pop		{r4-r5}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McHorVer20Width9_neon
+	push		{r4-r5}
+	mov			r4, #20
+	mov			r5, #1
+	sub			r4, r4, r4, lsl #(16-2)
+	lsl			r5, #16
+	ror			r4, #16
+	vmov		d7, r5, r4					// 0x0014FFFB00010000
+
+	sub			r3, #8
+	ldr			r4, [sp, #8]
+
+	sub			r0, #2
+	vmov.u16	q14, #0x0014				// 20
+	vshr.u16	q15, q14, #2				// 5
+
+w9_h_mc_luma_loop:
+	vld1.u8	{d0,d1}, [r0], r1	//only use 14(9+5); q0=src[-2]
+	pld			[r0]
+
+	vext.8		d2, d0, d1, #1		//d2=src[-1]
+	vext.8		d3, d0, d1, #2		//d3=src[0]
+	vext.8		d4, d0, d1, #3		//d4=src[1]
+	vext.8		d5, d0, d1, #4		//d5=src[2]
+	vext.8		d6, d0, d1, #5		//d6=src[3]
+
+	FILTER_6TAG_8BITS 	d0, d2, d3, d4, d5, d6, d16, q14, q15
+
+	sub		r4, #1
+	vst1.u8	{d16}, [r2]!		//write [0:7] Byte
+
+	vsli.64	d2, d1, #8				// [0][1][2][3][4][5]XO-->O[0][1][2][3][4][5]X
+	FILTER_SINGLE_TAG_8BITS	d2, d7, d18, q9, q1
+	vst1.u8	{d2[0]}, [r2], r3		//write 8th Byte
+
+	cmp		r4, #0
+	bne		w9_h_mc_luma_loop
+	pop		{r4-r5}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McHorVer02Height17_neon
+	push		{r4}
+	ldr			r4, [sp, #4]
+
+	sub			r0, r0, r1, lsl #1		//src[-2*src_stride]
+	pld			[r0]
+	pld			[r0, r1]
+	vmov.u16	q14, #0x0014			// 20
+	vld1.u8	{q0}, [r0], r1		//q0=src[-2]
+	vld1.u8	{q1}, [r0], r1		//q1=src[-1]
+
+	pld			[r0]
+	pld			[r0, r1]
+	vshr.u16	q15, q14, #2			// 5
+	vld1.u8	{q2}, [r0], r1		//q2=src[0]
+	vld1.u8	{q3}, [r0], r1		//q3=src[1]
+	vld1.u8	{q8}, [r0], r1		//q8=src[2]
+
+w17_v_mc_luma_loop:
+
+	vld1.u8	{q9}, [r0], r1		//q9=src[3]
+
+	FILTER_6TAG_8BITS 	d0, d2, d4, d6, d16, d18, d20, q14, q15
+	pld			[r0]
+	FILTER_6TAG_8BITS 	d1, d3, d5, d7, d17, d19, d21, q14, q15
+	vld1.u8	{q0}, [r0], r1		//read 2nd row
+	vst1.u8	{q10}, [r2], r3			//write 1st 16Byte
+
+	FILTER_6TAG_8BITS 	d2, d4, d6, d16, d18, d0, d20, q14, q15
+	pld			[r0]
+	FILTER_6TAG_8BITS 	d3, d5, d7, d17, d19, d1, d21, q14, q15
+	vld1.u8	{q1}, [r0], r1		//read 3rd row
+	vst1.u8	{q10}, [r2], r3			//write 2nd 16Byte
+
+	FILTER_6TAG_8BITS 	d4, d6, d16, d18, d0, d2, d20, q14, q15
+	pld			[r0]
+	FILTER_6TAG_8BITS 	d5, d7, d17, d19, d1, d3, d21, q14, q15
+	vld1.u8	{q2}, [r0], r1		//read 4th row
+	vst1.u8	{q10}, [r2], r3			//write 3rd 16Byte
+
+	FILTER_6TAG_8BITS 	d6, d16, d18, d0, d2, d4, d20, q14, q15
+	pld			[r0]
+	FILTER_6TAG_8BITS 	d7, d17, d19, d1, d3, d5, d21, q14, q15
+	vld1.u8	{q3}, [r0], r1		//read 5th row
+	vst1.u8	{q10}, [r2], r3			//write 4th 16Byte
+
+	FILTER_6TAG_8BITS 	d16, d18, d0, d2, d4, d6, d20, q14, q15
+	pld			[r0]
+	FILTER_6TAG_8BITS 	d17, d19, d1, d3, d5, d7, d21, q14, q15
+	vld1.u8	{q8}, [r0], r1		//read 6th row
+	vst1.u8	{q10}, [r2], r3			//write 5th 16Byte
+
+	FILTER_6TAG_8BITS 	d18, d0, d2, d4, d6, d16, d20, q14, q15
+	pld			[r0]
+	FILTER_6TAG_8BITS 	d19, d1, d3, d5, d7, d17, d21, q14, q15
+	vld1.u8	{q9}, [r0], r1		//read 7th row
+	vst1.u8	{q10}, [r2], r3			//write 6th 16Byte
+
+	FILTER_6TAG_8BITS 	d0, d2, d4, d6, d16, d18, d20, q14, q15
+	pld			[r0]
+	FILTER_6TAG_8BITS 	d1, d3, d5, d7, d17, d19, d21, q14, q15
+	vld1.u8	{q0}, [r0], r1		//read 8th row
+	vst1.u8	{q10}, [r2], r3			//write 7th 16Byte
+
+	FILTER_6TAG_8BITS 	d2, d4, d6, d16, d18, d0, d20, q14, q15
+	pld			[r0]
+	FILTER_6TAG_8BITS 	d3, d5, d7, d17, d19, d1, d21, q14, q15
+	vst1.u8	{q10}, [r2], r3			//write 8th 16Byte
+
+	//q2, q3, q8, q9, q0 --> q0~q8
+	vswp	q0, q8
+	vswp	q0, q2
+	vmov	q1, q3
+	vmov	q3, q9						//q0~q8
+
+	sub		r4, #8
+	cmp		r4, #1
+	bne		w17_v_mc_luma_loop
+	// the last 16Bytes
+	vld1.u8	{q9}, [r0], r1		//q9=src[3]
+	FILTER_6TAG_8BITS 	d0, d2, d4, d6, d16, d18, d20, q14, q15
+	FILTER_6TAG_8BITS 	d1, d3, d5, d7, d17, d19, d21, q14, q15
+	vst1.u8	{q10}, [r2], r3			//write 1st 16Byte
+
+	pop		{r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McHorVer02Height9_neon
+	push		{r4}
+	ldr			r4, [sp, #4]
+
+	sub			r0, r0, r1, lsl #1		//src[-2*src_stride]
+	pld			[r0]
+	pld			[r0, r1]
+	vmov.u16	q14, #0x0014			// 20
+	vld1.u8	{d0}, [r0], r1		//d0=src[-2]
+	vld1.u8	{d1}, [r0], r1		//d1=src[-1]
+
+	pld			[r0]
+	pld			[r0, r1]
+	vshr.u16	q15, q14, #2			// 5
+	vld1.u8	{d2}, [r0], r1		//d2=src[0]
+	vld1.u8	{d3}, [r0], r1		//d3=src[1]
+
+	vld1.u8	{d4}, [r0], r1		//d4=src[2]
+	vld1.u8	{d5}, [r0], r1		//d5=src[3]
+
+w9_v_mc_luma_loop:
+
+	pld			[r0]
+	FILTER_6TAG_8BITS 	d0, d1, d2, d3, d4, d5, d16, q14, q15
+	vld1.u8	{d0}, [r0], r1		//read 2nd row
+	vst1.u8	{d16}, [r2], r3		//write 1st 8Byte
+
+	pld			[r0]
+	FILTER_6TAG_8BITS 	d1, d2, d3, d4, d5, d0, d16, q14, q15
+	vld1.u8	{d1}, [r0], r1		//read 3rd row
+	vst1.u8	{d16}, [r2], r3		//write 2nd 8Byte
+
+	pld			[r0]
+	FILTER_6TAG_8BITS 	d2, d3, d4, d5, d0, d1, d16, q14, q15
+	vld1.u8	{d2}, [r0], r1		//read 4th row
+	vst1.u8	{d16}, [r2], r3		//write 3rd 8Byte
+
+	pld			[r0]
+	FILTER_6TAG_8BITS 	d3, d4, d5, d0, d1, d2, d16, q14, q15
+	vld1.u8	{d3}, [r0], r1		//read 5th row
+	vst1.u8	{d16}, [r2], r3		//write 4th 8Byte
+
+	//d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
+	vswp	q0, q2
+	vswp	q1, q2
+
+	sub		r4, #4
+	cmp		r4, #1
+	bne		w9_v_mc_luma_loop
+
+	FILTER_6TAG_8BITS 	d0, d1, d2, d3, d4, d5, d16, q14, q15
+	vst1.u8	{d16}, [r2], r3		//write last 8Byte
+
+	pop		{r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McHorVer22Width17_neon
+	push		{r4}
+	vpush		{q4-q7}
+	ldr			r4, [sp, #68]
+
+	sub			r0, #2					//src[-2]
+	sub			r0, r0, r1, lsl #1		//src[-2*src_stride-2]
+	pld			[r0]
+	pld			[r0, r1]
+
+	vmov.u16	q14, #0x0014			// 20
+	vld1.u8	{d0-d2}, [r0], r1		//use 21(17+5), =src[-2]
+	vld1.u8	{d3-d5}, [r0], r1		//use 21(17+5), =src[-1]
+
+	pld			[r0]
+	pld			[r0, r1]
+	vshr.u16	q15, q14, #2			// 5
+
+	vld1.u8	{d6-d8}, [r0], r1		//use 21(17+5), =src[0]
+	vld1.u8	{d9-d11}, [r0], r1	//use 21(17+5), =src[1]
+	pld			[r0]
+	pld			[r0, r1]
+	vld1.u8	{d12-d14}, [r0], r1	//use 21(17+5), =src[2]
+	sub			r3, #16
+
+w17_hv_mc_luma_loop:
+
+	vld1.u8	{d15-d17}, [r0], r1	//use 21(17+5), =src[3]
+	//the 1st row
+	pld			[r0]
+	// vertical filtered into q9/q10
+	FILTER_6TAG_8BITS_TO_16BITS 	d0, d3, d6, d9, d12, d15, q9, q14, q15	// 8 avail
+	FILTER_6TAG_8BITS_TO_16BITS 	d1, d4, d7,d10, d13, d16,q10, q14, q15	// 8 avail
+	// horizon filtered
+	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
+	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0	//output to q0[0]
+	// vertical filtered into q10/q11
+	FILTER_6TAG_8BITS_TO_16BITS 	d2, d5, d8,d11, d14, d17,q11, q14, q15	// only 6 avail
+	// horizon filtered
+	UNPACK_2_16BITS_TO_ABC	q10, q11, q9, q12, q13
+	FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1	//output to q0[1]
+	vst1.u8	{d0, d1}, [r2]!			//write 16Byte
+	UNPACK_1_IN_8x16BITS_TO_8BITS	d2, d22, d23, q11 //output to d2[0]
+	vst1.u8	{d2[0]}, [r2], r3		//write 16th Byte
+
+	vld1.u8	{d0-d2}, [r0], r1		//read 2nd row
+	//the 2nd row
+	pld			[r0]
+	// vertical filtered into q9/q10
+	FILTER_6TAG_8BITS_TO_16BITS 	d3, d6, d9, d12, d15, d0, q9, q14, q15	// 8 avail
+	FILTER_6TAG_8BITS_TO_16BITS 	d4, d7,d10, d13, d16, d1,q10, q14, q15	// 8 avail
+	// horizon filtered
+	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
+	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d3	//output to d3
+	// vertical filtered into q10/q11
+	FILTER_6TAG_8BITS_TO_16BITS 	d5, d8,d11, d14, d17, d2,q11, q14, q15	// only 6 avail
+	// horizon filtered
+	UNPACK_2_16BITS_TO_ABC	q10, q11, q9, q12, q13
+	FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d4	//output to d4
+	vst1.u8	{d3, d4}, [r2]!		//write 16Byte
+	UNPACK_1_IN_8x16BITS_TO_8BITS	d5, d22, d23, q11 //output to d5[0]
+	vst1.u8	{d5[0]}, [r2], r3		//write 16th Byte
+
+	vld1.u8	{d3-d5}, [r0], r1		//read 3rd row
+	//the 3rd row
+	pld			[r0]
+	// vertical filtered into q9/q10
+	FILTER_6TAG_8BITS_TO_16BITS 	d6, d9, d12, d15, d0, d3, q9, q14, q15	// 8 avail
+	FILTER_6TAG_8BITS_TO_16BITS 	d7,d10, d13, d16, d1, d4,q10, q14, q15	// 8 avail
+	// horizon filtered
+	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
+	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d6	//output to d6
+	// vertical filtered into q10/q11
+	FILTER_6TAG_8BITS_TO_16BITS 	d8,d11, d14, d17, d2, d5,q11, q14, q15	// only 6 avail
+	// horizon filtered
+	UNPACK_2_16BITS_TO_ABC	q10, q11, q9, q12, q13
+	FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d7	//output to d7
+	vst1.u8	{d6, d7}, [r2]!		//write 16Byte
+	UNPACK_1_IN_8x16BITS_TO_8BITS	d8, d22, d23, q11 //output to d8[0]
+	vst1.u8	{d8[0]}, [r2], r3		//write 16th Byte
+
+	vld1.u8	{d6-d8}, [r0], r1		//read 4th row
+	//the 4th row
+	pld			[r0]
+	// vertical filtered into q9/q10
+	FILTER_6TAG_8BITS_TO_16BITS 	 d9, d12, d15, d0, d3, d6, q9, q14, q15	// 8 avail
+	FILTER_6TAG_8BITS_TO_16BITS		d10, d13, d16, d1, d4, d7,q10, q14, q15	// 8 avail
+	// horizon filtered
+	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
+	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d9	//output to d9
+	// vertical filtered into q10/q11
+	FILTER_6TAG_8BITS_TO_16BITS 	d11, d14, d17, d2, d5, d8,q11, q14, q15	// only 6 avail
+	// horizon filtered
+	UNPACK_2_16BITS_TO_ABC	q10, q11, q9, q12, q13
+	FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d10	//output to d10
+	vst1.u8	{d9, d10}, [r2], r3		//write 16Byte
+	UNPACK_1_IN_8x16BITS_TO_8BITS	d11, d22, d23, q11 //output to d11[0]
+	vst1.u8	{d11[0]}, [r2], r3		//write 16th Byte
+
+	//d12~d17(q6~q8), d0~d8(q0~q3+d8), --> d0~d14
+	vswp	q0, q6
+	vswp	q6, q3
+	vmov	q5, q2
+	vmov	q2, q8
+
+	vmov	d20,d8
+	vmov	q4, q1
+	vmov	q1, q7
+	vmov	d14,d20
+
+	sub		r4, #4
+	cmp		r4, #1
+	bne		w17_hv_mc_luma_loop
+	//the last row
+	vld1.u8	{d15-d17}, [r0], r1	//use 21(17+5), =src[3]
+	// vertical filtered into q9/q10
+	FILTER_6TAG_8BITS_TO_16BITS 	d0, d3, d6, d9, d12, d15, q9, q14, q15	// 8 avail
+	FILTER_6TAG_8BITS_TO_16BITS 	d1, d4, d7,d10, d13, d16,q10, q14, q15	// 8 avail
+	// horizon filtered
+	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
+	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0	//output to q0[0]
+	// vertical filtered into q10/q11
+	FILTER_6TAG_8BITS_TO_16BITS 	d2, d5, d8,d11, d14, d17,q11, q14, q15	// only 6 avail
+	// horizon filtered
+	UNPACK_2_16BITS_TO_ABC	q10, q11, q9, q12, q13
+	FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1	//output to q0[1]
+	vst1.u8	{q0}, [r2]!			//write 16Byte
+	UNPACK_1_IN_8x16BITS_TO_8BITS	d2, d22, d23, q11 //output to d2[0]
+	vst1.u8	{d2[0]}, [r2], r3		//write 16th Byte
+
+	vpop		{q4-q7}
+	pop		{r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN McHorVer22Width9_neon
+	push		{r4}
+	vpush		{q4}
+	ldr			r4, [sp, #20]
+
+	sub			r0, #2				//src[-2]
+	sub			r0, r0, r1, lsl #1	//src[-2*src_stride-2]
+	pld			[r0]
+	pld			[r0, r1]
+
+	vmov.u16	q14, #0x0014		// 20
+	vld1.u8	{q0}, [r0], r1	//use 14(9+5), =src[-2]
+	vld1.u8	{q1}, [r0], r1	//use 14(9+5), =src[-1]
+
+	pld			[r0]
+	pld			[r0, r1]
+	vshr.u16	q15, q14, #2		// 5
+
+	vld1.u8	{q2}, [r0], r1	//use 14(9+5), =src[0]
+	vld1.u8	{q3}, [r0], r1	//use 14(9+5), =src[1]
+	pld			[r0]
+	pld			[r0, r1]
+	vld1.u8	{q4}, [r0], r1	//use 14(9+5), =src[2]
+	sub			r3, #8
+
+w9_hv_mc_luma_loop:
+
+	vld1.u8	{q8}, [r0], r1	//use 14(9+5), =src[3]
+	//the 1st row
+	pld			[r0]
+	// vertical filtered into q9/q10
+	FILTER_6TAG_8BITS_TO_16BITS 	d0, d2, d4, d6, d8, d16, q9, q14, q15	// 8 avail
+	FILTER_6TAG_8BITS_TO_16BITS 	d1, d3, d5, d7, d9, d17, q10, q14, q15	// 6 avail
+	// horizon filtered
+	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
+	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18	//output to q9[0]
+	vst1.u8	d18, [r2]!				//write 8Byte
+	UNPACK_1_IN_8x16BITS_TO_8BITS	d19, d20, d21, q10 //output to d19[0]
+	vst1.u8	{d19[0]}, [r2], r3	//write 8th Byte
+
+	vld1.u8	{q0}, [r0], r1		//read 2nd row
+	//the 2nd row
+	pld			[r0]
+	// vertical filtered into q9/q10
+	FILTER_6TAG_8BITS_TO_16BITS 	d2, d4, d6, d8, d16, d0, q9, q14, q15	// 8 avail
+	FILTER_6TAG_8BITS_TO_16BITS 	d3, d5, d7, d9, d17, d1, q10, q14, q15	// 6 avail
+	// horizon filtered
+	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
+	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18	//output to q9[0]
+	vst1.u8	d18, [r2]!				//write 8Byte
+	UNPACK_1_IN_8x16BITS_TO_8BITS	d19, d20, d21, q10 //output to d19[0]
+	vst1.u8	{d19[0]}, [r2], r3	//write 8th Byte
+
+	vld1.u8	{q1}, [r0], r1		//read 3rd row
+	//the 3rd row
+	pld			[r0]
+	// vertical filtered into q9/q10
+	FILTER_6TAG_8BITS_TO_16BITS 	d4, d6, d8, d16, d0, d2, q9, q14, q15	// 8 avail
+	FILTER_6TAG_8BITS_TO_16BITS 	d5, d7, d9, d17, d1, d3, q10, q14, q15	// 6 avail
+	// horizon filtered
+	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
+	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18	//output to q9[0]
+	vst1.u8	d18, [r2]!				//write 8Byte
+	UNPACK_1_IN_8x16BITS_TO_8BITS	d19, d20, d21, q10 //output to d19[0]
+	vst1.u8	{d19[0]}, [r2], r3	//write 8th Byte
+
+	vld1.u8	{q2}, [r0], r1		//read 4th row
+	//the 4th row
+	pld			[r0]
+	// vertical filtered into q9/q10
+	FILTER_6TAG_8BITS_TO_16BITS 	d6, d8, d16, d0, d2, d4, q9, q14, q15	// 8 avail
+	FILTER_6TAG_8BITS_TO_16BITS 	d7, d9, d17, d1, d3, d5, q10, q14, q15	// 6 avail
+	// horizon filtered
+	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
+	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18	//output to q9[0]
+	vst1.u8	d18, [r2]!			//write 8Byte
+	UNPACK_1_IN_8x16BITS_TO_8BITS	d19, d20, d21, q10 //output to d19[0]
+	vst1.u8	{d19[0]}, [r2], r3	//write 8th Byte
+
+	//q4~q8, q0~q2, --> q0~q4
+	vswp	q0, q4
+	vswp	q2, q4
+	vmov	q3, q1
+	vmov	q1, q8
+
+	sub		r4, #4
+	cmp		r4, #1
+	bne		w9_hv_mc_luma_loop
+	//the last row
+	vld1.u8	{q8}, [r0], r1	//use 14(9+5), =src[3]
+	// vertical filtered into q9/q10
+	FILTER_6TAG_8BITS_TO_16BITS 	d0, d2, d4, d6, d8, d16, q9, q14, q15	// 8 avail
+	FILTER_6TAG_8BITS_TO_16BITS 	d1, d3, d5, d7, d9, d17, q10, q14, q15	// 6 avail
+	// horizon filtered
+	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
+	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18	//output to q9[0]
+	vst1.u8	d18, [r2]!				//write 8Byte
+	UNPACK_1_IN_8x16BITS_TO_8BITS	d19, d20, d21, q10 //output to d19[0]
+	vst1.u8	{d19[0]}, [r2], r3	//write 8th Byte
+	vpop		{q4}
+	pop		{r4}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN PixStrideAvgWidthEq16_neon
+	push		{r4, r5, r6}
+	ldr			r4, [sp, #12]
+	ldr			r5, [sp, #16]
+	ldr			r6, [sp, #20]
+
+enc_w16_pix_avg_loop:
+	vld1.u8		{q0}, [r2], r3
+	vld1.u8		{q1}, [r4], r5
+	vld1.u8		{q2}, [r2], r3
+	vld1.u8		{q3}, [r4], r5
+
+	vld1.u8		{q8}, [r2], r3
+	vld1.u8		{q9}, [r4], r5
+	vld1.u8		{q10}, [r2], r3
+	vld1.u8		{q11}, [r4], r5
+
+	AVERAGE_TWO_8BITS		d0, d0, d2
+	AVERAGE_TWO_8BITS		d1, d1, d3
+	vst1.u8		{q0}, [r0], r1
+
+	AVERAGE_TWO_8BITS		d4, d4, d6
+	AVERAGE_TWO_8BITS		d5, d5, d7
+	vst1.u8		{q2}, [r0], r1
+
+	AVERAGE_TWO_8BITS		d16, d16, d18
+	AVERAGE_TWO_8BITS		d17, d17, d19
+	vst1.u8		{q8}, [r0], r1
+
+	AVERAGE_TWO_8BITS		d20, d20, d22
+	AVERAGE_TWO_8BITS		d21, d21, d23
+	vst1.u8		{q10}, [r0], r1
+
+	sub			r6, #4
+	cmp			r6, #0
+	bne			enc_w16_pix_avg_loop
+
+	pop		{r4, r5, r6}
+WELS_ASM_FUNC_END
+
+
+WELS_ASM_FUNC_BEGIN PixStrideAvgWidthEq8_neon
+	push		{r4, r5, r6}
+	ldr			r4, [sp, #12]
+	ldr			r5, [sp, #16]
+	ldr			r6, [sp, #20]
+enc_w8_pix_avg_loop:
+
+	vld1.u8		{d0}, [r2], r3
+	vld1.u8		{d2}, [r4], r5
+	vld1.u8		{d1}, [r2], r3
+	vld1.u8		{d3}, [r4], r5
+
+	AVERAGE_TWO_8BITS		d0, d0, d2
+	AVERAGE_TWO_8BITS		d1, d1, d3
+	vst1.u8		{d0}, [r0], r1
+	vst1.u8		{d1}, [r0], r1
+
+	vld1.u8		{d4}, [r2], r3
+	vld1.u8		{d6}, [r4], r5
+	vld1.u8		{d5}, [r2], r3
+	vld1.u8		{d7}, [r4], r5
+
+	AVERAGE_TWO_8BITS		d4, d4, d6
+	AVERAGE_TWO_8BITS		d5, d5, d7
+	vst1.u8		{d4}, [r0], r1
+	vst1.u8		{d5}, [r0], r1
+
+	sub			r6, #4
+	cmp			r6, #0
+	bne			enc_w8_pix_avg_loop
+
+	pop		{r4, r5, r6}
+WELS_ASM_FUNC_END
+
+#endif
--- a/codec/common/arm_arch_common_macro.S
+++ /dev/null
@@ -1,64 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#ifdef __APPLE__
-
-.macro WELS_ASM_FUNC_BEGIN
-.align 2
-.arm
-.globl _$0
-_$0:
-.endm
-
-.macro WELS_ASM_FUNC_END
-mov pc, lr
-.endm
-#else
-
-.syntax unified
-.section .note.GNU-stack,"",%progbits // Mark stack as non-executable
-.text
-
-.macro WELS_ASM_FUNC_BEGIN funcName
-.align 2
-.arm
-.global \funcName
-.type \funcName, %function
-.func \funcName
-\funcName:
-.endm
-
-.macro WELS_ASM_FUNC_END
-mov pc, lr
-.endfunc
-.endm
-#endif
--- a/codec/common/asm_inc.asm
+++ /dev/null
@@ -1,599 +1,0 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  sse2inc.asm
-;*
-;*  Abstract
-;*      macro and constant
-;*
-;*  History
-;*      8/5/2009 Created
-;*
-;*
-;*************************************************************************/
-;***********************************************************************
-; Options, for DEBUG
-;***********************************************************************
-
-%if 1
-	%define MOVDQ movdqa
-%else
-	%define MOVDQ movdqu
-%endif
-
-%if 1
-	%define WELSEMMS	emms
-%else
-	%define WELSEMMS
-%endif
-
-
-;***********************************************************************
-; Macros
-;***********************************************************************
-
-DEFAULT REL
-
-%ifdef WIN64 ; Windows x64 ;************************************
-
-BITS 64
-
-%define arg1 rcx
-%define arg2 rdx
-%define arg3 r8
-%define arg4 r9
-%define arg5 [rsp + push_num*8 + 40]
-%define arg6 [rsp + push_num*8 + 48]
-%define arg7 [rsp + push_num*8 + 56]
-%define arg8 [rsp + push_num*8 + 64]
-%define arg9 [rsp + push_num*8 + 72]
-%define arg10 [rsp + push_num*8 + 80]
-%define arg11 [rsp + push_num*8 + 88]
-%define arg12 [rsp + push_num*8 + 96]
-
-%define r0 rcx
-%define r1 rdx
-%define r2 r8
-%define r3 r9
-%define r4 rax
-%define r5 r10
-%define r6 r11
-%define r7 rsp
-
-%define r0d ecx
-%define r1d edx
-%define r2d r8d
-%define r3d r9d
-%define r4d eax
-%define r5d r10d
-%define r6d r11d
-
-%define r0w  cx
-%define r1w  dx
-%define r2w  r8w
-%define r3w  r9w
-
-%define r0b  cl
-%define r1b  dl
-%define r2b  r8l
-%define r3b  r9l
-
-%define  PUSHRFLAGS     pushfq
-%define  POPRFLAGS      popfq
-%define  retrq          rax
-%define  retrd          eax
-
-%elifdef UNIX64 ; Unix x64 ;************************************
-
-BITS 64
-
-%define arg1 rdi
-%define arg2 rsi
-%define arg3 rdx
-%define arg4 rcx
-%define arg5 r8
-%define arg6 r9
-%define arg7 [rsp + push_num*8 + 8]
-%define arg8 [rsp + push_num*8 + 16]
-%define arg9 [rsp + push_num*8 + 24]
-%define arg10 [rsp + push_num*8 + 32]
-%define arg11 [rsp + push_num*8 + 40]
-%define arg12 [rsp + push_num*8 + 48]
-
-%define r0 rdi
-%define r1 rsi
-%define r2 rdx
-%define r3 rcx
-%define r4 r8
-%define r5 r9
-%define r6 r10
-%define r7 rsp
-
-%define r0d edi
-%define r1d esi
-%define r2d edx
-%define r3d ecx
-%define r4d r8d
-%define r5d r9d
-%define r6d r10d
-
-%define r0w  di
-%define r1w  si
-%define r2w  dx
-%define r3w  cx
-
-%define r0b  dil
-%define r1b  sil
-%define r2b  dl
-%define r3b  cl
-
-%define  PUSHRFLAGS     pushfq
-%define  POPRFLAGS      popfq
-%define  retrq          rax
-%define  retrd          eax
-
-%elifdef X86_32 ; X86_32 ;************************************
-
-BITS 32
-
-%define arg1 [esp + push_num*4 + 4]
-%define arg2 [esp + push_num*4 + 8]
-%define arg3 [esp + push_num*4 + 12]
-%define arg4 [esp + push_num*4 + 16]
-%define arg5 [esp + push_num*4 + 20]
-%define arg6 [esp + push_num*4 + 24]
-%define arg7 [esp + push_num*4 + 28]
-%define arg8 [esp + push_num*4 + 32]
-%define arg9 [esp + push_num*4 + 36]
-%define arg10 [esp + push_num*4 + 40]
-%define arg11 [esp + push_num*4 + 44]
-%define arg12 [esp + push_num*4 + 48]
-
-%define r0 eax
-%define r1 ecx
-%define r2 edx
-%define r3 ebx
-%define r4 esi
-%define r5 edi
-%define r6 ebp
-%define r7 esp
-
-%define r0d eax
-%define r1d ecx
-%define r2d edx
-%define r3d ebx
-%define r4d esi
-%define r5d edi
-%define r6d ebp
-
-%define r0w ax
-%define r1w cx
-%define r2w dx
-%define r3w bx
-
-%define r0b al
-%define r1b cl
-%define r2b dl
-%define r3b bl
-
-%define  PUSHRFLAGS     pushfd
-%define  POPRFLAGS      popfd
-%define  retrq          eax      ; 32 bit mode do not support 64 bits regesters
-%define  retrd          eax
-
-%endif
-
-%macro LOAD_PARA 2
-    mov %1, %2
-%endmacro
-
-%macro LOAD_1_PARA 0
-    %ifdef X86_32
-	mov r0, [esp + push_num*4 + 4]
-    %endif
-%endmacro
-
-%macro LOAD_2_PARA 0
-    %ifdef X86_32
-        mov r0, [esp + push_num*4 + 4]
-        mov r1, [esp + push_num*4 + 8]
-    %endif
-%endmacro
-
-%macro LOAD_3_PARA 0
-    %ifdef X86_32
-        mov r0, [esp + push_num*4 + 4]
-	mov r1, [esp + push_num*4 + 8]
-	mov r2, [esp + push_num*4 + 12]
-    %endif
-%endmacro
-
-%macro LOAD_4_PARA 0
-    %ifdef X86_32
-        push r3
-        %assign  push_num push_num+1
-        mov r0, [esp + push_num*4 + 4]
-        mov r1, [esp + push_num*4 + 8]
-        mov r2, [esp + push_num*4 + 12]
-        mov r3, [esp + push_num*4 + 16]
-    %endif
-%endmacro
-
-%macro LOAD_5_PARA 0
-    %ifdef X86_32
-        push r3
-        push r4
-        %assign  push_num push_num+2
-        mov r0, [esp + push_num*4 + 4]
-        mov r1, [esp + push_num*4 + 8]
-        mov r2, [esp + push_num*4 + 12]
-        mov r3, [esp + push_num*4 + 16]
-        mov r4, [esp + push_num*4 + 20]
-    %elifdef WIN64
-        mov r4, [rsp + push_num*8 + 40]
-    %endif
-%endmacro
-
-%macro LOAD_6_PARA 0
-    %ifdef X86_32
-	push r3
-        push r4
-        push r5
-        %assign  push_num push_num+3
-        mov r0, [esp + push_num*4 + 4]
-        mov r1, [esp + push_num*4 + 8]
-        mov r2, [esp + push_num*4 + 12]
-        mov r3, [esp + push_num*4 + 16]
-        mov r4, [esp + push_num*4 + 20]
-        mov r5, [esp + push_num*4 + 24]
-    %elifdef WIN64
-        mov r4, [rsp + push_num*8 + 40]
-        mov r5, [rsp + push_num*8 + 48]
-    %endif
-%endmacro
-
-%macro LOAD_7_PARA 0
-    %ifdef X86_32
-        push r3
-        push r4
-        push r5
-        push r6
-        %assign  push_num push_num+4
-        mov r0, [esp + push_num*4 + 4]
-        mov r1, [esp + push_num*4 + 8]
-        mov r2, [esp + push_num*4 + 12]
-        mov r3, [esp + push_num*4 + 16]
-        mov r4, [esp + push_num*4 + 20]
-        mov r5, [esp + push_num*4 + 24]
-        mov r6, [esp + push_num*4 + 28]
-    %elifdef WIN64
-        mov r4, [rsp + push_num*8 + 40]
-        mov r5, [rsp + push_num*8 + 48]
-        mov r6, [rsp + push_num*8 + 56]
-    %elifdef UNIX64
-        mov r6, [rsp + push_num*8 + 8]
-    %endif
-%endmacro
-
-
-
-%macro LOAD_4_PARA_POP 0
-    %ifdef X86_32
-	pop r3
-    %endif
-%endmacro
-
-%macro LOAD_5_PARA_POP 0
-    %ifdef X86_32
-        pop r4
-	pop r3
-    %endif
-%endmacro
-
-%macro LOAD_6_PARA_POP 0
-    %ifdef X86_32
-        pop r5
-  	pop r4
- 	pop r3
-    %endif
-%endmacro
-
-%macro LOAD_7_PARA_POP 0
-    %ifdef X86_32
-        pop r6
-        pop r5
-        pop r4
-        pop r3
-    %endif
-%endmacro
-
-%macro PUSH_XMM 1
-    %ifdef WIN64
-        %assign xmm_num_regs %1
-        %if xmm_num_regs > 6
-            %ifdef push_num
-                %assign push_num push_num+2*(%1-6)
-            %endif
-            sub rsp, 16*(%1 - 6)
-            movdqu [rsp], xmm6
-        %endif
-        %if xmm_num_regs > 7
-            movdqu [rsp+16], xmm7
-        %endif
-        %if xmm_num_regs > 8
-            movdqu [rsp+32], xmm8
-        %endif
-        %if xmm_num_regs > 9
-            movdqu [rsp+48], xmm9
-        %endif
-        %if xmm_num_regs > 10
-            movdqu [rsp+64], xmm10
-        %endif
-        %if xmm_num_regs > 11
-            movdqu [rsp+80], xmm11
-        %endif
-        %if xmm_num_regs > 12
-            movdqu [rsp+96], xmm12
-        %endif
-        %if xmm_num_regs > 13
-            movdqu [rsp+112], xmm13
-        %endif
-        %if xmm_num_regs > 14
-            movdqu [rsp+128], xmm14
-        %endif
-        %if xmm_num_regs > 15
-            movdqu [rsp+144], xmm15
-        %endif
-    %endif
-%endmacro
-
-%macro POP_XMM 0
-    %ifdef WIN64
-        %if xmm_num_regs > 15
-            movdqu xmm15, [rsp+144]
-        %endif
-        %if xmm_num_regs > 14
-            movdqu xmm14, [rsp+128]
-        %endif
-        %if xmm_num_regs > 13
-            movdqu xmm13, [rsp+112]
-        %endif
-        %if xmm_num_regs > 12
-            movdqu xmm12, [rsp+96]
-        %endif
-        %if xmm_num_regs > 11
-            movdqu xmm11, [rsp+80]
-        %endif
-        %if xmm_num_regs > 10
-            movdqu xmm10, [rsp+64]
-        %endif
-        %if xmm_num_regs > 9
-            movdqu xmm9, [rsp+48]
-        %endif
-        %if xmm_num_regs > 8
-            movdqu xmm8, [rsp+32]
-        %endif
-        %if xmm_num_regs > 7
-            movdqu xmm7, [rsp+16]
-        %endif
-        %if xmm_num_regs > 6
-            movdqu xmm6, [rsp]
-            add rsp, 16*(xmm_num_regs - 6)
-        %endif
-    %endif
-%endmacro
-
-%macro SIGN_EXTENSION 2
-    %ifndef X86_32
-            movsxd %1, %2
-    %endif
-%endmacro
-
-%macro SIGN_EXTENSIONW 2
-    %ifndef X86_32
-            movsx %1, %2
-    %endif
-%endmacro
-
-%macro WELS_EXTERN 1
-    ALIGN 16
-    %ifdef PREFIX
-        global _%1
-        %define %1 _%1
-    %else
-        global %1
-    %endif
-    %1:
-%endmacro
-
-%macro WELS_AbsW 2
-	pxor        %2, %2
-    psubw       %2, %1
-    pmaxsw      %1, %2
-%endmacro
-
-%macro MMX_XSwap  4
-    movq		%4, %2
-    punpckh%1   %4, %3
-    punpckl%1   %2, %3
-%endmacro
-
-; pOut mm1, mm4, mm5, mm3
-%macro MMX_Trans4x4W 5
-    MMX_XSwap wd, %1, %2, %5
-    MMX_XSwap wd, %3, %4, %2
-    MMX_XSwap dq, %1, %3, %4
-    MMX_XSwap dq, %5, %2, %3
-%endmacro
-
-;for TRANSPOSE
-%macro SSE2_XSawp 4
-    movdqa      %4, %2
-    punpckl%1   %2, %3
-    punpckh%1   %4, %3
-%endmacro
-
-; in: xmm1, xmm2, xmm3, xmm4  pOut:  xmm1, xmm4, xmm5, mm3
-%macro SSE2_Trans4x4D 5
-    SSE2_XSawp dq,  %1, %2, %5
-    SSE2_XSawp dq,  %3, %4, %2
-    SSE2_XSawp qdq, %1, %3, %4
-    SSE2_XSawp qdq, %5, %2, %3
-%endmacro
-
-;in: xmm0, xmm1, xmm2, xmm3  pOut:  xmm0, xmm1, xmm3, xmm4
-%macro SSE2_TransTwo4x4W 5
-    SSE2_XSawp wd,  %1, %2, %5
-    SSE2_XSawp wd,  %3, %4, %2
-    SSE2_XSawp dq,  %1, %3, %4
-    SSE2_XSawp dq,  %5, %2, %3
-    SSE2_XSawp qdq, %1, %5, %2
-    SSE2_XSawp qdq, %4, %3, %5
-%endmacro
-
-;in:  m1, m2, m3, m4, m5, m6, m7, m8
-;pOut: m5, m3, m4, m8, m6, m2, m7, m1
-%macro SSE2_TransTwo8x8B 9
-	movdqa	%9,	%8
-	SSE2_XSawp bw,  %1, %2, %8
-	SSE2_XSawp bw,  %3, %4, %2
-	SSE2_XSawp bw,  %5, %6, %4
-	movdqa	%6, %9
-	movdqa	%9, %4
-	SSE2_XSawp bw,  %7, %6, %4
-
-	SSE2_XSawp wd,  %1, %3, %6
-	SSE2_XSawp wd,  %8, %2, %3
-	SSE2_XSawp wd,  %5, %7, %2
-	movdqa	%7, %9
-	movdqa	%9, %3
-	SSE2_XSawp wd,  %7, %4, %3
-
-	SSE2_XSawp dq,  %1, %5, %4
-	SSE2_XSawp dq,  %6, %2, %5
-	SSE2_XSawp dq,  %8, %7, %2
-	movdqa	%7, %9
-	movdqa	%9, %5
-	SSE2_XSawp dq,  %7, %3, %5
-
-	SSE2_XSawp qdq,  %1, %8, %3
-	SSE2_XSawp qdq,  %4, %2, %8
-	SSE2_XSawp qdq,  %6, %7, %2
-	movdqa	%7, %9
-	movdqa	%9, %1
-	SSE2_XSawp qdq,  %7, %5, %1
-	movdqa	%5, %9
-%endmacro
-
-;xmm0, xmm6, xmm7, [eax], [ecx]
-;xmm7 = 0, eax = pix1, ecx = pix2, xmm0 save the result
-%macro SSE2_LoadDiff8P 5
-    movq         %1, %4
-    punpcklbw    %1, %3
-    movq         %2, %5
-    punpcklbw    %2, %3
-    psubw        %1, %2
-%endmacro
-
-; m2 = m1 + m2, m1 = m1 - m2
-%macro SSE2_SumSub 3
-	movdqa  %3, %2
-    paddw   %2, %1
-    psubw   %1, %3
-%endmacro
-
-
-%macro butterfly_1to16_sse	3	; xmm? for dst, xmm? for tmp, one byte for pSrc [generic register name: a/b/c/d]
-	mov %3h, %3l
-	movd %1, e%3x		; i.e, 1% = eax (=b0)
-	pshuflw %2, %1, 00h	; ..., b0 b0 b0 b0 b0 b0 b0 b0
-	pshufd %1, %2, 00h	; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0
-%endmacro
-
-;copy a dw into a xmm for 8 times
-%macro  SSE2_Copy8Times 2
-		movd	%1, %2
-		punpcklwd %1, %1
-		pshufd	%1,	%1,	0
-%endmacro
-
-;copy a db into a xmm for 16 times
-%macro  SSE2_Copy16Times 2
-		movd		%1, %2
-		pshuflw		%1, %1, 0
-		punpcklqdq	%1, %1
-		packuswb	%1,	%1
-%endmacro
-
-
-
-;***********************************************************************
-;preprocessor constants
-;***********************************************************************
-;dw 32,32,32,32,32,32,32,32 for xmm
-;dw 32,32,32,32 for mm
-%macro WELS_DW32 1
-	pcmpeqw %1,%1
-	psrlw %1,15
-	psllw %1,5
-%endmacro
-
-;dw 1, 1, 1, 1, 1, 1, 1, 1 for xmm
-;dw 1, 1, 1, 1 for mm
-%macro WELS_DW1 1
-	pcmpeqw %1,%1
-	psrlw %1,15
-%endmacro
-
-;all 0 for xmm and mm
-%macro	WELS_Zero 1
-	pxor %1, %1
-%endmacro
-
-;dd 1, 1, 1, 1 for xmm
-;dd 1, 1 for mm
-%macro WELS_DD1 1
-	pcmpeqw %1,%1
-	psrld %1,31
-%endmacro
-
-;dB 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
-%macro WELS_DB1 1
-	pcmpeqw %1,%1
-	psrlw %1,15
-	packuswb %1,%1
-%endmacro
-
-
-
-
-
-
--- a/codec/common/cpu.cpp
+++ /dev/null
@@ -1,293 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2009-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * \file	cpu.cpp
- *
- * \brief	CPU compatibility detection
- *
- * \date	04/29/2009 Created
- *
- *************************************************************************************
- */
-#include <string.h>
-#include <stdio.h>
-#ifdef ANDROID_NDK
-#include <cpu-features.h>
-#endif
-#include "cpu.h"
-#include "cpu_core.h"
-
-
-
-#define    CPU_Vendor_AMD    "AuthenticAMD"
-#define    CPU_Vendor_INTEL  "GenuineIntel"
-#define    CPU_Vendor_CYRIX  "CyrixInstead"
-
-#if defined(X86_ASM)
-
-uint32_t WelsCPUFeatureDetect (int32_t* pNumberOfLogicProcessors) {
-  uint32_t uiCPU = 0;
-  uint32_t uiFeatureA = 0, uiFeatureB = 0, uiFeatureC = 0, uiFeatureD = 0;
-  int32_t  CacheLineSize = 0;
-  int8_t   chVendorName[16] = { 0 };
-  uint32_t uiMaxCpuidLevel = 0;
-
-  if (!WelsCPUIdVerify()) {
-    /* cpuid is not supported in cpu */
-    return 0;
-  }
-
-  WelsCPUId (0, &uiFeatureA, (uint32_t*)&chVendorName[0], (uint32_t*)&chVendorName[8], (uint32_t*)&chVendorName[4]);
-  uiMaxCpuidLevel = uiFeatureA;
-  if (uiMaxCpuidLevel == 0) {
-    /* maximum input value for basic cpuid information */
-    return 0;
-  }
-
-  WelsCPUId (1, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
-  if ((uiFeatureD & 0x00800000) == 0) {
-    /* Basic MMX technology is not support in cpu, mean nothing for us so return here */
-    return 0;
-  }
-
-  uiCPU = WELS_CPU_MMX;
-  if (uiFeatureD & 0x02000000) {
-    /* SSE technology is identical to AMD MMX extensions */
-    uiCPU |= WELS_CPU_MMXEXT | WELS_CPU_SSE;
-  }
-  if (uiFeatureD & 0x04000000) {
-    /* SSE2 support here */
-    uiCPU |= WELS_CPU_SSE2;
-  }
-  if (uiFeatureD & 0x00000001) {
-    /* x87 FPU on-chip checking */
-    uiCPU |= WELS_CPU_FPU;
-  }
-  if (uiFeatureD & 0x00008000) {
-    /* CMOV instruction checking */
-    uiCPU |= WELS_CPU_CMOV;
-  }
-  if ((!strcmp ((const char*)chVendorName, CPU_Vendor_INTEL)) ||
-      (!strcmp((const char*)chVendorName, CPU_Vendor_AMD)) ) {	// confirmed_safe_unsafe_usage
-    if (uiFeatureD & 0x10000000) {
-      /* Multi-Threading checking: contains of multiple logic processors */
-      uiCPU |= WELS_CPU_HTT;
-    }
-  }
-
-  if (uiFeatureC & 0x00000001) {
-    /* SSE3 support here */
-    uiCPU |= WELS_CPU_SSE3;
-  }
-  if (uiFeatureC & 0x00000200) {
-    /* SSSE3 support here */
-    uiCPU |= WELS_CPU_SSSE3;
-  }
-  if (uiFeatureC & 0x00080000) {
-    /* SSE4.1 support here, 45nm Penryn processor */
-    uiCPU |= WELS_CPU_SSE41;
-  }
-  if (uiFeatureC & 0x00100000) {
-    /* SSE4.2 support here, next generation Nehalem processor */
-    uiCPU |= WELS_CPU_SSE42;
-  }
-  if (WelsCPUSupportAVX (uiFeatureA, uiFeatureC)) {
-    /* AVX supported */
-    uiCPU |= WELS_CPU_AVX;
-  }
-  if (WelsCPUSupportFMA (uiFeatureA, uiFeatureC)) {
-    /* AVX FMA supported */
-    uiCPU |= WELS_CPU_FMA;
-  }
-  if (uiFeatureC & 0x02000000) {
-    /* AES checking */
-    uiCPU |= WELS_CPU_AES;
-  }
-  if (uiFeatureC & 0x00400000) {
-    /* MOVBE checking */
-    uiCPU |= WELS_CPU_MOVBE;
-  }
-
-  if( pNumberOfLogicProcessors != NULL ){
-    if( uiCPU & WELS_CPU_HTT){
-      *pNumberOfLogicProcessors = (uiFeatureB & 0x00ff0000) >> 16; // feature bits: 23-16 on returned EBX
-    } else {
-      *pNumberOfLogicProcessors = 0;
-    }
-    if( !strcmp((const char*)chVendorName, CPU_Vendor_INTEL) ){
-      if( uiMaxCpuidLevel >= 4 ){
-        uiFeatureC = 0;
-        WelsCPUId(0x4, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
-        if( uiFeatureA != 0 ){
-          *pNumberOfLogicProcessors = ((uiFeatureA&0xfc000000)>>26) + 1;
-        }
-      }
-    }
-  }
-
-  WelsCPUId (0x80000000, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
-
-  if ((!strcmp ((const char*)chVendorName, CPU_Vendor_AMD))
-      && (uiFeatureA >= 0x80000001)) {	// confirmed_safe_unsafe_usage
-    WelsCPUId (0x80000001, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
-    if (uiFeatureD & 0x00400000) {
-      uiCPU |= WELS_CPU_MMXEXT;
-    }
-    if (uiFeatureD & 0x80000000) {
-      uiCPU |= WELS_CPU_3DNOW;
-    }
-  }
-
-  if (!strcmp ((const char*)chVendorName, CPU_Vendor_INTEL)) {	// confirmed_safe_unsafe_usage
-    int32_t  family, model;
-
-    WelsCPUId (1, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
-    family = ((uiFeatureA >> 8) & 0xf) + ((uiFeatureA >> 20) & 0xff);
-    model  = ((uiFeatureA >> 4) & 0xf) + ((uiFeatureA >> 12) & 0xf0);
-
-    if ((family == 6) && (model == 9 || model == 13 || model == 14)) {
-      uiCPU &= ~ (WELS_CPU_SSE2 | WELS_CPU_SSE3);
-    }
-  }
-
-  // get cache line size
-  if ((!strcmp ((const char*)chVendorName, CPU_Vendor_INTEL))
-      || ! (strcmp ((const char*)chVendorName, CPU_Vendor_CYRIX))) {	// confirmed_safe_unsafe_usage
-    WelsCPUId (1, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
-
-    CacheLineSize = (uiFeatureB & 0xff00) >>
-                    5;	// ((clflush_line_size >> 8) << 3), CLFLUSH_line_size * 8 = CacheLineSize_in_byte
-
-    if (CacheLineSize == 128) {
-      uiCPU |= WELS_CPU_CACHELINE_128;
-    } else if (CacheLineSize == 64) {
-      uiCPU |= WELS_CPU_CACHELINE_64;
-    } else if (CacheLineSize == 32) {
-      uiCPU |= WELS_CPU_CACHELINE_32;
-    } else if (CacheLineSize == 16) {
-      uiCPU |= WELS_CPU_CACHELINE_16;
-    }
-  }
-
-  return uiCPU;
-}
-
-
-void WelsCPURestore (const uint32_t kuiCPU) {
-  if (kuiCPU & (WELS_CPU_MMX | WELS_CPU_MMXEXT | WELS_CPU_3DNOW | WELS_CPU_3DNOWEXT)) {
-    WelsEmms();
-  }
-}
-
-#elif defined(HAVE_NEON) //For supporting both android platform and iOS platform
-#if defined(ANDROID_NDK)
-uint32_t WelsCPUFeatureDetect (int32_t* pNumberOfLogicProcessors)
-{
-  uint32_t         uiCPU = 0;
-  AndroidCpuFamily cpuFamily = ANDROID_CPU_FAMILY_UNKNOWN;
-  uint64_t         uiFeatures = 0;
-  cpuFamily = android_getCpuFamily();
-  if (cpuFamily == ANDROID_CPU_FAMILY_ARM)	{
-    uiFeatures = android_getCpuFeatures();
-    if (uiFeatures & ANDROID_CPU_ARM_FEATURE_ARMv7){
-      uiCPU |= WELS_CPU_ARMv7;
-    }
-    if (uiFeatures & ANDROID_CPU_ARM_FEATURE_VFPv3){
-      uiCPU |= WELS_CPU_VFPv3;
-    }
-    if (uiFeatures & ANDROID_CPU_ARM_FEATURE_NEON){
-      uiCPU |= WELS_CPU_NEON;
-    }
-  }
-
-  if( pNumberOfLogicProcessors != NULL ){
-    *pNumberOfLogicProcessors = android_getCpuCount();
-  }
-
-  return uiCPU;
-}
-
-#elif defined(__APPLE__)
-uint32_t WelsCPUFeatureDetect (int32_t* pNumberOfLogicProcessors)
-{
-    uint32_t       uiCPU = 0;
-
-#if defined(__ARM_NEON__)
-    uiCPU |= WELS_CPU_ARMv7;
-    uiCPU |= WELS_CPU_VFPv3;
-    uiCPU |= WELS_CPU_NEON;
-#endif
-    return uiCPU;
-}
-#elif defined(__linux__)
-
-/* Generic arm/linux cpu feature detection */
-uint32_t WelsCPUFeatureDetect (int32_t* pNumberOfLogicProcessors) {
-  FILE *f = fopen("/proc/cpuinfo", "r");
-
-  if (!f)
-    return 0;
-
-  char buf[200];
-  int flags = 0;
-  while (fgets(buf, sizeof(buf), f)) {
-    if (!strncmp(buf, "Features", strlen("Features"))) {
-      if (strstr(buf, " neon "))
-        flags |= WELS_CPU_NEON;
-      if (strstr(buf, " vfpv3 "))
-        flags |= WELS_CPU_VFPv3;
-      break;
-    }
-  }
-  fclose(f);
-  return flags;
-}
-
-#else /* HAVE_NEON enabled but no runtime detection */
-
-/* No runtime feature detection available, but built with HAVE_NEON - assume
- * that NEON and all associated features are available. */
-
-uint32_t WelsCPUFeatureDetect (int32_t* pNumberOfLogicProcessors) {
-  return WELS_CPU_ARMv7 |
-         WELS_CPU_VFPv3 |
-         WELS_CPU_NEON;
-}
-#endif
-#else /* Neither X86_ASM nor HAVE_NEON */
-
-uint32_t WelsCPUFeatureDetect (int32_t* pNumberOfLogicProcessors) {
-  return 0;
-}
-
-#endif
-
-
--- a/codec/common/cpu.h
+++ /dev/null
@@ -1,80 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2009-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * \file	cpu.h
- *
- * \brief	CPU feature compatibility detection
- *
- * \date	04/29/2009 Created
- *
- *************************************************************************************
- */
-#if !defined(WELS_CPU_DETECTION_H__)
-#define WELS_CPU_DETECTION_H__
-
-#include "typedefs.h"
-#include "cpu_core.h"
-
-
-#if defined(__cplusplus)
-extern "C" {
-#endif//__cplusplus
-
-#if defined(X86_ASM)
-/*
- *	cpuid support verify routine
- *  return 0 if cpuid is not supported by cpu
- */
-int32_t  WelsCPUIdVerify();
-
-void WelsCPUId (uint32_t uiIndex, uint32_t* pFeatureA, uint32_t* pFeatureB, uint32_t* pFeatureC, uint32_t* pFeatureD);
-
-int32_t WelsCPUSupportAVX (uint32_t eax, uint32_t ecx);
-int32_t WelsCPUSupportFMA (uint32_t eax, uint32_t ecx);
-
-void WelsEmms();
-
-/*
- *	clear FPU registers states for potential float based calculation if support
- */
-void     WelsCPURestore (const uint32_t kuiCPU);
-
-#else
-#define WelsEmms()
-#endif
-
-uint32_t WelsCPUFeatureDetect (int32_t* pNumberOfLogicProcessors);
-
-#if defined(__cplusplus)
-}
-#endif//__cplusplus
-
-#endif//WELS_CPU_DETECTION_H__
--- a/codec/common/cpu_core.h
+++ /dev/null
@@ -1,85 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2009-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * \file	cpu_core.h
- *
- * \brief	cpu core feature detection
- *
- * \date	4/24/2009 Created
- *
- *************************************************************************************
- */
-#if !defined(WELS_CPU_CORE_FEATURE_DETECTION_H__)
-#define WELS_CPU_CORE_FEATURE_DETECTION_H__
-
-/*
- *	WELS CPU feature flags
- */
-#define WELS_CPU_MMX        0x00000001    /* mmx */
-#define WELS_CPU_MMXEXT     0x00000002    /* mmx-ext*/
-#define WELS_CPU_SSE        0x00000004    /* sse */
-#define WELS_CPU_SSE2       0x00000008    /* sse 2 */
-#define WELS_CPU_SSE3       0x00000010    /* sse 3 */
-#define WELS_CPU_SSE41      0x00000020    /* sse 4.1 */
-#define WELS_CPU_3DNOW      0x00000040    /* 3dnow! */
-#define WELS_CPU_3DNOWEXT   0x00000080    /* 3dnow! ext */
-#define WELS_CPU_ALTIVEC    0x00000100    /* altivec */
-#define WELS_CPU_SSSE3      0x00000200    /* ssse3 */
-#define WELS_CPU_SSE42      0x00000400    /* sse 4.2 */
-
-/* CPU features application extensive */
-#define WELS_CPU_AVX		0x00000800	/* Advanced Vector eXtentions */
-#define WELS_CPU_FPU		0x00001000	/* x87-FPU on chip */
-#define WELS_CPU_HTT		0x00002000	/* Hyper-Threading Technology (HTT), Multi-threading enabled feature:
-										   physical processor package is capable of supporting more than one logic processor
-										*/
-#define WELS_CPU_CMOV		0x00004000	/* Conditional Move Instructions,
-										   also if x87-FPU is present at indicated by the CPUID.FPU feature bit, then FCOMI and FCMOV are supported
-										*/
-#define WELS_CPU_MOVBE		0x00008000	/* MOVBE instruction */
-#define WELS_CPU_AES		0x00010000	/* AES instruction extensions */
-#define WELS_CPU_FMA		0x00020000	/* AVX VEX FMA instruction sets */
-
-#define WELS_CPU_CACHELINE_16    0x10000000    /* CacheLine Size 16 */
-#define WELS_CPU_CACHELINE_32    0x20000000    /* CacheLine Size 32 */
-#define WELS_CPU_CACHELINE_64    0x40000000    /* CacheLine Size 64 */
-#define WELS_CPU_CACHELINE_128   0x80000000    /* CacheLine Size 128 */
-
-/* For the android OS */
-#define WELS_CPU_ARMv7      0x000001    /* ARMv7 */
-#define WELS_CPU_VFPv3      0x000002    /* VFPv3 */
-#define WELS_CPU_NEON       0x000004    /* NEON */
-
-/*
- *	Interfaces for CPU core feature detection as below
- */
-
-#endif//WELS_CPU_CORE_FEATURE_DETECTION_H__
--- a/codec/common/cpuid.asm
+++ /dev/null
@@ -1,212 +1,0 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*	cpu_mmx.asm
-;*
-;*  Abstract
-;*		verify cpuid feature support and cpuid detection
-;*
-;*  History
-;*      04/29/2009	Created
-;*
-;*************************************************************************/
-
-%include "asm_inc.asm"
-
-;******************************************************************************************
-; Macros
-;******************************************************************************************
-
-
-;******************************************************************************************
-; Code
-;******************************************************************************************
-
-SECTION .text
-
-; refer to "The IA-32 Intel(R) Architecture Software Developers Manual, Volume 2A A-M"
-; section CPUID - CPU Identification
-
-;******************************************************************************************
-;   int32_t WelsCPUIdVerify()
-;******************************************************************************************
-WELS_EXTERN WelsCPUIdVerify
-    push    r1
-    PUSHRFLAGS
-    PUSHRFLAGS
-
-    pop      r1
-    mov      eax, r1d
-    xor      eax, 00200000h
-    xor      eax, r1d
-    POPRFLAGS
-    pop      r1
-    ret
-
-;****************************************************************************************************
-;   void WelsCPUId( int32_t uiIndex, int32_t *pFeatureA, int32_t *pFeatureB, int32_t *pFeatureC, int32_t *pFeatureD )
-;****************************************************************************************************
-%ifdef       WIN64
-
-WELS_EXTERN WelsCPUId
-    push     rbx
-    push     rdx
-
-    mov      eax,     ecx
-    mov      rcx,     [r9]
-    cpuid
-    mov      [r9],    ecx
-    mov      [r8],    ebx
-    mov      rcx,    [rsp + 2*8 + 40]
-    mov      [rcx],   edx
-    pop      rdx
-    mov      [rdx],   eax
-
-    pop      rbx
-    ret
-
-%elifdef     UNIX64
-WELS_EXTERN WelsCPUId
-    push     rbx
-    push     rcx
-    push     rdx
-
-    mov      eax,     edi
-    mov      rcx,     [rcx]
-    cpuid
-    mov      [r8],    edx
-    pop      rdx
-    pop      r8
-    mov      [r8],   ecx
-    mov      [rdx],   ebx
-    mov      [rsi],   eax
-
-    pop      rbx
-    ret
-
-%elifdef     X86_32
-
-WELS_EXTERN WelsCPUId
-    push	ebx
-    push	edi
-
-    mov     eax, [esp+12]	; operating index
-    mov     edi, [esp+24]
-    mov     ecx, [edi]
-    cpuid					; cpuid
-
-    ; processing various information return
-    mov     edi, [esp+16]
-    mov     [edi], eax
-    mov     edi, [esp+20]
-    mov     [edi], ebx
-    mov     edi, [esp+24]
-    mov     [edi], ecx
-    mov     edi, [esp+28]
-    mov     [edi], edx
-
-    pop	    edi
-    pop     ebx
-    ret
-
-%endif
-
-; need call after cpuid=1 and eax, ecx flag got then
-;****************************************************************************************************
-;   int32_t WelsCPUSupportAVX( uint32_t eax, uint32_t ecx )
-;****************************************************************************************************
-WELS_EXTERN WelsCPUSupportAVX
-%ifdef     WIN64
-        mov   eax,    ecx
-        mov   ecx,    edx
-%elifdef   UNIX64
-        mov eax, edi
-        mov ecx, esi
-%else
-        mov eax, [esp+4]
-        mov ecx, [esp+8]
-%endif
-
-        ; refer to detection of AVX addressed in INTEL AVX manual document
-        and ecx, 018000000H
-        cmp ecx, 018000000H             ; check both OSXSAVE and AVX feature flags
-        jne avx_not_supported
-        ; processor supports AVX instructions and XGETBV is enabled by OS
-        mov ecx, 0                              ; specify 0 for XFEATURE_ENABLED_MASK register
-        XGETBV                                  ; result in EDX:EAX
-        and eax, 06H
-        cmp eax, 06H                    ; check OS has enabled both XMM and YMM state support
-        jne avx_not_supported
-        mov eax, 1
-        ret
-avx_not_supported:
-        mov eax, 0
-        ret
-
-
-; need call after cpuid=1 and eax, ecx flag got then
-;****************************************************************************************************
-;   int32_t WelsCPUSupportFMA( uint32_t eax, uint32_t ecx )
-;****************************************************************************************************
-WELS_EXTERN  WelsCPUSupportFMA
-%ifdef     WIN64
-        mov   eax,   ecx
-        mov   ecx,   edx
-%elifdef   UNIX64
-        mov   eax,   edi
-        mov   ecx,   esi
-%else
-	mov eax, [esp+4]
-	mov ecx, [esp+8]
-%endif
-	; refer to detection of FMA addressed in INTEL AVX manual document
-	and ecx, 018001000H
-	cmp ecx, 018001000H		; check OSXSAVE, AVX, FMA feature flags
-	jne fma_not_supported
-	; processor supports AVX,FMA instructions and XGETBV is enabled by OS
-	mov ecx, 0				; specify 0 for XFEATURE_ENABLED_MASK register
-	XGETBV					; result in EDX:EAX
-	and eax, 06H
-	cmp eax, 06H			; check OS has enabled both XMM and YMM state support
-	jne fma_not_supported
-	mov eax, 1
-	ret
-fma_not_supported:
-	mov eax, 0
-	ret
-
-;******************************************************************************************
-;   void WelsEmms()
-;******************************************************************************************
-WELS_EXTERN WelsEmms
-	emms	; empty mmx technology states
-	ret
-
--- a/codec/common/crt_util_safe_x.cpp
+++ /dev/null
@@ -1,256 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2009-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * \file	crt_utils_safe_x.cpp
- *
- * \brief	common tool/function utilization
- *
- * \date	03/10/2009 Created
- *
- *************************************************************************************
- */
-
-#include <string.h>
-#include <stdlib.h>
-#include <math.h>
-#include <time.h>
-#if defined(_WIN32)
-#include <windows.h>
-#include <sys/types.h>
-#include <sys/timeb.h>
-#ifndef _MSC_VER
-#include <sys/time.h>
-#endif //!_MSC_VER
-#else
-#include <sys/time.h>
-#include <sys/timeb.h>
-#endif //_WIN32
-
-#include "macros.h"
-#include "crt_util_safe_x.h"	// Safe CRT routines like utils for cross platforms
-
-#if  defined(_WIN32) && defined(_MSC_VER)
-
-#if  defined(_MSC_VER) && (_MSC_VER>=1500)
-
-int32_t WelsSnprintf (char* pBuffer,  int32_t iSizeOfBuffer, const char* kpFormat, ...) {
-  va_list  pArgPtr;
-  int32_t  iRc;
-
-  va_start (pArgPtr, kpFormat);
-
-  iRc = vsnprintf_s (pBuffer, iSizeOfBuffer, _TRUNCATE, kpFormat, pArgPtr);
-
-  va_end (pArgPtr);
-
-  return iRc;
-}
-
-char* WelsStrncpy (char* pDest, int32_t iSizeInBytes, const char* kpSrc) {
-  strncpy_s (pDest, iSizeInBytes, kpSrc, _TRUNCATE);
-
-  return pDest;
-}
-
-int32_t WelsVsnprintf (char* pBuffer, int32_t iSizeOfBuffer, const char* kpFormat, va_list pArgPtr) {
-  return vsnprintf_s (pBuffer, iSizeOfBuffer, _TRUNCATE, kpFormat, pArgPtr);
-}
-
-WelsFileHandle* WelsFopen (const char* kpFilename,  const char* kpMode) {
-  WelsFileHandle* pFp = NULL;
-  if (fopen_s (&pFp, kpFilename, kpMode) != 0) {
-    return NULL;
-  }
-
-  return pFp;
-}
-
-int32_t WelsFclose (WelsFileHandle* pFp) {
-  return fclose (pFp);
-}
-
-int32_t WelsGetTimeOfDay (SWelsTime* pTp) {
-  return _ftime_s (pTp);
-}
-
-int32_t WelsStrftime (char* pBuffer, int32_t iSize, const char* kpFormat, const SWelsTime* kpTp) {
-  struct tm   sTimeNow;
-  int32_t iRc;
-
-  localtime_s (&sTimeNow, &kpTp->time);
-
-  iRc = strftime (pBuffer, iSize, kpFormat, &sTimeNow);
-  if (iRc == 0)
-      pBuffer[0] = '\0';
-  return iRc;
-}
-
-#else
-
-int32_t WelsSnprintf (char* pBuffer,  int32_t iSizeOfBuffer, const char* kpFormat, ...) {
-  va_list pArgPtr;
-  int32_t iRc;
-
-  va_start (pArgPtr, kpFormat);
-
-  iRc = vsnprintf (pBuffer, iSizeOfBuffer, kpFormat, pArgPtr); //confirmed_safe_unsafe_usage
-  if (iRc < 0)
-    pBuffer[iSizeOfBuffer - 1] = '\0';
-
-  va_end (pArgPtr);
-
-  return iRc;
-}
-
-char* WelsStrncpy (char* pDest, int32_t iSizeInBytes, const char* kpSrc) {
-  strncpy (pDest, kpSrc, iSizeInBytes); //confirmed_safe_unsafe_usage
-  pDest[iSizeInBytes - 1] = '\0';
-
-  return pDest;
-}
-
-int32_t WelsVsnprintf (char* pBuffer, int32_t iSizeOfBuffer, const char* kpFormat, va_list pArgPtr) {
-  int32_t iRc = vsnprintf (pBuffer, iSizeOfBuffer, kpFormat, pArgPtr); //confirmed_safe_unsafe_usage
-  if (iRc < 0)
-    pBuffer[iSizeOfBuffer - 1] = '\0';
-  return iRc;
-}
-
-
-WelsFileHandle* WelsFopen (const char* kpFilename,  const char* kpMode) {
-  return fopen (kpFilename, kpMode);
-}
-
-int32_t WelsFclose (WelsFileHandle* pFp) {
-  return fclose (pFp);
-}
-
-int32_t WelsGetTimeOfDay (SWelsTime* pTp) {
-  _ftime (pTp);
-  return 0;
-}
-
-int32_t WelsStrftime (char* pBuffer, int32_t iSize, const char* kpFormat, const SWelsTime* kpTp) {
-  struct tm*   pTnow;
-  int32_t iRc;
-
-  pTnow = localtime (&kpTp->time);
-
-  iRc = strftime (pBuffer, iSize, kpFormat, pTnow);
-  if (iRc == 0)
-      pBuffer[0] = '\0';
-  return iRc;
-}
-
-
-#endif // _MSC_VER
-
-#else  //GCC
-
-int32_t WelsSnprintf (char* pBuffer,  int32_t iSizeOfBuffer, const char* kpFormat, ...) {
-  va_list pArgPtr;
-  int32_t iRc;
-
-  va_start (pArgPtr, kpFormat);
-
-  iRc = vsnprintf (pBuffer, iSizeOfBuffer, kpFormat, pArgPtr);
-
-  va_end (pArgPtr);
-
-  return iRc;
-}
-
-char* WelsStrncpy (char* pDest, int32_t iSizeInBytes, const char* kpSrc) {
-  strncpy (pDest, kpSrc, iSizeInBytes); //confirmed_safe_unsafe_usage
-  pDest[iSizeInBytes - 1] = '\0';
-  return pDest;
-}
-
-int32_t WelsVsnprintf (char* pBuffer, int32_t iSizeOfBuffer, const char* kpFormat, va_list pArgPtr) {
-  return vsnprintf (pBuffer, iSizeOfBuffer, kpFormat, pArgPtr); //confirmed_safe_unsafe_usage
-}
-
-WelsFileHandle* WelsFopen (const char* kpFilename,  const char* kpMode) {
-  return fopen (kpFilename, kpMode);
-}
-
-int32_t WelsFclose (WelsFileHandle*   pFp) {
-  return fclose (pFp);
-}
-
-int32_t WelsGetTimeOfDay (SWelsTime* pTp) {
-  struct timeval  sTv;
-
-  if (gettimeofday (&sTv, NULL)) {
-    return -1;
-  }
-
-  pTp->time = sTv.tv_sec;
-  pTp->millitm = (uint16_t)sTv.tv_usec / 1000;
-
-  return 0;
-}
-
-int32_t WelsStrftime (char* pBuffer, int32_t iSize, const char* kpFormat, const SWelsTime* kpTp) {
-  struct tm*   pTnow;
-  int32_t iRc;
-
-  pTnow = localtime (&kpTp->time);
-
-  iRc = strftime (pBuffer, iSize, kpFormat, pTnow);
-  if (iRc == 0)
-      pBuffer[0] = '\0';
-  return iRc;
-}
-
-#endif
-
-
-char* WelsStrcat (char* pDest, int32_t iSizeInBytes, const char* kpSrc) {
-    int32_t iCurLen = strlen(pDest);
-    return WelsStrncpy(pDest + iCurLen, iSizeInBytes - iCurLen, kpSrc);
-}
-
-int32_t WelsFwrite (const void* kpBuffer, int32_t iSize, int32_t iCount, WelsFileHandle* pFp) {
-  return fwrite (kpBuffer, iSize, iCount, pFp);
-}
-
-uint16_t WelsGetMillisecond (const SWelsTime* kpTp) {
-  return kpTp->millitm;
-}
-
-int32_t WelsFseek (WelsFileHandle* fp, int32_t offset, int32_t origin) {
-  return fseek(fp, offset, origin);
-}
-
-int32_t WelsFflush (WelsFileHandle* pFp) {
-  return fflush (pFp);
-}
--- a/codec/common/crt_util_safe_x.h
+++ /dev/null
@@ -1,99 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2010-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * \file	crt_util_safe_x.h
- *
- * \brief	Safe CRT like util for cross platfroms support
- *
- * \date	06/04/2010 Created
- *
- *************************************************************************************
- */
-#ifndef WELS_CRT_UTIL_SAFE_CROSS_PLATFORMS_H__
-#define WELS_CRT_UTIL_SAFE_CROSS_PLATFORMS_H__
-
-#include <string.h>
-#include <stdlib.h>
-#include <stdarg.h>
-#include <stdio.h>
-#include <math.h>
-#include <time.h>
-
-#if defined(_WIN32)
-#include <windows.h>
-#include <sys/types.h>
-#include <sys/timeb.h>
-#else
-#include <sys/timeb.h>
-#include <sys/time.h>
-#include "typedefs.h"
-#endif//_WIN32
-
-#include "typedefs.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define     WELS_FILE_SEEK_SET           SEEK_SET
-#define     WELS_FILE_SEEK_CUR           SEEK_CUR
-#define     WESL_FILE_SEEK_END           SEEK_END
-
-typedef      FILE  WelsFileHandle;
-
-#ifdef _WIN32
-typedef      struct _timeb     SWelsTime;
-#else
-typedef      struct timeb      SWelsTime;
-#endif
-
-int32_t   WelsSnprintf (char* buffer,  int32_t sizeOfBuffer,  const char* format, ...);
-char*   WelsStrncpy (char* dest, int32_t sizeInBytes, const char* src);
-char*   WelsStrcat (char* dest, int32_t sizeInBytes, const char* src);
-int32_t   WelsVsnprintf (char* buffer, int32_t sizeOfBuffer, const char* format, va_list argptr);
-
-WelsFileHandle*        WelsFopen (const char* filename,  const char* mode);
-int32_t                WelsFclose (WelsFileHandle*   fp);
-int32_t                WelsFread (void* buffer, int32_t size, int32_t count, WelsFileHandle* fp);
-int32_t                WelsFwrite (const void* buffer, int32_t size, int32_t count, WelsFileHandle* fp);
-int32_t                WelsFseek (WelsFileHandle* fp, int32_t offset, int32_t origin);
-int32_t                WelsFflush (WelsFileHandle* fp);
-
-int32_t                WelsGetTimeOfDay (SWelsTime* tp);
-int32_t                WelsStrftime (char* buffer, int32_t size, const char* format, const SWelsTime* tp);
-uint16_t               WelsGetMillisecond (const SWelsTime* tp);
-
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif//WELS_CRT_UTIL_SAFE_CROSS_PLATFORMS_H__
--- a/codec/common/deblock.asm
+++ /dev/null
@@ -1,5278 +1,0 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  deblock.asm
-;*
-;*  Abstract
-;*      edge loop
-;*
-;*  History
-;*      08/07/2009 Created
-;*
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-
-;*******************************************************************************
-; Macros and other preprocessor constants
-;*******************************************************************************
-
-SECTION .rodata align=16
-
-ALIGN   16
-FOUR_16B_SSE2:   dw   4, 4, 4, 4, 4, 4, 4, 4
-
-
-SECTION .text
-
-%ifdef  WIN64
-
-
-WELS_EXTERN   DeblockLumaLt4V_ssse3
-  push        rbp
-  mov         r11,[rsp + 16 + 20h]  ; pTC
-  PUSH_XMM 16
-  sub         rsp,1B0h
-  lea         rbp,[rsp+20h]
-  movd        xmm4,r8d
-  movd        xmm2,r9d
-  mov         qword [rbp+180h],r12
-  mov         r10,rcx
-  movsxd      r12,edx
-  add         edx,edx
-  movsxd      rdx,edx
-  sub         r10,r12
-  movsx       r8d,byte [r11]
-  pxor        xmm3,xmm3
-  punpcklwd   xmm2,xmm2
-  movaps      [rbp+50h],xmm14
-  lea         rax,[r12+r12*2]
-  movdqa      xmm14,[rdx+rcx]
-  neg         rax
-  pshufd      xmm0,xmm2,0
-  movd        xmm2,r8d
-  movsx       edx,byte [r11+1]
-  movsx       r8d,byte [r11+2]
-  movsx       r11d,byte [r11+3]
-  movaps      [rbp+70h],xmm12
-  movd        xmm1,edx
-  movaps      [rbp+80h],xmm11
-  movd        xmm12,r8d
-  movd        xmm11,r11d
-  movdqa      xmm5, [rax+rcx]
-  lea         rax,[r12+r12]
-  punpcklwd   xmm12,xmm12
-  neg         rax
-  punpcklwd   xmm11,xmm11
-  movaps      [rbp],xmm8
-  movdqa      xmm8, [r10]
-  punpcklwd   xmm2,xmm2
-  punpcklwd   xmm1,xmm1
-  punpcklqdq  xmm12,xmm12
-  punpcklqdq  xmm11,xmm11
-  punpcklqdq  xmm2,xmm2
-  punpcklqdq  xmm1,xmm1
-  shufps      xmm12,xmm11,88h
-  movdqa      xmm11,xmm8
-  movaps      [rbp+30h],xmm9
-  movdqa      xmm9,[rcx]
-  shufps      xmm2,xmm1,88h
-  movdqa      xmm1,xmm5
-  punpcklbw   xmm11,xmm3
-  movaps      [rbp+20h],xmm6
-  movaps      [rbp+60h],xmm13
-  movdqa      xmm13,xmm11
-  movaps      [rbp+90h],xmm10
-  movdqa      xmm10,xmm9
-  movdqa      xmm6,[rax+rcx]
-  punpcklbw   xmm1,xmm3
-  movaps      [rbp+0A0h],xmm12
-  psubw       xmm13,xmm1
-  movaps      [rbp+40h],xmm15
-  movdqa      xmm15,xmm14
-  movaps      [rbp+10h],xmm7
-  movdqa      xmm7,xmm6
-  punpcklbw   xmm10,xmm3
-  movdqa      xmm12,[r12+rcx]
-  punpcklbw   xmm7,xmm3
-  punpcklbw   xmm12,xmm3
-  punpcklbw   xmm15,xmm3
-  pabsw       xmm3,xmm13
-  movdqa      xmm13,xmm10
-  psubw       xmm13,xmm15
-  movdqa      [rbp+0F0h],xmm15
-  pabsw       xmm15,xmm13
-  movdqa      xmm13,xmm11
-  movdqa      [rbp+0B0h],xmm1
-  movdqa      xmm1,xmm0
-  pavgw       xmm13,xmm10
-  pcmpgtw     xmm1,xmm3
-  movdqa      [rbp+120h],xmm13
-  movaps      xmm13,xmm2
-  punpcklwd   xmm4,xmm4
-  movdqa      xmm3,xmm0
-  movdqa      [rbp+100h],xmm1
-  psubw       xmm13,xmm1
-  movdqa      xmm1,xmm10
-  pcmpgtw     xmm3,xmm15
-  pshufd      xmm4,xmm4,0
-  psubw       xmm1,xmm11
-  movdqa      [rbp+0D0h],xmm10
-  psubw       xmm13,xmm3
-  movdqa      [rbp+110h],xmm3
-  pabsw       xmm15,xmm1
-  movdqa      xmm3,xmm4
-  psubw       xmm10,xmm12
-  pcmpgtw     xmm3,xmm15
-  pabsw       xmm15,xmm10
-  movdqa      xmm10,xmm0
-  psllw       xmm1,2
-  movdqa      [rbp+0C0h],xmm11
-  psubw       xmm11,xmm7
-  pcmpgtw     xmm10,xmm15
-  pabsw       xmm11,xmm11
-  movdqa      xmm15,xmm0
-  pand        xmm3,xmm10
-  pcmpgtw     xmm15,xmm11
-  movaps      xmm11,xmm2
-  pxor        xmm10,xmm10
-  pand        xmm3,xmm15
-  pcmpgtw     xmm11,xmm10
-  pcmpeqw     xmm10,xmm2
-  por         xmm11,xmm10
-  pand        xmm3,xmm11
-  movdqa      xmm11,xmm7
-  psubw       xmm11,xmm12
-  pxor        xmm15,xmm15
-  paddw       xmm11,xmm1
-  psubw       xmm15,xmm13
-  movdqa      [rbp+0E0h],xmm12
-  paddw       xmm11,[FOUR_16B_SSE2]
-  pxor        xmm12,xmm12
-  psraw       xmm11,3
-  punpckhbw   xmm8,xmm12
-  pmaxsw      xmm15,xmm11
-  punpckhbw   xmm5,xmm12
-  movdqa      xmm11,xmm8
-  pminsw      xmm13,xmm15
-  psubw       xmm11,xmm5
-  punpckhbw   xmm9,xmm12
-  pand        xmm13,xmm3
-  movdqa      [rbp+130h],xmm13
-  pabsw       xmm13,xmm11
-  punpckhbw   xmm14,xmm12
-  movdqa      xmm11,xmm9
-  psubw       xmm11,xmm14
-  movdqa      xmm15,xmm0
-  movdqa      [rbp+140h],xmm14
-  pabsw       xmm14,xmm11
-  movdqa      xmm11,xmm8
-  pcmpgtw     xmm15,xmm14
-  movdqa      xmm1,[r12+rcx]
-  pavgw       xmm11,xmm9
-  movdqa      [rbp+170h],xmm11
-  movdqa      xmm10,xmm9
-  punpckhbw   xmm6,xmm12
-  psubw       xmm10,xmm8
-  punpckhbw   xmm1,xmm12
-  movdqa      xmm12,xmm0
-  movaps      xmm11,[rbp+0A0h]
-  pcmpgtw     xmm12,xmm13
-  movaps      xmm13,xmm11
-  psubw       xmm13,xmm12
-  movdqa      [rbp+160h],xmm15
-  psubw       xmm13,xmm15
-  movdqa      xmm15,xmm9
-  psubw       xmm15,xmm1
-  movdqa      [rbp+150h],xmm12
-  pabsw       xmm12,xmm10
-  pabsw       xmm14,xmm15
-  movdqa      xmm15,xmm8
-  pcmpgtw     xmm4,xmm12
-  movdqa      xmm12,xmm0
-  psubw       xmm15,xmm6
-  pcmpgtw     xmm12,xmm14
-  pabsw       xmm14,xmm15
-  psllw       xmm10,2
-  pcmpgtw     xmm0,xmm14
-  movdqa      xmm14,xmm6
-  psubw       xmm14,xmm1
-  pand        xmm4,xmm12
-  paddw       xmm14,xmm10
-  pand        xmm4,xmm0
-  paddw       xmm14,[FOUR_16B_SSE2]
-  pxor        xmm15,xmm15
-  movaps      xmm12,xmm11
-  psubw       xmm15,xmm13
-  pxor        xmm0,xmm0
-  psraw       xmm14,3
-  pcmpgtw     xmm12,xmm0
-  pcmpeqw     xmm0,xmm11
-  pmaxsw      xmm15,xmm14
-  por         xmm12,xmm0
-  movdqa      xmm0,[rbp+120h]
-  pminsw      xmm13,xmm15
-  movdqa      xmm15,[rbp+0B0h]
-  movdqa      xmm10,xmm7
-  pand        xmm4,xmm12
-  paddw       xmm15,xmm0
-  pxor        xmm12,xmm12
-  paddw       xmm10,xmm7
-  movdqa      xmm14,xmm12
-  psubw       xmm15,xmm10
-  psubw       xmm14,xmm2
-  psraw       xmm15,1
-  pmaxsw      xmm15,xmm14
-  movdqa      xmm10,xmm6
-  pminsw      xmm15,xmm2
-  paddw       xmm10,xmm6
-  pand        xmm15,xmm3
-  psubw       xmm12,xmm11
-  pand        xmm15,[rbp+100h]
-  pand        xmm13,xmm4
-  paddw       xmm7,xmm15
-  paddw       xmm8,xmm13
-  movdqa      xmm15,[rbp+170h]
-  psubw       xmm9,xmm13
-  paddw       xmm5,xmm15
-  psubw       xmm5,xmm10
-  psraw       xmm5,1
-  pmaxsw      xmm5,xmm12
-  pminsw      xmm5,xmm11
-  pand        xmm5,xmm4
-  pand        xmm5,[rbp+150h]
-  paddw       xmm6,xmm5
-  movdqa      xmm5,[rbp+0C0h]
-  packuswb    xmm7,xmm6
-  movdqa      xmm6,[rbp+130h]
-  paddw       xmm5,xmm6
-  packuswb    xmm5,xmm8
-  movdqa      xmm8,[rbp+0D0h]
-  psubw       xmm8,xmm6
-  movdqa      xmm6,[rbp+0F0h]
-  paddw       xmm6,xmm0
-  movdqa      xmm0,[rbp+0E0h]
-  packuswb    xmm8,xmm9
-  movdqa      xmm9,xmm0
-  paddw       xmm9,xmm0
-  psubw       xmm6,xmm9
-  psraw       xmm6,1
-  pmaxsw      xmm14,xmm6
-  pminsw      xmm2,xmm14
-  pand        xmm2,xmm3
-  pand        xmm2,[rbp+110h]
-  paddw       xmm0,xmm2
-  movdqa      xmm2,[rbp+140h]
-  paddw       xmm2,xmm15
-  movdqa      xmm15,xmm1
-  paddw       xmm15,xmm1
-  psubw       xmm2,xmm15
-  psraw       xmm2,1
-  pmaxsw      xmm12,xmm2
-  pminsw      xmm11,xmm12
-  pand        xmm11,xmm4
-  pand        xmm11,[rbp+160h]
-  paddw       xmm1,xmm11
-  movdqa      [rax+rcx],xmm7
-  movdqa      [r10],xmm5
-  packuswb    xmm0,xmm1
-  movdqa      [rcx],xmm8
-  movdqa      [r12+rcx],xmm0
-  mov         r12,qword [rbp+180h]
-  lea         rsp,[rbp+190h]
-  POP_XMM
-  pop         rbp
-  ret
-
-
-WELS_EXTERN   DeblockLumaEq4V_ssse3
-  mov         rax,rsp
-  push        rbx
-  push        rbp
-  push        rsi
-  push        rdi
-  sub         rsp,1D8h
-  movaps      [rax-38h],xmm6
-  movaps      [rax-48h],xmm7
-  movaps      [rax-58h],xmm8
-  pxor        xmm1,xmm1
-  movsxd      r10,edx
-  mov         rbp,rcx
-  mov         r11d,r8d
-  mov         rdx,rcx
-  mov         rdi,rbp
-  mov         rbx,rbp
-  movdqa      xmm5,[rbp]
-  movaps      [rax-68h],xmm9
-  movaps      [rax-78h],xmm10
-  punpcklbw   xmm5,xmm1
-  movaps      [rax-88h],xmm11
-  movaps      [rax-98h],xmm12
-  movaps      [rax-0A8h],xmm13
-  movaps      [rax-0B8h],xmm14
-  movdqa      xmm14,[r10+rbp]
-  movaps      [rax-0C8h],xmm15
-  lea         eax,[r10*4]
-  movsxd      r8,eax
-  lea         eax,[r10+r10*2]
-  movsxd      rcx,eax
-  lea         eax,[r10+r10]
-  sub         rdx,r8
-  punpcklbw   xmm14,xmm1
-  movdqa      [rsp+90h],xmm5
-  movdqa      [rsp+30h],xmm14
-  movsxd      rsi,eax
-  movsx       eax,r11w
-  sub         rdi,rcx
-  sub         rbx,rsi
-  mov         r8,rbp
-  sub         r8,r10
-  movd        xmm0,eax
-  movsx       eax,r9w
-  movdqa      xmm12,[rdi]
-  movdqa      xmm6, [rsi+rbp]
-  movdqa      xmm13,[rbx]
-  punpcklwd   xmm0,xmm0
-  pshufd      xmm11,xmm0,0
-  punpcklbw   xmm13,xmm1
-  punpcklbw   xmm6,xmm1
-  movdqa      xmm8,[r8]
-  movd        xmm0,eax
-  movdqa      xmm10,xmm11
-  mov         eax,2
-  punpcklbw   xmm8,xmm1
-  punpcklbw   xmm12,xmm1
-  cwde
-  punpcklwd   xmm0,xmm0
-  psraw       xmm10,2
-  movdqa      xmm1,xmm8
-  movdqa      [rsp+0F0h],xmm13
-  movdqa      [rsp+0B0h],xmm8
-  pshufd      xmm7,xmm0,0
-  psubw       xmm1,xmm13
-  movdqa      xmm0,xmm5
-  movdqa      xmm4,xmm7
-  movdqa      xmm2,xmm7
-  psubw       xmm0,xmm8
-  pabsw       xmm3,xmm0
-  pabsw       xmm0,xmm1
-  movdqa      xmm1,xmm5
-  movdqa      [rsp+40h],xmm7
-  movdqa      [rsp+60h],xmm6
-  pcmpgtw     xmm4,xmm0
-  psubw       xmm1,xmm14
-  pabsw       xmm0,xmm1
-  pcmpgtw     xmm2,xmm0
-  pand        xmm4,xmm2
-  movdqa      xmm0,xmm11
-  pcmpgtw     xmm0,xmm3
-  pand        xmm4,xmm0
-  movd        xmm0,eax
-  movdqa      [rsp+20h],xmm4
-  punpcklwd   xmm0,xmm0
-  pshufd      xmm2,xmm0,0
-  paddw       xmm10,xmm2
-  movdqa      [rsp+0A0h],xmm2
-  movdqa      xmm15,xmm7
-  pxor        xmm4,xmm4
-  movdqa      xmm0,xmm8
-  psubw       xmm0,xmm12
-  mov         eax,4
-  pabsw       xmm0,xmm0
-  movdqa      xmm1,xmm10
-  cwde
-  pcmpgtw     xmm15,xmm0
-  pcmpgtw     xmm1,xmm3
-  movdqa      xmm3,xmm7
-  movdqa      xmm7,[rdx]
-  movdqa      xmm0,xmm5
-  psubw       xmm0,xmm6
-  pand        xmm15,xmm1
-  punpcklbw   xmm7,xmm4
-  movdqa      xmm9,xmm15
-  pabsw       xmm0,xmm0
-  psllw       xmm7,1
-  pandn       xmm9,xmm12
-  pcmpgtw     xmm3,xmm0
-  paddw       xmm7,xmm12
-  movd        xmm0,eax
-  pand        xmm3,xmm1
-  paddw       xmm7,xmm12
-  punpcklwd   xmm0,xmm0
-  paddw       xmm7,xmm12
-  pshufd      xmm1,xmm0,0
-  paddw       xmm7,xmm13
-  movdqa      xmm0,xmm3
-  pandn       xmm0,xmm6
-  paddw       xmm7,xmm8
-  movdqa      [rsp+70h],xmm1
-  paddw       xmm7,xmm5
-  movdqa      [rsp+120h],xmm0
-  movdqa      xmm0,[rcx+rbp]
-  punpcklbw   xmm0,xmm4
-  paddw       xmm7,xmm1
-  movdqa      xmm4,xmm15
-  psllw       xmm0,1
-  psraw       xmm7,3
-  paddw       xmm0,xmm6
-  pand        xmm7,xmm15
-  paddw       xmm0,xmm6
-  paddw       xmm0,xmm6
-  paddw       xmm0,xmm14
-  movdqa      xmm6,xmm15
-  paddw       xmm0,xmm5
-  pandn       xmm6,xmm13
-  paddw       xmm0,xmm8
-  paddw       xmm0,xmm1
-  psraw       xmm0,3
-  movdqa      xmm1,xmm12
-  paddw       xmm1,xmm13
-  pand        xmm0,xmm3
-  movdqa      [rsp+100h],xmm0
-  movdqa      xmm0,xmm8
-  paddw       xmm0,xmm5
-  paddw       xmm1,xmm0
-  movdqa      xmm0,xmm3
-  paddw       xmm1,xmm2
-  psraw       xmm1,2
-  pandn       xmm0,xmm14
-  pand        xmm4,xmm1
-  movdqa      [rsp+0E0h],xmm0
-  movdqa      xmm0,xmm5
-  paddw       xmm0,xmm8
-  movdqa      xmm1,[rsp+60h]
-  paddw       xmm1,xmm14
-  movdqa      xmm14,xmm3
-  paddw       xmm1,xmm0
-  movdqa      xmm0,xmm8
-  paddw       xmm0,[rsp+30h]
-  paddw       xmm1,xmm2
-  psraw       xmm1,2
-  pand        xmm14,xmm1
-  movdqa      xmm1,xmm13
-  paddw       xmm1,xmm13
-  paddw       xmm1,xmm0
-  paddw       xmm1,xmm2
-  psraw       xmm1,2
-  movdqa      xmm0,[rsp+30h]
-  movdqa      xmm2,xmm13
-  movdqa      xmm5,xmm15
-  paddw       xmm0,[rsp+70h]
-  pandn       xmm5,xmm1
-  paddw       xmm2,xmm8
-  movdqa      xmm8,[rsp+90h]
-  movdqa      xmm1,xmm12
-  paddw       xmm2,xmm8
-  psllw       xmm2,1
-  paddw       xmm2,xmm0
-  paddw       xmm1,xmm2
-  movdqa      xmm0,xmm8
-  movdqa      xmm8,xmm3
-  movdqa      xmm2,[rsp+30h]
-  paddw       xmm0,xmm13
-  psraw       xmm1,3
-  pand        xmm15,xmm1
-  movdqa      xmm1,xmm2
-  paddw       xmm1,xmm2
-  paddw       xmm2,[rsp+90h]
-  paddw       xmm2,[rsp+0B0h]
-  paddw       xmm1,xmm0
-  movdqa      xmm0,xmm13
-  movdqa      xmm13,[r8]
-  paddw       xmm0, [rsp+70h]
-  paddw       xmm1, [rsp+0A0h]
-  psllw       xmm2,1
-  paddw       xmm2,xmm0
-  psraw       xmm1,2
-  movdqa      xmm0, [rdi]
-  pandn       xmm8,xmm1
-  movdqa      xmm1, [rsp+60h]
-  paddw       xmm1,xmm2
-  movdqa      xmm2, [rbx]
-  psraw       xmm1,3
-  pand        xmm3,xmm1
-  movdqa      xmm1, [rbp]
-  movdqa      [rsp+0D0h],xmm3
-  pxor        xmm3,xmm3
-  punpckhbw   xmm0,xmm3
-  punpckhbw   xmm1,xmm3
-  punpckhbw   xmm13,xmm3
-  movdqa      [rsp+0C0h],xmm0
-  movdqa      xmm0,[r10+rbp]
-  movdqa      [rsp],xmm1
-  punpckhbw   xmm0,xmm3
-  punpckhbw   xmm2,xmm3
-  movdqa      [rsp+80h],xmm0
-  movdqa      xmm0,[rsi+rbp]
-  movdqa      [rsp+10h],xmm13
-  punpckhbw   xmm0,xmm3
-  movdqa      [rsp+50h],xmm0
-  movdqa      xmm0,xmm1
-  movdqa      xmm1,xmm13
-  psubw       xmm0,xmm13
-  psubw       xmm1,xmm2
-  pabsw       xmm3,xmm0
-  pabsw       xmm0,xmm1
-  movdqa      xmm1,[rsp]
-  movdqa      xmm13,[rsp+40h]
-  movdqa      [rsp+110h],xmm2
-  psubw       xmm1, [rsp+80h]
-  pcmpgtw     xmm13,xmm0
-  pcmpgtw     xmm11,xmm3
-  pabsw       xmm0,xmm1
-  pcmpgtw     xmm10,xmm3
-  movdqa      xmm1, [rsp+40h]
-  movdqa      xmm2,xmm1
-  movdqa      xmm3,xmm1
-  pcmpgtw     xmm2,xmm0
-  movdqa      xmm0, [rsp+10h]
-  pand        xmm13,xmm2
-  pand        xmm13,xmm11
-  movdqa      xmm11,[rsp+0C0h]
-  psubw       xmm0,xmm11
-  pabsw       xmm0,xmm0
-  pcmpgtw     xmm3,xmm0
-  pand        xmm3,xmm10
-  movdqa      xmm0,[rsp]
-  psubw       xmm0,[rsp+50h]
-  movdqa      xmm2,[rdx]
-  pabsw       xmm0,xmm0
-  por         xmm7,xmm9
-  movdqa      xmm9,[rsp+20h]
-  pcmpgtw     xmm1,xmm0
-  pand        xmm9,xmm7
-  movdqa      xmm7,[rsp+20h]
-  movdqa      xmm0,xmm7
-  pandn       xmm0,xmm12
-  movdqa      xmm12,[rsp+110h]
-  pand        xmm1,xmm10
-  movdqa      xmm10,[rsp+70h]
-  movdqa      [rsp+40h],xmm1
-  movdqa      xmm1,xmm13
-  por         xmm9,xmm0
-  pxor        xmm0,xmm0
-  por         xmm4,xmm6
-  movdqa      xmm6,xmm7
-  punpckhbw   xmm2,xmm0
-  por         xmm15,xmm5
-  movdqa      xmm5,[rsp+20h]
-  movdqa      xmm0,xmm3
-  psllw       xmm2,1
-  pandn       xmm0,xmm11
-  pand        xmm6,xmm4
-  movdqa      xmm4,[rsp]
-  paddw       xmm2,xmm11
-  pand        xmm5,xmm15
-  movdqa      xmm15,[rsp+20h]
-  paddw       xmm2,xmm11
-  paddw       xmm2,xmm11
-  paddw       xmm2,xmm12
-  paddw       xmm2,[rsp+10h]
-  paddw       xmm2,[rsp]
-  paddw       xmm2,xmm10
-  psraw       xmm2,3
-  pand        xmm2,xmm3
-  por         xmm2,xmm0
-  pand        xmm1,xmm2
-  movdqa      xmm0,xmm13
-  movdqa      xmm2,xmm11
-  pandn       xmm0,xmm11
-  paddw       xmm2,xmm12
-  por         xmm1,xmm0
-  packuswb    xmm9,xmm1
-  movdqa      xmm0,xmm7
-  movdqa      xmm7,[rsp+0A0h]
-  pandn       xmm0,[rsp+0F0h]
-  movdqa      xmm1,xmm3
-  por         xmm6,xmm0
-  movdqa      xmm0,[rsp+10h]
-  paddw       xmm0,xmm4
-  paddw       xmm2,xmm0
-  paddw       xmm2,xmm7
-  movdqa      xmm0,xmm3
-  pandn       xmm0,xmm12
-  psraw       xmm2,2
-  pand        xmm1,xmm2
-  por         xmm1,xmm0
-  movdqa      xmm2,xmm13
-  movdqa      xmm0,xmm13
-  pand        xmm2,xmm1
-  pandn       xmm0,xmm12
-  movdqa      xmm1,xmm12
-  paddw       xmm1,[rsp+10h]
-  por         xmm2,xmm0
-  movdqa      xmm0,xmm15
-  pandn       xmm0,[rsp+0B0h]
-  paddw       xmm1,xmm4
-  packuswb    xmm6,xmm2
-  movdqa      xmm2,xmm3
-  psllw       xmm1,1
-  por         xmm5,xmm0
-  movdqa      xmm0,[rsp+80h]
-  paddw       xmm0,xmm10
-  paddw       xmm1,xmm0
-  paddw       xmm11,xmm1
-  psraw       xmm11,3
-  movdqa      xmm1,xmm12
-  pand        xmm2,xmm11
-  paddw       xmm1,xmm12
-  movdqa      xmm11,[rsp+80h]
-  movdqa      xmm0, [rsp+10h]
-  por         xmm14,[rsp+0E0h]
-  paddw       xmm0,xmm11
-  movdqa      xmm4,xmm15
-  paddw       xmm1,xmm0
-  movdqa      xmm0,xmm13
-  paddw       xmm1,xmm7
-  psraw       xmm1,2
-  pandn       xmm3,xmm1
-  por         xmm2,xmm3
-  movdqa      xmm1,xmm13
-  movdqa      xmm3,[rsp+10h]
-  pandn       xmm0,xmm3
-  pand        xmm1,xmm2
-  movdqa      xmm2,xmm11
-  paddw       xmm2,[rsp]
-  por         xmm1,xmm0
-  movdqa      xmm0,[rsp+0D0h]
-  por         xmm0,xmm8
-  paddw       xmm2,xmm3
-  packuswb    xmm5,xmm1
-  movdqa      xmm8,[rsp+40h]
-  movdqa      xmm1,[rsp+50h]
-  movdqa      xmm3,xmm8
-  pand        xmm4,xmm0
-  psllw       xmm2,1
-  movdqa      xmm0,xmm15
-  pandn       xmm0,[rsp+90h]
-  por         xmm4,xmm0
-  movdqa      xmm0,xmm12
-  paddw       xmm0,xmm10
-  paddw       xmm2,xmm0
-  paddw       xmm1,xmm2
-  movdqa      xmm0,[rsp]
-  movdqa      xmm2,xmm11
-  paddw       xmm0,xmm12
-  movdqa      xmm12,[rsp]
-  paddw       xmm2,xmm11
-  paddw       xmm2,xmm0
-  psraw       xmm1,3
-  movdqa      xmm0,xmm8
-  pand        xmm3,xmm1
-  paddw       xmm2,xmm7
-  movdqa      xmm1,xmm13
-  psraw       xmm2,2
-  pandn       xmm0,xmm2
-  por         xmm3,xmm0
-  movdqa      xmm2,[rsp+50h]
-  movdqa      xmm0,xmm13
-  pandn       xmm0,xmm12
-  pand        xmm1,xmm3
-  paddw       xmm2,xmm11
-  movdqa      xmm3,xmm15
-  por         xmm1,xmm0
-  pand        xmm3,xmm14
-  movdqa      xmm14,[rsp+10h]
-  movdqa      xmm0,xmm15
-  pandn       xmm0,[rsp+30h]
-  packuswb    xmm4,xmm1
-  movdqa      xmm1,xmm8
-  por         xmm3,xmm0
-  movdqa      xmm0,xmm12
-  paddw       xmm0,xmm14
-  paddw       xmm2,xmm0
-  paddw       xmm2,xmm7
-  movdqa      xmm0,xmm8
-  pandn       xmm0,xmm11
-  psraw       xmm2,2
-  pand        xmm1,xmm2
-  por         xmm1,xmm0
-  movdqa      xmm2,xmm13
-  movdqa      xmm0,xmm13
-  pandn       xmm0,xmm11
-  pand        xmm2,xmm1
-  movdqa      xmm1,xmm15
-  por         xmm2,xmm0
-  packuswb    xmm3,xmm2
-  movdqa      xmm0,[rsp+100h]
-  por         xmm0,[rsp+120h]
-  pand        xmm1,xmm0
-  movdqa      xmm2,[rcx+rbp]
-  movdqa      xmm7,[rsp+50h]
-  pandn       xmm15,[rsp+60h]
-  lea         r11,[rsp+1D8h]
-  pxor        xmm0,xmm0
-  por         xmm1,xmm15
-  movaps      xmm15,[r11-0A8h]
-  movdqa      [rdi],xmm9
-  movaps      xmm9,[r11-48h]
-  punpckhbw   xmm2,xmm0
-  psllw       xmm2,1
-  paddw       xmm2,xmm7
-  paddw       xmm2,xmm7
-  movdqa      [rbx],xmm6
-  movaps      xmm6,[r11-18h]
-  paddw       xmm2,xmm7
-  paddw       xmm2,xmm11
-  movaps      xmm11,[r11-68h]
-  paddw       xmm2,xmm12
-  movaps      xmm12,[r11-78h]
-  paddw       xmm2,xmm14
-  paddw       xmm2,xmm10
-  psraw       xmm2,3
-  movaps      xmm10,[r11-58h]
-  movaps      xmm14,[r11-98h]
-  movdqa      xmm0,xmm13
-  pand        xmm2,xmm8
-  pandn       xmm8,xmm7
-  pandn       xmm13,xmm7
-  por         xmm2,xmm8
-  movaps      xmm7,[r11-28h]
-  movaps      xmm8,[r11-38h]
-  movdqa      [r8],xmm5
-  pand        xmm0,xmm2
-  por         xmm0,xmm13
-  packuswb    xmm1,xmm0
-  movaps      xmm13,[r11-88h]
-  movdqa      [rbp],xmm4
-  movdqa      [r10+rbp],xmm3
-  movdqa      [rsi+rbp],xmm1
-  mov         rsp,r11
-  pop         rdi
-  pop         rsi
-  pop         rbp
-  pop         rbx
-  ret
-
-
-WELS_EXTERN  DeblockChromaLt4V_ssse3
-  mov         rax,rsp
-  push        rbx
-  push        rdi
-  PUSH_XMM 16
-  sub         rsp,0C8h
-  mov         r10,qword [rax + 30h]  ; pTC
-  pxor        xmm1,xmm1
-  mov         rbx,rcx
-  movsxd      r11,r8d
-  movsx       ecx,byte [r10]
-  movsx       r8d,byte [r10+2]
-  mov         rdi,rdx
-  movq        xmm2,[rbx]
-  movq        xmm9,[r11+rbx]
-  movsx       edx,byte [r10+1]
-  mov         word [rsp+2],cx
-  mov         word [rsp],cx
-  movsx       eax,byte [r10+3]
-  mov         word [rsp+6],dx
-  mov         word [rsp+4],dx
-  movdqa      xmm11,xmm1
-  mov         word [rsp+0Eh],ax
-  mov         word [rsp+0Ch],ax
-  lea         eax,[r11+r11]
-  movsxd      rcx,eax
-  mov         rax,rbx
-  mov         rdx,rdi
-  sub         rax,rcx
-  mov         word [rsp+0Ah],r8w
-  mov         word [rsp+8],r8w
-  movdqa      xmm6,[rsp]
-  movdqa      xmm7,xmm6
-  movq        xmm13, [rax]
-  mov         rax,rdi
-  sub         rax,rcx
-  mov         rcx,rbx
-  pcmpgtw     xmm7,xmm1
-  psubw       xmm11,xmm6
-  sub         rcx,r11
-  sub         rdx,r11
-  movq        xmm0,[rax]
-  movsx       eax,r9w
-  movq        xmm15,[rcx]
-  punpcklqdq  xmm13,xmm0
-  movq        xmm0, [rdx]
-  movdqa      xmm4,xmm13
-  punpcklqdq  xmm15,xmm0
-  movq        xmm0, [rdi]
-  punpcklbw   xmm4,xmm1
-  movdqa      xmm12,xmm15
-  punpcklqdq  xmm2,xmm0
-  movq        xmm0, [r11+rdi]
-  punpcklbw   xmm12,xmm1
-  movdqa      xmm14,xmm2
-  punpcklqdq  xmm9,xmm0
-  punpckhbw   xmm2,xmm1
-  punpcklbw   xmm14,xmm1
-  movd        xmm0,eax
-  movsx       eax,word [rsp + 0C8h + 38h + 160] ; iBeta
-  punpckhbw   xmm13,xmm1
-  punpckhbw   xmm15,xmm1
-  movdqa      xmm3,xmm9
-  movdqa      [rsp+10h],xmm2
-  punpcklwd   xmm0,xmm0
-  punpckhbw   xmm9,xmm1
-  punpcklbw   xmm3,xmm1
-  movdqa      xmm1,xmm14
-  pshufd      xmm10,xmm0,0
-  movd        xmm0,eax
-  mov         eax,4
-  cwde
-  punpcklwd   xmm0,xmm0
-  pshufd      xmm8,xmm0,0
-  movd        xmm0,eax
-  punpcklwd   xmm0,xmm0
-  pshufd      xmm5,xmm0,0
-  psubw       xmm1,xmm12
-  movdqa      xmm2,xmm10
-  lea         r11,[rsp+0C8h]
-  psllw       xmm1,2
-  movdqa      xmm0,xmm4
-  psubw       xmm4,xmm12
-  psubw       xmm0,xmm3
-  psubw       xmm3,xmm14
-  paddw       xmm1,xmm0
-  paddw       xmm1,xmm5
-  movdqa      xmm0,xmm11
-  psraw       xmm1,3
-  pmaxsw      xmm0,xmm1
-  pminsw      xmm6,xmm0
-  movdqa      xmm1,xmm8
-  movdqa      xmm0,xmm12
-  psubw       xmm0,xmm14
-  pabsw       xmm0,xmm0
-  pcmpgtw     xmm2,xmm0
-  pabsw       xmm0,xmm4
-  pcmpgtw     xmm1,xmm0
-  pabsw       xmm0,xmm3
-  movdqa      xmm3,[rsp]
-  pand        xmm2,xmm1
-  movdqa      xmm1,xmm8
-  pcmpgtw     xmm1,xmm0
-  movdqa      xmm0,xmm13
-  pand        xmm2,xmm1
-  psubw       xmm0,xmm9
-  psubw       xmm13,xmm15
-  pand        xmm2,xmm7
-  pand        xmm6,xmm2
-  paddw       xmm12,xmm6
-  psubw       xmm14,xmm6
-  movdqa      xmm2,[rsp+10h]
-  movaps      xmm6,[r11-18h]
-  movdqa      xmm1,xmm2
-  psubw       xmm1,xmm15
-  psubw       xmm9,xmm2
-  psllw       xmm1,2
-  paddw       xmm1,xmm0
-  paddw       xmm1,xmm5
-  movdqa      xmm0,xmm15
-  psubw       xmm0,xmm2
-  psraw       xmm1,3
-  pmaxsw      xmm11,xmm1
-  pabsw       xmm0,xmm0
-  movdqa      xmm1,xmm8
-  pcmpgtw     xmm10,xmm0
-  pabsw       xmm0,xmm13
-  pminsw      xmm3,xmm11
-  movaps      xmm11,[r11-68h]
-  movaps      xmm13,[rsp+40h]
-  pcmpgtw     xmm1,xmm0
-  pabsw       xmm0,xmm9
-  movaps      xmm9, [r11-48h]
-  pand        xmm10,xmm1
-  pcmpgtw     xmm8,xmm0
-  pand        xmm10,xmm8
-  pand        xmm10,xmm7
-  movaps      xmm8,[r11-38h]
-  movaps      xmm7,[r11-28h]
-  pand        xmm3,xmm10
-  paddw       xmm15,xmm3
-  psubw       xmm2,xmm3
-  movaps      xmm10,[r11-58h]
-  packuswb    xmm12,xmm15
-  movaps      xmm15,[rsp+20h]
-  packuswb    xmm14,xmm2
-  movq        [rcx],xmm12
-  movq        [rbx],xmm14
-  psrldq      xmm12,8
-  psrldq      xmm14,8
-  movq        [rdx],xmm12
-  movaps      xmm12,[r11-78h]
-  movq        [rdi],xmm14
-  movaps      xmm14,[rsp+30h]
-  mov         rsp,r11
-  POP_XMM
-  pop         rdi
-  pop         rbx
-  ret
-
-
-WELS_EXTERN   DeblockChromaEq4V_ssse3
-  mov         rax,rsp
-  push        rbx
-  PUSH_XMM 15
-  sub         rsp,90h
-  pxor        xmm1,xmm1
-  mov         r11,rcx
-  mov         rbx,rdx
-  mov         r10d,r9d
-  movq        xmm13,[r11]
-  lea         eax,[r8+r8]
-  movsxd      r9,eax
-  mov         rax,rcx
-  sub         rax,r9
-  movq        xmm14,[rax]
-  mov         rax,rdx
-  sub         rax,r9
-  movq        xmm0,[rax]
-  movsxd      rax,r8d
-  sub         rcx,rax
-  sub         rdx,rax
-  movq        xmm12,[rax+r11]
-  movq        xmm10,[rcx]
-  punpcklqdq  xmm14,xmm0
-  movdqa      xmm8,xmm14
-  movq        xmm0,[rdx]
-  punpcklbw   xmm8,xmm1
-  punpckhbw   xmm14,xmm1
-  punpcklqdq  xmm10,xmm0
-  movq        xmm0,[rbx]
-  movdqa      xmm5,xmm10
-  punpcklqdq  xmm13,xmm0
-  movq        xmm0, [rax+rbx]
-  punpcklbw   xmm5,xmm1
-  movsx       eax,r10w
-  movdqa      xmm9,xmm13
-  punpcklqdq  xmm12,xmm0
-  punpcklbw   xmm9,xmm1
-  punpckhbw   xmm10,xmm1
-  movd        xmm0,eax
-  movsx       eax,word [rsp + 90h + 8h + 28h + 144]   ; iBeta
-  punpckhbw   xmm13,xmm1
-  movdqa      xmm7,xmm12
-  punpcklwd   xmm0,xmm0
-  punpckhbw   xmm12,xmm1
-  pshufd      xmm11,xmm0,0
-  punpcklbw   xmm7,xmm1
-  movd        xmm0,eax
-  movdqa      xmm1,xmm8
-  psubw       xmm1,xmm5
-  punpcklwd   xmm0,xmm0
-  movdqa      xmm6,xmm11
-  pshufd      xmm3,xmm0,0
-  movdqa      xmm0,xmm5
-  psubw       xmm0,xmm9
-  movdqa      xmm2,xmm3
-  pabsw       xmm0,xmm0
-  pcmpgtw     xmm6,xmm0
-  pabsw       xmm0,xmm1
-  movdqa      xmm1,xmm3
-  pcmpgtw     xmm2,xmm0
-  pand        xmm6,xmm2
-  movdqa      xmm0,xmm7
-  movdqa      xmm2,xmm3
-  psubw       xmm0,xmm9
-  pabsw       xmm0,xmm0
-  pcmpgtw     xmm1,xmm0
-  pand        xmm6,xmm1
-  movdqa      xmm0,xmm10
-  movdqa      xmm1,xmm14
-  psubw       xmm0,xmm13
-  psubw       xmm1,xmm10
-  pabsw       xmm0,xmm0
-  pcmpgtw     xmm11,xmm0
-  pabsw       xmm0,xmm1
-  pcmpgtw     xmm2,xmm0
-  pand        xmm11,xmm2
-  movdqa      xmm0,xmm12
-  movdqa      xmm4,xmm6
-  movdqa      xmm1,xmm8
-  mov         eax,2
-  cwde
-  paddw       xmm1,xmm8
-  psubw       xmm0,xmm13
-  paddw       xmm1,xmm5
-  pabsw       xmm0,xmm0
-  movdqa      xmm2,xmm14
-  paddw       xmm1,xmm7
-  pcmpgtw     xmm3,xmm0
-  paddw       xmm2,xmm14
-  movd        xmm0,eax
-  pand        xmm11,xmm3
-  paddw       xmm7,xmm7
-  paddw       xmm2,xmm10
-  punpcklwd   xmm0,xmm0
-  paddw       xmm2,xmm12
-  paddw       xmm12,xmm12
-  pshufd      xmm3,xmm0,0
-  paddw       xmm7,xmm9
-  paddw       xmm12,xmm13
-  movdqa      xmm0,xmm6
-  paddw       xmm1,xmm3
-  pandn       xmm0,xmm5
-  paddw       xmm7,xmm8
-  psraw       xmm1,2
-  paddw       xmm12,xmm14
-  paddw       xmm7,xmm3
-  movaps      xmm14,[rsp]
-  pand        xmm4,xmm1
-  paddw       xmm12,xmm3
-  psraw       xmm7,2
-  movdqa      xmm1,xmm11
-  por         xmm4,xmm0
-  psraw       xmm12,2
-  paddw       xmm2,xmm3
-  movdqa      xmm0,xmm11
-  pandn       xmm0,xmm10
-  psraw       xmm2,2
-  pand        xmm1,xmm2
-  por         xmm1,xmm0
-  packuswb    xmm4,xmm1
-  movdqa      xmm0,xmm11
-  movdqa      xmm1,xmm6
-  pand        xmm1,xmm7
-  movaps      xmm7,[rsp+70h]
-  movq        [rcx],xmm4
-  pandn       xmm6,xmm9
-  pandn       xmm11,xmm13
-  pand        xmm0,xmm12
-  por         xmm1,xmm6
-  por         xmm0,xmm11
-  psrldq      xmm4,8
-  packuswb    xmm1,xmm0
-  movq        [r11],xmm1
-  psrldq      xmm1,8
-  movq        [rdx],xmm4
-  lea         r11,[rsp+90h]
-  movaps      xmm6,[r11-10h]
-  movaps      xmm8,[r11-30h]
-  movaps      xmm9,[r11-40h]
-  movq        [rbx],xmm1
-  movaps      xmm10,[r11-50h]
-  movaps      xmm11,[r11-60h]
-  movaps      xmm12,[r11-70h]
-  movaps      xmm13,[r11-80h]
-  mov         rsp,r11
-  POP_XMM
-  pop         rbx
-  ret
-
-
-
-
-
-WELS_EXTERN   DeblockChromaEq4H_ssse3
-  mov         rax,rsp
-  mov         [rax+20h],rbx
-  push        rdi
-  PUSH_XMM 16
-  sub         rsp,140h
-  mov         rdi,rdx
-  lea         eax,[r8*4]
-  movsxd      r10,eax
-  mov         eax,[rcx-2]
-  mov         [rsp+10h],eax
-  lea         rbx,[r10+rdx-2]
-  lea         r11,[r10+rcx-2]
-  movdqa      xmm5,[rsp+10h]
-  movsxd      r10,r8d
-  mov         eax,[r10+rcx-2]
-  lea         rdx,[r10+r10*2]
-  mov         [rsp+20h],eax
-  mov         eax,[rcx+r10*2-2]
-  mov         [rsp+30h],eax
-  mov         eax,[rdx+rcx-2]
-  movdqa      xmm2,[rsp+20h]
-  mov         [rsp+40h],eax
-  mov         eax, [rdi-2]
-  movdqa      xmm4,[rsp+30h]
-  mov         [rsp+50h],eax
-  mov         eax,[r10+rdi-2]
-  movdqa      xmm3,[rsp+40h]
-  mov         [rsp+60h],eax
-  mov         eax,[rdi+r10*2-2]
-  punpckldq   xmm5,[rsp+50h]
-  mov         [rsp+70h],eax
-  mov         eax, [rdx+rdi-2]
-  punpckldq   xmm2, [rsp+60h]
-  mov          [rsp+80h],eax
-  mov         eax,[r11]
-  punpckldq   xmm4, [rsp+70h]
-  mov         [rsp+50h],eax
-  mov         eax,[rbx]
-  punpckldq   xmm3,[rsp+80h]
-  mov         [rsp+60h],eax
-  mov         eax,[r10+r11]
-  movdqa      xmm0, [rsp+50h]
-  punpckldq   xmm0, [rsp+60h]
-  punpcklqdq  xmm5,xmm0
-  movdqa      [rsp+50h],xmm0
-  mov         [rsp+50h],eax
-  mov         eax,[r10+rbx]
-  movdqa      xmm0,[rsp+50h]
-  movdqa      xmm1,xmm5
-  mov         [rsp+60h],eax
-  mov         eax,[r11+r10*2]
-  punpckldq   xmm0, [rsp+60h]
-  punpcklqdq  xmm2,xmm0
-  punpcklbw   xmm1,xmm2
-  punpckhbw   xmm5,xmm2
-  movdqa      [rsp+50h],xmm0
-  mov         [rsp+50h],eax
-  mov         eax,[rbx+r10*2]
-  movdqa      xmm0,[rsp+50h]
-  mov         [rsp+60h],eax
-  mov         eax, [rdx+r11]
-  movdqa      xmm15,xmm1
-  punpckldq   xmm0,[rsp+60h]
-  punpcklqdq  xmm4,xmm0
-  movdqa      [rsp+50h],xmm0
-  mov         [rsp+50h],eax
-  mov         eax, [rdx+rbx]
-  movdqa      xmm0,[rsp+50h]
-  mov         [rsp+60h],eax
-  punpckldq   xmm0, [rsp+60h]
-  punpcklqdq  xmm3,xmm0
-  movdqa      xmm0,xmm4
-  punpcklbw   xmm0,xmm3
-  punpckhbw   xmm4,xmm3
-  punpcklwd   xmm15,xmm0
-  punpckhwd   xmm1,xmm0
-  movdqa      xmm0,xmm5
-  movdqa      xmm12,xmm15
-  punpcklwd   xmm0,xmm4
-  punpckhwd   xmm5,xmm4
-  punpckldq   xmm12,xmm0
-  punpckhdq   xmm15,xmm0
-  movdqa      xmm0,xmm1
-  movdqa      xmm11,xmm12
-  punpckldq   xmm0,xmm5
-  punpckhdq   xmm1,xmm5
-  punpcklqdq  xmm11,xmm0
-  punpckhqdq  xmm12,xmm0
-  movsx       eax,r9w
-  movdqa      xmm14,xmm15
-  punpcklqdq  xmm14,xmm1
-  punpckhqdq  xmm15,xmm1
-  pxor        xmm1,xmm1
-  movd        xmm0,eax
-  movdqa      xmm4,xmm12
-  movdqa      xmm8,xmm11
-  movsx       eax,word [rsp+170h + 160] ; iBeta
-  punpcklwd   xmm0,xmm0
-  punpcklbw   xmm4,xmm1
-  punpckhbw   xmm12,xmm1
-  movdqa      xmm9,xmm14
-  movdqa      xmm7,xmm15
-  movdqa      xmm10,xmm15
-  pshufd      xmm13,xmm0,0
-  punpcklbw   xmm9,xmm1
-  punpckhbw   xmm14,xmm1
-  movdqa      xmm6,xmm13
-  movd        xmm0,eax
-  movdqa      [rsp],xmm11
-  mov         eax,2
-  cwde
-  punpckhbw   xmm11,xmm1
-  punpckhbw   xmm10,xmm1
-  punpcklbw   xmm7,xmm1
-  punpcklwd   xmm0,xmm0
-  punpcklbw   xmm8,xmm1
-  pshufd      xmm3,xmm0,0
-  movdqa      xmm1,xmm8
-  movdqa      xmm0,xmm4
-  psubw       xmm0,xmm9
-  psubw       xmm1,xmm4
-  movdqa      xmm2,xmm3
-  pabsw       xmm0,xmm0
-  pcmpgtw     xmm6,xmm0
-  pabsw       xmm0,xmm1
-  movdqa      xmm1,xmm3
-  pcmpgtw     xmm2,xmm0
-  pand        xmm6,xmm2
-  movdqa      xmm0,xmm7
-  movdqa      xmm2,xmm3
-  psubw       xmm0,xmm9
-  pabsw       xmm0,xmm0
-  pcmpgtw     xmm1,xmm0
-  pand        xmm6,xmm1
-  movdqa      xmm0,xmm12
-  movdqa      xmm1,xmm11
-  psubw       xmm0,xmm14
-  psubw       xmm1,xmm12
-  movdqa      xmm5,xmm6
-  pabsw       xmm0,xmm0
-  pcmpgtw     xmm13,xmm0
-  pabsw       xmm0,xmm1
-  movdqa      xmm1,xmm8
-  pcmpgtw     xmm2,xmm0
-  paddw       xmm1,xmm8
-  movdqa      xmm0,xmm10
-  pand        xmm13,xmm2
-  psubw       xmm0,xmm14
-  paddw       xmm1,xmm4
-  movdqa      xmm2,xmm11
-  pabsw       xmm0,xmm0
-  paddw       xmm2,xmm11
-  paddw       xmm1,xmm7
-  pcmpgtw     xmm3,xmm0
-  paddw       xmm2,xmm12
-  movd        xmm0,eax
-  pand        xmm13,xmm3
-  paddw       xmm2,xmm10
-  punpcklwd   xmm0,xmm0
-  pshufd      xmm3,xmm0,0
-  movdqa      xmm0,xmm6
-  paddw       xmm1,xmm3
-  pandn       xmm0,xmm4
-  paddw       xmm2,xmm3
-  psraw       xmm1,2
-  pand        xmm5,xmm1
-  por         xmm5,xmm0
-  paddw       xmm7,xmm7
-  paddw       xmm10,xmm10
-  psraw       xmm2,2
-  movdqa      xmm1,xmm13
-  movdqa      xmm0,xmm13
-  pandn       xmm0,xmm12
-  pand        xmm1,xmm2
-  paddw       xmm7,xmm9
-  por         xmm1,xmm0
-  paddw       xmm10,xmm14
-  paddw       xmm7,xmm8
-  movdqa      xmm0,xmm13
-  packuswb    xmm5,xmm1
-  paddw       xmm7,xmm3
-  paddw       xmm10,xmm11
-  movdqa      xmm1,xmm6
-  paddw       xmm10,xmm3
-  pandn       xmm6,xmm9
-  psraw       xmm7,2
-  pand        xmm1,xmm7
-  psraw       xmm10,2
-  pandn       xmm13,xmm14
-  pand        xmm0,xmm10
-  por         xmm1,xmm6
-  movdqa      xmm6,[rsp]
-  movdqa      xmm4,xmm6
-  por         xmm0,xmm13
-  punpcklbw   xmm4,xmm5
-  punpckhbw   xmm6,xmm5
-  movdqa      xmm3,xmm4
-  packuswb    xmm1,xmm0
-  movdqa      xmm0,xmm1
-  punpckhbw   xmm1,xmm15
-  punpcklbw   xmm0,xmm15
-  punpcklwd   xmm3,xmm0
-  punpckhwd   xmm4,xmm0
-  movdqa      xmm0,xmm6
-  movdqa      xmm2,xmm3
-  punpcklwd   xmm0,xmm1
-  punpckhwd   xmm6,xmm1
-  movdqa      xmm1,xmm4
-  punpckldq   xmm2,xmm0
-  punpckhdq   xmm3,xmm0
-  punpckldq   xmm1,xmm6
-  movdqa      xmm0,xmm2
-  punpcklqdq  xmm0,xmm1
-  punpckhdq   xmm4,xmm6
-  punpckhqdq  xmm2,xmm1
-  movdqa      [rsp+10h],xmm0
-  movdqa      [rsp+60h],xmm2
-  movdqa      xmm0,xmm3
-  mov         eax,[rsp+10h]
-  mov         [rcx-2],eax
-  mov         eax,[rsp+60h]
-  punpcklqdq  xmm0,xmm4
-  punpckhqdq  xmm3,xmm4
-  mov         [r10+rcx-2],eax
-  movdqa      [rsp+20h],xmm0
-  mov         eax, [rsp+20h]
-  movdqa      [rsp+70h],xmm3
-  mov         [rcx+r10*2-2],eax
-  mov         eax,[rsp+70h]
-  mov         [rdx+rcx-2],eax
-  mov         eax,[rsp+18h]
-  mov         [r11],eax
-  mov         eax,[rsp+68h]
-  mov         [r10+r11],eax
-  mov         eax,[rsp+28h]
-  mov         [r11+r10*2],eax
-  mov         eax,[rsp+78h]
-  mov         [rdx+r11],eax
-  mov         eax,[rsp+14h]
-  mov         [rdi-2],eax
-  mov         eax,[rsp+64h]
-  mov         [r10+rdi-2],eax
-  mov         eax,[rsp+24h]
-  mov         [rdi+r10*2-2],eax
-  mov         eax, [rsp+74h]
-  mov         [rdx+rdi-2],eax
-  mov         eax, [rsp+1Ch]
-  mov         [rbx],eax
-  mov         eax, [rsp+6Ch]
-  mov         [r10+rbx],eax
-  mov         eax,[rsp+2Ch]
-  mov         [rbx+r10*2],eax
-  mov         eax,[rsp+7Ch]
-  mov         [rdx+rbx],eax
-  lea         rsp,[rsp+140h]
-  POP_XMM
-  mov         rbx, [rsp+28h]
-  pop         rdi
-  ret
-
-
-
-WELS_EXTERN DeblockChromaLt4H_ssse3
-  mov         rax,rsp
-  push        rbx
-  push        rbp
-  push        rsi
-  push        rdi
-  push        r12
-  PUSH_XMM 16
-  sub         rsp,170h
-
-  movsxd      rsi,r8d
-  lea         eax,[r8*4]
-  mov         r11d,r9d
-  movsxd      r10,eax
-  mov         eax, [rcx-2]
-  mov         r12,rdx
-  mov         [rsp+40h],eax
-  mov         eax, [rsi+rcx-2]
-  lea         rbx,[r10+rcx-2]
-  movdqa      xmm5,[rsp+40h]
-  mov         [rsp+50h],eax
-  mov         eax, [rcx+rsi*2-2]
-  lea         rbp,[r10+rdx-2]
-  movdqa      xmm2, [rsp+50h]
-  mov         [rsp+60h],eax
-  lea         r10,[rsi+rsi*2]
-  mov         rdi,rcx
-  mov         eax,[r10+rcx-2]
-  movdqa      xmm4,[rsp+60h]
-  mov         [rsp+70h],eax
-  mov         eax,[rdx-2]
-  mov         [rsp+80h],eax
-  mov         eax, [rsi+rdx-2]
-  movdqa      xmm3,[rsp+70h]
-  mov         [rsp+90h],eax
-  mov         eax,[rdx+rsi*2-2]
-  punpckldq   xmm5,[rsp+80h]
-  mov         [rsp+0A0h],eax
-  mov         eax, [r10+rdx-2]
-  punpckldq   xmm2,[rsp+90h]
-  mov         [rsp+0B0h],eax
-  mov         eax, [rbx]
-  punpckldq   xmm4,[rsp+0A0h]
-  mov         [rsp+80h],eax
-  mov         eax,[rbp]
-  punpckldq   xmm3,[rsp+0B0h]
-  mov         [rsp+90h],eax
-  mov         eax,[rsi+rbx]
-  movdqa      xmm0,[rsp+80h]
-  punpckldq   xmm0,[rsp+90h]
-  punpcklqdq  xmm5,xmm0
-  movdqa      [rsp+80h],xmm0
-  mov         [rsp+80h],eax
-  mov         eax,[rsi+rbp]
-  movdqa      xmm0,[rsp+80h]
-  movdqa      xmm1,xmm5
-  mov         [rsp+90h],eax
-  mov         eax,[rbx+rsi*2]
-  punpckldq   xmm0,[rsp+90h]
-  punpcklqdq  xmm2,xmm0
-  punpcklbw   xmm1,xmm2
-  punpckhbw   xmm5,xmm2
-  movdqa      [rsp+80h],xmm0
-  mov         [rsp+80h],eax
-  mov         eax,[rbp+rsi*2]
-  movdqa      xmm0, [rsp+80h]
-  mov         [rsp+90h],eax
-  mov         eax,[r10+rbx]
-  movdqa      xmm7,xmm1
-  punpckldq   xmm0,[rsp+90h]
-  punpcklqdq  xmm4,xmm0
-  movdqa      [rsp+80h],xmm0
-  mov         [rsp+80h],eax
-  mov         eax, [r10+rbp]
-  movdqa      xmm0,[rsp+80h]
-  mov         [rsp+90h],eax
-  punpckldq   xmm0,[rsp+90h]
-  punpcklqdq  xmm3,xmm0
-  movdqa      xmm0,xmm4
-  punpcklbw   xmm0,xmm3
-  punpckhbw   xmm4,xmm3
-  punpcklwd   xmm7,xmm0
-  punpckhwd   xmm1,xmm0
-  movdqa      xmm0,xmm5
-  movdqa      xmm6,xmm7
-  punpcklwd   xmm0,xmm4
-  punpckhwd   xmm5,xmm4
-  punpckldq   xmm6,xmm0
-  punpckhdq   xmm7,xmm0
-  movdqa      xmm0,xmm1
-  punpckldq   xmm0,xmm5
-  mov         rax, [rsp+1C8h+160]    ; pTC
-  punpckhdq   xmm1,xmm5
-  movdqa      xmm9,xmm6
-  punpckhqdq  xmm6,xmm0
-  punpcklqdq  xmm9,xmm0
-  movdqa      xmm2,xmm7
-  movdqa      xmm13,xmm6
-  movdqa      xmm4,xmm9
-  movdqa      [rsp+10h],xmm9
-  punpcklqdq  xmm2,xmm1
-  punpckhqdq  xmm7,xmm1
-  pxor        xmm1,xmm1
-  movsx       ecx,byte [rax+3]
-  movsx       edx,byte [rax+2]
-  movsx       r8d,byte [rax+1]
-  movsx       r9d,byte [rax]
-  movdqa      xmm10,xmm1
-  movdqa      xmm15,xmm2
-  punpckhbw   xmm2,xmm1
-  punpckhbw   xmm6,xmm1
-  punpcklbw   xmm4,xmm1
-  movsx       eax,r11w
-  mov         word [rsp+0Eh],cx
-  mov         word [rsp+0Ch],cx
-  movdqa      xmm3,xmm7
-  movdqa      xmm8,xmm7
-  movdqa      [rsp+20h],xmm7
-  punpcklbw   xmm15,xmm1
-  punpcklbw   xmm13,xmm1
-  punpcklbw   xmm3,xmm1
-  mov         word [rsp+0Ah],dx
-  mov         word [rsp+8],dx
-  mov         word [rsp+6],r8w
-  movd        xmm0,eax
-  movdqa      [rsp+30h],xmm6
-  punpckhbw   xmm9,xmm1
-  punpckhbw   xmm8,xmm1
-  punpcklwd   xmm0,xmm0
-  movsx       eax,word [rsp+1C0h+160]   ; iBeta
-  mov         word [rsp+4],r8w
-  mov         word [rsp+2],r9w
-  pshufd      xmm12,xmm0,0
-  mov         word [rsp],r9w
-  movd        xmm0,eax
-  mov         eax,4
-  cwde
-  movdqa      xmm14, [rsp]
-  movdqa      [rsp],xmm2
-  movdqa      xmm2,xmm12
-  punpcklwd   xmm0,xmm0
-  pshufd      xmm11,xmm0,0
-  psubw       xmm10,xmm14
-  movd        xmm0,eax
-  movdqa      xmm7,xmm14
-  movdqa      xmm6,xmm14
-  pcmpgtw     xmm7,xmm1
-  punpcklwd   xmm0,xmm0
-  pshufd      xmm5,xmm0,0
-  movdqa      xmm0,xmm4
-  movdqa      xmm1,xmm15
-  psubw       xmm4,xmm13
-  psubw       xmm0,xmm3
-  psubw       xmm1,xmm13
-  psubw       xmm3,xmm15
-  psllw       xmm1,2
-  paddw       xmm1,xmm0
-  paddw       xmm1,xmm5
-  movdqa      xmm0,xmm10
-  psraw       xmm1,3
-  pmaxsw      xmm0,xmm1
-  pminsw      xmm6,xmm0
-  movdqa      xmm1,xmm11
-  movdqa      xmm0,xmm13
-  psubw       xmm0,xmm15
-  pabsw       xmm0,xmm0
-  pcmpgtw     xmm2,xmm0
-  pabsw       xmm0,xmm4
-  pcmpgtw     xmm1,xmm0
-  pabsw       xmm0,xmm3
-  pand        xmm2,xmm1
-  movdqa      xmm1,xmm11
-  movdqa      xmm3,[rsp+30h]
-  pcmpgtw     xmm1,xmm0
-  movdqa      xmm0,xmm9
-  pand        xmm2,xmm1
-  psubw       xmm0,xmm8
-  psubw       xmm9,xmm3
-  pand        xmm2,xmm7
-  pand        xmm6,xmm2
-  psubw       xmm15,xmm6
-  paddw       xmm13,xmm6
-  movdqa      xmm2,[rsp]
-  movdqa      xmm1,xmm2
-  psubw       xmm1,xmm3
-  psubw       xmm8,xmm2
-  psllw       xmm1,2
-  paddw       xmm1,xmm0
-  paddw       xmm1,xmm5
-  movdqa      xmm0,xmm3
-  movdqa      xmm5,[rsp+10h]
-  psubw       xmm0,xmm2
-  psraw       xmm1,3
-  movdqa      xmm4,xmm5
-  pabsw       xmm0,xmm0
-  pmaxsw      xmm10,xmm1
-  movdqa      xmm1,xmm11
-  pcmpgtw     xmm12,xmm0
-  pabsw       xmm0,xmm9
-  pminsw      xmm14,xmm10
-  pcmpgtw     xmm1,xmm0
-  pabsw       xmm0,xmm8
-  pcmpgtw     xmm11,xmm0
-  pand        xmm12,xmm1
-  movdqa      xmm1,[rsp+20h]
-  pand        xmm12,xmm11
-  pand        xmm12,xmm7
-  pand        xmm14,xmm12
-  paddw       xmm3,xmm14
-  psubw       xmm2,xmm14
-  packuswb    xmm13,xmm3
-  packuswb    xmm15,xmm2
-  punpcklbw   xmm4,xmm13
-  punpckhbw   xmm5,xmm13
-  movdqa      xmm0,xmm15
-  punpcklbw   xmm0,xmm1
-  punpckhbw   xmm15,xmm1
-  movdqa      xmm3,xmm4
-  punpcklwd   xmm3,xmm0
-  punpckhwd   xmm4,xmm0
-  movdqa      xmm0,xmm5
-  movdqa      xmm2,xmm3
-  movdqa      xmm1,xmm4
-  punpcklwd   xmm0,xmm15
-  punpckhwd   xmm5,xmm15
-  punpckldq   xmm2,xmm0
-  punpckhdq   xmm3,xmm0
-  punpckldq   xmm1,xmm5
-  movdqa      xmm0,xmm2
-  punpcklqdq  xmm0,xmm1
-  punpckhdq   xmm4,xmm5
-  punpckhqdq  xmm2,xmm1
-  movdqa      [rsp+40h],xmm0
-  movdqa      xmm0,xmm3
-  movdqa      [rsp+90h],xmm2
-  mov         eax,[rsp+40h]
-  mov         [rdi-2],eax
-  mov         eax, [rsp+90h]
-  punpcklqdq  xmm0,xmm4
-  punpckhqdq  xmm3,xmm4
-  mov         [rsi+rdi-2],eax
-  movdqa      [rsp+50h],xmm0
-  mov         eax,[rsp+50h]
-  movdqa      [rsp+0A0h],xmm3
-  mov         [rdi+rsi*2-2],eax
-  mov         eax,[rsp+0A0h]
-  mov         [r10+rdi-2],eax
-  mov         eax,[rsp+48h]
-  mov         [rbx],eax
-  mov         eax,[rsp+98h]
-  mov         [rsi+rbx],eax
-  mov         eax,[rsp+58h]
-  mov         [rbx+rsi*2],eax
-  mov         eax, [rsp+0A8h]
-  mov         [r10+rbx],eax
-  mov         eax, [rsp+44h]
-  mov         [r12-2],eax
-  mov         eax,[rsp+94h]
-  mov         [rsi+r12-2],eax
-  mov         eax,[rsp+54h]
-  mov         [r12+rsi*2-2],eax
-  mov         eax, [rsp+0A4h]
-  mov         [r10+r12-2],eax
-  mov         eax,[rsp+4Ch]
-  mov         [rbp],eax
-  mov         eax,[rsp+9Ch]
-  mov         [rsi+rbp],eax
-  mov         eax, [rsp+5Ch]
-  mov         [rbp+rsi*2],eax
-  mov         eax,[rsp+0ACh]
-  mov         [r10+rbp],eax
-  lea         r11,[rsp+170h]
-  mov         rsp,r11
-  POP_XMM
-  pop         r12
-  pop         rdi
-  pop         rsi
-  pop         rbp
-  pop         rbx
-  ret
-
-
-
-%elifdef  UNIX64
-
-
-WELS_EXTERN   DeblockLumaLt4V_ssse3
-  push        rbp
-  mov         r11,r8  ; pTC
-  sub         rsp,1B0h
-  lea         rbp,[rsp+20h]
-  movd        xmm4,edx
-  movd        xmm2,ecx
-  mov         qword [rbp+180h],r12
-  mov         r10,rdi
-  movsxd      r12,esi
-  add         rsi,rsi
-  movsxd      rdx,esi
-  sub         r10,r12
-  movsx       r8d,byte [r11]
-  pxor        xmm3,xmm3
-  punpcklwd   xmm2,xmm2
-  movaps      [rbp+50h],xmm14
-  lea         rax,[r12+r12*2]
-  movdqa      xmm14,[rdx+rdi]
-  neg         rax
-  pshufd      xmm0,xmm2,0
-  movd        xmm2,r8d
-  movsx       rsi,byte [r11+1]
-  movsx       r8d,byte [r11+2]
-  movsx       r11d,byte [r11+3]
-  movaps      [rbp+70h],xmm12
-  movd        xmm1,esi
-  movaps      [rbp+80h],xmm11
-  movd        xmm12,r8d
-  movd        xmm11,r11d
-  movdqa      xmm5, [rax+rdi]
-  lea         rax,[r12+r12]
-  punpcklwd   xmm12,xmm12
-  neg         rax
-  punpcklwd   xmm11,xmm11
-  movaps      [rbp],xmm8
-  movdqa      xmm8, [r10]
-  punpcklwd   xmm2,xmm2
-  punpcklwd   xmm1,xmm1
-  punpcklqdq  xmm12,xmm12
-  punpcklqdq  xmm11,xmm11
-  punpcklqdq  xmm2,xmm2
-  punpcklqdq  xmm1,xmm1
-  shufps      xmm12,xmm11,88h
-  movdqa      xmm11,xmm8
-  movaps      [rbp+30h],xmm9
-  movdqa      xmm9,[rdi]
-  shufps      xmm2,xmm1,88h
-  movdqa      xmm1,xmm5
-  punpcklbw   xmm11,xmm3
-  movaps      [rbp+20h],xmm6
-  movaps      [rbp+60h],xmm13
-  movdqa      xmm13,xmm11
-  movaps      [rbp+90h],xmm10
-  movdqa      xmm10,xmm9
-  movdqa      xmm6,[rax+rdi]
-  punpcklbw   xmm1,xmm3
-  movaps      [rbp+0A0h],xmm12
-  psubw       xmm13,xmm1
-  movaps      [rbp+40h],xmm15
-  movdqa      xmm15,xmm14
-  movaps      [rbp+10h],xmm7
-  movdqa      xmm7,xmm6
-  punpcklbw   xmm10,xmm3
-  movdqa      xmm12,[r12+rdi]
-  punpcklbw   xmm7,xmm3
-  punpcklbw   xmm12,xmm3
-  punpcklbw   xmm15,xmm3
-  pabsw       xmm3,xmm13
-  movdqa      xmm13,xmm10
-  psubw       xmm13,xmm15
-  movdqa      [rbp+0F0h],xmm15
-  pabsw       xmm15,xmm13
-  movdqa      xmm13,xmm11
-  movdqa      [rbp+0B0h],xmm1
-  movdqa      xmm1,xmm0
-  pavgw       xmm13,xmm10
-  pcmpgtw     xmm1,xmm3
-  movdqa      [rbp+120h],xmm13
-  movaps      xmm13,xmm2
-  punpcklwd   xmm4,xmm4
-  movdqa      xmm3,xmm0
-  movdqa      [rbp+100h],xmm1
-  psubw       xmm13,xmm1
-  movdqa      xmm1,xmm10
-  pcmpgtw     xmm3,xmm15
-  pshufd      xmm4,xmm4,0
-  psubw       xmm1,xmm11
-  movdqa      [rbp+0D0h],xmm10
-  psubw       xmm13,xmm3
-  movdqa      [rbp+110h],xmm3
-  pabsw       xmm15,xmm1
-  movdqa      xmm3,xmm4
-  psubw       xmm10,xmm12
-  pcmpgtw     xmm3,xmm15
-  pabsw       xmm15,xmm10
-  movdqa      xmm10,xmm0
-  psllw       xmm1,2
-  movdqa      [rbp+0C0h],xmm11
-  psubw       xmm11,xmm7
-  pcmpgtw     xmm10,xmm15
-  pabsw       xmm11,xmm11
-  movdqa      xmm15,xmm0
-  pand        xmm3,xmm10
-  pcmpgtw     xmm15,xmm11
-  movaps      xmm11,xmm2
-  pxor        xmm10,xmm10
-  pand        xmm3,xmm15
-  pcmpgtw     xmm11,xmm10
-  pcmpeqw     xmm10,xmm2
-  por         xmm11,xmm10
-  pand        xmm3,xmm11
-  movdqa      xmm11,xmm7
-  psubw       xmm11,xmm12
-  pxor        xmm15,xmm15
-  paddw       xmm11,xmm1
-  psubw       xmm15,xmm13
-  movdqa      [rbp+0E0h],xmm12
-  paddw       xmm11,[FOUR_16B_SSE2]
-  pxor        xmm12,xmm12
-  psraw       xmm11,3
-  punpckhbw   xmm8,xmm12
-  pmaxsw      xmm15,xmm11
-  punpckhbw   xmm5,xmm12
-  movdqa      xmm11,xmm8
-  pminsw      xmm13,xmm15
-  psubw       xmm11,xmm5
-  punpckhbw   xmm9,xmm12
-  pand        xmm13,xmm3
-  movdqa      [rbp+130h],xmm13
-  pabsw       xmm13,xmm11
-  punpckhbw   xmm14,xmm12
-  movdqa      xmm11,xmm9
-  psubw       xmm11,xmm14
-  movdqa      xmm15,xmm0
-  movdqa      [rbp+140h],xmm14
-  pabsw       xmm14,xmm11
-  movdqa      xmm11,xmm8
-  pcmpgtw     xmm15,xmm14
-  movdqa      xmm1,[r12+rdi]
-  pavgw       xmm11,xmm9
-  movdqa      [rbp+170h],xmm11
-  movdqa      xmm10,xmm9
-  punpckhbw   xmm6,xmm12
-  psubw       xmm10,xmm8
-  punpckhbw   xmm1,xmm12
-  movdqa      xmm12,xmm0
-  movaps      xmm11,[rbp+0A0h]
-  pcmpgtw     xmm12,xmm13
-  movaps      xmm13,xmm11
-  psubw       xmm13,xmm12
-  movdqa      [rbp+160h],xmm15
-  psubw       xmm13,xmm15
-  movdqa      xmm15,xmm9
-  psubw       xmm15,xmm1
-  movdqa      [rbp+150h],xmm12
-  pabsw       xmm12,xmm10
-  pabsw       xmm14,xmm15
-  movdqa      xmm15,xmm8
-  pcmpgtw     xmm4,xmm12
-  movdqa      xmm12,xmm0
-  psubw       xmm15,xmm6
-  pcmpgtw     xmm12,xmm14
-  pabsw       xmm14,xmm15
-  psllw       xmm10,2
-  pcmpgtw     xmm0,xmm14
-  movdqa      xmm14,xmm6
-  psubw       xmm14,xmm1
-  pand        xmm4,xmm12
-  paddw       xmm14,xmm10
-  pand        xmm4,xmm0
-  paddw       xmm14,[FOUR_16B_SSE2]
-  pxor        xmm15,xmm15
-  movaps      xmm12,xmm11
-  psubw       xmm15,xmm13
-  pxor        xmm0,xmm0
-  psraw       xmm14,3
-  pcmpgtw     xmm12,xmm0
-  pcmpeqw     xmm0,xmm11
-  pmaxsw      xmm15,xmm14
-  por         xmm12,xmm0
-  movdqa      xmm0,[rbp+120h]
-  pminsw      xmm13,xmm15
-  movdqa      xmm15,[rbp+0B0h]
-  movdqa      xmm10,xmm7
-  pand        xmm4,xmm12
-  paddw       xmm15,xmm0
-  pxor        xmm12,xmm12
-  paddw       xmm10,xmm7
-  movdqa      xmm14,xmm12
-  psubw       xmm15,xmm10
-  psubw       xmm14,xmm2
-  psraw       xmm15,1
-  pmaxsw      xmm15,xmm14
-  movdqa      xmm10,xmm6
-  pminsw      xmm15,xmm2
-  paddw       xmm10,xmm6
-  pand        xmm15,xmm3
-  psubw       xmm12,xmm11
-  pand        xmm15,[rbp+100h]
-  pand        xmm13,xmm4
-  paddw       xmm7,xmm15
-  paddw       xmm8,xmm13
-  movdqa      xmm15,[rbp+170h]
-  psubw       xmm9,xmm13
-  paddw       xmm5,xmm15
-  psubw       xmm5,xmm10
-  psraw       xmm5,1
-  pmaxsw      xmm5,xmm12
-  pminsw      xmm5,xmm11
-  pand        xmm5,xmm4
-  pand        xmm5,[rbp+150h]
-  paddw       xmm6,xmm5
-  movdqa      xmm5,[rbp+0C0h]
-  packuswb    xmm7,xmm6
-  movdqa      xmm6,[rbp+130h]
-  paddw       xmm5,xmm6
-  packuswb    xmm5,xmm8
-  movdqa      xmm8,[rbp+0D0h]
-  psubw       xmm8,xmm6
-  movdqa      xmm6,[rbp+0F0h]
-  paddw       xmm6,xmm0
-  movdqa      xmm0,[rbp+0E0h]
-  packuswb    xmm8,xmm9
-  movdqa      xmm9,xmm0
-  paddw       xmm9,xmm0
-  psubw       xmm6,xmm9
-  psraw       xmm6,1
-  pmaxsw      xmm14,xmm6
-  pminsw      xmm2,xmm14
-  pand        xmm2,xmm3
-  pand        xmm2,[rbp+110h]
-  paddw       xmm0,xmm2
-  movdqa      xmm2,[rbp+140h]
-  paddw       xmm2,xmm15
-  movdqa      xmm15,xmm1
-  paddw       xmm15,xmm1
-  psubw       xmm2,xmm15
-  psraw       xmm2,1
-  pmaxsw      xmm12,xmm2
-  pminsw      xmm11,xmm12
-  pand        xmm11,xmm4
-  pand        xmm11,[rbp+160h]
-  paddw       xmm1,xmm11
-  movdqa      [rax+rdi],xmm7
-  movdqa      [r10],xmm5
-  packuswb    xmm0,xmm1
-  movdqa      [rdi],xmm8
-  movdqa      [r12+rdi],xmm0
-  mov         r12,qword [rbp+180h]
-  lea         rsp,[rbp+190h]
-  pop         rbp
-  ret
-
-
-WELS_EXTERN DeblockLumaEq4V_ssse3
-  mov         rax,rsp
-  push        rbx
-  push        rbp
-  mov         r8,   rdx
-  mov         r9,   rcx
-  mov         rcx,  rdi
-  mov         rdx,  rsi
-  sub         rsp,1D8h
-  movaps      [rax-38h],xmm6
-  movaps      [rax-48h],xmm7
-  movaps      [rax-58h],xmm8
-  pxor        xmm1,xmm1
-  movsxd      r10,edx
-  mov         rbp,rcx
-  mov         r11d,r8d
-  mov         rdx,rcx
-  mov         rdi,rbp
-  mov         rbx,rbp
-  movdqa      xmm5,[rbp]
-  movaps      [rax-68h],xmm9
-  movaps      [rax-78h],xmm10
-  punpcklbw   xmm5,xmm1
-  movaps      [rax-88h],xmm11
-  movaps      [rax-98h],xmm12
-  movaps      [rax-0A8h],xmm13
-  movaps      [rax-0B8h],xmm14
-  movdqa      xmm14,[r10+rbp]
-  movaps      [rax-0C8h],xmm15
-  lea         eax,[r10*4]
-  movsxd      r8,eax
-  lea         eax,[r10+r10*2]
-  movsxd      rcx,eax
-  lea         eax,[r10+r10]
-  sub         rdx,r8
-  punpcklbw   xmm14,xmm1
-  movdqa      [rsp+90h],xmm5
-  movdqa      [rsp+30h],xmm14
-  movsxd      rsi,eax
-  movsx       eax,r11w
-  sub         rdi,rcx
-  sub         rbx,rsi
-  mov         r8,rbp
-  sub         r8,r10
-  movd        xmm0,eax
-  movsx       eax,r9w
-  movdqa      xmm12,[rdi]
-  movdqa      xmm6, [rsi+rbp]
-  movdqa      xmm13,[rbx]
-  punpcklwd   xmm0,xmm0
-  pshufd      xmm11,xmm0,0
-  punpcklbw   xmm13,xmm1
-  punpcklbw   xmm6,xmm1
-  movdqa      xmm8,[r8]
-  movd        xmm0,eax
-  movdqa      xmm10,xmm11
-  mov         eax,2
-  punpcklbw   xmm8,xmm1
-  punpcklbw   xmm12,xmm1
-  cwde
-  punpcklwd   xmm0,xmm0
-  psraw       xmm10,2
-  movdqa      xmm1,xmm8
-  movdqa      [rsp+0F0h],xmm13
-  movdqa      [rsp+0B0h],xmm8
-  pshufd      xmm7,xmm0,0
-  psubw       xmm1,xmm13
-  movdqa      xmm0,xmm5
-  movdqa      xmm4,xmm7
-  movdqa      xmm2,xmm7
-  psubw       xmm0,xmm8
-  pabsw       xmm3,xmm0
-  pabsw       xmm0,xmm1
-  movdqa      xmm1,xmm5
-  movdqa      [rsp+40h],xmm7
-  movdqa      [rsp+60h],xmm6
-  pcmpgtw     xmm4,xmm0
-  psubw       xmm1,xmm14
-  pabsw       xmm0,xmm1
-  pcmpgtw     xmm2,xmm0
-  pand        xmm4,xmm2
-  movdqa      xmm0,xmm11
-  pcmpgtw     xmm0,xmm3
-  pand        xmm4,xmm0
-  movd        xmm0,eax
-  movdqa      [rsp+20h],xmm4
-  punpcklwd   xmm0,xmm0
-  pshufd      xmm2,xmm0,0
-  paddw       xmm10,xmm2
-  movdqa      [rsp+0A0h],xmm2
-  movdqa      xmm15,xmm7
-  pxor        xmm4,xmm4
-  movdqa      xmm0,xmm8
-  psubw       xmm0,xmm12
-  mov         eax,4
-  pabsw       xmm0,xmm0
-  movdqa      xmm1,xmm10
-  cwde
-  pcmpgtw     xmm15,xmm0
-  pcmpgtw     xmm1,xmm3
-  movdqa      xmm3,xmm7
-  movdqa      xmm7,[rdx]
-  movdqa      xmm0,xmm5
-  psubw       xmm0,xmm6
-  pand        xmm15,xmm1
-  punpcklbw   xmm7,xmm4
-  movdqa      xmm9,xmm15
-  pabsw       xmm0,xmm0
-  psllw       xmm7,1
-  pandn       xmm9,xmm12
-  pcmpgtw     xmm3,xmm0
-  paddw       xmm7,xmm12
-  movd        xmm0,eax
-  pand        xmm3,xmm1
-  paddw       xmm7,xmm12
-  punpcklwd   xmm0,xmm0
-  paddw       xmm7,xmm12
-  pshufd      xmm1,xmm0,0
-  paddw       xmm7,xmm13
-  movdqa      xmm0,xmm3
-  pandn       xmm0,xmm6
-  paddw       xmm7,xmm8
-  movdqa      [rsp+70h],xmm1
-  paddw       xmm7,xmm5
-  movdqa      [rsp+120h],xmm0
-  movdqa      xmm0,[rcx+rbp]
-  punpcklbw   xmm0,xmm4
-  paddw       xmm7,xmm1
-  movdqa      xmm4,xmm15
-  psllw       xmm0,1
-  psraw       xmm7,3
-  paddw       xmm0,xmm6
-  pand        xmm7,xmm15
-  paddw       xmm0,xmm6
-  paddw       xmm0,xmm6
-  paddw       xmm0,xmm14
-  movdqa      xmm6,xmm15
-  paddw       xmm0,xmm5
-  pandn       xmm6,xmm13
-  paddw       xmm0,xmm8
-  paddw       xmm0,xmm1
-  psraw       xmm0,3
-  movdqa      xmm1,xmm12
-  paddw       xmm1,xmm13
-  pand        xmm0,xmm3
-  movdqa      [rsp+100h],xmm0
-  movdqa      xmm0,xmm8
-  paddw       xmm0,xmm5
-  paddw       xmm1,xmm0
-  movdqa      xmm0,xmm3
-  paddw       xmm1,xmm2
-  psraw       xmm1,2
-  pandn       xmm0,xmm14
-  pand        xmm4,xmm1
-  movdqa      [rsp+0E0h],xmm0
-  movdqa      xmm0,xmm5
-  paddw       xmm0,xmm8
-  movdqa      xmm1,[rsp+60h]
-  paddw       xmm1,xmm14
-  movdqa      xmm14,xmm3
-  paddw       xmm1,xmm0
-  movdqa      xmm0,xmm8
-  paddw       xmm0,[rsp+30h]
-  paddw       xmm1,xmm2
-  psraw       xmm1,2
-  pand        xmm14,xmm1
-  movdqa      xmm1,xmm13
-  paddw       xmm1,xmm13
-  paddw       xmm1,xmm0
-  paddw       xmm1,xmm2
-  psraw       xmm1,2
-  movdqa      xmm0,[rsp+30h]
-  movdqa      xmm2,xmm13
-  movdqa      xmm5,xmm15
-  paddw       xmm0,[rsp+70h]
-  pandn       xmm5,xmm1
-  paddw       xmm2,xmm8
-  movdqa      xmm8,[rsp+90h]
-  movdqa      xmm1,xmm12
-  paddw       xmm2,xmm8
-  psllw       xmm2,1
-  paddw       xmm2,xmm0
-  paddw       xmm1,xmm2
-  movdqa      xmm0,xmm8
-  movdqa      xmm8,xmm3
-  movdqa      xmm2,[rsp+30h]
-  paddw       xmm0,xmm13
-  psraw       xmm1,3
-  pand        xmm15,xmm1
-  movdqa      xmm1,xmm2
-  paddw       xmm1,xmm2
-  paddw       xmm2,[rsp+90h]
-  paddw       xmm2,[rsp+0B0h]
-  paddw       xmm1,xmm0
-  movdqa      xmm0,xmm13
-  movdqa      xmm13,[r8]
-  paddw       xmm0, [rsp+70h]
-  paddw       xmm1, [rsp+0A0h]
-  psllw       xmm2,1
-  paddw       xmm2,xmm0
-  psraw       xmm1,2
-  movdqa      xmm0, [rdi]
-  pandn       xmm8,xmm1
-  movdqa      xmm1, [rsp+60h]
-  paddw       xmm1,xmm2
-  movdqa      xmm2, [rbx]
-  psraw       xmm1,3
-  pand        xmm3,xmm1
-  movdqa      xmm1, [rbp]
-  movdqa      [rsp+0D0h],xmm3
-  pxor        xmm3,xmm3
-  punpckhbw   xmm0,xmm3
-  punpckhbw   xmm1,xmm3
-  punpckhbw   xmm13,xmm3
-  movdqa      [rsp+0C0h],xmm0
-  movdqa      xmm0,[r10+rbp]
-  movdqa      [rsp],xmm1
-  punpckhbw   xmm0,xmm3
-  punpckhbw   xmm2,xmm3
-  movdqa      [rsp+80h],xmm0
-  movdqa      xmm0,[rsi+rbp]
-  movdqa      [rsp+10h],xmm13
-  punpckhbw   xmm0,xmm3
-  movdqa      [rsp+50h],xmm0
-  movdqa      xmm0,xmm1
-  movdqa      xmm1,xmm13
-  psubw       xmm0,xmm13
-  psubw       xmm1,xmm2
-  pabsw       xmm3,xmm0
-  pabsw       xmm0,xmm1
-  movdqa      xmm1,[rsp]
-  movdqa      xmm13,[rsp+40h]
-  movdqa      [rsp+110h],xmm2
-  psubw       xmm1, [rsp+80h]
-  pcmpgtw     xmm13,xmm0
-  pcmpgtw     xmm11,xmm3
-  pabsw       xmm0,xmm1
-  pcmpgtw     xmm10,xmm3
-  movdqa      xmm1, [rsp+40h]
-  movdqa      xmm2,xmm1
-  movdqa      xmm3,xmm1
-  pcmpgtw     xmm2,xmm0
-  movdqa      xmm0, [rsp+10h]
-  pand        xmm13,xmm2
-  pand        xmm13,xmm11
-  movdqa      xmm11,[rsp+0C0h]
-  psubw       xmm0,xmm11
-  pabsw       xmm0,xmm0
-  pcmpgtw     xmm3,xmm0
-  pand        xmm3,xmm10
-  movdqa      xmm0,[rsp]
-  psubw       xmm0,[rsp+50h]
-  movdqa      xmm2,[rdx]
-  pabsw       xmm0,xmm0
-  por         xmm7,xmm9
-  movdqa      xmm9,[rsp+20h]
-  pcmpgtw     xmm1,xmm0
-  pand        xmm9,xmm7
-  movdqa      xmm7,[rsp+20h]
-  movdqa      xmm0,xmm7
-  pandn       xmm0,xmm12
-  movdqa      xmm12,[rsp+110h]
-  pand        xmm1,xmm10
-  movdqa      xmm10,[rsp+70h]
-  movdqa      [rsp+40h],xmm1
-  movdqa      xmm1,xmm13
-  por         xmm9,xmm0
-  pxor        xmm0,xmm0
-  por         xmm4,xmm6
-  movdqa      xmm6,xmm7
-  punpckhbw   xmm2,xmm0
-  por         xmm15,xmm5
-  movdqa      xmm5,[rsp+20h]
-  movdqa      xmm0,xmm3
-  psllw       xmm2,1
-  pandn       xmm0,xmm11
-  pand        xmm6,xmm4
-  movdqa      xmm4,[rsp]
-  paddw       xmm2,xmm11
-  pand        xmm5,xmm15
-  movdqa      xmm15,[rsp+20h]
-  paddw       xmm2,xmm11
-  paddw       xmm2,xmm11
-  paddw       xmm2,xmm12
-  paddw       xmm2,[rsp+10h]
-  paddw       xmm2,[rsp]
-  paddw       xmm2,xmm10
-  psraw       xmm2,3
-  pand        xmm2,xmm3
-  por         xmm2,xmm0
-  pand        xmm1,xmm2
-  movdqa      xmm0,xmm13
-  movdqa      xmm2,xmm11
-  pandn       xmm0,xmm11
-  paddw       xmm2,xmm12
-  por         xmm1,xmm0
-  packuswb    xmm9,xmm1
-  movdqa      xmm0,xmm7
-  movdqa      xmm7,[rsp+0A0h]
-  pandn       xmm0,[rsp+0F0h]
-  movdqa      xmm1,xmm3
-  por         xmm6,xmm0
-  movdqa      xmm0,[rsp+10h]
-  paddw       xmm0,xmm4
-  paddw       xmm2,xmm0
-  paddw       xmm2,xmm7
-  movdqa      xmm0,xmm3
-  pandn       xmm0,xmm12
-  psraw       xmm2,2
-  pand        xmm1,xmm2
-  por         xmm1,xmm0
-  movdqa      xmm2,xmm13
-  movdqa      xmm0,xmm13
-  pand        xmm2,xmm1
-  pandn       xmm0,xmm12
-  movdqa      xmm1,xmm12
-  paddw       xmm1,[rsp+10h]
-  por         xmm2,xmm0
-  movdqa      xmm0,xmm15
-  pandn       xmm0,[rsp+0B0h]
-  paddw       xmm1,xmm4
-  packuswb    xmm6,xmm2
-  movdqa      xmm2,xmm3
-  psllw       xmm1,1
-  por         xmm5,xmm0
-  movdqa      xmm0,[rsp+80h]
-  paddw       xmm0,xmm10
-  paddw       xmm1,xmm0
-  paddw       xmm11,xmm1
-  psraw       xmm11,3
-  movdqa      xmm1,xmm12
-  pand        xmm2,xmm11
-  paddw       xmm1,xmm12
-  movdqa      xmm11,[rsp+80h]
-  movdqa      xmm0, [rsp+10h]
-  por         xmm14,[rsp+0E0h]
-  paddw       xmm0,xmm11
-  movdqa      xmm4,xmm15
-  paddw       xmm1,xmm0
-  movdqa      xmm0,xmm13
-  paddw       xmm1,xmm7
-  psraw       xmm1,2
-  pandn       xmm3,xmm1
-  por         xmm2,xmm3
-  movdqa      xmm1,xmm13
-  movdqa      xmm3,[rsp+10h]
-  pandn       xmm0,xmm3
-  pand        xmm1,xmm2
-  movdqa      xmm2,xmm11
-  paddw       xmm2,[rsp]
-  por         xmm1,xmm0
-  movdqa      xmm0,[rsp+0D0h]
-  por         xmm0,xmm8
-  paddw       xmm2,xmm3
-  packuswb    xmm5,xmm1
-  movdqa      xmm8,[rsp+40h]
-  movdqa      xmm1,[rsp+50h]
-  movdqa      xmm3,xmm8
-  pand        xmm4,xmm0
-  psllw       xmm2,1
-  movdqa      xmm0,xmm15
-  pandn       xmm0,[rsp+90h]
-  por         xmm4,xmm0
-  movdqa      xmm0,xmm12
-  paddw       xmm0,xmm10
-  paddw       xmm2,xmm0
-  paddw       xmm1,xmm2
-  movdqa      xmm0,[rsp]
-  movdqa      xmm2,xmm11
-  paddw       xmm0,xmm12
-  movdqa      xmm12,[rsp]
-  paddw       xmm2,xmm11
-  paddw       xmm2,xmm0
-  psraw       xmm1,3
-  movdqa      xmm0,xmm8
-  pand        xmm3,xmm1
-  paddw       xmm2,xmm7
-  movdqa      xmm1,xmm13
-  psraw       xmm2,2
-  pandn       xmm0,xmm2
-  por         xmm3,xmm0
-  movdqa      xmm2,[rsp+50h]
-  movdqa      xmm0,xmm13
-  pandn       xmm0,xmm12
-  pand        xmm1,xmm3
-  paddw       xmm2,xmm11
-  movdqa      xmm3,xmm15
-  por         xmm1,xmm0
-  pand        xmm3,xmm14
-  movdqa      xmm14,[rsp+10h]
-  movdqa      xmm0,xmm15
-  pandn       xmm0,[rsp+30h]
-  packuswb    xmm4,xmm1
-  movdqa      xmm1,xmm8
-  por         xmm3,xmm0
-  movdqa      xmm0,xmm12
-  paddw       xmm0,xmm14
-  paddw       xmm2,xmm0
-  paddw       xmm2,xmm7
-  movdqa      xmm0,xmm8
-  pandn       xmm0,xmm11
-  psraw       xmm2,2
-  pand        xmm1,xmm2
-  por         xmm1,xmm0
-  movdqa      xmm2,xmm13
-  movdqa      xmm0,xmm13
-  pandn       xmm0,xmm11
-  pand        xmm2,xmm1
-  movdqa      xmm1,xmm15
-  por         xmm2,xmm0
-  packuswb    xmm3,xmm2
-  movdqa      xmm0,[rsp+100h]
-  por         xmm0,[rsp+120h]
-  pand        xmm1,xmm0
-  movdqa      xmm2,[rcx+rbp]
-  movdqa      xmm7,[rsp+50h]
-  pandn       xmm15,[rsp+60h]
-  lea         r11,[rsp+1D8h]
-  pxor        xmm0,xmm0
-  por         xmm1,xmm15
-  movaps      xmm15,[r11-0A8h]
-  movdqa      [rdi],xmm9
-  movaps      xmm9,[r11-48h]
-  punpckhbw   xmm2,xmm0
-  psllw       xmm2,1
-  paddw       xmm2,xmm7
-  paddw       xmm2,xmm7
-  movdqa      [rbx],xmm6
-  movaps      xmm6,[r11-18h]
-  paddw       xmm2,xmm7
-  paddw       xmm2,xmm11
-  movaps      xmm11,[r11-68h]
-  paddw       xmm2,xmm12
-  movaps      xmm12,[r11-78h]
-  paddw       xmm2,xmm14
-  paddw       xmm2,xmm10
-  psraw       xmm2,3
-  movaps      xmm10,[r11-58h]
-  movaps      xmm14,[r11-98h]
-  movdqa      xmm0,xmm13
-  pand        xmm2,xmm8
-  pandn       xmm8,xmm7
-  pandn       xmm13,xmm7
-  por         xmm2,xmm8
-  movaps      xmm7,[r11-28h]
-  movaps      xmm8,[r11-38h]
-  movdqa      [r8],xmm5
-  pand        xmm0,xmm2
-  por         xmm0,xmm13
-  packuswb    xmm1,xmm0
-  movaps      xmm13,[r11-88h]
-  movdqa      [rbp],xmm4
-  movdqa      [r10+rbp],xmm3
-  movdqa      [rsi+rbp],xmm1
-  mov         rsp,r11
-  pop         rbp
-  pop         rbx
-  ret
-
-WELS_EXTERN  DeblockChromaLt4V_ssse3
-  mov         rax,rsp
-  push        rbx
-  push        rbp
-  mov         r10,  rdx
-  mov         r11,  rcx
-  mov         rcx,  rdi
-  mov         rdx,  rsi
-  mov         rsi,  r10
-  mov         r10,  r9
-  mov         rbp,  r8
-  mov         r8,   rsi
-  mov         r9,   r11
-  sub         rsp,0C8h
-  pxor        xmm1,xmm1
-  mov         rbx,rcx
-  movsxd      r11,r8d
-  movsx       ecx,byte [r10]
-  movsx       r8d,byte [r10+2]
-  mov         rdi,rdx
-  movq        xmm2,[rbx]
-  movq        xmm9,[r11+rbx]
-  movsx       edx,byte [r10+1]
-  mov         word [rsp+2],cx
-  mov         word [rsp],cx
-  movsx       eax,byte [r10+3]
-  mov         word [rsp+6],dx
-  mov         word [rsp+4],dx
-  movdqa      xmm11,xmm1
-  mov         word [rsp+0Eh],ax
-  mov         word [rsp+0Ch],ax
-  lea         eax,[r11+r11]
-  movsxd      rcx,eax
-  mov         rax,rbx
-  mov         rdx,rdi
-  sub         rax,rcx
-  mov         word [rsp+0Ah],r8w
-  mov         word [rsp+8],r8w
-  movdqa      xmm6,[rsp]
-  movdqa      xmm7,xmm6
-  movq        xmm13, [rax]
-  mov         rax,rdi
-  sub         rax,rcx
-  mov         rcx,rbx
-  pcmpgtw     xmm7,xmm1
-  psubw       xmm11,xmm6
-  sub         rcx,r11
-  sub         rdx,r11
-  movq        xmm0,[rax]
-  movsx       eax,r9w
-  movq        xmm15,[rcx]
-  punpcklqdq  xmm13,xmm0
-  movq        xmm0, [rdx]
-  movdqa      xmm4,xmm13
-  punpcklqdq  xmm15,xmm0
-  movq        xmm0, [rdi]
-  punpcklbw   xmm4,xmm1
-  movdqa      xmm12,xmm15
-  punpcklqdq  xmm2,xmm0
-  movq        xmm0, [r11+rdi]
-  punpcklbw   xmm12,xmm1
-  movdqa      xmm14,xmm2
-  punpcklqdq  xmm9,xmm0
-  punpckhbw   xmm2,xmm1
-  punpcklbw   xmm14,xmm1
-  movd        xmm0,eax
-  mov         eax, ebp ; iBeta
-  punpckhbw   xmm13,xmm1
-  punpckhbw   xmm15,xmm1
-  movdqa      xmm3,xmm9
-  movdqa      [rsp+10h],xmm2
-  punpcklwd   xmm0,xmm0
-  punpckhbw   xmm9,xmm1
-  punpcklbw   xmm3,xmm1
-  movdqa      xmm1,xmm14
-  pshufd      xmm10,xmm0,0
-  movd        xmm0,eax
-  mov         eax,4
-  cwde
-  punpcklwd   xmm0,xmm0
-  pshufd      xmm8,xmm0,0
-  movd        xmm0,eax
-  punpcklwd   xmm0,xmm0
-  pshufd      xmm5,xmm0,0
-  psubw       xmm1,xmm12
-  movdqa      xmm2,xmm10
-  lea         r11,[rsp+0C8h]
-  psllw       xmm1,2
-  movdqa      xmm0,xmm4
-  psubw       xmm4,xmm12
-  psubw       xmm0,xmm3
-  psubw       xmm3,xmm14
-  paddw       xmm1,xmm0
-  paddw       xmm1,xmm5
-  movdqa      xmm0,xmm11
-  psraw       xmm1,3
-  pmaxsw      xmm0,xmm1
-  pminsw      xmm6,xmm0
-  movdqa      xmm1,xmm8
-  movdqa      xmm0,xmm12
-  psubw       xmm0,xmm14
-  pabsw       xmm0,xmm0
-  pcmpgtw     xmm2,xmm0
-  pabsw       xmm0,xmm4
-  pcmpgtw     xmm1,xmm0
-  pabsw       xmm0,xmm3
-  movdqa      xmm3,[rsp]
-  pand        xmm2,xmm1
-  movdqa      xmm1,xmm8
-  pcmpgtw     xmm1,xmm0
-  movdqa      xmm0,xmm13
-  pand        xmm2,xmm1
-  psubw       xmm0,xmm9
-  psubw       xmm13,xmm15
-  pand        xmm2,xmm7
-  pand        xmm6,xmm2
-  paddw       xmm12,xmm6
-  psubw       xmm14,xmm6
-  movdqa      xmm2,[rsp+10h]
-  movaps      xmm6,[r11-18h]
-  movdqa      xmm1,xmm2
-  psubw       xmm1,xmm15
-  psubw       xmm9,xmm2
-  psllw       xmm1,2
-  paddw       xmm1,xmm0
-  paddw       xmm1,xmm5
-  movdqa      xmm0,xmm15
-  psubw       xmm0,xmm2
-  psraw       xmm1,3
-  pmaxsw      xmm11,xmm1
-  pabsw       xmm0,xmm0
-  movdqa      xmm1,xmm8
-  pcmpgtw     xmm10,xmm0
-  pabsw       xmm0,xmm13
-  pminsw      xmm3,xmm11
-  movaps      xmm11,[r11-68h]
-  movaps      xmm13,[rsp+40h]
-  pcmpgtw     xmm1,xmm0
-  pabsw       xmm0,xmm9
-  movaps      xmm9, [r11-48h]
-  pand        xmm10,xmm1
-  pcmpgtw     xmm8,xmm0
-  pand        xmm10,xmm8
-  pand        xmm10,xmm7
-  movaps      xmm8,[r11-38h]
-  movaps      xmm7,[r11-28h]
-  pand        xmm3,xmm10
-  paddw       xmm15,xmm3
-  psubw       xmm2,xmm3
-  movaps      xmm10,[r11-58h]
-  packuswb    xmm12,xmm15
-  movaps      xmm15,[rsp+20h]
-  packuswb    xmm14,xmm2
-  movq        [rcx],xmm12
-  movq        [rbx],xmm14
-  psrldq      xmm12,8
-  psrldq      xmm14,8
-  movq        [rdx],xmm12
-  movaps      xmm12,[r11-78h]
-  movq        [rdi],xmm14
-  movaps      xmm14,[rsp+30h]
-  mov         rsp,r11
-  pop         rbp
-  pop         rbx
-  ret
-
-WELS_EXTERN DeblockChromaEq4V_ssse3
-  mov         rax,rsp
-  push        rbx
-  push        rbp
-
-  mov         rbp, r8
-  mov         r8, rdx
-  mov         r9, rcx
-  mov         rcx, rdi
-  mov         rdx, rsi
-
-  sub         rsp,90h
-  pxor        xmm1,xmm1
-  mov         r11,rcx
-  mov         rbx,rdx
-  mov         r10d,r9d
-  movq        xmm13,[r11]
-  lea         eax,[r8+r8]
-  movsxd      r9,eax
-  mov         rax,rcx
-  sub         rax,r9
-  movq        xmm14,[rax]
-  mov         rax,rdx
-  sub         rax,r9
-  movq        xmm0,[rax]
-  movsxd      rax,r8d
-  sub         rcx,rax
-  sub         rdx,rax
-  movq        xmm12,[rax+r11]
-  movq        xmm10,[rcx]
-  punpcklqdq  xmm14,xmm0
-  movdqa      xmm8,xmm14
-  movq        xmm0,[rdx]
-  punpcklbw   xmm8,xmm1
-  punpckhbw   xmm14,xmm1
-  punpcklqdq  xmm10,xmm0
-  movq        xmm0,[rbx]
-  movdqa      xmm5,xmm10
-  punpcklqdq  xmm13,xmm0
-  movq        xmm0, [rax+rbx]
-  punpcklbw   xmm5,xmm1
-  movsx       eax,r10w
-  movdqa      xmm9,xmm13
-  punpcklqdq  xmm12,xmm0
-  punpcklbw   xmm9,xmm1
-  punpckhbw   xmm10,xmm1
-  movd        xmm0,eax
-  mov         eax, ebp   ; iBeta
-  punpckhbw   xmm13,xmm1
-  movdqa      xmm7,xmm12
-  punpcklwd   xmm0,xmm0
-  punpckhbw   xmm12,xmm1
-  pshufd      xmm11,xmm0,0
-  punpcklbw   xmm7,xmm1
-  movd        xmm0,eax
-  movdqa      xmm1,xmm8
-  psubw       xmm1,xmm5
-  punpcklwd   xmm0,xmm0
-  movdqa      xmm6,xmm11
-  pshufd      xmm3,xmm0,0
-  movdqa      xmm0,xmm5
-  psubw       xmm0,xmm9
-  movdqa      xmm2,xmm3
-  pabsw       xmm0,xmm0
-  pcmpgtw     xmm6,xmm0
-  pabsw       xmm0,xmm1
-  movdqa      xmm1,xmm3
-  pcmpgtw     xmm2,xmm0
-  pand        xmm6,xmm2
-  movdqa      xmm0,xmm7
-  movdqa      xmm2,xmm3
-  psubw       xmm0,xmm9
-  pabsw       xmm0,xmm0
-  pcmpgtw     xmm1,xmm0
-  pand        xmm6,xmm1
-  movdqa      xmm0,xmm10
-  movdqa      xmm1,xmm14
-  psubw       xmm0,xmm13
-  psubw       xmm1,xmm10
-  pabsw       xmm0,xmm0
-  pcmpgtw     xmm11,xmm0
-  pabsw       xmm0,xmm1
-  pcmpgtw     xmm2,xmm0
-  pand        xmm11,xmm2
-  movdqa      xmm0,xmm12
-  movdqa      xmm4,xmm6
-  movdqa      xmm1,xmm8
-  mov         eax,2
-  cwde
-  paddw       xmm1,xmm8
-  psubw       xmm0,xmm13
-  paddw       xmm1,xmm5
-  pabsw       xmm0,xmm0
-  movdqa      xmm2,xmm14
-  paddw       xmm1,xmm7
-  pcmpgtw     xmm3,xmm0
-  paddw       xmm2,xmm14
-  movd        xmm0,eax
-  pand        xmm11,xmm3
-  paddw       xmm7,xmm7
-  paddw       xmm2,xmm10
-  punpcklwd   xmm0,xmm0
-  paddw       xmm2,xmm12
-  paddw       xmm12,xmm12
-  pshufd      xmm3,xmm0,0
-  paddw       xmm7,xmm9
-  paddw       xmm12,xmm13
-  movdqa      xmm0,xmm6
-  paddw       xmm1,xmm3
-  pandn       xmm0,xmm5
-  paddw       xmm7,xmm8
-  psraw       xmm1,2
-  paddw       xmm12,xmm14
-  paddw       xmm7,xmm3
-  ;movaps      xmm14,[rsp]
-  pand        xmm4,xmm1
-  paddw       xmm12,xmm3
-  psraw       xmm7,2
-  movdqa      xmm1,xmm11
-  por         xmm4,xmm0
-  psraw       xmm12,2
-  paddw       xmm2,xmm3
-  movdqa      xmm0,xmm11
-  pandn       xmm0,xmm10
-  psraw       xmm2,2
-  pand        xmm1,xmm2
-  por         xmm1,xmm0
-  packuswb    xmm4,xmm1
-  movdqa      xmm0,xmm11
-  movdqa      xmm1,xmm6
-  pand        xmm1,xmm7
-  movq        [rcx],xmm4
-  pandn       xmm6,xmm9
-  pandn       xmm11,xmm13
-  pand        xmm0,xmm12
-  por         xmm1,xmm6
-  por         xmm0,xmm11
-  psrldq      xmm4,8
-  packuswb    xmm1,xmm0
-  movq        [r11],xmm1
-  psrldq      xmm1,8
-  movq        [rdx],xmm4
-  lea         r11,[rsp+90h]
-  movq        [rbx],xmm1
-  mov         rsp,r11
-  pop         rbp
-  pop         rbx
-  ret
-
-WELS_EXTERN DeblockChromaEq4H_ssse3
-  mov         rax,rsp
-  push        rbx
-  push        rbp
-  push        r12
-
-  mov         rbp,   r8
-  mov         r8,    rdx
-  mov         r9,    rcx
-  mov         rcx,   rdi
-  mov         rdx,   rsi
-  mov         rdi,   rdx
-
-  sub         rsp,140h
-  lea         eax,[r8*4]
-  movsxd      r10,eax
-  mov         eax,[rcx-2]
-  mov         [rsp+10h],eax
-  lea         rbx,[r10+rdx-2]
-  lea         r11,[r10+rcx-2]
-
-  movdqa      xmm5,[rsp+10h]
-  movsxd      r10,r8d
-  mov         eax,[r10+rcx-2]
-  lea         rdx,[r10+r10*2]
-  mov         [rsp+20h],eax
-  mov         eax,[rcx+r10*2-2]
-  mov         [rsp+30h],eax
-  mov         eax,[rdx+rcx-2]
-  movdqa      xmm2,[rsp+20h]
-  mov         [rsp+40h],eax
-  mov         eax, [rdi-2]
-  movdqa      xmm4,[rsp+30h]
-  mov         [rsp+50h],eax
-  mov         eax,[r10+rdi-2]
-  movdqa      xmm3,[rsp+40h]
-  mov         [rsp+60h],eax
-  mov         eax,[rdi+r10*2-2]
-  punpckldq   xmm5,[rsp+50h]
-  mov         [rsp+70h],eax
-  mov         eax, [rdx+rdi-2]
-  punpckldq   xmm2, [rsp+60h]
-  mov          [rsp+80h],eax
-  mov         eax,[r11]
-  punpckldq   xmm4, [rsp+70h]
-  mov         [rsp+50h],eax
-  mov         eax,[rbx]
-  punpckldq   xmm3,[rsp+80h]
-  mov         [rsp+60h],eax
-  mov         eax,[r10+r11]
-  movdqa      xmm0, [rsp+50h]
-  punpckldq   xmm0, [rsp+60h]
-  punpcklqdq  xmm5,xmm0
-  movdqa      [rsp+50h],xmm0
-  mov         [rsp+50h],eax
-  mov         eax,[r10+rbx]
-  movdqa      xmm0,[rsp+50h]
-  movdqa      xmm1,xmm5
-  mov         [rsp+60h],eax
-  mov         eax,[r11+r10*2]
-  punpckldq   xmm0, [rsp+60h]
-  punpcklqdq  xmm2,xmm0
-  punpcklbw   xmm1,xmm2
-  punpckhbw   xmm5,xmm2
-  movdqa      [rsp+50h],xmm0
-  mov         [rsp+50h],eax
-  mov         eax,[rbx+r10*2]
-  movdqa      xmm0,[rsp+50h]
-  mov         [rsp+60h],eax
-  mov         eax, [rdx+r11]
-  movdqa      xmm15,xmm1
-  punpckldq   xmm0,[rsp+60h]
-  punpcklqdq  xmm4,xmm0
-  movdqa      [rsp+50h],xmm0
-  mov         [rsp+50h],eax
-  mov         eax, [rdx+rbx]
-  movdqa      xmm0,[rsp+50h]
-  mov         [rsp+60h],eax
-  punpckldq   xmm0, [rsp+60h]
-  punpcklqdq  xmm3,xmm0
-  movdqa      xmm0,xmm4
-  punpcklbw   xmm0,xmm3
-  punpckhbw   xmm4,xmm3
-  punpcklwd   xmm15,xmm0
-  punpckhwd   xmm1,xmm0
-  movdqa      xmm0,xmm5
-  movdqa      xmm12,xmm15
-  punpcklwd   xmm0,xmm4
-  punpckhwd   xmm5,xmm4
-  punpckldq   xmm12,xmm0
-  punpckhdq   xmm15,xmm0
-  movdqa      xmm0,xmm1
-  movdqa      xmm11,xmm12
-  punpckldq   xmm0,xmm5
-  punpckhdq   xmm1,xmm5
-  punpcklqdq  xmm11,xmm0
-  punpckhqdq  xmm12,xmm0
-  movsx       eax,r9w
-  movdqa      xmm14,xmm15
-  punpcklqdq  xmm14,xmm1
-  punpckhqdq  xmm15,xmm1
-  pxor        xmm1,xmm1
-  movd        xmm0,eax
-  movdqa      xmm4,xmm12
-  movdqa      xmm8,xmm11
-  mov         eax, ebp ; iBeta
-  punpcklwd   xmm0,xmm0
-  punpcklbw   xmm4,xmm1
-  punpckhbw   xmm12,xmm1
-  movdqa      xmm9,xmm14
-  movdqa      xmm7,xmm15
-  movdqa      xmm10,xmm15
-  pshufd      xmm13,xmm0,0
-  punpcklbw   xmm9,xmm1
-  punpckhbw   xmm14,xmm1
-  movdqa      xmm6,xmm13
-  movd        xmm0,eax
-  movdqa      [rsp],xmm11
-  mov         eax,2
-  cwde
-  punpckhbw   xmm11,xmm1
-  punpckhbw   xmm10,xmm1
-  punpcklbw   xmm7,xmm1
-  punpcklwd   xmm0,xmm0
-  punpcklbw   xmm8,xmm1
-  pshufd      xmm3,xmm0,0
-  movdqa      xmm1,xmm8
-  movdqa      xmm0,xmm4
-  psubw       xmm0,xmm9
-  psubw       xmm1,xmm4
-  movdqa      xmm2,xmm3
-  pabsw       xmm0,xmm0
-  pcmpgtw     xmm6,xmm0
-  pabsw       xmm0,xmm1
-  movdqa      xmm1,xmm3
-  pcmpgtw     xmm2,xmm0
-  pand        xmm6,xmm2
-  movdqa      xmm0,xmm7
-  movdqa      xmm2,xmm3
-  psubw       xmm0,xmm9
-  pabsw       xmm0,xmm0
-  pcmpgtw     xmm1,xmm0
-  pand        xmm6,xmm1
-  movdqa      xmm0,xmm12
-  movdqa      xmm1,xmm11
-  psubw       xmm0,xmm14
-  psubw       xmm1,xmm12
-  movdqa      xmm5,xmm6
-  pabsw       xmm0,xmm0
-  pcmpgtw     xmm13,xmm0
-  pabsw       xmm0,xmm1
-  movdqa      xmm1,xmm8
-  pcmpgtw     xmm2,xmm0
-  paddw       xmm1,xmm8
-  movdqa      xmm0,xmm10
-  pand        xmm13,xmm2
-  psubw       xmm0,xmm14
-  paddw       xmm1,xmm4
-  movdqa      xmm2,xmm11
-  pabsw       xmm0,xmm0
-  paddw       xmm2,xmm11
-  paddw       xmm1,xmm7
-  pcmpgtw     xmm3,xmm0
-  paddw       xmm2,xmm12
-  movd        xmm0,eax
-  pand        xmm13,xmm3
-  paddw       xmm2,xmm10
-  punpcklwd   xmm0,xmm0
-  pshufd      xmm3,xmm0,0
-  movdqa      xmm0,xmm6
-  paddw       xmm1,xmm3
-  pandn       xmm0,xmm4
-  paddw       xmm2,xmm3
-  psraw       xmm1,2
-  pand        xmm5,xmm1
-  por         xmm5,xmm0
-  paddw       xmm7,xmm7
-  paddw       xmm10,xmm10
-  psraw       xmm2,2
-  movdqa      xmm1,xmm13
-  movdqa      xmm0,xmm13
-  pandn       xmm0,xmm12
-  pand        xmm1,xmm2
-  paddw       xmm7,xmm9
-  por         xmm1,xmm0
-  paddw       xmm10,xmm14
-  paddw       xmm7,xmm8
-  movdqa      xmm0,xmm13
-  packuswb    xmm5,xmm1
-  paddw       xmm7,xmm3
-  paddw       xmm10,xmm11
-  movdqa      xmm1,xmm6
-  paddw       xmm10,xmm3
-  pandn       xmm6,xmm9
-  psraw       xmm7,2
-  pand        xmm1,xmm7
-  psraw       xmm10,2
-  pandn       xmm13,xmm14
-  pand        xmm0,xmm10
-  por         xmm1,xmm6
-  movdqa      xmm6,[rsp]
-  movdqa      xmm4,xmm6
-  por         xmm0,xmm13
-  punpcklbw   xmm4,xmm5
-  punpckhbw   xmm6,xmm5
-  movdqa      xmm3,xmm4
-  packuswb    xmm1,xmm0
-  movdqa      xmm0,xmm1
-  punpckhbw   xmm1,xmm15
-  punpcklbw   xmm0,xmm15
-  punpcklwd   xmm3,xmm0
-  punpckhwd   xmm4,xmm0
-  movdqa      xmm0,xmm6
-  movdqa      xmm2,xmm3
-  punpcklwd   xmm0,xmm1
-  punpckhwd   xmm6,xmm1
-  movdqa      xmm1,xmm4
-  punpckldq   xmm2,xmm0
-  punpckhdq   xmm3,xmm0
-  punpckldq   xmm1,xmm6
-  movdqa      xmm0,xmm2
-  punpcklqdq  xmm0,xmm1
-  punpckhdq   xmm4,xmm6
-  punpckhqdq  xmm2,xmm1
-  movdqa      [rsp+10h],xmm0
-  movdqa      [rsp+60h],xmm2
-  movdqa      xmm0,xmm3
-  mov         eax,[rsp+10h]
-  mov         [rcx-2],eax
-  mov         eax,[rsp+60h]
-  punpcklqdq  xmm0,xmm4
-  punpckhqdq  xmm3,xmm4
-  mov         [r10+rcx-2],eax
-  movdqa      [rsp+20h],xmm0
-  mov         eax, [rsp+20h]
-  movdqa      [rsp+70h],xmm3
-  mov         [rcx+r10*2-2],eax
-  mov         eax,[rsp+70h]
-  mov         [rdx+rcx-2],eax
-  mov         eax,[rsp+18h]
-  mov         [r11],eax
-  mov         eax,[rsp+68h]
-  mov         [r10+r11],eax
-  mov         eax,[rsp+28h]
-  mov         [r11+r10*2],eax
-  mov         eax,[rsp+78h]
-  mov         [rdx+r11],eax
-  mov         eax,[rsp+14h]
-  mov         [rdi-2],eax
-  mov         eax,[rsp+64h]
-  mov         [r10+rdi-2],eax
-  mov         eax,[rsp+24h]
-  mov         [rdi+r10*2-2],eax
-  mov         eax, [rsp+74h]
-  mov         [rdx+rdi-2],eax
-  mov         eax, [rsp+1Ch]
-  mov         [rbx],eax
-  mov         eax, [rsp+6Ch]
-  mov         [r10+rbx],eax
-  mov         eax,[rsp+2Ch]
-  mov         [rbx+r10*2],eax
-  mov         eax,[rsp+7Ch]
-  mov         [rdx+rbx],eax
-  lea         r11,[rsp+140h]
-  mov         rbx, [r11+28h]
-  mov         rsp,r11
-  pop         r12
-  pop         rbp
-  pop         rbx
-  ret
-
-
-WELS_EXTERN DeblockChromaLt4H_ssse3
-  mov         rax,rsp
-  push        rbx
-  push        rbp
-  push        r12
-  push        r13
-  push        r14
-  sub         rsp,170h
-
-  mov         r13,   r8
-  mov         r14,   r9
-  mov         r8,    rdx
-  mov         r9,    rcx
-  mov         rdx,   rdi
-  mov         rcx,   rsi
-
-  movsxd      rsi,r8d
-  lea         eax,[r8*4]
-  mov         r11d,r9d
-  movsxd      r10,eax
-  mov         eax, [rcx-2]
-  mov         r12,rdx
-  mov         [rsp+40h],eax
-  mov         eax, [rsi+rcx-2]
-  lea         rbx,[r10+rcx-2]
-  movdqa      xmm5,[rsp+40h]
-  mov         [rsp+50h],eax
-  mov         eax, [rcx+rsi*2-2]
-  lea         rbp,[r10+rdx-2]
-  movdqa      xmm2, [rsp+50h]
-  mov         [rsp+60h],eax
-  lea         r10,[rsi+rsi*2]
-  mov         rdi,rcx
-  mov         eax,[r10+rcx-2]
-  movdqa      xmm4,[rsp+60h]
-  mov         [rsp+70h],eax
-  mov         eax,[rdx-2]
-  mov         [rsp+80h],eax
-  mov         eax, [rsi+rdx-2]
-  movdqa      xmm3,[rsp+70h]
-  mov         [rsp+90h],eax
-  mov         eax,[rdx+rsi*2-2]
-  punpckldq   xmm5,[rsp+80h]
-  mov         [rsp+0A0h],eax
-  mov         eax, [r10+rdx-2]
-  punpckldq   xmm2,[rsp+90h]
-  mov         [rsp+0B0h],eax
-  mov         eax, [rbx]
-  punpckldq   xmm4,[rsp+0A0h]
-  mov         [rsp+80h],eax
-  mov         eax,[rbp]
-  punpckldq   xmm3,[rsp+0B0h]
-  mov         [rsp+90h],eax
-  mov         eax,[rsi+rbx]
-  movdqa      xmm0,[rsp+80h]
-  punpckldq   xmm0,[rsp+90h]
-  punpcklqdq  xmm5,xmm0
-  movdqa      [rsp+80h],xmm0
-  mov         [rsp+80h],eax
-  mov         eax,[rsi+rbp]
-  movdqa      xmm0,[rsp+80h]
-  movdqa      xmm1,xmm5
-  mov         [rsp+90h],eax
-  mov         eax,[rbx+rsi*2]
-  punpckldq   xmm0,[rsp+90h]
-  punpcklqdq  xmm2,xmm0
-  punpcklbw   xmm1,xmm2
-  punpckhbw   xmm5,xmm2
-  movdqa      [rsp+80h],xmm0
-  mov         [rsp+80h],eax
-  mov         eax,[rbp+rsi*2]
-  movdqa      xmm0, [rsp+80h]
-  mov         [rsp+90h],eax
-  mov         eax,[r10+rbx]
-  movdqa      xmm7,xmm1
-  punpckldq   xmm0,[rsp+90h]
-  punpcklqdq  xmm4,xmm0
-  movdqa      [rsp+80h],xmm0
-  mov         [rsp+80h],eax
-  mov         eax, [r10+rbp]
-  movdqa      xmm0,[rsp+80h]
-  mov         [rsp+90h],eax
-  punpckldq   xmm0,[rsp+90h]
-  punpcklqdq  xmm3,xmm0
-  movdqa      xmm0,xmm4
-  punpcklbw   xmm0,xmm3
-  punpckhbw   xmm4,xmm3
-  punpcklwd   xmm7,xmm0
-  punpckhwd   xmm1,xmm0
-  movdqa      xmm0,xmm5
-  movdqa      xmm6,xmm7
-  punpcklwd   xmm0,xmm4
-  punpckhwd   xmm5,xmm4
-  punpckldq   xmm6,xmm0
-  punpckhdq   xmm7,xmm0
-  movdqa      xmm0,xmm1
-  punpckldq   xmm0,xmm5
-  mov         rax, r14    ; pTC
-  punpckhdq   xmm1,xmm5
-  movdqa      xmm9,xmm6
-  punpckhqdq  xmm6,xmm0
-  punpcklqdq  xmm9,xmm0
-  movdqa      xmm2,xmm7
-  movdqa      xmm13,xmm6
-  movdqa      xmm4,xmm9
-  movdqa      [rsp+10h],xmm9
-  punpcklqdq  xmm2,xmm1
-  punpckhqdq  xmm7,xmm1
-  pxor        xmm1,xmm1
-  movsx       ecx,byte [rax+3]
-  movsx       edx,byte [rax+2]
-  movsx       r8d,byte [rax+1]
-  movsx       r9d,byte [rax]
-  movdqa      xmm10,xmm1
-  movdqa      xmm15,xmm2
-  punpckhbw   xmm2,xmm1
-  punpckhbw   xmm6,xmm1
-  punpcklbw   xmm4,xmm1
-  movsx       eax,r11w
-  mov         word [rsp+0Eh],cx
-  mov         word [rsp+0Ch],cx
-  movdqa      xmm3,xmm7
-  movdqa      xmm8,xmm7
-  movdqa      [rsp+20h],xmm7
-  punpcklbw   xmm15,xmm1
-  punpcklbw   xmm13,xmm1
-  punpcklbw   xmm3,xmm1
-  mov         word [rsp+0Ah],dx
-  mov         word [rsp+8],dx
-  mov         word [rsp+6],r8w
-  movd        xmm0,eax
-  movdqa      [rsp+30h],xmm6
-  punpckhbw   xmm9,xmm1
-  punpckhbw   xmm8,xmm1
-  punpcklwd   xmm0,xmm0
-  mov         eax, r13d   ; iBeta
-  mov         word [rsp+4],r8w
-  mov         word [rsp+2],r9w
-  pshufd      xmm12,xmm0,0
-  mov         word [rsp],r9w
-  movd        xmm0,eax
-  mov         eax,4
-  cwde
-  movdqa      xmm14, [rsp]
-  movdqa      [rsp],xmm2
-  movdqa      xmm2,xmm12
-  punpcklwd   xmm0,xmm0
-  pshufd      xmm11,xmm0,0
-  psubw       xmm10,xmm14
-  movd        xmm0,eax
-  movdqa      xmm7,xmm14
-  movdqa      xmm6,xmm14
-  pcmpgtw     xmm7,xmm1
-  punpcklwd   xmm0,xmm0
-  pshufd      xmm5,xmm0,0
-  movdqa      xmm0,xmm4
-  movdqa      xmm1,xmm15
-  psubw       xmm4,xmm13
-  psubw       xmm0,xmm3
-  psubw       xmm1,xmm13
-  psubw       xmm3,xmm15
-  psllw       xmm1,2
-  paddw       xmm1,xmm0
-  paddw       xmm1,xmm5
-  movdqa      xmm0,xmm10
-  psraw       xmm1,3
-  pmaxsw      xmm0,xmm1
-  pminsw      xmm6,xmm0
-  movdqa      xmm1,xmm11
-  movdqa      xmm0,xmm13
-  psubw       xmm0,xmm15
-  pabsw       xmm0,xmm0
-  pcmpgtw     xmm2,xmm0
-  pabsw       xmm0,xmm4
-  pcmpgtw     xmm1,xmm0
-  pabsw       xmm0,xmm3
-  pand        xmm2,xmm1
-  movdqa      xmm1,xmm11
-  movdqa      xmm3,[rsp+30h]
-  pcmpgtw     xmm1,xmm0
-  movdqa      xmm0,xmm9
-  pand        xmm2,xmm1
-  psubw       xmm0,xmm8
-  psubw       xmm9,xmm3
-  pand        xmm2,xmm7
-  pand        xmm6,xmm2
-  psubw       xmm15,xmm6
-  paddw       xmm13,xmm6
-  movdqa      xmm2,[rsp]
-  movdqa      xmm1,xmm2
-  psubw       xmm1,xmm3
-  psubw       xmm8,xmm2
-  psllw       xmm1,2
-  paddw       xmm1,xmm0
-  paddw       xmm1,xmm5
-  movdqa      xmm0,xmm3
-  movdqa      xmm5,[rsp+10h]
-  psubw       xmm0,xmm2
-  psraw       xmm1,3
-  movdqa      xmm4,xmm5
-  pabsw       xmm0,xmm0
-  pmaxsw      xmm10,xmm1
-  movdqa      xmm1,xmm11
-  pcmpgtw     xmm12,xmm0
-  pabsw       xmm0,xmm9
-  pminsw      xmm14,xmm10
-  pcmpgtw     xmm1,xmm0
-  pabsw       xmm0,xmm8
-  pcmpgtw     xmm11,xmm0
-  pand        xmm12,xmm1
-  movdqa      xmm1,[rsp+20h]
-  pand        xmm12,xmm11
-  pand        xmm12,xmm7
-  pand        xmm14,xmm12
-  paddw       xmm3,xmm14
-  psubw       xmm2,xmm14
-  packuswb    xmm13,xmm3
-  packuswb    xmm15,xmm2
-  punpcklbw   xmm4,xmm13
-  punpckhbw   xmm5,xmm13
-  movdqa      xmm0,xmm15
-  punpcklbw   xmm0,xmm1
-  punpckhbw   xmm15,xmm1
-  movdqa      xmm3,xmm4
-  punpcklwd   xmm3,xmm0
-  punpckhwd   xmm4,xmm0
-  movdqa      xmm0,xmm5
-  movdqa      xmm2,xmm3
-  movdqa      xmm1,xmm4
-  punpcklwd   xmm0,xmm15
-  punpckhwd   xmm5,xmm15
-  punpckldq   xmm2,xmm0
-  punpckhdq   xmm3,xmm0
-  punpckldq   xmm1,xmm5
-  movdqa      xmm0,xmm2
-  punpcklqdq  xmm0,xmm1
-  punpckhdq   xmm4,xmm5
-  punpckhqdq  xmm2,xmm1
-  movdqa      [rsp+40h],xmm0
-  movdqa      xmm0,xmm3
-  movdqa      [rsp+90h],xmm2
-  mov         eax,[rsp+40h]
-  mov         [rdi-2],eax
-  mov         eax, [rsp+90h]
-  punpcklqdq  xmm0,xmm4
-  punpckhqdq  xmm3,xmm4
-  mov         [rsi+rdi-2],eax
-  movdqa      [rsp+50h],xmm0
-  mov         eax,[rsp+50h]
-  movdqa      [rsp+0A0h],xmm3
-  mov         [rdi+rsi*2-2],eax
-  mov         eax,[rsp+0A0h]
-  mov         [r10+rdi-2],eax
-  mov         eax,[rsp+48h]
-  mov         [rbx],eax
-  mov         eax,[rsp+98h]
-  mov         [rsi+rbx],eax
-  mov         eax,[rsp+58h]
-  mov         [rbx+rsi*2],eax
-  mov         eax, [rsp+0A8h]
-  mov         [r10+rbx],eax
-  mov         eax, [rsp+44h]
-  mov         [r12-2],eax
-  mov         eax,[rsp+94h]
-  mov         [rsi+r12-2],eax
-  mov         eax,[rsp+54h]
-  mov         [r12+rsi*2-2],eax
-  mov         eax, [rsp+0A4h]
-  mov         [r10+r12-2],eax
-  mov         eax,[rsp+4Ch]
-  mov         [rbp],eax
-  mov         eax,[rsp+9Ch]
-  mov         [rsi+rbp],eax
-  mov         eax, [rsp+5Ch]
-  mov         [rbp+rsi*2],eax
-  mov         eax,[rsp+0ACh]
-  mov         [r10+rbp],eax
-  lea         r11,[rsp+170h]
-  mov         rsp,r11
-  pop         r14
-  pop         r13
-  pop         r12
-  pop         rbp
-  pop         rbx
-  ret
-
-
-
-%elifdef  X86_32
-
-;********************************************************************************
-;  void DeblockChromaEq4V_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
-;                             int32_t iAlpha, int32_t iBeta)
-;********************************************************************************
-WELS_EXTERN   DeblockChromaEq4V_ssse3
-  push        ebp
-  mov         ebp,esp
-  and         esp,0FFFFFFF0h
-  sub         esp,68h
-  mov         edx,[ebp+10h]      ;  iStride
-  mov         eax,[ebp+8]        ;  pPixCb
-  mov         ecx,[ebp+0Ch]      ;  pPixCr
-  movq        xmm4,[ecx]
-  movq        xmm5,[edx+ecx]
-  push        esi
-  push        edi
-  lea         esi,[edx+edx]
-  mov         edi,eax
-  sub         edi,esi
-  movq        xmm1,[edi]
-  mov         edi,ecx
-  sub         edi,esi
-  movq        xmm2,[edi]
-  punpcklqdq  xmm1,xmm2
-  mov         esi,eax
-  sub         esi,edx
-  movq        xmm2,[esi]
-  mov         edi,ecx
-  sub         edi,edx
-  movq        xmm3,[edi]
-  punpcklqdq  xmm2,xmm3
-  movq        xmm3,[eax]
-  punpcklqdq  xmm3,xmm4
-  movq        xmm4,[edx+eax]
-  mov       edx, [ebp + 14h]
-  punpcklqdq  xmm4,xmm5
-  movd        xmm5,edx
-  mov       edx, [ebp + 18h]
-  pxor        xmm0,xmm0
-  movdqa      xmm6,xmm5
-  punpcklwd   xmm6,xmm5
-  pshufd      xmm5,xmm6,0
-  movd        xmm6,edx
-  movdqa      xmm7,xmm6
-  punpcklwd   xmm7,xmm6
-  pshufd      xmm6,xmm7,0
-  movdqa      xmm7,xmm1
-  punpckhbw   xmm1,xmm0
-  punpcklbw   xmm7,xmm0
-  movdqa      [esp+40h],xmm1
-  movdqa      [esp+60h],xmm7
-  movdqa      xmm7,xmm2
-  punpcklbw   xmm7,xmm0
-  movdqa      [esp+10h],xmm7
-  movdqa      xmm7,xmm3
-  punpcklbw   xmm7,xmm0
-  punpckhbw   xmm3,xmm0
-  movdqa      [esp+50h],xmm7
-  movdqa      xmm7,xmm4
-  punpckhbw   xmm4,xmm0
-  punpckhbw   xmm2,xmm0
-  punpcklbw   xmm7,xmm0
-  movdqa      [esp+30h],xmm3
-  movdqa      xmm3,[esp+10h]
-  movdqa      xmm1,xmm3
-  psubw       xmm1,[esp+50h]
-  pabsw       xmm1,xmm1
-  movdqa      [esp+20h],xmm4
-  movdqa      xmm0,xmm5
-  pcmpgtw     xmm0,xmm1
-  movdqa      xmm1,[esp+60h]
-  psubw       xmm1,xmm3
-  pabsw       xmm1,xmm1
-  movdqa      xmm4,xmm6
-  pcmpgtw     xmm4,xmm1
-  pand        xmm0,xmm4
-  movdqa      xmm1,xmm7
-  psubw       xmm1,[esp+50h]
-  pabsw       xmm1,xmm1
-  movdqa      xmm4,xmm6
-  pcmpgtw     xmm4,xmm1
-  movdqa      xmm1,xmm2
-  psubw       xmm1,[esp+30h]
-  pabsw       xmm1,xmm1
-  pcmpgtw     xmm5,xmm1
-  movdqa      xmm1,[esp+40h]
-  pand        xmm0,xmm4
-  psubw       xmm1,xmm2
-  pabsw       xmm1,xmm1
-  movdqa      xmm4,xmm6
-  pcmpgtw     xmm4,xmm1
-  movdqa      xmm1,[esp+20h]
-  psubw       xmm1,[esp+30h]
-  pand        xmm5,xmm4
-  pabsw       xmm1,xmm1
-  pcmpgtw     xmm6,xmm1
-  pand        xmm5,xmm6
-  mov         edx,2
-  movsx       edx,dx
-  movd        xmm1,edx
-  movdqa      xmm4,xmm1
-  punpcklwd   xmm4,xmm1
-  pshufd      xmm1,xmm4,0
-  movdqa      xmm4,[esp+60h]
-  movdqa      xmm6,xmm4
-  paddw       xmm6,xmm4
-  paddw       xmm6,xmm3
-  paddw       xmm6,xmm7
-  movdqa      [esp+10h],xmm1
-  paddw       xmm6,[esp+10h]
-  psraw       xmm6,2
-  movdqa      xmm4,xmm0
-  pandn       xmm4,xmm3
-  movdqa      xmm3,[esp+40h]
-  movdqa      xmm1,xmm0
-  pand        xmm1,xmm6
-  por         xmm1,xmm4
-  movdqa      xmm6,xmm3
-  paddw       xmm6,xmm3
-  movdqa      xmm3,[esp+10h]
-  paddw       xmm6,xmm2
-  paddw       xmm6,[esp+20h]
-  paddw       xmm6,xmm3
-  psraw       xmm6,2
-  movdqa      xmm4,xmm5
-  pand        xmm4,xmm6
-  movdqa      xmm6,xmm5
-  pandn       xmm6,xmm2
-  por         xmm4,xmm6
-  packuswb    xmm1,xmm4
-  movdqa      xmm4,[esp+50h]
-  movdqa      xmm6,xmm7
-  paddw       xmm6,xmm7
-  paddw       xmm6,xmm4
-  paddw       xmm6,[esp+60h]
-  paddw       xmm6,xmm3
-  psraw       xmm6,2
-  movdqa      xmm2,xmm0
-  pand        xmm2,xmm6
-  pandn       xmm0,xmm4
-  por         xmm2,xmm0
-  movdqa      xmm0,[esp+20h]
-  movdqa      xmm6,xmm0
-  paddw       xmm6,xmm0
-  movdqa      xmm0,[esp+30h]
-  paddw       xmm6,xmm0
-  paddw       xmm6,[esp+40h]
-  movdqa      xmm4,xmm5
-  paddw       xmm6,xmm3
-  movq        [esi],xmm1
-  psraw       xmm6,2
-  pand        xmm4,xmm6
-  pandn       xmm5,xmm0
-  por         xmm4,xmm5
-  packuswb    xmm2,xmm4
-  movq        [eax],xmm2
-  psrldq      xmm1,8
-  movq        [edi],xmm1
-  pop         edi
-  psrldq      xmm2,8
-  movq        [ecx],xmm2
-  pop         esi
-  mov         esp,ebp
-  pop         ebp
-  ret
-
-;******************************************************************************
-; void DeblockChromaLt4V_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
-;                           int32_t iAlpha, int32_t iBeta, int8_t * pTC);
-;*******************************************************************************
-
-WELS_EXTERN  DeblockChromaLt4V_ssse3
-  push        ebp
-  mov         ebp,esp
-  and         esp,0FFFFFFF0h
-  sub         esp,0E4h
-  push        ebx
-  push        esi
-  mov         esi, [ebp+1Ch]      ;  pTC
-  movsx       ebx, byte [esi+2]
-  push        edi
-  movsx       di,byte [esi+3]
-  mov         word [esp+0Ch],bx
-  movsx       bx,byte  [esi+1]
-  movsx       esi,byte  [esi]
-  mov         word  [esp+0Eh],si
-  movzx       esi,di
-  movd        xmm1,esi
-  movzx       esi,di
-  movd        xmm2,esi
-  mov         si,word  [esp+0Ch]
-  mov         edx, [ebp + 10h]
-  mov         eax, [ebp + 08h]
-  movzx       edi,si
-  movzx       esi,si
-  mov         ecx, [ebp + 0Ch]
-  movd        xmm4,esi
-  movzx       esi,bx
-  movd        xmm5,esi
-  movd        xmm3,edi
-  movzx       esi,bx
-  movd        xmm6,esi
-  mov         si,word [esp+0Eh]
-  movzx       edi,si
-  movzx       esi,si
-  punpcklwd   xmm6,xmm2
-  pxor        xmm0,xmm0
-  movdqa      [esp+40h],xmm0
-  movd        xmm7,edi
-  movd        xmm0,esi
-  lea         esi,[edx+edx]
-  mov         edi,eax
-  sub         edi,esi
-  punpcklwd   xmm5,xmm1
-  movdqa      xmm1,[esp+40h]
-  punpcklwd   xmm0,xmm4
-  movq        xmm4,[edx+ecx]
-  punpcklwd   xmm7,xmm3
-  movq        xmm3,[eax]
-  punpcklwd   xmm0,xmm6
-  movq        xmm6,[edi]
-  punpcklwd   xmm7,xmm5
-  punpcklwd   xmm0,xmm7
-  mov         edi,ecx
-  sub         edi,esi
-  movdqa      xmm2,xmm1
-  psubw       xmm2,xmm0
-  movdqa      [esp+60h],xmm2
-  movq        xmm2, [edi]
-  punpcklqdq  xmm6,xmm2
-  mov         esi,eax
-  sub         esi,edx
-  movq        xmm7,[esi]
-  mov         edi,ecx
-  sub         edi,edx
-  movq        xmm2,[edi]
-  punpcklqdq  xmm7,xmm2
-  movq        xmm2,[ecx]
-  punpcklqdq  xmm3,xmm2
-  movq        xmm2,[edx+eax]
-  movsx       edx,word [ebp + 14h]
-  punpcklqdq  xmm2,xmm4
-  movdqa      [esp+0E0h],xmm2
-  movd        xmm2,edx
-  movsx       edx,word [ebp + 18h]
-  movdqa      xmm4,xmm2
-  punpcklwd   xmm4,xmm2
-  movd        xmm2,edx
-  movdqa      xmm5,xmm2
-  punpcklwd   xmm5,xmm2
-  pshufd      xmm2,xmm5,0
-  movdqa      [esp+50h],xmm2
-  movdqa      xmm2,xmm6
-  punpcklbw   xmm2,xmm1
-  movdqa      [esp+0D0h],xmm3
-  pshufd      xmm4,xmm4,0
-  movdqa      [esp+30h],xmm2
-  punpckhbw   xmm6,xmm1
-  movdqa      [esp+80h],xmm6
-  movdqa      xmm6,[esp+0D0h]
-  punpckhbw   xmm6,xmm1
-  movdqa      [esp+70h],xmm6
-  movdqa      xmm6, [esp+0E0h]
-  punpckhbw   xmm6,xmm1
-  movdqa     [esp+90h],xmm6
-  movdqa      xmm5, [esp+0E0h]
-  movdqa      xmm2,xmm7
-  punpckhbw   xmm7,xmm1
-  punpcklbw   xmm5,xmm1
-  movdqa       [esp+0A0h],xmm7
-  punpcklbw   xmm3,xmm1
-  mov         edx,4
-  punpcklbw   xmm2,xmm1
-  movsx       edx,dx
-  movd        xmm6,edx
-  movdqa      xmm7,xmm6
-  punpcklwd   xmm7,xmm6
-  pshufd      xmm6,xmm7,0
-  movdqa      xmm7,[esp+30h]
-  movdqa      [esp+20h],xmm6
-  psubw       xmm7,xmm5
-  movdqa      xmm6,xmm0
-  pcmpgtw     xmm6,xmm1
-  movdqa      xmm1,[esp+60h]
-  movdqa      [esp+40h],xmm6
-  movdqa      xmm6,xmm3
-  psubw       xmm6,xmm2
-  psllw       xmm6,2
-  paddw       xmm6,xmm7
-  paddw       xmm6, [esp+20h]
-  movdqa      xmm7, [esp+50h]
-  psraw       xmm6,3
-  pmaxsw      xmm1,xmm6
-  movdqa      [esp+10h],xmm0
-  movdqa      xmm6, [esp+10h]
-  pminsw      xmm6,xmm1
-  movdqa      [esp+10h],xmm6
-  movdqa      xmm1,xmm2
-  psubw       xmm1,xmm3
-  pabsw       xmm1,xmm1
-  movdqa      xmm6,xmm4
-  pcmpgtw     xmm6,xmm1
-  movdqa      xmm1, [esp+30h]
-  psubw       xmm1,xmm2
-  pabsw       xmm1,xmm1
-  pcmpgtw     xmm7,xmm1
-  movdqa      xmm1,[esp+50h]
-  pand        xmm6,xmm7
-  movdqa      xmm7,[esp+50h]
-  psubw       xmm5,xmm3
-  pabsw       xmm5,xmm5
-  pcmpgtw     xmm1,xmm5
-  movdqa      xmm5,[esp+80h]
-  psubw       xmm5,[esp+90h]
-  pand        xmm6,xmm1
-  pand        xmm6,[esp+40h]
-  movdqa      xmm1,[esp+10h]
-  pand        xmm1,xmm6
-  movdqa      xmm6,[esp+70h]
-  movdqa      [esp+30h],xmm1
-  movdqa      xmm1,[esp+0A0h]
-  psubw       xmm6,xmm1
-  psllw       xmm6,2
-  paddw       xmm6,xmm5
-  paddw       xmm6,[esp+20h]
-  movdqa      xmm5,[esp+60h]
-  psraw       xmm6,3
-  pmaxsw      xmm5,xmm6
-  pminsw      xmm0,xmm5
-  movdqa      xmm5,[esp+70h]
-  movdqa      xmm6,xmm1
-  psubw       xmm6,xmm5
-  pabsw       xmm6,xmm6
-  pcmpgtw     xmm4,xmm6
-  movdqa      xmm6,[esp+80h]
-  psubw       xmm6,xmm1
-  pabsw       xmm6,xmm6
-  pcmpgtw     xmm7,xmm6
-  movdqa      xmm6,[esp+90h]
-  pand        xmm4,xmm7
-  movdqa      xmm7,[esp+50h]
-  psubw       xmm6,xmm5
-  pabsw       xmm6,xmm6
-  pcmpgtw     xmm7,xmm6
-  pand        xmm4,xmm7
-  pand        xmm4,[esp+40h]
-  pand        xmm0,xmm4
-  movdqa      xmm4,[esp+30h]
-  paddw       xmm2,xmm4
-  paddw       xmm1,xmm0
-  packuswb    xmm2,xmm1
-  movq        [esi],xmm2
-  psubw       xmm3,xmm4
-  psubw       xmm5,xmm0
-  packuswb    xmm3,xmm5
-  movq        [eax],xmm3
-  psrldq      xmm2,8
-  movq        [edi],xmm2
-  pop         edi
-  pop         esi
-  psrldq      xmm3,8
-  movq        [ecx],xmm3
-  pop         ebx
-  mov         esp,ebp
-  pop         ebp
-  ret
-
-;***************************************************************************
-;  void DeblockChromaEq4H_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
-;          int32_t iAlpha, int32_t iBeta)
-;***************************************************************************
-
-WELS_EXTERN     DeblockChromaEq4H_ssse3
-  push        ebp
-  mov         ebp,esp
-  and         esp,0FFFFFFF0h
-  sub         esp,0C8h
-  mov         ecx,dword [ebp+8]
-  mov         edx,dword [ebp+0Ch]
-  mov         eax,dword [ebp+10h]
-  sub         ecx,2
-  sub         edx,2
-  push        esi
-  lea         esi,[eax+eax*2]
-  mov         dword [esp+18h],ecx
-  mov         dword [esp+4],edx
-  lea         ecx,[ecx+eax*4]
-  lea         edx,[edx+eax*4]
-  lea         eax,[esp+7Ch]
-  push        edi
-  mov         dword [esp+14h],esi
-  mov         dword [esp+18h],ecx
-  mov         dword [esp+0Ch],edx
-  mov         dword [esp+10h],eax
-  mov         esi,dword [esp+1Ch]
-  mov         ecx,dword [ebp+10h]
-  mov         edx,dword [esp+14h]
-  movd        xmm0,dword [esi]
-  movd        xmm1,dword [esi+ecx]
-  movd        xmm2,dword [esi+ecx*2]
-  movd        xmm3,dword [esi+edx]
-  mov         esi,dword  [esp+8]
-  movd        xmm4,dword [esi]
-  movd        xmm5,dword [esi+ecx]
-  movd        xmm6,dword [esi+ecx*2]
-  movd        xmm7,dword [esi+edx]
-  punpckldq   xmm0,xmm4
-  punpckldq   xmm1,xmm5
-  punpckldq   xmm2,xmm6
-  punpckldq   xmm3,xmm7
-  mov         esi,dword [esp+18h]
-  mov         edi,dword [esp+0Ch]
-  movd        xmm4,dword [esi]
-  movd        xmm5,dword [edi]
-  punpckldq   xmm4,xmm5
-  punpcklqdq  xmm0,xmm4
-  movd        xmm4,dword [esi+ecx]
-  movd        xmm5,dword [edi+ecx]
-  punpckldq   xmm4,xmm5
-  punpcklqdq  xmm1,xmm4
-  movd        xmm4,dword [esi+ecx*2]
-  movd        xmm5,dword [edi+ecx*2]
-  punpckldq   xmm4,xmm5
-  punpcklqdq  xmm2,xmm4
-  movd        xmm4,dword [esi+edx]
-  movd        xmm5,dword [edi+edx]
-  punpckldq   xmm4,xmm5
-  punpcklqdq  xmm3,xmm4
-  movdqa      xmm6,xmm0
-  punpcklbw   xmm0,xmm1
-  punpckhbw   xmm6,xmm1
-  movdqa      xmm7,xmm2
-  punpcklbw   xmm2,xmm3
-  punpckhbw   xmm7,xmm3
-  movdqa      xmm4,xmm0
-  movdqa      xmm5,xmm6
-  punpcklwd   xmm0,xmm2
-  punpckhwd   xmm4,xmm2
-  punpcklwd   xmm6,xmm7
-  punpckhwd   xmm5,xmm7
-  movdqa      xmm1,xmm0
-  movdqa      xmm2,xmm4
-  punpckldq   xmm0,xmm6
-  punpckhdq   xmm1,xmm6
-  punpckldq   xmm4,xmm5
-  punpckhdq   xmm2,xmm5
-  movdqa      xmm5,xmm0
-  movdqa      xmm6,xmm1
-  punpcklqdq  xmm0,xmm4
-  punpckhqdq  xmm5,xmm4
-  punpcklqdq  xmm1,xmm2
-  punpckhqdq  xmm6,xmm2
-  mov         edi,dword [esp+10h]
-  movdqa      [edi],xmm0
-  movdqa      [edi+10h],xmm5
-  movdqa      [edi+20h],xmm1
-  movdqa      [edi+30h],xmm6
-  movsx       ecx,word [ebp+14h]
-  movsx       edx,word [ebp+18h]
-  movdqa      xmm6,[esp+80h]
-  movdqa      xmm4,[esp+90h]
-  movdqa      xmm5,[esp+0A0h]
-  movdqa      xmm7,[esp+0B0h]
-  pxor        xmm0,xmm0
-  movd        xmm1,ecx
-  movdqa      xmm2,xmm1
-  punpcklwd   xmm2,xmm1
-  pshufd      xmm1,xmm2,0
-  movd        xmm2,edx
-  movdqa      xmm3,xmm2
-  punpcklwd   xmm3,xmm2
-  pshufd      xmm2,xmm3,0
-  movdqa      xmm3,xmm6
-  punpckhbw   xmm6,xmm0
-  movdqa      [esp+60h],xmm6
-  movdqa      xmm6,[esp+90h]
-  punpckhbw   xmm6,xmm0
-  movdqa      [esp+30h],xmm6
-  movdqa      xmm6,[esp+0A0h]
-  punpckhbw   xmm6,xmm0
-  movdqa      [esp+40h],xmm6
-  movdqa      xmm6,[esp+0B0h]
-  punpckhbw   xmm6,xmm0
-  movdqa      [esp+70h],xmm6
-  punpcklbw   xmm7,xmm0
-  punpcklbw   xmm4,xmm0
-  punpcklbw   xmm5,xmm0
-  punpcklbw   xmm3,xmm0
-  movdqa      [esp+50h],xmm7
-  movdqa      xmm6,xmm4
-  psubw       xmm6,xmm5
-  pabsw       xmm6,xmm6
-  movdqa      xmm0,xmm1
-  pcmpgtw     xmm0,xmm6
-  movdqa      xmm6,xmm3
-  psubw       xmm6,xmm4
-  pabsw       xmm6,xmm6
-  movdqa      xmm7,xmm2
-  pcmpgtw     xmm7,xmm6
-  movdqa      xmm6,[esp+50h]
-  psubw       xmm6,xmm5
-  pabsw       xmm6,xmm6
-  pand        xmm0,xmm7
-  movdqa      xmm7,xmm2
-  pcmpgtw     xmm7,xmm6
-  movdqa      xmm6,[esp+30h]
-  psubw       xmm6,[esp+40h]
-  pabsw       xmm6,xmm6
-  pcmpgtw     xmm1,xmm6
-  movdqa      xmm6,[esp+60h]
-  psubw       xmm6,[esp+30h]
-  pabsw       xmm6,xmm6
-  pand        xmm0,xmm7
-  movdqa      xmm7,xmm2
-  pcmpgtw     xmm7,xmm6
-  movdqa      xmm6,[esp+70h]
-  psubw       xmm6,[esp+40h]
-  pabsw       xmm6,xmm6
-  pand        xmm1,xmm7
-  pcmpgtw     xmm2,xmm6
-  pand        xmm1,xmm2
-  mov         eax,2
-  movsx       ecx,ax
-  movd        xmm2,ecx
-  movdqa      xmm6,xmm2
-  punpcklwd   xmm6,xmm2
-  pshufd      xmm2,xmm6,0
-  movdqa      [esp+20h],xmm2
-  movdqa      xmm2,xmm3
-  paddw       xmm2,xmm3
-  paddw       xmm2,xmm4
-  paddw       xmm2,[esp+50h]
-  paddw       xmm2,[esp+20h]
-  psraw       xmm2,2
-  movdqa      xmm6,xmm0
-  pand        xmm6,xmm2
-  movdqa      xmm2,xmm0
-  pandn       xmm2,xmm4
-  por         xmm6,xmm2
-  movdqa      xmm2,[esp+60h]
-  movdqa      xmm7,xmm2
-  paddw       xmm7,xmm2
-  paddw       xmm7,[esp+30h]
-  paddw       xmm7,[esp+70h]
-  paddw       xmm7,[esp+20h]
-  movdqa      xmm4,xmm1
-  movdqa      xmm2,xmm1
-  pandn       xmm2,[esp+30h]
-  psraw       xmm7,2
-  pand        xmm4,xmm7
-  por         xmm4,xmm2
-  movdqa      xmm2,[esp+50h]
-  packuswb    xmm6,xmm4
-  movdqa      [esp+90h],xmm6
-  movdqa      xmm6,xmm2
-  paddw       xmm6,xmm2
-  movdqa      xmm2,[esp+20h]
-  paddw       xmm6,xmm5
-  paddw       xmm6,xmm3
-  movdqa      xmm4,xmm0
-  pandn       xmm0,xmm5
-  paddw       xmm6,xmm2
-  psraw       xmm6,2
-  pand        xmm4,xmm6
-  por         xmm4,xmm0
-  movdqa      xmm0,[esp+70h]
-  movdqa      xmm5,xmm0
-  paddw       xmm5,xmm0
-  movdqa      xmm0,[esp+40h]
-  paddw       xmm5,xmm0
-  paddw       xmm5,[esp+60h]
-  movdqa      xmm3,xmm1
-  paddw       xmm5,xmm2
-  psraw       xmm5,2
-  pand        xmm3,xmm5
-  pandn       xmm1,xmm0
-  por         xmm3,xmm1
-  packuswb    xmm4,xmm3
-  movdqa      [esp+0A0h],xmm4
-  mov         esi,dword [esp+10h]
-  movdqa      xmm0,[esi]
-  movdqa      xmm1,[esi+10h]
-  movdqa      xmm2,[esi+20h]
-  movdqa      xmm3,[esi+30h]
-  movdqa      xmm6,xmm0
-  punpcklbw   xmm0,xmm1
-  punpckhbw   xmm6,xmm1
-  movdqa      xmm7,xmm2
-  punpcklbw   xmm2,xmm3
-  punpckhbw   xmm7,xmm3
-  movdqa      xmm4,xmm0
-  movdqa      xmm5,xmm6
-  punpcklwd   xmm0,xmm2
-  punpckhwd   xmm4,xmm2
-  punpcklwd   xmm6,xmm7
-  punpckhwd   xmm5,xmm7
-  movdqa      xmm1,xmm0
-  movdqa      xmm2,xmm4
-  punpckldq   xmm0,xmm6
-  punpckhdq   xmm1,xmm6
-  punpckldq   xmm4,xmm5
-  punpckhdq   xmm2,xmm5
-  movdqa      xmm5,xmm0
-  movdqa      xmm6,xmm1
-  punpcklqdq  xmm0,xmm4
-  punpckhqdq  xmm5,xmm4
-  punpcklqdq  xmm1,xmm2
-  punpckhqdq  xmm6,xmm2
-  mov         esi,dword [esp+1Ch]
-  mov         ecx,dword [ebp+10h]
-  mov         edx,dword [esp+14h]
-  mov         edi,dword [esp+8]
-  movd        dword [esi],xmm0
-  movd        dword [esi+ecx],xmm5
-  movd        dword [esi+ecx*2],xmm1
-  movd        dword [esi+edx],xmm6
-  psrldq      xmm0,4
-  psrldq      xmm5,4
-  psrldq      xmm1,4
-  psrldq      xmm6,4
-  mov         esi,dword [esp+18h]
-  movd        dword [edi],xmm0
-  movd        dword [edi+ecx],xmm5
-  movd        dword [edi+ecx*2],xmm1
-  movd        dword [edi+edx],xmm6
-  psrldq      xmm0,4
-  psrldq      xmm5,4
-  psrldq      xmm1,4
-  psrldq      xmm6,4
-  movd        dword [esi],xmm0
-  movd        dword [esi+ecx],xmm5
-  movd        dword [esi+ecx*2],xmm1
-  movd        dword [esi+edx],xmm6
-  psrldq      xmm0,4
-  psrldq      xmm5,4
-  psrldq      xmm1,4
-  psrldq      xmm6,4
-  mov         edi,dword [esp+0Ch]
-  movd        dword [edi],xmm0
-  movd        dword [edi+ecx],xmm5
-  movd        dword [edi+ecx*2],xmm1
-  movd        dword [edi+edx],xmm6
-  pop         edi
-  pop         esi
-  mov         esp,ebp
-  pop         ebp
-  ret
-
-;*******************************************************************************
-;    void DeblockChromaLt4H_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
-;                                int32_t iAlpha, int32_t iBeta, int8_t * pTC);
-;*******************************************************************************
-
-WELS_EXTERN  DeblockChromaLt4H_ssse3
-  push        ebp
-  mov         ebp,esp
-  and         esp,0FFFFFFF0h
-  sub         esp,108h
-  mov         ecx,dword [ebp+8]
-  mov         edx,dword [ebp+0Ch]
-  mov         eax,dword [ebp+10h]
-  sub         ecx,2
-  sub         edx,2
-  push        esi
-  lea         esi,[eax+eax*2]
-  mov         dword [esp+10h],ecx
-  mov         dword [esp+4],edx
-  lea         ecx,[ecx+eax*4]
-  lea         edx,[edx+eax*4]
-  lea         eax,[esp+6Ch]
-  push        edi
-  mov         dword [esp+0Ch],esi
-  mov         dword [esp+18h],ecx
-  mov         dword [esp+10h],edx
-  mov         dword [esp+1Ch],eax
-  mov         esi,dword [esp+14h]
-  mov         ecx,dword [ebp+10h]
-  mov         edx,dword [esp+0Ch]
-  movd        xmm0,dword [esi]
-  movd        xmm1,dword [esi+ecx]
-  movd        xmm2,dword [esi+ecx*2]
-  movd        xmm3,dword [esi+edx]
-  mov         esi,dword [esp+8]
-  movd        xmm4,dword [esi]
-  movd        xmm5,dword [esi+ecx]
-  movd        xmm6,dword [esi+ecx*2]
-  movd        xmm7,dword [esi+edx]
-  punpckldq   xmm0,xmm4
-  punpckldq   xmm1,xmm5
-  punpckldq   xmm2,xmm6
-  punpckldq   xmm3,xmm7
-  mov         esi,dword [esp+18h]
-  mov         edi,dword [esp+10h]
-  movd        xmm4,dword [esi]
-  movd        xmm5,dword [edi]
-  punpckldq   xmm4,xmm5
-  punpcklqdq  xmm0,xmm4
-  movd        xmm4,dword [esi+ecx]
-  movd        xmm5,dword [edi+ecx]
-  punpckldq   xmm4,xmm5
-  punpcklqdq  xmm1,xmm4
-  movd        xmm4,dword [esi+ecx*2]
-  movd        xmm5,dword [edi+ecx*2]
-  punpckldq   xmm4,xmm5
-  punpcklqdq  xmm2,xmm4
-  movd        xmm4,dword [esi+edx]
-  movd        xmm5,dword [edi+edx]
-  punpckldq   xmm4,xmm5
-  punpcklqdq  xmm3,xmm4
-  movdqa      xmm6,xmm0
-  punpcklbw   xmm0,xmm1
-  punpckhbw   xmm6,xmm1
-  movdqa      xmm7,xmm2
-  punpcklbw   xmm2,xmm3
-  punpckhbw   xmm7,xmm3
-  movdqa      xmm4,xmm0
-  movdqa      xmm5,xmm6
-  punpcklwd   xmm0,xmm2
-  punpckhwd   xmm4,xmm2
-  punpcklwd   xmm6,xmm7
-  punpckhwd   xmm5,xmm7
-  movdqa      xmm1,xmm0
-  movdqa      xmm2,xmm4
-  punpckldq   xmm0,xmm6
-  punpckhdq   xmm1,xmm6
-  punpckldq   xmm4,xmm5
-  punpckhdq   xmm2,xmm5
-  movdqa      xmm5,xmm0
-  movdqa      xmm6,xmm1
-  punpcklqdq  xmm0,xmm4
-  punpckhqdq  xmm5,xmm4
-  punpcklqdq  xmm1,xmm2
-  punpckhqdq  xmm6,xmm2
-  mov         edi,dword [esp+1Ch]
-  movdqa      [edi],xmm0
-  movdqa      [edi+10h],xmm5
-  movdqa      [edi+20h],xmm1
-  movdqa      [edi+30h],xmm6
-  mov         eax,dword [ebp+1Ch]
-  movsx       cx,byte [eax+3]
-  movsx       dx,byte [eax+2]
-  movsx       si,byte [eax+1]
-  movsx       ax,byte [eax]
-  movzx       edi,cx
-  movzx       ecx,cx
-  movd        xmm2,ecx
-  movzx       ecx,dx
-  movzx       edx,dx
-  movd        xmm3,ecx
-  movd        xmm4,edx
-  movzx       ecx,si
-  movzx       edx,si
-  movd        xmm5,ecx
-  pxor        xmm0,xmm0
-  movd        xmm6,edx
-  movzx       ecx,ax
-  movdqa      [esp+60h],xmm0
-  movzx       edx,ax
-  movsx       eax,word [ebp+14h]
-  punpcklwd   xmm6,xmm2
-  movd        xmm1,edi
-  movd        xmm7,ecx
-  movsx       ecx,word [ebp+18h]
-  movd        xmm0,edx
-  punpcklwd   xmm7,xmm3
-  punpcklwd   xmm5,xmm1
-  movdqa      xmm1,[esp+60h]
-  punpcklwd   xmm7,xmm5
-  movdqa      xmm5,[esp+0A0h]
-  punpcklwd   xmm0,xmm4
-  punpcklwd   xmm0,xmm6
-  movdqa      xmm6, [esp+70h]
-  punpcklwd   xmm0,xmm7
-  movdqa      xmm7,[esp+80h]
-  movdqa      xmm2,xmm1
-  psubw       xmm2,xmm0
-  movdqa      [esp+0D0h],xmm2
-  movd        xmm2,eax
-  movdqa      xmm3,xmm2
-  punpcklwd   xmm3,xmm2
-  pshufd      xmm4,xmm3,0
-  movd        xmm2,ecx
-  movdqa      xmm3,xmm2
-  punpcklwd   xmm3,xmm2
-  pshufd      xmm2,xmm3,0
-  movdqa      xmm3, [esp+90h]
-  movdqa      [esp+50h],xmm2
-  movdqa      xmm2,xmm6
-  punpcklbw   xmm2,xmm1
-  punpckhbw   xmm6,xmm1
-  movdqa      [esp+40h],xmm2
-  movdqa      [esp+0B0h],xmm6
-  movdqa      xmm6,[esp+90h]
-  movdqa      xmm2,xmm7
-  punpckhbw   xmm7,xmm1
-  punpckhbw   xmm6,xmm1
-  punpcklbw   xmm2,xmm1
-  punpcklbw   xmm3,xmm1
-  punpcklbw   xmm5,xmm1
-  movdqa      [esp+0F0h],xmm7
-  movdqa      [esp+0C0h],xmm6
-  movdqa      xmm6, [esp+0A0h]
-  punpckhbw   xmm6,xmm1
-  movdqa      [esp+0E0h],xmm6
-  mov         edx,4
-  movsx       eax,dx
-  movd        xmm6,eax
-  movdqa      xmm7,xmm6
-  punpcklwd   xmm7,xmm6
-  pshufd      xmm6,xmm7,0
-  movdqa      [esp+30h],xmm6
-  movdqa      xmm7, [esp+40h]
-  psubw       xmm7,xmm5
-  movdqa      xmm6,xmm0
-  pcmpgtw     xmm6,xmm1
-  movdqa      [esp+60h],xmm6
-  movdqa      xmm1, [esp+0D0h]
-  movdqa      xmm6,xmm3
-  psubw       xmm6,xmm2
-  psllw       xmm6,2
-  paddw       xmm6,xmm7
-  paddw       xmm6,[esp+30h]
-  psraw       xmm6,3
-  pmaxsw      xmm1,xmm6
-  movdqa      xmm7,[esp+50h]
-  movdqa      [esp+20h],xmm0
-  movdqa      xmm6, [esp+20h]
-  pminsw      xmm6,xmm1
-  movdqa      [esp+20h],xmm6
-  movdqa      xmm6,xmm4
-  movdqa      xmm1,xmm2
-  psubw       xmm1,xmm3
-  pabsw       xmm1,xmm1
-  pcmpgtw     xmm6,xmm1
-  movdqa      xmm1, [esp+40h]
-  psubw       xmm1,xmm2
-  pabsw       xmm1,xmm1
-  pcmpgtw     xmm7,xmm1
-  movdqa      xmm1, [esp+50h]
-  pand        xmm6,xmm7
-  movdqa      xmm7, [esp+50h]
-  psubw       xmm5,xmm3
-  pabsw       xmm5,xmm5
-  pcmpgtw     xmm1,xmm5
-  movdqa      xmm5, [esp+0B0h]
-  psubw       xmm5,[esp+0E0h]
-  pand        xmm6,xmm1
-  pand        xmm6, [esp+60h]
-  movdqa      xmm1, [esp+20h]
-  pand        xmm1,xmm6
-  movdqa      xmm6, [esp+0C0h]
-  movdqa      [esp+40h],xmm1
-  movdqa      xmm1, [esp+0F0h]
-  psubw       xmm6,xmm1
-  psllw       xmm6,2
-  paddw       xmm6,xmm5
-  paddw       xmm6, [esp+30h]
-  movdqa      xmm5, [esp+0D0h]
-  psraw       xmm6,3
-  pmaxsw      xmm5,xmm6
-  pminsw      xmm0,xmm5
-  movdqa      xmm5,[esp+0C0h]
-  movdqa      xmm6,xmm1
-  psubw       xmm6,xmm5
-  pabsw       xmm6,xmm6
-  pcmpgtw     xmm4,xmm6
-  movdqa      xmm6,[esp+0B0h]
-  psubw       xmm6,xmm1
-  pabsw       xmm6,xmm6
-  pcmpgtw     xmm7,xmm6
-  movdqa      xmm6, [esp+0E0h]
-  pand        xmm4,xmm7
-  movdqa      xmm7, [esp+50h]
-  psubw       xmm6,xmm5
-  pabsw       xmm6,xmm6
-  pcmpgtw     xmm7,xmm6
-  pand        xmm4,xmm7
-  pand        xmm4,[esp+60h]
-  pand        xmm0,xmm4
-  movdqa      xmm4, [esp+40h]
-  paddw       xmm2,xmm4
-  paddw       xmm1,xmm0
-  psubw       xmm3,xmm4
-  psubw       xmm5,xmm0
-  packuswb    xmm2,xmm1
-  packuswb    xmm3,xmm5
-  movdqa      [esp+80h],xmm2
-  movdqa      [esp+90h],xmm3
-  mov         esi,dword [esp+1Ch]
-  movdqa      xmm0, [esi]
-  movdqa      xmm1, [esi+10h]
-  movdqa      xmm2, [esi+20h]
-  movdqa      xmm3, [esi+30h]
-  movdqa      xmm6,xmm0
-  punpcklbw   xmm0,xmm1
-  punpckhbw   xmm6,xmm1
-  movdqa      xmm7,xmm2
-  punpcklbw   xmm2,xmm3
-  punpckhbw   xmm7,xmm3
-  movdqa      xmm4,xmm0
-  movdqa      xmm5,xmm6
-  punpcklwd   xmm0,xmm2
-  punpckhwd   xmm4,xmm2
-  punpcklwd   xmm6,xmm7
-  punpckhwd   xmm5,xmm7
-  movdqa      xmm1,xmm0
-  movdqa      xmm2,xmm4
-  punpckldq   xmm0,xmm6
-  punpckhdq   xmm1,xmm6
-  punpckldq   xmm4,xmm5
-  punpckhdq   xmm2,xmm5
-  movdqa      xmm5,xmm0
-  movdqa      xmm6,xmm1
-  punpcklqdq  xmm0,xmm4
-  punpckhqdq  xmm5,xmm4
-  punpcklqdq  xmm1,xmm2
-  punpckhqdq  xmm6,xmm2
-  mov         esi,dword [esp+14h]
-  mov         ecx,dword [ebp+10h]
-  mov         edx,dword [esp+0Ch]
-  mov         edi,dword [esp+8]
-  movd        dword [esi],xmm0
-  movd        dword [esi+ecx],xmm5
-  movd        dword [esi+ecx*2],xmm1
-  movd        dword [esi+edx],xmm6
-  psrldq      xmm0,4
-  psrldq      xmm5,4
-  psrldq      xmm1,4
-  psrldq      xmm6,4
-  mov         esi,dword [esp+18h]
-  movd        dword [edi],xmm0
-  movd        dword [edi+ecx],xmm5
-  movd        dword [edi+ecx*2],xmm1
-  movd        dword [edi+edx],xmm6
-  psrldq      xmm0,4
-  psrldq      xmm5,4
-  psrldq      xmm1,4
-  psrldq      xmm6,4
-  movd        dword [esi],xmm0
-  movd        dword [esi+ecx],xmm5
-  movd        dword [esi+ecx*2],xmm1
-  movd        dword [esi+edx],xmm6
-  psrldq      xmm0,4
-  psrldq      xmm5,4
-  psrldq      xmm1,4
-  psrldq      xmm6,4
-  mov         edi,dword [esp+10h]
-  movd        dword [edi],xmm0
-  movd        dword [edi+ecx],xmm5
-  movd        dword [edi+ecx*2],xmm1
-  movd        dword [edi+edx],xmm6
-  pop         edi
-  pop         esi
-  mov         esp,ebp
-  pop         ebp
-  ret
-
-
-
-;*******************************************************************************
-;    void DeblockLumaLt4V_ssse3(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
-;                                 int32_t iBeta, int8_t * pTC)
-;*******************************************************************************
-
-
-WELS_EXTERN  DeblockLumaLt4V_ssse3
-    push	ebp
-	mov	ebp, esp
-	and	esp, -16				; fffffff0H
-	sub	esp, 420				; 000001a4H
-	mov	eax, dword [ebp+8]
-	mov	ecx, dword [ebp+12]
-
-	pxor	xmm0, xmm0
-	push	ebx
-	mov	edx, dword [ebp+24]
-	movdqa	[esp+424-384], xmm0
-	push	esi
-
-	lea	esi, [ecx+ecx*2]
-	push	edi
-	mov	edi, eax
-	sub	edi, esi
-	movdqa	xmm0, [edi]
-
-	lea	esi, [ecx+ecx]
-	movdqa	[esp+432-208], xmm0
-	mov	edi, eax
-	sub	edi, esi
-	movdqa	xmm0, [edi]
-	movdqa	[esp+448-208], xmm0
-
-	mov	ebx, eax
-	sub	ebx, ecx
-	movdqa	xmm0, [ebx]
-	movdqa	[esp+464-208], xmm0
-
-	movdqa	xmm0, [eax]
-
-	add	ecx, eax
-	movdqa	[esp+480-208], xmm0
-	movdqa	xmm0, [ecx]
-	mov	dword [esp+432-404], ecx
-
-	movsx	ecx, word [ebp+16]
-	movdqa	[esp+496-208], xmm0
-	movdqa	xmm0, [esi+eax]
-
-	movsx	si, byte [edx]
-	movdqa	[esp+512-208], xmm0
-	movd	xmm0, ecx
-	movsx	ecx, word [ebp+20]
-	movdqa	xmm1, xmm0
-	punpcklwd xmm1, xmm0
-	pshufd	xmm0, xmm1, 0
-	movdqa	[esp+432-112], xmm0
-	movd	xmm0, ecx
-	movsx	cx, byte [edx+1]
-	movdqa	xmm1, xmm0
-	punpcklwd xmm1, xmm0
-	mov	dword [esp+432-408], ebx
-	movzx	ebx, cx
-	pshufd	xmm0, xmm1, 0
-	movd	xmm1, ebx
-	movzx	ebx, cx
-	movd	xmm2, ebx
-	movzx	ebx, cx
-	movzx	ecx, cx
-	movd	xmm4, ecx
-	movzx	ecx, si
-	movd	xmm5, ecx
-	movzx	ecx, si
-	movd	xmm6, ecx
-	movzx	ecx, si
-	movd	xmm7, ecx
-	movzx	ecx, si
-	movdqa	[esp+432-336], xmm0
-	movd	xmm0, ecx
-
-	movsx	cx, byte [edx+3]
-	movsx	dx, byte [edx+2]
-	movd	xmm3, ebx
-	punpcklwd xmm0, xmm4
-	movzx	esi, cx
-	punpcklwd xmm6, xmm2
-	punpcklwd xmm5, xmm1
-	punpcklwd xmm0, xmm6
-	punpcklwd xmm7, xmm3
-	punpcklwd xmm7, xmm5
-	punpcklwd xmm0, xmm7
-	movdqa	[esp+432-400], xmm0
-	movd	xmm0, esi
-	movzx	esi, cx
-	movd	xmm2, esi
-	movzx	esi, cx
-	movzx	ecx, cx
-	movd	xmm4, ecx
-	movzx	ecx, dx
-	movd	xmm3, esi
-	movd	xmm5, ecx
-	punpcklwd xmm5, xmm0
-
-	movdqa	xmm0, [esp+432-384]
-	movzx	ecx, dx
-	movd	xmm6, ecx
-	movzx	ecx, dx
-	movzx	edx, dx
-	punpcklwd xmm6, xmm2
-	movd	xmm7, ecx
-	movd	xmm1, edx
-
-	movdqa	xmm2, [esp+448-208]
-	punpcklbw xmm2, xmm0
-
-	mov	ecx, 4
-	movsx	edx, cx
-	punpcklwd xmm7, xmm3
-	punpcklwd xmm7, xmm5
-	movdqa	xmm5, [esp+496-208]
-	movdqa	xmm3, [esp+464-208]
-	punpcklbw xmm5, xmm0
-	movdqa	[esp+432-240], xmm5
-	movdqa	xmm5, [esp+512-208]
-	punpcklbw xmm5, xmm0
-	movdqa	[esp+432-352], xmm5
-	punpcklwd xmm1, xmm4
-	movdqa	xmm4, [esp+432-208]
-	punpcklwd xmm1, xmm6
-	movdqa	xmm6, [esp+480-208]
-	punpcklwd xmm1, xmm7
-	punpcklbw xmm6, xmm0
-	punpcklbw xmm3, xmm0
-	punpcklbw xmm4, xmm0
-	movdqa	xmm7, xmm3
-	psubw	xmm7, xmm4
-	pabsw	xmm7, xmm7
-	movdqa	[esp+432-272], xmm4
-	movdqa	xmm4, [esp+432-336]
-	movdqa	xmm5, xmm4
-	pcmpgtw	xmm5, xmm7
-	movdqa	[esp+432-288], xmm5
-	movdqa	xmm7, xmm6
-	psubw	xmm7, [esp+432-352]
-	pabsw	xmm7, xmm7
-	movdqa	xmm5, xmm4
-	pcmpgtw	xmm5, xmm7
-	movdqa	[esp+432-256], xmm5
-	movdqa	xmm5, xmm3
-	pavgw	xmm5, xmm6
-	movdqa	[esp+432-304], xmm5
-	movdqa	xmm5, [esp+432-400]
-	psubw	xmm5, [esp+432-288]
-	psubw	xmm5, [esp+432-256]
-	movdqa	[esp+432-224], xmm5
-	movdqa	xmm5, xmm6
-	psubw	xmm5, xmm3
-	movdqa	[esp+432-32], xmm6
-	psubw	xmm6, [esp+432-240]
-	movdqa	xmm7, xmm5
-	movdqa	[esp+432-384], xmm5
-	movdqa	xmm5, [esp+432-112]
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm5, xmm7
-	pabsw	xmm6, xmm6
-	movdqa	xmm7, xmm4
-	pcmpgtw	xmm7, xmm6
-
-	pand	xmm5, xmm7
-	movdqa	xmm6, xmm3
-	psubw	xmm6, xmm2
-	pabsw	xmm6, xmm6
-	movdqa	xmm7, xmm4
-	pcmpgtw	xmm7, xmm6
-	movdqa	xmm6, [esp+432-400]
-	pand	xmm5, xmm7
-	movdqa	xmm7, xmm6
-	pcmpeqw	xmm6, xmm0
-	pcmpgtw	xmm7, xmm0
-	por	xmm7, xmm6
-	pand	xmm5, xmm7
-	movdqa	[esp+432-320], xmm5
-	movd	xmm5, edx
-	movdqa	xmm6, xmm5
-	punpcklwd xmm6, xmm5
-	pshufd	xmm5, xmm6, 0
-	movdqa	[esp+432-336], xmm5
-	movdqa	xmm5, [esp+432-224]
-	movdqa	[esp+432-368], xmm5
-	movdqa	xmm6, xmm0
-	psubw	xmm6, xmm5
-	movdqa	xmm5, [esp+432-384]
-	psllw	xmm5, 2
-	movdqa	xmm7, xmm2
-	psubw	xmm7, [esp+432-240]
-	paddw	xmm7, xmm5
-	paddw	xmm7, [esp+432-336]
-	movdqa	xmm5, [esp+432-368]
-	psraw	xmm7, 3
-	pmaxsw	xmm6, xmm7
-	pminsw	xmm5, xmm6
-
-	pand	xmm5, [esp+432-320]
-	movdqa	xmm6, [esp+432-400]
-	movdqa	[esp+432-64], xmm5
-	movdqa	[esp+432-384], xmm6
-	movdqa	xmm5, xmm0
-	psubw	xmm5, xmm6
-	movdqa	[esp+432-368], xmm5
-	movdqa	xmm6, xmm5
-	movdqa	xmm5, [esp+432-272]
-	paddw	xmm5, [esp+432-304]
-	movdqa	xmm7, xmm2
-	paddw	xmm7, xmm2
-	psubw	xmm5, xmm7
-	psraw	xmm5, 1
-	pmaxsw	xmm6, xmm5
-	movdqa	xmm5, [esp+432-384]
-	pminsw	xmm5, xmm6
-
-	pand	xmm5, [esp+432-320]
-	pand	xmm5, [esp+432-288]
-	movdqa	xmm6, [esp+432-240]
-	movdqa	[esp+432-96], xmm5
-	movdqa	xmm5, [esp+432-352]
-	paddw	xmm5, [esp+432-304]
-	movdqa	xmm7, xmm6
-	paddw	xmm7, xmm6
-	movdqa	xmm6, [esp+432-368]
-	psubw	xmm5, xmm7
-
-	movdqa	xmm7, [esp+496-208]
-	psraw	xmm5, 1
-	pmaxsw	xmm6, xmm5
-	movdqa	xmm5, [esp+432-400]
-	pminsw	xmm5, xmm6
-	pand	xmm5, [esp+432-320]
-	pand	xmm5, [esp+432-256]
-	movdqa	xmm6, [esp+448-208]
-	punpckhbw xmm7, xmm0
-	movdqa	[esp+432-352], xmm7
-
-	movdqa	xmm7, [esp+512-208]
-	punpckhbw xmm6, xmm0
-	movdqa	[esp+432-48], xmm5
-	movdqa	xmm5, [esp+432-208]
-	movdqa	[esp+432-368], xmm6
-	movdqa	xmm6, [esp+464-208]
-	punpckhbw xmm7, xmm0
-	punpckhbw xmm5, xmm0
-	movdqa	[esp+432-384], xmm7
-	punpckhbw xmm6, xmm0
-	movdqa	[esp+432-400], xmm6
-
-	movdqa	xmm7, [esp+432-400]
-	movdqa	xmm6, [esp+480-208]
-	psubw	xmm7, xmm5
-	movdqa	[esp+432-16], xmm5
-	pabsw	xmm7, xmm7
-	punpckhbw xmm6, xmm0
-	movdqa	xmm5, xmm4
-	pcmpgtw	xmm5, xmm7
-	movdqa	[esp+432-288], xmm5
-
-	movdqa	xmm7, xmm6
-	psubw	xmm7, [esp+432-384]
-	pabsw	xmm7, xmm7
-	movdqa	xmm5, xmm4
-	pcmpgtw	xmm5, xmm7
-	movdqa	[esp+432-256], xmm5
-
-	movdqa	xmm5, [esp+432-400]
-	movdqa	[esp+432-80], xmm6
-	pavgw	xmm5, xmm6
-	movdqa	[esp+432-304], xmm5
-
-	movdqa	xmm5, xmm1
-	psubw	xmm5, [esp+432-288]
-	psubw	xmm5, [esp+432-256]
-	movdqa	[esp+432-224], xmm5
-	movdqa	xmm5, xmm6
-	psubw	xmm5, [esp+432-400]
-	psubw	xmm6, [esp+432-352]
-	movdqa	[esp+432-272], xmm5
-	movdqa	xmm7, xmm5
-	movdqa	xmm5, [esp+432-112]
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm5, xmm7
-	movdqa	xmm7, xmm4
-	pabsw	xmm6, xmm6
-	pcmpgtw	xmm7, xmm6
-	movdqa	xmm6, [esp+432-368]
-
-	pand	xmm5, xmm7
-	movdqa	xmm7, [esp+432-400]
-	psubw	xmm7, xmm6
-	psubw	xmm6, [esp+432-352]
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm4, xmm7
-	pand	xmm5, xmm4
-
-	paddw	xmm2, [esp+432-96]
-	movdqa	xmm4, xmm1
-	pcmpgtw	xmm4, xmm0
-	movdqa	xmm7, xmm1
-	pcmpeqw	xmm7, xmm0
-	por	xmm4, xmm7
-	pand	xmm5, xmm4
-	movdqa	xmm4, [esp+432-224]
-	movdqa	[esp+432-320], xmm5
-	movdqa	xmm5, [esp+432-272]
-	movdqa	xmm7, xmm0
-	psubw	xmm7, xmm4
-	psubw	xmm0, xmm1
-	psllw	xmm5, 2
-	paddw	xmm6, xmm5
-	paddw	xmm6, [esp+432-336]
-	movdqa	xmm5, [esp+432-368]
-	movdqa	[esp+432-336], xmm0
-	psraw	xmm6, 3
-	pmaxsw	xmm7, xmm6
-	pminsw	xmm4, xmm7
-	pand	xmm4, [esp+432-320]
-	movdqa	xmm6, xmm0
-	movdqa	xmm0, [esp+432-16]
-	paddw	xmm0, [esp+432-304]
-	movdqa	[esp+432-272], xmm4
-	movdqa	xmm4, [esp+432-368]
-	paddw	xmm4, xmm4
-	psubw	xmm0, xmm4
-
-	movdqa	xmm4, [esp+432-64]
-	psraw	xmm0, 1
-	pmaxsw	xmm6, xmm0
-	movdqa	xmm0, [esp+432-400]
-	movdqa	xmm7, xmm1
-	pminsw	xmm7, xmm6
-	movdqa	xmm6, [esp+432-320]
-	pand	xmm7, xmm6
-	pand	xmm7, [esp+432-288]
-	paddw	xmm5, xmm7
-	packuswb xmm2, xmm5
-	movdqa	xmm5, [esp+432-272]
-	paddw	xmm0, xmm5
-	paddw	xmm3, xmm4
-	packuswb xmm3, xmm0
-
-	movdqa	xmm0, [esp+432-32]
-	psubw	xmm0, xmm4
-	movdqa	xmm4, [esp+432-80]
-	psubw	xmm4, xmm5
-
-	movdqa	xmm5, [esp+432-240]
-	paddw	xmm5, [esp+432-48]
-	packuswb xmm0, xmm4
-	movdqa	xmm4, [esp+432-384]
-	paddw	xmm4, [esp+432-304]
-	movdqa	[esp+480-208], xmm0
-	movdqa	xmm0, [esp+432-352]
-	movdqa	xmm7, xmm0
-	paddw	xmm0, xmm0
-
-	mov	ecx, dword [esp+432-408]
-
-	mov	edx, dword [esp+432-404]
-	psubw	xmm4, xmm0
-	movdqa	xmm0, [esp+432-336]
-	movdqa	[edi], xmm2
-	psraw	xmm4, 1
-	pmaxsw	xmm0, xmm4
-	pminsw	xmm1, xmm0
-	movdqa	xmm0, [esp+480-208]
-
-	pop	edi
-	pand	xmm1, xmm6
-	pand	xmm1, [esp+428-256]
-	movdqa	[ecx], xmm3
-	paddw	xmm7, xmm1
-	pop	esi
-	packuswb xmm5, xmm7
-	movdqa	[eax], xmm0
-	movdqa	[edx], xmm5
-	pop	ebx
-	mov	esp, ebp
-	pop	ebp
-	ret
-
-
-;*******************************************************************************
-;    void DeblockLumaEq4V_ssse3(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
-;                                 int32_t iBeta)
-;*******************************************************************************
-
-
-WELS_EXTERN  DeblockLumaEq4V_ssse3
-
-	push	ebp
-	mov	ebp, esp
-	and	esp, -16				; fffffff0H
-	sub	esp, 628				; 00000274H
-	mov	eax, dword [ebp+8]
-	mov	ecx, dword [ebp+12]
-	push	ebx
-	push	esi
-
-	lea	edx, [ecx*4]
-	pxor	xmm0, xmm0
-	movdqa	xmm2, xmm0
-
-	movdqa	xmm0, [ecx+eax]
-	mov	esi, eax
-	sub	esi, edx
-	movdqa	xmm3, [esi]
-	movdqa	xmm5, [eax]
-	push	edi
-	lea	edi, [ecx+ecx]
-	lea	ebx, [ecx+ecx*2]
-	mov	dword [esp+640-600], edi
-	mov	esi, eax
-	sub	esi, edi
-	movdqa	xmm1, [esi]
-	movdqa	 [esp+720-272], xmm0
-	mov	edi, eax
-	sub	edi, ecx
-	movdqa	xmm4, [edi]
-	add	ecx, eax
-	mov	dword [esp+640-596], ecx
-
-	mov	ecx, dword [esp+640-600]
-	movdqa	xmm0, [ecx+eax]
-	movdqa	 [esp+736-272], xmm0
-
-	movdqa	xmm0, [eax+ebx]
-	mov	edx, eax
-	sub	edx, ebx
-
-	movsx	ebx, word [ebp+16]
-	movdqa	xmm6, [edx]
-	add	ecx, eax
-	movdqa	 [esp+752-272], xmm0
-	movd	xmm0, ebx
-
-	movsx	ebx, word [ebp+20]
-	movdqa	xmm7, xmm0
-	punpcklwd xmm7, xmm0
-	pshufd	xmm0, xmm7, 0
-	movdqa	 [esp+640-320], xmm0
-	movd	xmm0, ebx
-	movdqa	xmm7, xmm0
-	punpcklwd xmm7, xmm0
-	pshufd	xmm0, xmm7, 0
-
-	movdqa	xmm7, [esp+736-272]
-	punpcklbw xmm7, xmm2
-	movdqa	 [esp+640-416], xmm7
-	movdqa	 [esp+640-512], xmm0
-	movdqa	xmm0, xmm1
-	movdqa	 [esp+672-272], xmm1
-	movdqa	xmm1, xmm4
-	movdqa	 [esp+704-272], xmm5
-	punpcklbw xmm5, xmm2
-	punpcklbw xmm1, xmm2
-
-	movdqa	xmm7, xmm5
-	psubw	xmm7, xmm1
-	pabsw	xmm7, xmm7
-	movdqa	 [esp+640-560], xmm7
-	punpcklbw xmm0, xmm2
-	movdqa	 [esp+688-272], xmm4
-	movdqa	xmm4, [esp+720-272]
-	movdqa	 [esp+640-480], xmm0
-
-	movdqa	xmm7, xmm1
-	psubw	xmm7, xmm0
-
-	movdqa	xmm0, [esp+640-512]
-	pabsw	xmm7, xmm7
-	punpcklbw xmm4, xmm2
-	pcmpgtw	xmm0, xmm7
-	movdqa	 [esp+640-384], xmm4
-	movdqa	xmm7, xmm5
-	psubw	xmm7, xmm4
-	movdqa	xmm4, [esp+640-512]
-	movdqa	 [esp+656-272], xmm6
-	punpcklbw xmm6, xmm2
-	pabsw	xmm7, xmm7
-	movdqa	 [esp+640-48], xmm2
-	movdqa	 [esp+640-368], xmm6
-	movdqa	 [esp+640-144], xmm1
-	movdqa	 [esp+640-400], xmm5
-	pcmpgtw	xmm4, xmm7
-	pand	xmm0, xmm4
-	movdqa	xmm4, [esp+640-320]
-	pcmpgtw	xmm4, [esp+640-560]
-	pand	xmm0, xmm4
-
-	mov	ebx, 2
-	movsx	ebx, bx
-	movd	xmm4, ebx
-	movdqa	xmm7, xmm4
-	punpcklwd xmm7, xmm4
-	movdqa	xmm4, [esp+640-320]
-	psraw	xmm4, 2
-	pshufd	xmm7, xmm7, 0
-	paddw	xmm4, xmm7
-	movdqa	 [esp+640-576], xmm4
-	pcmpgtw	xmm4, [esp+640-560]
-	movdqa	 [esp+640-560], xmm4
-
-	movdqa	xmm4, [esp+640-512]
-	movdqa	 [esp+640-624], xmm7
-	movdqa	xmm7, xmm1
-	psubw	xmm7, xmm6
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm4, xmm7
-
-	pand	xmm4, [esp+640-560]
-	movdqa	 [esp+640-544], xmm4
-	movdqa	xmm4, [esp+640-512]
-	movdqa	xmm7, xmm5
-	psubw	xmm7, [esp+640-416]
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm4, xmm7
-
-	pand	xmm4, [esp+640-560]
-	movdqa	 [esp+640-560], xmm4
-
-	movdqa	xmm4, [esp+640-544]
-	pandn	xmm4, xmm6
-	movdqa	 [esp+640-16], xmm4
-	mov	ebx, 4
-	movsx	ebx, bx
-	movd	xmm4, ebx
-	movdqa	xmm7, xmm4
-	punpcklwd xmm7, xmm4
-	movdqa	xmm4, xmm3
-	punpcklbw xmm4, xmm2
-	psllw	xmm4, 1
-	paddw	xmm4, xmm6
-	paddw	xmm4, xmm6
-	paddw	xmm4, xmm6
-	paddw	xmm4, [esp+640-480]
-
-	movdqa	xmm6, [esp+640-560]
-	pshufd	xmm7, xmm7, 0
-	paddw	xmm4, xmm1
-	movdqa	 [esp+640-592], xmm7
-	paddw	xmm4, xmm5
-	paddw	xmm4, xmm7
-	movdqa	xmm7, [esp+640-416]
-	pandn	xmm6, xmm7
-	movdqa	 [esp+640-80], xmm6
-	movdqa	xmm6, [esp+752-272]
-	punpcklbw xmm6, xmm2
-	psllw	xmm6, 1
-	paddw	xmm6, xmm7
-	paddw	xmm6, xmm7
-	paddw	xmm6, xmm7
-	paddw	xmm6, [esp+640-384]
-
-	movdqa	xmm7, [esp+640-480]
-	paddw	xmm6, xmm5
-	paddw	xmm6, xmm1
-	paddw	xmm6, [esp+640-592]
-	psraw	xmm6, 3
-	pand	xmm6, [esp+640-560]
-	movdqa	 [esp+640-112], xmm6
-	movdqa	xmm6, [esp+640-544]
-	pandn	xmm6, xmm7
-	movdqa	 [esp+640-336], xmm6
-	movdqa	xmm6, [esp+640-544]
-	movdqa	 [esp+640-528], xmm6
-	movdqa	xmm6, [esp+640-368]
-	paddw	xmm6, xmm7
-	movdqa	xmm7, xmm1
-	psraw	xmm4, 3
-	pand	xmm4, [esp+640-544]
-	paddw	xmm7, xmm5
-	paddw	xmm6, xmm7
-	paddw	xmm6, [esp+640-624]
-	movdqa	xmm7, [esp+640-528]
-
-	paddw	xmm5, xmm1
-	psraw	xmm6, 2
-	pand	xmm7, xmm6
-
-	movdqa	xmm6, [esp+640-384]
-	movdqa	 [esp+640-64], xmm7
-	movdqa	xmm7, [esp+640-560]
-	pandn	xmm7, xmm6
-	movdqa	 [esp+640-304], xmm7
-	movdqa	xmm7, [esp+640-560]
-	movdqa	 [esp+640-528], xmm7
-	movdqa	xmm7, [esp+640-416]
-	paddw	xmm7, xmm6
-	paddw	xmm7, xmm5
-	paddw	xmm7, [esp+640-624]
-	movdqa	xmm5, [esp+640-528]
-	psraw	xmm7, 2
-	pand	xmm5, xmm7
-	movdqa	 [esp+640-32], xmm5
-
-	movdqa	xmm5, [esp+640-544]
-	movdqa	 [esp+640-528], xmm5
-	movdqa	xmm5, [esp+640-480]
-	movdqa	xmm7, xmm5
-	paddw	xmm7, xmm5
-	movdqa	xmm5, xmm1
-	paddw	xmm5, xmm6
-	paddw	xmm6, [esp+640-592]
-	paddw	xmm7, xmm5
-	paddw	xmm7, [esp+640-624]
-	movdqa	xmm5, [esp+640-528]
-	psraw	xmm7, 2
-	pandn	xmm5, xmm7
-	movdqa	xmm7, [esp+640-480]
-	paddw	xmm7, xmm1
-	paddw	xmm7, [esp+640-400]
-	movdqa	xmm1, [esp+640-544]
-	movdqa	 [esp+640-352], xmm5
-	movdqa	xmm5, [esp+640-368]
-	psllw	xmm7, 1
-	paddw	xmm7, xmm6
-	paddw	xmm5, xmm7
-
-	movdqa	xmm7, [esp+640-400]
-	psraw	xmm5, 3
-	pand	xmm1, xmm5
-	movdqa	xmm5, [esp+640-480]
-	movdqa	 [esp+640-96], xmm1
-	movdqa	xmm1, [esp+640-560]
-	movdqa	 [esp+640-528], xmm1
-	movdqa	xmm1, [esp+640-384]
-	movdqa	xmm6, xmm1
-	paddw	xmm6, xmm1
-	paddw	xmm1, [esp+640-400]
-	paddw	xmm1, [esp+640-144]
-	paddw	xmm7, xmm5
-	paddw	xmm5, [esp+640-592]
-	paddw	xmm6, xmm7
-	paddw	xmm6, [esp+640-624]
-	movdqa	xmm7, [esp+640-528]
-	psraw	xmm6, 2
-	psllw	xmm1, 1
-	paddw	xmm1, xmm5
-
-	movdqa	xmm5, [esp+656-272]
-	pandn	xmm7, xmm6
-	movdqa	xmm6, [esp+640-416]
-	paddw	xmm6, xmm1
-	movdqa	xmm1, [esp+640-560]
-	psraw	xmm6, 3
-	pand	xmm1, xmm6
-
-	movdqa	xmm6, [esp+704-272]
-	movdqa	 [esp+640-128], xmm1
-	movdqa	xmm1, [esp+672-272]
-	punpckhbw xmm1, xmm2
-	movdqa	 [esp+640-448], xmm1
-	movdqa	xmm1, [esp+688-272]
-	punpckhbw xmm1, xmm2
-	punpckhbw xmm6, xmm2
-	movdqa	 [esp+640-288], xmm7
-	punpckhbw xmm5, xmm2
-	movdqa	 [esp+640-496], xmm1
-	movdqa	 [esp+640-432], xmm6
-
-	movdqa	xmm7, [esp+720-272]
-	punpckhbw xmm7, xmm2
-	movdqa	 [esp+640-464], xmm7
-
-	movdqa	xmm7, [esp+736-272]
-	punpckhbw xmm7, xmm2
-	movdqa	 [esp+640-528], xmm7
-
-	movdqa	xmm7, xmm6
-
-	psubw	xmm6, [esp+640-464]
-	psubw	xmm7, xmm1
-	pabsw	xmm7, xmm7
-	movdqa	 [esp+640-560], xmm7
-	por	xmm4, [esp+640-16]
-	pabsw	xmm6, xmm6
-	movdqa	xmm7, xmm1
-	psubw	xmm7, [esp+640-448]
-
-	movdqa	xmm1, [esp+640-512]
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm1, xmm7
-	movdqa	xmm7, [esp+640-512]
-	pcmpgtw	xmm7, xmm6
-	movdqa	xmm6, [esp+640-320]
-	pand	xmm1, xmm7
-	movdqa	xmm7, [esp+640-560]
-	pcmpgtw	xmm6, xmm7
-	pand	xmm1, xmm6
-
-	movdqa	xmm6, [esp+640-576]
-	pcmpgtw	xmm6, xmm7
-
-	movdqa	xmm7, [esp+640-496]
-	punpckhbw xmm3, xmm2
-	movdqa	 [esp+640-560], xmm6
-	movdqa	xmm6, [esp+640-512]
-	psubw	xmm7, xmm5
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm6, xmm7
-
-	pand	xmm6, [esp+640-560]
-	movdqa	xmm7, [esp+640-432]
-	psubw	xmm7, [esp+640-528]
-
-	psllw	xmm3, 1
-	movdqa	 [esp+640-544], xmm6
-	movdqa	xmm6, [esp+640-512]
-
-	movdqa	xmm2, [esp+640-544]
-	paddw	xmm3, xmm5
-	paddw	xmm3, xmm5
-	paddw	xmm3, xmm5
-	paddw	xmm3, [esp+640-448]
-	paddw	xmm3, [esp+640-496]
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm6, xmm7
-	pand	xmm6, [esp+640-560]
-	movdqa	 [esp+640-560], xmm6
-
-	movdqa	xmm6, xmm0
-	pand	xmm6, xmm4
-	movdqa	xmm4, xmm0
-	pandn	xmm4, [esp+640-368]
-	por	xmm6, xmm4
-	movdqa	xmm4, [esp+640-432]
-	paddw	xmm3, xmm4
-	paddw	xmm3, [esp+640-592]
-	psraw	xmm3, 3
-	pand	xmm3, xmm2
-	pandn	xmm2, xmm5
-	por	xmm3, xmm2
-	movdqa	xmm7, xmm1
-	pand	xmm7, xmm3
-	movdqa	xmm3, [esp+640-64]
-	por	xmm3, [esp+640-336]
-	movdqa	xmm2, xmm1
-	pandn	xmm2, xmm5
-	por	xmm7, xmm2
-
-	movdqa	xmm2, xmm0
-	pand	xmm2, xmm3
-	movdqa	xmm3, xmm0
-	pandn	xmm3, [esp+640-480]
-	por	xmm2, xmm3
-	packuswb xmm6, xmm7
-	movdqa	 [esp+640-336], xmm2
-	movdqa	 [esp+656-272], xmm6
-	movdqa	xmm6, [esp+640-544]
-	movdqa	xmm2, xmm5
-	paddw	xmm2, [esp+640-448]
-	movdqa	xmm3, xmm1
-	movdqa	xmm7, [esp+640-496]
-	paddw	xmm7, xmm4
-	paddw	xmm2, xmm7
-	paddw	xmm2, [esp+640-624]
-	movdqa	xmm7, [esp+640-544]
-	psraw	xmm2, 2
-	pand	xmm6, xmm2
-	movdqa	xmm2, [esp+640-448]
-	pandn	xmm7, xmm2
-	por	xmm6, xmm7
-	pand	xmm3, xmm6
-	movdqa	xmm6, xmm1
-	pandn	xmm6, xmm2
-	paddw	xmm2, [esp+640-496]
-	paddw	xmm2, xmm4
-	por	xmm3, xmm6
-	movdqa	xmm6, [esp+640-336]
-	packuswb xmm6, xmm3
-	psllw	xmm2, 1
-	movdqa	 [esp+672-272], xmm6
-	movdqa	xmm6, [esp+640-96]
-	por	xmm6, [esp+640-352]
-
-	movdqa	xmm3, xmm0
-	pand	xmm3, xmm6
-	movdqa	xmm6, xmm0
-	pandn	xmm6, [esp+640-144]
-	por	xmm3, xmm6
-	movdqa	xmm6, [esp+640-544]
-	movdqa	 [esp+640-352], xmm3
-	movdqa	xmm3, [esp+640-464]
-	paddw	xmm3, [esp+640-592]
-	paddw	xmm2, xmm3
-	movdqa	xmm3, [esp+640-448]
-	paddw	xmm5, xmm2
-	movdqa	xmm2, [esp+640-496]
-	psraw	xmm5, 3
-	pand	xmm6, xmm5
-	movdqa	xmm5, [esp+640-464]
-	paddw	xmm2, xmm5
-	paddw	xmm5, [esp+640-432]
-	movdqa	xmm4, xmm3
-	paddw	xmm4, xmm3
-	paddw	xmm4, xmm2
-	paddw	xmm4, [esp+640-624]
-	movdqa	xmm2, [esp+640-544]
-	paddw	xmm3, [esp+640-592]
-	psraw	xmm4, 2
-	pandn	xmm2, xmm4
-	por	xmm6, xmm2
-	movdqa	xmm7, xmm1
-	pand	xmm7, xmm6
-	movdqa	xmm6, [esp+640-496]
-	movdqa	xmm2, xmm1
-	pandn	xmm2, xmm6
-	por	xmm7, xmm2
-	movdqa	xmm2, [esp+640-352]
-	packuswb xmm2, xmm7
-	movdqa	 [esp+688-272], xmm2
-	movdqa	xmm2, [esp+640-128]
-	por	xmm2, [esp+640-288]
-
-	movdqa	xmm4, xmm0
-	pand	xmm4, xmm2
-	paddw	xmm5, xmm6
-	movdqa	xmm2, xmm0
-	pandn	xmm2, [esp+640-400]
-	por	xmm4, xmm2
-	movdqa	xmm2, [esp+640-528]
-	psllw	xmm5, 1
-	paddw	xmm5, xmm3
-	movdqa	xmm3, [esp+640-560]
-	paddw	xmm2, xmm5
-	psraw	xmm2, 3
-	movdqa	 [esp+640-288], xmm4
-	movdqa	xmm4, [esp+640-560]
-	pand	xmm4, xmm2
-	movdqa	xmm2, [esp+640-464]
-	movdqa	xmm5, xmm2
-	paddw	xmm5, xmm2
-	movdqa	xmm2, [esp+640-432]
-	paddw	xmm2, [esp+640-448]
-	movdqa	xmm7, xmm1
-	paddw	xmm5, xmm2
-	paddw	xmm5, [esp+640-624]
-	movdqa	xmm6, [esp+640-560]
-	psraw	xmm5, 2
-	pandn	xmm3, xmm5
-	por	xmm4, xmm3
-	movdqa	xmm3, [esp+640-32]
-	por	xmm3, [esp+640-304]
-	pand	xmm7, xmm4
-	movdqa	xmm4, [esp+640-432]
-	movdqa	xmm5, [esp+640-464]
-	movdqa	xmm2, xmm1
-	pandn	xmm2, xmm4
-	paddw	xmm4, [esp+640-496]
-	por	xmm7, xmm2
-	movdqa	xmm2, [esp+640-288]
-	packuswb xmm2, xmm7
-	movdqa	 [esp+704-272], xmm2
-
-	movdqa	xmm2, xmm0
-	pand	xmm2, xmm3
-	movdqa	xmm3, xmm0
-	pandn	xmm3, [esp+640-384]
-	por	xmm2, xmm3
-	movdqa	 [esp+640-304], xmm2
-	movdqa	xmm2, [esp+640-528]
-	movdqa	xmm3, xmm2
-	paddw	xmm3, [esp+640-464]
-	paddw	xmm3, xmm4
-	paddw	xmm3, [esp+640-624]
-	psraw	xmm3, 2
-	pand	xmm6, xmm3
-	movdqa	xmm3, [esp+640-560]
-	movdqa	xmm4, xmm3
-	pandn	xmm4, xmm5
-	por	xmm6, xmm4
-	movdqa	xmm7, xmm1
-	pand	xmm7, xmm6
-	movdqa	xmm6, [esp+640-304]
-	movdqa	xmm4, xmm1
-	pandn	xmm4, xmm5
-	por	xmm7, xmm4
-
-	movdqa	xmm4, xmm0
-	pandn	xmm0, [esp+640-416]
-	packuswb xmm6, xmm7
-	movdqa	xmm7, [esp+640-112]
-	por	xmm7, [esp+640-80]
-	pand	xmm4, xmm7
-	por	xmm4, xmm0
-	movdqa	xmm0, [esp+752-272]
-	punpckhbw xmm0, [esp+640-48]
-	psllw	xmm0, 1
-	paddw	xmm0, xmm2
-	paddw	xmm0, xmm2
-	paddw	xmm0, xmm2
-	paddw	xmm0, xmm5
-	paddw	xmm0, [esp+640-432]
-	paddw	xmm0, [esp+640-496]
-	paddw	xmm0, [esp+640-592]
-	psraw	xmm0, 3
-	pand	xmm0, xmm3
-	movdqa	xmm7, xmm1
-	pandn	xmm3, xmm2
-	por	xmm0, xmm3
-	pand	xmm7, xmm0
-
-	movdqa	xmm0, [esp+656-272]
-	movdqa	 [edx], xmm0
-
-	movdqa	xmm0, [esp+672-272]
-
-	mov	edx, dword [esp+640-596]
-	movdqa	 [esi], xmm0
-	movdqa	xmm0, [esp+688-272]
-	movdqa	 [edi], xmm0
-	movdqa	xmm0, [esp+704-272]
-
-	pop	edi
-	pandn	xmm1, xmm2
-	movdqa	 [eax], xmm0
-	por	xmm7, xmm1
-	pop	esi
-	packuswb xmm4, xmm7
-	movdqa	 [edx], xmm6
-	movdqa	 [ecx], xmm4
-	pop	ebx
-	mov	esp, ebp
-	pop	ebp
-	ret
-
-%endif
-
-
-
-;********************************************************************************
-;
-;   void DeblockLumaTransposeH2V_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pDst);
-;
-;********************************************************************************
-
-WELS_EXTERN  DeblockLumaTransposeH2V_sse2
-    push     r3
-    push     r4
-    push     r5
-
-%assign   push_num   3
-    LOAD_3_PARA
-    PUSH_XMM 8
-
-    SIGN_EXTENSION   r1, r1d
-
-    mov      r5,    r7
-    mov      r3,    r7
-    and      r3,    0Fh
-    sub      r7,    r3
-    sub      r7,    10h
-
-    lea      r3,    [r0 + r1 * 8]
-    lea      r4,    [r1 * 3]
-
-    movq    xmm0,  [r0]
-    movq    xmm7,  [r3]
-    punpcklqdq   xmm0,  xmm7
-    movq    xmm1,  [r0 + r1]
-    movq    xmm7,  [r3 + r1]
-    punpcklqdq   xmm1,  xmm7
-    movq    xmm2,  [r0 + r1*2]
-    movq    xmm7,  [r3 + r1*2]
-    punpcklqdq   xmm2,  xmm7
-    movq    xmm3,  [r0 + r4]
-    movq    xmm7,  [r3 + r4]
-    punpcklqdq   xmm3,  xmm7
-
-    lea     r0,   [r0 + r1 * 4]
-    lea     r3,   [r3 + r1 * 4]
-    movq    xmm4,  [r0]
-    movq    xmm7,  [r3]
-    punpcklqdq   xmm4,  xmm7
-    movq    xmm5,  [r0 + r1]
-    movq    xmm7,  [r3 + r1]
-    punpcklqdq   xmm5,  xmm7
-    movq    xmm6,  [r0 + r1*2]
-    movq    xmm7,  [r3 + r1*2]
-    punpcklqdq   xmm6,  xmm7
-
-    movdqa  [r7],   xmm0
-    movq    xmm7,  [r0 + r4]
-    movq    xmm0,  [r3 + r4]
-    punpcklqdq   xmm7,  xmm0
-    movdqa  xmm0,   [r7]
-
-    SSE2_TransTwo8x8B  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r7]
-    ;pOut: m5, m3, m4, m8, m6, m2, m7, m1
-
-    movdqa  [r2],    xmm4
-    movdqa  [r2 + 10h],  xmm2
-    movdqa  [r2 + 20h],  xmm3
-    movdqa  [r2 + 30h],  xmm7
-    movdqa  [r2 + 40h],  xmm5
-    movdqa  [r2 + 50h],  xmm1
-    movdqa  [r2 + 60h],  xmm6
-    movdqa  [r2 + 70h],  xmm0
-
-    mov     r7,   r5
-    POP_XMM
-    pop     r5
-    pop     r4
-    pop     r3
-    ret
-
-
-;*******************************************************************************************
-;
-;   void DeblockLumaTransposeV2H_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pSrc);
-;
-;*******************************************************************************************
-
-WELS_EXTERN   DeblockLumaTransposeV2H_sse2
-    push     r3
-    push     r4
-
-%assign  push_num 2
-    LOAD_3_PARA
-    PUSH_XMM 8
-
-    SIGN_EXTENSION   r1, r1d
-
-    mov      r4,    r7
-    mov      r3,    r7
-    and      r3,    0Fh
-    sub      r7,    r3
-    sub      r7,    10h
-
-    movdqa   xmm0,   [r2]
-    movdqa   xmm1,   [r2 + 10h]
-    movdqa   xmm2,   [r2 + 20h]
-    movdqa   xmm3,   [r2 + 30h]
-    movdqa   xmm4,   [r2 + 40h]
-    movdqa   xmm5,   [r2 + 50h]
-    movdqa   xmm6,   [r2 + 60h]
-    movdqa   xmm7,   [r2 + 70h]
-
-    SSE2_TransTwo8x8B  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r7]
-    ;pOut: m5, m3, m4, m8, m6, m2, m7, m1
-
-    lea      r2,   [r1 * 3]
-
-    movq     [r0],  xmm4
-    movq     [r0 + r1],  xmm2
-    movq     [r0 + r1*2],  xmm3
-    movq     [r0 + r2],  xmm7
-
-    lea      r0,   [r0 + r1*4]
-    movq     [r0],  xmm5
-    movq     [r0 + r1],  xmm1
-    movq     [r0 + r1*2],  xmm6
-    movq     [r0 + r2],  xmm0
-
-    psrldq    xmm4,   8
-    psrldq    xmm2,   8
-    psrldq    xmm3,   8
-    psrldq    xmm7,   8
-    psrldq    xmm5,   8
-    psrldq    xmm1,   8
-    psrldq    xmm6,   8
-    psrldq    xmm0,   8
-
-    lea       r0,  [r0 + r1*4]
-    movq     [r0],  xmm4
-    movq     [r0 + r1],  xmm2
-    movq     [r0 + r1*2],  xmm3
-    movq     [r0 + r2],  xmm7
-
-    lea      r0,   [r0 + r1*4]
-    movq     [r0],  xmm5
-    movq     [r0 + r1],  xmm1
-    movq     [r0 + r1*2],  xmm6
-    movq     [r0 + r2],  xmm0
-
-
-    mov      r7,   r4
-    POP_XMM
-    pop      r4
-    pop      r3
-    ret
-
--- a/codec/common/deblocking_common.cpp
+++ /dev/null
@@ -1,204 +1,0 @@
-#include "deblocking_common.h"
-#include "macros.h"
-//  C code only
-void DeblockLumaLt4_c (uint8_t* pPix, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha, int32_t iBeta,
-                         int8_t* pTc) {
-  for (int32_t i = 0; i < 16; i++) {
-    int32_t iTc0 = pTc[i >> 2];
-    if (iTc0 >= 0) {
-      int32_t p0 = pPix[-iStrideX];
-      int32_t p1 = pPix[-2 * iStrideX];
-      int32_t p2 = pPix[-3 * iStrideX];
-      int32_t q0 = pPix[0];
-      int32_t q1 = pPix[iStrideX];
-      int32_t q2 = pPix[2 * iStrideX];
-      bool bDetaP0Q0 = WELS_ABS (p0 - q0) < iAlpha;
-      bool bDetaP1P0 = WELS_ABS (p1 - p0) < iBeta;
-      bool bDetaQ1Q0 = WELS_ABS (q1 - q0) < iBeta;
-      int32_t iTc = iTc0;
-      if (bDetaP0Q0 && bDetaP1P0 && bDetaQ1Q0) {
-        bool bDetaP2P0 =  WELS_ABS (p2 - p0) < iBeta;
-        bool bDetaQ2Q0 =  WELS_ABS (q2 - q0) < iBeta;
-        if (bDetaP2P0) {
-          pPix[-2 * iStrideX] = p1 + WELS_CLIP3 ((p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) >> 1, -iTc0, iTc0);
-          iTc++;
-        }
-        if (bDetaQ2Q0) {
-          pPix[iStrideX] = q1 + WELS_CLIP3 ((q2 + ((p0 + q0 + 1) >> 1) - (q1 << 1)) >> 1, -iTc0, iTc0);
-          iTc++;
-        }
-        int32_t iDeta = WELS_CLIP3 ((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -iTc, iTc);
-        pPix[-iStrideX] = WelsClip1 (p0 + iDeta);     /* p0' */
-        pPix[0]  = WelsClip1 (q0 - iDeta);     /* q0' */
-      }
-    }
-    pPix += iStrideY;
-  }
-}
-void DeblockLumaEq4_c (uint8_t* pPix, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha, int32_t iBeta) {
-  int32_t p0, p1, p2, q0, q1, q2;
-  int32_t iDetaP0Q0;
-  bool bDetaP1P0, bDetaQ1Q0;
-  for (int32_t i = 0; i < 16; i++) {
-    p0 = pPix[-iStrideX];
-    p1 = pPix[-2 * iStrideX];
-    p2 = pPix[-3 * iStrideX];
-    q0 = pPix[0];
-    q1 = pPix[iStrideX];
-    q2 = pPix[2 * iStrideX];
-    iDetaP0Q0 = WELS_ABS (p0 - q0);
-    bDetaP1P0 = WELS_ABS (p1 - p0) < iBeta;
-    bDetaQ1Q0 = WELS_ABS (q1 - q0) < iBeta;
-    if ((iDetaP0Q0 < iAlpha) && bDetaP1P0 && bDetaQ1Q0) {
-      if (iDetaP0Q0 < ((iAlpha >> 2) + 2)) {
-        bool bDetaP2P0 = WELS_ABS (p2 - p0) < iBeta;
-        bool bDetaQ2Q0 =  WELS_ABS (q2 - q0) < iBeta;
-        if (bDetaP2P0) {
-          const int32_t p3 = pPix[-4 * iStrideX];
-          pPix[-iStrideX] = (p2 + (p1 << 1) + (p0 << 1) + (q0 << 1) + q1 + 4) >> 3;	   //p0
-          pPix[-2 * iStrideX] = (p2 + p1 + p0 + q0 + 2) >> 2;	 //p1
-          pPix[-3 * iStrideX] = ((p3 << 1) + p2 + (p2 << 1) + p1 + p0 + q0 + 4) >> 3;//p2
-        } else {
-          pPix[-1 * iStrideX] = ((p1 << 1) + p0 + q1 + 2) >> 2;	//p0
-        }
-        if (bDetaQ2Q0) {
-          const int32_t q3 = pPix[3 * iStrideX];
-          pPix[0] = (p1 + (p0 << 1) + (q0 << 1) + (q1 << 1) + q2 + 4) >> 3;   //q0
-          pPix[iStrideX] = (p0 + q0 + q1 + q2 + 2) >> 2;   //q1
-          pPix[2 * iStrideX] = ((q3 << 1) + q2 + (q2 << 1) + q1 + q0 + p0 + 4) >> 3;//q2
-        } else {
-          pPix[0] = ((q1 << 1) + q0 + p1 + 2) >> 2;   //q0
-        }
-      } else {
-        pPix[-iStrideX] = ((p1 << 1) + p0 + q1 + 2) >> 2;   //p0
-        pPix[ 0] = ((q1 << 1) + q0 + p1 + 2) >> 2;   //q0
-      }
-    }
-    pPix += iStrideY;
-  }
-}
-void DeblockLumaLt4V_c (uint8_t* pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* tc) {
-  DeblockLumaLt4_c (pPix, iStride, 1, iAlpha, iBeta, tc);
-}
-void DeblockLumaLt4H_c (uint8_t* pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* tc) {
-  DeblockLumaLt4_c (pPix, 1, iStride, iAlpha, iBeta, tc);
-}
-void DeblockLumaEq4V_c (uint8_t* pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta) {
-  DeblockLumaEq4_c (pPix, iStride, 1, iAlpha, iBeta);
-}
-void DeblockLumaEq4H_c (uint8_t* pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta) {
-  DeblockLumaEq4_c (pPix, 1, iStride, iAlpha, iBeta);
-}
-void DeblockChromaLt4_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha,
-                           int32_t iBeta, int8_t* pTc) {
-  int32_t p0, p1, q0, q1, iDeta;
-  bool bDetaP0Q0, bDetaP1P0, bDetaQ1Q0;
-
-  for (int32_t i = 0; i < 8; i++) {
-    int32_t iTc0 = pTc[i >> 1];
-    if (iTc0 > 0) {
-      p0 = pPixCb[-iStrideX];
-      p1 = pPixCb[-2 * iStrideX];
-      q0 = pPixCb[0];
-      q1 = pPixCb[iStrideX];
-
-      bDetaP0Q0 =  WELS_ABS (p0 - q0) < iAlpha;
-      bDetaP1P0 =  WELS_ABS (p1 - p0) < iBeta;
-      bDetaQ1Q0 = WELS_ABS (q1 - q0) < iBeta;
-      if (bDetaP0Q0 && bDetaP1P0 &&	bDetaQ1Q0) {
-        iDeta = WELS_CLIP3 ((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -iTc0, iTc0);
-        pPixCb[-iStrideX] = WelsClip1 (p0 + iDeta);     /* p0' */
-        pPixCb[0]  = WelsClip1 (q0 - iDeta);     /* q0' */
-      }
-
-
-      p0 = pPixCr[-iStrideX];
-      p1 = pPixCr[-2 * iStrideX];
-      q0 = pPixCr[0];
-      q1 = pPixCr[iStrideX];
-
-      bDetaP0Q0 =  WELS_ABS (p0 - q0) < iAlpha;
-      bDetaP1P0 =  WELS_ABS (p1 - p0) < iBeta;
-      bDetaQ1Q0 = WELS_ABS (q1 - q0) < iBeta;
-
-      if (bDetaP0Q0 && bDetaP1P0 &&	bDetaQ1Q0) {
-        iDeta = WELS_CLIP3 ((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -iTc0, iTc0);
-        pPixCr[-iStrideX] = WelsClip1 (p0 + iDeta);     /* p0' */
-        pPixCr[0]  = WelsClip1 (q0 - iDeta);     /* q0' */
-      }
-    }
-    pPixCb += iStrideY;
-    pPixCr += iStrideY;
-  }
-}
-void DeblockChromaEq4_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha,
-                           int32_t iBeta) {
-  int32_t p0, p1, q0, q1;
-  bool bDetaP0Q0, bDetaP1P0, bDetaQ1Q0;
-  for (int32_t i = 0; i < 8; i++) {
-    //cb
-    p0 = pPixCb[-iStrideX];
-    p1 = pPixCb[-2 * iStrideX];
-    q0 = pPixCb[0];
-    q1 = pPixCb[iStrideX];
-    bDetaP0Q0 = WELS_ABS (p0 - q0) < iAlpha;
-    bDetaP1P0 = WELS_ABS (p1 - p0) < iBeta;
-    bDetaQ1Q0 = WELS_ABS (q1 - q0) < iBeta;
-    if (bDetaP0Q0 && bDetaP1P0 && bDetaQ1Q0) {
-      pPixCb[-iStrideX] = ((p1 << 1) + p0 + q1 + 2) >> 2;     /* p0' */
-      pPixCb[0]  = ((q1 << 1) + q0 + p1 + 2) >> 2;     /* q0' */
-    }
-
-    //cr
-    p0 = pPixCr[-iStrideX];
-    p1 = pPixCr[-2 * iStrideX];
-    q0 = pPixCr[0];
-    q1 = pPixCr[iStrideX];
-    bDetaP0Q0 = WELS_ABS (p0 - q0) < iAlpha;
-    bDetaP1P0 = WELS_ABS (p1 - p0) < iBeta;
-    bDetaQ1Q0 = WELS_ABS (q1 - q0) < iBeta;
-    if (bDetaP0Q0 && bDetaP1P0 && bDetaQ1Q0) {
-      pPixCr[-iStrideX] = ((p1 << 1) + p0 + q1 + 2) >> 2;     /* p0' */
-      pPixCr[0]  = ((q1 << 1) + q0 + p1 + 2) >> 2;     /* q0' */
-    }
-    pPixCr += iStrideY;
-    pPixCb += iStrideY;
-  }
-}
-void DeblockChromaLt4V_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
-                            int8_t* tc) {
-  DeblockChromaLt4_c (pPixCb, pPixCr, iStride, 1, iAlpha, iBeta, tc);
-}
-void DeblockChromaLt4H_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
-                            int8_t* tc) {
-  DeblockChromaLt4_c (pPixCb, pPixCr, 1, iStride, iAlpha, iBeta, tc);
-}
-void DeblockChromaEq4V_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta) {
-  DeblockChromaEq4_c (pPixCb, pPixCr, iStride, 1, iAlpha, iBeta);
-}
-void DeblockChromaEq4H_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta) {
-  DeblockChromaEq4_c (pPixCb, pPixCr, 1, iStride, iAlpha, iBeta);
-}
-
-#ifdef X86_ASM
-extern "C" {
-  void DeblockLumaLt4H_ssse3 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc) {
-    ENFORCE_STACK_ALIGN_1D (uint8_t,  uiBuf,   16 * 8, 16);
-
-    DeblockLumaTransposeH2V_sse2 (pPixY - 4, iStride, &uiBuf[0]);
-    DeblockLumaLt4V_ssse3 (&uiBuf[4 * 16], 16, iAlpha, iBeta, pTc);
-    DeblockLumaTransposeV2H_sse2 (pPixY - 4, iStride, &uiBuf[0]);
-  }
-
-  void DeblockLumaEq4H_ssse3 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta) {
-    ENFORCE_STACK_ALIGN_1D (uint8_t,  uiBuf,   16 * 8, 16);
-
-    DeblockLumaTransposeH2V_sse2 (pPixY - 4, iStride, &uiBuf[0]);
-    DeblockLumaEq4V_ssse3 (&uiBuf[4 * 16], 16, iAlpha, iBeta);
-    DeblockLumaTransposeV2H_sse2 (pPixY - 4, iStride, &uiBuf[0]);
-  }
-
-}
-
-#endif
-
--- a/codec/common/deblocking_common.h
+++ /dev/null
@@ -1,55 +1,0 @@
-#ifndef WELS_DEBLOCKING_COMMON_H__
-#define WELS_DEBLOCKING_COMMON_H__
-#include "typedefs.h"
-void DeblockLumaLt4V_c (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc);
-void DeblockLumaEq4V_c (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
-
-void DeblockLumaLt4H_c (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc);
-void DeblockLumaEq4H_c (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
-
-void DeblockChromaLt4V_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
-                            int8_t* pTc);
-void DeblockChromaEq4V_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
-
-void DeblockChromaLt4H_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
-                            int8_t* pTc);
-void DeblockChromaEq4H_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
-
-#if defined(__cplusplus)
-extern "C" {
-#endif//__cplusplus
-
-#ifdef  X86_ASM
-void DeblockLumaLt4V_ssse3 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc);
-void DeblockLumaEq4V_ssse3 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
-void DeblockLumaTransposeH2V_sse2 (uint8_t* pPixY, int32_t iStride, uint8_t* pDst);
-void DeblockLumaTransposeV2H_sse2 (uint8_t* pPixY, int32_t iStride, uint8_t* pSrc);
-void DeblockLumaLt4H_ssse3 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc);
-void DeblockLumaEq4H_ssse3 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
-void DeblockChromaEq4V_ssse3 (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
-void DeblockChromaLt4V_ssse3 (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
-                             int8_t* pTC);
-void DeblockChromaEq4H_ssse3 (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
-void DeblockChromaLt4H_ssse3 (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
-                             int8_t* pTC);
-#endif
-
-#if defined(HAVE_NEON)
-void DeblockLumaLt4V_neon(uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc);
-void DeblockLumaEq4V_neon(uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
-
-void DeblockLumaLt4H_neon(uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc);
-void DeblockLumaEq4H_neon(uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
-
-void DeblockChromaLt4V_neon(uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTC);
-void DeblockChromaEq4V_neon(uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
-
-void DeblockChromaLt4H_neon(uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTC);
-void DeblockChromaEq4H_neon(uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
-#endif
-
-#if defined(__cplusplus)
-}
-#endif//__cplusplus
-
-#endif //WELS_DEBLOCKING_COMMON_H__
--- a/codec/common/deblocking_neon.S
+++ /dev/null
@@ -1,1052 +1,0 @@
-/*!
-* \copy
-*     Copyright (c)  2013, Cisco Systems
-*     All rights reserved.
-
-*     Redistribution and use in source and binary forms, with or without
-*     modification, are permitted provided that the following conditions
-*     are met:
-
-*        * Redistributions of source code must retain the above copyright
-*          notice, this list of conditions and the following disclaimer.
-
-*        * Redistributions in binary form must reproduce the above copyright
-*          notice, this list of conditions and the following disclaimer in
-*          the documentation and/or other materials provided with the
-*          distribution.
-
-*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-*     POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifdef HAVE_NEON
-.text
-
-#include "arm_arch_common_macro.S"
-
-#ifdef __APPLE__
-.macro	JMP_IF_128BITS_IS_ZERO
-    vorr.s16	$2, $0, $1
-    vmov		r3, r2, $2
-    orr			r3, r3, r2
-    cmp			r3, #0
-.endm
-
-.macro	MASK_MATRIX
-    vabd.u8	$6, $1, $2
-    vcgt.u8	$6, $4, $6
-
-    vabd.u8	$4, $0, $1
-    vclt.u8	$4, $4, $5
-    vand.u8	$6, $6, $4
-
-    vabd.u8	$4, $3, $2
-    vclt.u8	$4, $4, $5
-    vand.u8	$6, $6, $4
-.endm
-
-
-.macro	DIFF_LUMA_LT4_P1_Q1
-    vabd.u8	$9, $0, $2
-    vclt.u8	$9, $9, $4
-    vrhadd.u8	$8, $2, $3
-    vhadd.u8	$8, $0, $8
-    vsub.s8	$8, $8, $1
-    vmax.s8	$8, $8, $5
-    vmin.s8	$8, $8, $6
-    vand.s8	$8, $8, $9
-    vand.s8	$8, $8, $7
-    vadd.u8	$8, $1, $8
-    vabs.s8	$9, $9
-.endm
-
-.macro	DIFF_LUMA_LT4_P0_Q0
-    vsubl.u8	$5, $0, $3
-    vsubl.u8	$6, $2, $1
-    vshl.s16	$6, $6, #2
-    vadd.s16	$5, $5, $6
-    vrshrn.s16		$4, $5, #3
-.endm
-
-.macro	DIFF_LUMA_EQ4_P2P1P0
-    vaddl.u8	q4, $1, $2
-    vaddl.u8	q5, $3, $4
-    vadd.u16	q5, q4, q5
-
-    vaddl.u8	q4, $0, $1
-    vshl.u16	q4, q4, #1
-    vadd.u16	q4, q5, q4
-
-    vrshrn.u16		$0, q5, #2
-    vrshrn.u16		$7, q4, #3
-
-    vshl.u16	q5, q5, #1
-    vsubl.u8	q4, $5, $1
-    vadd.u16	q5, q4,q5
-
-    vaddl.u8	q4, $2, $5
-    vaddw.u8	q4, q4, $2
-    vaddw.u8	q4, q4, $3
-
-    vrshrn.u16		d10,q5, #3
-    vrshrn.u16		d8, q4, #2
-    vbsl.u8		$6, d10, d8
-.endm
-
-.macro	DIFF_LUMA_EQ4_MASK
-    vmov	$3, $2
-    vbsl.u8	$3, $0, $1
-.endm
-
-.macro	DIFF_CHROMA_EQ4_P0Q0
-    vaddl.u8	$4, $0, $3
-    vaddw.u8	$5, $4, $1
-    vaddw.u8	$6, $4, $2
-    vaddw.u8	$5, $5, $0
-
-    vaddw.u8	$6, $6, $3
-    vrshrn.u16		$7, $5, #2
-    vrshrn.u16		$8, $6, #2
-.endm
-
-.macro	LOAD_CHROMA_DATA_4
-    vld4.u8	{$0[$8],$1[$8],$2[$8],$3[$8]}, [r0], r2
-    vld4.u8	{$4[$8],$5[$8],$6[$8],$7[$8]}, [r1], r2
-.endm
-
-.macro	STORE_CHROMA_DATA_4
-    vst4.u8	{$0[$8],$1[$8],$2[$8],$3[$8]}, [r0], r2
-    vst4.u8	{$4[$8],$5[$8],$6[$8],$7[$8]}, [r1], r2
-.endm
-
-.macro	LOAD_LUMA_DATA_3
-    vld3.u8	{$0[$6],$1[$6],$2[$6]}, [r2], r1
-    vld3.u8	{$3[$6],$4[$6],$5[$6]}, [r0], r1
-.endm
-
-.macro	STORE_LUMA_DATA_4
-    vst4.u8	{$0[$4],$1[$4],$2[$4],$3[$4]}, [r0], r1
-    vst4.u8	{$0[$5],$1[$5],$2[$5],$3[$5]}, [r2], r1
-.endm
-
-.macro	LOAD_LUMA_DATA_4
-    vld4.u8	{$0[$8],$1[$8],$2[$8],$3[$8]}, [r3], r1
-    vld4.u8	{$4[$8],$5[$8],$6[$8],$7[$8]}, [r0], r1
-.endm
-
-.macro	STORE_LUMA_DATA_3
-    vst3.u8	{$0[$6],$1[$6],$2[$6]}, [r3], r1
-    vst3.u8	{$3[$6],$4[$6],$5[$6]}, [r0], r1
-.endm
-
-.macro	EXTRACT_DELTA_INTO_TWO_PART
-    vcge.s8	$1, $0, #0
-    vand	$1, $0, $1
-    vsub.s8	$0, $1, $0
-.endm
-#else
-.macro	JMP_IF_128BITS_IS_ZERO arg0, arg1, arg2
-    vorr.s16	\arg2, \arg0, \arg1
-    vmov		r3, r2, \arg2
-    orr			r3, r3, r2
-    cmp			r3, #0
-.endm
-
-.macro	MASK_MATRIX arg0, arg1, arg2, arg3, arg4, arg5, arg6
-    vabd.u8	\arg6, \arg1, \arg2
-    vcgt.u8	\arg6, \arg4, \arg6
-
-    vabd.u8	\arg4, \arg0, \arg1
-    vclt.u8	\arg4, \arg4, \arg5
-    vand.u8	\arg6, \arg6, \arg4
-
-    vabd.u8	\arg4, \arg3, \arg2
-    vclt.u8	\arg4, \arg4, \arg5
-    vand.u8	\arg6, \arg6, \arg4
-.endm
-
-.macro	DIFF_LUMA_LT4_P1_Q1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
-    vabd.u8	\arg9, \arg0, \arg2
-    vclt.u8	\arg9, \arg9, \arg4
-    vrhadd.u8	\arg8, \arg2, \arg3
-    vhadd.u8	\arg8, \arg0, \arg8
-    vsub.s8	\arg8, \arg8, \arg1
-    vmax.s8	\arg8, \arg8, \arg5
-    vmin.s8	\arg8, \arg8, \arg6
-    vand.s8	\arg8, \arg8, \arg9
-    vand.s8	\arg8, \arg8, \arg7
-    vadd.u8	\arg8, \arg1, \arg8
-    vabs.s8	\arg9, \arg9
-.endm
-
-.macro	DIFF_LUMA_LT4_P0_Q0 arg0, arg1, arg2, arg3, arg4, arg5, arg6
-    vsubl.u8	\arg5, \arg0, \arg3
-    vsubl.u8	\arg6, \arg2, \arg1
-    vshl.s16	\arg6, \arg6, #2
-    vadd.s16	\arg5, \arg5, \arg6
-    vrshrn.s16		\arg4, \arg5, #3
-.endm
-
-
-.macro	DIFF_LUMA_EQ4_P2P1P0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
-    vaddl.u8	q4, \arg1, \arg2
-    vaddl.u8	q5, \arg3, \arg4
-    vadd.u16	q5, q4, q5
-
-    vaddl.u8	q4, \arg0, \arg1
-    vshl.u16	q4, q4, #1
-    vadd.u16	q4, q5, q4
-
-    vrshrn.u16		\arg0, q5, #2
-    vrshrn.u16		\arg7, q4, #3
-
-    vshl.u16	q5, q5, #1
-    vsubl.u8	q4, \arg5, \arg1
-    vadd.u16	q5, q4,q5
-
-    vaddl.u8	q4, \arg2, \arg5
-    vaddw.u8	q4, q4, \arg2
-    vaddw.u8	q4, q4, \arg3
-
-    vrshrn.u16		d10,q5, #3
-    vrshrn.u16		d8, q4, #2
-    vbsl.u8		\arg6, d10, d8
-.endm
-
-.macro	DIFF_LUMA_EQ4_MASK arg0, arg1, arg2, arg3
-    vmov	\arg3, \arg2
-    vbsl.u8	\arg3, \arg0, \arg1
-.endm
-
-.macro	DIFF_CHROMA_EQ4_P0Q0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-    vaddl.u8	\arg4, \arg0, \arg3
-    vaddw.u8	\arg5, \arg4, \arg1
-    vaddw.u8	\arg6, \arg4, \arg2
-    vaddw.u8	\arg5, \arg5, \arg0
-    vaddw.u8	\arg6, \arg6, \arg3
-    vrshrn.u16		\arg7, \arg5, #2
-    vrshrn.u16		\arg8, \arg6, #2
-.endm
-
-.macro	LOAD_CHROMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-    vld4.u8	{\arg0[\arg8],\arg1[\arg8],\arg2[\arg8],\arg3[\arg8]}, [r0], r2
-    vld4.u8	{\arg4[\arg8],\arg5[\arg8],\arg6[\arg8],\arg7[\arg8]}, [r1], r2
-.endm
-
-.macro	STORE_CHROMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-    vst4.u8	{\arg0[\arg8],\arg1[\arg8],\arg2[\arg8],\arg3[\arg8]}, [r0], r2
-    vst4.u8	{\arg4[\arg8],\arg5[\arg8],\arg6[\arg8],\arg7[\arg8]}, [r1], r2
-.endm
-
-.macro	LOAD_LUMA_DATA_3 arg0, arg1, arg2, arg3, arg4, arg5, arg6
-    vld3.u8	{\arg0[\arg6],\arg1[\arg6],\arg2[\arg6]}, [r2], r1
-    vld3.u8	{\arg3[\arg6],\arg4[\arg6],\arg5[\arg6]}, [r0], r1
-.endm
-
-.macro	STORE_LUMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5
-    vst4.u8	{\arg0[\arg4],\arg1[\arg4],\arg2[\arg4],\arg3[\arg4]}, [r0], r1
-    vst4.u8	{\arg0[\arg5],\arg1[\arg5],\arg2[\arg5],\arg3[\arg5]}, [r2], r1
-.endm
-
-.macro	LOAD_LUMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-    vld4.u8	{\arg0[\arg8],\arg1[\arg8],\arg2[\arg8],\arg3[\arg8]}, [r3], r1
-    vld4.u8	{\arg4[\arg8],\arg5[\arg8],\arg6[\arg8],\arg7[\arg8]}, [r0], r1
-.endm
-
-.macro	STORE_LUMA_DATA_3 arg0, arg1, arg2, arg3, arg4, arg5, arg6
-    vst3.u8	{\arg0[\arg6],\arg1[\arg6],\arg2[\arg6]}, [r3], r1
-    vst3.u8	{\arg3[\arg6],\arg4[\arg6],\arg5[\arg6]}, [r0], r1
-.endm
-
-.macro	EXTRACT_DELTA_INTO_TWO_PART arg0, arg1
-    vcge.s8	\arg1, \arg0, #0
-    vand	\arg1, \arg0, \arg1
-    vsub.s8	\arg0, \arg1, \arg0
-.endm
-#endif
-
-WELS_ASM_FUNC_BEGIN DeblockLumaLt4V_neon
-    vpush	{q4-q7}
-    vdup.u8	q11, r2
-    vdup.u8	q9, r3
-
-    add			r2, r1, r1, lsl #1
-    sub			r2, r0, r2
-    vld1.u8	{q0}, [r2], r1
-    vld1.u8	{q3}, [r0], r1
-    vld1.u8	{q1}, [r2], r1
-    vld1.u8	{q4}, [r0], r1
-    vld1.u8	{q2}, [r2]
-    vld1.u8	{q5}, [r0]
-    sub			r2, r2, r1
-
-    ldr			r3, [sp, #64]
-    vld1.s8	{d31}, [r3]
-    vdup.s8	d28, d31[0]
-    vdup.s8	d30, d31[1]
-    vdup.s8	d29, d31[2]
-    vdup.s8	d31, d31[3]
-    vtrn.32	d28, d30
-    vtrn.32	d29, d31
-    vcge.s8	q10, q14, #0
-
-    MASK_MATRIX	q1, q2, q3, q4, q11, q9, q15
-    vand.u8	q10, q10, q15
-
-    veor		q15, q15
-    vsub.i8	q15,q15,q14
-
-    DIFF_LUMA_LT4_P1_Q1	q0, q1, q2, q3, q9, q15, q14, q10, q6, q12
-    vst1.u8	{q6}, [r2], r1
-
-    DIFF_LUMA_LT4_P1_Q1	q5, q4, q3, q2, q9, q15, q14, q10, q7, q13
-
-    vabs.s8	q12, q12
-    vabs.s8	q13, q13
-    vadd.u8	q14,q14,q12
-    vadd.u8	q14,q14,q13
-    veor		q15, q15
-    vsub.i8	q15,q15,q14
-
-    DIFF_LUMA_LT4_P0_Q0	d2, d4, d6, d8, d16, q12, q13
-    DIFF_LUMA_LT4_P0_Q0	d3, d5, d7, d9, d17, q12, q13
-    vmax.s8	q8, q8, q15
-    vmin.s8	q8, q8, q14
-    vand.s8	q8, q8, q10
-    EXTRACT_DELTA_INTO_TWO_PART	q8, q9
-    vqadd.u8	q2, q2, q9
-    vqsub.u8	q2, q2, q8
-    vst1.u8	{q2}, [r2], r1
-    vqsub.u8	q3, q3, q9
-    vqadd.u8	q3, q3, q8
-    vst1.u8	{q3}, [r2]	, r1
-    vst1.u8	{q7}, [r2]
-
-    vpop	{q4-q7}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN DeblockLumaEq4V_neon
-    vpush	{q4-q7}
-
-    vdup.u8	q5, r2
-    vdup.u8	q4, r3
-
-    sub			r3, r0, r1, lsl #2
-    vld1.u8	{q8},  [r3], r1
-    vld1.u8	{q12}, [r0], r1
-    vld1.u8	{q9},  [r3], r1
-    vld1.u8	{q13}, [r0], r1
-    vld1.u8	{q10}, [r3], r1
-    vld1.u8	{q14}, [r0], r1
-    vld1.u8	{q11}, [r3]
-    vld1.u8	{q15}, [r0]
-    sub			r3, r3, r1	, lsl #1
-
-    MASK_MATRIX	q10, q11, q12, q13, q5, q4, q6
-
-    mov			r2, r2, lsr #2
-    add			r2, r2, #2
-    vdup.u8	q5, r2
-    vabd.u8	q0, q11, q12
-    vclt.u8	q7, q0, q5
-
-    vabd.u8	q1, q9, q11
-    vclt.u8	q1, q1, q4
-    vand.s8	q1, q1, q7
-
-    vabd.u8	q2, q14,q12
-    vclt.u8	q2, q2, q4
-    vand.s8	q2, q2, q7
-    vand.u8	q7, q7, q6
-
-    vmov		q3, q1
-
-    DIFF_LUMA_EQ4_P2P1P0		d16, d18, d20, d22, d24, d26, d2, d0
-    DIFF_LUMA_EQ4_P2P1P0		d17, d19, d21, d23, d25, d27, d3, d1
-
-    vand.u8	q3, q7, q3
-    DIFF_LUMA_EQ4_MASK	q0, q9, q3, q4
-    vst1.u8	{q4}, [r3], r1
-    DIFF_LUMA_EQ4_MASK	q8,q10, q3, q4
-    vst1.u8	{q4}, [r3], r1
-    DIFF_LUMA_EQ4_MASK	q1,q11, q6, q4
-    vst1.u8	{q4}, [r3], r1
-
-    vmov		q0, q2
-    DIFF_LUMA_EQ4_P2P1P0		d30, d28, d26, d24, d22, d20, d4, d6
-    DIFF_LUMA_EQ4_P2P1P0		d31, d29, d27, d25, d23, d21, d5, d7
-
-    vand.u8	q0, q7, q0
-    DIFF_LUMA_EQ4_MASK	q2,  q12, q6, q4
-    vst1.u8	{q4}, [r3], r1
-    DIFF_LUMA_EQ4_MASK	q15, q13, q0, q4
-    vst1.u8	{q4}, [r3], r1
-    DIFF_LUMA_EQ4_MASK	q3,  q14, q0, q4
-    vst1.u8	{q4}, [r3], r1
-
-    vpop	{q4-q7}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN DeblockLumaLt4H_neon
-    vpush	{q4-q7}
-
-    vdup.u8	q11, r2
-    vdup.u8	q9, r3
-
-    sub			r2, r0, #3
-    LOAD_LUMA_DATA_3		d0, d1, d2, d6, d7, d8, 0
-    LOAD_LUMA_DATA_3		d0, d1, d2, d6, d7, d8, 1
-    LOAD_LUMA_DATA_3		d0, d1, d2, d6, d7, d8, 2
-    LOAD_LUMA_DATA_3		d0, d1, d2, d6, d7, d8, 3
-    LOAD_LUMA_DATA_3		d0, d1, d2, d6, d7, d8, 4
-    LOAD_LUMA_DATA_3		d0, d1, d2, d6, d7, d8, 5
-    LOAD_LUMA_DATA_3		d0, d1, d2, d6, d7, d8, 6
-    LOAD_LUMA_DATA_3		d0, d1, d2, d6, d7, d8, 7
-
-    LOAD_LUMA_DATA_3		d3, d4, d5, d9, d10, d11, 0
-    LOAD_LUMA_DATA_3		d3, d4, d5, d9, d10, d11, 1
-    LOAD_LUMA_DATA_3		d3, d4, d5, d9, d10, d11, 2
-    LOAD_LUMA_DATA_3		d3, d4, d5, d9, d10, d11, 3
-    LOAD_LUMA_DATA_3		d3, d4, d5, d9, d10, d11, 4
-    LOAD_LUMA_DATA_3		d3, d4, d5, d9, d10, d11, 5
-    LOAD_LUMA_DATA_3		d3, d4, d5, d9, d10, d11, 6
-    LOAD_LUMA_DATA_3		d3, d4, d5, d9, d10, d11, 7
-
-    vswp		d1, d2
-    vswp		d3, d4
-    vswp		d1, d4
-    vswp		d7, d8
-    vswp		d9, d10
-    vswp		d7, d10
-
-    sub			r0, r0, r1, lsl #4
-
-    ldr			r3, [sp, #64]
-    vld1.s8	{d31}, [r3]
-    vdup.s8	d28, d31[0]
-    vdup.s8	d30, d31[1]
-    vdup.s8	d29, d31[2]
-    vdup.s8	d31, d31[3]
-    vtrn.32	d28, d30
-    vtrn.32	d29, d31
-    vcge.s8	q10, q14, #0
-
-    MASK_MATRIX	q1, q2, q3, q4, q11, q9, q15
-    vand.u8	q10, q10, q15
-
-    veor		q15, q15
-    vsub.i8	q15,q15,q14
-
-    DIFF_LUMA_LT4_P1_Q1	q0, q1, q2, q3, q9, q15, q14, q10, q6, q12
-    DIFF_LUMA_LT4_P1_Q1	q5, q4, q3, q2, q9, q15, q14, q10, q7, q13
-
-    vabs.s8	q12, q12
-    vabs.s8	q13, q13
-    vadd.u8	q14,q14,q12
-    vadd.u8	q14,q14,q13
-    veor		q15, q15
-    vsub.i8	q15,q15,q14
-
-    DIFF_LUMA_LT4_P0_Q0	d2, d4, d6, d8, d16, q12, q13
-    DIFF_LUMA_LT4_P0_Q0	d3, d5, d7, d9, d17, q12, q13
-    vmax.s8	q8, q8, q15
-    vmin.s8	q8, q8, q14
-    vand.s8	q8, q8, q10
-    EXTRACT_DELTA_INTO_TWO_PART	q8, q9
-    vqadd.u8	q2, q2, q9
-    vqsub.u8	q2, q2, q8
-
-    vqsub.u8	q3, q3, q9
-    vqadd.u8	q3, q3, q8
-
-    sub		r0, #2
-    add		r2, r0, r1
-    lsl		r1, #1
-
-    vmov		q1, q6
-    vmov		q4, q7
-
-    vswp		q2, q3
-    vswp		d3, d6
-    vswp		d5, d8
-
-    STORE_LUMA_DATA_4		d2, d3, d4, d5, 0, 1
-    STORE_LUMA_DATA_4		d2, d3, d4, d5, 2, 3
-    STORE_LUMA_DATA_4		d2, d3, d4, d5, 4, 5
-    STORE_LUMA_DATA_4		d2, d3, d4, d5, 6, 7
-
-    STORE_LUMA_DATA_4		d6, d7, d8, d9, 0, 1
-    STORE_LUMA_DATA_4		d6, d7, d8, d9, 2, 3
-    STORE_LUMA_DATA_4		d6, d7, d8, d9, 4, 5
-    STORE_LUMA_DATA_4		d6, d7, d8, d9, 6, 7
-
-    vpop	{q4-q7}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN DeblockLumaEq4H_neon
-    vpush	{q4-q7}
-    vdup.u8	q5, r2
-    vdup.u8	q4, r3
-
-    sub			r3, r0, #4				//	pix -= 4
-    LOAD_LUMA_DATA_4		d16,d17,d18,d19,d24,d25,d26,d27,0
-    LOAD_LUMA_DATA_4		d16,d17,d18,d19,d24,d25,d26,d27,1
-    LOAD_LUMA_DATA_4		d16,d17,d18,d19,d24,d25,d26,d27,2
-    LOAD_LUMA_DATA_4		d16,d17,d18,d19,d24,d25,d26,d27,3
-    LOAD_LUMA_DATA_4		d16,d17,d18,d19,d24,d25,d26,d27,4
-    LOAD_LUMA_DATA_4		d16,d17,d18,d19,d24,d25,d26,d27,5
-    LOAD_LUMA_DATA_4		d16,d17,d18,d19,d24,d25,d26,d27,6
-    LOAD_LUMA_DATA_4		d16,d17,d18,d19,d24,d25,d26,d27,7
-
-    LOAD_LUMA_DATA_4		d20,d21,d22,d23,d28,d29,d30,d31,0
-    LOAD_LUMA_DATA_4		d20,d21,d22,d23,d28,d29,d30,d31,1
-    LOAD_LUMA_DATA_4		d20,d21,d22,d23,d28,d29,d30,d31,2
-    LOAD_LUMA_DATA_4		d20,d21,d22,d23,d28,d29,d30,d31,3
-    LOAD_LUMA_DATA_4		d20,d21,d22,d23,d28,d29,d30,d31,4
-    LOAD_LUMA_DATA_4		d20,d21,d22,d23,d28,d29,d30,d31,5
-    LOAD_LUMA_DATA_4		d20,d21,d22,d23,d28,d29,d30,d31,6
-    LOAD_LUMA_DATA_4		d20,d21,d22,d23,d28,d29,d30,d31,7
-
-    vswp		q9, q10
-    vswp		d17,d18
-    vswp		d21,d22
-    vswp		q13,q14
-    vswp		d25,d26
-    vswp		d29,d30
-    sub			r0, r0, r1	, lsl #4
-
-    MASK_MATRIX	q10, q11, q12, q13, q5, q4, q6
-
-    mov			r2, r2, lsr #2
-    add			r2, r2, #2
-    vdup.u8	q5, r2
-    vabd.u8	q0, q11, q12
-    vclt.u8	q7, q0, q5
-
-    vabd.u8	q1, q9, q11
-    vclt.u8	q1, q1, q4
-    vand.s8	q1, q1, q7
-
-    vabd.u8	q2, q14,q12
-    vclt.u8	q2, q2, q4
-    vand.s8	q2, q2, q7
-    vand.u8	q7, q7, q6
-
-    vmov		q3, q1
-
-    DIFF_LUMA_EQ4_P2P1P0		d16, d18, d20, d22, d24, d26, d2, d0
-    DIFF_LUMA_EQ4_P2P1P0		d17, d19, d21, d23, d25, d27, d3, d1
-
-    vand.u8	q3, q7, q3
-    DIFF_LUMA_EQ4_MASK	q0, q9, q3, q4
-    vmov		q9, q4
-    vbsl.u8	q3, q8, q10
-    DIFF_LUMA_EQ4_MASK	q1,q11, q6, q8
-
-    vand.u8	q7, q7, q2
-
-    DIFF_LUMA_EQ4_P2P1P0		d30, d28, d26, d24, d22, d20, d4, d0
-    DIFF_LUMA_EQ4_P2P1P0		d31, d29, d27, d25, d23, d21, d5, d1
-
-    vbsl.u8	q6, q2, q12
-    DIFF_LUMA_EQ4_MASK	q15, q13, q7, q4
-
-    vbsl.u8	q7, q0, q14
-
-    vmov		q5, q6
-    vmov		q2, q9
-    vmov		q6, q4
-    vmov		q4, q8
-
-    vswp	d8, d6
-    vswp	d5, d7
-    vswp	d5, d8
-    vswp	d14, d12
-    vswp	d11, d13
-    vswp	d11, d14
-
-    sub		r3, r0, #3
-    STORE_LUMA_DATA_3		d4,d5,d6,d10,d11,d12,0
-    STORE_LUMA_DATA_3		d4,d5,d6,d10,d11,d12,1
-    STORE_LUMA_DATA_3		d4,d5,d6,d10,d11,d12,2
-    STORE_LUMA_DATA_3		d4,d5,d6,d10,d11,d12,3
-    STORE_LUMA_DATA_3		d4,d5,d6,d10,d11,d12,4
-    STORE_LUMA_DATA_3		d4,d5,d6,d10,d11,d12,5
-    STORE_LUMA_DATA_3		d4,d5,d6,d10,d11,d12,6
-    STORE_LUMA_DATA_3		d4,d5,d6,d10,d11,d12,7
-
-    STORE_LUMA_DATA_3		d7,d8,d9,d13,d14,d15,0
-    STORE_LUMA_DATA_3		d7,d8,d9,d13,d14,d15,1
-    STORE_LUMA_DATA_3		d7,d8,d9,d13,d14,d15,2
-    STORE_LUMA_DATA_3		d7,d8,d9,d13,d14,d15,3
-    STORE_LUMA_DATA_3		d7,d8,d9,d13,d14,d15,4
-    STORE_LUMA_DATA_3		d7,d8,d9,d13,d14,d15,5
-    STORE_LUMA_DATA_3		d7,d8,d9,d13,d14,d15,6
-    STORE_LUMA_DATA_3		d7,d8,d9,d13,d14,d15,7
-
-    vpop	{q4-q7}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN DeblockChromaLt4V_neon
-    vdup.u8	q11, r3
-    ldr			r3, [sp, #0]
-
-    sub			r0, r0, r2	, lsl #1
-    sub			r1, r1, r2, lsl #1
-    vdup.u8	    q9, r3
-    ldr			r3, [sp, #4]
-
-    vld1.u8	{d0}, [r0], r2
-    vld1.u8	{d1}, [r1], r2
-    vld1.u8	{d2}, [r0], r2
-    vld1.u8	{d3}, [r1], r2
-    vld1.u8	{d4}, [r0], r2
-    vld1.u8	{d5}, [r1], r2
-    vld1.u8	{d6}, [r0]
-    vld1.u8	{d7}, [r1]
-
-    sub			r0, r0, r2, lsl #1
-    sub			r1, r1, r2, lsl #1
-
-    vld1.s8	{d31}, [r3]
-    vmovl.u8	q14,d31
-    vshl.u64	d29,d28,#8
-    vorr		d28,d29
-    vmov		d29, d28
-    veor		q15, q15
-    vsub.i8	q15,q15,q14
-
-    MASK_MATRIX	q0, q1, q2, q3, q11, q9, q10
-
-    DIFF_LUMA_LT4_P0_Q0	d0, d2, d4, d6, d16, q12, q13
-    DIFF_LUMA_LT4_P0_Q0	d1, d3, d5, d7, d17, q12, q13
-    vmax.s8	q8, q8, q15
-    vmin.s8	q8, q8, q14
-
-    vand.s8	q8, q8, q10
-    vcge.s8	q14, q14, #0
-    vand.s8	q8, q8, q14
-    EXTRACT_DELTA_INTO_TWO_PART	q8, q10
-    vqadd.u8	q1, q1, q10
-    vqsub.u8	q1, q1, q8
-    vst1.u8	{d2}, [r0], r2
-    vst1.u8	{d3}, [r1], r2
-    vqsub.u8	q2, q2, q10
-    vqadd.u8	q2, q2, q8
-    vst1.u8	{d4}, [r0]
-    vst1.u8	{d5}, [r1]
-
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN DeblockChromaEq4V_neon
-    vpush	{q4-q5}
-
-    vdup.u8	q11, r3
-    ldr			r3, [sp, #32]
-
-    sub			r0, r0, r2	, lsl #1
-    sub			r1, r1, r2, lsl #1
-    vdup.u8	q9, r3
-    vld1.u8	{d0}, [r0], r2		//	q0::p1
-    vld1.u8	{d1}, [r1], r2
-    vld1.u8	{d2}, [r0], r2		//	q1::p0
-    vld1.u8	{d3}, [r1], r2
-    vld1.u8	{d4}, [r0], r2		//	q2::q0
-    vld1.u8	{d5}, [r1], r2
-    vld1.u8	{d6}, [r0]				//	q3::q1
-    vld1.u8	{d7}, [r1]
-
-    sub			r0, r0, r2, lsl #1	//	pix = [-1*src_stride]
-    sub			r1, r1, r2, lsl #1
-
-    MASK_MATRIX	q0, q1, q2, q3, q11, q9, q10
-
-    vmov			q11, q10
-
-    DIFF_CHROMA_EQ4_P0Q0		d0, d2, d4, d6, q4, q5, q8, d30, d0		// Cb::p0' q0'
-    DIFF_CHROMA_EQ4_P0Q0		d1, d3, d5, d7, q12, q13, q14, d31, d1	// Cr::p0' q0'
-
-    vbsl.u8	q10, q15, q1
-    vst1.u8	{d20}, [r0], r2
-    vst1.u8	{d21}, [r1], r2
-
-    vbsl.u8	q11, q0, q2
-    vst1.u8	{d22}, [r0]
-    vst1.u8	{d23}, [r1]
-
-    vpop	{q4-q5}
-WELS_ASM_FUNC_END
-
-WELS_ASM_FUNC_BEGIN DeblockChromaLt4H_neon
-
-    vdup.u8	q11, r3
-    ldr			r3, [sp, #0]
-
-    sub			r0, r0, #2
-    vdup.u8	q9, r3
-    ldr			r3, [sp, #4]
-    sub			r1, r1, #2
-    vld1.s8	{d31}, [r3]
-
-    LOAD_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 0
-    LOAD_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 1
-    LOAD_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 2
-    LOAD_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 3
-    LOAD_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 4
-    LOAD_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 5
-    LOAD_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 6
-    LOAD_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 7
-    vswp		q1, q2
-    vswp		d1, d2
-    vswp		d6, d5
-
-    vmovl.u8	q14, d31
-    vshl.u64	d29,d28,#8
-    vorr		d28,d29
-    vmov		d29, d28
-    veor		q15, q15
-    vsub.i8	q15,q15,q14
-
-    MASK_MATRIX	q0, q1, q2, q3, q11, q9, q10
-
-    DIFF_LUMA_LT4_P0_Q0	d0, d2, d4, d6, d16, q12, q13
-    DIFF_LUMA_LT4_P0_Q0	d1, d3, d5, d7, d17, q12, q13
-    vmax.s8	q8, q8, q15
-    vmin.s8	q8, q8, q14
-
-    vand.s8	q8, q8, q10
-    vcge.s8	q14, q14, #0
-    vand.s8	q8, q8, q14
-    EXTRACT_DELTA_INTO_TWO_PART	q8, q10
-    vqadd.u8	q1, q1, q10
-    vqsub.u8	q1, q1, q8
-    vqsub.u8	q2, q2, q10
-    vqadd.u8	q2, q2, q8
-
-    sub			r0, r0, r2, lsl #3
-    sub			r1, r1, r2, lsl #3
-    vswp		d1, d2
-    vswp		d6, d5
-    vswp		q1, q2
-
-    STORE_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 0
-    STORE_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 1
-    STORE_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 2
-    STORE_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 3
-    STORE_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 4
-    STORE_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 5
-    STORE_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 6
-    STORE_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 7
-
-WELS_ASM_FUNC_END
-
-WELS_ASM_FUNC_BEGIN DeblockChromaEq4H_neon
-    vpush	{q4-q5}
-    vdup.u8	q11, r3
-    ldr			r3, [sp, #32]
-
-    sub			r0, r0, #2
-    sub			r1, r1, #2
-
-    LOAD_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 0
-    LOAD_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 1
-    LOAD_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 2
-    LOAD_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 3
-    LOAD_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 4
-    LOAD_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 5
-    LOAD_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 6
-    LOAD_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 7
-    vswp		q1, q2
-    vswp		d1, d2
-    vswp		d6, d5
-
-    vdup.u8	q9, r3
-    MASK_MATRIX	q0, q1, q2, q3, q11, q9, q10
-    vmov			q11, q10
-
-    DIFF_CHROMA_EQ4_P0Q0		d0, d2, d4, d6, q8, q9, q12, d8, d10
-    DIFF_CHROMA_EQ4_P0Q0		d1, d3, d5, d7, q13, q14, q15, d9, d11
-
-    vbsl.u8	q10, q4, q1
-    vbsl.u8	q11, q5, q2
-    sub			r0, r0, r2, lsl #3	//	pix: 0th row	[-2]
-    sub			r1, r1, r2, lsl #3
-
-    vmov		q1, q10
-    vmov		q2, q11
-    vswp		d1, d2
-    vswp		d6, d5
-    vswp		q1, q2
-    //	Cb:d0d1d2d3, Cr:d4d5d6d7
-    STORE_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 0
-    STORE_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 1
-    STORE_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 2
-    STORE_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 3
-    STORE_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 4
-    STORE_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 5
-    STORE_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 6
-    STORE_CHROMA_DATA_4	d0, d1, d2, d3, d4, d5, d6, d7, 7
-
-    vpop	{q4-q5}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN WelsNonZeroCount_neon
-
-    vld1.64	{d0-d2}, [r0]
-
-    vceq.s8	q0, q0, #0
-    vceq.s8	d2, d2, #0
-    vmvn	q0, q0
-    vmvn	d2, d2
-    vabs.s8	q0, q0
-    vabs.s8	d2, d2
-
-    vst1.64	{d0-d2}, [r0]
-WELS_ASM_FUNC_END
-
-#ifdef __APPLE__
-.macro BS_NZC_CHECK
-    vld1.8   {d0,d1}, [$0]
-    /* Arrenge the input data --- TOP */
-	ands     r6, $1, #2
-	beq      bs_nzc_check_jump0
-
-    sub      r6, $0, $2, lsl #4
-	sub      r6, $2, lsl #3
-    add      r6, #12
-    vld1.32  d3[1], [r6]
-
-bs_nzc_check_jump0:
-    vext.8   q1, q1, q0, #12
-	vadd.u8  $3, q0, q1
-
-
-    /* Arrenge the input data --- LEFT */
-	ands     r6, $1, #1
-	beq      bs_nzc_check_jump1
-
-    sub      r6, $0, #21
-	add      r7, r6, #4
-    vld1.8   d3[4], [r6]
-	add      r6, r7, #4
-    vld1.8   d3[5], [r7]
-	add      r7, r6, #4
-    vld1.8   d3[6], [r6]
-    vld1.8   d3[7], [r7]
-
-bs_nzc_check_jump1:
-	vzip.8   d0, d1
-	vzip.8   d0, d1
-    vext.8   q1, q1, q0, #12
-	vadd.u8  $4, q0, q1
-.endm
-
-.macro BS_COMPARE_MV //in: $0,$1(const),$2(const),$3(const),$4(const); out:$5, $6
-    mov       r6, #4
-    vabd.s16  q8, $0, $1
-    vabd.s16  q9, $1, $2
-	vdup.s16  $0, r6
-    vabd.s16  q10, $2, $3
-    vabd.s16  q11, $3, $4
-
-    vcge.s16  q8, $0
-    vcge.s16  q9, $0
-    vcge.s16  q10, $0
-    vcge.s16  q11, $0
-
-	vpadd.i16 d16, d16, d17
-    vpadd.i16 d17, d18, d19
-    vpadd.i16 d18, d20, d21
-    vpadd.i16 d19, d22, d23
-
-    vaddhn.i16  $5, q8, q8
-    vaddhn.i16  $6, q9, q9
-.endm
-
-.macro BS_MV_CHECK
-    vldm   $0, {q0,q1,q2,q3}
-
-    /* Arrenge the input data --- TOP */
-	ands     r6, $1, #2
-	beq      bs_mv_check_jump0
-
-    sub      r6, $0, $2, lsl #6
-    add      r6, #48
-    vld1.8   {d8, d9}, [r6]
-
-bs_mv_check_jump0:
-    BS_COMPARE_MV  q4, q0, q1, q2, q3, $3, $4
-
-    /* Arrenge the input data --- LEFT */
-	ands     r6, $1, #1
-	beq      bs_mv_check_jump1
-
-    sub      r6, $0, #52
-    add      r7, r6, #16
-	vld1.32   d8[0], [r6]
-	add      r6, r7, #16
-    vld1.32   d8[1], [r7]
-	add      r7, r6, #16
-    vld1.32   d9[0], [r6]
-    vld1.32   d9[1], [r7]
-
-bs_mv_check_jump1:
-	vzip.32   q0, q2
-	vzip.32   q1, q3
-	vzip.32   q0, q1
-    vzip.32   q2, q3
-    BS_COMPARE_MV  q4, q0, q1, q2, q3, $5, $6
-.endm
-#else
-.macro BS_NZC_CHECK  arg0, arg1, arg2, arg3, arg4
-    vld1.8   {d0,d1}, [\arg0]
-    /* Arrenge the input data --- TOP */
-    ands     r6, \arg1, #2
-    beq      bs_nzc_check_jump0
-
-    sub      r6, \arg0, \arg2, lsl #4
-    sub      r6, r6, \arg2, lsl #3
-    add      r6, #12
-    vld1.32  d3[1], [r6]
-
-bs_nzc_check_jump0:
-    vext.8   q1, q1, q0, #12
-    vadd.u8  \arg3, q0, q1
-
-
-    /* Arrenge the input data --- LEFT */
-    ands     r6, \arg1, #1
-    beq      bs_nzc_check_jump1
-
-    sub      r6, \arg0, #21
-    add      r7, r6, #4
-    vld1.8   d3[4], [r6]
-    add      r6, r7, #4
-    vld1.8   d3[5], [r7]
-    add      r7, r6, #4
-    vld1.8   d3[6], [r6]
-    vld1.8   d3[7], [r7]
-
-bs_nzc_check_jump1:
-    vzip.8   d0, d1
-    vzip.8   d0, d1
-    vext.8   q1, q1, q0, #12
-    vadd.u8  \arg4, q0, q1
-.endm
-
-.macro BS_COMPARE_MV  arg0, arg1, arg2, arg3, arg4, arg5, arg6 //in: $0,$1(const),$2(const),$3(const),$4(const); out:$5, $6
-    mov       r6, #4
-    vabd.s16  q8, \arg0, \arg1
-    vabd.s16  q9, \arg1, \arg2
-    vdup.s16  \arg0, r6
-    vabd.s16  q10, \arg2, \arg3
-    vabd.s16  q11, \arg3, \arg4
-
-    vcge.s16  q8, \arg0
-    vcge.s16  q9, \arg0
-    vcge.s16  q10, \arg0
-    vcge.s16  q11, \arg0
-
-    vpadd.i16 d16, d16, d17
-    vpadd.i16 d17, d18, d19
-    vpadd.i16 d18, d20, d21
-    vpadd.i16 d19, d22, d23
-
-    vaddhn.i16  \arg5, q8, q8
-    vaddhn.i16  \arg6, q9, q9
-.endm
-
-.macro BS_MV_CHECK  arg0, arg1, arg2, arg3, arg4, arg5, arg6
-    vldm   \arg0, {q0,q1,q2,q3}
-
-    /* Arrenge the input data --- TOP */
-    ands     r6, \arg1, #2
-    beq      bs_mv_check_jump0
-
-    sub      r6, \arg0, \arg2, lsl #6
-    add      r6, #48
-    vld1.8   {d8, d9}, [r6]
-
-bs_mv_check_jump0:
-    BS_COMPARE_MV  q4, q0, q1, q2, q3, \arg3, \arg4
-
-    /* Arrenge the input data --- LEFT */
-    ands     r6, \arg1, #1
-    beq      bs_mv_check_jump1
-
-    sub      r6, \arg0, #52
-    add      r7, r6, #16
-    vld1.32   d8[0], [r6]
-    add      r6, r7, #16
-    vld1.32   d8[1], [r7]
-    add      r7, r6, #16
-    vld1.32   d9[0], [r6]
-    vld1.32   d9[1], [r7]
-
-bs_mv_check_jump1:
-    vzip.32   q0, q2
-    vzip.32   q1, q3
-    vzip.32   q0, q1
-    vzip.32   q2, q3
-    BS_COMPARE_MV  q4, q0, q1, q2, q3, \arg5, \arg6
-.endm
-#endif
-
-
-WELS_ASM_FUNC_BEGIN DeblockingBSCalcEnc_neon
-
-	stmdb sp!, {r5-r7}
-	vpush {q4}
-
-	ldr  r5, [sp, #28]	//Save BS to r5
-
-	/* Checking the nzc status */
-	BS_NZC_CHECK r0, r2, r3, q14, q15 //q14,q15 save the nzc status
-
-	/* For checking bS[I] = 2 */
-	mov      r6, #2
-	vcgt.s8  q14, q14, #0
-	vdup.u8  q0, r6
-	vcgt.s8  q15, q15, #0
-
-	vand.u8  q14, q14, q0 //q14 save the nzc check result all the time --- for dir is top
-	vand.u8  q15, q15, q0 //q15 save the nzc check result all the time --- for dir is left
-
-	/* Checking the mv status*/
-	BS_MV_CHECK r1, r2, r3, d24, d25, d26, d27//q12, q13 save the mv status
-
-	/* For checking bS[I] = 1 */
-    mov      r6, #1
-	vdup.u8  q0, r6
-
-	vand.u8  q12, q12, q0 //q12 save the nzc check result all the time --- for dir is top
-	vand.u8  q13, q13, q0 //q13 save the nzc check result all the time --- for dir is left
-
-
-	/* Check bS[I] is '1' or '2' */
-	vmax.u8 q1, q12, q14
-	vmax.u8 q0, q13, q15
-
-	//vstm r5, {q0, q1}
-    vst1.32 {q0, q1}, [r5]
-	vpop {q4}
-	ldmia sp!, {r5-r7}
-WELS_ASM_FUNC_END
-#endif
--- a/codec/common/expand_picture.asm
+++ /dev/null
@@ -1,728 +1,0 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  expand_picture.asm
-;*
-;*  Abstract
-;*      mmxext/sse for expand_frame
-;*
-;*  History
-;*      09/25/2009 Created
-;*
-;*
-;*************************************************************************/
-
-%include "asm_inc.asm"
-
-
-
-;***********************************************************************
-; Macros and other preprocessor constants
-;***********************************************************************
-
-;***********************************************************************
-; Code
-;***********************************************************************
-
-
-
-SECTION .text
-
-
-;;;;;;;expanding result;;;;;;;
-
-;aaaa|attttttttttttttttb|bbbb
-;aaaa|attttttttttttttttb|bbbb
-;aaaa|attttttttttttttttb|bbbb
-;aaaa|attttttttttttttttb|bbbb
-;----------------------------
-;aaaa|attttttttttttttttb|bbbb
-;llll|l                r|rrrr
-;llll|l                r|rrrr
-;llll|l                r|rrrr
-;llll|l                r|rrrr
-;llll|l                r|rrrr
-;cccc|ceeeeeeeeeeeeeeeed|dddd
-;----------------------------
-;cccc|ceeeeeeeeeeeeeeeed|dddd
-;cccc|ceeeeeeeeeeeeeeeed|dddd
-;cccc|ceeeeeeeeeeeeeeeed|dddd
-;cccc|ceeeeeeeeeeeeeeeed|dddd
-
-%macro mov_line_8x4_mmx		3	; dst, stride, mm?
-	movq [%1], %3
-	movq [%1+%2], %3
-	lea %1, [%1+2*%2]
-	movq [%1], %3
-	movq [%1+%2], %3
-	lea %1, [%1+2*%2]
-%endmacro
-
-%macro mov_line_end8x4_mmx		3	; dst, stride, mm?
-	movq [%1], %3
-	movq [%1+%2], %3
-	lea %1, [%1+2*%2]
-	movq [%1], %3
-	movq [%1+%2], %3
-	lea %1, [%1+%2]
-%endmacro
-
-%macro mov_line_16x4_sse2	4	; dst, stride, xmm?, u/a
-	movdq%4 [%1], %3 		; top(bottom)_0
-	movdq%4 [%1+%2], %3		; top(bottom)_1
-	lea %1, [%1+2*%2]
-	movdq%4 [%1], %3 		; top(bottom)_2
-	movdq%4 [%1+%2], %3		; top(bottom)_3
-	lea %1, [%1+2*%2]
-%endmacro
-
-%macro mov_line_end16x4_sse2	4	; dst, stride, xmm?, u/a
-	movdq%4 [%1], %3 		; top(bottom)_0
-	movdq%4 [%1+%2], %3		; top(bottom)_1
-	lea %1, [%1+2*%2]
-	movdq%4 [%1], %3 		; top(bottom)_2
-	movdq%4 [%1+%2], %3		; top(bottom)_3
-	lea %1, [%1+%2]
-%endmacro
-
-%macro mov_line_32x4_sse2	3	; dst, stride, xmm?
-	movdqa [%1], %3 		; top(bottom)_0
-	movdqa [%1+16], %3 		; top(bottom)_0
-	movdqa [%1+%2], %3		; top(bottom)_1
-	movdqa [%1+%2+16], %3		; top(bottom)_1
-	lea %1, [%1+2*%2]
-	movdqa [%1], %3 		; top(bottom)_2
-	movdqa [%1+16], %3 		; top(bottom)_2
-	movdqa [%1+%2], %3		; top(bottom)_3
-	movdqa [%1+%2+16], %3		; top(bottom)_3
-	lea %1, [%1+2*%2]
-%endmacro
-
-%macro mov_line_end32x4_sse2	3	; dst, stride, xmm?
-	movdqa [%1], %3 		; top(bottom)_0
-	movdqa [%1+16], %3 		; top(bottom)_0
-	movdqa [%1+%2], %3		; top(bottom)_1
-	movdqa [%1+%2+16], %3		; top(bottom)_1
-	lea %1, [%1+2*%2]
-	movdqa [%1], %3 		; top(bottom)_2
-	movdqa [%1+16], %3 		; top(bottom)_2
-	movdqa [%1+%2], %3		; top(bottom)_3
-	movdqa [%1+%2+16], %3		; top(bottom)_3
-	lea %1, [%1+%2]
-%endmacro
-
-%macro exp_top_bottom_sse2	1	; iPaddingSize [luma(32)/chroma(16)]
-    ;r2 [width/16(8)]
-    ;r0 [pSrc +0], r5 [pSrc -width] r1[-stride], 32(16) ;top
-    ;r3 [pSrc +(h-1)*stride], r4 [pSrc + (h+31)*stride],32(16); bottom
-
-%if %1 == 32		; for luma
-	sar r2, 04h 	; width / 16(8) pixels
-.top_bottom_loops:
-	; top
-	movdqa xmm0, [r0]		; first line of picture pData
-	mov_line_16x4_sse2 r5, r1, xmm0, a	; dst, stride, xmm?
-	mov_line_16x4_sse2 r5, r1, xmm0, a
-	mov_line_16x4_sse2 r5, r1, xmm0, a
-	mov_line_16x4_sse2 r5, r1, xmm0, a
-	mov_line_16x4_sse2 r5, r1, xmm0, a	; dst, stride, xmm?
-	mov_line_16x4_sse2 r5, r1, xmm0, a
-	mov_line_16x4_sse2 r5, r1, xmm0, a
-	mov_line_end16x4_sse2 r5, r1, xmm0, a
-
-	; bottom
-	movdqa xmm1, [r3] 		; last line of picture pData
-	mov_line_16x4_sse2 r4, r1, xmm1, a	; dst, stride, xmm?
-	mov_line_16x4_sse2 r4, r1, xmm1, a
-	mov_line_16x4_sse2 r4, r1, xmm1, a
-	mov_line_16x4_sse2 r4, r1, xmm1, a
-	mov_line_16x4_sse2 r4, r1, xmm1, a	; dst, stride, xmm?
-	mov_line_16x4_sse2 r4, r1, xmm1, a
-	mov_line_16x4_sse2 r4, r1, xmm1, a
-	mov_line_end16x4_sse2 r4, r1, xmm1, a
-
-	lea r0, [r0+16]		; top pSrc
-	lea r5, [r5+16]		; top dst
-	lea r3, [r3+16]		; bottom pSrc
-	lea r4, [r4+16]		; bottom dst
-	neg r1 			; positive/negative stride need for next loop?
-
-	dec r2
-	jnz near .top_bottom_loops
-%elif %1 == 16	; for chroma ??
-	mov r6, r2
-	sar r2, 04h 	; (width / 16) pixels
-.top_bottom_loops:
-	; top
-	movdqa xmm0, [r0]		; first line of picture pData
-	mov_line_16x4_sse2 r5, r1, xmm0, a	; dst, stride, xmm?
-	mov_line_16x4_sse2 r5, r1, xmm0, a
-	mov_line_16x4_sse2 r5, r1, xmm0, a
-	mov_line_end16x4_sse2 r5, r1, xmm0, a
-
-	; bottom
-	movdqa xmm1, [r3] 		; last line of picture pData
-	mov_line_16x4_sse2 r4, r1, xmm1, a	; dst, stride, xmm?
-	mov_line_16x4_sse2 r4, r1, xmm1, a
-	mov_line_16x4_sse2 r4, r1, xmm1, a
-	mov_line_end16x4_sse2 r4, r1, xmm1, a
-
-	lea r0, [r0+16]		; top pSrc
-	lea r5, [r5+16]		; top dst
-	lea r3, [r3+16]		; bottom pSrc
-	lea r4, [r4+16]		; bottom dst
-	neg r1 			; positive/negative stride need for next loop?
-
-	dec r2
-	jnz near .top_bottom_loops
-
-	; for remaining 8 bytes
-	and r6, 0fh		; any 8 bytes left?
-	test r6, r6
-	jz near .to_be_continued	; no left to exit here
-
-	; top
-	movq mm0, [r0]		; remained 8 byte
-	mov_line_8x4_mmx r5, r1, mm0	; dst, stride, mm?
-	mov_line_8x4_mmx r5, r1, mm0	; dst, stride, mm?
-	mov_line_8x4_mmx r5, r1, mm0	; dst, stride, mm?
-	mov_line_end8x4_mmx r5, r1, mm0	; dst, stride, mm?
-	; bottom
-	movq mm1, [r3]
-	mov_line_8x4_mmx r4, r1, mm1	; dst, stride, mm?
-	mov_line_8x4_mmx r4, r1, mm1	; dst, stride, mm?
-	mov_line_8x4_mmx r4, r1, mm1	; dst, stride, mm?
-	mov_line_end8x4_mmx r4, r1, mm1	; dst, stride, mm?
-	WELSEMMS
-
-.to_be_continued:
-%endif
-%endmacro
-
-%macro exp_left_right_sse2	2	; iPaddingSize [luma(32)/chroma(16)], u/a
-    ;r6 [height]
-    ;r0 [pSrc+0]  r5[pSrc-32] r1[stride]
-    ;r3 [pSrc+(w-1)] r4[pSrc+w]
-
-%if %1 == 32		; for luma
-.left_right_loops:
-	; left
-	movzx r2d, byte [r0]		; pixel pData for left border
-	SSE2_Copy16Times	xmm0, r2d				; dst, tmp, pSrc [generic register name: a/b/c/d]
-	movdqa [r5], xmm0
-	movdqa [r5+16], xmm0
-
-	; right
-	movzx r2d, byte [r3]
-	SSE2_Copy16Times	xmm1, r2d				; dst, tmp, pSrc [generic register name: a/b/c/d]
-	movdqa [r4], xmm1
-	movdqa [r4+16], xmm1
-
-	lea r0, [r0+r1]		; left pSrc
-	lea r5, [r5+r1]		; left dst
-	lea r3, [r3+r1]		; right pSrc
-	lea r4, [r4+r1]		; right dst
-
-	dec r6
-	jnz near .left_right_loops
-%elif %1 == 16	; for chroma ??
-.left_right_loops:
-	; left
-	movzx r2d, byte [r0]		; pixel pData for left border
-	SSE2_Copy16Times	xmm0, r2d				; dst, tmp, pSrc [generic register name: a/b/c/d]
-	movdqa [r5], xmm0
-
-	; right
-	movzx r2d, byte [r3]
-	SSE2_Copy16Times	xmm1, r2d				; dst, tmp, pSrc [generic register name: a/b/c/d]
-	movdq%2 [r4], xmm1								; might not be aligned 16 bytes in case chroma planes
-
-	lea r0, [r0+r1]		; left pSrc
-	lea r5, [r5+r1]		; left dst
-	lea r3, [r3+r1]		; right pSrc
-	lea r4, [r4+r1]		; right dst
-
-	dec r6
-	jnz near .left_right_loops
-%endif
-%endmacro
-
-%macro exp_cross_sse2	2	; iPaddingSize [luma(32)/chroma(16)], u/a
-	; top-left: (x)mm3, top-right: (x)mm4, bottom-left: (x)mm5, bottom-right: (x)mm6
-	; edi: TL, ebp: TR, eax: BL, ebx: BR, ecx, -stride
-    ;r3:TL ,r4:TR,r5:BL,r6:BR r1:-stride
-%if %1 == 32		; luma
-	; TL
-	mov_line_32x4_sse2	r3, r1, xmm3	; dst, stride, xmm?
-	mov_line_32x4_sse2	r3, r1, xmm3	; dst, stride, xmm?
-	mov_line_32x4_sse2	r3, r1, xmm3	; dst, stride, xmm?
-	mov_line_32x4_sse2	r3, r1, xmm3	; dst, stride, xmm?
-	mov_line_32x4_sse2	r3, r1, xmm3	; dst, stride, xmm?
-	mov_line_32x4_sse2	r3, r1, xmm3	; dst, stride, xmm?
-	mov_line_32x4_sse2	r3, r1, xmm3	; dst, stride, xmm?
-	mov_line_end32x4_sse2	r3, r1, xmm3	; dst, stride, xmm?
-
-	; TR
-	mov_line_32x4_sse2	r4, r1, xmm4	; dst, stride, xmm?
-	mov_line_32x4_sse2	r4, r1, xmm4	; dst, stride, xmm?
-	mov_line_32x4_sse2	r4, r1, xmm4	; dst, stride, xmm?
-	mov_line_32x4_sse2	r4, r1, xmm4	; dst, stride, xmm?
-	mov_line_32x4_sse2	r4, r1, xmm4	; dst, stride, xmm?
-	mov_line_32x4_sse2	r4, r1, xmm4	; dst, stride, xmm?
-	mov_line_32x4_sse2	r4, r1, xmm4	; dst, stride, xmm?
-	mov_line_end32x4_sse2	r4, r1, xmm4	; dst, stride, xmm?
-
-	; BL
-	mov_line_32x4_sse2	r5, r1, xmm5	; dst, stride, xmm?
-	mov_line_32x4_sse2	r5, r1, xmm5	; dst, stride, xmm?
-	mov_line_32x4_sse2	r5, r1, xmm5	; dst, stride, xmm?
-	mov_line_32x4_sse2	r5, r1, xmm5	; dst, stride, xmm?
-	mov_line_32x4_sse2	r5, r1, xmm5	; dst, stride, xmm?
-	mov_line_32x4_sse2	r5, r1, xmm5	; dst, stride, xmm?
-	mov_line_32x4_sse2	r5, r1, xmm5	; dst, stride, xmm?
-	mov_line_end32x4_sse2	r5, r1, xmm5	; dst, stride, xmm?
-
-	; BR
-	mov_line_32x4_sse2	r6, r1, xmm6	; dst, stride, xmm?
-	mov_line_32x4_sse2	r6, r1, xmm6	; dst, stride, xmm?
-	mov_line_32x4_sse2	r6, r1, xmm6	; dst, stride, xmm?
-	mov_line_32x4_sse2	r6, r1, xmm6	; dst, stride, xmm?
-	mov_line_32x4_sse2	r6, r1, xmm6	; dst, stride, xmm?
-	mov_line_32x4_sse2	r6, r1, xmm6	; dst, stride, xmm?
-	mov_line_32x4_sse2	r6, r1, xmm6	; dst, stride, xmm?
-	mov_line_end32x4_sse2	r6, r1, xmm6	; dst, stride, xmm?
-%elif %1 == 16	; chroma
-	; TL
-	mov_line_16x4_sse2	r3, r1, xmm3, a	; dst, stride, xmm?
-	mov_line_16x4_sse2	r3, r1, xmm3, a	; dst, stride, xmm?
-	mov_line_16x4_sse2	r3, r1, xmm3, a	; dst, stride, xmm?
-	mov_line_end16x4_sse2	r3, r1, xmm3, a	; dst, stride, xmm?
-
-	; TR
-	mov_line_16x4_sse2	r4, r1, xmm4, %2	; dst, stride, xmm?
-	mov_line_16x4_sse2	r4, r1, xmm4, %2	; dst, stride, xmm?
-	mov_line_16x4_sse2	r4, r1, xmm4, %2	; dst, stride, xmm?
-	mov_line_end16x4_sse2 r4, r1, xmm4, %2	; dst, stride, xmm?
-
-	; BL
-	mov_line_16x4_sse2	r5, r1, xmm5, a	; dst, stride, xmm?
-	mov_line_16x4_sse2	r5, r1, xmm5, a	; dst, stride, xmm?
-	mov_line_16x4_sse2	r5, r1, xmm5, a	; dst, stride, xmm?
-	mov_line_end16x4_sse2	r5, r1, xmm5, a	; dst, stride, xmm?
-
-	; BR
-	mov_line_16x4_sse2	r6, r1, xmm6, %2	; dst, stride, xmm?
-	mov_line_16x4_sse2	r6, r1, xmm6, %2	; dst, stride, xmm?
-	mov_line_16x4_sse2	r6, r1, xmm6, %2	; dst, stride, xmm?
-	mov_line_end16x4_sse2	r6, r1, xmm6, %2	; dst, stride, xmm?
-%endif
-%endmacro
-
-;***********************************************************************----------------
-; void ExpandPictureLuma_sse2(	uint8_t *pDst,
-;									const int32_t iStride,
-;									const int32_t iWidth,
-;									const int32_t iHeight	);
-;***********************************************************************----------------
-WELS_EXTERN ExpandPictureLuma_sse2
-
-    push r4
-    push r5
-    push r6
-
-    %assign push_num 3
-    LOAD_4_PARA
-    PUSH_XMM 7
-
-    SIGN_EXTENSION r1, r1d
-    SIGN_EXTENSION r2, r2d
-    SIGN_EXTENSION r3, r3d
-
-    ;also prepare for cross border pData top-left:xmm3
-
-    movzx r6d,byte[r0]
-    SSE2_Copy16Times xmm3,r6d         ;xmm3: pSrc[0]
-
-    neg r1
-    lea r5,[r0+r1]              ;last line of top border r5= dst top  pSrc[-stride]
-    neg r1
-
-    push r3
-
-
-    dec r3                      ;h-1
-    imul r3,r1                  ;(h-1)*stride
-    lea  r3,[r0+r3]             ;pSrc[(h-1)*stride]  r3 = src bottom
-
-    mov r6,r1                    ;r6 = stride
-    sal r6,05h                   ;r6 = 32*stride
-    lea r4,[r3+r6]               ;r4 = dst bottom
-
-    ;also prepare for cross border data: bottom-left with xmm5,bottom-right xmm6
-
-    movzx r6d,byte [r3]             ;bottom-left
-    SSE2_Copy16Times xmm5,r6d
-
-    lea r6,[r3+r2-1]
-    movzx r6d,byte [r6]
-    SSE2_Copy16Times xmm6,r6d ;bottom-right
-
-    neg r1  ;r1 = -stride
-
-    push r0
-    push r1
-    push r2
-
-    exp_top_bottom_sse2 32
-
-	; for both left and right border
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-    pop r2
-    pop r1
-    pop r0
-
-    lea r5,[r0-32]                          ;left border dst  luma =32 chroma = -16
-
-    lea r3,[r0+r2-1]                        ;right border src
-    lea r4,[r3+1]                           ;right border dst
-
-    ;prepare for cross border data: top-rigth with xmm4
-     movzx r6d,byte [r3]                         ;top -rigth
-     SSE2_Copy16Times xmm4,r6d
-
-    neg r1   ;r1 = stride
-
-
-    pop r6  ;  r6 = height
-
-
-
-    push r0
-    push r1
-    push r2
-    push r6
-
-    exp_left_right_sse2  32,a
-
-    pop r6
-    pop r2
-    pop r1
-    pop r0
-
-	; for cross border [top-left, top-right, bottom-left, bottom-right]
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-    ; have done xmm3,..,xmm6 cross pData initialization above, perform pading as below, To be continued..
-
-    neg r1  ;r1 = -stride
-    lea r3,[r0-32]
-    lea r3,[r3+r1]    ;last line of top-left border
-
-    lea r4,[r0+r2]    ;psrc +width
-    lea r4,[r4+r1]    ;psrc +width -stride
-
-
-    neg r1  ;r1 = stride
-    add r6,32         ;height +32(16) ,luma = 32, chroma = 16
-    imul r6,r1
-
-    lea r5,[r3+r6]    ;last line of bottom-left border
-    lea r6,[r4+r6]    ;last line of botoom-right border
-
-    neg r1 ; r1 = -stride
-
-    ; for left & right border expanding
-    exp_cross_sse2 32,a
-
-    POP_XMM
-    LOAD_4_PARA_POP
-
-    pop r6
-    pop r5
-    pop r4
-
-    %assign push_num 0
-
-
-	ret
-
-;***********************************************************************----------------
-; void ExpandPictureChromaAlign_sse2(	uint8_t *pDst,
-;										const int32_t iStride,
-;										const int32_t iWidth,
-;										const int32_t iHeight	);
-;***********************************************************************----------------
-WELS_EXTERN ExpandPictureChromaAlign_sse2
-
-    push r4
-    push r5
-    push r6
-
-    %assign push_num 3
-    LOAD_4_PARA
-    PUSH_XMM 7
-
-    SIGN_EXTENSION r1,r1d
-    SIGN_EXTENSION r2,r2d
-    SIGN_EXTENSION r3,r3d
-
-    ;also prepare for cross border pData top-left:xmm3
-
-    movzx r6d,byte [r0]
-    SSE2_Copy16Times xmm3,r6d         ;xmm3: pSrc[0]
-
-    neg r1
-    lea r5,[r0+r1]              ;last line of top border r5= dst top  pSrc[-stride]
-    neg r1
-
-    push r3
-
-
-    dec r3                      ;h-1
-    imul r3,r1                  ;(h-1)*stride
-    lea  r3,[r0+r3]             ;pSrc[(h-1)*stride]  r3 = src bottom
-
-    mov r6,r1                    ;r6 = stride
-    sal r6,04h                   ;r6 = 32*stride
-    lea r4,[r3+r6]               ;r4 = dst bottom
-
-    ;also prepare for cross border data: bottom-left with xmm5,bottom-right xmm6
-
-    movzx r6d,byte [r3]             ;bottom-left
-    SSE2_Copy16Times xmm5,r6d
-
-    lea r6,[r3+r2-1]
-    movzx r6d,byte [r6]
-    SSE2_Copy16Times xmm6,r6d ;bottom-right
-
-    neg r1  ;r1 = -stride
-
-    push r0
-    push r1
-    push r2
-
-    exp_top_bottom_sse2 16
-
-	; for both left and right border
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-    pop r2
-    pop r1
-    pop r0
-
-    lea r5,[r0-16]                          ;left border dst  luma =32 chroma = -16
-
-    lea r3,[r0+r2-1]                        ;right border src
-    lea r4,[r3+1]                           ;right border dst
-
-    ;prepare for cross border data: top-rigth with xmm4
-    movzx r6d,byte [r3]                         ;top -rigth
-    SSE2_Copy16Times xmm4,r6d
-
-    neg r1   ;r1 = stride
-
-
-    pop r6  ;  r6 = height
-
-
-
-    push r0
-    push r1
-    push r2
-	push r6
-    exp_left_right_sse2 16,a
-
-    pop r6
-    pop r2
-    pop r1
-    pop r0
-
-	; for cross border [top-left, top-right, bottom-left, bottom-right]
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-    ; have done xmm3,..,xmm6 cross pData initialization above, perform pading as below, To be continued..
-
-    neg r1  ;r1 = -stride
-    lea r3,[r0-16]
-    lea r3,[r3+r1]    ;last line of top-left border
-
-    lea r4,[r0+r2]    ;psrc +width
-    lea r4,[r4+r1]    ;psrc +width -stride
-
-
-    neg r1  ;r1 = stride
-    add r6,16         ;height +32(16) ,luma = 32, chroma = 16
-    imul r6,r1
-
-    lea r5,[r3+r6]    ;last line of bottom-left border
-    lea r6,[r4+r6]    ;last line of botoom-right border
-
-    neg r1 ; r1 = -stride
-
-    ; for left & right border expanding
-    exp_cross_sse2 16,a
-
-    POP_XMM
-    LOAD_4_PARA_POP
-
-    pop r6
-    pop r5
-    pop r4
-
-    %assign push_num 0
-
-
-	ret
-
-;***********************************************************************----------------
-; void ExpandPictureChromaUnalign_sse2(	uint8_t *pDst,
-;										const int32_t iStride,
-;										const int32_t iWidth,
-;										const int32_t iHeight	);
-;***********************************************************************----------------
-WELS_EXTERN ExpandPictureChromaUnalign_sse2
-	push r4
-    push r5
-    push r6
-
-    %assign push_num 3
-    LOAD_4_PARA
-    PUSH_XMM 7
-
-    SIGN_EXTENSION r1,r1d
-    SIGN_EXTENSION r2,r2d
-    SIGN_EXTENSION r3,r3d
-
-    ;also prepare for cross border pData top-left:xmm3
-
-    movzx r6d,byte [r0]
-    SSE2_Copy16Times xmm3,r6d         ;xmm3: pSrc[0]
-
-    neg r1
-    lea r5,[r0+r1]              ;last line of top border r5= dst top  pSrc[-stride]
-    neg r1
-
-    push r3
-
-
-    dec r3                      ;h-1
-    imul r3,r1                  ;(h-1)*stride
-    lea  r3,[r0+r3]             ;pSrc[(h-1)*stride]  r3 = src bottom
-
-    mov r6,r1                    ;r6 = stride
-    sal r6,04h                   ;r6 = 32*stride
-    lea r4,[r3+r6]               ;r4 = dst bottom
-
-    ;also prepare for cross border data: bottom-left with xmm5,bottom-right xmm6
-
-    movzx r6d,byte [r3]             ;bottom-left
-    SSE2_Copy16Times xmm5,r6d
-
-    lea r6,[r3+r2-1]
-    movzx r6d,byte [r6]
-    SSE2_Copy16Times xmm6,r6d ;bottom-right
-
-    neg r1  ;r1 = -stride
-
-    push r0
-    push r1
-    push r2
-
-    exp_top_bottom_sse2 16
-
-	; for both left and right border
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-    pop r2
-    pop r1
-    pop r0
-
-    lea r5,[r0-16]                          ;left border dst  luma =32 chroma = -16
-
-    lea r3,[r0+r2-1]                        ;right border src
-    lea r4,[r3+1]                           ;right border dst
-
-    ;prepare for cross border data: top-rigth with xmm4
-    movzx r6d,byte [r3]                         ;top -rigth
-    SSE2_Copy16Times xmm4,r6d
-
-    neg r1   ;r1 = stride
-
-
-    pop r6  ;  r6 = height
-
-
-
-    push r0
-    push r1
-    push r2
-	push r6
-    exp_left_right_sse2 16,u
-
-    pop r6
-    pop r2
-    pop r1
-    pop r0
-
-	; for cross border [top-left, top-right, bottom-left, bottom-right]
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-    ; have done xmm3,..,xmm6 cross pData initialization above, perform pading as below, To be continued..
-
-    neg r1  ;r1 = -stride
-    lea r3,[r0-16]
-    lea r3,[r3+r1]    ;last line of top-left border
-
-    lea r4,[r0+r2]    ;psrc +width
-    lea r4,[r4+r1]    ;psrc +width -stride
-
-
-    neg r1  ;r1 = stride
-    add r6,16         ;height +32(16) ,luma = 32, chroma = 16
-    imul r6,r1
-
-    lea r5,[r3+r6]    ;last line of bottom-left border
-    lea r6,[r4+r6]    ;last line of botoom-right border
-
-    neg r1 ; r1 = -stride
-
-    ; for left & right border expanding
-    exp_cross_sse2 16,u
-
-    POP_XMM
-    LOAD_4_PARA_POP
-
-    pop r6
-    pop r5
-    pop r4
-
-    %assign push_num 0
-
-
-	ret
--- a/codec/common/expand_picture_common.h
+++ /dev/null
@@ -1,72 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2009-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * \file		expand_pic.h
- *
- * \brief		Interface for expanding reconstructed picture to be used for reference
- *
- * \date		06/08/2009
- *************************************************************************************
- */
-
-#ifndef EXPAND_PICTURE_COMMON_H
-#define EXPAND_PICTURE_COMMON_H
-
-#include "typedefs.h"
-
-#if defined(__cplusplus)
-extern "C" {
-#endif//__cplusplus
-
-#if defined(X86_ASM)
-void ExpandPictureLuma_sse2 (uint8_t* pDst,
-                             const int32_t kiStride,
-                             const int32_t kiPicW,
-                             const int32_t kiPicH);
-void ExpandPictureChromaAlign_sse2 (uint8_t* pDst,
-                                    const int32_t kiStride,
-                                    const int32_t kiPicW,
-                                    const int32_t kiPicH);
-void ExpandPictureChromaUnalign_sse2 (uint8_t* pDst,
-                                      const int32_t kiStride,
-                                      const int32_t kiPicW,
-                                      const int32_t kiPicH);
-#endif//X86_ASM
-
-#if defined(HAVE_NEON)
-void ExpandPictureLuma_neon(uint8_t *pDst, const int32_t kiStride, const int32_t kiPicW, const int32_t kiPicH);
-void ExpandPictureChroma_neon(uint8_t *pDst, const int32_t kiStride, const int32_t kiPicW, const int32_t kiPicH);
-#endif
-#if defined(__cplusplus)
-}
-#endif//__cplusplus
-
-#endif
--- a/codec/common/expand_picture_neon.S
+++ /dev/null
@@ -1,137 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#ifdef HAVE_NEON
-.text
-#include "arm_arch_common_macro.S"
-
-
-WELS_ASM_FUNC_BEGIN ExpandPictureLuma_neon
-    stmdb sp!, {r4-r8}
-	//Save the dst
-	mov r7, r0
-	mov r8, r3
-
-	add r4, r7, r2
-	sub r4, #1
-    //For the left and right expand
-_expand_picture_luma_loop2:
-	sub r5, r7, #32
-	add r6, r4, #1
-
-	vld1.8 {d0[], d1[]}, [r7], r1
-	vld1.8 {d2[], d3[]}, [r4], r1
-
-	vst1.8 {q0}, [r5]!
-	vst1.8 {q0}, [r5]
-	vst1.8 {q1}, [r6]!
-	vst1.8 {q1}, [r6]
-	subs r8, #1
-	bne	_expand_picture_luma_loop2
-
-	//for the top and bottom expand
-	add r2, #64
-	sub r0, #32
-	mla r4, r1, r3, r0
-	sub r4, r1
-_expand_picture_luma_loop0:
-	mov r5, #32
-    mls r5, r5, r1, r0
-	add r6, r4, r1
-	vld1.8 {q0}, [r0]!
-	vld1.8 {q1}, [r4]!
-
-	mov r8, #32
-_expand_picture_luma_loop1:
-	vst1.8 {q0}, [r5], r1
-	vst1.8 {q1}, [r6], r1
-	subs r8, #1
-    bne _expand_picture_luma_loop1
-
-	subs r2, #16
-	bne	_expand_picture_luma_loop0
-
-    //vldreq.32 d0, [r0]
-
-	ldmia sp!, {r4-r8}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN ExpandPictureChroma_neon
-    stmdb sp!, {r4-r8}
-	//Save the dst
-	mov r7, r0
-	mov r8, r3
-
-	add r4, r7, r2
-	sub r4, #1
-    //For the left and right expand
-_expand_picture_chroma_loop2:
-	sub r5, r7, #16
-	add r6, r4, #1
-
-	vld1.8 {d0[], d1[]}, [r7], r1
-	vld1.8 {d2[], d3[]}, [r4], r1
-
-	vst1.8 {q0}, [r5]
-	vst1.8 {q1}, [r6]
-	subs r8, #1
-	bne	_expand_picture_chroma_loop2
-
-	//for the top and bottom expand
-	add r2, #32
-	sub r0, #16
-	mla r4, r1, r3, r0
-	sub r4, r1
-_expand_picture_chroma_loop0:
-	mov r5, #16
-    mls r5, r5, r1, r0
-	add r6, r4, r1
-	vld1.8 {q0}, [r0]!
-	vld1.8 {q1}, [r4]!
-
-	mov r8, #16
-_expand_picture_chroma_loop1:
-	vst1.8 {q0}, [r5], r1
-	vst1.8 {q1}, [r6], r1
-	subs r8, #1
-    bne _expand_picture_chroma_loop1
-
-	subs r2, #16
-	bne	_expand_picture_chroma_loop0
-
-    //vldreq.32 d0, [r0]
-
-	ldmia sp!, {r4-r8}
-WELS_ASM_FUNC_END
-
-#endif
--- /dev/null
+++ b/codec/common/inc/WelsThreadLib.h
@@ -1,0 +1,132 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	WelsThreadLib.h
+ *
+ * \brief	Interfaces introduced in thread programming
+ *
+ * \date	11/17/2009 Created
+ *
+ *************************************************************************************
+ */
+
+#ifndef   _WELS_THREAD_API_H_
+#define   _WELS_THREAD_API_H_
+
+#include "typedefs.h"
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+#if defined(_WIN32)
+
+#include <windows.h>
+
+typedef    HANDLE                    WELS_THREAD_HANDLE;
+typedef    LPTHREAD_START_ROUTINE    LPWELS_THREAD_ROUTINE;
+
+typedef    CRITICAL_SECTION          WELS_MUTEX;
+typedef    HANDLE                    WELS_EVENT;
+
+#define    WELS_THREAD_ROUTINE_TYPE         DWORD  WINAPI
+#define    WELS_THREAD_ROUTINE_RETURN(rc)   return (DWORD)rc;
+
+#else	// NON-WINDOWS
+
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <pthread.h>
+#include <semaphore.h>
+#include <signal.h>
+#include <errno.h>
+#include <time.h>
+#include <sys/time.h>
+
+#include <sys/stat.h>
+#include <fcntl.h>
+
+typedef   pthread_t    WELS_THREAD_HANDLE;
+typedef  void* (*LPWELS_THREAD_ROUTINE) (void*);
+
+typedef   pthread_mutex_t           WELS_MUTEX;
+typedef   sem_t*                    WELS_EVENT;
+
+#define   WELS_THREAD_ROUTINE_TYPE         void *
+#define   WELS_THREAD_ROUTINE_RETURN(rc)   return (void*)(intptr_t)rc;
+
+#endif//_WIN32
+
+typedef    int32_t        WELS_THREAD_ERROR_CODE;
+typedef    int32_t        WELS_THREAD_ATTR;
+
+typedef  struct _WelsLogicalProcessorInfo {
+  int32_t    ProcessorCount;
+} WelsLogicalProcessInfo;
+
+#define    WELS_THREAD_ERROR_OK					0
+#define    WELS_THREAD_ERROR_GENERAL			((uint32_t)(-1))
+#define    WELS_THREAD_ERROR_WAIT_OBJECT_0		0
+#define	   WELS_THREAD_ERROR_WAIT_TIMEOUT		((uint32_t)0x00000102L)
+#define	   WELS_THREAD_ERROR_WAIT_FAILED		WELS_THREAD_ERROR_GENERAL
+
+void WelsSleep (uint32_t dwMilliseconds);
+WELS_THREAD_ERROR_CODE    WelsMutexInit (WELS_MUTEX*    mutex);
+WELS_THREAD_ERROR_CODE    WelsMutexLock (WELS_MUTEX*    mutex);
+WELS_THREAD_ERROR_CODE    WelsMutexUnlock (WELS_MUTEX* mutex);
+WELS_THREAD_ERROR_CODE    WelsMutexDestroy (WELS_MUTEX* mutex);
+
+WELS_THREAD_ERROR_CODE    WelsEventOpen (WELS_EVENT* p_event, const char* event_name);
+WELS_THREAD_ERROR_CODE    WelsEventClose (WELS_EVENT* event, const char* event_name);
+WELS_THREAD_ERROR_CODE    WelsEventSignal (WELS_EVENT* event);
+WELS_THREAD_ERROR_CODE    WelsEventWait (WELS_EVENT* event);
+WELS_THREAD_ERROR_CODE    WelsEventWaitWithTimeOut (WELS_EVENT* event, uint32_t dwMilliseconds);
+WELS_THREAD_ERROR_CODE    WelsMultipleEventsWaitSingleBlocking (uint32_t nCount, WELS_EVENT* event_list,
+    WELS_EVENT* master_event = NULL);
+WELS_THREAD_ERROR_CODE    WelsMultipleEventsWaitAllBlocking (uint32_t nCount, WELS_EVENT* event_list,
+    WELS_EVENT* master_event = NULL);
+
+WELS_THREAD_ERROR_CODE    WelsThreadCreate (WELS_THREAD_HANDLE* thread,  LPWELS_THREAD_ROUTINE  routine,
+    void* arg, WELS_THREAD_ATTR attr);
+
+WELS_THREAD_ERROR_CODE    WelsThreadJoin (WELS_THREAD_HANDLE  thread);
+
+WELS_THREAD_HANDLE        WelsThreadSelf();
+
+WELS_THREAD_ERROR_CODE    WelsQueryLogicalProcessInfo (WelsLogicalProcessInfo* pInfo);
+
+
+#ifdef  __cplusplus
+}
+#endif
+
+#endif
--- /dev/null
+++ b/codec/common/inc/cpu.h
@@ -1,0 +1,80 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	cpu.h
+ *
+ * \brief	CPU feature compatibility detection
+ *
+ * \date	04/29/2009 Created
+ *
+ *************************************************************************************
+ */
+#if !defined(WELS_CPU_DETECTION_H__)
+#define WELS_CPU_DETECTION_H__
+
+#include "typedefs.h"
+#include "cpu_core.h"
+
+
+#if defined(__cplusplus)
+extern "C" {
+#endif//__cplusplus
+
+#if defined(X86_ASM)
+/*
+ *	cpuid support verify routine
+ *  return 0 if cpuid is not supported by cpu
+ */
+int32_t  WelsCPUIdVerify();
+
+void WelsCPUId (uint32_t uiIndex, uint32_t* pFeatureA, uint32_t* pFeatureB, uint32_t* pFeatureC, uint32_t* pFeatureD);
+
+int32_t WelsCPUSupportAVX (uint32_t eax, uint32_t ecx);
+int32_t WelsCPUSupportFMA (uint32_t eax, uint32_t ecx);
+
+void WelsEmms();
+
+/*
+ *	clear FPU registers states for potential float based calculation if support
+ */
+void     WelsCPURestore (const uint32_t kuiCPU);
+
+#else
+#define WelsEmms()
+#endif
+
+uint32_t WelsCPUFeatureDetect (int32_t* pNumberOfLogicProcessors);
+
+#if defined(__cplusplus)
+}
+#endif//__cplusplus
+
+#endif//WELS_CPU_DETECTION_H__
--- /dev/null
+++ b/codec/common/inc/cpu_core.h
@@ -1,0 +1,85 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	cpu_core.h
+ *
+ * \brief	cpu core feature detection
+ *
+ * \date	4/24/2009 Created
+ *
+ *************************************************************************************
+ */
+#if !defined(WELS_CPU_CORE_FEATURE_DETECTION_H__)
+#define WELS_CPU_CORE_FEATURE_DETECTION_H__
+
+/*
+ *	WELS CPU feature flags
+ */
+#define WELS_CPU_MMX        0x00000001    /* mmx */
+#define WELS_CPU_MMXEXT     0x00000002    /* mmx-ext*/
+#define WELS_CPU_SSE        0x00000004    /* sse */
+#define WELS_CPU_SSE2       0x00000008    /* sse 2 */
+#define WELS_CPU_SSE3       0x00000010    /* sse 3 */
+#define WELS_CPU_SSE41      0x00000020    /* sse 4.1 */
+#define WELS_CPU_3DNOW      0x00000040    /* 3dnow! */
+#define WELS_CPU_3DNOWEXT   0x00000080    /* 3dnow! ext */
+#define WELS_CPU_ALTIVEC    0x00000100    /* altivec */
+#define WELS_CPU_SSSE3      0x00000200    /* ssse3 */
+#define WELS_CPU_SSE42      0x00000400    /* sse 4.2 */
+
+/* CPU features application extensive */
+#define WELS_CPU_AVX		0x00000800	/* Advanced Vector eXtentions */
+#define WELS_CPU_FPU		0x00001000	/* x87-FPU on chip */
+#define WELS_CPU_HTT		0x00002000	/* Hyper-Threading Technology (HTT), Multi-threading enabled feature:
+										   physical processor package is capable of supporting more than one logic processor
+										*/
+#define WELS_CPU_CMOV		0x00004000	/* Conditional Move Instructions,
+										   also if x87-FPU is present at indicated by the CPUID.FPU feature bit, then FCOMI and FCMOV are supported
+										*/
+#define WELS_CPU_MOVBE		0x00008000	/* MOVBE instruction */
+#define WELS_CPU_AES		0x00010000	/* AES instruction extensions */
+#define WELS_CPU_FMA		0x00020000	/* AVX VEX FMA instruction sets */
+
+#define WELS_CPU_CACHELINE_16    0x10000000    /* CacheLine Size 16 */
+#define WELS_CPU_CACHELINE_32    0x20000000    /* CacheLine Size 32 */
+#define WELS_CPU_CACHELINE_64    0x40000000    /* CacheLine Size 64 */
+#define WELS_CPU_CACHELINE_128   0x80000000    /* CacheLine Size 128 */
+
+/* For the android OS */
+#define WELS_CPU_ARMv7      0x000001    /* ARMv7 */
+#define WELS_CPU_VFPv3      0x000002    /* VFPv3 */
+#define WELS_CPU_NEON       0x000004    /* NEON */
+
+/*
+ *	Interfaces for CPU core feature detection as below
+ */
+
+#endif//WELS_CPU_CORE_FEATURE_DETECTION_H__
--- /dev/null
+++ b/codec/common/inc/crt_util_safe_x.h
@@ -1,0 +1,99 @@
+/*!
+ * \copy
+ *     Copyright (c)  2010-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	crt_util_safe_x.h
+ *
+ * \brief	Safe CRT like util for cross platfroms support
+ *
+ * \date	06/04/2010 Created
+ *
+ *************************************************************************************
+ */
+#ifndef WELS_CRT_UTIL_SAFE_CROSS_PLATFORMS_H__
+#define WELS_CRT_UTIL_SAFE_CROSS_PLATFORMS_H__
+
+#include <string.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <math.h>
+#include <time.h>
+
+#if defined(_WIN32)
+#include <windows.h>
+#include <sys/types.h>
+#include <sys/timeb.h>
+#else
+#include <sys/timeb.h>
+#include <sys/time.h>
+#include "typedefs.h"
+#endif//_WIN32
+
+#include "typedefs.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define     WELS_FILE_SEEK_SET           SEEK_SET
+#define     WELS_FILE_SEEK_CUR           SEEK_CUR
+#define     WESL_FILE_SEEK_END           SEEK_END
+
+typedef      FILE  WelsFileHandle;
+
+#ifdef _WIN32
+typedef      struct _timeb     SWelsTime;
+#else
+typedef      struct timeb      SWelsTime;
+#endif
+
+int32_t   WelsSnprintf (char* buffer,  int32_t sizeOfBuffer,  const char* format, ...);
+char*   WelsStrncpy (char* dest, int32_t sizeInBytes, const char* src);
+char*   WelsStrcat (char* dest, int32_t sizeInBytes, const char* src);
+int32_t   WelsVsnprintf (char* buffer, int32_t sizeOfBuffer, const char* format, va_list argptr);
+
+WelsFileHandle*        WelsFopen (const char* filename,  const char* mode);
+int32_t                WelsFclose (WelsFileHandle*   fp);
+int32_t                WelsFread (void* buffer, int32_t size, int32_t count, WelsFileHandle* fp);
+int32_t                WelsFwrite (const void* buffer, int32_t size, int32_t count, WelsFileHandle* fp);
+int32_t                WelsFseek (WelsFileHandle* fp, int32_t offset, int32_t origin);
+int32_t                WelsFflush (WelsFileHandle* fp);
+
+int32_t                WelsGetTimeOfDay (SWelsTime* tp);
+int32_t                WelsStrftime (char* buffer, int32_t size, const char* format, const SWelsTime* tp);
+uint16_t               WelsGetMillisecond (const SWelsTime* tp);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif//WELS_CRT_UTIL_SAFE_CROSS_PLATFORMS_H__
--- /dev/null
+++ b/codec/common/inc/deblocking_common.h
@@ -1,0 +1,55 @@
+#ifndef WELS_DEBLOCKING_COMMON_H__
+#define WELS_DEBLOCKING_COMMON_H__
+#include "typedefs.h"
+void DeblockLumaLt4V_c (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc);
+void DeblockLumaEq4V_c (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+
+void DeblockLumaLt4H_c (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc);
+void DeblockLumaEq4H_c (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+
+void DeblockChromaLt4V_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
+                            int8_t* pTc);
+void DeblockChromaEq4V_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+
+void DeblockChromaLt4H_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
+                            int8_t* pTc);
+void DeblockChromaEq4H_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+
+#if defined(__cplusplus)
+extern "C" {
+#endif//__cplusplus
+
+#ifdef  X86_ASM
+void DeblockLumaLt4V_ssse3 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc);
+void DeblockLumaEq4V_ssse3 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+void DeblockLumaTransposeH2V_sse2 (uint8_t* pPixY, int32_t iStride, uint8_t* pDst);
+void DeblockLumaTransposeV2H_sse2 (uint8_t* pPixY, int32_t iStride, uint8_t* pSrc);
+void DeblockLumaLt4H_ssse3 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc);
+void DeblockLumaEq4H_ssse3 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+void DeblockChromaEq4V_ssse3 (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+void DeblockChromaLt4V_ssse3 (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
+                             int8_t* pTC);
+void DeblockChromaEq4H_ssse3 (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+void DeblockChromaLt4H_ssse3 (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
+                             int8_t* pTC);
+#endif
+
+#if defined(HAVE_NEON)
+void DeblockLumaLt4V_neon(uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc);
+void DeblockLumaEq4V_neon(uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+
+void DeblockLumaLt4H_neon(uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc);
+void DeblockLumaEq4H_neon(uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+
+void DeblockChromaLt4V_neon(uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTC);
+void DeblockChromaEq4V_neon(uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+
+void DeblockChromaLt4H_neon(uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTC);
+void DeblockChromaEq4H_neon(uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
+#endif
+
+#if defined(__cplusplus)
+}
+#endif//__cplusplus
+
+#endif //WELS_DEBLOCKING_COMMON_H__
--- /dev/null
+++ b/codec/common/inc/expand_picture_common.h
@@ -1,0 +1,72 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file		expand_pic.h
+ *
+ * \brief		Interface for expanding reconstructed picture to be used for reference
+ *
+ * \date		06/08/2009
+ *************************************************************************************
+ */
+
+#ifndef EXPAND_PICTURE_COMMON_H
+#define EXPAND_PICTURE_COMMON_H
+
+#include "typedefs.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif//__cplusplus
+
+#if defined(X86_ASM)
+void ExpandPictureLuma_sse2 (uint8_t* pDst,
+                             const int32_t kiStride,
+                             const int32_t kiPicW,
+                             const int32_t kiPicH);
+void ExpandPictureChromaAlign_sse2 (uint8_t* pDst,
+                                    const int32_t kiStride,
+                                    const int32_t kiPicW,
+                                    const int32_t kiPicH);
+void ExpandPictureChromaUnalign_sse2 (uint8_t* pDst,
+                                      const int32_t kiStride,
+                                      const int32_t kiPicW,
+                                      const int32_t kiPicH);
+#endif//X86_ASM
+
+#if defined(HAVE_NEON)
+void ExpandPictureLuma_neon(uint8_t *pDst, const int32_t kiStride, const int32_t kiPicW, const int32_t kiPicH);
+void ExpandPictureChroma_neon(uint8_t *pDst, const int32_t kiStride, const int32_t kiPicW, const int32_t kiPicH);
+#endif
+#if defined(__cplusplus)
+}
+#endif//__cplusplus
+
+#endif
--- /dev/null
+++ b/codec/common/inc/logging.h
@@ -1,0 +1,54 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     Copyright (c)  2013, Mozilla
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+
+#ifndef WELS_LOGGING_H__
+#define WELS_LOGGING_H__
+
+// API surface.
+void WelsStderrSetTraceLevel (int32_t level);
+
+
+// Internal details.
+int32_t welsStderrLevelTrace (int32_t level, const char* format, va_list ap);
+
+template<int level> int32_t welsStderrTrace (
+  const char* format, ...) {
+  va_list ap;
+  va_start (ap, format);
+  welsStderrLevelTrace (level, format, ap);
+  va_end (ap);
+  return 0;
+}
+
+#endif
--- /dev/null
+++ b/codec/common/inc/ls_defines.h
@@ -1,0 +1,93 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef ___LD_ST_MACROS___
+#define ___LD_ST_MACROS___
+
+#include <string.h>
+#include "typedefs.h"
+
+#ifdef __GNUC__
+
+struct tagUnaligned_64 {
+  uint64_t l;
+} __attribute__ ((packed));
+struct tagUnaligned_32 {
+  uint32_t l;
+} __attribute__ ((packed));
+struct tagUnaligned_16 {
+  uint16_t l;
+} __attribute__ ((packed));
+
+#define LD16(a) (((struct tagUnaligned_16 *) (a))->l)
+#define LD32(a) (((struct tagUnaligned_32 *) (a))->l)
+#define LD64(a) (((struct tagUnaligned_64 *) (a))->l)
+//#define _USE_STRUCT_INT_CVT
+//	#ifdef _USE_STRUCT_INT_CVT
+#define ST16(a, b) (((struct tagUnaligned_16 *) (a))->l) = (b)
+#define ST32(a, b) (((struct tagUnaligned_32 *) (a))->l) = (b)
+#define ST64(a, b) (((struct tagUnaligned_64 *) (a))->l) = (b)
+//	#else
+//		inline void __ST16(void *dst, uint16_t v) { memcpy(dst, &v, 2); }
+//		inline void __ST32(void *dst, uint32_t v) { memcpy(dst, &v, 4); }
+//inline void __ST64(void *dst, uint64_t v) { memcpy(dst, &v, 8); }
+//	#endif
+
+#else
+
+//#define INTD16(a) (*((int16_t*)(a)))
+//#define INTD32(a) (*((int32_t*)(a)))
+//#define INTD64(a) (*((int64_t*)(a)))
+
+#define LD16(a) (*((uint16_t*)(a)))
+#define LD32(a) (*((uint32_t*)(a)))
+#define LD64(a) (*((uint64_t*)(a)))
+
+#define ST16(a, b) *((uint16_t*)(a)) = (b)
+#define ST32(a, b) *((uint32_t*)(a)) = (b)
+#define ST64(a, b) *((uint64_t*)(a)) = (b)
+
+#endif /* !__GNUC__ */
+
+#ifndef INTD16
+#define INTD16	LD16
+#endif//INTD16
+
+#ifndef INTD32
+#define INTD32	LD32
+#endif//INTD32
+
+#ifndef INTD64
+#define INTD64	LD64
+#endif//INTD64
+
+#endif//___LD_ST_MACROS___
--- /dev/null
+++ b/codec/common/inc/macros.h
@@ -1,0 +1,274 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	macros.h
+ *
+ * \brief	MACRO based tool utilization
+ *
+ * \date	3/13/2009 Created
+ *
+ *************************************************************************************
+ */
+#ifndef WELS_MACRO_UTILIZATIONS_H__
+#define WELS_MACRO_UTILIZATIONS_H__
+
+#include <math.h>
+#include <assert.h>
+#include "typedefs.h"
+
+/*
+* ENFORCE_STACK_ALIGN_1D: force 1 dimension local data aligned in stack
+* _tp: type
+* _nm: var name
+* _sz: size
+* _al: align bytes
+* auxiliary var: _nm ## _tEmP
+*/
+#define ENFORCE_STACK_ALIGN_1D(_tp, _nm, _sz, _al) \
+	_tp _nm ## _tEmP[(_sz)+(_al)-1]; \
+	_tp *_nm = _nm ## _tEmP + ((_al)-1) - (((uintptr_t)(_nm ## _tEmP + ((_al)-1)) & ((_al)-1))/sizeof(_tp));
+
+
+#define ENFORCE_STACK_ALIGN_2D(_tp, _nm, _cx, _cy, _al) \
+	assert( ((_al) && !((_al) & ((_al) - 1))) && ((_al) >= sizeof(_tp)) ); /*_al should be power-of-2 and >= sizeof(_tp)*/\
+	_tp _nm ## _tEmP[(_cx)*(_cy)+(_al)/sizeof(_tp)-1]; \
+	_tp *_nm ## _tEmP_al = _nm ## _tEmP + ((_al)/sizeof(_tp)-1); \
+	_nm ## _tEmP_al -= (((uintptr_t)_nm ## _tEmP_al & ((_al)-1))/sizeof(_tp)); \
+	_tp (*_nm)[(_cy)] = (_tp (*)[(_cy)])_nm ## _tEmP_al;
+
+
+#if defined(_MSC_VER)
+
+#if(_MSC_VER < 1700)
+#define inline	__inline
+#endif
+
+#define ALIGNED_DECLARE( type, var, n ) __declspec(align(n)) type var
+#define __align16(t,v) __declspec(align(16)) t v
+#define ALIGNED_DECLARE_MATRIX_1D(name,size,type,alignment) \
+	__declspec(align(alignment)) type name[(size)]
+#define ALIGNED_DECLARE_MATRIX_2D(name,sizex,sizey,type,alignment) \
+__declspec(align(alignment)) type name[(sizex)*(sizey)]
+
+#elif defined(__GNUC__)
+
+#define ALIGNED_DECLARE( type, var, n ) type var __attribute__((aligned(n)))
+#define __align16(t,v) t v __attribute__ ((aligned (16)))
+#define ALIGNED_DECLARE_MATRIX_1D(name,size,type,alignment) \
+	type name[size] __attribute__((aligned(alignment)))
+#define ALIGNED_DECLARE_MATRIX_2D(name,sizex,sizey,type,alignment) \
+	type name[(sizex)*(sizey)] __attribute__((aligned(alignment)))
+#endif//_MSC_VER
+
+
+#ifndef	WELS_ALIGN
+#define WELS_ALIGN(x, n)	(((x)+(n)-1)&~((n)-1))
+#endif//WELS_ALIGN
+
+
+#if 1 // Alternative implementation of WELS_MAX and WELS_MIN
+#ifndef WELS_MAX
+#define WELS_MAX(x, y)	((x) > (y) ? (x) : (y))
+#endif//WELS_MAX
+
+#ifndef WELS_MIN
+#define WELS_MIN(x, y)	((x) < (y) ? (x) : (y))
+#endif//WELS_MIN
+#else // Alternative implementation of WELS_MAX and WELS_MIN
+#ifndef WELS_MAX
+#define WELS_MAX(x, y)	((x) - (((x)-(y))&(((x)-(y))>>31)))
+#endif//WELS_MAX
+
+#ifndef WELS_MIN
+#define WELS_MIN(x, y)	((y) + (((x)-(y))&(((x)-(y))>>31)))
+#endif//WELS_MIN
+#endif // Alternative implementation of WELS_MAX and WELS_MIN
+
+
+#ifndef WELS_CEIL
+#define WELS_CEIL(x)	ceil(x)	// FIXME: low complexity instead of math library used
+#endif//WELS_CEIL
+
+#ifndef WELS_FLOOR
+#define WELS_FLOOR(x)	floor(x)	// FIXME: low complexity instead of math library used
+#endif//WELS_FLOOR
+
+#ifndef WELS_ROUND
+#define WELS_ROUND(x)	((int32_t)(0.5f+(x)))
+#endif//WELS_ROUND
+
+#define WELS_NON_ZERO_COUNT_AVERAGE(nC,nA,nB) {		\
+    nC = nA + nB + 1;                      \
+	nC >>= (uint8_t)( nA != -1 && nB != -1);        \
+	nC += (uint8_t)(nA == -1 && nB == -1);           \
+}
+
+static inline int32_t CeilLog2 (int32_t i) {
+int32_t s = 0;
+i--;
+while (i > 0) {
+  s++;
+  i >>= 1;
+}
+return s;
+}
+/*
+the second path will degrades the performance
+*/
+#if 1
+static inline int32_t WelsMedian (int32_t iX,  int32_t iY, int32_t iZ) {
+int32_t iMin = iX, iMax = iX;
+
+if (iY < iMin)
+  iMin	= iY;
+else
+  iMax = iY;
+
+if (iZ < iMin)
+  iMin	= iZ;
+else if (iZ > iMax)
+  iMax	= iZ;
+
+return (iX + iY + iZ) - (iMin + iMax);
+}
+#else
+static inline int32_t WelsMedian (int32_t iX,  int32_t iY, int32_t iZ) {
+int32_t iTmp = (iX - iY) & ((iX - iY) >> 31);
+iX -= iTmp;
+iY += iTmp;
+iY -= (iY - iZ) & ((iY - iZ) >> 31);
+iY += (iX - iY) & ((iX - iY) >> 31);
+return iY;
+}
+
+#endif
+
+#ifndef NEG_NUM
+//#define NEG_NUM( num ) (-num)
+#define NEG_NUM(iX) (1+(~(iX)))
+#endif// NEG_NUM
+
+static inline uint8_t WelsClip1(int32_t iX) {
+  uint8_t uiTmp = (uint8_t)(((iX) & ~255) ? (-(iX) >> 31) : (iX));
+  return uiTmp;
+}
+
+#ifndef WELS_SIGN
+#define WELS_SIGN(iX) ((int32_t)(iX) >> 31)
+#endif //WELS_SIGN
+#ifndef WELS_ABS
+#define WELS_ABS(iX) ((WELS_SIGN(iX) ^ (int32_t)(iX)) - WELS_SIGN(iX))
+#endif //WELS_ABS
+
+// WELS_CLIP3
+#ifndef WELS_CLIP3
+#define WELS_CLIP3(iX, iY, iZ) ((iX) < (iY) ? (iY) : ((iX) > (iZ) ? (iZ) : (iX)))
+#endif //WELS_CLIP3
+
+/*
+ * Description: to check variable validation and return the specified result
+ *	iResult:	value to be checked
+ *	iExpected:	the expected value
+ */
+#ifndef WELS_VERIFY_RETURN_IFNEQ
+#define WELS_VERIFY_RETURN_IFNEQ(iResult, iExpected) \
+	if ( iResult != iExpected ){ \
+		return iResult; \
+	}
+#endif//#if WELS_VERIFY_RETURN_IF
+
+/*
+ * Description: to check variable validation and return the specified result
+ *	iResult:	value to be return
+ *	bCaseIf:	negative condition to be verified
+ */
+#ifndef WELS_VERIFY_RETURN_IF
+#define WELS_VERIFY_RETURN_IF(iResult, bCaseIf) \
+	if ( bCaseIf ){ \
+		return iResult; \
+	}
+#endif//#if WELS_VERIFY_RETURN_IF
+
+/*
+ *	Description: to check variable validation and return the specified result
+ *		with correspoinding process advance.
+ *	 result:	value to be return
+ *	 case_if:	negative condition to be verified
+ *	 proc:		process need perform
+ */
+#ifndef WELS_VERIFY_RETURN_PROC_IF
+#define WELS_VERIFY_RETURN_PROC_IF(iResult, bCaseIf, fProc) \
+	if ( bCaseIf ){ \
+		fProc;	\
+		return iResult;	\
+	}
+#endif//#if WELS_VERIFY_RETURN_PROC_IF
+
+static inline int32_t WELS_LOG2 (uint32_t v) {
+int32_t r = 0;
+while (v >>= 1) {
+  ++r;
+}
+return r;
+
+}
+
+#define CLIP3_QP_0_51(q)		WELS_CLIP3(q, 0, 51)	// ((q) < (0) ? (0) : ((q) > (51) ? (51) : (q)))
+#define   CALC_BI_STRIDE(width,bitcount)  ((((width * bitcount) + 31) & ~31) >> 3)
+
+
+
+
+#ifndef BUTTERFLY1x2
+#define BUTTERFLY1x2(b) (((b)<<8) | (b))
+#endif//BUTTERFLY1x2
+
+#ifndef BUTTERFLY2x4
+#define BUTTERFLY2x4(wd) (((uint32_t)(wd)<<16) |(wd))
+#endif//BUTTERFLY2x4
+
+#ifndef BUTTERFLY4x8
+#define BUTTERFLY4x8(dw) (((uint64_t)(dw)<<32) | (dw))
+#endif//BUTTERFLY4x8
+
+static inline bool WELS_POWER2_IF (uint32_t v) {
+return (v && ! (v & (v - 1)));
+}
+
+#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ > 4)
+#define WELS_GCC_UNUSED  __attribute__((__unused__))
+#else
+#define WELS_GCC_UNUSED
+#endif
+
+
+
+#endif//WELS_MACRO_UTILIZATIONS_H__
--- /dev/null
+++ b/codec/common/inc/mc_common.h
@@ -1,0 +1,161 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#ifndef MC_COMMON_H
+#define MC_COMMON_H
+
+#include "typedefs.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif//__cplusplus
+
+#if defined(HAVE_NEON)
+void McCopyWidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+
+void McCopyWidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+
+void McCopyWidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+
+void McChromaWidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t* pWeights, int32_t iHeight);
+
+void McChromaWidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t* pWeights, int32_t iHeight);
+
+void PixelAvgWidthEq16_neon(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight);
+void PixelAvgWidthEq8_neon(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight);
+void PixelAvgWidthEq4_neon(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight);
+
+void McHorVer01WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer01WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer01WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer03WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer03WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer03WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+
+void McHorVer10WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer10WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer10WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer30WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer30WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer30WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+
+    //horizontal filter to gain half sample, that is (2, 0) location in quarter sample
+void McHorVer20WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer20WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer20WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+
+    //vertical filter to gain half sample, that is (0, 2) location in quarter sample
+void McHorVer02WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer02WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer02WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+
+    //horizontal and vertical filter to gain half sample, that is (2, 2) location in quarter sample
+void McHorVer22WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer22WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+void McHorVer22WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
+
+void PixStrideAvgWidthEq16_neon(uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcStrideA, const uint8_t* pSrcB, int32_t iSrcStrideB, int32_t iHeight);
+void PixStrideAvgWidthEq8_neon(uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcStrideA, const uint8_t* pSrcB, int32_t iSrcStrideB, int32_t iHeight);
+
+void McHorVer20Width17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// width+1
+void McHorVer20Width9_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// width+1
+
+void McHorVer02Height17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// height+1
+void McHorVer02Height9_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// height+1
+
+void McHorVer22Width17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);//width+1&&height+1
+void McHorVer22Width9_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);//width+1&&height+1
+#endif
+
+#if defined(X86_ASM)
+//***************************************************************************//
+//                       MMXEXT definition                                   //
+//***************************************************************************//
+void McHorVer20WidthEq4_mmx (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                             int32_t iHeight);
+void McChromaWidthEq4_mmx (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                           const uint8_t* kpABCD, int32_t iHeight);
+void McCopyWidthEq4_mmx (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                         int32_t iHeight);
+void McCopyWidthEq8_mmx (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                         int32_t iHeight);
+void PixelAvgWidthEq4_mmx (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
+                           const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
+void PixelAvgWidthEq8_mmx (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
+                           const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
+
+//***************************************************************************//
+//                       SSE2 definition                                     //
+//***************************************************************************//
+void McChromaWidthEq8_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                            const uint8_t* kpABCD, int32_t iHeight);
+void McCopyWidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                           int32_t iHeight);
+void McHorVer20WidthEq8_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iHeight);
+void McHorVer20WidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                               int32_t iHeight);
+void McHorVer02WidthEq8_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                              int32_t iHeight);
+void McHorVer22Width8HorFirst_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                                    int32_t iHeight);
+void McHorVer22Width8VerLastAlign_sse2 (const uint8_t* pTap, int32_t iTapStride, uint8_t* pDst, int32_t iDstStride,
+                                        int32_t iWidth, int32_t iHeight);
+void McHorVer22Width8VerLastUnAlign_sse2 (const uint8_t* pTap, int32_t iTapStride, uint8_t* pDst, int32_t iDstStride,
+                                         int32_t iWidth, int32_t iHeight);
+
+void PixelAvgWidthEq16_sse2 (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
+                             const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
+
+void McHorVer20Width9Or17_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+                                int32_t iHeight);
+
+void McHorVer02Height9Or17_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+                                 int32_t iHeight);
+
+void McHorVer22HorFirst_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pTap, int32_t iTapStride, int32_t iWidth,
+                              int32_t iHeight);
+
+//***************************************************************************//
+//                       SSSE3 definition                                    //
+//***************************************************************************//
+
+void McChromaWidthEq8_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
+                             const uint8_t* kpABCD, int32_t iHeight);
+
+#endif //X86_ASM
+
+#if defined(__cplusplus)
+}
+#endif//__cplusplus
+
+#endif//MC_COMMON_H
--- /dev/null
+++ b/codec/common/inc/measure_time.h
@@ -1,0 +1,89 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	measure_time.h
+ *
+ * \brief	time cost measure utilization
+ *
+ * \date	04/28/2009 Created
+ *
+ *************************************************************************************
+ */
+#ifndef WELS_TIME_COST_MEASURE_UTIL_H__
+#define WELS_TIME_COST_MEASURE_UTIL_H__
+
+#include <stdlib.h>
+
+#include "typedefs.h"
+#ifndef _WIN32
+#include <sys/time.h>
+#else
+#include <windows.h>
+#include <sys/timeb.h>
+#endif
+#include <time.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif//__cplusplus
+
+/*!
+ * \brief	time cost measure utilization
+ * \param	void
+ * \return	time elapsed since run (unit: microsecond)
+ */
+
+static inline int64_t WelsTime (void) {
+#ifndef _WIN32
+  struct timeval tv_date;
+
+  gettimeofday (&tv_date, NULL);
+  return ((int64_t) tv_date.tv_sec * 1000000 + (int64_t) tv_date.tv_usec);
+#else
+  static int64_t iMtimeFreq = 0;
+  int64_t iMtimeCur = 0;
+  int64_t iResult = 0;
+  if (!iMtimeFreq) {
+    QueryPerformanceFrequency ((LARGE_INTEGER*)&iMtimeFreq);
+    if (!iMtimeFreq)
+      iMtimeFreq = 1;
+  }
+  QueryPerformanceCounter ((LARGE_INTEGER*)&iMtimeCur);
+  iResult = (int64_t) ((double)iMtimeCur * 1e6 / (double)iMtimeFreq + 0.5);
+  return iResult;
+#endif//WIN32
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif//WELS_TIME_COST_MEASURE_UTIL_H__
--- /dev/null
+++ b/codec/common/inc/typedefs.h
@@ -1,0 +1,74 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+// typedef.h
+#ifndef WELS_TYPE_DEFINES_H__
+#define WELS_TYPE_DEFINES_H__
+
+#include <limits.h>
+#include <stddef.h>
+
+////////////////////////////////////////////////////////////////////////////
+// NOTICE : ALL internal implement MUST use the data type defined as below
+//          ONLY except with the interface file !!!!!
+////////////////////////////////////////////////////////////////////////////
+
+#ifndef  _MSC_VER
+
+#define __STDC_FORMAT_MACROS
+#include <stdint.h>
+#include <inttypes.h>
+
+#else
+
+// FIXME:     all singed type should be declared explicit,  for example,  int8_t should be declared as signed char.
+typedef signed char      int8_t  ;
+typedef unsigned char    uint8_t ;
+typedef short            int16_t ;
+typedef unsigned short   uint16_t;
+typedef int              int32_t ;
+typedef unsigned int     uint32_t;
+typedef __int64          int64_t ;
+typedef unsigned __int64 uint64_t;
+#define PRId64 "I64d"
+
+#endif // _MSC_VER defined
+
+// The 'float' type is portable and usable without any need for any extra typedefs.
+
+#ifdef EPSN
+#undef EPSN
+#endif//EPSN
+#define EPSN	  (0.000001f) // (1e-6)	// desired float precision
+
+#endif //WELS_TYPE_DEFINES_H__
+
--- a/codec/common/logging.cpp
+++ /dev/null
@@ -1,49 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2013, Cisco Systems
- *     Copyright (c)  2013, Mozilla
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include <stdarg.h>
-#include <stdio.h>
-#include "typedefs.h"
-
-static int32_t g_TraceLevel = 0;
-
-void WelsStderrSetTraceLevel (int32_t level) {
-  g_TraceLevel = level;
-}
-
-int32_t welsStderrLevelTrace (int32_t level, const char* format, va_list ap) {
-  if (level < g_TraceLevel) {
-    vfprintf (stderr, format, ap);
-  }
-  return 0;
-}
--- a/codec/common/logging.h
+++ /dev/null
@@ -1,54 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2013, Cisco Systems
- *     Copyright (c)  2013, Mozilla
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-
-#ifndef WELS_LOGGING_H__
-#define WELS_LOGGING_H__
-
-// API surface.
-void WelsStderrSetTraceLevel (int32_t level);
-
-
-// Internal details.
-int32_t welsStderrLevelTrace (int32_t level, const char* format, va_list ap);
-
-template<int level> int32_t welsStderrTrace (
-  const char* format, ...) {
-  va_list ap;
-  va_start (ap, format);
-  welsStderrLevelTrace (level, format, ap);
-  va_end (ap);
-  return 0;
-}
-
-#endif
--- a/codec/common/ls_defines.h
+++ /dev/null
@@ -1,93 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#ifndef ___LD_ST_MACROS___
-#define ___LD_ST_MACROS___
-
-#include <string.h>
-#include "typedefs.h"
-
-#ifdef __GNUC__
-
-struct tagUnaligned_64 {
-  uint64_t l;
-} __attribute__ ((packed));
-struct tagUnaligned_32 {
-  uint32_t l;
-} __attribute__ ((packed));
-struct tagUnaligned_16 {
-  uint16_t l;
-} __attribute__ ((packed));
-
-#define LD16(a) (((struct tagUnaligned_16 *) (a))->l)
-#define LD32(a) (((struct tagUnaligned_32 *) (a))->l)
-#define LD64(a) (((struct tagUnaligned_64 *) (a))->l)
-//#define _USE_STRUCT_INT_CVT
-//	#ifdef _USE_STRUCT_INT_CVT
-#define ST16(a, b) (((struct tagUnaligned_16 *) (a))->l) = (b)
-#define ST32(a, b) (((struct tagUnaligned_32 *) (a))->l) = (b)
-#define ST64(a, b) (((struct tagUnaligned_64 *) (a))->l) = (b)
-//	#else
-//		inline void __ST16(void *dst, uint16_t v) { memcpy(dst, &v, 2); }
-//		inline void __ST32(void *dst, uint32_t v) { memcpy(dst, &v, 4); }
-//inline void __ST64(void *dst, uint64_t v) { memcpy(dst, &v, 8); }
-//	#endif
-
-#else
-
-//#define INTD16(a) (*((int16_t*)(a)))
-//#define INTD32(a) (*((int32_t*)(a)))
-//#define INTD64(a) (*((int64_t*)(a)))
-
-#define LD16(a) (*((uint16_t*)(a)))
-#define LD32(a) (*((uint32_t*)(a)))
-#define LD64(a) (*((uint64_t*)(a)))
-
-#define ST16(a, b) *((uint16_t*)(a)) = (b)
-#define ST32(a, b) *((uint32_t*)(a)) = (b)
-#define ST64(a, b) *((uint64_t*)(a)) = (b)
-
-#endif /* !__GNUC__ */
-
-#ifndef INTD16
-#define INTD16	LD16
-#endif//INTD16
-
-#ifndef INTD32
-#define INTD32	LD32
-#endif//INTD32
-
-#ifndef INTD64
-#define INTD64	LD64
-#endif//INTD64
-
-#endif//___LD_ST_MACROS___
--- a/codec/common/macros.h
+++ /dev/null
@@ -1,274 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2009-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * \file	macros.h
- *
- * \brief	MACRO based tool utilization
- *
- * \date	3/13/2009 Created
- *
- *************************************************************************************
- */
-#ifndef WELS_MACRO_UTILIZATIONS_H__
-#define WELS_MACRO_UTILIZATIONS_H__
-
-#include <math.h>
-#include <assert.h>
-#include "typedefs.h"
-
-/*
-* ENFORCE_STACK_ALIGN_1D: force 1 dimension local data aligned in stack
-* _tp: type
-* _nm: var name
-* _sz: size
-* _al: align bytes
-* auxiliary var: _nm ## _tEmP
-*/
-#define ENFORCE_STACK_ALIGN_1D(_tp, _nm, _sz, _al) \
-	_tp _nm ## _tEmP[(_sz)+(_al)-1]; \
-	_tp *_nm = _nm ## _tEmP + ((_al)-1) - (((uintptr_t)(_nm ## _tEmP + ((_al)-1)) & ((_al)-1))/sizeof(_tp));
-
-
-#define ENFORCE_STACK_ALIGN_2D(_tp, _nm, _cx, _cy, _al) \
-	assert( ((_al) && !((_al) & ((_al) - 1))) && ((_al) >= sizeof(_tp)) ); /*_al should be power-of-2 and >= sizeof(_tp)*/\
-	_tp _nm ## _tEmP[(_cx)*(_cy)+(_al)/sizeof(_tp)-1]; \
-	_tp *_nm ## _tEmP_al = _nm ## _tEmP + ((_al)/sizeof(_tp)-1); \
-	_nm ## _tEmP_al -= (((uintptr_t)_nm ## _tEmP_al & ((_al)-1))/sizeof(_tp)); \
-	_tp (*_nm)[(_cy)] = (_tp (*)[(_cy)])_nm ## _tEmP_al;
-
-
-#if defined(_MSC_VER)
-
-#if(_MSC_VER < 1700)
-#define inline	__inline
-#endif
-
-#define ALIGNED_DECLARE( type, var, n ) __declspec(align(n)) type var
-#define __align16(t,v) __declspec(align(16)) t v
-#define ALIGNED_DECLARE_MATRIX_1D(name,size,type,alignment) \
-	__declspec(align(alignment)) type name[(size)]
-#define ALIGNED_DECLARE_MATRIX_2D(name,sizex,sizey,type,alignment) \
-__declspec(align(alignment)) type name[(sizex)*(sizey)]
-
-#elif defined(__GNUC__)
-
-#define ALIGNED_DECLARE( type, var, n ) type var __attribute__((aligned(n)))
-#define __align16(t,v) t v __attribute__ ((aligned (16)))
-#define ALIGNED_DECLARE_MATRIX_1D(name,size,type,alignment) \
-	type name[size] __attribute__((aligned(alignment)))
-#define ALIGNED_DECLARE_MATRIX_2D(name,sizex,sizey,type,alignment) \
-	type name[(sizex)*(sizey)] __attribute__((aligned(alignment)))
-#endif//_MSC_VER
-
-
-#ifndef	WELS_ALIGN
-#define WELS_ALIGN(x, n)	(((x)+(n)-1)&~((n)-1))
-#endif//WELS_ALIGN
-
-
-#if 1 // Alternative implementation of WELS_MAX and WELS_MIN
-#ifndef WELS_MAX
-#define WELS_MAX(x, y)	((x) > (y) ? (x) : (y))
-#endif//WELS_MAX
-
-#ifndef WELS_MIN
-#define WELS_MIN(x, y)	((x) < (y) ? (x) : (y))
-#endif//WELS_MIN
-#else // Alternative implementation of WELS_MAX and WELS_MIN
-#ifndef WELS_MAX
-#define WELS_MAX(x, y)	((x) - (((x)-(y))&(((x)-(y))>>31)))
-#endif//WELS_MAX
-
-#ifndef WELS_MIN
-#define WELS_MIN(x, y)	((y) + (((x)-(y))&(((x)-(y))>>31)))
-#endif//WELS_MIN
-#endif // Alternative implementation of WELS_MAX and WELS_MIN
-
-
-#ifndef WELS_CEIL
-#define WELS_CEIL(x)	ceil(x)	// FIXME: low complexity instead of math library used
-#endif//WELS_CEIL
-
-#ifndef WELS_FLOOR
-#define WELS_FLOOR(x)	floor(x)	// FIXME: low complexity instead of math library used
-#endif//WELS_FLOOR
-
-#ifndef WELS_ROUND
-#define WELS_ROUND(x)	((int32_t)(0.5f+(x)))
-#endif//WELS_ROUND
-
-#define WELS_NON_ZERO_COUNT_AVERAGE(nC,nA,nB) {		\
-    nC = nA + nB + 1;                      \
-	nC >>= (uint8_t)( nA != -1 && nB != -1);        \
-	nC += (uint8_t)(nA == -1 && nB == -1);           \
-}
-
-static inline int32_t CeilLog2 (int32_t i) {
-int32_t s = 0;
-i--;
-while (i > 0) {
-  s++;
-  i >>= 1;
-}
-return s;
-}
-/*
-the second path will degrades the performance
-*/
-#if 1
-static inline int32_t WelsMedian (int32_t iX,  int32_t iY, int32_t iZ) {
-int32_t iMin = iX, iMax = iX;
-
-if (iY < iMin)
-  iMin	= iY;
-else
-  iMax = iY;
-
-if (iZ < iMin)
-  iMin	= iZ;
-else if (iZ > iMax)
-  iMax	= iZ;
-
-return (iX + iY + iZ) - (iMin + iMax);
-}
-#else
-static inline int32_t WelsMedian (int32_t iX,  int32_t iY, int32_t iZ) {
-int32_t iTmp = (iX - iY) & ((iX - iY) >> 31);
-iX -= iTmp;
-iY += iTmp;
-iY -= (iY - iZ) & ((iY - iZ) >> 31);
-iY += (iX - iY) & ((iX - iY) >> 31);
-return iY;
-}
-
-#endif
-
-#ifndef NEG_NUM
-//#define NEG_NUM( num ) (-num)
-#define NEG_NUM(iX) (1+(~(iX)))
-#endif// NEG_NUM
-
-static inline uint8_t WelsClip1(int32_t iX) {
-  uint8_t uiTmp = (uint8_t)(((iX) & ~255) ? (-(iX) >> 31) : (iX));
-  return uiTmp;
-}
-
-#ifndef WELS_SIGN
-#define WELS_SIGN(iX) ((int32_t)(iX) >> 31)
-#endif //WELS_SIGN
-#ifndef WELS_ABS
-#define WELS_ABS(iX) ((WELS_SIGN(iX) ^ (int32_t)(iX)) - WELS_SIGN(iX))
-#endif //WELS_ABS
-
-// WELS_CLIP3
-#ifndef WELS_CLIP3
-#define WELS_CLIP3(iX, iY, iZ) ((iX) < (iY) ? (iY) : ((iX) > (iZ) ? (iZ) : (iX)))
-#endif //WELS_CLIP3
-
-/*
- * Description: to check variable validation and return the specified result
- *	iResult:	value to be checked
- *	iExpected:	the expected value
- */
-#ifndef WELS_VERIFY_RETURN_IFNEQ
-#define WELS_VERIFY_RETURN_IFNEQ(iResult, iExpected) \
-	if ( iResult != iExpected ){ \
-		return iResult; \
-	}
-#endif//#if WELS_VERIFY_RETURN_IF
-
-/*
- * Description: to check variable validation and return the specified result
- *	iResult:	value to be return
- *	bCaseIf:	negative condition to be verified
- */
-#ifndef WELS_VERIFY_RETURN_IF
-#define WELS_VERIFY_RETURN_IF(iResult, bCaseIf) \
-	if ( bCaseIf ){ \
-		return iResult; \
-	}
-#endif//#if WELS_VERIFY_RETURN_IF
-
-/*
- *	Description: to check variable validation and return the specified result
- *		with correspoinding process advance.
- *	 result:	value to be return
- *	 case_if:	negative condition to be verified
- *	 proc:		process need perform
- */
-#ifndef WELS_VERIFY_RETURN_PROC_IF
-#define WELS_VERIFY_RETURN_PROC_IF(iResult, bCaseIf, fProc) \
-	if ( bCaseIf ){ \
-		fProc;	\
-		return iResult;	\
-	}
-#endif//#if WELS_VERIFY_RETURN_PROC_IF
-
-static inline int32_t WELS_LOG2 (uint32_t v) {
-int32_t r = 0;
-while (v >>= 1) {
-  ++r;
-}
-return r;
-
-}
-
-#define CLIP3_QP_0_51(q)		WELS_CLIP3(q, 0, 51)	// ((q) < (0) ? (0) : ((q) > (51) ? (51) : (q)))
-#define   CALC_BI_STRIDE(width,bitcount)  ((((width * bitcount) + 31) & ~31) >> 3)
-
-
-
-
-#ifndef BUTTERFLY1x2
-#define BUTTERFLY1x2(b) (((b)<<8) | (b))
-#endif//BUTTERFLY1x2
-
-#ifndef BUTTERFLY2x4
-#define BUTTERFLY2x4(wd) (((uint32_t)(wd)<<16) |(wd))
-#endif//BUTTERFLY2x4
-
-#ifndef BUTTERFLY4x8
-#define BUTTERFLY4x8(dw) (((uint64_t)(dw)<<32) | (dw))
-#endif//BUTTERFLY4x8
-
-static inline bool WELS_POWER2_IF (uint32_t v) {
-return (v && ! (v & (v - 1)));
-}
-
-#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ > 4)
-#define WELS_GCC_UNUSED  __attribute__((__unused__))
-#else
-#define WELS_GCC_UNUSED
-#endif
-
-
-
-#endif//WELS_MACRO_UTILIZATIONS_H__
--- a/codec/common/mb_copy.asm
+++ /dev/null
@@ -1,581 +1,0 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  mb_copy.asm
-;*
-;*  Abstract
-;*      mb_copy and mb_copy1
-;*
-;*  History
-;*      15/09/2009 Created
-;*		12/28/2009 Modified with larger throughput
-;*		12/29/2011 Tuned WelsCopy16x16NotAligned_sse2, added UpdateMbMv_sse2 WelsCopy16x8NotAligned_sse2,
-;*				   WelsCopy16x8_mmx, WelsCopy8x16_mmx etc;
-;*
-;*
-;*********************************************************************************************/
-%include "asm_inc.asm"
-
-;***********************************************************************
-; Macros and other preprocessor constants
-;***********************************************************************
-
-;***********************************************************************
-; Code
-;***********************************************************************
-
-SECTION .text
-
-
-;***********************************************************************
-; void WelsCopy16x16_sse2(	uint8_t* Dst,
-;							int32_t  iStrideD,
-;							uint8_t* Src,
-;							int32_t  iStrideS )
-;***********************************************************************
-WELS_EXTERN WelsCopy16x16_sse2
-
-	push r4
-	push r5
-	%assign  push_num 2
-    LOAD_4_PARA
-    PUSH_XMM 8
-
-	lea r4, [r1+2*r1]	;ebx, [eax+2*eax]	; x3
-	lea r5, [r3+2*r3]	;edx, [ecx+2*ecx]	; x3
-
-	movdqa xmm0, [r2]
-	movdqa xmm1, [r2+r3]
-	movdqa xmm2, [r2+2*r3]
-	movdqa xmm3, [r2+r5]
-	lea r2, [r2+4*r3]
-	movdqa xmm4, [r2]
-	movdqa xmm5, [r2+r3]
-	movdqa xmm6, [r2+2*r3]
-	movdqa xmm7, [r2+r5]
-	lea r2, [r2+4*r3]
-
-	movdqa [r0], xmm0
-	movdqa [r0+r1], xmm1
-	movdqa [r0+2*r1], xmm2
-	movdqa [r0+r4], xmm3
-	lea r0, [r0+4*r1]
-	movdqa [r0], xmm4
-	movdqa [r0+r1], xmm5
-	movdqa [r0+2*r1], xmm6
-	movdqa [r0+r4], xmm7
-	lea r0, [r0+4*r1]
-
-	movdqa xmm0, [r2]
-	movdqa xmm1, [r2+r3]
-	movdqa xmm2, [r2+2*r3]
-	movdqa xmm3, [r2+r5]
-	lea r2, [r2+4*r3]
-	movdqa xmm4, [r2]
-	movdqa xmm5, [r2+r3]
-	movdqa xmm6, [r2+2*r3]
-	movdqa xmm7, [r2+r5]
-
-	movdqa [r0], xmm0
-	movdqa [r0+r1], xmm1
-	movdqa [r0+2*r1], xmm2
-	movdqa [r0+r4], xmm3
-	lea r0, [r0+4*r1]
-	movdqa [r0], xmm4
-	movdqa [r0+r1], xmm5
-	movdqa [r0+2*r1], xmm6
-	movdqa [r0+r4], xmm7
-	POP_XMM
-	LOAD_4_PARA_POP
-	pop r5
-	pop r4
-	ret
-
-;***********************************************************************
-; void WelsCopy16x16NotAligned_sse2(	uint8_t* Dst,
-;							int32_t  iStrideD,
-;							uint8_t* Src,
-;							int32_t  iStrideS )
-;***********************************************************************
-; dst can be align with 16 bytes, but not sure about pSrc, 12/29/2011
-WELS_EXTERN WelsCopy16x16NotAligned_sse2
-	push r4
-	push r5
-	%assign  push_num 2
-    LOAD_4_PARA
-    PUSH_XMM 8
-
-	lea r4, [r1+2*r1]	;ebx, [eax+2*eax]	; x3
-	lea r5, [r3+2*r3]	;edx, [ecx+2*ecx]	; x3
-
-	movdqu xmm0, [r2]
-	movdqu xmm1, [r2+r3]
-	movdqu xmm2, [r2+2*r3]
-	movdqu xmm3, [r2+r5]
-	lea r2, [r2+4*r3]
-	movdqu xmm4, [r2]
-	movdqu xmm5, [r2+r3]
-	movdqu xmm6, [r2+2*r3]
-	movdqu xmm7, [r2+r5]
-	lea r2, [r2+4*r3]
-
-	movdqa [r0], xmm0
-	movdqa [r0+r1], xmm1
-	movdqa [r0+2*r1], xmm2
-	movdqa [r0+r4], xmm3
-	lea r0, [r0+4*r1]
-	movdqa [r0], xmm4
-	movdqa [r0+r1], xmm5
-	movdqa [r0+2*r1], xmm6
-	movdqa [r0+r4], xmm7
-	lea r0, [r0+4*r1]
-
-	movdqu xmm0, [r2]
-	movdqu xmm1, [r2+r3]
-	movdqu xmm2, [r2+2*r3]
-	movdqu xmm3, [r2+r5]
-	lea r2, [r2+4*r3]
-	movdqu xmm4, [r2]
-	movdqu xmm5, [r2+r3]
-	movdqu xmm6, [r2+2*r3]
-	movdqu xmm7, [r2+r5]
-
-	movdqa [r0], xmm0
-	movdqa [r0+r1], xmm1
-	movdqa [r0+2*r1], xmm2
-	movdqa [r0+r4], xmm3
-	lea r0, [r0+4*r1]
-	movdqa [r0], xmm4
-	movdqa [r0+r1], xmm5
-	movdqa [r0+2*r1], xmm6
-	movdqa [r0+r4], xmm7
-	POP_XMM
-	LOAD_4_PARA_POP
-	pop r5
-	pop r4
-	ret
-
-; , 12/29/2011
-;***********************************************************************
-; void WelsCopy16x8NotAligned_sse2(uint8_t* Dst,
-;							int32_t  iStrideD,
-;							uint8_t* Src,
-;							int32_t  iStrideS )
-;***********************************************************************
-WELS_EXTERN WelsCopy16x8NotAligned_sse2
-	push r4
-	push r5
-	%assign  push_num 2
-    LOAD_4_PARA
-    PUSH_XMM 8
-
-	lea r4, [r1+2*r1]	;ebx, [eax+2*eax]	; x3
-	lea r5, [r3+2*r3]	;edx, [ecx+2*ecx]	; x3
-
-	movdqu xmm0, [r2]
-	movdqu xmm1, [r2+r3]
-	movdqu xmm2, [r2+2*r3]
-	movdqu xmm3, [r2+r5]
-	lea r2, [r2+4*r3]
-	movdqu xmm4, [r2]
-	movdqu xmm5, [r2+r3]
-	movdqu xmm6, [r2+2*r3]
-	movdqu xmm7, [r2+r5]
-
-	movdqa [r0], xmm0
-	movdqa [r0+r1], xmm1
-	movdqa [r0+2*r1], xmm2
-	movdqa [r0+r4], xmm3
-	lea r0, [r0+4*r1]
-	movdqa [r0], xmm4
-	movdqa [r0+r1], xmm5
-	movdqa [r0+2*r1], xmm6
-	movdqa [r0+r4], xmm7
-	POP_XMM
-	LOAD_4_PARA_POP
-	pop r5
-	pop r4
-	ret
-
-
-;***********************************************************************
-; void WelsCopy8x16_mmx(uint8_t* Dst,
-;                       int32_t  iStrideD,
-;                       uint8_t* Src,
-;                       int32_t  iStrideS )
-;***********************************************************************
-WELS_EXTERN WelsCopy8x16_mmx
-	%assign  push_num 0
-    LOAD_4_PARA
-
-	movq mm0, [r2]
-	movq mm1, [r2+r3]
-	lea r2, [r2+2*r3]
-	movq mm2, [r2]
-	movq mm3, [r2+r3]
-	lea r2, [r2+2*r3]
-	movq mm4, [r2]
-	movq mm5, [r2+r3]
-	lea r2, [r2+2*r3]
-	movq mm6, [r2]
-	movq mm7, [r2+r3]
-	lea r2, [r2+2*r3]
-
-	movq [r0], mm0
-	movq [r0+r1], mm1
-	lea r0, [r0+2*r1]
-	movq [r0], mm2
-	movq [r0+r1], mm3
-	lea r0, [r0+2*r1]
-	movq [r0], mm4
-	movq [r0+r1], mm5
-	lea r0, [r0+2*r1]
-	movq [r0], mm6
-	movq [r0+r1], mm7
-	lea r0, [r0+2*r1]
-
-	movq mm0, [r2]
-	movq mm1, [r2+r3]
-	lea r2, [r2+2*r3]
-	movq mm2, [r2]
-	movq mm3, [r2+r3]
-	lea r2, [r2+2*r3]
-	movq mm4, [r2]
-	movq mm5, [r2+r3]
-	lea r2, [r2+2*r3]
-	movq mm6, [r2]
-	movq mm7, [r2+r3]
-
-	movq [r0], mm0
-	movq [r0+r1], mm1
-	lea r0, [r0+2*r1]
-	movq [r0], mm2
-	movq [r0+r1], mm3
-	lea r0, [r0+2*r1]
-	movq [r0], mm4
-	movq [r0+r1], mm5
-	lea r0, [r0+2*r1]
-	movq [r0], mm6
-	movq [r0+r1], mm7
-
-	WELSEMMS
-	LOAD_4_PARA_POP
-	ret
-
-;***********************************************************************
-; void WelsCopy8x8_mmx(  uint8_t* Dst,
-;                        int32_t  iStrideD,
-;                        uint8_t* Src,
-;                        int32_t  iStrideS )
-;***********************************************************************
-WELS_EXTERN WelsCopy8x8_mmx
-	push r4
-	%assign  push_num 1
-    LOAD_4_PARA
-	lea r4, [r3+2*r3]	;edx, [ebx+2*ebx]
-
-	; to prefetch next loop
-	prefetchnta [r2+2*r3]
-	prefetchnta [r2+r4]
-	movq mm0, [r2]
-	movq mm1, [r2+r3]
-	lea r2, [r2+2*r3]
-	; to prefetch next loop
-	prefetchnta [r2+2*r3]
-	prefetchnta [r2+r4]
-	movq mm2, [r2]
-	movq mm3, [r2+r3]
-	lea r2, [r2+2*r3]
-	; to prefetch next loop
-	prefetchnta [r2+2*r3]
-	prefetchnta [r2+r4]
-	movq mm4, [r2]
-	movq mm5, [r2+r3]
-	lea r2, [r2+2*r3]
-	movq mm6, [r2]
-	movq mm7, [r2+r3]
-
-	movq [r0], mm0
-	movq [r0+r1], mm1
-	lea r0, [r0+2*r1]
-	movq [r0], mm2
-	movq [r0+r1], mm3
-	lea r0, [r0+2*r1]
-	movq [r0], mm4
-	movq [r0+r1], mm5
-	lea r0, [r0+2*r1]
-	movq [r0], mm6
-	movq [r0+r1], mm7
-
-	WELSEMMS
-	LOAD_4_PARA_POP
-	pop r4
-	ret
-
-; (dunhuang@cisco), 12/21/2011
-;***********************************************************************
-; void UpdateMbMv_sse2( SMVUnitXY *pMvBuffer, const SMVUnitXY sMv )
-;***********************************************************************
-WELS_EXTERN UpdateMbMv_sse2
-
-    %assign  push_num 0
-    LOAD_2_PARA
-
-	movd xmm0, r1d	; _mv
-	pshufd xmm1, xmm0, $00
-	movdqa [r0     ], xmm1
-	movdqa [r0+0x10], xmm1
-	movdqa [r0+0x20], xmm1
-	movdqa [r0+0x30], xmm1
-	ret
-
-;*******************************************************************************
-; Macros and other preprocessor constants
-;*******************************************************************************
-
-;*******************************************************************************
-; Code
-;*******************************************************************************
-
-SECTION .text
-
-
-
-
-;*******************************************************************************
-; void PixelAvgWidthEq4_mmx( uint8_t *pDst,  int iDstStride,
-;                           uint8_t *pSrcA, int iSrcAStride,
-;                           uint8_t *pSrcB, int iSrcBStride,
-;                           int iHeight );
-;*******************************************************************************
-WELS_EXTERN PixelAvgWidthEq4_mmx
-
-    %assign  push_num 0
-    LOAD_7_PARA
-
-	SIGN_EXTENSION	r1, r1d
-	SIGN_EXTENSION	r3, r3d
-	SIGN_EXTENSION	r5, r5d
-	SIGN_EXTENSION	r6, r6d
-
-ALIGN 4
-.height_loop:
-	movd        mm0, [r4]
-    pavgb       mm0, [r2]
-    movd        [r0], mm0
-
-    dec         r6
-    lea         r0, [r0+r1]
-    lea         r2, [r2+r3]
-    lea         r4, [r4+r5]
-    jne         .height_loop
-
-	WELSEMMS
-	LOAD_7_PARA_POP
-    ret
-
-
-;*******************************************************************************
-; void PixelAvgWidthEq8_mmx( uint8_t *pDst,  int iDstStride,
-;                           uint8_t *pSrcA, int iSrcAStride,
-;                           uint8_t *pSrcB, int iSrcBStride,
-;                           int iHeight );
-;*******************************************************************************
-WELS_EXTERN PixelAvgWidthEq8_mmx
-    %assign  push_num 0
-    LOAD_7_PARA
-
-	SIGN_EXTENSION	r1, r1d
-	SIGN_EXTENSION	r3, r3d
-	SIGN_EXTENSION	r5, r5d
-	SIGN_EXTENSION	r6, r6d
-
-ALIGN 4
-.height_loop:
-	movq        mm0, [r2]
-    pavgb       mm0, [r4]
-    movq        [r0], mm0
-    movq        mm0, [r2+r3]
-    pavgb       mm0, [r4+r5]
-    movq		[r0+r1], mm0
-
-    lea			r2,  [r2+2*r3]
-    lea			r4,  [r4+2*r5]
-    lea			r0,  [r0+2*r1]
-
-    sub         r6, 2
-    jnz         .height_loop
-
-	WELSEMMS
-	LOAD_7_PARA_POP
-    ret
-
-
-
-;*******************************************************************************
-; void PixelAvgWidthEq16_sse2( uint8_t *pDst,  int iDstStride,
-;                          uint8_t *pSrcA, int iSrcAStride,
-;                          uint8_t *pSrcB, int iSrcBStride,
-;                          int iHeight );
-;*******************************************************************************
-WELS_EXTERN PixelAvgWidthEq16_sse2
-
-    %assign  push_num 0
-    LOAD_7_PARA
-	SIGN_EXTENSION	r1, r1d
-	SIGN_EXTENSION	r3, r3d
-	SIGN_EXTENSION	r5, r5d
-	SIGN_EXTENSION	r6, r6d
-ALIGN 4
-.height_loop:
-	movdqu      xmm0, [r2]
-	movdqu	    xmm1, [r4]
-	pavgb	    xmm0, xmm1
-	;pavgb       xmm0, [r4]
-    movdqu      [r0], xmm0
-
-	movdqu      xmm0, [r2+r3]
-	movdqu      xmm1, [r4+r5]
-	pavgb	    xmm0, xmm1
-    movdqu      [r0+r1], xmm0
-
-	movdqu      xmm0, [r2+2*r3]
-	movdqu       xmm1, [r4+2*r5]
-	pavgb	    xmm0, xmm1
-    movdqu      [r0+2*r1], xmm0
-
-    lea         r2, [r2+2*r3]
-    lea			r4, [r4+2*r5]
-    lea			r0, [r0+2*r1]
-
-	movdqu      xmm0, [r2+r3]
-	movdqu      xmm1, [r4+r5]
-	pavgb	    xmm0, xmm1
-    movdqu      [r0+r1], xmm0
-
-    lea         r2, [r2+2*r3]
-    lea			r4, [r4+2*r5]
-    lea			r0, [r0+2*r1]
-
-    sub         r6, 4
-    jne         .height_loop
-
-	WELSEMMS
-	LOAD_7_PARA_POP
-    ret
-
-;*******************************************************************************
-;  void McCopyWidthEq4_mmx( uint8_t *pSrc, int iSrcStride,
-;                          uint8_t *pDst, int iDstStride, int iHeight )
-;*******************************************************************************
-WELS_EXTERN McCopyWidthEq4_mmx
-    push	r5
-    %assign  push_num 1
-    LOAD_5_PARA
-
-	SIGN_EXTENSION	r1, r1d
-	SIGN_EXTENSION	r3, r3d
-	SIGN_EXTENSION	r4, r4d
-
-ALIGN 4
-.height_loop:
-	mov r5d, [r0]
-	mov [r2], r5d
-
-	add r0, r1
-	add r2, r3
-	dec r4
-	jnz .height_loop
-	WELSEMMS
-    LOAD_5_PARA_POP
-    pop	   r5
-    ret
-
-;*******************************************************************************
-;   void McCopyWidthEq8_mmx( uint8_t *pSrc, int iSrcStride,
-;                           uint8_t *pDst, int iDstStride, int iHeight )
-;*******************************************************************************
-WELS_EXTERN McCopyWidthEq8_mmx
-    %assign  push_num 0
-    LOAD_5_PARA
-
-	SIGN_EXTENSION	r1, r1d
-	SIGN_EXTENSION	r3, r3d
-	SIGN_EXTENSION	r4, r4d
-
-ALIGN 4
-.height_loop:
-	movq mm0, [r0]
-	movq [r2], mm0
-	add r0, r1
-	add r2, r3
-	dec r4
-	jnz .height_loop
-
-	WELSEMMS
-	LOAD_5_PARA_POP
-    ret
-
-
-;*******************************************************************************
-;   void McCopyWidthEq16_sse2( uint8_t *pSrc, int iSrcStride, uint8_t *pDst, int iDstStride, int iHeight )
-;*******************************************************************************
-;read unaligned memory
-%macro SSE_READ_UNA 2
-	movq	%1, [%2]
-	movhps	%1,	[%2+8]
-%endmacro
-
-;write unaligned memory
-%macro SSE_WRITE_UNA 2
-	movq	[%1],	%2
-	movhps	[%1+8], %2
-%endmacro
-WELS_EXTERN McCopyWidthEq16_sse2
-    %assign  push_num 0
-    LOAD_5_PARA
-	SIGN_EXTENSION	r1, r1d
-	SIGN_EXTENSION	r3, r3d
-	SIGN_EXTENSION	r4, r4d
-ALIGN 4
-.height_loop:
-    SSE_READ_UNA	xmm0, r0
-    SSE_READ_UNA	xmm1, r0+r1
-    SSE_WRITE_UNA	r2, xmm0
-    SSE_WRITE_UNA	r2+r3, xmm1
-
-	sub		r4,	2
-    lea     r0, [r0+r1*2]
-    lea     r2, [r2+r3*2]
-    jnz     .height_loop
-
-	LOAD_5_PARA_POP
-    ret
--- a/codec/common/mc_chroma.asm
+++ /dev/null
@@ -1,293 +1,0 @@
-;*!
-;* \copy
-;*     Copyright (c)  2004-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  mc_chroma.asm
-;*
-;*  Abstract
-;*      mmx motion compensation for chroma
-;*
-;*  History
-;*      10/13/2004 Created
-;*
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-
-;***********************************************************************
-; Local Data (Read Only)
-;***********************************************************************
-
-SECTION .rodata align=16
-
-;***********************************************************************
-; Various memory constants (trigonometric values or rounding values)
-;***********************************************************************
-
-ALIGN 16
-h264_d0x20_sse2:
-	dw 32,32,32,32,32,32,32,32
-ALIGN 16
-h264_d0x20_mmx:
-	dw 32,32,32,32
-
-
-;=============================================================================
-; Code
-;=============================================================================
-
-SECTION .text
-
-;*******************************************************************************
-; void McChromaWidthEq4_mmx( const uint8_t *src,
-;							int32_t iSrcStride,
-;							uint8_t *pDst,
-;							int32_t iDstStride,
-;							const uint8_t *pABCD,
-;							int32_t iHeigh );
-;*******************************************************************************
-WELS_EXTERN McChromaWidthEq4_mmx
-	%assign  push_num 0
-	LOAD_6_PARA
-	SIGN_EXTENSION	r1, r1d
-	SIGN_EXTENSION	r3, r3d
-	SIGN_EXTENSION	r5, r5d
-
-	movd mm3, [r4];	[eax]
-	WELS_Zero mm7
-	punpcklbw mm3, mm3
-	movq      mm4, mm3
-	punpcklwd mm3, mm3
-	punpckhwd mm4, mm4
-
-	movq	  mm5, mm3
-	punpcklbw mm3, mm7
-	punpckhbw mm5, mm7
-
-	movq	  mm6, mm4
-	punpcklbw mm4, mm7
-	punpckhbw mm6, mm7
-
-	lea r4, [r0 + r1] ;lea ebx, [esi + eax]
-	movd mm0, [r0]
-	movd mm1, [r0+1]
-	punpcklbw mm0, mm7
-	punpcklbw mm1, mm7
-.xloop:
-
-	pmullw mm0, mm3
-	pmullw mm1, mm5
-	paddw  mm0, mm1
-
-	movd  mm1, [r4]
-	punpcklbw mm1, mm7
-	movq mm2, mm1
-	pmullw mm1, mm4
-	paddw mm0, mm1
-
-	movd mm1, [r4+1]
-	punpcklbw mm1, mm7
-	movq mm7, mm1
-	pmullw mm1,mm6
-	paddw mm0, mm1
-	movq mm1,mm7
-
-	paddw mm0, [h264_d0x20_mmx]
-	psrlw mm0, 6
-
-	WELS_Zero mm7
-	packuswb mm0, mm7
-	movd [r2], mm0
-
-	movq mm0, mm2
-
-	lea r2, [r2 + r3]
-	lea r4, [r4 + r1]
-
-	dec r5
-	jnz near .xloop
-	WELSEMMS
-	LOAD_6_PARA_POP
-	ret
-
-
-;*******************************************************************************
-; void McChromaWidthEq8_sse2( const uint8_t *pSrc,
-;						int32_t iSrcStride,
-;						uint8_t *pDst,
-;						int32_t iDstStride,
-;						const uint8_t *pABCD,
-;						int32_t iheigh );
-;*******************************************************************************
-WELS_EXTERN McChromaWidthEq8_sse2
-	%assign  push_num 0
-	LOAD_6_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION	r1, r1d
-	SIGN_EXTENSION	r3, r3d
-	SIGN_EXTENSION	r5, r5d
-
-	movd xmm3, [r4]
-	WELS_Zero xmm7
-	punpcklbw  xmm3, xmm3
-	punpcklwd  xmm3, xmm3
-
-	movdqa	   xmm4, xmm3
-	punpckldq  xmm3, xmm3
-	punpckhdq  xmm4, xmm4
-	movdqa     xmm5, xmm3
-	movdqa	   xmm6, xmm4
-
-	punpcklbw  xmm3, xmm7
-	punpckhbw  xmm5, xmm7
-	punpcklbw  xmm4, xmm7
-	punpckhbw  xmm6, xmm7
-
-	lea r4, [r0 + r1] ;lea ebx, [esi + eax]
-	movq xmm0, [r0]
-	movq xmm1, [r0+1]
-	punpcklbw xmm0, xmm7
-	punpcklbw xmm1, xmm7
-.xloop:
-
-	pmullw xmm0, xmm3
-	pmullw xmm1, xmm5
-	paddw  xmm0, xmm1
-
-	movq  xmm1, [r4]
-	punpcklbw xmm1, xmm7
-	movdqa xmm2, xmm1
-	pmullw xmm1, xmm4
-	paddw xmm0, xmm1
-
-	movq xmm1, [r4+1]
-	punpcklbw xmm1, xmm7
-	movdqa xmm7, xmm1
-	pmullw xmm1, xmm6
-	paddw xmm0, xmm1
-	movdqa xmm1,xmm7
-
-	paddw xmm0, [h264_d0x20_sse2]
-	psrlw xmm0, 6
-
-	WELS_Zero xmm7
-	packuswb xmm0, xmm7
-	movq [r2], xmm0
-
-	movdqa xmm0, xmm2
-
-	lea r2, [r2 + r3]
-	lea r4, [r4 + r1]
-
-	dec r5
-	jnz near .xloop
-
-	POP_XMM
-	LOAD_6_PARA_POP
-
-	ret
-
-
-
-
-;***********************************************************************
-; void McChromaWidthEq8_ssse3( const uint8_t *pSrc,
-;						 int32_t iSrcStride,
-;                        uint8_t *pDst,
-;                        int32_t iDstStride,
-;                        const uint8_t *pABCD,
-;					     int32_t iHeigh);
-;***********************************************************************
-WELS_EXTERN McChromaWidthEq8_ssse3
-	%assign  push_num 0
-	LOAD_6_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION	r1, r1d
-	SIGN_EXTENSION	r3, r3d
-	SIGN_EXTENSION	r5, r5d
-
-    pxor      xmm7, xmm7
-    movd   xmm5, [r4]
-    punpcklwd xmm5, xmm5
-    punpckldq xmm5, xmm5
-    movdqa    xmm6, xmm5
-    punpcklqdq xmm5, xmm5
-    punpckhqdq xmm6, xmm6
-
-    sub r2, r3 ;sub esi, edi
-    sub r2, r3
-	movdqa xmm7, [h264_d0x20_sse2]
-
-	movdqu xmm0, [r0]
-	movdqa xmm1, xmm0
-	psrldq xmm1, 1
-	punpcklbw xmm0, xmm1
-
-.hloop_chroma:
-	lea	r2, [r2+2*r3]
-
-	movdqu xmm2, [r0+r1]
-	movdqa xmm3, xmm2
-	psrldq xmm3, 1
-	punpcklbw xmm2, xmm3
-	movdqa      xmm4, xmm2
-
-    pmaddubsw  xmm0, xmm5
-    pmaddubsw  xmm2, xmm6
-    paddw      xmm0, xmm2
-    paddw      xmm0, xmm7
-	psrlw      xmm0, 6
-    packuswb   xmm0, xmm0
-    movq       [r2],xmm0
-
-    lea r0, [r0+2*r1]
-    movdqu xmm2, [r0]
-    movdqa xmm3, xmm2
-    psrldq xmm3, 1
-    punpcklbw xmm2, xmm3
-    movdqa      xmm0, xmm2
-
-    pmaddubsw  xmm4, xmm5
-    pmaddubsw  xmm2, xmm6
-    paddw      xmm4, xmm2
-    paddw      xmm4, xmm7
-	psrlw      xmm4, 6
-    packuswb   xmm4, xmm4
-    movq       [r2+r3],xmm4
-
-	sub r5, 2
-	jnz .hloop_chroma
-
-	POP_XMM
-	LOAD_6_PARA_POP
-
-	ret
-
-
--- a/codec/common/mc_common.h
+++ /dev/null
@@ -1,161 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#ifndef MC_COMMON_H
-#define MC_COMMON_H
-
-#include "typedefs.h"
-
-#if defined(__cplusplus)
-extern "C" {
-#endif//__cplusplus
-
-#if defined(HAVE_NEON)
-void McCopyWidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-
-void McCopyWidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-
-void McCopyWidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-
-void McChromaWidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t* pWeights, int32_t iHeight);
-
-void McChromaWidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t* pWeights, int32_t iHeight);
-
-void PixelAvgWidthEq16_neon(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight);
-void PixelAvgWidthEq8_neon(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight);
-void PixelAvgWidthEq4_neon(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight);
-
-void McHorVer01WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer01WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer01WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer03WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer03WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer03WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-
-void McHorVer10WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer10WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer10WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer30WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer30WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer30WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-
-    //horizontal filter to gain half sample, that is (2, 0) location in quarter sample
-void McHorVer20WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer20WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer20WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-
-    //vertical filter to gain half sample, that is (0, 2) location in quarter sample
-void McHorVer02WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer02WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer02WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-
-    //horizontal and vertical filter to gain half sample, that is (2, 2) location in quarter sample
-void McHorVer22WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer22WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-void McHorVer22WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
-
-void PixStrideAvgWidthEq16_neon(uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcStrideA, const uint8_t* pSrcB, int32_t iSrcStrideB, int32_t iHeight);
-void PixStrideAvgWidthEq8_neon(uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcStrideA, const uint8_t* pSrcB, int32_t iSrcStrideB, int32_t iHeight);
-
-void McHorVer20Width17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// width+1
-void McHorVer20Width9_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// width+1
-
-void McHorVer02Height17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// height+1
-void McHorVer02Height9_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);// height+1
-
-void McHorVer22Width17_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);//width+1&&height+1
-void McHorVer22Width9_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);//width+1&&height+1
-#endif
-
-#if defined(X86_ASM)
-//***************************************************************************//
-//                       MMXEXT definition                                   //
-//***************************************************************************//
-void McHorVer20WidthEq4_mmx (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                             int32_t iHeight);
-void McChromaWidthEq4_mmx (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                           const uint8_t* kpABCD, int32_t iHeight);
-void McCopyWidthEq4_mmx (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                         int32_t iHeight);
-void McCopyWidthEq8_mmx (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                         int32_t iHeight);
-void PixelAvgWidthEq4_mmx (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
-                           const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
-void PixelAvgWidthEq8_mmx (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
-                           const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
-
-//***************************************************************************//
-//                       SSE2 definition                                     //
-//***************************************************************************//
-void McChromaWidthEq8_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                            const uint8_t* kpABCD, int32_t iHeight);
-void McCopyWidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                           int32_t iHeight);
-void McHorVer20WidthEq8_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                              int32_t iHeight);
-void McHorVer20WidthEq16_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                               int32_t iHeight);
-void McHorVer02WidthEq8_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                              int32_t iHeight);
-void McHorVer22Width8HorFirst_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                                    int32_t iHeight);
-void McHorVer22Width8VerLastAlign_sse2 (const uint8_t* pTap, int32_t iTapStride, uint8_t* pDst, int32_t iDstStride,
-                                        int32_t iWidth, int32_t iHeight);
-void McHorVer22Width8VerLastUnAlign_sse2 (const uint8_t* pTap, int32_t iTapStride, uint8_t* pDst, int32_t iDstStride,
-                                         int32_t iWidth, int32_t iHeight);
-
-void PixelAvgWidthEq16_sse2 (uint8_t* pDst, int32_t iDstStride, const uint8_t* pSrcA, int32_t iSrcAStride,
-                             const uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
-
-void McHorVer20Width9Or17_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
-                                int32_t iHeight);
-
-void McHorVer02Height9Or17_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
-                                 int32_t iHeight);
-
-void McHorVer22HorFirst_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pTap, int32_t iTapStride, int32_t iWidth,
-                              int32_t iHeight);
-
-//***************************************************************************//
-//                       SSSE3 definition                                    //
-//***************************************************************************//
-
-void McChromaWidthEq8_ssse3 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
-                             const uint8_t* kpABCD, int32_t iHeight);
-
-#endif //X86_ASM
-
-#if defined(__cplusplus)
-}
-#endif//__cplusplus
-
-#endif//MC_COMMON_H
--- a/codec/common/mc_luma.asm
+++ /dev/null
@@ -1,1164 +1,0 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  mc_luma.asm
-;*
-;*  Abstract
-;*      sse2 motion compensation
-;*
-;*  History
-;*      17/08/2009 Created
-;*
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-
-;*******************************************************************************
-; Local Data (Read Only)
-;*******************************************************************************
-SECTION .rodata align=16
-
-;*******************************************************************************
-; Various memory constants (trigonometric values or rounding values)
-;*******************************************************************************
-
-ALIGN 16
-h264_w0x10:
-	dw 16, 16, 16, 16
-ALIGN 16
-h264_w0x10_1:
-	dw 16, 16, 16, 16, 16, 16, 16, 16
-ALIGN 16
-h264_mc_hc_32:
-	dw 32, 32, 32, 32, 32, 32, 32, 32
-
-
-;*******************************************************************************
-; Code
-;*******************************************************************************
-
-SECTION .text
-
-
-
-;*******************************************************************************
-; void McHorVer20WidthEq4_mmx( const uint8_t *pSrc,
-;                       int iSrcStride,
-;						uint8_t *pDst,
-;						int iDstStride,
-;						int iHeight)
-;*******************************************************************************
-WELS_EXTERN McHorVer20WidthEq4_mmx
-    %assign  push_num 0
-    LOAD_5_PARA
-	SIGN_EXTENSION	r1, r1d
-	SIGN_EXTENSION	r3, r3d
-	SIGN_EXTENSION	r4, r4d
-
-	sub r0, 2
-	WELS_Zero mm7
-	movq mm6, [h264_w0x10]
-.height_loop:
-	movd mm0, [r0]
-	punpcklbw mm0, mm7
-	movd mm1, [r0+5]
-	punpcklbw mm1, mm7
-	movd mm2, [r0+1]
-	punpcklbw mm2, mm7
-	movd mm3, [r0+4]
-	punpcklbw mm3, mm7
-	movd mm4, [r0+2]
-	punpcklbw mm4, mm7
-	movd mm5, [r0+3]
-	punpcklbw mm5, mm7
-
-	paddw mm2, mm3
-	paddw mm4, mm5
-	psllw mm4, 2
-	psubw mm4, mm2
-	paddw mm0, mm1
-	paddw mm0, mm4
-	psllw mm4, 2
-	paddw mm0, mm4
-	paddw mm0, mm6
-	psraw mm0, 5
-	packuswb mm0, mm7
-	movd [r2], mm0
-
-	add r0, r1
-	add r2, r3
-	dec r4
-	jnz .height_loop
-
-	WELSEMMS
-	LOAD_5_PARA_POP
-	ret
-
-;*******************************************************************************
-; Macros and other preprocessor constants
-;*******************************************************************************
-
-
-%macro SSE_LOAD_8P 3
-	movq %1, %3
-	punpcklbw %1, %2
-%endmacro
-
-%macro FILTER_HV_W8 9
-	paddw	%1, %6
-	movdqa	%8, %3
-	movdqa	%7, %2
-	paddw	%1, [h264_w0x10_1]
-	paddw	%8, %4
-	paddw	%7, %5
-	psllw	%8, 2
-	psubw	%8, %7
-	paddw	%1, %8
-	psllw	%8, 2
-	paddw	%1, %8
-	psraw   %1, 5
-	WELS_Zero %8
-	packuswb %1, %8
-	movq    %9, %1
-%endmacro
-
-;*******************************************************************************
-; Code
-;*******************************************************************************
-
-SECTION .text
-
-;***********************************************************************
-; void McHorVer22Width8HorFirst_sse2(const int16_t *pSrc,
-;                       int16_t iSrcStride,
-;						uint8_t *pDst,
-;						int32_t iDstStride
-;						int32_t iHeight
-;                       )
-;***********************************************************************
-WELS_EXTERN McHorVer22Width8HorFirst_sse2
-	%assign  push_num 0
-    LOAD_5_PARA
-    PUSH_XMM 8
-	SIGN_EXTENSION	r1, r1d
-	SIGN_EXTENSION	r3, r3d
-	SIGN_EXTENSION	r4, r4d
-	pxor xmm7, xmm7
-
-	sub r0, r1				;;;;;;;;need more 5 lines.
-	sub r0, r1
-
-.yloop_width_8:
-	movq xmm0, [r0]
-	punpcklbw xmm0, xmm7
-	movq xmm1, [r0+5]
-	punpcklbw xmm1, xmm7
-	movq xmm2, [r0+1]
-	punpcklbw xmm2, xmm7
-	movq xmm3, [r0+4]
-	punpcklbw xmm3, xmm7
-	movq xmm4, [r0+2]
-	punpcklbw xmm4, xmm7
-	movq xmm5, [r0+3]
-	punpcklbw xmm5, xmm7
-
-	paddw xmm2, xmm3
-	paddw xmm4, xmm5
-	psllw xmm4, 2
-	psubw xmm4, xmm2
-	paddw xmm0, xmm1
-	paddw xmm0, xmm4
-	psllw xmm4, 2
-	paddw xmm0, xmm4
-	movdqa [r2], xmm0
-
-	add r0, r1
-	add r2, r3
-	dec r4
-	jnz .yloop_width_8
-	POP_XMM
-	LOAD_5_PARA_POP
-	ret
-
-;*******************************************************************************
-; void McHorVer20WidthEq8_sse2(  const uint8_t *pSrc,
-;                       int iSrcStride,
-;												uint8_t *pDst,
-;												int iDstStride,
-;												int iHeight,
-;                      );
-;*******************************************************************************
-WELS_EXTERN McHorVer20WidthEq8_sse2
-	%assign  push_num 0
-    LOAD_5_PARA
-    PUSH_XMM 8
-	SIGN_EXTENSION	r1, r1d
-	SIGN_EXTENSION	r3, r3d
-	SIGN_EXTENSION	r4, r4d
-	lea r0, [r0-2]            ;pSrc -= 2;
-
-	pxor xmm7, xmm7
-	movdqa xmm6, [h264_w0x10_1]
-.y_loop:
-	movq xmm0, [r0]
-	punpcklbw xmm0, xmm7
-	movq xmm1, [r0+5]
-	punpcklbw xmm1, xmm7
-	movq xmm2, [r0+1]
-	punpcklbw xmm2, xmm7
-	movq xmm3, [r0+4]
-	punpcklbw xmm3, xmm7
-	movq xmm4, [r0+2]
-	punpcklbw xmm4, xmm7
-	movq xmm5, [r0+3]
-	punpcklbw xmm5, xmm7
-
-	paddw xmm2, xmm3
-	paddw xmm4, xmm5
-	psllw xmm4, 2
-	psubw xmm4, xmm2
-	paddw xmm0, xmm1
-	paddw xmm0, xmm4
-	psllw xmm4, 2
-	paddw xmm0, xmm4
-	paddw xmm0, xmm6
-	psraw xmm0, 5
-
-	packuswb xmm0, xmm7
-	movq [r2], xmm0
-
-	lea r2, [r2+r3]
-	lea r0, [r0+r1]
-	dec r4
-	jnz near .y_loop
-
-	POP_XMM
-	LOAD_5_PARA_POP
-	ret
-
-;*******************************************************************************
-; void McHorVer20WidthEq16_sse2(  const uint8_t *pSrc,
-;                       int iSrcStride,
-;												uint8_t *pDst,
-;												int iDstStride,
-;												int iHeight,
-;                      );
-;*******************************************************************************
-WELS_EXTERN McHorVer20WidthEq16_sse2
-	%assign  push_num 0
-    LOAD_5_PARA
-    PUSH_XMM 8
-	SIGN_EXTENSION	r1, r1d
-	SIGN_EXTENSION	r3, r3d
-	SIGN_EXTENSION	r4, r4d
-	lea r0, [r0-2]            ;pSrc -= 2;
-
-	pxor xmm7, xmm7
-	movdqa xmm6, [h264_w0x10_1]
-.y_loop:
-
-	movq xmm0, [r0]
-	punpcklbw xmm0, xmm7
-	movq xmm1, [r0+5]
-	punpcklbw xmm1, xmm7
-	movq xmm2, [r0+1]
-	punpcklbw xmm2, xmm7
-	movq xmm3, [r0+4]
-	punpcklbw xmm3, xmm7
-	movq xmm4, [r0+2]
-	punpcklbw xmm4, xmm7
-	movq xmm5, [r0+3]
-	punpcklbw xmm5, xmm7
-
-	paddw xmm2, xmm3
-	paddw xmm4, xmm5
-	psllw xmm4, 2
-	psubw xmm4, xmm2
-	paddw xmm0, xmm1
-	paddw xmm0, xmm4
-	psllw xmm4, 2
-	paddw xmm0, xmm4
-	paddw xmm0, xmm6
-	psraw xmm0, 5
-	packuswb xmm0, xmm7
-	movq [r2], xmm0
-
-	movq xmm0, [r0+8]
-	punpcklbw xmm0, xmm7
-	movq xmm1, [r0+5+8]
-	punpcklbw xmm1, xmm7
-	movq xmm2, [r0+1+8]
-	punpcklbw xmm2, xmm7
-	movq xmm3, [r0+4+8]
-	punpcklbw xmm3, xmm7
-	movq xmm4, [r0+2+8]
-	punpcklbw xmm4, xmm7
-	movq xmm5, [r0+3+8]
-	punpcklbw xmm5, xmm7
-
-	paddw xmm2, xmm3
-	paddw xmm4, xmm5
-	psllw xmm4, 2
-	psubw xmm4, xmm2
-	paddw xmm0, xmm1
-	paddw xmm0, xmm4
-	psllw xmm4, 2
-	paddw xmm0, xmm4
-	paddw xmm0, xmm6
-	psraw xmm0, 5
-	packuswb xmm0, xmm7
-	movq [r2+8], xmm0
-
-	lea r2, [r2+r3]
-	lea r0, [r0+r1]
-	dec r4
-	jnz near .y_loop
-
-	POP_XMM
-	LOAD_5_PARA_POP
-	ret
-
-
-;*******************************************************************************
-; void McHorVer02WidthEq8_sse2( const uint8_t *pSrc,
-;                       int iSrcStride,
-;                       uint8_t *pDst,
-;                       int iDstStride,
-;                       int iHeight )
-;*******************************************************************************
-WELS_EXTERN McHorVer02WidthEq8_sse2
-	%assign  push_num 0
-    LOAD_5_PARA
-    PUSH_XMM 8
-	SIGN_EXTENSION	r1, r1d
-	SIGN_EXTENSION	r3, r3d
-	SIGN_EXTENSION	r4, r4d
-	sub r0, r1
-	sub r0, r1
-
-	WELS_Zero xmm7
-
-	SSE_LOAD_8P xmm0, xmm7, [r0]
-	SSE_LOAD_8P xmm1, xmm7, [r0+r1]
-	lea r0, [r0+2*r1]
-	SSE_LOAD_8P xmm2, xmm7, [r0]
-	SSE_LOAD_8P xmm3, xmm7, [r0+r1]
-	lea r0, [r0+2*r1]
-	SSE_LOAD_8P xmm4, xmm7, [r0]
-	SSE_LOAD_8P xmm5, xmm7, [r0+r1]
-
-.start:
-	FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
-	dec r4
-	jz near .xx_exit
-
-	lea r0, [r0+2*r1]
-	SSE_LOAD_8P xmm6, xmm7, [r0]
-	FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
-	dec r4
-	jz near .xx_exit
-
-	lea r2, [r2+2*r3]
-	SSE_LOAD_8P xmm7, xmm0, [r0+r1]
-	FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
-	dec r4
-	jz near .xx_exit
-
-	lea r0, [r0+2*r1]
-	SSE_LOAD_8P xmm0, xmm1, [r0]
-	FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
-	dec r4
-	jz near .xx_exit
-
-	lea r2, [r2+2*r3]
-	SSE_LOAD_8P xmm1, xmm2, [r0+r1]
-	FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
-	dec r4
-	jz near .xx_exit
-
-	lea r0, [r0+2*r1]
-	SSE_LOAD_8P xmm2, xmm3, [r0]
-	FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
-	dec r4
-	jz near .xx_exit
-
-	lea r2, [r2+2*r3]
-	SSE_LOAD_8P xmm3, xmm4, [r0+r1]
-	FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
-	dec r4
-	jz near .xx_exit
-
-	lea r0, [r0+2*r1]
-	SSE_LOAD_8P xmm4, xmm5, [r0]
-	FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
-	dec r4
-	jz near .xx_exit
-
-	lea r2, [r2+2*r3]
-	SSE_LOAD_8P xmm5, xmm6, [r0+r1]
-	jmp near .start
-
-.xx_exit:
-	POP_XMM
-	LOAD_5_PARA_POP
-	ret
-
-;***********************************************************************
-; Code
-;***********************************************************************
-
-SECTION .text
-
-
-
-;***********************************************************************
-; void McHorVer02Height9Or17_sse2(	const uint8_t *pSrc,
-;                       int32_t iSrcStride,
-;                       uint8_t *pDst,
-;                       int32_t iDstStride,
-;						int32_t iWidth,
-;                       int32_t iHeight )
-;***********************************************************************
-WELS_EXTERN McHorVer02Height9Or17_sse2
-	%assign  push_num 0
-    LOAD_6_PARA
-    PUSH_XMM 8
-	SIGN_EXTENSION	r1, r1d
-	SIGN_EXTENSION	r3, r3d
-	SIGN_EXTENSION	r4, r4d
-	SIGN_EXTENSION	r5, r5d
-
-%ifndef X86_32
-	push r12
-	push r13
-	push r14
-	mov  r12, r0
-	mov	 r13, r2
-	mov	 r14, r5
-%endif
-
-	shr r4, 3
-	sub r0, r1
-	sub r0, r1
-
-.xloop:
-	WELS_Zero xmm7
-	SSE_LOAD_8P xmm0, xmm7, [r0]
-	SSE_LOAD_8P xmm1, xmm7, [r0+r1]
-	lea r0, [r0+2*r1]
-	SSE_LOAD_8P xmm2, xmm7, [r0]
-	SSE_LOAD_8P xmm3, xmm7, [r0+r1]
-	lea r0, [r0+2*r1]
-	SSE_LOAD_8P xmm4, xmm7, [r0]
-	SSE_LOAD_8P xmm5, xmm7, [r0+r1]
-
-	FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
-	dec r5
-	lea r0, [r0+2*r1]
-	SSE_LOAD_8P xmm6, xmm7, [r0]
-	movdqa xmm0,xmm1
-	movdqa xmm1,xmm2
-	movdqa xmm2,xmm3
-	movdqa xmm3,xmm4
-	movdqa xmm4,xmm5
-	movdqa xmm5,xmm6
-	add r2, r3
-	sub r0, r1
-
-.start:
-	FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
-	dec r5
-	jz near .x_loop_dec
-
-	lea r0, [r0+2*r1]
-	SSE_LOAD_8P xmm6, xmm7, [r0]
-	FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
-	dec r5
-	jz near .x_loop_dec
-
-	lea r2, [r2+2*r3]
-	SSE_LOAD_8P xmm7, xmm0, [r0+r1]
-	FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
-	dec r5
-	jz near .x_loop_dec
-
-	lea r0, [r0+2*r1]
-	SSE_LOAD_8P xmm0, xmm1, [r0]
-	FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
-	dec r5
-	jz near .x_loop_dec
-
-	lea r2, [r2+2*r3]
-	SSE_LOAD_8P xmm1, xmm2, [r0+r1]
-	FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
-	dec r5
-	jz near .x_loop_dec
-
-	lea r0, [r0+2*r1]
-	SSE_LOAD_8P xmm2, xmm3, [r0]
-	FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
-	dec r5
-	jz near .x_loop_dec
-
-	lea r2, [r2+2*r3]
-	SSE_LOAD_8P xmm3, xmm4, [r0+r1]
-	FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
-	dec r5
-	jz near .x_loop_dec
-
-	lea r0, [r0+2*r1]
-	SSE_LOAD_8P xmm4, xmm5, [r0]
-	FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
-	dec r5
-	jz near .x_loop_dec
-
-	lea r2, [r2+2*r3]
-	SSE_LOAD_8P xmm5, xmm6, [r0+r1]
-	jmp near .start
-
-.x_loop_dec:
-	dec r4
-	jz  near .xx_exit
-%ifdef X86_32
-	mov	r0, arg1
-	mov r2, arg3
-	mov r5, arg6
-%else
-	mov r0, r12
-	mov r2, r13
-	mov r5, r14
-%endif
-	sub r0, r1
-	sub r0, r1
-	add r0, 8
-	add r2, 8
-	jmp near .xloop
-
-.xx_exit:
-%ifndef X86_32
-	pop r14
-	pop r13
-	pop r12
-%endif
-	POP_XMM
-	LOAD_6_PARA_POP
-	ret
-
-
-;***********************************************************************
-; void McHorVer20Width9Or17_sse2(		const uint8_t *pSrc,
-;                       int32_t iSrcStride,
-;						uint8_t *pDst,
-;						int32_t iDstStride,
-;						int32_t iWidth,
-;						int32_t iHeight
-;                      );
-;***********************************************************************
-WELS_EXTERN McHorVer20Width9Or17_sse2
-	%assign  push_num 0
-    LOAD_6_PARA
-    PUSH_XMM 8
-	SIGN_EXTENSION	r1, r1d
-	SIGN_EXTENSION	r3, r3d
-	SIGN_EXTENSION	r4, r4d
-	SIGN_EXTENSION	r5, r5d
-	sub r0, 2
-	pxor xmm7, xmm7
-
-	cmp r4, 9
-	jne near .width_17
-
-.yloop_width_9:
-	movq xmm0, [r0]
-	punpcklbw xmm0, xmm7
-	movq xmm1, [r0+5]
-	punpcklbw xmm1, xmm7
-	movq xmm2, [r0+1]
-	punpcklbw xmm2, xmm7
-	movq xmm3, [r0+4]
-	punpcklbw xmm3, xmm7
-	movq xmm4, [r0+2]
-	punpcklbw xmm4, xmm7
-	movq xmm5, [r0+3]
-	punpcklbw xmm5, xmm7
-
-	movdqa xmm7, xmm2
-	paddw   xmm7, xmm3
-	movdqa xmm6, xmm4
-	paddw   xmm6, xmm5
-	psllw xmm6, 2
-	psubw xmm6, xmm7
-	paddw xmm0, xmm1
-	paddw xmm0, xmm6
-	psllw xmm6, 2
-	paddw xmm0, xmm6
-	paddw xmm0, [h264_w0x10_1]
-	psraw  xmm0, 5
-	packuswb xmm0, xmm0
-	movd [r2], xmm0
-
-	pxor  xmm7, xmm7
-	movq xmm0, [r0+6]
-	punpcklbw xmm0, xmm7
-
-	paddw xmm4, xmm1
-	paddw xmm5, xmm3
-	psllw xmm5, 2
-	psubw xmm5, xmm4
-	paddw xmm2, xmm0
-	paddw xmm2, xmm5
-	psllw xmm5, 2
-	paddw xmm2, xmm5
-	paddw xmm2, [h264_w0x10_1]
-	psraw  xmm2, 5
-	packuswb xmm2, xmm2
-	movq [r2+1], xmm2
-
-	add r0, r1
-	add r2, r3
-	dec r5
-	jnz .yloop_width_9
-	POP_XMM
-	LOAD_6_PARA_POP
-	ret
-
-
-.width_17:
-.yloop_width_17:
-	movq xmm0, [r0]
-	punpcklbw xmm0, xmm7
-	movq xmm1, [r0+5]
-	punpcklbw xmm1, xmm7
-	movq xmm2, [r0+1]
-	punpcklbw xmm2, xmm7
-	movq xmm3, [r0+4]
-	punpcklbw xmm3, xmm7
-	movq xmm4, [r0+2]
-	punpcklbw xmm4, xmm7
-	movq xmm5, [r0+3]
-	punpcklbw xmm5, xmm7
-
-	paddw xmm2, xmm3
-	paddw xmm4, xmm5
-	psllw xmm4, 2
-	psubw xmm4, xmm2
-	paddw xmm0, xmm1
-	paddw xmm0, xmm4
-	psllw xmm4, 2
-	paddw xmm0, xmm4
-	paddw xmm0, [h264_w0x10_1]
-	psraw  xmm0, 5
-	packuswb xmm0, xmm0
-	movq [r2], xmm0
-
-	movq xmm0, [r0+8]
-	punpcklbw xmm0, xmm7
-	movq xmm1, [r0+5+8]
-	punpcklbw xmm1, xmm7
-	movq xmm2, [r0+1+8]
-	punpcklbw xmm2, xmm7
-	movq xmm3, [r0+4+8]
-	punpcklbw xmm3, xmm7
-	movq xmm4, [r0+2+8]
-	punpcklbw xmm4, xmm7
-	movq xmm5, [r0+3+8]
-	punpcklbw xmm5, xmm7
-
-	movdqa xmm7, xmm2
-	paddw   xmm7, xmm3
-	movdqa xmm6, xmm4
-	paddw   xmm6, xmm5
-	psllw xmm6, 2
-	psubw xmm6, xmm7
-	paddw xmm0, xmm1
-	paddw xmm0, xmm6
-	psllw xmm6, 2
-	paddw xmm0, xmm6
-	paddw xmm0, [h264_w0x10_1]
-	psraw  xmm0, 5
-	packuswb xmm0, xmm0
-	movd [r2+8], xmm0
-
-
-	pxor  xmm7, xmm7
-	movq xmm0, [r0+6+8]
-	punpcklbw xmm0, xmm7
-
-	paddw xmm4, xmm1
-	paddw xmm5, xmm3
-	psllw xmm5, 2
-	psubw xmm5, xmm4
-	paddw xmm2, xmm0
-	paddw xmm2, xmm5
-	psllw xmm5, 2
-	paddw xmm2, xmm5
-	paddw xmm2, [h264_w0x10_1]
-	psraw  xmm2, 5
-	packuswb xmm2, xmm2
-	movq [r2+9], xmm2
-	add r0, r1
-	add r2, r3
-	dec r5
-	jnz .yloop_width_17
-	POP_XMM
-	LOAD_6_PARA_POP
-	ret
-
-
-
-;***********************************************************************
-;void McHorVer22HorFirst_sse2
-;							(const uint8_t *pSrc,
-;							int32_t iSrcStride,
-;							uint8_t * pTap,
-;							int32_t iTapStride,
-;							int32_t iWidth,int32_t iHeight);
-;***********************************************************************
-WELS_EXTERN McHorVer22HorFirst_sse2
-	%assign  push_num 0
-    LOAD_6_PARA
-    PUSH_XMM 8
-	SIGN_EXTENSION	r1, r1d
-	SIGN_EXTENSION	r3, r3d
-	SIGN_EXTENSION	r4, r4d
-	SIGN_EXTENSION	r5, r5d
-	pxor xmm7, xmm7
-	sub r0, r1				;;;;;;;;need more 5 lines.
-	sub r0, r1
-
-	cmp r4, 9
-	jne near .width_17
-
-.yloop_width_9:
-	movq xmm0, [r0]
-	punpcklbw xmm0, xmm7
-	movq xmm1, [r0+5]
-	punpcklbw xmm1, xmm7
-	movq xmm2, [r0+1]
-	punpcklbw xmm2, xmm7
-	movq xmm3, [r0+4]
-	punpcklbw xmm3, xmm7
-	movq xmm4, [r0+2]
-	punpcklbw xmm4, xmm7
-	movq xmm5, [r0+3]
-	punpcklbw xmm5, xmm7
-
-	movdqa xmm7, xmm2
-	paddw   xmm7, xmm3
-	movdqa xmm6, xmm4
-	paddw   xmm6, xmm5
-	psllw xmm6, 2
-	psubw xmm6, xmm7
-	paddw xmm0, xmm1
-	paddw xmm0, xmm6
-	psllw xmm6, 2
-	paddw xmm0, xmm6
-	movd [r2], xmm0
-
-	pxor  xmm7, xmm7
-	movq xmm0, [r0+6]
-	punpcklbw xmm0, xmm7
-
-	paddw xmm4, xmm1
-	paddw xmm5, xmm3
-	psllw xmm5, 2
-	psubw xmm5, xmm4
-	paddw xmm2, xmm0
-	paddw xmm2, xmm5
-	psllw xmm5, 2
-	paddw xmm2, xmm5
-	movq [r2+2], xmm2
-	movhps [r2+2+8], xmm2
-
-	add r0, r1
-	add r2, r3
-	dec r5
-	jnz .yloop_width_9
-	POP_XMM
-	LOAD_6_PARA_POP
-	ret
-
-
-.width_17:
-.yloop_width_17:
-	movq xmm0, [r0]
-	punpcklbw xmm0, xmm7
-	movq xmm1, [r0+5]
-	punpcklbw xmm1, xmm7
-	movq xmm2, [r0+1]
-	punpcklbw xmm2, xmm7
-	movq xmm3, [r0+4]
-	punpcklbw xmm3, xmm7
-	movq xmm4, [r0+2]
-	punpcklbw xmm4, xmm7
-	movq xmm5, [r0+3]
-	punpcklbw xmm5, xmm7
-
-	paddw xmm2, xmm3
-	paddw xmm4, xmm5
-	psllw xmm4, 2
-	psubw xmm4, xmm2
-	paddw xmm0, xmm1
-	paddw xmm0, xmm4
-	psllw xmm4, 2
-	paddw xmm0, xmm4
-	movdqa [r2], xmm0
-
-	movq xmm0, [r0+8]
-	punpcklbw xmm0, xmm7
-	movq xmm1, [r0+5+8]
-	punpcklbw xmm1, xmm7
-	movq xmm2, [r0+1+8]
-	punpcklbw xmm2, xmm7
-	movq xmm3, [r0+4+8]
-	punpcklbw xmm3, xmm7
-	movq xmm4, [r0+2+8]
-	punpcklbw xmm4, xmm7
-	movq xmm5, [r0+3+8]
-	punpcklbw xmm5, xmm7
-
-	movdqa xmm7, xmm2
-	paddw   xmm7, xmm3
-	movdqa xmm6, xmm4
-	paddw   xmm6, xmm5
-	psllw xmm6, 2
-	psubw xmm6, xmm7
-	paddw xmm0, xmm1
-	paddw xmm0, xmm6
-	psllw xmm6, 2
-	paddw xmm0, xmm6
-	movd [r2+16], xmm0
-
-
-	pxor  xmm7, xmm7
-	movq xmm0, [r0+6+8]
-	punpcklbw xmm0, xmm7
-
-	paddw xmm4, xmm1
-	paddw xmm5, xmm3
-	psllw xmm5, 2
-	psubw xmm5, xmm4
-	paddw xmm2, xmm0
-	paddw xmm2, xmm5
-	psllw xmm5, 2
-	paddw xmm2, xmm5
-	movq [r2+18], xmm2
-	movhps [r2+18+8], xmm2
-
-	add r0, r1
-	add r2, r3
-	dec r5
-	jnz .yloop_width_17
-	POP_XMM
-	LOAD_6_PARA_POP
-	ret
-
-
-%macro FILTER_VER 9
-	paddw  %1, %6
-	movdqa %7, %2
-	movdqa %8, %3
-
-
-	paddw %7, %5
-	paddw %8, %4
-
-	psubw  %1, %7
-	psraw   %1, 2
-	paddw  %1, %8
-	psubw  %1, %7
-	psraw   %1, 2
-	paddw  %8, %1
-	paddw  %8, [h264_mc_hc_32]
-	psraw   %8, 6
-	packuswb %8, %8
-	movq %9, %8
-%endmacro
-;***********************************************************************
-;void McHorVer22Width8VerLastAlign_sse2(
-;											const uint8_t *pTap,
-;											int32_t iTapStride,
-;											uint8_t * pDst,
-;											int32_t iDstStride,
-;											int32_t iWidth,
-;											int32_t iHeight);
-;***********************************************************************
-
-WELS_EXTERN McHorVer22Width8VerLastAlign_sse2
-	%assign  push_num 0
-    LOAD_6_PARA
-    PUSH_XMM 8
-	SIGN_EXTENSION	r1, r1d
-	SIGN_EXTENSION	r3, r3d
-	SIGN_EXTENSION	r4, r4d
-	SIGN_EXTENSION	r5, r5d
-%ifndef X86_32
-	push r12
-	push r13
-	push r14
-	mov  r12, r0
-	mov	 r13, r2
-	mov	 r14, r5
-%endif
-
-	shr r4, 3
-
-.width_loop:
-	movdqa xmm0, [r0]
-	movdqa xmm1, [r0+r1]
-	lea r0, [r0+2*r1]
-	movdqa xmm2, [r0]
-	movdqa xmm3, [r0+r1]
-	lea r0, [r0+2*r1]
-	movdqa xmm4, [r0]
-	movdqa xmm5, [r0+r1]
-
-	FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
-	dec r5
-	lea r0, [r0+2*r1]
-	movdqa xmm6, [r0]
-
-	movdqa xmm0, xmm1
-	movdqa xmm1, xmm2
-	movdqa xmm2, xmm3
-	movdqa xmm3, xmm4
-	movdqa xmm4, xmm5
-	movdqa xmm5, xmm6
-
-	add r2, r3
-	sub r0, r1
-
-.start:
-	FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
-	dec r5
-	jz near .x_loop_dec
-
-	lea r0, [r0+2*r1]
-	movdqa xmm6, [r0]
-	FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[r2+r3]
-	dec r5
-	jz near .x_loop_dec
-
-	lea r2, [r2+2*r3]
-	movdqa xmm7, [r0+r1]
-	FILTER_VER  xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
-	dec r5
-	jz near .x_loop_dec
-
-	lea r0, [r0+2*r1]
-	movdqa xmm0, [r0]
-	FILTER_VER  xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[r2+r3]
-	dec r5
-	jz near .x_loop_dec
-
-	lea r2, [r2+2*r3]
-	movdqa xmm1, [r0+r1]
-	FILTER_VER  xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[r2]
-	dec r5
-	jz near .x_loop_dec
-
-	lea r0, [r0+2*r1]
-	movdqa xmm2, [r0]
-	FILTER_VER  xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[r2+r3]
-	dec r5
-	jz near .x_loop_dec
-
-	lea r2, [r2+2*r3]
-	movdqa xmm3, [r0+r1]
-	FILTER_VER  xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[r2]
-	dec r5
-	jz near .x_loop_dec
-
-	lea r0, [r0+2*r1]
-	movdqa xmm4, [r0]
-	FILTER_VER  xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [r2+r3]
-	dec r5
-	jz near .x_loop_dec
-
-	lea r2, [r2+2*r3]
-	movdqa xmm5, [r0+r1]
-	jmp near .start
-
-.x_loop_dec:
-	dec r4
-	jz near .exit
-%ifdef X86_32
-	mov	r0, arg1
-	mov r2, arg3
-	mov r5, arg6
-%else
-	mov r0, r12
-	mov r2, r13
-	mov r5, r14
-%endif
-	add r0, 16
-	add r2, 8
-	jmp .width_loop
-
-.exit:
-%ifndef X86_32
-	pop r14
-	pop r13
-	pop r12
-%endif
-	POP_XMM
-	LOAD_6_PARA_POP
-	ret
-
-;***********************************************************************
-;void McHorVer22Width8VerLastUnAlign_sse2(
-;											const uint8_t *pTap,
-;											int32_t iTapStride,
-;											uint8_t * pDst,
-;											int32_t iDstStride,
-;											int32_t iWidth,
-;											int32_t iHeight);
-;***********************************************************************
-
-WELS_EXTERN McHorVer22Width8VerLastUnAlign_sse2
-	%assign  push_num 0
-    LOAD_6_PARA
-    PUSH_XMM 8
-	SIGN_EXTENSION	r1, r1d
-	SIGN_EXTENSION	r3, r3d
-	SIGN_EXTENSION	r4, r4d
-	SIGN_EXTENSION	r5, r5d
-%ifndef X86_32
-	push r12
-	push r13
-	push r14
-	mov  r12, r0
-	mov	 r13, r2
-	mov	 r14, r5
-%endif
-	shr r4, 3
-
-.width_loop:
-	movdqu xmm0, [r0]
-	movdqu xmm1, [r0+r1]
-	lea r0, [r0+2*r1]
-	movdqu xmm2, [r0]
-	movdqu xmm3, [r0+r1]
-	lea r0, [r0+2*r1]
-	movdqu xmm4, [r0]
-	movdqu xmm5, [r0+r1]
-
-	FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
-	dec r5
-	lea r0, [r0+2*r1]
-	movdqu xmm6, [r0]
-
-	movdqa xmm0, xmm1
-	movdqa xmm1, xmm2
-	movdqa xmm2, xmm3
-	movdqa xmm3, xmm4
-	movdqa xmm4, xmm5
-	movdqa xmm5, xmm6
-
-	add r2, r3
-	sub r0, r1
-
-.start:
-	FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
-	dec r5
-	jz near .x_loop_dec
-
-	lea r0, [r0+2*r1]
-	movdqu xmm6, [r0]
-	FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[r2+r3]
-	dec r5
-	jz near .x_loop_dec
-
-	lea r2, [r2+2*r3]
-	movdqu xmm7, [r0+r1]
-	FILTER_VER  xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
-	dec r5
-	jz near .x_loop_dec
-
-	lea r0, [r0+2*r1]
-	movdqu xmm0, [r0]
-	FILTER_VER  xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[r2+r3]
-	dec r5
-	jz near .x_loop_dec
-
-	lea r2, [r2+2*r3]
-	movdqu xmm1, [r0+r1]
-	FILTER_VER  xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[r2]
-	dec r5
-	jz near .x_loop_dec
-
-	lea r0, [r0+2*r1]
-	movdqu xmm2, [r0]
-	FILTER_VER  xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[r2+r3]
-	dec r5
-	jz near .x_loop_dec
-
-	lea r2, [r2+2*r3]
-	movdqu xmm3, [r0+r1]
-	FILTER_VER  xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[r2]
-	dec r5
-	jz near .x_loop_dec
-
-	lea r0, [r0+2*r1]
-	movdqu xmm4, [r0]
-	FILTER_VER  xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [r2+r3]
-	dec r5
-	jz near .x_loop_dec
-
-	lea r2, [r2+2*r3]
-	movdqu xmm5, [r0+r1]
-	jmp near .start
-
-.x_loop_dec:
-	dec r4
-	jz near .exit
-%ifdef X86_32
-	mov	r0, arg1
-	mov r2, arg3
-	mov r5, arg6
-%else
-	mov r0, r12
-	mov r2, r13
-	mov r5, r14
-%endif
-	add r0, 16
-	add r2, 8
-	jmp .width_loop
-
-.exit:
-%ifndef X86_32
-	pop r14
-	pop r13
-	pop r12
-%endif
-	POP_XMM
-	LOAD_6_PARA_POP
-	ret
--- a/codec/common/mc_neon.S
+++ /dev/null
@@ -1,2210 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#ifdef  HAVE_NEON
-.text
-#include "arm_arch_common_macro.S"
-
-#ifdef __APPLE__
-.macro	AVERAGE_TWO_8BITS
-//	{	// input:dst_d, src_d A and B; working: q13
-    vaddl.u8	q13, $2, $1
-    vrshrn.u16		$0, q13, #1
-//	}
-.endm
-
-.macro	FILTER_6TAG_8BITS
-//	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
-    vaddl.u8	q12, $0, $5	//q12=src[-2]+src[3]
-    vaddl.u8	q13, $2, $3	//src[0]+src[1]
-    vmla.u16	q12, q13, $7	//q12 += 20*(src[0]+src[1]), 2 cycles
-    vaddl.u8	q13, $1, $4	//src[-1]+src[2]
-    vmls.s16	q12, q13, $8	//q12 -= 5*(src[-1]+src[2]), 2 cycles
-    vqrshrun.s16		$6, q12, #5
-//	}
-.endm
-
-.macro	FILTER_SINGLE_TAG_8BITS		// when width=17/9, used
-//	{	// input: src_d{Y[0][1][2][3][4][5]X, the even of working_q2},
-    vrev64.8	$2, $0				// X[5][4][3][2][1][0]O
-    vaddl.u8	$3, $0, $2			// each 16bits, *[50][41][32][23][14][05]*
-    vmul.s16	$0, $2, $1			// 0+1*[50]-5*[41]+20[32]
-    vpadd.s16	$0, $0, $0
-    vpadd.s16	$0, $0, $0
-    vqrshrun.s16	$0, $4, #5
-//	}
-.endm
-
-.macro	FILTER_6TAG_8BITS_AVERAGE_WITH_0
-//	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
-    vaddl.u8	q12, $0, $5	//q12=src[-2]+src[3]
-    vaddl.u8	q13, $2, $3	//src[0]+src[1]
-    vmla.u16	q12, q13, $7	//q12 += 20*(src[0]+src[1]), 2 cycles
-    vaddl.u8	q13, $1, $4	//src[-1]+src[2]
-    vmls.s16	q12, q13, $8	//q12 -= 5*(src[-1]+src[2]), 2 cycles
-    vqrshrun.s16		$6, q12, #5
-    vaddl.u8	q13, $2, $6
-    vrshrn.u16		$6, q13, #1
-//	}
-.endm
-
-.macro	FILTER_6TAG_8BITS_AVERAGE_WITH_1
-//	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
-    vaddl.u8	q12, $0, $5	//q12=src[-2]+src[3]
-    vaddl.u8	q13, $2, $3	//src[0]+src[1]
-    vmla.u16	q12, q13, $7	//q12 += 20*(src[0]+src[1]), 2 cycles
-    vaddl.u8	q13, $1, $4	//src[-1]+src[2]
-    vmls.s16	q12, q13, $8	//q12 -= 5*(src[-1]+src[2]), 2 cycles
-    vqrshrun.s16		$6, q12, #5
-    vaddl.u8	q13, $3, $6
-    vrshrn.u16		$6, q13, #1
-//	}
-.endm
-
-.macro	FILTER_6TAG_8BITS_TO_16BITS
-//	{	// input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:q13
-    vaddl.u8	$6, $0, $5		//dst_q=src[-2]+src[3]
-    vaddl.u8	q13, $2, $3	//src[0]+src[1]
-    vmla.u16	$6, q13, $7	//dst_q += 20*(src[0]+src[1]), 2 cycles
-    vaddl.u8	q13, $1, $4	//src[-1]+src[2]
-    vmls.s16	$6, q13, $8	//dst_q -= 5*(src[-1]+src[2]), 2 cycles
-//	}
-.endm
-
-.macro	FILTER_3_IN_16BITS_TO_8BITS
-//	{	// input:a, b, c, dst_d;
-    vsub.s16	$0, $0, $1			//a-b
-    vshr.s16	$0, $0, #2			//(a-b)/4
-    vsub.s16	$0, $0, $1			//(a-b)/4-b
-    vadd.s16	$0, $0, $2			//(a-b)/4-b+c
-    vshr.s16	$0, $0, #2			//((a-b)/4-b+c)/4
-    vadd.s16	$0, $0, $2			//((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
-    vqrshrun.s16	$3, $0, #6		//(+32)>>6
-//	}
-.endm
-
-.macro	UNPACK_2_16BITS_TO_ABC
-//	{	// input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;
-    vext.16	$4, $0, $1, #2		//src[0]
-    vext.16	$3, $0, $1, #3		//src[1]
-    vadd.s16	$4, $3					//c=src[0]+src[1]
-
-    vext.16	$3, $0, $1, #1		//src[-1]
-    vext.16	$2, $0, $1, #4		//src[2]
-    vadd.s16	$3, $2					//b=src[-1]+src[2]
-
-    vext.16	$2, $0, $1, #5		//src[3]
-    vadd.s16	$2, $0					//a=src[-2]+src[3]
-//	}
-.endm
-
-.macro	UNPACK_1_IN_8x16BITS_TO_8BITS
-//	{	// each 16bits; input: d_dst, d_src[0:3] (even), d_src[4:5]+%% (odd)
-    vext.16	$3, $3, $3, #7	// 0x????, [0][1][2][3][4][5],
-    vrev64.16	$1, $1
-    vadd.u16	$2, $1				// C[2+3],B[1+4],A[0+5],
-    vshr.s64	$1, $2, #16
-    vshr.s64	$0, $2, #32		// Output: C $2, B $1, A $0
-
-    vsub.s16	$0, $0, $1			//a-b
-    vshr.s16	$0, $0, #2			//(a-b)/4
-    vsub.s16	$0, $0, $1			//(a-b)/4-b
-    vadd.s16	$0, $0, $2			//(a-b)/4-b+c
-    vshr.s16	$0, $0, #2			//((a-b)/4-b+c)/4
-    vadd.s16	$1, $0, $2			//((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
-    vqrshrun.s16	$0, $3, #6		//(+32)>>6
-//	}
-.endm
-#else
-.macro	AVERAGE_TWO_8BITS arg0, arg1, arg2
-//	{	// input:dst_d, src_d A and B; working: q13
-    vaddl.u8	q13, \arg2, \arg1
-    vrshrn.u16		\arg0, q13, #1
-//	}
-.endm
-
-.macro	FILTER_6TAG_8BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-//	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
-    vaddl.u8	q12, \arg0, \arg5	//q12=src[-2]+src[3]
-    vaddl.u8	q13, \arg2, \arg3	//src[0]+src[1]
-    vmla.u16	q12, q13, \arg7	//q12 += 20*(src[0]+src[1]), 2 cycles
-    vaddl.u8	q13, \arg1, \arg4	//src[-1]+src[2]
-    vmls.s16	q12, q13, \arg8	//q12 -= 5*(src[-1]+src[2]), 2 cycles
-    vqrshrun.s16		\arg6, q12, #5
-//	}
-.endm
-
-.macro	FILTER_SINGLE_TAG_8BITS arg0, arg1,arg2, arg3, arg4,arg5		// when width=17/9, used
-//	{	// input: src_d{Y[0][1][2][3][4][5]X, the even of working_q2}
-    vrev64.8	\arg2, \arg0				// X[5][4][3][2][1][0]O
-    vaddl.u8	\arg3, \arg0, \arg2			// each 16bits, *[50][41][32][23][14][05]*
-    vmul.s16	\arg0, \arg2, \arg1			// 0+1*[50]-5*[41]+20[32]
-    vpadd.s16	\arg0, \arg0, \arg0
-    vpadd.s16	\arg0, \arg0, \arg0
-    vqrshrun.s16	\arg0, \arg4, #5
-//	}
-.endm
-
-.macro	FILTER_6TAG_8BITS_AVERAGE_WITH_0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-//	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
-    vaddl.u8	q12, \arg0, \arg5	//q12=src[-2]+src[3]
-    vaddl.u8	q13, \arg2, \arg3	//src[0]+src[1]
-    vmla.u16	q12, q13, \arg7	//q12 += 20*(src[0]+src[1]), 2 cycles
-    vaddl.u8	q13, \arg1, \arg4	//src[-1]+src[2]
-    vmls.s16	q12, q13, \arg8	//q12 -= 5*(src[-1]+src[2]), 2 cycles
-    vqrshrun.s16		\arg6, q12, #5
-    vaddl.u8	q13, \arg2, \arg6
-    vrshrn.u16		\arg6, q13, #1
-//	}
-.endm
-
-.macro	FILTER_6TAG_8BITS_AVERAGE_WITH_1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-//	{	// input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: q12, q13
-    vaddl.u8	q12, \arg0, \arg5	//q12=src[-2]+src[3]
-    vaddl.u8	q13, \arg2, \arg3	//src[0]+src[1]
-    vmla.u16	q12, q13, \arg7	//q12 += 20*(src[0]+src[1]), 2 cycles
-    vaddl.u8	q13, \arg1, \arg4	//src[-1]+src[2]
-    vmls.s16	q12, q13, \arg8	//q12 -= 5*(src[-1]+src[2]), 2 cycles
-    vqrshrun.s16		\arg6, q12, #5
-    vaddl.u8	q13, \arg3, \arg6
-    vrshrn.u16		\arg6, q13, #1
-//	}
-.endm
-
-.macro	FILTER_6TAG_8BITS_TO_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
-//	{	// input:d_src[-2], d_src[-1], d_src[0], d_src[1], d_src[2], d_src[3], dst_q, multiplier a/b; working:q13
-    vaddl.u8	\arg6, \arg0, \arg5		//dst_q=src[-2]+src[3]
-    vaddl.u8	q13, \arg2, \arg3	//src[0]+src[1]
-    vmla.u16	\arg6, q13, \arg7	//dst_q += 20*(src[0]+src[1]), 2 cycles
-    vaddl.u8	q13, \arg1, \arg4	//src[-1]+src[2]
-    vmls.s16	\arg6, q13, \arg8	//dst_q -= 5*(src[-1]+src[2]), 2 cycles
-//	}
-.endm
-
-.macro	FILTER_3_IN_16BITS_TO_8BITS arg0, arg1, arg2, arg3
-//	{	// input:a, b, c, dst_d;
-    vsub.s16	\arg0, \arg0, \arg1			//a-b
-    vshr.s16	\arg0, \arg0, #2			//(a-b)/4
-    vsub.s16	\arg0, \arg0, \arg1			//(a-b)/4-b
-    vadd.s16	\arg0, \arg0, \arg2			//(a-b)/4-b+c
-    vshr.s16	\arg0, \arg0, #2			//((a-b)/4-b+c)/4
-    vadd.s16	\arg0, \arg0, \arg2			//((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
-    vqrshrun.s16	\arg3, \arg0, #6		//(+32)>>6
-//	}
-.endm
-
-.macro	UNPACK_2_16BITS_TO_ABC arg0, arg1, arg2, arg3, arg4
-//	{	// input:q_src[-2:5], q_src[6:13](avail 8+5)/q_src[6:**](avail 4+5), dst_a, dst_b, dst_c;
-    vext.16	\arg4, \arg0, \arg1, #2		//src[0]
-    vext.16	\arg3, \arg0, \arg1, #3		//src[1]
-    vadd.s16	\arg4, \arg3					//c=src[0]+src[1]
-
-    vext.16	\arg3, \arg0, \arg1, #1		//src[-1]
-    vext.16	\arg2, \arg0, \arg1, #4		//src[2]
-    vadd.s16	\arg3,\arg2					//b=src[-1]+src[2]
-
-    vext.16	\arg2, \arg0, \arg1, #5		//src[3]
-    vadd.s16	\arg2, \arg0					//a=src[-2]+src[3]
-//	}
-.endm
-
-.macro	UNPACK_1_IN_8x16BITS_TO_8BITS arg0, arg1,arg2, arg3
-//	{	// each 16bits; input: d_dst, d_src[0:3] (even), d_src[4:5]+%% (odd)
-    vext.16	\arg3, \arg3, \arg3, #7	// 0x????, [0][1][2][3][4][5]
-    vrev64.16	\arg1, \arg1
-    vadd.u16	\arg2, \arg1				// C[2+3],B[1+4],A[0+5]
-    vshr.s64	\arg1, \arg2, #16
-    vshr.s64	\arg0, \arg2, #32		// Output: C \arg2, B \arg1, A \arg0
-
-    vsub.s16	\arg0, \arg0, \arg1			//a-b
-    vshr.s16	\arg0, \arg0, #2			//(a-b)/4
-    vsub.s16	\arg0, \arg0, \arg1			//(a-b)/4-b
-    vadd.s16	\arg0, \arg0, \arg2			//(a-b)/4-b+c
-    vshr.s16	\arg0, \arg0, #2			//((a-b)/4-b+c)/4
-    vadd.s16	\arg1, \arg0, \arg2			//((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
-    vqrshrun.s16	\arg0, \arg3, #6		//(+32)>>6
-//	}
-.endm
-#endif
-
-WELS_ASM_FUNC_BEGIN McHorVer20WidthEq16_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
-
-	sub			r0, #2
-	vmov.u16	q14, #0x0014				// 20
-	vshr.u16	q15, q14, #2				// 5
-
-w16_h_mc_luma_loop:
-	vld1.u8	{d0,d1,d2}, [r0], r1	//only use 21(16+5); q0=src[-2]
-	pld			[r0]
-	pld			[r0, #16]
-
-	vext.8		q2, q0, q1, #1		//q2=src[-1]
-	vext.8		q3, q0, q1, #2		//q3=src[0]
-	vext.8		q8, q0, q1, #3		//q8=src[1]
-	vext.8		q9, q0, q1, #4		//q9=src[2]
-	vext.8		q10, q0, q1, #5		//q10=src[3]
-
-	FILTER_6TAG_8BITS 	d0, d4, d6, d16, d18, d20, d2, q14, q15
-
-	FILTER_6TAG_8BITS 	d1, d5, d7, d17, d19, d21, d3, q14, q15
-
-	sub		r4, #1
-	vst1.u8	{d2, d3}, [r2], r3		//write 16Byte
-
-	cmp		r4, #0
-	bne		w16_h_mc_luma_loop
-	pop		{r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer20WidthEq8_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
-
-	sub			r0, #2
-	vmov.u16	q14, #0x0014				// 20
-	vshr.u16	q15, q14, #2				// 5
-
-w8_h_mc_luma_loop:
-	vld1.u8	{d0,d1}, [r0], r1	//only use 13(8+5); q0=src[-2]
-	pld			[r0]
-
-	vext.8		d2, d0, d1, #1		//d2=src[-1]
-	vext.8		d3, d0, d1, #2		//d3=src[0]
-	vext.8		d4, d0, d1, #3		//d4=src[1]
-	vext.8		d5, d0, d1, #4		//d5=src[2]
-	vext.8		d6, d0, d1, #5		//d6=src[3]
-
-	FILTER_6TAG_8BITS 	d0, d2, d3, d4, d5, d6, d1, q14, q15
-
-	sub		r4, #1
-	vst1.u8	{d1}, [r2], r3
-
-	cmp		r4, #0
-	bne		w8_h_mc_luma_loop
-	pop		{r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer20WidthEq4_neon
-	push		{r4, r5, r6}
-	ldr			r6, [sp, #12]
-
-	sub			r0, #2
-	vmov.u16	q14, #0x0014				// 20
-	vshr.u16	q15, q14, #2				// 5
-
-w4_h_mc_luma_loop:
-	vld1.u8	{d0, d1}, [r0], r1	//only use 9(4+5);d0: 1st row src[-2:5]
-	pld			[r0]
-	vld1.u8	{d2, d3}, [r0], r1	//d2: 2nd row src[-2:5]
-	pld			[r0]
-
-	vext.8		d4, d0, d1, #1		//d4: 1st row src[-1:6]
-	vext.8		d5, d2, d3, #1		//d5: 2nd row src[-1:6]
-	vext.8		q3, q2, q2, #1		//src[0:6 *]
-	vext.8		q8, q2, q2, #2		//src[1:6 * *]
-
-	vtrn.32	q3, q8					//q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4]
-	vtrn.32	d6, d7					//d6:[0:3]; d7[1:4]
-	vtrn.32		d0, d2				//d0:[-2:1]; d2[2:5]
-	vtrn.32		d4, d5				//d4:[-1:2]; d5[3:6]
-
-	FILTER_6TAG_8BITS 	d0, d4, d6, d7, d2, d5, d1, q14, q15
-
-	vmov		r4, r5, d1
-	str	r4, [r2], r3
-	str	r5, [r2], r3
-
-	sub		r6, #2
-	cmp		r6, #0
-	bne		w4_h_mc_luma_loop
-
-	pop		{r4, r5, r6}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer10WidthEq16_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
-
-	sub			r0, #2
-	vmov.u16	q14, #0x0014				// 20
-	vshr.u16	q15, q14, #2				// 5
-
-w16_xy_10_mc_luma_loop:
-	vld1.u8	{d0,d1,d2}, [r0], r1	//only use 21(16+5); q0=src[-2]
-	pld			[r0]
-	pld			[r0, #16]
-
-	vext.8		q2, q0, q1, #1		//q2=src[-1]
-	vext.8		q3, q0, q1, #2		//q3=src[0]
-	vext.8		q8, q0, q1, #3		//q8=src[1]
-	vext.8		q9, q0, q1, #4		//q9=src[2]
-	vext.8		q10, q0, q1, #5		//q10=src[3]
-
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d0, d4, d6, d16, d18, d20, d2, q14, q15
-
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d1, d5, d7, d17, d19, d21, d3, q14, q15
-
-	sub		r4, #1
-	vst1.u8	{d2, d3}, [r2], r3		//write 16Byte
-
-	cmp		r4, #0
-	bne		w16_xy_10_mc_luma_loop
-	pop		{r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer10WidthEq8_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
-
-	sub			r0, #2
-	vmov.u16	q14, #0x0014				// 20
-	vshr.u16	q15, q14, #2				// 5
-
-w8_xy_10_mc_luma_loop:
-	vld1.u8	{d0,d1}, [r0], r1	//only use 13(8+5); q0=src[-2]
-	pld			[r0]
-
-	vext.8		d2, d0, d1, #1		//d2=src[-1]
-	vext.8		d3, d0, d1, #2		//d3=src[0]
-	vext.8		d4, d0, d1, #3		//d4=src[1]
-	vext.8		d5, d0, d1, #4		//d5=src[2]
-	vext.8		d6, d0, d1, #5		//d6=src[3]
-
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d0, d2, d3, d4, d5, d6, d1, q14, q15
-
-	sub		r4, #1
-	vst1.u8	{d1}, [r2], r3
-
-	cmp		r4, #0
-	bne		w8_xy_10_mc_luma_loop
-	pop		{r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer10WidthEq4_neon
-	push		{r4, r5, r6}
-	ldr			r6, [sp, #12]
-
-	sub			r0, #2
-	vmov.u16	q14, #0x0014				// 20
-	vshr.u16	q15, q14, #2				// 5
-
-w4_xy_10_mc_luma_loop:
-	vld1.u8	{d0, d1}, [r0], r1	//only use 9(4+5);d0: 1st row src[-2:5]
-	pld			[r0]
-	vld1.u8	{d2, d3}, [r0], r1	//d2: 2nd row src[-2:5]
-	pld			[r0]
-
-	vext.8		d4, d0, d1, #1		//d4: 1st row src[-1:6]
-	vext.8		d5, d2, d3, #1		//d5: 2nd row src[-1:6]
-	vext.8		q3, q2, q2, #1		//src[0:6 *]
-	vext.8		q8, q2, q2, #2		//src[1:6 * *]
-
-	vtrn.32	q3, q8					//q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4]
-	vtrn.32	d6, d7					//d6:[0:3]; d7[1:4]
-	vtrn.32		d0, d2				//d0:[-2:1]; d2[2:5]
-	vtrn.32		d4, d5				//d4:[-1:2]; d5[3:6]
-
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d0, d4, d6, d7, d2, d5, d1, q14, q15
-
-	vmov		r4, r5, d1
-	str	r4, [r2], r3
-	str	r5, [r2], r3
-
-	sub		r6, #2
-	cmp		r6, #0
-	bne		w4_xy_10_mc_luma_loop
-
-	pop		{r4, r5, r6}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer30WidthEq16_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
-
-	sub			r0, #2
-	vmov.u16	q14, #0x0014				// 20
-	vshr.u16	q15, q14, #2				// 5
-
-w16_xy_30_mc_luma_loop:
-	vld1.u8	{d0,d1,d2}, [r0], r1	//only use 21(16+5); q0=src[-2]
-	pld			[r0]
-	pld			[r0, #16]
-
-	vext.8		q2, q0, q1, #1		//q2=src[-1]
-	vext.8		q3, q0, q1, #2		//q3=src[0]
-	vext.8		q8, q0, q1, #3		//q8=src[1]
-	vext.8		q9, q0, q1, #4		//q9=src[2]
-	vext.8		q10, q0, q1, #5		//q10=src[3]
-
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d0, d4, d6, d16, d18, d20, d2, q14, q15
-
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d1, d5, d7, d17, d19, d21, d3, q14, q15
-
-	sub		r4, #1
-	vst1.u8	{d2, d3}, [r2], r3		//write 16Byte
-
-	cmp		r4, #0
-	bne		w16_xy_30_mc_luma_loop
-	pop		{r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer30WidthEq8_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
-
-	sub			r0, #2
-	vmov.u16	q14, #0x0014				// 20
-	vshr.u16	q15, q14, #2				// 5
-
-w8_xy_30_mc_luma_loop:
-	vld1.u8	{d0,d1}, [r0], r1	//only use 13(8+5); q0=src[-2]
-	pld			[r0]
-
-	vext.8		d2, d0, d1, #1		//d2=src[-1]
-	vext.8		d3, d0, d1, #2		//d3=src[0]
-	vext.8		d4, d0, d1, #3		//d4=src[1]
-	vext.8		d5, d0, d1, #4		//d5=src[2]
-	vext.8		d6, d0, d1, #5		//d6=src[3]
-
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d0, d2, d3, d4, d5, d6, d1, q14, q15
-
-	sub		r4, #1
-	vst1.u8	{d1}, [r2], r3
-
-	cmp		r4, #0
-	bne		w8_xy_30_mc_luma_loop
-	pop		{r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer30WidthEq4_neon
-	push		{r4, r5, r6}
-	ldr			r6, [sp, #12]
-
-	sub			r0, #2
-	vmov.u16	q14, #0x0014				// 20
-	vshr.u16	q15, q14, #2				// 5
-
-w4_xy_30_mc_luma_loop:
-	vld1.u8	{d0, d1}, [r0], r1	//only use 9(4+5);d0: 1st row src[-2:5]
-	pld			[r0]
-	vld1.u8	{d2, d3}, [r0], r1	//d2: 2nd row src[-2:5]
-	pld			[r0]
-
-	vext.8		d4, d0, d1, #1		//d4: 1st row src[-1:6]
-	vext.8		d5, d2, d3, #1		//d5: 2nd row src[-1:6]
-	vext.8		q3, q2, q2, #1		//src[0:6 *]
-	vext.8		q8, q2, q2, #2		//src[1:6 * *]
-
-	vtrn.32	q3, q8					//q3::d6:1st row [0:3]+[1:4]; d7:2nd row [0:3]+[1:4]
-	vtrn.32	d6, d7					//d6:[0:3]; d7[1:4]
-	vtrn.32		d0, d2				//d0:[-2:1]; d2[2:5]
-	vtrn.32		d4, d5				//d4:[-1:2]; d5[3:6]
-
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d0, d4, d6, d7, d2, d5, d1, q14, q15
-
-	vmov		r4, r5, d1
-	str	r4, [r2], r3
-	str	r5, [r2], r3
-
-	sub		r6, #2
-	cmp		r6, #0
-	bne		w4_xy_30_mc_luma_loop
-
-	pop		{r4, r5, r6}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer01WidthEq16_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
-
-	sub			r0, r0, r1, lsl #1		//src[-2*src_stride]
-	pld			[r0]
-	pld			[r0, r1]
-	vmov.u16	q14, #0x0014			// 20
-	vld1.u8	{q0}, [r0], r1		//q0=src[-2]
-	vld1.u8	{q1}, [r0], r1		//q1=src[-1]
-
-	pld			[r0]
-	pld			[r0, r1]
-	vshr.u16	q15, q14, #2			// 5
-	vld1.u8	{q2}, [r0], r1		//q2=src[0]
-	vld1.u8	{q3}, [r0], r1		//q3=src[1]
-	vld1.u8	{q8}, [r0], r1		//q8=src[2]
-
-w16_xy_01_luma_loop:
-
-	vld1.u8	{q9}, [r0], r1		//q9=src[3]
-
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d0, d2, d4, d6, d16, d18, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d1, d3, d5, d7, d17, d19, d21, q14, q15
-	vld1.u8	{q0}, [r0], r1		//read 2nd row
-	vst1.u8	{q10}, [r2], r3			//write 1st 16Byte
-
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d2, d4, d6, d16, d18, d0, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d3, d5, d7, d17, d19, d1, d21, q14, q15
-	vld1.u8	{q1}, [r0], r1		//read 3rd row
-	vst1.u8	{q10}, [r2], r3			//write 2nd 16Byte
-
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d4, d6, d16, d18, d0, d2, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d5, d7, d17, d19, d1, d3, d21, q14, q15
-	vld1.u8	{q2}, [r0], r1		//read 4th row
-	vst1.u8	{q10}, [r2], r3			//write 3rd 16Byte
-
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d6, d16, d18, d0, d2, d4, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d7, d17, d19, d1, d3, d5, d21, q14, q15
-	vld1.u8	{q3}, [r0], r1		//read 5th row
-	vst1.u8	{q10}, [r2], r3			//write 4th 16Byte
-
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d16, d18, d0, d2, d4, d6, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d17, d19, d1, d3, d5, d7, d21, q14, q15
-	vld1.u8	{q8}, [r0], r1		//read 6th row
-	vst1.u8	{q10}, [r2], r3			//write 5th 16Byte
-
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d18, d0, d2, d4, d6, d16, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d19, d1, d3, d5, d7, d17, d21, q14, q15
-	vld1.u8	{q9}, [r0], r1		//read 7th row
-	vst1.u8	{q10}, [r2], r3			//write 6th 16Byte
-
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d0, d2, d4, d6, d16, d18, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d1, d3, d5, d7, d17, d19, d21, q14, q15
-	vld1.u8	{q0}, [r0], r1		//read 8th row
-	vst1.u8	{q10}, [r2], r3			//write 7th 16Byte
-
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d2, d4, d6, d16, d18, d0, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d3, d5, d7, d17, d19, d1, d21, q14, q15
-	vst1.u8	{q10}, [r2], r3			//write 8th 16Byte
-
-	//q2, q3, q4, q5, q0 --> q0~q4
-	vswp	q0, q8
-	vswp	q0, q2
-	vmov	q1, q3
-	vmov	q3, q9						//q0~q4
-
-	sub		r4, #8
-	cmp		r4, #0
-	bne		w16_xy_01_luma_loop
-	pop		{r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer01WidthEq8_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
-
-	sub			r0, r0, r1, lsl #1		//src[-2*src_stride]
-	pld			[r0]
-	pld			[r0, r1]
-	vmov.u16	q14, #0x0014			// 20
-	vld1.u8	{d0}, [r0], r1		//d0=src[-2]
-	vld1.u8	{d1}, [r0], r1		//d1=src[-1]
-
-	pld			[r0]
-	pld			[r0, r1]
-	vshr.u16	q15, q14, #2			// 5
-	vld1.u8	{d2}, [r0], r1		//d2=src[0]
-	vld1.u8	{d3}, [r0], r1		//d3=src[1]
-
-	vld1.u8	{d4}, [r0], r1		//d4=src[2]
-	vld1.u8	{d5}, [r0], r1		//d5=src[3]
-
-w8_xy_01_mc_luma_loop:
-
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d0, d1, d2, d3, d4, d5, d16, q14, q15
-	vld1.u8	{d0}, [r0], r1		//read 2nd row
-	vst1.u8	{d16}, [r2], r3		//write 1st 8Byte
-
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d1, d2, d3, d4, d5, d0, d16, q14, q15
-	vld1.u8	{d1}, [r0], r1		//read 3rd row
-	vst1.u8	{d16}, [r2], r3		//write 2nd 8Byte
-
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d2, d3, d4, d5, d0, d1, d16, q14, q15
-	vld1.u8	{d2}, [r0], r1		//read 4th row
-	vst1.u8	{d16}, [r2], r3		//write 3rd 8Byte
-
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d3, d4, d5, d0, d1, d2, d16, q14, q15
-	vld1.u8	{d3}, [r0], r1		//read 5th row
-	vst1.u8	{d16}, [r2], r3		//write 4th 8Byte
-
-	//d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
-	vswp	q0, q2
-	vswp	q1, q2
-
-	sub		r4, #4
-	cmp		r4, #0
-	bne		w8_xy_01_mc_luma_loop
-
-	pop		{r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer01WidthEq4_neon
-	push		{r4, r5, r6, r7}
-	sub			r0, r0, r1, lsl #1		//src[-2*src_stride]
-	pld			[r0]
-	pld			[r0, r1]
-	vmov.u16	q14, #0x0014			// 20
-	ldr		r4, [r0], r1		//r4=src[-2]
-	ldr		r5, [r0], r1		//r5=src[-1]
-
-	pld			[r0]
-	pld			[r0, r1]
-	vshr.u16	q15, q14, #2			// 5
-	ldr		r6, [r0], r1		//r6=src[0]
-	ldr		r7, [r0], r1		//r7=src[1]
-
-	vmov		d0, r4, r5
-	vmov		d1, r5, r6
-	vmov		d2, r6, r7
-
-	ldr		r4, [r0], r1		//r4=src[2]
-	vmov		d3, r7, r4
-	ldr			r7, [sp, #16]
-
-w4_xy_01_mc_luma_loop:
-
-//	pld			[r0]
-	//using reserving r4
-	ldr		r5, [r0], r1		//r5=src[3]
-	ldr		r6, [r0], r1		//r6=src[0]
-	vmov		d4, r4, r5
-	vmov		d5, r5, r6			//reserved r6
-
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d0, d1, d2, d3, d4, d5, d16, q14, q15
-	vmov		r4, r5, d16
-	str	r4, [r2], r3			//write 1st 4Byte
-	str	r5, [r2], r3			//write 2nd 4Byte
-
-	ldr		r5, [r0], r1		//r5=src[1]
-	ldr		r4, [r0], r1		//r4=src[2]
-	vmov		d0, r6, r5
-	vmov		d1, r5, r4			//reserved r4
-
-	FILTER_6TAG_8BITS_AVERAGE_WITH_0 	d2, d3, d4, d5, d0, d1, d16, q14, q15
-	vmov		r5, r6, d16
-	str	r5, [r2], r3			//write 3rd 4Byte
-	str	r6, [r2], r3			//write 4th 4Byte
-
-	//d4, d5, d0, d1 --> d0, d1, d2, d3
-	vmov	q1, q0
-	vmov	q0, q2
-
-	sub		r7, #4
-	cmp		r7, #0
-	bne		w4_xy_01_mc_luma_loop
-
-	pop		{r4, r5, r6, r7}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer03WidthEq16_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
-
-	sub			r0, r0, r1, lsl #1		//src[-2*src_stride]
-	pld			[r0]
-	pld			[r0, r1]
-	vmov.u16	q14, #0x0014			// 20
-	vld1.u8	{q0}, [r0], r1		//q0=src[-2]
-	vld1.u8	{q1}, [r0], r1		//q1=src[-1]
-
-	pld			[r0]
-	pld			[r0, r1]
-	vshr.u16	q15, q14, #2			// 5
-	vld1.u8	{q2}, [r0], r1		//q2=src[0]
-	vld1.u8	{q3}, [r0], r1		//q3=src[1]
-	vld1.u8	{q8}, [r0], r1		//q8=src[2]
-
-w16_xy_03_luma_loop:
-
-	vld1.u8	{q9}, [r0], r1		//q9=src[3]
-
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d0, d2, d4, d6, d16, d18, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d1, d3, d5, d7, d17, d19, d21, q14, q15
-	vld1.u8	{q0}, [r0], r1		//read 2nd row
-	vst1.u8	{q10}, [r2], r3			//write 1st 16Byte
-
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d2, d4, d6, d16, d18, d0, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d3, d5, d7, d17, d19, d1, d21, q14, q15
-	vld1.u8	{q1}, [r0], r1		//read 3rd row
-	vst1.u8	{q10}, [r2], r3			//write 2nd 16Byte
-
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d4, d6, d16, d18, d0, d2, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d5, d7, d17, d19, d1, d3, d21, q14, q15
-	vld1.u8	{q2}, [r0], r1		//read 4th row
-	vst1.u8	{q10}, [r2], r3			//write 3rd 16Byte
-
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d6, d16, d18, d0, d2, d4, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d7, d17, d19, d1, d3, d5, d21, q14, q15
-	vld1.u8	{q3}, [r0], r1		//read 5th row
-	vst1.u8	{q10}, [r2], r3			//write 4th 16Byte
-
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d16, d18, d0, d2, d4, d6, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d17, d19, d1, d3, d5, d7, d21, q14, q15
-	vld1.u8	{q8}, [r0], r1		//read 6th row
-	vst1.u8	{q10}, [r2], r3			//write 5th 16Byte
-
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d18, d0, d2, d4, d6, d16, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d19, d1, d3, d5, d7, d17, d21, q14, q15
-	vld1.u8	{q9}, [r0], r1		//read 7th row
-	vst1.u8	{q10}, [r2], r3			//write 6th 16Byte
-
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d0, d2, d4, d6, d16, d18, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d1, d3, d5, d7, d17, d19, d21, q14, q15
-	vld1.u8	{q0}, [r0], r1		//read 8th row
-	vst1.u8	{q10}, [r2], r3			//write 7th 16Byte
-
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d2, d4, d6, d16, d18, d0, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d3, d5, d7, d17, d19, d1, d21, q14, q15
-	vst1.u8	{q10}, [r2], r3			//write 8th 16Byte
-
-	//q2, q3, q8, q9, q0 --> q0~q8
-	vswp	q0, q8
-	vswp	q0, q2
-	vmov	q1, q3
-	vmov	q3, q9						//q0~q8
-
-	sub		r4, #8
-	cmp		r4, #0
-	bne		w16_xy_03_luma_loop
-	pop		{r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer03WidthEq8_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
-
-	sub			r0, r0, r1, lsl #1		//src[-2*src_stride]
-	pld			[r0]
-	pld			[r0, r1]
-	vmov.u16	q14, #0x0014			// 20
-	vld1.u8	{d0}, [r0], r1		//d0=src[-2]
-	vld1.u8	{d1}, [r0], r1		//d1=src[-1]
-
-	pld			[r0]
-	pld			[r0, r1]
-	vshr.u16	q15, q14, #2			// 5
-	vld1.u8	{d2}, [r0], r1		//d2=src[0]
-	vld1.u8	{d3}, [r0], r1		//d3=src[1]
-
-	vld1.u8	{d4}, [r0], r1		//d4=src[2]
-	vld1.u8	{d5}, [r0], r1		//d5=src[3]
-
-w8_xy_03_mc_luma_loop:
-
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d0, d1, d2, d3, d4, d5, d16, q14, q15
-	vld1.u8	{d0}, [r0], r1		//read 2nd row
-	vst1.u8	{d16}, [r2], r3		//write 1st 8Byte
-
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d1, d2, d3, d4, d5, d0, d16, q14, q15
-	vld1.u8	{d1}, [r0], r1		//read 3rd row
-	vst1.u8	{d16}, [r2], r3		//write 2nd 8Byte
-
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d2, d3, d4, d5, d0, d1, d16, q14, q15
-	vld1.u8	{d2}, [r0], r1		//read 4th row
-	vst1.u8	{d16}, [r2], r3		//write 3rd 8Byte
-
-	pld			[r0]
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d3, d4, d5, d0, d1, d2, d16, q14, q15
-	vld1.u8	{d3}, [r0], r1		//read 5th row
-	vst1.u8	{d16}, [r2], r3		//write 4th 8Byte
-
-	//d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
-	vswp	q0, q2
-	vswp	q1, q2
-
-	sub		r4, #4
-	cmp		r4, #0
-	bne		w8_xy_03_mc_luma_loop
-
-	pop		{r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer03WidthEq4_neon
-	push		{r4, r5, r6, r7}
-	sub			r0, r0, r1, lsl #1		//src[-2*src_stride]
-	pld			[r0]
-	pld			[r0, r1]
-	vmov.u16	q14, #0x0014			// 20
-	ldr		r4, [r0], r1		//r4=src[-2]
-	ldr		r5, [r0], r1		//r5=src[-1]
-
-	pld			[r0]
-	pld			[r0, r1]
-	vshr.u16	q15, q14, #2			// 5
-	ldr		r6, [r0], r1		//r6=src[0]
-	ldr		r7, [r0], r1		//r7=src[1]
-
-	vmov		d0, r4, r5
-	vmov		d1, r5, r6
-	vmov		d2, r6, r7
-
-	ldr		r4, [r0], r1		//r4=src[2]
-	vmov		d3, r7, r4
-	ldr			r7, [sp, #16]
-
-w4_xy_03_mc_luma_loop:
-
-//	pld			[r0]
-	//using reserving r4
-	ldr		r5, [r0], r1		//r5=src[3]
-	ldr		r6, [r0], r1		//r6=src[0]
-	vmov		d4, r4, r5
-	vmov		d5, r5, r6			//reserved r6
-
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d0, d1, d2, d3, d4, d5, d16, q14, q15
-	vmov		r4, r5, d16
-	str	r4, [r2], r3			//write 1st 4Byte
-	str	r5, [r2], r3			//write 2nd 4Byte
-
-	ldr		r5, [r0], r1		//r5=src[1]
-	ldr		r4, [r0], r1		//r4=src[2]
-	vmov		d0, r6, r5
-	vmov		d1, r5, r4			//reserved r4
-
-	FILTER_6TAG_8BITS_AVERAGE_WITH_1 	d2, d3, d4, d5, d0, d1, d16, q14, q15
-	vmov		r5, r6, d16
-	str	r5, [r2], r3			//write 3rd 4Byte
-	str	r6, [r2], r3			//write 4th 4Byte
-
-	//d4, d5, d0, d1 --> d0, d1, d2, d3
-	vmov	q1, q0
-	vmov	q0, q2
-
-	sub		r7, #4
-	cmp		r7, #0
-	bne		w4_xy_03_mc_luma_loop
-
-	pop		{r4, r5, r6, r7}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer02WidthEq16_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
-
-	sub			r0, r0, r1, lsl #1		//src[-2*src_stride]
-	pld			[r0]
-	pld			[r0, r1]
-	vmov.u16	q14, #0x0014			// 20
-	vld1.u8	{q0}, [r0], r1		//q0=src[-2]
-	vld1.u8	{q1}, [r0], r1		//q1=src[-1]
-
-	pld			[r0]
-	pld			[r0, r1]
-	vshr.u16	q15, q14, #2			// 5
-	vld1.u8	{q2}, [r0], r1		//q2=src[0]
-	vld1.u8	{q3}, [r0], r1		//q3=src[1]
-	vld1.u8	{q8}, [r0], r1		//q8=src[2]
-
-w16_v_mc_luma_loop:
-
-	vld1.u8	{q9}, [r0], r1		//q9=src[3]
-
-	FILTER_6TAG_8BITS 	d0, d2, d4, d6, d16, d18, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d1, d3, d5, d7, d17, d19, d21, q14, q15
-	vld1.u8	{q0}, [r0], r1		//read 2nd row
-	vst1.u8	{q10}, [r2], r3			//write 1st 16Byte
-
-	FILTER_6TAG_8BITS 	d2, d4, d6, d16, d18, d0, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d3, d5, d7, d17, d19, d1, d21, q14, q15
-	vld1.u8	{q1}, [r0], r1		//read 3rd row
-	vst1.u8	{q10}, [r2], r3			//write 2nd 16Byte
-
-	FILTER_6TAG_8BITS 	d4, d6, d16, d18, d0, d2, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d5, d7, d17, d19, d1, d3, d21, q14, q15
-	vld1.u8	{q2}, [r0], r1		//read 4th row
-	vst1.u8	{q10}, [r2], r3			//write 3rd 16Byte
-
-	FILTER_6TAG_8BITS 	d6, d16, d18, d0, d2, d4, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d7, d17, d19, d1, d3, d5, d21, q14, q15
-	vld1.u8	{q3}, [r0], r1		//read 5th row
-	vst1.u8	{q10}, [r2], r3			//write 4th 16Byte
-
-	FILTER_6TAG_8BITS 	d16, d18, d0, d2, d4, d6, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d17, d19, d1, d3, d5, d7, d21, q14, q15
-	vld1.u8	{q8}, [r0], r1		//read 6th row
-	vst1.u8	{q10}, [r2], r3			//write 5th 16Byte
-
-	FILTER_6TAG_8BITS 	d18, d0, d2, d4, d6, d16, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d19, d1, d3, d5, d7, d17, d21, q14, q15
-	vld1.u8	{q9}, [r0], r1		//read 7th row
-	vst1.u8	{q10}, [r2], r3			//write 6th 16Byte
-
-	FILTER_6TAG_8BITS 	d0, d2, d4, d6, d16, d18, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d1, d3, d5, d7, d17, d19, d21, q14, q15
-	vld1.u8	{q0}, [r0], r1		//read 8th row
-	vst1.u8	{q10}, [r2], r3			//write 7th 16Byte
-
-	FILTER_6TAG_8BITS 	d2, d4, d6, d16, d18, d0, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d3, d5, d7, d17, d19, d1, d21, q14, q15
-	vst1.u8	{q10}, [r2], r3			//write 8th 16Byte
-
-	//q2, q3, q8, q9, q0 --> q0~q8
-	vswp	q0, q8
-	vswp	q0, q2
-	vmov	q1, q3
-	vmov	q3, q9						//q0~q8
-
-	sub		r4, #8
-	cmp		r4, #0
-	bne		w16_v_mc_luma_loop
-	pop		{r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer02WidthEq8_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
-
-	sub			r0, r0, r1, lsl #1		//src[-2*src_stride]
-	pld			[r0]
-	pld			[r0, r1]
-	vmov.u16	q14, #0x0014			// 20
-	vld1.u8	{d0}, [r0], r1		//d0=src[-2]
-	vld1.u8	{d1}, [r0], r1		//d1=src[-1]
-
-	pld			[r0]
-	pld			[r0, r1]
-	vshr.u16	q15, q14, #2			// 5
-	vld1.u8	{d2}, [r0], r1		//d2=src[0]
-	vld1.u8	{d3}, [r0], r1		//d3=src[1]
-
-	vld1.u8	{d4}, [r0], r1		//d4=src[2]
-	vld1.u8	{d5}, [r0], r1		//d5=src[3]
-
-w8_v_mc_luma_loop:
-
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d0, d1, d2, d3, d4, d5, d16, q14, q15
-	vld1.u8	{d0}, [r0], r1		//read 2nd row
-	vst1.u8	{d16}, [r2], r3		//write 1st 8Byte
-
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d1, d2, d3, d4, d5, d0, d16, q14, q15
-	vld1.u8	{d1}, [r0], r1		//read 3rd row
-	vst1.u8	{d16}, [r2], r3		//write 2nd 8Byte
-
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d2, d3, d4, d5, d0, d1, d16, q14, q15
-	vld1.u8	{d2}, [r0], r1		//read 4th row
-	vst1.u8	{d16}, [r2], r3		//write 3rd 8Byte
-
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d3, d4, d5, d0, d1, d2, d16, q14, q15
-	vld1.u8	{d3}, [r0], r1		//read 5th row
-	vst1.u8	{d16}, [r2], r3		//write 4th 8Byte
-
-	//d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
-	vswp	q0, q2
-	vswp	q1, q2
-
-	sub		r4, #4
-	cmp		r4, #0
-	bne		w8_v_mc_luma_loop
-
-	pop		{r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer02WidthEq4_neon
-	push		{r4, r5, r6, r7}
-	sub			r0, r0, r1, lsl #1		//src[-2*src_stride]
-	pld			[r0]
-	pld			[r0, r1]
-	vmov.u16	q14, #0x0014			// 20
-	ldr		r4, [r0], r1		//r4=src[-2]
-	ldr		r5, [r0], r1		//r5=src[-1]
-
-	pld			[r0]
-	pld			[r0, r1]
-	vshr.u16	q15, q14, #2			// 5
-	ldr		r6, [r0], r1		//r6=src[0]
-	ldr		r7, [r0], r1		//r7=src[1]
-
-	vmov		d0, r4, r5
-	vmov		d1, r5, r6
-	vmov		d2, r6, r7
-
-	ldr		r4, [r0], r1		//r4=src[2]
-	vmov		d3, r7, r4
-	ldr			r7, [sp, #16]
-
-w4_v_mc_luma_loop:
-
-//	pld			[r0]
-	//using reserving r4
-	ldr		r5, [r0], r1		//r5=src[3]
-	ldr		r6, [r0], r1		//r6=src[0]
-	vmov		d4, r4, r5
-	vmov		d5, r5, r6			//reserved r6
-
-	FILTER_6TAG_8BITS 	d0, d1, d2, d3, d4, d5, d16, q14, q15
-	vmov		r4, r5, d16
-	str	r4, [r2], r3			//write 1st 4Byte
-	str	r5, [r2], r3			//write 2nd 4Byte
-
-	ldr		r5, [r0], r1		//r5=src[1]
-	ldr		r4, [r0], r1		//r4=src[2]
-	vmov		d0, r6, r5
-	vmov		d1, r5, r4			//reserved r4
-
-	FILTER_6TAG_8BITS 	d2, d3, d4, d5, d0, d1, d16, q14, q15
-	vmov		r5, r6, d16
-	str	r5, [r2], r3			//write 3rd 4Byte
-	str	r6, [r2], r3			//write 4th 4Byte
-
-	//d4, d5, d0, d1 --> d0, d1, d2, d3
-	vmov	q1, q0
-	vmov	q0, q2
-
-	sub		r7, #4
-	cmp		r7, #0
-	bne		w4_v_mc_luma_loop
-
-	pop		{r4, r5, r6, r7}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer22WidthEq16_neon
-	push		{r4}
-	vpush		{q4-q7}
-	ldr			r4, [sp, #68]
-
-	sub			r0, #2					//src[-2]
-	sub			r0, r0, r1, lsl #1		//src[-2*src_stride-2]
-	pld			[r0]
-	pld			[r0, r1]
-
-	vmov.u16	q14, #0x0014			// 20
-	vld1.u8	{d0-d2}, [r0], r1		//use 21(16+5), =src[-2]
-	vld1.u8	{d3-d5}, [r0], r1		//use 21(16+5), =src[-1]
-
-	pld			[r0]
-	pld			[r0, r1]
-	vshr.u16	q15, q14, #2			// 5
-
-	vld1.u8	{d6-d8}, [r0], r1		//use 21(16+5), =src[0]
-	vld1.u8	{d9-d11}, [r0], r1	//use 21(16+5), =src[1]
-	pld			[r0]
-	pld			[r0, r1]
-	vld1.u8	{d12-d14}, [r0], r1	//use 21(16+5), =src[2]
-
-w16_hv_mc_luma_loop:
-
-	vld1.u8	{d15-d17}, [r0], r1	//use 21(16+5), =src[3]
-	//the 1st row
-	pld			[r0]
-	// vertical filtered into q9/q10
-	FILTER_6TAG_8BITS_TO_16BITS 	d0, d3, d6, d9, d12, d15, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d1, d4, d7,d10, d13, d16,q10, q14, q15	// 8 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0	//output to q0[0]
-
-	// vertical filtered into q10/q11
-	FILTER_6TAG_8BITS_TO_16BITS 	d2, d5, d8,d11, d14, d17,q11, q14, q15	// only 5 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q10, q11, q9, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1	//output to q0[1]
-	vst1.u8	{q0}, [r2], r3		//write 16Byte
-
-
-	vld1.u8	{d0-d2}, [r0], r1		//read 2nd row
-	//the 2nd row
-	pld			[r0]
-	// vertical filtered into q9/q10
-	FILTER_6TAG_8BITS_TO_16BITS 	d3, d6, d9, d12, d15, d0, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d4, d7,d10, d13, d16, d1,q10, q14, q15	// 8 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d3	//output to d3
-
-	// vertical filtered into q10/q11
-	FILTER_6TAG_8BITS_TO_16BITS 	d5, d8,d11, d14, d17, d2,q11, q14, q15	// only 5 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q10, q11, q9, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d4	//output to d4
-
-	vst1.u8	{d3, d4}, [r2], r3		//write 16Byte
-
-	vld1.u8	{d3-d5}, [r0], r1		//read 3rd row
-	//the 3rd row
-	pld			[r0]
-	// vertical filtered into q9/q10
-	FILTER_6TAG_8BITS_TO_16BITS 	d6, d9, d12, d15, d0, d3, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d7,d10, d13, d16, d1, d4,q10, q14, q15	// 8 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d6	//output to d6
-
-	// vertical filtered into q10/q11
-	FILTER_6TAG_8BITS_TO_16BITS 	d8,d11, d14, d17, d2, d5,q11, q14, q15	// only 5 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q10, q11, q9, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d7	//output to d7
-	vst1.u8	{d6, d7}, [r2], r3		//write 16Byte
-
-	vld1.u8	{d6-d8}, [r0], r1		//read 4th row
-	//the 4th row
-	pld			[r0]
-	// vertical filtered into q9/q10
-	FILTER_6TAG_8BITS_TO_16BITS 	 d9, d12, d15, d0, d3, d6, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS		d10, d13, d16, d1, d4, d7,q10, q14, q15	// 8 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d9	//output to d9
-	// vertical filtered into q10/q11
-	FILTER_6TAG_8BITS_TO_16BITS 	d11, d14, d17, d2, d5, d8,q11, q14, q15	// only 5 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q10, q11, q9, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d10	//output to d10
-	vst1.u8	{d9, d10}, [r2], r3		//write 16Byte
-
-	//d12~d17(q6~q8), d0~d8(q0~q3+d8), --> d0~d14
-	vswp	q0, q6
-	vswp	q6, q3
-	vmov	q5, q2
-	vmov	q2, q8
-
-	vmov	d20,d8
-	vmov	q4, q1
-	vmov	q1, q7
-	vmov	d14,d20
-
-	sub		r4, #4
-	cmp		r4, #0
-	bne		w16_hv_mc_luma_loop
-	vpop		{q4-q7}
-	pop		{r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer22WidthEq8_neon
-	push		{r4}
-	vpush		{q4}
-	ldr			r4, [sp, #20]
-
-	sub			r0, #2				//src[-2]
-	sub			r0, r0, r1, lsl #1	//src[-2*src_stride-2]
-	pld			[r0]
-	pld			[r0, r1]
-
-	vmov.u16	q14, #0x0014		// 20
-	vld1.u8	{q0}, [r0], r1	//use 13(8+5), =src[-2]
-	vld1.u8	{q1}, [r0], r1	//use 13(8+5), =src[-1]
-
-	pld			[r0]
-	pld			[r0, r1]
-	vshr.u16	q15, q14, #2		// 5
-
-	vld1.u8	{q2}, [r0], r1	//use 13(8+5), =src[0]
-	vld1.u8	{q3}, [r0], r1	//use 13(8+5), =src[1]
-	pld			[r0]
-	pld			[r0, r1]
-	vld1.u8	{q4}, [r0], r1	//use 13(8+5), =src[2]
-
-w8_hv_mc_luma_loop:
-
-	vld1.u8	{q8}, [r0], r1	//use 13(8+5), =src[3]
-	//the 1st row
-	pld			[r0]
-	// vertical filtered into q9/q10
-	FILTER_6TAG_8BITS_TO_16BITS 	d0, d2, d4, d6, d8, d16, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d1, d3, d5, d7, d9, d17, q10, q14, q15	// 5 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18	//output to q9[0]
-	vst1.u8	d18, [r2], r3			//write 8Byte
-
-	vld1.u8	{q0}, [r0], r1		//read 2nd row
-	//the 2nd row
-	pld			[r0]
-	// vertical filtered into q9/q10
-	FILTER_6TAG_8BITS_TO_16BITS 	d2, d4, d6, d8, d16, d0, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d3, d5, d7, d9, d17, d1, q10, q14, q15	// 5 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18	//output to q9[0]
-	vst1.u8	d18, [r2], r3		//write 8Byte
-
-	vld1.u8	{q1}, [r0], r1		//read 3rd row
-	//the 3rd row
-	pld			[r0]
-	// vertical filtered into q9/q10
-	FILTER_6TAG_8BITS_TO_16BITS 	d4, d6, d8, d16, d0, d2, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d5, d7, d9, d17, d1, d3, q10, q14, q15	// 5 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18	//output to q9[0]
-	vst1.u8	d18, [r2], r3			//write 8Byte
-
-	vld1.u8	{q2}, [r0], r1		//read 4th row
-	//the 4th row
-	pld			[r0]
-	// vertical filtered into q9/q10
-	FILTER_6TAG_8BITS_TO_16BITS 	d6, d8, d16, d0, d2, d4, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d7, d9, d17, d1, d3, d5, q10, q14, q15	// 5 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18	//output to q9[0]
-	vst1.u8	d18, [r2], r3			//write 8Byte
-
-	//q4~q5, q0~q2, --> q0~q4
-	vswp	q0, q4
-	vswp	q2, q4
-	vmov	q3, q1
-	vmov	q1, q8
-
-	sub		r4, #4
-	cmp		r4, #0
-	bne		w8_hv_mc_luma_loop
-	vpop		{q4}
-	pop		{r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer22WidthEq4_neon
-	push		{r4 ,r5, r6}
-	vpush		{q4-q7}
-	ldr			r6, [sp, #76]
-
-	sub			r0, #2				//src[-2]
-	sub			r0, r0, r1, lsl #1	//src[-2*src_stride-2]
-	pld			[r0]
-	pld			[r0, r1]
-
-	vmov.u16	q14, #0x0014		// 20
-	vld1.u8	{q0}, [r0], r1	//use 9(4+5), =src[-2]
-	vld1.u8	{q1}, [r0], r1	//use 9(4+5), =src[-1]
-
-	pld			[r0]
-	pld			[r0, r1]
-	vshr.u16	q15, q14, #2		// 5
-
-	vld1.u8	{q2}, [r0], r1	//use 9(4+5), =src[0]
-	vld1.u8	{q3}, [r0], r1	//use 9(4+5), =src[1]
-	pld			[r0]
-	pld			[r0, r1]
-	vld1.u8	{q4}, [r0], r1	//use 9(4+5), =src[2]
-
-w4_hv_mc_luma_loop:
-
-	vld1.u8	{q5}, [r0], r1	//use 9(4+5), =src[3]
-	vld1.u8	{q6}, [r0], r1	//use 9(4+5), =src[4]
-
-	//the 1st&2nd row
-	pld			[r0]
-	pld			[r0, r1]
-	// vertical filtered
-	FILTER_6TAG_8BITS_TO_16BITS 	d0, d2, d4, d6, d8, d10, q7, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d1, d3, d5, d7, d9, d11, q8, q14, q15	// 1 avail
-
-	FILTER_6TAG_8BITS_TO_16BITS 	d2, d4, d6, d8,d10, d12, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d3, d5, d7, d9,d11, d13,q10, q14, q15	// 1 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q7, q8, q11, q12, q13	//4 avail
-	UNPACK_2_16BITS_TO_ABC	q9,q10, q0, q7, q8		//4 avail
-
-	vmov	d23, d0
-	vmov	d25, d14
-	vmov	d27, d16
-
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d22	//output to q11[0]
-	vmov		r4 ,r5, d22
-	str		r4, [r2], r3				//write 4Byte
-	str		r5, [r2], r3				//write 4Byte
-
-	//the 3rd&4th row
-	vld1.u8	{q0}, [r0], r1	//use 9(4+5), =src[3]
-	vld1.u8	{q1}, [r0], r1	//use 9(4+5), =src[4]
-	pld			[r0]
-	pld			[r0, r1]
-	// vertical filtered
-	FILTER_6TAG_8BITS_TO_16BITS 	d4, d6, d8, d10, d12, d0, q7, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d5, d7, d9, d11, d13, d1, q8, q14, q15	// 1 avail
-
-	FILTER_6TAG_8BITS_TO_16BITS 	d6, d8,d10, d12, d0, d2, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d7, d9,d11, d13, d1, d3,q10, q14, q15	// 1 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q7, q8, q11, q12, q13	//4 avail
-	UNPACK_2_16BITS_TO_ABC	q9,q10, q2, q7, q8		//4 avail
-
-	vmov	d23, d4
-	vmov	d25, d14
-	vmov	d27, d16
-
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d22	//output to q11[0]
-	vmov		r4 ,r5, d22
-	str		r4, [r2], r3				//write 4Byte
-	str		r5, [r2], r3				//write 4Byte
-
-	//q4~q6, q0~q1, --> q0~q4
-	vswp	q4, q0
-	vmov	q3, q4
-	vmov	q4, q1
-	vmov	q1, q5
-	vmov	q2, q6
-
-	sub		r6, #4
-	cmp		r6, #0
-	bne		w4_hv_mc_luma_loop
-
-	vpop		{q4-q7}
-	pop		{r4, r5, r6}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McCopyWidthEq16_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
-w16_copy_loop:
-	vld1.u8		{q0}, [r0], r1
-	sub			r4, #2
-	vld1.u8		{q1}, [r0], r1
-	vst1.u8		{q0}, [r2], r3
-	cmp			r4, #0
-	vst1.u8		{q1}, [r2], r3
-	bne			w16_copy_loop
-
-	pop		{r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McCopyWidthEq8_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
-w8_copy_loop:
-	vld1.u8		{d0}, [r0], r1
-	vld1.u8		{d1}, [r0], r1
-	vst1.u8		{d0}, [r2], r3
-	vst1.u8		{d1}, [r2], r3
-	sub			r4, #2
-	cmp			r4, #0
-	bne			w8_copy_loop
-
-	pop		{r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McCopyWidthEq4_neon
-	push		{r4, r5, r6}
-	ldr			r4, [sp, #12]
-w4_copy_loop:
-	ldr		r5, [r0], r1
-	ldr		r6, [r0], r1
-	str		r5, [r2], r3
-	str		r6, [r2], r3
-
-	sub			r4, #2
-	cmp			r4, #0
-	bne			w4_copy_loop
-
-	pop		{r4, r5, r6}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN PixelAvgWidthEq16_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
-w16_pix_avg_loop:
-	vld1.u8		{q0}, [r2]!
-	vld1.u8		{q1}, [r3]!
-	vld1.u8		{q2}, [r2]!
-	vld1.u8		{q3}, [r3]!
-
-	vld1.u8		{q8}, [r2]!
-	vld1.u8		{q9}, [r3]!
-	vld1.u8		{q10}, [r2]!
-	vld1.u8		{q11}, [r3]!
-
-	AVERAGE_TWO_8BITS		d0, d0, d2
-	AVERAGE_TWO_8BITS		d1, d1, d3
-	vst1.u8		{q0}, [r0], r1
-
-	AVERAGE_TWO_8BITS		d4, d4, d6
-	AVERAGE_TWO_8BITS		d5, d5, d7
-	vst1.u8		{q2}, [r0], r1
-
-	AVERAGE_TWO_8BITS		d16, d16, d18
-	AVERAGE_TWO_8BITS		d17, d17, d19
-	vst1.u8		{q8}, [r0], r1
-
-	AVERAGE_TWO_8BITS		d20, d20, d22
-	AVERAGE_TWO_8BITS		d21, d21, d23
-	vst1.u8		{q10}, [r0], r1
-
-	sub			r4, #4
-	cmp			r4, #0
-	bne			w16_pix_avg_loop
-
-	pop		{r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN PixelAvgWidthEq8_neon
-	push		{r4, r5}
-	ldr			r4, [sp, #8]
-	mov			r5, #16
-w8_pix_avg_loop:
-
-	vld1.u8		{d0}, [r2], r5
-	vld1.u8		{d2}, [r3], r5
-	vld1.u8		{d1}, [r2], r5
-	vld1.u8		{d3}, [r3], r5
-
-	AVERAGE_TWO_8BITS		d0, d0, d2
-	AVERAGE_TWO_8BITS		d1, d1, d3
-	vst1.u8		{d0}, [r0], r1
-	vst1.u8		{d1}, [r0], r1
-
-	vld1.u8		{d4}, [r2], r5
-	vld1.u8		{d6}, [r3], r5
-	vld1.u8		{d5}, [r2], r5
-	vld1.u8		{d7}, [r3], r5
-
-	AVERAGE_TWO_8BITS		d4, d4, d6
-	AVERAGE_TWO_8BITS		d5, d5, d7
-	vst1.u8		{d4}, [r0], r1
-	vst1.u8		{d5}, [r0], r1
-
-	sub			r4, #4
-	cmp			r4, #0
-	bne			w8_pix_avg_loop
-
-	pop		{r4, r5}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN PixelAvgWidthEq4_neon
-	push		{r4-r8}
-	ldr			r4, [sp, #20]
-w4_pix_avg_loop:
-
-	ldr		r5, [r2]
-	ldr		r6, [r2, #16]
-	ldr		r7, [r3]
-	ldr		r8, [r3, #16]
-	add		r2, #32
-	add		r3, #32
-
-	vmov		d0, r5, r6
-	vmov		d1, r7, r8
-	AVERAGE_TWO_8BITS		d0, d0, d1
-	vmov		r5, r6, d0
-
-	str		r5, [r0], r1
-	str		r6, [r0], r1
-
-	sub			r4, #2
-	cmp			r4, #0
-	bne			w4_pix_avg_loop
-
-	pop		{r4-r8}
-WELS_ASM_FUNC_END
-
-WELS_ASM_FUNC_BEGIN McChromaWidthEq8_neon
-	push		{r4, r5}
-	ldr			r4, [sp, #8]
-	ldr			r5, [sp, #12]
-//	normal case: {cA*src[x]  + cB*src[x+1]} + {cC*src[x+stride] + cD*srcp[x+stride+1]}
-//	we can opti it by adding vert only/ hori only cases, to be continue
-	vld1.u8	{d31}, [r4]		//load A/B/C/D
-	vld1.u8		{q0}, [r0], r1	//src[x]
-
-	vdup.u8	d28, d31[0]			//A
-	vdup.u8	d29, d31[1]			//B
-	vdup.u8	d30, d31[2]			//C
-	vdup.u8	d31, d31[3]			//D
-
-	vext.u8		d1, d0, d1, #1		//src[x+1]
-
-w8_mc_chroma_loop:	// each two pxl row
-	vld1.u8		{q1}, [r0], r1	//src[x+stride]
-	vld1.u8		{q2}, [r0], r1	//src[x+2*stride]
-	vext.u8		d3, d2, d3, #1		//src[x+stride+1]
-	vext.u8		d5, d4, d5, #1		//src[x+2*stride+1]
-
-	vmull.u8		q3, d0, d28			//(src[x] * A)
-	vmlal.u8		q3, d1, d29			//+=(src[x+1] * B)
-	vmlal.u8		q3, d2, d30			//+=(src[x+stride] * C)
-	vmlal.u8		q3, d3, d31			//+=(src[x+stride+1] * D)
-	vrshrn.u16		d6, q3, #6
-	vst1.u8	d6, [r2], r3
-
-	vmull.u8		q3, d2, d28			//(src[x] * A)
-	vmlal.u8		q3, d3, d29			//+=(src[x+1] * B)
-	vmlal.u8		q3, d4, d30			//+=(src[x+stride] * C)
-	vmlal.u8		q3, d5, d31			//+=(src[x+stride+1] * D)
-	vrshrn.u16		d6, q3, #6
-	vst1.u8	d6, [r2], r3
-
-	vmov		q0, q2
-	sub			r5, #2
-	cmp			r5, #0
-	bne			w8_mc_chroma_loop
-
-	pop		{r4, r5}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McChromaWidthEq4_neon
-
-	push		{r4, r5, r6}
-	ldr			r4, [sp, #12]
-	ldr			r6, [sp, #16]
-//	normal case: {cA*src[x]  + cB*src[x+1]} + {cC*src[x+stride] + cD*srcp[x+stride+1]}
-//	we can opti it by adding vert only/ hori only cases, to be continue
-	vld1.u8	{d31}, [r4]		//load A/B/C/D
-
-	vdup.u8	d28, d31[0]			//A
-	vdup.u8	d29, d31[1]			//B
-	vdup.u8	d30, d31[2]			//C
-	vdup.u8	d31, d31[3]			//D
-
-w4_mc_chroma_loop:	// each two pxl row
-	vld1.u8		{d0}, [r0], r1	//a::src[x]
-	vld1.u8		{d2}, [r0], r1	//b::src[x+stride]
-	vld1.u8		{d4}, [r0]			//c::src[x+2*stride]
-
-	vshr.u64		d1, d0, #8
-	vshr.u64		d3, d2, #8
-	vshr.u64		d5, d4, #8
-
-	vmov			q3, q1				//b::[0:7]+b::[1~8]
-	vtrn.32		q0, q1				//d0{a::[0:3]+b::[0:3]}; d1{a::[1:4]+b::[1:4]}
-	vtrn.32		q3, q2				//d6{b::[0:3]+c::[0:3]}; d7{b::[1:4]+c::[1:4]}
-
-	vmull.u8		q1, d0, d28			//(src[x] * A)
-	vmlal.u8		q1, d1, d29			//+=(src[x+1] * B)
-	vmlal.u8		q1, d6, d30			//+=(src[x+stride] * C)
-	vmlal.u8		q1, d7, d31			//+=(src[x+stride+1] * D)
-
-	vrshrn.u16		d2, q1, #6
-	vmov		r4, r5, d2
-	str	r4, [r2], r3
-	str	r5, [r2], r3
-
-	sub			r6, #2
-	cmp			r6, #0
-	bne			w4_mc_chroma_loop
-
-	pop		{r4, r5, r6}
-WELS_ASM_FUNC_END
-
-WELS_ASM_FUNC_BEGIN McHorVer20Width17_neon
-	push		{r4-r5}
-	mov			r4, #20
-	mov			r5, #1
-	sub			r4, r4, r4, lsl #(16-2)
-	lsl			r5, #16
-	ror			r4, #16
-	vmov		d3, r5, r4					// 0x0014FFFB00010000
-
-	sub			r3, #16
-	ldr			r4, [sp, #8]
-
-	sub			r0, #2
-	vmov.u16	q14, #0x0014				// 20
-	vshr.u16	q15, q14, #2				// 5
-
-w17_h_mc_luma_loop:
-	vld1.u8	{d0,d1,d2}, [r0], r1	//only use 22(17+5); q0=src[-2]
-
-	vext.8		q2, q0, q1, #1		//q2=src[-1]
-	vext.8		q3, q0, q1, #2		//q3=src[0]
-	vext.8		q8, q0, q1, #3		//q8=src[1]
-	vext.8		q9, q0, q1, #4		//q9=src[2]
-	vext.8		q10, q0, q1, #5		//q10=src[3]
-
-	FILTER_6TAG_8BITS 	d0, d4, d6, d16, d18, d20, d22, q14, q15
-
-	FILTER_6TAG_8BITS 	d1, d5, d7, d17, d19, d21, d23, q14, q15
-
-	vst1.u8	{d22, d23}, [r2]!		//write [0:15] Byte
-
-	vsli.64	d2, d2, #8				// [0][1][2][3][4][5]XO-->O[0][1][2][3][4][5]X
-	FILTER_SINGLE_TAG_8BITS	d2, d3, d22, q11, q1
-
-	vst1.u8	{d2[0]}, [r2], r3		//write 16th Byte
-
-	sub		r4, #1
-	cmp		r4, #0
-	bne		w17_h_mc_luma_loop
-	pop		{r4-r5}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer20Width9_neon
-	push		{r4-r5}
-	mov			r4, #20
-	mov			r5, #1
-	sub			r4, r4, r4, lsl #(16-2)
-	lsl			r5, #16
-	ror			r4, #16
-	vmov		d7, r5, r4					// 0x0014FFFB00010000
-
-	sub			r3, #8
-	ldr			r4, [sp, #8]
-
-	sub			r0, #2
-	vmov.u16	q14, #0x0014				// 20
-	vshr.u16	q15, q14, #2				// 5
-
-w9_h_mc_luma_loop:
-	vld1.u8	{d0,d1}, [r0], r1	//only use 14(9+5); q0=src[-2]
-	pld			[r0]
-
-	vext.8		d2, d0, d1, #1		//d2=src[-1]
-	vext.8		d3, d0, d1, #2		//d3=src[0]
-	vext.8		d4, d0, d1, #3		//d4=src[1]
-	vext.8		d5, d0, d1, #4		//d5=src[2]
-	vext.8		d6, d0, d1, #5		//d6=src[3]
-
-	FILTER_6TAG_8BITS 	d0, d2, d3, d4, d5, d6, d16, q14, q15
-
-	sub		r4, #1
-	vst1.u8	{d16}, [r2]!		//write [0:7] Byte
-
-	vsli.64	d2, d1, #8				// [0][1][2][3][4][5]XO-->O[0][1][2][3][4][5]X
-	FILTER_SINGLE_TAG_8BITS	d2, d7, d18, q9, q1
-	vst1.u8	{d2[0]}, [r2], r3		//write 8th Byte
-
-	cmp		r4, #0
-	bne		w9_h_mc_luma_loop
-	pop		{r4-r5}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer02Height17_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
-
-	sub			r0, r0, r1, lsl #1		//src[-2*src_stride]
-	pld			[r0]
-	pld			[r0, r1]
-	vmov.u16	q14, #0x0014			// 20
-	vld1.u8	{q0}, [r0], r1		//q0=src[-2]
-	vld1.u8	{q1}, [r0], r1		//q1=src[-1]
-
-	pld			[r0]
-	pld			[r0, r1]
-	vshr.u16	q15, q14, #2			// 5
-	vld1.u8	{q2}, [r0], r1		//q2=src[0]
-	vld1.u8	{q3}, [r0], r1		//q3=src[1]
-	vld1.u8	{q8}, [r0], r1		//q8=src[2]
-
-w17_v_mc_luma_loop:
-
-	vld1.u8	{q9}, [r0], r1		//q9=src[3]
-
-	FILTER_6TAG_8BITS 	d0, d2, d4, d6, d16, d18, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d1, d3, d5, d7, d17, d19, d21, q14, q15
-	vld1.u8	{q0}, [r0], r1		//read 2nd row
-	vst1.u8	{q10}, [r2], r3			//write 1st 16Byte
-
-	FILTER_6TAG_8BITS 	d2, d4, d6, d16, d18, d0, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d3, d5, d7, d17, d19, d1, d21, q14, q15
-	vld1.u8	{q1}, [r0], r1		//read 3rd row
-	vst1.u8	{q10}, [r2], r3			//write 2nd 16Byte
-
-	FILTER_6TAG_8BITS 	d4, d6, d16, d18, d0, d2, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d5, d7, d17, d19, d1, d3, d21, q14, q15
-	vld1.u8	{q2}, [r0], r1		//read 4th row
-	vst1.u8	{q10}, [r2], r3			//write 3rd 16Byte
-
-	FILTER_6TAG_8BITS 	d6, d16, d18, d0, d2, d4, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d7, d17, d19, d1, d3, d5, d21, q14, q15
-	vld1.u8	{q3}, [r0], r1		//read 5th row
-	vst1.u8	{q10}, [r2], r3			//write 4th 16Byte
-
-	FILTER_6TAG_8BITS 	d16, d18, d0, d2, d4, d6, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d17, d19, d1, d3, d5, d7, d21, q14, q15
-	vld1.u8	{q8}, [r0], r1		//read 6th row
-	vst1.u8	{q10}, [r2], r3			//write 5th 16Byte
-
-	FILTER_6TAG_8BITS 	d18, d0, d2, d4, d6, d16, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d19, d1, d3, d5, d7, d17, d21, q14, q15
-	vld1.u8	{q9}, [r0], r1		//read 7th row
-	vst1.u8	{q10}, [r2], r3			//write 6th 16Byte
-
-	FILTER_6TAG_8BITS 	d0, d2, d4, d6, d16, d18, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d1, d3, d5, d7, d17, d19, d21, q14, q15
-	vld1.u8	{q0}, [r0], r1		//read 8th row
-	vst1.u8	{q10}, [r2], r3			//write 7th 16Byte
-
-	FILTER_6TAG_8BITS 	d2, d4, d6, d16, d18, d0, d20, q14, q15
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d3, d5, d7, d17, d19, d1, d21, q14, q15
-	vst1.u8	{q10}, [r2], r3			//write 8th 16Byte
-
-	//q2, q3, q8, q9, q0 --> q0~q8
-	vswp	q0, q8
-	vswp	q0, q2
-	vmov	q1, q3
-	vmov	q3, q9						//q0~q8
-
-	sub		r4, #8
-	cmp		r4, #1
-	bne		w17_v_mc_luma_loop
-	// the last 16Bytes
-	vld1.u8	{q9}, [r0], r1		//q9=src[3]
-	FILTER_6TAG_8BITS 	d0, d2, d4, d6, d16, d18, d20, q14, q15
-	FILTER_6TAG_8BITS 	d1, d3, d5, d7, d17, d19, d21, q14, q15
-	vst1.u8	{q10}, [r2], r3			//write 1st 16Byte
-
-	pop		{r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer02Height9_neon
-	push		{r4}
-	ldr			r4, [sp, #4]
-
-	sub			r0, r0, r1, lsl #1		//src[-2*src_stride]
-	pld			[r0]
-	pld			[r0, r1]
-	vmov.u16	q14, #0x0014			// 20
-	vld1.u8	{d0}, [r0], r1		//d0=src[-2]
-	vld1.u8	{d1}, [r0], r1		//d1=src[-1]
-
-	pld			[r0]
-	pld			[r0, r1]
-	vshr.u16	q15, q14, #2			// 5
-	vld1.u8	{d2}, [r0], r1		//d2=src[0]
-	vld1.u8	{d3}, [r0], r1		//d3=src[1]
-
-	vld1.u8	{d4}, [r0], r1		//d4=src[2]
-	vld1.u8	{d5}, [r0], r1		//d5=src[3]
-
-w9_v_mc_luma_loop:
-
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d0, d1, d2, d3, d4, d5, d16, q14, q15
-	vld1.u8	{d0}, [r0], r1		//read 2nd row
-	vst1.u8	{d16}, [r2], r3		//write 1st 8Byte
-
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d1, d2, d3, d4, d5, d0, d16, q14, q15
-	vld1.u8	{d1}, [r0], r1		//read 3rd row
-	vst1.u8	{d16}, [r2], r3		//write 2nd 8Byte
-
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d2, d3, d4, d5, d0, d1, d16, q14, q15
-	vld1.u8	{d2}, [r0], r1		//read 4th row
-	vst1.u8	{d16}, [r2], r3		//write 3rd 8Byte
-
-	pld			[r0]
-	FILTER_6TAG_8BITS 	d3, d4, d5, d0, d1, d2, d16, q14, q15
-	vld1.u8	{d3}, [r0], r1		//read 5th row
-	vst1.u8	{d16}, [r2], r3		//write 4th 8Byte
-
-	//d4, d5, d0, d1, d2, d3 --> d0, d1, d2, d3, d4, d5
-	vswp	q0, q2
-	vswp	q1, q2
-
-	sub		r4, #4
-	cmp		r4, #1
-	bne		w9_v_mc_luma_loop
-
-	FILTER_6TAG_8BITS 	d0, d1, d2, d3, d4, d5, d16, q14, q15
-	vst1.u8	{d16}, [r2], r3		//write last 8Byte
-
-	pop		{r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer22Width17_neon
-	push		{r4}
-	vpush		{q4-q7}
-	ldr			r4, [sp, #68]
-
-	sub			r0, #2					//src[-2]
-	sub			r0, r0, r1, lsl #1		//src[-2*src_stride-2]
-	pld			[r0]
-	pld			[r0, r1]
-
-	vmov.u16	q14, #0x0014			// 20
-	vld1.u8	{d0-d2}, [r0], r1		//use 21(17+5), =src[-2]
-	vld1.u8	{d3-d5}, [r0], r1		//use 21(17+5), =src[-1]
-
-	pld			[r0]
-	pld			[r0, r1]
-	vshr.u16	q15, q14, #2			// 5
-
-	vld1.u8	{d6-d8}, [r0], r1		//use 21(17+5), =src[0]
-	vld1.u8	{d9-d11}, [r0], r1	//use 21(17+5), =src[1]
-	pld			[r0]
-	pld			[r0, r1]
-	vld1.u8	{d12-d14}, [r0], r1	//use 21(17+5), =src[2]
-	sub			r3, #16
-
-w17_hv_mc_luma_loop:
-
-	vld1.u8	{d15-d17}, [r0], r1	//use 21(17+5), =src[3]
-	//the 1st row
-	pld			[r0]
-	// vertical filtered into q9/q10
-	FILTER_6TAG_8BITS_TO_16BITS 	d0, d3, d6, d9, d12, d15, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d1, d4, d7,d10, d13, d16,q10, q14, q15	// 8 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0	//output to q0[0]
-	// vertical filtered into q10/q11
-	FILTER_6TAG_8BITS_TO_16BITS 	d2, d5, d8,d11, d14, d17,q11, q14, q15	// only 6 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q10, q11, q9, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1	//output to q0[1]
-	vst1.u8	{d0, d1}, [r2]!			//write 16Byte
-	UNPACK_1_IN_8x16BITS_TO_8BITS	d2, d22, d23, q11 //output to d2[0]
-	vst1.u8	{d2[0]}, [r2], r3		//write 16th Byte
-
-	vld1.u8	{d0-d2}, [r0], r1		//read 2nd row
-	//the 2nd row
-	pld			[r0]
-	// vertical filtered into q9/q10
-	FILTER_6TAG_8BITS_TO_16BITS 	d3, d6, d9, d12, d15, d0, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d4, d7,d10, d13, d16, d1,q10, q14, q15	// 8 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d3	//output to d3
-	// vertical filtered into q10/q11
-	FILTER_6TAG_8BITS_TO_16BITS 	d5, d8,d11, d14, d17, d2,q11, q14, q15	// only 6 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q10, q11, q9, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d4	//output to d4
-	vst1.u8	{d3, d4}, [r2]!		//write 16Byte
-	UNPACK_1_IN_8x16BITS_TO_8BITS	d5, d22, d23, q11 //output to d5[0]
-	vst1.u8	{d5[0]}, [r2], r3		//write 16th Byte
-
-	vld1.u8	{d3-d5}, [r0], r1		//read 3rd row
-	//the 3rd row
-	pld			[r0]
-	// vertical filtered into q9/q10
-	FILTER_6TAG_8BITS_TO_16BITS 	d6, d9, d12, d15, d0, d3, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d7,d10, d13, d16, d1, d4,q10, q14, q15	// 8 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d6	//output to d6
-	// vertical filtered into q10/q11
-	FILTER_6TAG_8BITS_TO_16BITS 	d8,d11, d14, d17, d2, d5,q11, q14, q15	// only 6 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q10, q11, q9, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d7	//output to d7
-	vst1.u8	{d6, d7}, [r2]!		//write 16Byte
-	UNPACK_1_IN_8x16BITS_TO_8BITS	d8, d22, d23, q11 //output to d8[0]
-	vst1.u8	{d8[0]}, [r2], r3		//write 16th Byte
-
-	vld1.u8	{d6-d8}, [r0], r1		//read 4th row
-	//the 4th row
-	pld			[r0]
-	// vertical filtered into q9/q10
-	FILTER_6TAG_8BITS_TO_16BITS 	 d9, d12, d15, d0, d3, d6, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS		d10, d13, d16, d1, d4, d7,q10, q14, q15	// 8 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d9	//output to d9
-	// vertical filtered into q10/q11
-	FILTER_6TAG_8BITS_TO_16BITS 	d11, d14, d17, d2, d5, d8,q11, q14, q15	// only 6 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q10, q11, q9, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d10	//output to d10
-	vst1.u8	{d9, d10}, [r2], r3		//write 16Byte
-	UNPACK_1_IN_8x16BITS_TO_8BITS	d11, d22, d23, q11 //output to d11[0]
-	vst1.u8	{d11[0]}, [r2], r3		//write 16th Byte
-
-	//d12~d17(q6~q8), d0~d8(q0~q3+d8), --> d0~d14
-	vswp	q0, q6
-	vswp	q6, q3
-	vmov	q5, q2
-	vmov	q2, q8
-
-	vmov	d20,d8
-	vmov	q4, q1
-	vmov	q1, q7
-	vmov	d14,d20
-
-	sub		r4, #4
-	cmp		r4, #1
-	bne		w17_hv_mc_luma_loop
-	//the last row
-	vld1.u8	{d15-d17}, [r0], r1	//use 21(17+5), =src[3]
-	// vertical filtered into q9/q10
-	FILTER_6TAG_8BITS_TO_16BITS 	d0, d3, d6, d9, d12, d15, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d1, d4, d7,d10, d13, d16,q10, q14, q15	// 8 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d0	//output to q0[0]
-	// vertical filtered into q10/q11
-	FILTER_6TAG_8BITS_TO_16BITS 	d2, d5, d8,d11, d14, d17,q11, q14, q15	// only 6 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q10, q11, q9, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q9, q12, q13, d1	//output to q0[1]
-	vst1.u8	{q0}, [r2]!			//write 16Byte
-	UNPACK_1_IN_8x16BITS_TO_8BITS	d2, d22, d23, q11 //output to d2[0]
-	vst1.u8	{d2[0]}, [r2], r3		//write 16th Byte
-
-	vpop		{q4-q7}
-	pop		{r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN McHorVer22Width9_neon
-	push		{r4}
-	vpush		{q4}
-	ldr			r4, [sp, #20]
-
-	sub			r0, #2				//src[-2]
-	sub			r0, r0, r1, lsl #1	//src[-2*src_stride-2]
-	pld			[r0]
-	pld			[r0, r1]
-
-	vmov.u16	q14, #0x0014		// 20
-	vld1.u8	{q0}, [r0], r1	//use 14(9+5), =src[-2]
-	vld1.u8	{q1}, [r0], r1	//use 14(9+5), =src[-1]
-
-	pld			[r0]
-	pld			[r0, r1]
-	vshr.u16	q15, q14, #2		// 5
-
-	vld1.u8	{q2}, [r0], r1	//use 14(9+5), =src[0]
-	vld1.u8	{q3}, [r0], r1	//use 14(9+5), =src[1]
-	pld			[r0]
-	pld			[r0, r1]
-	vld1.u8	{q4}, [r0], r1	//use 14(9+5), =src[2]
-	sub			r3, #8
-
-w9_hv_mc_luma_loop:
-
-	vld1.u8	{q8}, [r0], r1	//use 14(9+5), =src[3]
-	//the 1st row
-	pld			[r0]
-	// vertical filtered into q9/q10
-	FILTER_6TAG_8BITS_TO_16BITS 	d0, d2, d4, d6, d8, d16, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d1, d3, d5, d7, d9, d17, q10, q14, q15	// 6 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18	//output to q9[0]
-	vst1.u8	d18, [r2]!				//write 8Byte
-	UNPACK_1_IN_8x16BITS_TO_8BITS	d19, d20, d21, q10 //output to d19[0]
-	vst1.u8	{d19[0]}, [r2], r3	//write 8th Byte
-
-	vld1.u8	{q0}, [r0], r1		//read 2nd row
-	//the 2nd row
-	pld			[r0]
-	// vertical filtered into q9/q10
-	FILTER_6TAG_8BITS_TO_16BITS 	d2, d4, d6, d8, d16, d0, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d3, d5, d7, d9, d17, d1, q10, q14, q15	// 6 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18	//output to q9[0]
-	vst1.u8	d18, [r2]!				//write 8Byte
-	UNPACK_1_IN_8x16BITS_TO_8BITS	d19, d20, d21, q10 //output to d19[0]
-	vst1.u8	{d19[0]}, [r2], r3	//write 8th Byte
-
-	vld1.u8	{q1}, [r0], r1		//read 3rd row
-	//the 3rd row
-	pld			[r0]
-	// vertical filtered into q9/q10
-	FILTER_6TAG_8BITS_TO_16BITS 	d4, d6, d8, d16, d0, d2, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d5, d7, d9, d17, d1, d3, q10, q14, q15	// 6 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18	//output to q9[0]
-	vst1.u8	d18, [r2]!				//write 8Byte
-	UNPACK_1_IN_8x16BITS_TO_8BITS	d19, d20, d21, q10 //output to d19[0]
-	vst1.u8	{d19[0]}, [r2], r3	//write 8th Byte
-
-	vld1.u8	{q2}, [r0], r1		//read 4th row
-	//the 4th row
-	pld			[r0]
-	// vertical filtered into q9/q10
-	FILTER_6TAG_8BITS_TO_16BITS 	d6, d8, d16, d0, d2, d4, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d7, d9, d17, d1, d3, d5, q10, q14, q15	// 6 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18	//output to q9[0]
-	vst1.u8	d18, [r2]!			//write 8Byte
-	UNPACK_1_IN_8x16BITS_TO_8BITS	d19, d20, d21, q10 //output to d19[0]
-	vst1.u8	{d19[0]}, [r2], r3	//write 8th Byte
-
-	//q4~q8, q0~q2, --> q0~q4
-	vswp	q0, q4
-	vswp	q2, q4
-	vmov	q3, q1
-	vmov	q1, q8
-
-	sub		r4, #4
-	cmp		r4, #1
-	bne		w9_hv_mc_luma_loop
-	//the last row
-	vld1.u8	{q8}, [r0], r1	//use 14(9+5), =src[3]
-	// vertical filtered into q9/q10
-	FILTER_6TAG_8BITS_TO_16BITS 	d0, d2, d4, d6, d8, d16, q9, q14, q15	// 8 avail
-	FILTER_6TAG_8BITS_TO_16BITS 	d1, d3, d5, d7, d9, d17, q10, q14, q15	// 6 avail
-	// horizon filtered
-	UNPACK_2_16BITS_TO_ABC	q9, q10, q11, q12, q13
-	FILTER_3_IN_16BITS_TO_8BITS q11, q12, q13, d18	//output to q9[0]
-	vst1.u8	d18, [r2]!				//write 8Byte
-	UNPACK_1_IN_8x16BITS_TO_8BITS	d19, d20, d21, q10 //output to d19[0]
-	vst1.u8	{d19[0]}, [r2], r3	//write 8th Byte
-	vpop		{q4}
-	pop		{r4}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN PixStrideAvgWidthEq16_neon
-	push		{r4, r5, r6}
-	ldr			r4, [sp, #12]
-	ldr			r5, [sp, #16]
-	ldr			r6, [sp, #20]
-
-enc_w16_pix_avg_loop:
-	vld1.u8		{q0}, [r2], r3
-	vld1.u8		{q1}, [r4], r5
-	vld1.u8		{q2}, [r2], r3
-	vld1.u8		{q3}, [r4], r5
-
-	vld1.u8		{q8}, [r2], r3
-	vld1.u8		{q9}, [r4], r5
-	vld1.u8		{q10}, [r2], r3
-	vld1.u8		{q11}, [r4], r5
-
-	AVERAGE_TWO_8BITS		d0, d0, d2
-	AVERAGE_TWO_8BITS		d1, d1, d3
-	vst1.u8		{q0}, [r0], r1
-
-	AVERAGE_TWO_8BITS		d4, d4, d6
-	AVERAGE_TWO_8BITS		d5, d5, d7
-	vst1.u8		{q2}, [r0], r1
-
-	AVERAGE_TWO_8BITS		d16, d16, d18
-	AVERAGE_TWO_8BITS		d17, d17, d19
-	vst1.u8		{q8}, [r0], r1
-
-	AVERAGE_TWO_8BITS		d20, d20, d22
-	AVERAGE_TWO_8BITS		d21, d21, d23
-	vst1.u8		{q10}, [r0], r1
-
-	sub			r6, #4
-	cmp			r6, #0
-	bne			enc_w16_pix_avg_loop
-
-	pop		{r4, r5, r6}
-WELS_ASM_FUNC_END
-
-
-WELS_ASM_FUNC_BEGIN PixStrideAvgWidthEq8_neon
-	push		{r4, r5, r6}
-	ldr			r4, [sp, #12]
-	ldr			r5, [sp, #16]
-	ldr			r6, [sp, #20]
-enc_w8_pix_avg_loop:
-
-	vld1.u8		{d0}, [r2], r3
-	vld1.u8		{d2}, [r4], r5
-	vld1.u8		{d1}, [r2], r3
-	vld1.u8		{d3}, [r4], r5
-
-	AVERAGE_TWO_8BITS		d0, d0, d2
-	AVERAGE_TWO_8BITS		d1, d1, d3
-	vst1.u8		{d0}, [r0], r1
-	vst1.u8		{d1}, [r0], r1
-
-	vld1.u8		{d4}, [r2], r3
-	vld1.u8		{d6}, [r4], r5
-	vld1.u8		{d5}, [r2], r3
-	vld1.u8		{d7}, [r4], r5
-
-	AVERAGE_TWO_8BITS		d4, d4, d6
-	AVERAGE_TWO_8BITS		d5, d5, d7
-	vst1.u8		{d4}, [r0], r1
-	vst1.u8		{d5}, [r0], r1
-
-	sub			r6, #4
-	cmp			r6, #0
-	bne			enc_w8_pix_avg_loop
-
-	pop		{r4, r5, r6}
-WELS_ASM_FUNC_END
-
-#endif
--- a/codec/common/measure_time.h
+++ /dev/null
@@ -1,89 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2009-2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * \file	measure_time.h
- *
- * \brief	time cost measure utilization
- *
- * \date	04/28/2009 Created
- *
- *************************************************************************************
- */
-#ifndef WELS_TIME_COST_MEASURE_UTIL_H__
-#define WELS_TIME_COST_MEASURE_UTIL_H__
-
-#include <stdlib.h>
-
-#include "typedefs.h"
-#ifndef _WIN32
-#include <sys/time.h>
-#else
-#include <windows.h>
-#include <sys/timeb.h>
-#endif
-#include <time.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif//__cplusplus
-
-/*!
- * \brief	time cost measure utilization
- * \param	void
- * \return	time elapsed since run (unit: microsecond)
- */
-
-static inline int64_t WelsTime (void) {
-#ifndef _WIN32
-  struct timeval tv_date;
-
-  gettimeofday (&tv_date, NULL);
-  return ((int64_t) tv_date.tv_sec * 1000000 + (int64_t) tv_date.tv_usec);
-#else
-  static int64_t iMtimeFreq = 0;
-  int64_t iMtimeCur = 0;
-  int64_t iResult = 0;
-  if (!iMtimeFreq) {
-    QueryPerformanceFrequency ((LARGE_INTEGER*)&iMtimeFreq);
-    if (!iMtimeFreq)
-      iMtimeFreq = 1;
-  }
-  QueryPerformanceCounter ((LARGE_INTEGER*)&iMtimeCur);
-  iResult = (int64_t) ((double)iMtimeCur * 1e6 / (double)iMtimeFreq + 0.5);
-  return iResult;
-#endif//WIN32
-}
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif//WELS_TIME_COST_MEASURE_UTIL_H__
--- a/codec/common/satd_sad.asm
+++ /dev/null
@@ -1,2184 +1,0 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  satd_sad.asm
-;*
-;*  Abstract
-;*      WelsSampleSatd4x4_sse2
-;*      WelsSampleSatd8x8_sse2
-;*      WelsSampleSatd16x8_sse2
-;*      WelsSampleSatd8x16_sse2
-;*      WelsSampleSatd16x16_sse2
-;*
-;*      WelsSampleSad16x8_sse2
-;*      WelsSampleSad16x16_sse2
-;*
-;*  History
-;*      8/5/2009 Created
-;*     24/9/2009 modified
-;*
-;*
-;*************************************************************************/
-
-%include "asm_inc.asm"
-
-;***********************************************************************
-; Data
-;***********************************************************************
-SECTION .rodata align=16
-
-align 16
-HSumSubDB1:   db 1,1,1,1,1,1,1,1,1,-1,1,-1,1,-1,1,-1
-align 16
-HSumSubDW1:   dw 1,-1,1,-1,1,-1,1,-1
-align 16
-PDW1:  dw 1,1,1,1,1,1,1,1
-align 16
-PDQ2:  dw 2,0,0,0,2,0,0,0
-align 16
-HSwapSumSubDB1:   times 2 db 1, 1, 1, 1, 1, -1, 1, -1
-
-;***********************************************************************
-; Code
-;***********************************************************************
-SECTION .text
-
-;***********************************************************************
-;
-;Pixel_satd_wxh_sse2 BEGIN
-;
-;***********************************************************************
-%macro MMX_DW_1_2REG 2
-      pxor %1, %1
-      pcmpeqw %2, %2
-      psubw %1, %2
-%endmacro
-
-%macro  SSE2_SumWHorizon1 2
-	movdqa      %2, %1
-	psrldq      %2, 8
-	paddusw     %1, %2
-	movdqa      %2, %1
-	psrldq      %2, 4
-	paddusw     %1, %2
-	movdqa      %2, %1
-	psrldq      %2, 2
-	paddusw     %1, %2
-%endmacro
-
-%macro SSE2_HDMTwo4x4 5 ;in: xmm1,xmm2,xmm3,xmm4  pOut: xmm4,xmm2,xmm1,xmm3
-   SSE2_SumSub %1, %2, %5
-   SSE2_SumSub %3, %4, %5
-   SSE2_SumSub %2, %4, %5
-   SSE2_SumSub %1, %3, %5
-%endmacro
-
-%macro SSE2_SumAbs4 7
-	WELS_AbsW %1, %3
-	WELS_AbsW %2, %3
-	WELS_AbsW %4, %6
-	WELS_AbsW %5, %6
-	paddusw       %1, %2
-	paddusw       %4, %5
-	paddusw       %7, %1
-	paddusw       %7, %4
-%endmacro
-
-%macro  SSE2_SumWHorizon 3
-	movhlps		%2, %1			; x2 = xx xx xx xx d7 d6 d5 d4
-	paddw		%1, %2			; x1 = xx xx xx xx d37 d26 d15 d04
-	punpcklwd	%1, %3			; x1 =  d37  d26 d15 d04
-	movhlps		%2, %1			; x2 = xxxx xxxx d37 d26
-	paddd		%1, %2			; x1 = xxxx xxxx d1357 d0246
-	pshuflw		%2, %1, 0x4e	; x2 = xxxx xxxx d0246 d1357
-	paddd		%1, %2			; x1 = xxxx xxxx xxxx  d01234567
-%endmacro
-
-%macro SSE2_GetSatd8x8 0
-	SSE2_LoadDiff8P    xmm0,xmm4,xmm7,[r0],[r2]
-	SSE2_LoadDiff8P    xmm1,xmm5,xmm7,[r0+r1],[r2+r3]
-	lea                 r0, [r0+2*r1]
-	lea                 r2, [r2+2*r3]
-	SSE2_LoadDiff8P    xmm2,xmm4,xmm7,[r0],[r2]
-	SSE2_LoadDiff8P    xmm3,xmm5,xmm7,[r0+r1],[r2+r3]
-
-	SSE2_HDMTwo4x4       xmm0,xmm1,xmm2,xmm3,xmm4
-	SSE2_TransTwo4x4W     xmm3,xmm1,xmm0,xmm2,xmm4
-	SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm4,xmm5
-	SSE2_SumAbs4         xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
-
-	lea					r0,    [r0+2*r1]
-    lea					r2,    [r2+2*r3]
-	SSE2_LoadDiff8P    xmm0,xmm4,xmm7,[r0],[r2]
-	SSE2_LoadDiff8P    xmm1,xmm5,xmm7,[r0+r1],[r2+r3]
-	lea                 r0, [r0+2*r1]
-	lea                 r2, [r2+2*r3]
-	SSE2_LoadDiff8P    xmm2,xmm4,xmm7,[r0],[r2]
-	SSE2_LoadDiff8P    xmm3,xmm5,xmm7,[r0+r1],[r2+r3]
-
-	SSE2_HDMTwo4x4       xmm0,xmm1,xmm2,xmm3,xmm4
-	SSE2_TransTwo4x4W     xmm3,xmm1,xmm0,xmm2,xmm4
-	SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm4,xmm5
-	SSE2_SumAbs4         xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
-%endmacro
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatd4x4_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSatd4x4_sse2
-	%assign  push_num 0
-	LOAD_4_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
-    movd      xmm0, [r0]
-    movd      xmm1, [r0+r1]
-    lea       r0 , [r0+2*r1]
-    movd      xmm2, [r0]
-    movd      xmm3, [r0+r1]
-    punpckldq xmm0, xmm2
-    punpckldq xmm1, xmm3
-
-    movd      xmm4, [r2]
-    movd      xmm5, [r2+r3]
-    lea       r2 , [r2+2*r3]
-    movd      xmm6, [r2]
-    movd      xmm7, [r2+r3]
-    punpckldq xmm4, xmm6
-    punpckldq xmm5, xmm7
-
-    pxor      xmm6, xmm6
-    punpcklbw xmm0, xmm6
-    punpcklbw xmm1, xmm6
-    punpcklbw xmm4, xmm6
-    punpcklbw xmm5, xmm6
-
-    psubw     xmm0, xmm4
-    psubw     xmm1, xmm5
-
-    movdqa    xmm2, xmm0
-    paddw     xmm0, xmm1
-    psubw     xmm2, xmm1
-    SSE2_XSawp qdq, xmm0, xmm2, xmm3
-
-    movdqa     xmm4, xmm0
-    paddw      xmm0, xmm3
-    psubw      xmm4, xmm3
-
-    movdqa         xmm2, xmm0
-    punpcklwd      xmm0, xmm4
-    punpckhwd      xmm4, xmm2
-
-	SSE2_XSawp     dq,  xmm0, xmm4, xmm3
-	SSE2_XSawp     qdq, xmm0, xmm3, xmm5
-
-    movdqa         xmm7, xmm0
-    paddw          xmm0, xmm5
-    psubw          xmm7, xmm5
-
-	SSE2_XSawp     qdq,  xmm0, xmm7, xmm1
-
-    movdqa         xmm2, xmm0
-    paddw          xmm0, xmm1
-    psubw          xmm2, xmm1
-
-    WELS_AbsW  xmm0, xmm3
-    paddusw        xmm6, xmm0
-	WELS_AbsW  xmm2, xmm4
-    paddusw        xmm6, xmm2
-    SSE2_SumWHorizon1  xmm6, xmm4
-	movd           retrd,  xmm6
-    and            retrd,  0xffff
-    shr            retrd,  1
-	POP_XMM
-	LOAD_4_PARA_POP
-	ret
-
- ;***********************************************************************
- ;
- ;int32_t WelsSampleSatd8x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
- ;
- ;***********************************************************************
-WELS_EXTERN WelsSampleSatd8x8_sse2
-	%assign  push_num 0
-	LOAD_4_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
-	pxor   xmm6,   xmm6
-    pxor   xmm7,   xmm7
-    SSE2_GetSatd8x8
-    psrlw   xmm6,  1
-	SSE2_SumWHorizon   xmm6,xmm4,xmm7
-	movd    retrd,   xmm6
-	POP_XMM
-	LOAD_4_PARA_POP
-	ret
-
- ;***********************************************************************
- ;
- ;int32_t WelsSampleSatd8x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
- ;
- ;***********************************************************************
-WELS_EXTERN WelsSampleSatd8x16_sse2
-	 %assign  push_num 0
-	 LOAD_4_PARA
-	 PUSH_XMM 8
-	 SIGN_EXTENSION r1, r1d
-	 SIGN_EXTENSION r3, r3d
-	 pxor   xmm6,   xmm6
-     pxor   xmm7,   xmm7
-
-	 SSE2_GetSatd8x8
-     lea    r0,    [r0+2*r1]
-     lea    r2,    [r2+2*r3]
-	 SSE2_GetSatd8x8
-
-	 psrlw   xmm6,  1
-	 SSE2_SumWHorizon   xmm6,xmm4,xmm7
-	 movd    retrd,   xmm6
-	 POP_XMM
-	 LOAD_4_PARA_POP
-	 ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatd16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSatd16x8_sse2
-	%assign  push_num 0
-	LOAD_4_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
-	push r0
-	push r2
-	pxor   xmm6,   xmm6
-    pxor   xmm7,   xmm7
-
-	SSE2_GetSatd8x8
-
-	pop r2
-	pop r0
-    add    r0,    8
-    add    r2,    8
-	SSE2_GetSatd8x8
-
-	psrlw   xmm6,  1
-	SSE2_SumWHorizon   xmm6,xmm4,xmm7
-	movd    retrd,   xmm6
-	POP_XMM
-	LOAD_4_PARA_POP
-	ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatd16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSatd16x16_sse2
-	%assign  push_num 0
-	LOAD_4_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
-	push r0
-	push r2
-	pxor   xmm6,   xmm6
-    pxor   xmm7,   xmm7
-
-	SSE2_GetSatd8x8
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	SSE2_GetSatd8x8
-
-	pop r2
-	pop r0
-	add    r0,    8
-	add    r2,    8
-
-	SSE2_GetSatd8x8
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	SSE2_GetSatd8x8
-
- ; each column sum of SATD is necessarily even, so we don't lose any precision by shifting first.
-    psrlw   xmm6,  1
-	SSE2_SumWHorizon   xmm6,xmm4,xmm7
-	movd    retrd,   xmm6
-	POP_XMM
-	LOAD_4_PARA_POP
-	ret
-
-;***********************************************************************
-;
-;Pixel_satd_wxh_sse2 END
-;
-;***********************************************************************
-
-;***********************************************************************
-;
-;Pixel_satd_intra_sse2 BEGIN
-;
-;***********************************************************************
-
-%macro SSE41_I16x16Get8WSumSub 3 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3
-	pmaddubsw    %1, xmm5
-	movdqa       %2, %1
-	pmaddwd      %1, xmm7
-	pmaddwd      %2, xmm6
-	movdqa       %3, %1
-	punpckldq    %1, %2
-	punpckhdq    %2, %3
-	movdqa       %3, %1
-	punpcklqdq   %1, %2
-	punpckhqdq   %3, %2
-	paddd        xmm4, %1 ;for dc
-	paddd        xmm4, %3 ;for dc
-	packssdw     %1, %3
-	psllw        %1, 2
-%endmacro
-%macro SSE41_ChromaGet8WSumSub 4 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3 : %4 tempsse2
-	pmaddubsw    %1, xmm5
-	movdqa       %2, %1
-	pmaddwd      %1, xmm7
-	pmaddwd      %2, xmm6
-	movdqa       %3, %1
-	punpckldq    %1, %2
-	punpckhdq    %2, %3
-	movdqa       %3, %1
-	punpcklqdq   %1, %2
-	punpckhqdq   %3, %2
-;    paddd        xmm4, %1 ;for dc
-;	 paddd        xmm4, %3 ;for dc
-	movdqa       %4, %1
-	punpcklqdq   %4, %3
-	packssdw     %1, %3
-	psllw        %1, 2
-%endmacro
-
-%macro SSE41_GetX38x4SatdDec 0
-	pxor        xmm7,   xmm7
-	movq        xmm0,   [eax]
-	movq        xmm1,   [eax+ebx]
-	lea         eax,    [eax+2*ebx]
-	movq        xmm2,   [eax]
-	movq        xmm3,   [eax+ebx]
-	lea         eax,    [eax+2*ebx]
-	punpcklbw   xmm0,   xmm7
-	punpcklbw   xmm1,   xmm7
-	punpcklbw   xmm2,   xmm7
-	punpcklbw   xmm3,   xmm7
-	SSE2_HDMTwo4x4       xmm0,xmm1,xmm2,xmm3,xmm7
-	SSE2_TransTwo4x4W     xmm3,xmm1,xmm0,xmm2,xmm7
-	SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm7,xmm0 ;pOut xmm7,xmm1,xmm3,xmm2
-	;doesn't need another transpose
-%endmacro
-%macro SSE41_GetX38x4SatdV 2
-	pxor        xmm0,   xmm0
-	pinsrw      xmm0,   word[esi+%2],   0
-	pinsrw      xmm0,   word[esi+%2+8], 4
-	psubsw      xmm0,   xmm7
-	pabsw       xmm0,   xmm0
-	paddw       xmm4,   xmm0
-	pxor        xmm0,   xmm0
-	pinsrw      xmm0,   word[esi+%2+2],  0
-	pinsrw      xmm0,   word[esi+%2+10], 4
-	psubsw      xmm0,   xmm1
-	pabsw       xmm0,   xmm0
-	paddw       xmm4,   xmm0
-	pxor        xmm0,   xmm0
-	pinsrw      xmm0,   word[esi+%2+4],  0
-	pinsrw      xmm0,   word[esi+%2+12], 4
-	psubsw      xmm0,   xmm3
-	pabsw       xmm0,   xmm0
-	paddw       xmm4,   xmm0
-	pxor        xmm0,   xmm0
-	pinsrw      xmm0,   word[esi+%2+6],  0
-	pinsrw      xmm0,   word[esi+%2+14], 4
-	psubsw      xmm0,   xmm2
-	pabsw       xmm0,   xmm0
-	paddw       xmm4,   xmm0
-%endmacro
-%macro SSE41_GetX38x4SatdH  3
-	movq        xmm0,   [esi+%3+8*%1]
-	punpcklqdq  xmm0,   xmm0
-	psubsw      xmm0,   xmm7
-	pabsw       xmm0,   xmm0
-	paddw       xmm5,   xmm0
-	pabsw       xmm1,   xmm1
-	pabsw       xmm2,   xmm2
-	pabsw       xmm3,   xmm3
-	paddw       xmm2,   xmm1;for DC
-	paddw       xmm2,   xmm3;for DC
-	paddw       xmm5,   xmm2
-%endmacro
-%macro SSE41_I16X16GetX38x4SatdDC 0
-	pxor        xmm0,   xmm0
-	movq2dq     xmm0,   mm4
-	punpcklqdq  xmm0,   xmm0
-	psubsw      xmm0,   xmm7
-	pabsw       xmm0,   xmm0
-	paddw       xmm6,   xmm0
-	paddw       xmm6,   xmm2
-%endmacro
-%macro SSE41_ChromaGetX38x4SatdDC 1
-	shl         %1,     4
-	movdqa      xmm0,   [esi+32+%1]
-	psubsw      xmm0,   xmm7
-	pabsw       xmm0,   xmm0
-	paddw       xmm6,   xmm0
-	paddw       xmm6,   xmm2
-%endmacro
-%macro SSE41_I16x16GetX38x4Satd 2
-	SSE41_GetX38x4SatdDec
-	SSE41_GetX38x4SatdV   %1, %2
-	SSE41_GetX38x4SatdH   %1, %2, 32
-	SSE41_I16X16GetX38x4SatdDC
-%endmacro
-%macro SSE41_ChromaGetX38x4Satd 2
-	SSE41_GetX38x4SatdDec
-	SSE41_GetX38x4SatdV   %1, %2
-	SSE41_GetX38x4SatdH   %1, %2, 16
-	SSE41_ChromaGetX38x4SatdDC %1
-%endmacro
-%macro SSE41_HSum8W 3
-	pmaddwd     %1, %2
-	movhlps     %3, %1
-	paddd       %1, %3
-	pshuflw     %3, %1,0Eh
-	paddd       %1, %3
-%endmacro
-
-
-%ifdef X86_32
-WELS_EXTERN WelsIntra16x16Combined3Satd_sse41
-	push   ebx
-	push   esi
-	push   edi
-	mov    ecx,    [esp+16]
-	mov    edx,    [esp+20]
-	mov    eax,    [esp+24]
-	mov    ebx,    [esp+28]
-	mov    esi,    [esp+40] ;temp_satd
-	pxor        xmm4,   xmm4
-	movdqa      xmm5,   [HSumSubDB1]
-	movdqa      xmm6,   [HSumSubDW1]
-	movdqa      xmm7,   [PDW1]
-	sub         ecx,    edx
-	movdqu 		xmm0,   [ecx]
-	movhlps		xmm1,   xmm0
-	punpcklqdq  xmm0,   xmm0
-	punpcklqdq  xmm1,   xmm1
-	SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
-	SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
-	movdqa      [esi],  xmm0 ;V
-	movdqa      [esi+16], xmm1
-	add         ecx,    edx
-	pinsrb      xmm0,   byte[ecx-1], 0
-	pinsrb      xmm0,   byte[ecx+edx-1], 1
-	lea         ecx,    [ecx+2*edx]
-	pinsrb      xmm0,   byte[ecx-1],     2
-	pinsrb      xmm0,   byte[ecx+edx-1], 3
-	lea         ecx,    [ecx+2*edx]
-	pinsrb      xmm0,   byte[ecx-1],     4
-	pinsrb      xmm0,   byte[ecx+edx-1], 5
-	lea         ecx,    [ecx+2*edx]
-	pinsrb      xmm0,   byte[ecx-1],     6
-	pinsrb      xmm0,   byte[ecx+edx-1], 7
-	lea         ecx,    [ecx+2*edx]
-	pinsrb      xmm0,   byte[ecx-1],     8
-	pinsrb      xmm0,   byte[ecx+edx-1], 9
-	lea         ecx,    [ecx+2*edx]
-	pinsrb      xmm0,   byte[ecx-1],     10
-	pinsrb      xmm0,   byte[ecx+edx-1], 11
-	lea         ecx,    [ecx+2*edx]
-	pinsrb      xmm0,   byte[ecx-1],     12
-	pinsrb      xmm0,   byte[ecx+edx-1], 13
-	lea         ecx,    [ecx+2*edx]
-	pinsrb      xmm0,   byte[ecx-1],     14
-	pinsrb      xmm0,   byte[ecx+edx-1], 15
-	movhlps		xmm1,   xmm0
-	punpcklqdq  xmm0,   xmm0
-	punpcklqdq  xmm1,   xmm1
-	SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
-	SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
-	movdqa      [esi+32], xmm0 ;H
-	movdqa      [esi+48], xmm1
-	movd        ecx,    xmm4 ;dc
-	add         ecx,    16   ;(sum+16)
-	shr         ecx,    5    ;((sum+16)>>5)
-	shl         ecx,    4    ;
-	movd        mm4,    ecx  ; mm4 copy DC
-	pxor        xmm4,   xmm4 ;V
-	pxor        xmm5,   xmm5 ;H
-	pxor        xmm6,   xmm6 ;DC
-	mov         ecx,    0
-	mov         edi,    0
-.loop16x16_get_satd:
-.loopStart1:
-	SSE41_I16x16GetX38x4Satd ecx, edi
-	inc          ecx
-	cmp         ecx, 4
-	jl          .loopStart1
-	cmp         edi, 16
-	je          .loop16x16_get_satd_end
-	mov         eax, [esp+24]
-	add         eax, 8
-	mov         ecx, 0
-	add         edi, 16
-	jmp         .loop16x16_get_satd
- .loop16x16_get_satd_end:
-	MMX_DW_1_2REG    xmm0, xmm1
-	psrlw       xmm4, 1 ;/2
-	psrlw       xmm5, 1 ;/2
-	psrlw       xmm6, 1 ;/2
-	SSE41_HSum8W     xmm4, xmm0, xmm1
-	SSE41_HSum8W     xmm5, xmm0, xmm1
-	SSE41_HSum8W     xmm6, xmm0, xmm1
-
-	; comparing order: DC H V
-	movd      ebx, xmm6 ;DC
-	movd      edi, xmm5 ;H
-	movd      ecx, xmm4 ;V
-	mov      edx, [esp+36]
-	shl       edx, 1
-	add       edi, edx
-	add       ebx, edx
-	mov       edx, [esp+32]
-	cmp       ebx, edi
-	jge near   not_dc_16x16
-	cmp        ebx, ecx
-	jge near   not_dc_h_16x16
-
-	; for DC mode
-	mov       dword[edx], 2;I16_PRED_DC
-	mov       eax, ebx
-	jmp near return_satd_intra_16x16_x3
-not_dc_16x16:
-	; for H mode
-	cmp       edi, ecx
-	jge near   not_dc_h_16x16
-	mov       dword[edx], 1;I16_PRED_H
-	mov       eax, edi
-	jmp near return_satd_intra_16x16_x3
-not_dc_h_16x16:
-	; for V mode
-	mov       dword[edx], 0;I16_PRED_V
-	mov       eax, ecx
-return_satd_intra_16x16_x3:
-	WELSEMMS
-	pop         edi
-	pop         esi
-	pop         ebx
-ret
-
-%macro SSE41_ChromaGetX38x8Satd 0
-	movdqa      xmm5,   [HSumSubDB1]
-	movdqa      xmm6,   [HSumSubDW1]
-	movdqa      xmm7,   [PDW1]
-	sub         ecx,    edx
-	movq 		xmm0,   [ecx]
-	punpcklqdq  xmm0,   xmm0
-	SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm4
-	movdqa      [esi],  xmm0 ;V
-	add         ecx,    edx
-	pinsrb      xmm0,   byte[ecx-1], 0
-	pinsrb      xmm0,   byte[ecx+edx-1], 1
-	lea         ecx,    [ecx+2*edx]
-	pinsrb      xmm0,   byte[ecx-1],     2
-	pinsrb      xmm0,   byte[ecx+edx-1], 3
-	lea         ecx,    [ecx+2*edx]
-	pinsrb      xmm0,   byte[ecx-1],     4
-	pinsrb      xmm0,   byte[ecx+edx-1], 5
-	lea         ecx,    [ecx+2*edx]
-	pinsrb      xmm0,   byte[ecx-1],     6
-	pinsrb      xmm0,   byte[ecx+edx-1], 7
-	punpcklqdq  xmm0,   xmm0
-	SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm1
-	movdqa      [esi+16], xmm0 ;H
-;(sum+2)>>2
-	movdqa      xmm6,   [PDQ2]
-	movdqa      xmm5,   xmm4
-	punpckhqdq  xmm5,   xmm1
-	paddd       xmm5,   xmm6
-	psrld       xmm5,   2
-;(sum1+sum2+4)>>3
-	paddd       xmm6,   xmm6
-	paddd       xmm4,   xmm1
-	paddd       xmm4,   xmm6
-	psrld       xmm4,   3
-;satd *16
-	pslld       xmm5,   4
-	pslld       xmm4,   4
-;temp satd
-	movdqa      xmm6,   xmm4
-	punpcklqdq  xmm4,   xmm5
-	psllq       xmm4,   32
-	psrlq       xmm4,   32
-	movdqa      [esi+32], xmm4
-	punpckhqdq  xmm5,   xmm6
-	psllq       xmm5,   32
-	psrlq       xmm5,   32
-	movdqa      [esi+48], xmm5
-
-	pxor        xmm4,   xmm4 ;V
-	pxor        xmm5,   xmm5 ;H
-	pxor        xmm6,   xmm6 ;DC
-	mov         ecx,    0
-loop_chroma_satdx3_cb_cr:
-	SSE41_ChromaGetX38x4Satd ecx, 0
-	inc             ecx
-	cmp             ecx, 2
-	jl              loop_chroma_satdx3_cb_cr
-%endmacro
-
-%macro SSEReg2MMX 3
-	movdq2q     %2, %1
-	movhlps     %1, %1
-	movdq2q     %3, %1
-%endmacro
-%macro MMXReg2SSE 4
-	movq2dq     %1, %3
-	movq2dq     %2, %4
-	punpcklqdq  %1, %2
-%endmacro
-;for reduce the code size of WelsIntraChroma8x8Combined3Satd_sse41
-
-WELS_EXTERN WelsIntraChroma8x8Combined3Satd_sse41
-	push   ebx
-	push   esi
-	push   edi
-	mov    ecx,    [esp+16]
-	mov    edx,    [esp+20]
-	mov    eax,    [esp+24]
-	mov    ebx,    [esp+28]
-	mov    esi,    [esp+40] ;temp_satd
-	xor    edi,    edi
-loop_chroma_satdx3:
-	SSE41_ChromaGetX38x8Satd
-	cmp             edi, 1
-	je              loop_chroma_satdx3end
-	inc             edi
-	SSEReg2MMX  xmm4, mm0,mm1
-	SSEReg2MMX  xmm5, mm2,mm3
-	SSEReg2MMX  xmm6, mm5,mm6
-	mov         ecx,  [esp+44]
-	mov         eax,  [esp+48]
-	jmp         loop_chroma_satdx3
-loop_chroma_satdx3end:
-	MMXReg2SSE  xmm0, xmm3, mm0, mm1
-	MMXReg2SSE  xmm1, xmm3, mm2, mm3
-	MMXReg2SSE  xmm2, xmm3, mm5, mm6
-
-	paddw       xmm4, xmm0
-	paddw       xmm5, xmm1
-	paddw       xmm6, xmm2
-
-	MMX_DW_1_2REG    xmm0, xmm1
-	psrlw       xmm4, 1 ;/2
-	psrlw       xmm5, 1 ;/2
-	psrlw       xmm6, 1 ;/2
-	SSE41_HSum8W     xmm4, xmm0, xmm1
-	SSE41_HSum8W     xmm5, xmm0, xmm1
-	SSE41_HSum8W     xmm6, xmm0, xmm1
-	; comparing order: DC H V
-	movd      ebx, xmm6 ;DC
-	movd      edi, xmm5 ;H
-	movd      ecx, xmm4 ;V
-	mov       edx, [esp+36]
-	shl       edx, 1
-	add       edi, edx
-	add       ecx, edx
-	mov       edx, [esp+32]
-	cmp       ebx, edi
-	jge near   not_dc_8x8
-	cmp        ebx, ecx
-	jge near   not_dc_h_8x8
-
-	; for DC mode
-	mov       dword[edx], 0;I8_PRED_DC
-	mov       eax, ebx
-	jmp near return_satd_intra_8x8_x3
-not_dc_8x8:
-	; for H mode
-	cmp       edi, ecx
-	jge near   not_dc_h_8x8
-	mov       dword[edx], 1;I8_PRED_H
-	mov       eax, edi
-	jmp near return_satd_intra_8x8_x3
-not_dc_h_8x8:
-	; for V mode
-	mov       dword[edx], 2;I8_PRED_V
-	mov       eax, ecx
-return_satd_intra_8x8_x3:
-	WELSEMMS
-	pop         edi
-	pop         esi
-	pop         ebx
-ret
-
-
-;***********************************************************************
-;
-;Pixel_satd_intra_sse2 END
-;
-;***********************************************************************
-%macro SSSE3_Get16BSadHVDC 2
-  movd        xmm6,%1
-  pshufb      xmm6,xmm1
-  movdqa      %1,  xmm6
-  movdqa      xmm0,%2
-  psadbw      xmm0,xmm7
-  paddw       xmm4,xmm0
-  movdqa      xmm0,%2
-  psadbw      xmm0,xmm5
-  paddw       xmm2,xmm0
-  psadbw      xmm6,%2
-  paddw       xmm3,xmm6
-%endmacro
-%macro WelsAddDCValue 4
-    movzx   %2, byte %1
-    mov    %3, %2
-    add     %4, %2
-%endmacro
-
-;***********************************************************************
-;
-;Pixel_sad_intra_ssse3 BEGIN
-;
-;***********************************************************************
-WELS_EXTERN WelsIntra16x16Combined3Sad_ssse3
-	push   ebx
-	push   esi
-	push   edi
-	mov    ecx,    [esp+16]
-	mov    edx,    [esp+20]
-	mov    edi,    [esp+40] ;temp_sad
-	sub    ecx,    edx
-    movdqa      xmm5,[ecx]
-    pxor        xmm0,xmm0
-    psadbw      xmm0,xmm5
-    movhlps     xmm1,xmm0
-    paddw       xmm0,xmm1
-    movd        eax,xmm0
-
-    add         ecx,edx
-    lea         ebx, [edx+2*edx]
-    WelsAddDCValue [ecx-1      ], esi, [edi   ], eax
-    WelsAddDCValue [ecx-1+edx  ], esi, [edi+16], eax
-    WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
-    WelsAddDCValue [ecx-1+ebx  ], esi, [edi+48], eax
-    lea         ecx, [ecx+4*edx]
-    add         edi, 64
-    WelsAddDCValue [ecx-1      ], esi, [edi   ], eax
-    WelsAddDCValue [ecx-1+edx  ], esi, [edi+16], eax
-    WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
-    WelsAddDCValue [ecx-1+ebx  ], esi, [edi+48], eax
-    lea         ecx, [ecx+4*edx]
-    add         edi, 64
-    WelsAddDCValue [ecx-1      ], esi, [edi   ], eax
-    WelsAddDCValue [ecx-1+edx  ], esi, [edi+16], eax
-    WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
-    WelsAddDCValue [ecx-1+ebx  ], esi, [edi+48], eax
-    lea         ecx, [ecx+4*edx]
-    add         edi, 64
-    WelsAddDCValue [ecx-1      ], esi, [edi   ], eax
-    WelsAddDCValue [ecx-1+edx  ], esi, [edi+16], eax
-    WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
-    WelsAddDCValue [ecx-1+ebx  ], esi, [edi+48], eax
-    sub        edi, 192
-    add         eax,10h
-    shr         eax,5
-    movd        xmm7,eax
-    pxor        xmm1,xmm1
-    pshufb      xmm7,xmm1
-    pxor        xmm4,xmm4
-    pxor        xmm3,xmm3
-    pxor        xmm2,xmm2
-;sad begin
-	mov    eax,    [esp+24]
-	mov    ebx,    [esp+28]
-    lea         esi, [ebx+2*ebx]
-    SSSE3_Get16BSadHVDC [edi], [eax]
-    SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
-    SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
-    SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
-    add         edi, 64
-    lea         eax, [eax+4*ebx]
-    SSSE3_Get16BSadHVDC [edi], [eax]
-    SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
-    SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
-    SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
-    add         edi, 64
-    lea         eax, [eax+4*ebx]
-    SSSE3_Get16BSadHVDC [edi], [eax]
-    SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
-    SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
-    SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
-    add         edi, 64
-    lea         eax, [eax+4*ebx]
-    SSSE3_Get16BSadHVDC [edi], [eax]
-    SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
-    SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
-    SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
-
-    pslldq      xmm3,4
-    por         xmm3,xmm2
-    movhlps     xmm1,xmm3
-    paddw       xmm3,xmm1
-    movhlps     xmm0,xmm4
-    paddw       xmm4,xmm0
-; comparing order: DC H V
-	movd        ebx, xmm4 ;DC
-	movd        ecx, xmm3 ;V
-	psrldq      xmm3, 4
-	movd        esi, xmm3 ;H
-	mov         eax, [esp+36] ;lamda
-	shl         eax, 1
-	add         esi, eax
-	add         ebx, eax
-	mov         edx, [esp+32]
-	cmp         ebx, esi
-	jge near   not_dc_16x16_sad
-	cmp        ebx, ecx
-	jge near   not_dc_h_16x16_sad
-	; for DC mode
-	mov       dword[edx], 2;I16_PRED_DC
-	mov       eax, ebx
-    sub        edi, 192
-%assign x 0
-%rep 16
-    movdqa    [edi+16*x], xmm7
-%assign x x+1
-%endrep
-	jmp near return_sad_intra_16x16_x3
-not_dc_16x16_sad:
-	; for H mode
-	cmp       esi, ecx
-	jge near   not_dc_h_16x16_sad
-	mov       dword[edx], 1;I16_PRED_H
-	mov       eax, esi
-	jmp near return_sad_intra_16x16_x3
-not_dc_h_16x16_sad:
-	; for V mode
-	mov       dword[edx], 0;I16_PRED_V
-	mov       eax, ecx
-    sub       edi, 192
-%assign x 0
-%rep 16
-    movdqa    [edi+16*x], xmm5
-%assign x x+1
-%endrep
-return_sad_intra_16x16_x3:
-	pop    edi
-	pop    esi
-	pop    ebx
-	ret
-%endif
-;***********************************************************************
-;
-;Pixel_sad_intra_ssse3 END
-;
-;***********************************************************************
-;***********************************************************************
-;
-;Pixel_satd_wxh_sse41 BEGIN
-;
-;***********************************************************************
-
-;SSE4.1
-%macro SSE41_GetSatd8x4 0
-	movq             xmm0, [r0]
-	punpcklqdq       xmm0, xmm0
-	pmaddubsw        xmm0, xmm7
-	movq             xmm1, [r0+r1]
-	punpcklqdq       xmm1, xmm1
-	pmaddubsw        xmm1, xmm7
-	movq             xmm2, [r2]
-	punpcklqdq       xmm2, xmm2
-	pmaddubsw        xmm2, xmm7
-	movq             xmm3, [r2+r3]
-	punpcklqdq       xmm3, xmm3
-	pmaddubsw        xmm3, xmm7
-	psubsw           xmm0, xmm2
-	psubsw           xmm1, xmm3
-	movq             xmm2, [r0+2*r1]
-	punpcklqdq       xmm2, xmm2
-	pmaddubsw        xmm2, xmm7
-	movq             xmm3, [r0+r4]
-	punpcklqdq       xmm3, xmm3
-	pmaddubsw        xmm3, xmm7
-	movq             xmm4, [r2+2*r3]
-	punpcklqdq       xmm4, xmm4
-	pmaddubsw        xmm4, xmm7
-	movq             xmm5, [r2+r5]
-	punpcklqdq       xmm5, xmm5
-	pmaddubsw        xmm5, xmm7
-	psubsw           xmm2, xmm4
-	psubsw           xmm3, xmm5
-	SSE2_HDMTwo4x4   xmm0, xmm1, xmm2, xmm3, xmm4
-	pabsw            xmm0, xmm0
-	pabsw            xmm2, xmm2
-	pabsw            xmm1, xmm1
-	pabsw            xmm3, xmm3
-	movdqa           xmm4, xmm3
-	pblendw          xmm3, xmm1, 0xAA
-	pslld            xmm1, 16
-	psrld            xmm4, 16
-	por              xmm1, xmm4
-	pmaxuw           xmm1, xmm3
-	paddw            xmm6, xmm1
-	movdqa           xmm4, xmm0
-	pblendw          xmm0, xmm2, 0xAA
-	pslld            xmm2, 16
-	psrld            xmm4, 16
-	por              xmm2, xmm4
-	pmaxuw           xmm0, xmm2
-	paddw            xmm6, xmm0
-%endmacro
-
-%macro SSSE3_SumWHorizon 4 ;eax, srcSSE, tempSSE, tempSSE
-	MMX_DW_1_2REG    %3, %4
-	pmaddwd     %2, %3
-	movhlps     %4, %2
-	paddd       %2, %4
-	pshuflw     %4, %2,0Eh
-	paddd       %2, %4
-	movd		%1, %2
-%endmacro
-;***********************************************************************
-;
-;int32_t WelsSampleSatd4x4_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSatd4x4_sse41
-	%assign  push_num 0
-	LOAD_4_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
-	movdqa      xmm4,[HSwapSumSubDB1]
-	movd        xmm2,[r2]
-	movd        xmm5,[r2+r3]
-	shufps      xmm2,xmm5,0
-	movd        xmm3,[r2+r3*2]
-	lea         r2, [r3*2+r2]
-	movd        xmm5,[r2+r3]
-	shufps      xmm3,xmm5,0
-	movd        xmm0,[r0]
-	movd        xmm5,[r0+r1]
-	shufps      xmm0,xmm5,0
-	movd        xmm1,[r0+r1*2]
-	lea         r0, [r1*2+r0]
-	movd        xmm5,[r0+r1]
-	shufps      xmm1,xmm5,0
-	pmaddubsw   xmm0,xmm4
-	pmaddubsw   xmm1,xmm4
-	pmaddubsw   xmm2,xmm4
-	pmaddubsw   xmm3,xmm4
-	psubw       xmm0,xmm2
-	psubw       xmm1,xmm3
-	movdqa      xmm2,xmm0
-	paddw       xmm0,xmm1
-	psubw       xmm1,xmm2
-	movdqa      xmm2,xmm0
-	punpcklqdq  xmm0,xmm1
-	punpckhqdq  xmm2,xmm1
-	movdqa      xmm1,xmm0
-	paddw       xmm0,xmm2
-	psubw       xmm2,xmm1
-	movdqa      xmm1,xmm0
-	pblendw     xmm0,xmm2,0AAh
-	pslld       xmm2,16
-	psrld       xmm1,16
-	por         xmm2,xmm1
-	pabsw       xmm0,xmm0
-	pabsw       xmm2,xmm2
-	pmaxsw      xmm0,xmm2
-	SSSE3_SumWHorizon retrd, xmm0, xmm5, xmm7
-	POP_XMM
-	LOAD_4_PARA_POP
-	ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatd8x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSatd8x8_sse41
-%ifdef X86_32
-	push  r4
-	push  r5
-%endif
-	%assign  push_num 2
-	LOAD_4_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
-	movdqa      xmm7, [HSumSubDB1]
-	lea         r4,  [r1+r1*2]
-	lea         r5,  [r3+r3*2]
-	pxor		xmm6, xmm6
-	SSE41_GetSatd8x4
-	lea			r0,	 [r0+4*r1]
-	lea			r2,  [r2+4*r3]
-	SSE41_GetSatd8x4
-	SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
-	POP_XMM
-	LOAD_4_PARA_POP
-%ifdef X86_32
-	pop  r5
-	pop  r4
-%endif
-	ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatd8x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSatd8x16_sse41
-%ifdef X86_32
-	push  r4
-	push  r5
-	push  r6
-%endif
-	%assign  push_num 3
-	LOAD_4_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
-	movdqa      xmm7, [HSumSubDB1]
-	lea         r4,  [r1+r1*2]
-	lea         r5,  [r3+r3*2]
-	pxor        xmm6, xmm6
-	mov         r6,    0
-loop_get_satd_8x16:
-	SSE41_GetSatd8x4
-	lea			r0,  [r0+4*r1]
-	lea			r2,  [r2+4*r3]
-	inc         r6
-	cmp         r6,  4
-	jl          loop_get_satd_8x16
-	SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
-	POP_XMM
-	LOAD_4_PARA_POP
-%ifdef X86_32
-	pop  r6
-	pop  r5
-	pop  r4
-%endif
-	ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatd16x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSatd16x8_sse41
-%ifdef X86_32
-	push  r4
-	push  r5
-%endif
-	%assign  push_num 2
-	LOAD_4_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
-	push  r0
-	push  r2
-
-	movdqa      xmm7, [HSumSubDB1]
-	lea         r4,  [r1+r1*2]
-	lea         r5,  [r3+r3*2]
-	pxor		xmm6,   xmm6
-	SSE41_GetSatd8x4
-	lea			r0,  [r0+4*r1]
-	lea			r2,  [r2+4*r3]
-	SSE41_GetSatd8x4
-
-	pop  r2
-	pop  r0
-	add			r0,    8
-	add			r2,    8
-	SSE41_GetSatd8x4
-	lea			r0,  [r0+4*r1]
-	lea			r2,  [r2+4*r3]
-	SSE41_GetSatd8x4
-	SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
-	POP_XMM
-	LOAD_4_PARA_POP
-%ifdef X86_32
-	pop  r5
-	pop  r4
-%endif
-	ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatd16x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
-;
-;***********************************************************************
-
-WELS_EXTERN WelsSampleSatd16x16_sse41
-%ifdef X86_32
-	push  r4
-	push  r5
-	push  r6
-%endif
-	%assign  push_num 3
-	LOAD_4_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
-
-	push  r0
-	push  r2
-
-	movdqa      xmm7, [HSumSubDB1]
-	lea         r4,  [r1+r1*2]
-	lea         r5,  [r3+r3*2]
-	pxor		xmm6,   xmm6
-	mov         r6,    0
-loop_get_satd_16x16_left:
-	SSE41_GetSatd8x4
-	lea			r0,  [r0+4*r1]
-	lea			r2,  [r2+4*r3]
-	inc         r6
-	cmp         r6,  4
-	jl          loop_get_satd_16x16_left
-
-	pop  r2
-	pop  r0
-	add			r0,    8
-	add			r2,    8
-	mov         r6,    0
-loop_get_satd_16x16_right:
-	SSE41_GetSatd8x4
-	lea			r0,  [r0+4*r1]
-	lea			r2,  [r2+4*r3]
-	inc         r6
-	cmp         r6,  4
-	jl          loop_get_satd_16x16_right
-	SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
-	POP_XMM
-	LOAD_4_PARA_POP
-%ifdef X86_32
-	pop  r6
-	pop  r5
-	pop  r4
-%endif
-	ret
-
-;***********************************************************************
-;
-;Pixel_satd_wxh_sse41 END
-;
-;***********************************************************************
-
-;***********************************************************************
-;
-;Pixel_sad_wxh_sse2 BEGIN
-;
-;***********************************************************************
-
-%macro SSE2_GetSad2x16 0
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqu xmm1,   [r2]
-	MOVDQ  xmm2,   [r0];[eax] must aligned 16
-	psadbw xmm1,   xmm2
-	paddw  xmm0,   xmm1
-	movdqu xmm1,   [r2+r3]
-	MOVDQ  xmm2,   [r0+r1]
-	psadbw xmm1,   xmm2
-	paddw  xmm0,   xmm1
-%endmacro
-
-
-%macro SSE2_GetSad4x16 0
-	movdqu xmm0,   [r2]
-	MOVDQ  xmm2,   [r0]
-	psadbw xmm0,   xmm2
-	paddw  xmm7,   xmm0
-	movdqu xmm1,   [r2+r3]
-	MOVDQ  xmm2,   [r0+r1]
-	psadbw xmm1,   xmm2
-	paddw  xmm7,   xmm1
-	movdqu xmm1,   [r2+2*r3]
-	MOVDQ  xmm2,   [r0+2*r1];[eax] must aligned 16
-	psadbw xmm1,   xmm2
-	paddw  xmm7,   xmm1
-	movdqu xmm1,   [r2+r5]
-	MOVDQ  xmm2,   [r0+r4]
-	psadbw xmm1,   xmm2
-	paddw  xmm7,   xmm1
-%endmacro
-
-
-%macro SSE2_GetSad8x4 0
-	movq   xmm0,   [r0]
-	movq   xmm1,   [r0+r1]
-	lea    r0,     [r0+2*r1]
-	movhps xmm0,   [r0]
-	movhps xmm1,   [r0+r1]
-
-	movq   xmm2,   [r2]
-	movq   xmm3,   [r2+r3]
-	lea    r2,     [r2+2*r3]
-	movhps xmm2,   [r2]
-	movhps xmm3,   [r2+r3]
-	psadbw xmm0,   xmm2
-	psadbw xmm1,   xmm3
-	paddw  xmm6,   xmm0
-	paddw  xmm6,   xmm1
-%endmacro
-
-;***********************************************************************
-;
-;int32_t WelsSampleSad16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t, )
-;First parameter can align to 16 bytes,
-;In wels, the third parameter can't align to 16 bytes.
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSad16x16_sse2
-%ifdef X86_32
-	push  r4
-	push  r5
-%endif
-
-	%assign  push_num 2
-	LOAD_4_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
-	lea r4, [3*r1]
-	lea r5, [3*r3]
-
-	pxor   xmm7,   xmm7
-	SSE2_GetSad4x16
-	lea	   r0,  [r0+4*r1]
-	lea	   r2,  [r2+4*r3]
-	SSE2_GetSad4x16
-	lea	   r0,  [r0+4*r1]
-	lea	   r2,  [r2+4*r3]
-	SSE2_GetSad4x16
-	lea	   r0,  [r0+4*r1]
-	lea	   r2,  [r2+4*r3]
-	SSE2_GetSad4x16
-	movhlps xmm0, xmm7
-	paddw xmm0, xmm7
-	movd retrd, xmm0
-	POP_XMM
-	LOAD_4_PARA_POP
-%ifdef X86_32
-	pop  r5
-	pop  r4
-%endif
-	ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSad16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, )
-;First parameter can align to 16 bytes,
-;In wels, the third parameter can't align to 16 bytes.
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSad16x8_sse2
-	%assign  push_num 0
-	LOAD_4_PARA
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
-	movdqu xmm0,   [r2]
-	MOVDQ  xmm2,   [r0]
-	psadbw xmm0,   xmm2
-	movdqu xmm1,   [r2+r3]
-	MOVDQ  xmm2,   [r0+r1]
-	psadbw xmm1,   xmm2
-	paddw  xmm0,   xmm1
-
-	SSE2_GetSad2x16
-	SSE2_GetSad2x16
-	SSE2_GetSad2x16
-
-	movhlps     xmm1, xmm0
-	paddw       xmm0, xmm1
-	movd        retrd,  xmm0
-	LOAD_4_PARA_POP
-	ret
-
-
-
-WELS_EXTERN WelsSampleSad8x16_sse2
-	%assign  push_num 0
-	LOAD_4_PARA
-	PUSH_XMM 7
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
-    pxor   xmm6,   xmm6
-
-	SSE2_GetSad8x4
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-    SSE2_GetSad8x4
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	SSE2_GetSad8x4
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-    SSE2_GetSad8x4
-
-    movhlps    xmm0, xmm6
-	paddw      xmm0, xmm6
-	movd       retrd,  xmm0
-	POP_XMM
-	LOAD_4_PARA_POP
-	ret
-
-
-%macro CACHE_SPLIT_CHECK 3 ; address, width, cacheline
-and    %1,  0x1f|(%3>>1)
-cmp    %1,  (32-%2)|(%3>>1)
-%endmacro
-
-WELS_EXTERN WelsSampleSad8x8_sse21
-	%assign  push_num 0
-	mov		r2,  arg3
-	push	r2
-	CACHE_SPLIT_CHECK r2, 8, 64
-	jle    near   .pixel_sad_8x8_nsplit
-	pop		r2
-%ifdef X86_32
-	push	r3
-	push	r4
-	push	r5
-%endif
-	%assign  push_num 3
-	PUSH_XMM 8
-	mov		r0,  arg1
-	mov		r1,  arg2
-	SIGN_EXTENSION r1, r1d
-    pxor   xmm7,   xmm7
-
-    ;ecx r2, edx r4, edi r5
-
-    mov    r5,    r2
-    and    r5,    0x07
-    sub    r2,    r5
-    mov    r4,    8
-    sub    r4,    r5
-
-    shl    r5,    3
-    shl    r4,    3
-    movd   xmm5,   r5d
-    movd   xmm6,   r4d
-	mov    r5,    8
-	add    r5,    r2
-    mov    r3,    arg4
-	SIGN_EXTENSION r3, r3d
-    movq   xmm0,   [r0]
-	movhps xmm0,   [r0+r1]
-
-	movq   xmm1,   [r2]
-	movq   xmm2,   [r5]
-	movhps xmm1,   [r2+r3]
-	movhps xmm2,   [r5+r3]
-	psrlq  xmm1,   xmm5
-	psllq  xmm2,   xmm6
-	por    xmm1,   xmm2
-
-	psadbw xmm0,   xmm1
-	paddw  xmm7,   xmm0
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	lea    r5,    [r5+2*r3]
-
-    movq   xmm0,   [r0]
-	movhps xmm0,   [r0+r1]
-
-	movq   xmm1,   [r2]
-	movq   xmm2,   [r5]
-	movhps xmm1,   [r2+r3]
-	movhps xmm2,   [r5+r3]
-	psrlq  xmm1,   xmm5
-	psllq  xmm2,   xmm6
-	por    xmm1,   xmm2
-
-	psadbw xmm0,   xmm1
-	paddw  xmm7,   xmm0
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	lea    r5,    [r5+2*r3]
-
-    movq   xmm0,   [r0]
-	movhps xmm0,   [r0+r1]
-
-	movq   xmm1,   [r2]
-	movq   xmm2,   [r5]
-	movhps xmm1,   [r2+r3]
-	movhps xmm2,   [r5+r3]
-	psrlq  xmm1,   xmm5
-	psllq  xmm2,   xmm6
-	por    xmm1,   xmm2
-
-	psadbw xmm0,   xmm1
-	paddw  xmm7,   xmm0
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	lea    r5,    [r5+2*r3]
-
-    movq   xmm0,   [r0]
-	movhps xmm0,   [r0+r1]
-
-	movq   xmm1,   [r2]
-	movq   xmm2,   [r5]
-	movhps xmm1,   [r2+r3]
-	movhps xmm2,   [r5+r3]
-	psrlq  xmm1,   xmm5
-	psllq  xmm2,   xmm6
-	por    xmm1,   xmm2
-
-	psadbw xmm0,   xmm1
-	paddw  xmm7,   xmm0
-
-    movhlps    xmm0, xmm7
-	paddw      xmm0, xmm7
-	movd       retrd,  xmm0
-	POP_XMM
-%ifdef X86_32
-	pop	 r5
-	pop	 r4
-	pop	 r3
-%endif
-	jmp        .return
-
-.pixel_sad_8x8_nsplit:
-
-	pop r2
-	%assign  push_num 0
-	LOAD_4_PARA
-	PUSH_XMM 7
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
-	pxor   xmm6,   xmm6
-	SSE2_GetSad8x4
-    lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-    SSE2_GetSad8x4
-    movhlps    xmm0, xmm6
-	paddw      xmm0, xmm6
-	movd       retrd,  xmm0
-	POP_XMM
-	LOAD_4_PARA_POP
-.return:
-	ret
-
-
-;***********************************************************************
-;
-;Pixel_sad_wxh_sse2 END
-;
-;***********************************************************************
-
-
-;***********************************************************************
-;
-;Pixel_sad_4_wxh_sse2 BEGIN
-;
-;***********************************************************************
-
-
-%macro SSE2_Get4LW16Sad 5 ;s-1l, s, s+1l, d, address
-	psadbw %1,   %4
-	paddw  xmm5, %1
-	psadbw %4,   %3
-	paddw  xmm4, %4
-	movdqu %4,   [%5-1]
-	psadbw %4,   %2
-	paddw  xmm6, %4
-	movdqu %4,   [%5+1]
-	psadbw %4,   %2
-	paddw  xmm7, %4
-%endmacro
-WELS_EXTERN WelsSampleSadFour16x16_sse2
-	%assign  push_num 0
-	LOAD_5_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
-	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
-	pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
-	pxor   xmm6,   xmm6    ;sad pRefMb-1
-	pxor   xmm7,   xmm7    ;sad pRefMb+1
-	movdqa xmm0,   [r0]
-	sub    r2,    r3
-	movdqu xmm3,   [r2]
-	psadbw xmm3,   xmm0
-	paddw  xmm4,   xmm3
-
-	movdqa xmm1,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	psadbw xmm3,   xmm1
-	paddw  xmm4,   xmm3
-
-	movdqu xmm2,   [r2+r3-1]
-	psadbw xmm2,   xmm0
-	paddw  xmm6,   xmm2
-
-	movdqu xmm3,   [r2+r3+1]
-	psadbw xmm3,   xmm0
-	paddw  xmm7,   xmm3
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqa xmm2,   [r0]
-	movdqu xmm3,   [r2]
-	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
-	movdqa xmm0,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqa xmm1,   [r0]
-	movdqu xmm3,   [r2]
-	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
-	movdqa xmm2,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqa xmm0,   [r0]
-	movdqu xmm3,   [r2]
-	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
-	movdqa xmm1,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqa xmm2,   [r0]
-	movdqu xmm3,   [r2]
-	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
-	movdqa xmm0,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqa xmm1,   [r0]
-	movdqu xmm3,   [r2]
-	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
-	movdqa xmm2,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqa xmm0,   [r0]
-	movdqu xmm3,   [r2]
-	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
-	movdqa xmm1,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqa xmm2,   [r0]
-	movdqu xmm3,   [r2]
-	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
-	movdqa xmm0,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
-	lea    r2,    [r2+2*r3]
-	movdqu xmm3,   [r2]
-	psadbw xmm2,   xmm3
-	paddw xmm5,   xmm2
-
-	movdqu xmm2,   [r2-1]
-	psadbw xmm2,   xmm0
-	paddw xmm6,   xmm2
-
-	movdqu xmm3,   [r2+1]
-	psadbw xmm3,   xmm0
-	paddw xmm7,   xmm3
-
-	movdqu xmm3,   [r2+r3]
-	psadbw xmm0,   xmm3
-	paddw xmm5,   xmm0
-
-	movhlps    xmm0, xmm4
-	paddw      xmm4, xmm0
-	movhlps    xmm0, xmm5
-	paddw      xmm5, xmm0
-	movhlps    xmm0, xmm6
-	paddw      xmm6, xmm0
-	movhlps    xmm0, xmm7
-	paddw      xmm7, xmm0
-	punpckldq  xmm4, xmm5
-	punpckldq  xmm6, xmm7
-	punpcklqdq xmm4, xmm6
-	movdqa     [r4],xmm4
-	POP_XMM
-	LOAD_5_PARA_POP
-	ret
-
-
-WELS_EXTERN WelsSampleSadFour16x8_sse2
-	%assign  push_num 0
-	LOAD_5_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
-	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
-	pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
-	pxor   xmm6,   xmm6    ;sad pRefMb-1
-	pxor   xmm7,   xmm7    ;sad pRefMb+1
-	movdqa xmm0,   [r0]
-	sub    r2,    r3
-	movdqu xmm3,   [r2]
-	psadbw xmm3,   xmm0
-	paddw xmm4,   xmm3
-
-	movdqa xmm1,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	psadbw xmm3,   xmm1
-	paddw xmm4,   xmm3
-
-	movdqu xmm2,   [r2+r3-1]
-	psadbw xmm2,   xmm0
-	paddw xmm6,   xmm2
-
-	movdqu xmm3,   [r2+r3+1]
-	psadbw xmm3,   xmm0
-	paddw xmm7,   xmm3
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqa xmm2,   [r0]
-	movdqu xmm3,   [r2]
-	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
-	movdqa xmm0,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqa xmm1,   [r0]
-	movdqu xmm3,   [r2]
-	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
-	movdqa xmm2,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movdqa xmm0,   [r0]
-	movdqu xmm3,   [r2]
-	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
-	movdqa xmm1,   [r0+r1]
-	movdqu xmm3,   [r2+r3]
-	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
-	lea    r2,    [r2+2*r3]
-	movdqu xmm3,   [r2]
-	psadbw xmm0,   xmm3
-	paddw xmm5,   xmm0
-
-	movdqu xmm0,   [r2-1]
-	psadbw xmm0,   xmm1
-	paddw xmm6,   xmm0
-
-	movdqu xmm3,   [r2+1]
-	psadbw xmm3,   xmm1
-	paddw xmm7,   xmm3
-
-	movdqu xmm3,   [r2+r3]
-	psadbw xmm1,   xmm3
-	paddw xmm5,   xmm1
-
-	movhlps    xmm0, xmm4
-	paddw      xmm4, xmm0
-	movhlps    xmm0, xmm5
-	paddw      xmm5, xmm0
-	movhlps    xmm0, xmm6
-	paddw      xmm6, xmm0
-	movhlps    xmm0, xmm7
-	paddw      xmm7, xmm0
-	punpckldq  xmm4, xmm5
-	punpckldq  xmm6, xmm7
-	punpcklqdq xmm4, xmm6
-	movdqa     [r4],xmm4
-	POP_XMM
-	LOAD_5_PARA_POP
-	ret
-
-WELS_EXTERN WelsSampleSadFour8x16_sse2
-	%assign  push_num 0
-	LOAD_5_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
-	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
-	pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
-	pxor   xmm6,   xmm6    ;sad pRefMb-1
-	pxor   xmm7,   xmm7    ;sad pRefMb+1
-	movq   xmm0,   [r0]
-	movhps xmm0,   [r0+r1]
-	sub    r2,    r3
-	movq   xmm3,   [r2]
-	movhps xmm3,   [r2+r3]
-	psadbw xmm3,   xmm0
-	paddw  xmm4,   xmm3
-
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
-
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
-
-	movq   xmm0,  [r0]
-	movhps xmm0,  [r0+r1]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
-
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
-
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
-
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
-
-	movq   xmm0,  [r0]
-	movhps xmm0,  [r0+r1]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
-
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
-
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
-
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
-
-	movq   xmm0,  [r0]
-	movhps xmm0,  [r0+r1]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
-
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
-
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
-
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
-
-	movq   xmm0,  [r0]
-	movhps xmm0,  [r0+r1]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
-
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
-
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
-
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
-
-	movq   xmm0,  [r0]
-	movhps xmm0,  [r0+r1]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
-
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
-
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
-
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
-
-	movq   xmm0,  [r0]
-	movhps xmm0,  [r0+r1]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
-
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
-
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
-
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
-
-	movq   xmm0,  [r0]
-	movhps xmm0,  [r0+r1]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
-
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
-
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
-
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
-
-	movhlps    xmm0, xmm4
-	paddw      xmm4, xmm0
-	movhlps    xmm0, xmm5
-	paddw      xmm5, xmm0
-	movhlps    xmm0, xmm6
-	paddw      xmm6, xmm0
-	movhlps    xmm0, xmm7
-	paddw      xmm7, xmm0
-	punpckldq  xmm4, xmm5
-	punpckldq  xmm6, xmm7
-	punpcklqdq xmm4, xmm6
-	movdqa     [r4],xmm4
-	POP_XMM
-	LOAD_5_PARA_POP
-	ret
-
-
-WELS_EXTERN WelsSampleSadFour8x8_sse2
-	%assign  push_num 0
-	LOAD_5_PARA
-	PUSH_XMM 8
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
-	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
-	pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
-	pxor   xmm6,   xmm6    ;sad pRefMb-1
-	pxor   xmm7,   xmm7    ;sad pRefMb+1
-	movq   xmm0,   [r0]
-	movhps xmm0,   [r0+r1]
-	sub    r2,    r3
-	movq   xmm3,   [r2]
-	movhps xmm3,   [r2+r3]
-	psadbw xmm3,   xmm0
-	paddw  xmm4,   xmm3
-
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
-
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
-
-	movq   xmm0,  [r0]
-	movhps xmm0,  [r0+r1]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
-
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
-
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
-
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
-
-	movq   xmm0,  [r0]
-	movhps xmm0,  [r0+r1]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
-
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
-
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
-
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
-
-	movq   xmm0,  [r0]
-	movhps xmm0,  [r0+r1]
-	psadbw xmm3,  xmm0
-	paddw  xmm4,  xmm3
-
-
-	movq   xmm1,  [r2+r3-1]
-	movq   xmm3,  [r2+r3+1]
-
-	lea    r0,    [r0+2*r1]
-	lea    r2,    [r2+2*r3]
-	movhps xmm1,  [r2-1]
-	movhps xmm3,  [r2+1]
-
-	psadbw xmm1,  xmm0
-	paddw  xmm6,  xmm1
-	psadbw xmm3,  xmm0
-	paddw  xmm7,  xmm3
-
-	movq   xmm3,  [r2]
-	movhps xmm3,  [r2+r3]
-	psadbw xmm0,  xmm3
-	paddw  xmm5,  xmm0
-
-	movhlps    xmm0, xmm4
-	paddw      xmm4, xmm0
-	movhlps    xmm0, xmm5
-	paddw      xmm5, xmm0
-	movhlps    xmm0, xmm6
-	paddw      xmm6, xmm0
-	movhlps    xmm0, xmm7
-	paddw      xmm7, xmm0
-	punpckldq  xmm4, xmm5
-	punpckldq  xmm6, xmm7
-	punpcklqdq xmm4, xmm6
-	movdqa     [r4],xmm4
-	POP_XMM
-	LOAD_5_PARA_POP
-	ret
-
-WELS_EXTERN WelsSampleSadFour4x4_sse2
-	%assign  push_num 0
-	LOAD_5_PARA
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
-	movd   xmm0,   [r0]
-	movd   xmm1,   [r0+r1]
-	lea        r0,    [r0+2*r1]
-	movd       xmm2,   [r0]
-	movd       xmm3,   [r0+r1]
-	punpckldq  xmm0, xmm1
-	punpckldq  xmm2, xmm3
-	punpcklqdq xmm0, xmm2
-	sub        r2,  r3
-	movd       xmm1, [r2]
-	movd       xmm2, [r2+r3]
-	punpckldq  xmm1, xmm2
-	movd       xmm2, [r2+r3-1]
-	movd       xmm3, [r2+r3+1]
-
-	lea        r2,  [r2+2*r3]
-
-	movd       xmm4, [r2]
-	movd       xmm5, [r2-1]
-	punpckldq  xmm2, xmm5
-	movd       xmm5, [r2+1]
-	punpckldq  xmm3, xmm5
-
-	movd       xmm5, [r2+r3]
-	punpckldq  xmm4, xmm5
-
-	punpcklqdq xmm1, xmm4 ;-L
-
-	movd       xmm5, [r2+r3-1]
-	movd       xmm6, [r2+r3+1]
-
-	lea        r2,  [r2+2*r3]
-	movd       xmm7, [r2-1]
-	punpckldq  xmm5, xmm7
-	punpcklqdq xmm2, xmm5 ;-1
-	movd       xmm7, [r2+1]
-	punpckldq  xmm6, xmm7
-	punpcklqdq xmm3, xmm6 ;+1
-	movd       xmm6, [r2]
-	movd       xmm7, [r2+r3]
-	punpckldq  xmm6, xmm7
-	punpcklqdq xmm4, xmm6 ;+L
-	psadbw     xmm1, xmm0
-	psadbw     xmm2, xmm0
-	psadbw     xmm3, xmm0
-	psadbw     xmm4, xmm0
-
-	movhlps    xmm0, xmm1
-	paddw      xmm1, xmm0
-	movhlps    xmm0, xmm2
-	paddw      xmm2, xmm0
-	movhlps    xmm0, xmm3
-	paddw      xmm3, xmm0
-	movhlps    xmm0, xmm4
-	paddw      xmm4, xmm0
-	punpckldq  xmm1, xmm4
-	punpckldq  xmm2, xmm3
-	punpcklqdq xmm1, xmm2
-	movdqa     [r4],xmm1
-	LOAD_5_PARA_POP
-	ret
-
-;***********************************************************************
-;
-;Pixel_sad_4_wxh_sse2 END
-;
-;***********************************************************************
-
-;***********************************************************************
-;   int32_t WelsSampleSad4x4_mmx (uint8_t *, int32_t, uint8_t *, int32_t )
-;***********************************************************************
-WELS_EXTERN WelsSampleSad4x4_mmx
-    %assign  push_num 0
-	LOAD_4_PARA
-	SIGN_EXTENSION r1, r1d
-	SIGN_EXTENSION r3, r3d
-	movd	  mm0, [r0]
-	movd	  mm1, [r0+r1]
-	punpckldq mm0, mm1
-
-	movd      mm3, [r2]
-	movd      mm4, [r2+r3]
-	punpckldq mm3, mm4
-	psadbw    mm0, mm3
-
-	lea       r0, [r0+2*r1]
-	lea       r2, [r2+2*r3]
-
-	movd      mm1, [r0]
-	movd      mm2, [r0+r1]
-	punpckldq mm1, mm2
-
-	movd      mm3, [r2]
-	movd      mm4, [r2+r3]
-	punpckldq mm3, mm4
-	psadbw    mm1, mm3
-	paddw     mm0, mm1
-
-    movd      retrd, mm0
-
-	WELSEMMS
-    LOAD_4_PARA_POP
-    ret
--- /dev/null
+++ b/codec/common/src/WelsThreadLib.cpp
@@ -1,0 +1,473 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	WelsThreadLib.c
+ *
+ * \brief	Interfaces introduced in thread programming
+ *
+ * \date	11/17/2009 Created
+ *
+ *************************************************************************************
+ */
+
+
+#ifdef LINUX
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include <sched.h>
+#elif !defined(_WIN32)
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#include <sys/param.h>
+#include <unistd.h>
+#ifdef __APPLE__
+#define HW_NCPU_NAME "hw.logicalcpu"
+#else
+#define HW_NCPU_NAME "hw.ncpu"
+#endif
+#endif
+#ifdef ANDROID_NDK
+#include <cpu-features.h>
+#endif
+
+#include "WelsThreadLib.h"
+#include <stdio.h>
+#include <stdlib.h>
+
+
+#ifdef  _WIN32
+
+#ifdef WINAPI_FAMILY
+#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
+#define InitializeCriticalSection(x) InitializeCriticalSectionEx(x, 0, 0)
+#endif
+#endif
+
+WELS_THREAD_ERROR_CODE    WelsMutexInit (WELS_MUTEX*    mutex) {
+  InitializeCriticalSection (mutex);
+
+  return WELS_THREAD_ERROR_OK;
+}
+
+WELS_THREAD_ERROR_CODE    WelsMutexLock (WELS_MUTEX*    mutex) {
+  EnterCriticalSection (mutex);
+
+  return WELS_THREAD_ERROR_OK;
+}
+
+WELS_THREAD_ERROR_CODE    WelsMutexUnlock (WELS_MUTEX* mutex) {
+  LeaveCriticalSection (mutex);
+
+  return WELS_THREAD_ERROR_OK;
+}
+
+WELS_THREAD_ERROR_CODE    WelsMutexDestroy (WELS_MUTEX* mutex) {
+  DeleteCriticalSection (mutex);
+
+  return WELS_THREAD_ERROR_OK;
+}
+
+#else /* _WIN32 */
+
+WELS_THREAD_ERROR_CODE    WelsMutexInit (WELS_MUTEX*    mutex) {
+  return pthread_mutex_init (mutex, NULL);
+}
+
+WELS_THREAD_ERROR_CODE    WelsMutexLock (WELS_MUTEX*    mutex) {
+  return pthread_mutex_lock (mutex);
+}
+
+WELS_THREAD_ERROR_CODE    WelsMutexUnlock (WELS_MUTEX* mutex) {
+  return pthread_mutex_unlock (mutex);
+}
+
+WELS_THREAD_ERROR_CODE    WelsMutexDestroy (WELS_MUTEX* mutex) {
+  return pthread_mutex_destroy (mutex);
+}
+
+#endif /* !_WIN32 */
+
+
+#ifdef MT_ENABLED
+
+#ifdef _WIN32
+
+void WelsSleep (uint32_t dwMilliseconds) {
+  Sleep (dwMilliseconds);
+}
+
+WELS_THREAD_ERROR_CODE    WelsEventOpen (WELS_EVENT* event, const char* event_name) {
+  WELS_EVENT   h = CreateEvent (NULL, FALSE, FALSE, NULL);
+
+  if (h == NULL) {
+    return WELS_THREAD_ERROR_GENERAL;
+  }
+  *event = h;
+  return WELS_THREAD_ERROR_OK;
+}
+
+WELS_THREAD_ERROR_CODE    WelsEventSignal (WELS_EVENT* event) {
+  if (SetEvent (*event)) {
+    return WELS_THREAD_ERROR_OK;
+  }
+  return WELS_THREAD_ERROR_GENERAL;
+}
+
+WELS_THREAD_ERROR_CODE    WelsEventWait (WELS_EVENT* event) {
+  return WaitForSingleObject (*event, INFINITE);
+}
+
+WELS_THREAD_ERROR_CODE    WelsEventWaitWithTimeOut (WELS_EVENT* event, uint32_t dwMilliseconds) {
+  return WaitForSingleObject (*event, dwMilliseconds);
+}
+
+WELS_THREAD_ERROR_CODE    WelsMultipleEventsWaitSingleBlocking (uint32_t nCount,
+    WELS_EVENT* event_list, WELS_EVENT* master_event) {
+  // Don't need/use the master event for anything, since windows has got WaitForMultipleObjects
+  return WaitForMultipleObjects (nCount, event_list, FALSE, INFINITE);
+}
+
+WELS_THREAD_ERROR_CODE    WelsMultipleEventsWaitAllBlocking (uint32_t nCount,
+    WELS_EVENT* event_list, WELS_EVENT* master_event) {
+  // Don't need/use the master event for anything, since windows has got WaitForMultipleObjects
+  return WaitForMultipleObjects (nCount, event_list, TRUE, INFINITE);
+}
+
+WELS_THREAD_ERROR_CODE    WelsEventClose (WELS_EVENT* event, const char* event_name) {
+  CloseHandle (*event);
+
+  *event = NULL;
+  return WELS_THREAD_ERROR_OK;
+}
+
+
+WELS_THREAD_ERROR_CODE    WelsThreadCreate (WELS_THREAD_HANDLE* thread,  LPWELS_THREAD_ROUTINE  routine,
+    void* arg, WELS_THREAD_ATTR attr) {
+  WELS_THREAD_HANDLE   h = CreateThread (NULL, 0, routine, arg, 0, NULL);
+
+  if (h == NULL) {
+    return WELS_THREAD_ERROR_GENERAL;
+  }
+  * thread = h;
+
+  return WELS_THREAD_ERROR_OK;
+}
+
+WELS_THREAD_ERROR_CODE    WelsThreadJoin (WELS_THREAD_HANDLE  thread) {
+  WaitForSingleObject (thread, INFINITE);
+  CloseHandle (thread);
+
+  return WELS_THREAD_ERROR_OK;
+}
+
+
+WELS_THREAD_HANDLE        WelsThreadSelf() {
+  return GetCurrentThread();
+}
+
+WELS_THREAD_ERROR_CODE    WelsQueryLogicalProcessInfo (WelsLogicalProcessInfo* pInfo) {
+  SYSTEM_INFO  si;
+
+  GetSystemInfo (&si);
+
+  pInfo->ProcessorCount = si.dwNumberOfProcessors;
+
+  return WELS_THREAD_ERROR_OK;
+}
+
+#else
+
+void WelsSleep (uint32_t dwMilliseconds) {
+  usleep (dwMilliseconds * 1000);	// microseconds
+}
+
+WELS_THREAD_ERROR_CODE    WelsThreadCreate (WELS_THREAD_HANDLE* thread,  LPWELS_THREAD_ROUTINE  routine,
+    void* arg, WELS_THREAD_ATTR attr) {
+  WELS_THREAD_ERROR_CODE err = 0;
+
+  pthread_attr_t at;
+  err = pthread_attr_init (&at);
+  if (err)
+    return err;
+#ifndef __ANDROID__
+  err = pthread_attr_setscope (&at, PTHREAD_SCOPE_SYSTEM);
+  if (err)
+    return err;
+  err = pthread_attr_setschedpolicy (&at, SCHED_FIFO);
+  if (err)
+    return err;
+#endif
+  err = pthread_create (thread, &at, routine, arg);
+
+  pthread_attr_destroy (&at);
+
+  return err;
+}
+
+WELS_THREAD_ERROR_CODE    WelsThreadJoin (WELS_THREAD_HANDLE  thread) {
+  return pthread_join (thread, NULL);
+}
+
+WELS_THREAD_HANDLE        WelsThreadSelf() {
+  return pthread_self();
+}
+
+// unnamed semaphores aren't supported on OS X
+
+WELS_THREAD_ERROR_CODE    WelsEventOpen (WELS_EVENT* p_event, const char* event_name) {
+#ifdef __APPLE__
+  if (p_event == NULL || event_name == NULL)
+    return WELS_THREAD_ERROR_GENERAL;
+  *p_event = sem_open (event_name, O_CREAT, (S_IRUSR | S_IWUSR)/*0600*/, 0);
+  if (*p_event == (sem_t*)SEM_FAILED) {
+    sem_unlink (event_name);
+    *p_event = NULL;
+    return WELS_THREAD_ERROR_GENERAL;
+  } else {
+    return WELS_THREAD_ERROR_OK;
+  }
+#else
+  WELS_EVENT event = (WELS_EVENT) malloc(sizeof(*event));
+  if (event == NULL)
+    return WELS_THREAD_ERROR_GENERAL;
+  WELS_THREAD_ERROR_CODE err = sem_init(event, 0, 0);
+  if (!err) {
+    *p_event = event;
+    return err;
+  }
+  free(event);
+  return err;
+#endif
+}
+WELS_THREAD_ERROR_CODE    WelsEventClose (WELS_EVENT* event, const char* event_name) {
+#ifdef __APPLE__
+  WELS_THREAD_ERROR_CODE err = sem_close (*event);	// match with sem_open
+  if (event_name)
+    sem_unlink (event_name);
+  return err;
+#else
+  WELS_THREAD_ERROR_CODE err = sem_destroy (*event);	// match with sem_init
+  free(*event);
+  return err;
+#endif
+}
+
+WELS_THREAD_ERROR_CODE   WelsEventSignal (WELS_EVENT* event) {
+  WELS_THREAD_ERROR_CODE err = 0;
+//	int32_t val = 0;
+//	sem_getvalue(event, &val);
+//	fprintf( stderr, "before signal it, val= %d..\n",val );
+  err = sem_post (*event);
+//	sem_getvalue(event, &val);
+//	fprintf( stderr, "after signal it, val= %d..\n",val );
+  return err;
+}
+
+WELS_THREAD_ERROR_CODE   WelsEventWait (WELS_EVENT* event) {
+  return sem_wait (*event);	// blocking until signaled
+}
+
+WELS_THREAD_ERROR_CODE    WelsEventWaitWithTimeOut (WELS_EVENT* event, uint32_t dwMilliseconds) {
+  if (dwMilliseconds != (uint32_t) - 1) {
+    return sem_wait (*event);
+  } else {
+#if defined(__APPLE__)
+    int32_t err = 0;
+    int32_t wait_count = 0;
+    do {
+      err = sem_trywait (*event);
+      if (WELS_THREAD_ERROR_OK == err)
+        break;// WELS_THREAD_ERROR_OK;
+      else if (wait_count > 0)
+        break;
+      usleep (dwMilliseconds * 1000);
+      ++ wait_count;
+    } while (1);
+    return err;
+#else
+    struct timespec ts;
+    struct timeval tv;
+
+    gettimeofday (&tv, 0);
+
+    ts.tv_nsec = tv.tv_usec * 1000 + dwMilliseconds * 1000000;
+    ts.tv_sec = tv.tv_sec + ts.tv_nsec / 1000000000;
+    ts.tv_nsec %= 1000000000;
+
+    return sem_timedwait (*event, &ts);
+#endif//__APPLE__
+  }
+}
+
+WELS_THREAD_ERROR_CODE    WelsMultipleEventsWaitSingleBlocking (uint32_t nCount,
+    WELS_EVENT* event_list, WELS_EVENT* master_event) {
+  uint32_t nIdx = 0;
+  uint32_t uiAccessTime = 2;	// 2 us once
+
+  if (nCount == 0)
+    return WELS_THREAD_ERROR_WAIT_FAILED;
+
+  if (master_event != NULL) {
+    // This design relies on the events actually being semaphores;
+    // if multiple events in the list have been signalled, the master
+    // event should have a similar count (events in windows can't keep
+    // track of the actual count, but the master event isn't needed there
+    // since it uses WaitForMultipleObjects).
+    int32_t err = sem_wait (*master_event);
+    if (err != WELS_THREAD_ERROR_OK)
+      return err;
+    uiAccessTime = 0; // no blocking, just quickly loop through all to find the one that was signalled
+  }
+
+  while (1) {
+    nIdx = 0;	// access each event by order
+    while (nIdx < nCount) {
+      int32_t err = 0;
+      int32_t wait_count = 0;
+
+      /*
+       * although such interface is not used in __GNUC__ like platform, to use
+       * pthread_cond_timedwait() might be better choice if need
+       */
+      do {
+        err = sem_trywait (event_list[nIdx]);
+        if (WELS_THREAD_ERROR_OK == err)
+          return WELS_THREAD_ERROR_WAIT_OBJECT_0 + nIdx;
+        else if (wait_count > 0 || uiAccessTime == 0)
+          break;
+        usleep (uiAccessTime);
+        ++ wait_count;
+      } while (1);
+      // we do need access next event next time
+      ++ nIdx;
+    }
+    usleep (1);	// switch to working threads
+    if (master_event != NULL) {
+      // A master event was used and was signalled, but none of the events in the
+      // list was found to be signalled, thus wait a little more when rechecking
+      // the list to avoid busylooping here.
+      // If we ever hit this codepath it's mostly a bug in the code that signals
+      // the events.
+      uiAccessTime = 2;
+    }
+  }
+
+  return WELS_THREAD_ERROR_WAIT_FAILED;
+}
+
+WELS_THREAD_ERROR_CODE    WelsMultipleEventsWaitAllBlocking (uint32_t nCount,
+    WELS_EVENT* event_list, WELS_EVENT* master_event) {
+  uint32_t nIdx = 0;
+  uint32_t uiCountSignals = 0;
+  uint32_t uiSignalFlag	= 0;	// UGLY: suppose maximal event number up to 32
+
+  if (nCount == 0 || nCount > (sizeof (uint32_t) << 3))
+    return WELS_THREAD_ERROR_WAIT_FAILED;
+
+  while (1) {
+    nIdx = 0;	// access each event by order
+    while (nIdx < nCount) {
+      const uint32_t kuiBitwiseFlag = (1 << nIdx);
+
+      if ((uiSignalFlag & kuiBitwiseFlag) != kuiBitwiseFlag) { // non-blocking mode
+        int32_t err = 0;
+//				fprintf( stderr, "sem_wait(): start to wait event %d..\n", nIdx );
+        if (master_event == NULL) {
+          err = sem_wait (event_list[nIdx]);
+        } else {
+          err = sem_wait (*master_event);
+          if (err == WELS_THREAD_ERROR_OK) {
+            err = sem_wait (event_list[nIdx]);
+            if (err != WELS_THREAD_ERROR_OK) {
+              // We successfully waited for the master event,
+              // but waiting for the individual event failed (e.g. EINTR?).
+              // Increase the master event count so that the next retry will
+              // work as intended.
+              sem_post (*master_event);
+            }
+          }
+        }
+//				fprintf( stderr, "sem_wait(): wait event %d result %d errno %d..\n", nIdx, err, errno );
+        if (WELS_THREAD_ERROR_OK == err) {
+//					int32_t val = 0;
+//					sem_getvalue(&event_list[nIdx], &val);
+//					fprintf( stderr, "after sem_timedwait(), event_list[%d] semaphore value= %d..\n", nIdx, val);
+
+          uiSignalFlag |= kuiBitwiseFlag;
+          ++ uiCountSignals;
+          if (uiCountSignals >= nCount) {
+            return WELS_THREAD_ERROR_OK;
+          }
+        }
+      }
+      // we do need access next event next time
+      ++ nIdx;
+    }
+  }
+
+  return WELS_THREAD_ERROR_WAIT_FAILED;
+}
+
+WELS_THREAD_ERROR_CODE    WelsQueryLogicalProcessInfo (WelsLogicalProcessInfo* pInfo) {
+#ifdef ANDROID_NDK
+  pInfo->ProcessorCount = android_getCpuCount();
+  return WELS_THREAD_ERROR_OK;
+#elif defined(LINUX)
+
+  cpu_set_t cpuset;
+
+  CPU_ZERO (&cpuset);
+
+  if (!sched_getaffinity (0, sizeof (cpuset), &cpuset))
+    pInfo->ProcessorCount = CPU_COUNT (&cpuset);
+  else
+    pInfo->ProcessorCount = 1;
+
+  return WELS_THREAD_ERROR_OK;
+
+#else
+
+  size_t len = sizeof (pInfo->ProcessorCount);
+
+  if (sysctlbyname (HW_NCPU_NAME, &pInfo->ProcessorCount, &len, NULL, 0) == -1)
+    pInfo->ProcessorCount = 1;
+
+  return WELS_THREAD_ERROR_OK;
+
+#endif//LINUX
+}
+
+#endif
+
+
+#endif // MT_ENABLED
+
--- /dev/null
+++ b/codec/common/src/cpu.cpp
@@ -1,0 +1,293 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	cpu.cpp
+ *
+ * \brief	CPU compatibility detection
+ *
+ * \date	04/29/2009 Created
+ *
+ *************************************************************************************
+ */
+#include <string.h>
+#include <stdio.h>
+#ifdef ANDROID_NDK
+#include <cpu-features.h>
+#endif
+#include "cpu.h"
+#include "cpu_core.h"
+
+
+
+#define    CPU_Vendor_AMD    "AuthenticAMD"
+#define    CPU_Vendor_INTEL  "GenuineIntel"
+#define    CPU_Vendor_CYRIX  "CyrixInstead"
+
+#if defined(X86_ASM)
+
+uint32_t WelsCPUFeatureDetect (int32_t* pNumberOfLogicProcessors) {
+  uint32_t uiCPU = 0;
+  uint32_t uiFeatureA = 0, uiFeatureB = 0, uiFeatureC = 0, uiFeatureD = 0;
+  int32_t  CacheLineSize = 0;
+  int8_t   chVendorName[16] = { 0 };
+  uint32_t uiMaxCpuidLevel = 0;
+
+  if (!WelsCPUIdVerify()) {
+    /* cpuid is not supported in cpu */
+    return 0;
+  }
+
+  WelsCPUId (0, &uiFeatureA, (uint32_t*)&chVendorName[0], (uint32_t*)&chVendorName[8], (uint32_t*)&chVendorName[4]);
+  uiMaxCpuidLevel = uiFeatureA;
+  if (uiMaxCpuidLevel == 0) {
+    /* maximum input value for basic cpuid information */
+    return 0;
+  }
+
+  WelsCPUId (1, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
+  if ((uiFeatureD & 0x00800000) == 0) {
+    /* Basic MMX technology is not support in cpu, mean nothing for us so return here */
+    return 0;
+  }
+
+  uiCPU = WELS_CPU_MMX;
+  if (uiFeatureD & 0x02000000) {
+    /* SSE technology is identical to AMD MMX extensions */
+    uiCPU |= WELS_CPU_MMXEXT | WELS_CPU_SSE;
+  }
+  if (uiFeatureD & 0x04000000) {
+    /* SSE2 support here */
+    uiCPU |= WELS_CPU_SSE2;
+  }
+  if (uiFeatureD & 0x00000001) {
+    /* x87 FPU on-chip checking */
+    uiCPU |= WELS_CPU_FPU;
+  }
+  if (uiFeatureD & 0x00008000) {
+    /* CMOV instruction checking */
+    uiCPU |= WELS_CPU_CMOV;
+  }
+  if ((!strcmp ((const char*)chVendorName, CPU_Vendor_INTEL)) ||
+      (!strcmp((const char*)chVendorName, CPU_Vendor_AMD)) ) {	// confirmed_safe_unsafe_usage
+    if (uiFeatureD & 0x10000000) {
+      /* Multi-Threading checking: contains of multiple logic processors */
+      uiCPU |= WELS_CPU_HTT;
+    }
+  }
+
+  if (uiFeatureC & 0x00000001) {
+    /* SSE3 support here */
+    uiCPU |= WELS_CPU_SSE3;
+  }
+  if (uiFeatureC & 0x00000200) {
+    /* SSSE3 support here */
+    uiCPU |= WELS_CPU_SSSE3;
+  }
+  if (uiFeatureC & 0x00080000) {
+    /* SSE4.1 support here, 45nm Penryn processor */
+    uiCPU |= WELS_CPU_SSE41;
+  }
+  if (uiFeatureC & 0x00100000) {
+    /* SSE4.2 support here, next generation Nehalem processor */
+    uiCPU |= WELS_CPU_SSE42;
+  }
+  if (WelsCPUSupportAVX (uiFeatureA, uiFeatureC)) {
+    /* AVX supported */
+    uiCPU |= WELS_CPU_AVX;
+  }
+  if (WelsCPUSupportFMA (uiFeatureA, uiFeatureC)) {
+    /* AVX FMA supported */
+    uiCPU |= WELS_CPU_FMA;
+  }
+  if (uiFeatureC & 0x02000000) {
+    /* AES checking */
+    uiCPU |= WELS_CPU_AES;
+  }
+  if (uiFeatureC & 0x00400000) {
+    /* MOVBE checking */
+    uiCPU |= WELS_CPU_MOVBE;
+  }
+
+  if( pNumberOfLogicProcessors != NULL ){
+    if( uiCPU & WELS_CPU_HTT){
+      *pNumberOfLogicProcessors = (uiFeatureB & 0x00ff0000) >> 16; // feature bits: 23-16 on returned EBX
+    } else {
+      *pNumberOfLogicProcessors = 0;
+    }
+    if( !strcmp((const char*)chVendorName, CPU_Vendor_INTEL) ){
+      if( uiMaxCpuidLevel >= 4 ){
+        uiFeatureC = 0;
+        WelsCPUId(0x4, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
+        if( uiFeatureA != 0 ){
+          *pNumberOfLogicProcessors = ((uiFeatureA&0xfc000000)>>26) + 1;
+        }
+      }
+    }
+  }
+
+  WelsCPUId (0x80000000, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
+
+  if ((!strcmp ((const char*)chVendorName, CPU_Vendor_AMD))
+      && (uiFeatureA >= 0x80000001)) {	// confirmed_safe_unsafe_usage
+    WelsCPUId (0x80000001, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
+    if (uiFeatureD & 0x00400000) {
+      uiCPU |= WELS_CPU_MMXEXT;
+    }
+    if (uiFeatureD & 0x80000000) {
+      uiCPU |= WELS_CPU_3DNOW;
+    }
+  }
+
+  if (!strcmp ((const char*)chVendorName, CPU_Vendor_INTEL)) {	// confirmed_safe_unsafe_usage
+    int32_t  family, model;
+
+    WelsCPUId (1, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
+    family = ((uiFeatureA >> 8) & 0xf) + ((uiFeatureA >> 20) & 0xff);
+    model  = ((uiFeatureA >> 4) & 0xf) + ((uiFeatureA >> 12) & 0xf0);
+
+    if ((family == 6) && (model == 9 || model == 13 || model == 14)) {
+      uiCPU &= ~ (WELS_CPU_SSE2 | WELS_CPU_SSE3);
+    }
+  }
+
+  // get cache line size
+  if ((!strcmp ((const char*)chVendorName, CPU_Vendor_INTEL))
+      || ! (strcmp ((const char*)chVendorName, CPU_Vendor_CYRIX))) {	// confirmed_safe_unsafe_usage
+    WelsCPUId (1, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
+
+    CacheLineSize = (uiFeatureB & 0xff00) >>
+                    5;	// ((clflush_line_size >> 8) << 3), CLFLUSH_line_size * 8 = CacheLineSize_in_byte
+
+    if (CacheLineSize == 128) {
+      uiCPU |= WELS_CPU_CACHELINE_128;
+    } else if (CacheLineSize == 64) {
+      uiCPU |= WELS_CPU_CACHELINE_64;
+    } else if (CacheLineSize == 32) {
+      uiCPU |= WELS_CPU_CACHELINE_32;
+    } else if (CacheLineSize == 16) {
+      uiCPU |= WELS_CPU_CACHELINE_16;
+    }
+  }
+
+  return uiCPU;
+}
+
+
+void WelsCPURestore (const uint32_t kuiCPU) {
+  if (kuiCPU & (WELS_CPU_MMX | WELS_CPU_MMXEXT | WELS_CPU_3DNOW | WELS_CPU_3DNOWEXT)) {
+    WelsEmms();
+  }
+}
+
+#elif defined(HAVE_NEON) //For supporting both android platform and iOS platform
+#if defined(ANDROID_NDK)
+uint32_t WelsCPUFeatureDetect (int32_t* pNumberOfLogicProcessors)
+{
+  uint32_t         uiCPU = 0;
+  AndroidCpuFamily cpuFamily = ANDROID_CPU_FAMILY_UNKNOWN;
+  uint64_t         uiFeatures = 0;
+  cpuFamily = android_getCpuFamily();
+  if (cpuFamily == ANDROID_CPU_FAMILY_ARM)	{
+    uiFeatures = android_getCpuFeatures();
+    if (uiFeatures & ANDROID_CPU_ARM_FEATURE_ARMv7){
+      uiCPU |= WELS_CPU_ARMv7;
+    }
+    if (uiFeatures & ANDROID_CPU_ARM_FEATURE_VFPv3){
+      uiCPU |= WELS_CPU_VFPv3;
+    }
+    if (uiFeatures & ANDROID_CPU_ARM_FEATURE_NEON){
+      uiCPU |= WELS_CPU_NEON;
+    }
+  }
+
+  if( pNumberOfLogicProcessors != NULL ){
+    *pNumberOfLogicProcessors = android_getCpuCount();
+  }
+
+  return uiCPU;
+}
+
+#elif defined(__APPLE__)
uint32_t WelsCPUFeatureDetect (int32_t* pNumberOfLogicProcessors) {
  // Apple/ARM: features are fixed at compile time via __ARM_NEON__.
  // NOTE(review): unlike the other variants, pNumberOfLogicProcessors is
  // left untouched here — callers must not rely on it being written.
  uint32_t uiCpu = 0;

#if defined(__ARM_NEON__)
  uiCpu |= WELS_CPU_ARMv7 | WELS_CPU_VFPv3 | WELS_CPU_NEON;
#endif
  return uiCpu;
}
+#elif defined(__linux__)
+
+/* Generic arm/linux cpu feature detection */
+uint32_t WelsCPUFeatureDetect (int32_t* pNumberOfLogicProcessors) {
+  FILE *f = fopen("/proc/cpuinfo", "r");
+
+  if (!f)
+    return 0;
+
+  char buf[200];
+  int flags = 0;
+  while (fgets(buf, sizeof(buf), f)) {
+    if (!strncmp(buf, "Features", strlen("Features"))) {
+      if (strstr(buf, " neon "))
+        flags |= WELS_CPU_NEON;
+      if (strstr(buf, " vfpv3 "))
+        flags |= WELS_CPU_VFPv3;
+      break;
+    }
+  }
+  fclose(f);
+  return flags;
+}
+
+#else /* HAVE_NEON enabled but no runtime detection */
+
+/* No runtime feature detection available, but built with HAVE_NEON - assume
+ * that NEON and all associated features are available. */
+
+uint32_t WelsCPUFeatureDetect (int32_t* pNumberOfLogicProcessors) {
+  return WELS_CPU_ARMv7 |
+         WELS_CPU_VFPv3 |
+         WELS_CPU_NEON;
+}
+#endif
+#else /* Neither X86_ASM nor HAVE_NEON */
+
uint32_t WelsCPUFeatureDetect (int32_t* pNumberOfLogicProcessors) {
  // Neither X86_ASM nor HAVE_NEON: no optional CPU features to report.
  return 0;
}
+
+#endif
+
+
--- /dev/null
+++ b/codec/common/src/crt_util_safe_x.cpp
@@ -1,0 +1,256 @@
+/*!
+ * \copy
+ *     Copyright (c)  2009-2013, Cisco Systems
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file	crt_util_safe_x.cpp
+ *
+ * \brief	common tool/function utilization
+ *
+ * \date	03/10/2009 Created
+ *
+ *************************************************************************************
+ */
+
+#include <string.h>
+#include <stdlib.h>
+#include <math.h>
+#include <time.h>
+#if defined(_WIN32)
+#include <windows.h>
+#include <sys/types.h>
+#include <sys/timeb.h>
+#ifndef _MSC_VER
+#include <sys/time.h>
+#endif //!_MSC_VER
+#else
+#include <sys/time.h>
+#include <sys/timeb.h>
+#endif //_WIN32
+
+#include "macros.h"
+#include "crt_util_safe_x.h"	// Safe CRT routines like utils for cross platforms
+
+#if  defined(_WIN32) && defined(_MSC_VER)
+
+#if  defined(_MSC_VER) && (_MSC_VER>=1500)
+
+int32_t WelsSnprintf (char* pBuffer,  int32_t iSizeOfBuffer, const char* kpFormat, ...) {
+  va_list  pArgPtr;
+  int32_t  iRc;
+
+  va_start (pArgPtr, kpFormat);
+
+  iRc = vsnprintf_s (pBuffer, iSizeOfBuffer, _TRUNCATE, kpFormat, pArgPtr);
+
+  va_end (pArgPtr);
+
+  return iRc;
+}
+
+char* WelsStrncpy (char* pDest, int32_t iSizeInBytes, const char* kpSrc) {
+  strncpy_s (pDest, iSizeInBytes, kpSrc, _TRUNCATE);
+
+  return pDest;
+}
+
+int32_t WelsVsnprintf (char* pBuffer, int32_t iSizeOfBuffer, const char* kpFormat, va_list pArgPtr) {
+  return vsnprintf_s (pBuffer, iSizeOfBuffer, _TRUNCATE, kpFormat, pArgPtr);
+}
+
+WelsFileHandle* WelsFopen (const char* kpFilename,  const char* kpMode) {
+  WelsFileHandle* pFp = NULL;
+  if (fopen_s (&pFp, kpFilename, kpMode) != 0) {
+    return NULL;
+  }
+
+  return pFp;
+}
+
+int32_t WelsFclose (WelsFileHandle* pFp) {
+  return fclose (pFp);
+}
+
+int32_t WelsGetTimeOfDay (SWelsTime* pTp) {
+  return _ftime_s (pTp);
+}
+
+int32_t WelsStrftime (char* pBuffer, int32_t iSize, const char* kpFormat, const SWelsTime* kpTp) {
+  struct tm   sTimeNow;
+  int32_t iRc;
+
+  localtime_s (&sTimeNow, &kpTp->time);
+
+  iRc = strftime (pBuffer, iSize, kpFormat, &sTimeNow);
+  if (iRc == 0)
+      pBuffer[0] = '\0';
+  return iRc;
+}
+
+#else
+
int32_t WelsSnprintf (char* pBuffer,  int32_t iSizeOfBuffer, const char* kpFormat, ...) {
  // printf into pBuffer; force NUL termination when vsnprintf reports an
  // error (the negative return covers the old-CRT truncation case).
  va_list pArgPtr;
  int32_t iRc;

  va_start (pArgPtr, kpFormat);
  iRc = vsnprintf (pBuffer, iSizeOfBuffer, kpFormat, pArgPtr); //confirmed_safe_unsafe_usage
  va_end (pArgPtr);

  if (iRc < 0)
    pBuffer[iSizeOfBuffer - 1] = '\0';

  return iRc;
}
+
char* WelsStrncpy (char* pDest, int32_t iSizeInBytes, const char* kpSrc) {
  // Bounded copy that always NUL-terminates pDest (truncating if needed).
  // BUGFIX: guard against iSizeInBytes <= 0 — the original unconditionally
  // wrote pDest[iSizeInBytes - 1], i.e. pDest[-1] for a zero size.
  if (iSizeInBytes <= 0)
    return pDest;
  strncpy (pDest, kpSrc, iSizeInBytes); //confirmed_safe_unsafe_usage
  pDest[iSizeInBytes - 1] = '\0';

  return pDest;
}
+
+int32_t WelsVsnprintf (char* pBuffer, int32_t iSizeOfBuffer, const char* kpFormat, va_list pArgPtr) {
+  int32_t iRc = vsnprintf (pBuffer, iSizeOfBuffer, kpFormat, pArgPtr); //confirmed_safe_unsafe_usage
+  if (iRc < 0)
+    pBuffer[iSizeOfBuffer - 1] = '\0';
+  return iRc;
+}
+
+
+WelsFileHandle* WelsFopen (const char* kpFilename,  const char* kpMode) {
+  return fopen (kpFilename, kpMode);
+}
+
+int32_t WelsFclose (WelsFileHandle* pFp) {
+  return fclose (pFp);
+}
+
+int32_t WelsGetTimeOfDay (SWelsTime* pTp) {
+  _ftime (pTp);
+  return 0;
+}
+
+int32_t WelsStrftime (char* pBuffer, int32_t iSize, const char* kpFormat, const SWelsTime* kpTp) {
+  struct tm*   pTnow;
+  int32_t iRc;
+
+  pTnow = localtime (&kpTp->time);
+
+  iRc = strftime (pBuffer, iSize, kpFormat, pTnow);
+  if (iRc == 0)
+      pBuffer[0] = '\0';
+  return iRc;
+}
+
+
+#endif // _MSC_VER
+
+#else  //GCC
+
int32_t WelsSnprintf (char* pBuffer,  int32_t iSizeOfBuffer, const char* kpFormat, ...) {
  // printf into pBuffer.  C99 vsnprintf NUL-terminates for any size > 0 and
  // returns the length the full output would have had, so no fixup is needed.
  va_list pArgPtr;
  int32_t iRc;

  va_start (pArgPtr, kpFormat);
  iRc = vsnprintf (pBuffer, iSizeOfBuffer, kpFormat, pArgPtr);
  va_end (pArgPtr);

  return iRc;
}
+
char* WelsStrncpy (char* pDest, int32_t iSizeInBytes, const char* kpSrc) {
  // Bounded copy that always NUL-terminates pDest (truncating if needed).
  // BUGFIX: guard against iSizeInBytes <= 0 — the original unconditionally
  // wrote pDest[iSizeInBytes - 1], i.e. pDest[-1] for a zero size.
  if (iSizeInBytes <= 0)
    return pDest;
  strncpy (pDest, kpSrc, iSizeInBytes); //confirmed_safe_unsafe_usage
  pDest[iSizeInBytes - 1] = '\0';
  return pDest;
}
+
// vprintf into pBuffer; relies on C99 vsnprintf termination semantics.
int32_t WelsVsnprintf (char* pBuffer, int32_t iSizeOfBuffer, const char* kpFormat, va_list pArgPtr) {
  return vsnprintf (pBuffer, iSizeOfBuffer, kpFormat, pArgPtr); //confirmed_safe_unsafe_usage
}
+
+WelsFileHandle* WelsFopen (const char* kpFilename,  const char* kpMode) {
+  return fopen (kpFilename, kpMode);
+}
+
+int32_t WelsFclose (WelsFileHandle*   pFp) {
+  return fclose (pFp);
+}
+
+int32_t WelsGetTimeOfDay (SWelsTime* pTp) {
+  struct timeval  sTv;
+
+  if (gettimeofday (&sTv, NULL)) {
+    return -1;
+  }
+
+  pTp->time = sTv.tv_sec;
+  pTp->millitm = (uint16_t)sTv.tv_usec / 1000;
+
+  return 0;
+}
+
+int32_t WelsStrftime (char* pBuffer, int32_t iSize, const char* kpFormat, const SWelsTime* kpTp) {
+  struct tm*   pTnow;
+  int32_t iRc;
+
+  pTnow = localtime (&kpTp->time);
+
+  iRc = strftime (pBuffer, iSize, kpFormat, pTnow);
+  if (iRc == 0)
+      pBuffer[0] = '\0';
+  return iRc;
+}
+
+#endif
+
+
+char* WelsStrcat (char* pDest, int32_t iSizeInBytes, const char* kpSrc) {
+    int32_t iCurLen = strlen(pDest);
+    return WelsStrncpy(pDest + iCurLen, iSizeInBytes - iCurLen, kpSrc);
+}
+
+int32_t WelsFwrite (const void* kpBuffer, int32_t iSize, int32_t iCount, WelsFileHandle* pFp) {
+  return fwrite (kpBuffer, iSize, iCount, pFp);
+}
+
+uint16_t WelsGetMillisecond (const SWelsTime* kpTp) {
+  return kpTp->millitm;
+}
+
+int32_t WelsFseek (WelsFileHandle* fp, int32_t offset, int32_t origin) {
+  return fseek(fp, offset, origin);
+}
+
+int32_t WelsFflush (WelsFileHandle* pFp) {
+  return fflush (pFp);
+}
--- /dev/null
+++ b/codec/common/src/deblocking_common.cpp
@@ -1,0 +1,204 @@
+#include "deblocking_common.h"
+#include "macros.h"
+//  C code only
+void DeblockLumaLt4_c (uint8_t* pPix, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha, int32_t iBeta,
+                         int8_t* pTc) {
+  for (int32_t i = 0; i < 16; i++) {
+    int32_t iTc0 = pTc[i >> 2];
+    if (iTc0 >= 0) {
+      int32_t p0 = pPix[-iStrideX];
+      int32_t p1 = pPix[-2 * iStrideX];
+      int32_t p2 = pPix[-3 * iStrideX];
+      int32_t q0 = pPix[0];
+      int32_t q1 = pPix[iStrideX];
+      int32_t q2 = pPix[2 * iStrideX];
+      bool bDetaP0Q0 = WELS_ABS (p0 - q0) < iAlpha;
+      bool bDetaP1P0 = WELS_ABS (p1 - p0) < iBeta;
+      bool bDetaQ1Q0 = WELS_ABS (q1 - q0) < iBeta;
+      int32_t iTc = iTc0;
+      if (bDetaP0Q0 && bDetaP1P0 && bDetaQ1Q0) {
+        bool bDetaP2P0 =  WELS_ABS (p2 - p0) < iBeta;
+        bool bDetaQ2Q0 =  WELS_ABS (q2 - q0) < iBeta;
+        if (bDetaP2P0) {
+          pPix[-2 * iStrideX] = p1 + WELS_CLIP3 ((p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) >> 1, -iTc0, iTc0);
+          iTc++;
+        }
+        if (bDetaQ2Q0) {
+          pPix[iStrideX] = q1 + WELS_CLIP3 ((q2 + ((p0 + q0 + 1) >> 1) - (q1 << 1)) >> 1, -iTc0, iTc0);
+          iTc++;
+        }
+        int32_t iDeta = WELS_CLIP3 ((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -iTc, iTc);
+        pPix[-iStrideX] = WelsClip1 (p0 + iDeta);     /* p0' */
+        pPix[0]  = WelsClip1 (q0 - iDeta);     /* q0' */
+      }
+    }
+    pPix += iStrideY;
+  }
+}
+void DeblockLumaEq4_c (uint8_t* pPix, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha, int32_t iBeta) {
+  int32_t p0, p1, p2, q0, q1, q2;
+  int32_t iDetaP0Q0;
+  bool bDetaP1P0, bDetaQ1Q0;
+  for (int32_t i = 0; i < 16; i++) {
+    p0 = pPix[-iStrideX];
+    p1 = pPix[-2 * iStrideX];
+    p2 = pPix[-3 * iStrideX];
+    q0 = pPix[0];
+    q1 = pPix[iStrideX];
+    q2 = pPix[2 * iStrideX];
+    iDetaP0Q0 = WELS_ABS (p0 - q0);
+    bDetaP1P0 = WELS_ABS (p1 - p0) < iBeta;
+    bDetaQ1Q0 = WELS_ABS (q1 - q0) < iBeta;
+    if ((iDetaP0Q0 < iAlpha) && bDetaP1P0 && bDetaQ1Q0) {
+      if (iDetaP0Q0 < ((iAlpha >> 2) + 2)) {
+        bool bDetaP2P0 = WELS_ABS (p2 - p0) < iBeta;
+        bool bDetaQ2Q0 =  WELS_ABS (q2 - q0) < iBeta;
+        if (bDetaP2P0) {
+          const int32_t p3 = pPix[-4 * iStrideX];
+          pPix[-iStrideX] = (p2 + (p1 << 1) + (p0 << 1) + (q0 << 1) + q1 + 4) >> 3;	   //p0
+          pPix[-2 * iStrideX] = (p2 + p1 + p0 + q0 + 2) >> 2;	 //p1
+          pPix[-3 * iStrideX] = ((p3 << 1) + p2 + (p2 << 1) + p1 + p0 + q0 + 4) >> 3;//p2
+        } else {
+          pPix[-1 * iStrideX] = ((p1 << 1) + p0 + q1 + 2) >> 2;	//p0
+        }
+        if (bDetaQ2Q0) {
+          const int32_t q3 = pPix[3 * iStrideX];
+          pPix[0] = (p1 + (p0 << 1) + (q0 << 1) + (q1 << 1) + q2 + 4) >> 3;   //q0
+          pPix[iStrideX] = (p0 + q0 + q1 + q2 + 2) >> 2;   //q1
+          pPix[2 * iStrideX] = ((q3 << 1) + q2 + (q2 << 1) + q1 + q0 + p0 + 4) >> 3;//q2
+        } else {
+          pPix[0] = ((q1 << 1) + q0 + p1 + 2) >> 2;   //q0
+        }
+      } else {
+        pPix[-iStrideX] = ((p1 << 1) + p0 + q1 + 2) >> 2;   //p0
+        pPix[ 0] = ((q1 << 1) + q0 + p1 + 2) >> 2;   //q0
+      }
+    }
+    pPix += iStrideY;
+  }
+}
+void DeblockLumaLt4V_c (uint8_t* pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* tc) {
+  DeblockLumaLt4_c (pPix, iStride, 1, iAlpha, iBeta, tc);
+}
+void DeblockLumaLt4H_c (uint8_t* pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* tc) {
+  DeblockLumaLt4_c (pPix, 1, iStride, iAlpha, iBeta, tc);
+}
+void DeblockLumaEq4V_c (uint8_t* pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta) {
+  DeblockLumaEq4_c (pPix, iStride, 1, iAlpha, iBeta);
+}
+void DeblockLumaEq4H_c (uint8_t* pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta) {
+  DeblockLumaEq4_c (pPix, 1, iStride, iAlpha, iBeta);
+}
+void DeblockChromaLt4_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha,
+                           int32_t iBeta, int8_t* pTc) {
+  int32_t p0, p1, q0, q1, iDeta;
+  bool bDetaP0Q0, bDetaP1P0, bDetaQ1Q0;
+
+  for (int32_t i = 0; i < 8; i++) {
+    int32_t iTc0 = pTc[i >> 1];
+    if (iTc0 > 0) {
+      p0 = pPixCb[-iStrideX];
+      p1 = pPixCb[-2 * iStrideX];
+      q0 = pPixCb[0];
+      q1 = pPixCb[iStrideX];
+
+      bDetaP0Q0 =  WELS_ABS (p0 - q0) < iAlpha;
+      bDetaP1P0 =  WELS_ABS (p1 - p0) < iBeta;
+      bDetaQ1Q0 = WELS_ABS (q1 - q0) < iBeta;
+      if (bDetaP0Q0 && bDetaP1P0 &&	bDetaQ1Q0) {
+        iDeta = WELS_CLIP3 ((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -iTc0, iTc0);
+        pPixCb[-iStrideX] = WelsClip1 (p0 + iDeta);     /* p0' */
+        pPixCb[0]  = WelsClip1 (q0 - iDeta);     /* q0' */
+      }
+
+
+      p0 = pPixCr[-iStrideX];
+      p1 = pPixCr[-2 * iStrideX];
+      q0 = pPixCr[0];
+      q1 = pPixCr[iStrideX];
+
+      bDetaP0Q0 =  WELS_ABS (p0 - q0) < iAlpha;
+      bDetaP1P0 =  WELS_ABS (p1 - p0) < iBeta;
+      bDetaQ1Q0 = WELS_ABS (q1 - q0) < iBeta;
+
+      if (bDetaP0Q0 && bDetaP1P0 &&	bDetaQ1Q0) {
+        iDeta = WELS_CLIP3 ((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -iTc0, iTc0);
+        pPixCr[-iStrideX] = WelsClip1 (p0 + iDeta);     /* p0' */
+        pPixCr[0]  = WelsClip1 (q0 - iDeta);     /* q0' */
+      }
+    }
+    pPixCb += iStrideY;
+    pPixCr += iStrideY;
+  }
+}
+void DeblockChromaEq4_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha,
+                           int32_t iBeta) {
+  int32_t p0, p1, q0, q1;
+  bool bDetaP0Q0, bDetaP1P0, bDetaQ1Q0;
+  for (int32_t i = 0; i < 8; i++) {
+    //cb
+    p0 = pPixCb[-iStrideX];
+    p1 = pPixCb[-2 * iStrideX];
+    q0 = pPixCb[0];
+    q1 = pPixCb[iStrideX];
+    bDetaP0Q0 = WELS_ABS (p0 - q0) < iAlpha;
+    bDetaP1P0 = WELS_ABS (p1 - p0) < iBeta;
+    bDetaQ1Q0 = WELS_ABS (q1 - q0) < iBeta;
+    if (bDetaP0Q0 && bDetaP1P0 && bDetaQ1Q0) {
+      pPixCb[-iStrideX] = ((p1 << 1) + p0 + q1 + 2) >> 2;     /* p0' */
+      pPixCb[0]  = ((q1 << 1) + q0 + p1 + 2) >> 2;     /* q0' */
+    }
+
+    //cr
+    p0 = pPixCr[-iStrideX];
+    p1 = pPixCr[-2 * iStrideX];
+    q0 = pPixCr[0];
+    q1 = pPixCr[iStrideX];
+    bDetaP0Q0 = WELS_ABS (p0 - q0) < iAlpha;
+    bDetaP1P0 = WELS_ABS (p1 - p0) < iBeta;
+    bDetaQ1Q0 = WELS_ABS (q1 - q0) < iBeta;
+    if (bDetaP0Q0 && bDetaP1P0 && bDetaQ1Q0) {
+      pPixCr[-iStrideX] = ((p1 << 1) + p0 + q1 + 2) >> 2;     /* p0' */
+      pPixCr[0]  = ((q1 << 1) + q0 + p1 + 2) >> 2;     /* q0' */
+    }
+    pPixCr += iStrideY;
+    pPixCb += iStrideY;
+  }
+}
+void DeblockChromaLt4V_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
+                            int8_t* tc) {
+  DeblockChromaLt4_c (pPixCb, pPixCr, iStride, 1, iAlpha, iBeta, tc);
+}
+void DeblockChromaLt4H_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
+                            int8_t* tc) {
+  DeblockChromaLt4_c (pPixCb, pPixCr, 1, iStride, iAlpha, iBeta, tc);
+}
+void DeblockChromaEq4V_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta) {
+  DeblockChromaEq4_c (pPixCb, pPixCr, iStride, 1, iAlpha, iBeta);
+}
+void DeblockChromaEq4H_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta) {
+  DeblockChromaEq4_c (pPixCb, pPixCr, 1, iStride, iAlpha, iBeta);
+}
+
#ifdef X86_ASM
extern "C" {
  // Horizontal SSE variants: transpose the 8 columns around the edge into a
  // 16x8 aligned scratch buffer, run the vertical SSSE3 kernel on it, then
  // transpose the result back into the frame.
  void DeblockLumaLt4H_ssse3 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc) {
    ENFORCE_STACK_ALIGN_1D (uint8_t,  uiBuf,   16 * 8, 16);

    DeblockLumaTransposeH2V_sse2 (pPixY - 4, iStride, &uiBuf[0]);
    DeblockLumaLt4V_ssse3 (&uiBuf[4 * 16], 16, iAlpha, iBeta, pTc);
    DeblockLumaTransposeV2H_sse2 (pPixY - 4, iStride, &uiBuf[0]);
  }

  void DeblockLumaEq4H_ssse3 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta) {
    ENFORCE_STACK_ALIGN_1D (uint8_t,  uiBuf,   16 * 8, 16);

    DeblockLumaTransposeH2V_sse2 (pPixY - 4, iStride, &uiBuf[0]);
    DeblockLumaEq4V_ssse3 (&uiBuf[4 * 16], 16, iAlpha, iBeta);
    DeblockLumaTransposeV2H_sse2 (pPixY - 4, iStride, &uiBuf[0]);
  }

}

#endif
+
--- /dev/null
+++ b/codec/common/src/logging.cpp
@@ -1,0 +1,49 @@
+/*!
+ * \copy
+ *     Copyright (c)  2013, Cisco Systems
+ *     Copyright (c)  2013, Mozilla
+ *     All rights reserved.
+ *
+ *     Redistribution and use in source and binary forms, with or without
+ *     modification, are permitted provided that the following conditions
+ *     are met:
+ *
+ *        * Redistributions of source code must retain the above copyright
+ *          notice, this list of conditions and the following disclaimer.
+ *
+ *        * Redistributions in binary form must reproduce the above copyright
+ *          notice, this list of conditions and the following disclaimer in
+ *          the documentation and/or other materials provided with the
+ *          distribution.
+ *
+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *     POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "typedefs.h"
+
+static int32_t g_TraceLevel = 0;
+
+void WelsStderrSetTraceLevel (int32_t level) {
+  g_TraceLevel = level;
+}
+
+int32_t welsStderrLevelTrace (int32_t level, const char* format, va_list ap) {
+  if (level < g_TraceLevel) {
+    vfprintf (stderr, format, ap);
+  }
+  return 0;
+}
--- a/codec/common/targets.mk
+++ b/codec/common/targets.mk
@@ -1,23 +1,23 @@
 COMMON_SRCDIR=codec/common
 COMMON_CPP_SRCS=\
-	$(COMMON_SRCDIR)/cpu.cpp\
-	$(COMMON_SRCDIR)/crt_util_safe_x.cpp\
-	$(COMMON_SRCDIR)/deblocking_common.cpp\
-	$(COMMON_SRCDIR)/logging.cpp\
-	$(COMMON_SRCDIR)/WelsThreadLib.cpp\
+	$(COMMON_SRCDIR)/src/cpu.cpp\
+	$(COMMON_SRCDIR)/src/crt_util_safe_x.cpp\
+	$(COMMON_SRCDIR)/src/deblocking_common.cpp\
+	$(COMMON_SRCDIR)/src/logging.cpp\
+	$(COMMON_SRCDIR)/src/WelsThreadLib.cpp\
 
 COMMON_OBJS += $(COMMON_CPP_SRCS:.cpp=.$(OBJ))
 
 ifeq ($(ASM_ARCH), x86)
 COMMON_ASM_SRCS=\
-	$(COMMON_SRCDIR)/cpuid.asm\
-	$(COMMON_SRCDIR)/deblock.asm\
-	$(COMMON_SRCDIR)/expand_picture.asm\
-	$(COMMON_SRCDIR)/mb_copy.asm\
-	$(COMMON_SRCDIR)/mc_chroma.asm\
-	$(COMMON_SRCDIR)/mc_luma.asm\
-	$(COMMON_SRCDIR)/satd_sad.asm\
-	$(COMMON_SRCDIR)/vaa.asm\
+	$(COMMON_SRCDIR)/x86/cpuid.asm\
+	$(COMMON_SRCDIR)/x86/deblock.asm\
+	$(COMMON_SRCDIR)/x86/expand_picture.asm\
+	$(COMMON_SRCDIR)/x86/mb_copy.asm\
+	$(COMMON_SRCDIR)/x86/mc_chroma.asm\
+	$(COMMON_SRCDIR)/x86/mc_luma.asm\
+	$(COMMON_SRCDIR)/x86/satd_sad.asm\
+	$(COMMON_SRCDIR)/x86/vaa.asm\
 
 COMMON_OBJS += $(COMMON_ASM_SRCS:.asm=.$(OBJ))
 endif
@@ -24,9 +24,9 @@
 
 ifeq ($(ASM_ARCH), arm)
 COMMON_ASM_S_SRCS=\
-	$(COMMON_SRCDIR)/deblocking_neon.S\
-	$(COMMON_SRCDIR)/expand_picture_neon.S\
-	$(COMMON_SRCDIR)/mc_neon.S\
+	$(COMMON_SRCDIR)/arm/deblocking_neon.S\
+	$(COMMON_SRCDIR)/arm/expand_picture_neon.S\
+	$(COMMON_SRCDIR)/arm/mc_neon.S\
 
 COMMON_OBJS += $(COMMON_ASM_S_SRCS:.S=.$(OBJ))
 endif
--- a/codec/common/typedefs.h
+++ /dev/null
@@ -1,74 +1,0 @@
-/*!
- * \copy
- *     Copyright (c)  2013, Cisco Systems
- *     All rights reserved.
- *
- *     Redistribution and use in source and binary forms, with or without
- *     modification, are permitted provided that the following conditions
- *     are met:
- *
- *        * Redistributions of source code must retain the above copyright
- *          notice, this list of conditions and the following disclaimer.
- *
- *        * Redistributions in binary form must reproduce the above copyright
- *          notice, this list of conditions and the following disclaimer in
- *          the documentation and/or other materials provided with the
- *          distribution.
- *
- *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- *     POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-// typedef.h
-#ifndef WELS_TYPE_DEFINES_H__
-#define WELS_TYPE_DEFINES_H__
-
-#include <limits.h>
-#include <stddef.h>
-
-////////////////////////////////////////////////////////////////////////////
-// NOTICE : ALL internal implement MUST use the data type defined as below
-//          ONLY except with the interface file !!!!!
-////////////////////////////////////////////////////////////////////////////
-
-#ifndef  _MSC_VER
-
-#define __STDC_FORMAT_MACROS
-#include <stdint.h>
-#include <inttypes.h>
-
-#else
-
-// FIXME:     all singed type should be declared explicit,  for example,  int8_t should be declared as signed char.
-typedef signed char      int8_t  ;
-typedef unsigned char    uint8_t ;
-typedef short            int16_t ;
-typedef unsigned short   uint16_t;
-typedef int              int32_t ;
-typedef unsigned int     uint32_t;
-typedef __int64          int64_t ;
-typedef unsigned __int64 uint64_t;
-#define PRId64 "I64d"
-
-#endif // _MSC_VER defined
-
-// The 'float' type is portable and usable without any need for any extra typedefs.
-
-#ifdef EPSN
-#undef EPSN
-#endif//EPSN
-#define EPSN	  (0.000001f) // (1e-6)	// desired float precision
-
-#endif //WELS_TYPE_DEFINES_H__
-
--- a/codec/common/vaa.asm
+++ /dev/null
@@ -1,411 +1,0 @@
-;*!
-;* \copy
-;*     Copyright (c)  2010-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*	vaa.asm
-;*
-;*	Abstract
-;*      sse2 for pVaa routines
-;*
-;*  History
-;*      04/14/2010	Created
-;*		06/07/2010	Added AnalysisVaaInfoIntra_sse2(ssse3)
-;*		06/10/2010	Tune rc_sad_frame_sse2 and got about 40% improvement
-;*		08/11/2010	Added abs_difference_mbrow_sse2 & sum_sqrsum_mbrow_sse2
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-
-
-;***********************************************************************
-; Macros and other preprocessor constants
-;***********************************************************************
-
-; by comparing it outperforms than phaddw(SSSE3) sets
-%macro SUM_WORD_8x2_SSE2	2	; dst(pSrc), tmp
-	; @sum_8x2 begin
-	pshufd %2, %1, 04Eh	; 01001110 B
-	paddw %1, %2
-	pshuflw %2, %1, 04Eh	; 01001110 B
-	paddw %1, %2
-	pshuflw %2, %1, 0B1h	; 10110001 B
-	paddw %1, %2
-	; end of @sum_8x2
-%endmacro	; END of SUM_WORD_8x2_SSE2
-
-
-%macro VAA_AVG_BLOCK_SSE2 6 ; dst, t0, t1, t2, t3, t4
-	movdqa %1, [r0    ]	; line 0
-	movdqa %2, [r0+r1]	; line 1
-	movdqa %3, %1
-	punpcklbw %1, xmm7
-	punpckhbw %3, xmm7
-	movdqa %4, %2
-	punpcklbw %4, xmm7
-	punpckhbw %2, xmm7
-	paddw %1, %4
-	paddw %2, %3
-	movdqa %3, [r0+r2]	; line 2
-	movdqa %4, [r0+r3]	; line 3
-	movdqa %5, %3
-	punpcklbw %3, xmm7
-	punpckhbw %5, xmm7
-	movdqa %6, %4
-	punpcklbw %6, xmm7
-	punpckhbw %4, xmm7
-	paddw %3, %6
-	paddw %4, %5
-	paddw %1, %3	; block 0, 1
-	paddw %2, %4	; block 2, 3
-	pshufd %3, %1, 0B1h
-	pshufd %4, %2, 0B1h
-	paddw %1, %3
-	paddw %2, %4
-	movdqa %3, %1
-	movdqa %4, %2
-	pshuflw %5, %1, 0B1h
-	pshufhw %6, %3, 0B1h
-	paddw %1, %5
-	paddw %3, %6
-	pshuflw %5, %2, 0B1h
-	pshufhw %6, %4, 0B1h
-	paddw %2, %5
-	paddw %4, %6
-	punpcklwd %1, %2
-	punpckhwd %3, %4
-	punpcklwd %1, %3
-	psraw %1, $04
-%endmacro
-
-%macro VAA_AVG_BLOCK_SSSE3 6 ; dst, t0, t1, t2, t3, t4
-	movdqa %1, [r0    ]	; line 0
-	movdqa %2, [r0+r1]	; line 1
-	movdqa %3, %1
-	punpcklbw %1, xmm7
-	punpckhbw %3, xmm7
-	movdqa %4, %2
-	punpcklbw %4, xmm7
-	punpckhbw %2, xmm7
-	paddw %1, %4
-	paddw %2, %3
-	movdqa %3, [r0+r2]	; line 2
-	movdqa %4, [r0+r3]	; line 3
-	movdqa %5, %3
-	punpcklbw %3, xmm7
-	punpckhbw %5, xmm7
-	movdqa %6, %4
-	punpcklbw %6, xmm7
-	punpckhbw %4, xmm7
-	paddw %3, %6
-	paddw %4, %5
-	paddw %1, %3	; block 0, 1
-	paddw %2, %4	; block 2, 3
-	phaddw %1, %2	; block[0]: 0-15, 16-31; block[1]: 32-47, 48-63; ..
-	phaddw %1, xmm7	; block[0]: 0-15; block[1]: 16-31; block[2]: 32-47; block[3]: 48-63; ....
-	psraw %1, $04
-%endmacro
-
-
-
-;***********************************************************************
-; Code
-;***********************************************************************
-
-SECTION .text
-
-; , 6/7/2010
-
-;***********************************************************************
-;	int32_t AnalysisVaaInfoIntra_sse2(	uint8_t *pDataY, const int32_t iLineSize );
-;***********************************************************************
-WELS_EXTERN AnalysisVaaInfoIntra_sse2
-
-    %assign push_num 0
-    LOAD_2_PARA
-    PUSH_XMM 8
-    SIGN_EXTENSION r1,r1d
-
-%ifdef X86_32
-    push r3
-    push r4
-    push r5
-    push r6
-    %assign push_num push_num+4
-%endif
-
-    mov  r5,r7
-    and  r5,0fh
-    sub  r7,r5
-    sub  r7,32
-
-
-    mov r2,r1
-    sal r2,$01   ;r2 = 2*iLineSize
-    mov r3,r2
-    add r3,r1   ;r3 = 3*iLineSize
-
-    mov r4,r2
-    sal r4,$01   ;r4 = 4*iLineSize
-
-	pxor xmm7, xmm7
-
-	; loops
-	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [r7], xmm0
-
-	lea r0, [r0+r4]
-	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [r7+8], xmm0
-
-	lea r0, [r0+r4]
-	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [r7+16], xmm0
-
-	lea r0, [r0+r4]
-	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [r7+24], xmm0
-
-	movdqa xmm0, [r7]		; block 0~7
-	movdqa xmm1, [r7+16]	; block 8~15
-	movdqa xmm2, xmm0
-	paddw xmm0, xmm1
-	SUM_WORD_8x2_SSE2 xmm0, xmm3
-
-	pmullw xmm1, xmm1
-	pmullw xmm2, xmm2
-	movdqa xmm3, xmm1
-	movdqa xmm4, xmm2
-	punpcklwd xmm1, xmm7
-	punpckhwd xmm3, xmm7
-	punpcklwd xmm2, xmm7
-	punpckhwd xmm4, xmm7
-	paddd xmm1, xmm2
-	paddd xmm3, xmm4
-	paddd xmm1, xmm3
-	pshufd xmm2, xmm1, 01Bh
-	paddd xmm1, xmm2
-	pshufd xmm2, xmm1, 0B1h
-	paddd xmm1, xmm2
-
-
-
-	movd r2d, xmm0
-	and r2, 0ffffh		; effective low work truncated
-	mov r3, r2
-	imul r2, r3
-	sar r2, $04
-	movd retrd, xmm1
-	sub retrd, r2d
-
-	add r7,32
-	add r7,r5
-
-%ifdef X86_32
-	pop r6
-	pop r5
-	pop r4
-	pop r3
-%endif
-	POP_XMM
-
-	ret
-
-;***********************************************************************
-;	int32_t AnalysisVaaInfoIntra_ssse3(	uint8_t *pDataY, const int32_t iLineSize );
-;***********************************************************************
-WELS_EXTERN AnalysisVaaInfoIntra_ssse3
-
-    %assign push_num 0
-    LOAD_2_PARA
-    PUSH_XMM 8
-    SIGN_EXTENSION r1,r1d
-
-%ifdef X86_32
-    push r3
-    push r4
-    push r5
-    push r6
-    %assign push_num push_num+4
-%endif
-
-    mov  r5,r7
-    and  r5,0fh
-    sub  r7,r5
-    sub  r7,32
-
-
-    mov r2,r1
-    sal r2,$01   ;r2 = 2*iLineSize
-    mov r3,r2
-    add r3,r1   ;r3 = 3*iLineSize
-
-    mov r4,r2
-    sal r4,$01   ;r4 = 4*iLineSize
-
-	pxor xmm7, xmm7
-
-	; loops
-	VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-    movq [r7],xmm0
-
-	lea r0,[r0+r4]
-	VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
-    movq [r7+8],xmm1
-
-
-	lea r0,[r0+r4]
-	VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-    movq [r7+16],xmm0
-
-	lea r0,[r0+r4]
-	VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
-    movq [r7+24],xmm1
-
-
-	movdqa xmm0,[r7]
-	movdqa xmm1,[r7+16]
-	movdqa xmm2, xmm0
-	paddw xmm0, xmm1
-	SUM_WORD_8x2_SSE2 xmm0, xmm3	; better performance than that of phaddw sets
-
-	pmullw xmm1, xmm1
-	pmullw xmm2, xmm2
-	movdqa xmm3, xmm1
-	movdqa xmm4, xmm2
-	punpcklwd xmm1, xmm7
-	punpckhwd xmm3, xmm7
-	punpcklwd xmm2, xmm7
-	punpckhwd xmm4, xmm7
-	paddd xmm1, xmm2
-	paddd xmm3, xmm4
-	paddd xmm1, xmm3
-	pshufd xmm2, xmm1, 01Bh
-	paddd xmm1, xmm2
-	pshufd xmm2, xmm1, 0B1h
-	paddd xmm1, xmm2
-
-
-    movd r2d, xmm0
-    and r2, 0ffffh          ; effective low work truncated
-    mov r3, r2
-    imul r2, r3
-    sar r2, $04
-    movd retrd, xmm1
-	sub retrd, r2d
-
-	add r7,32
-	add r7,r5
-%ifdef X86_32
-	pop r6
-	pop r5
-	pop r4
-	pop r3
-%endif
-	POP_XMM
-
-	ret
-
-;***********************************************************************
-;	uint8_t MdInterAnalysisVaaInfo_sse41( int32_t *pSad8x8 )
-;***********************************************************************
-WELS_EXTERN MdInterAnalysisVaaInfo_sse41
-	%assign push_num 0
-	LOAD_1_PARA
-	movdqa xmm0,[r0]
-	pshufd xmm1, xmm0, 01Bh
-	paddd xmm1, xmm0
-	pshufd xmm2, xmm1, 0B1h
-	paddd xmm1, xmm2
-	psrad xmm1, 02h		; iAverageSad
-	movdqa xmm2, xmm1
-	psrad xmm2, 06h
-	movdqa xmm3, xmm0	; iSadBlock
-	psrad xmm3, 06h
-	psubd xmm3, xmm2
-	pmulld xmm3, xmm3	; [comment]: pmulld from SSE4.1 instruction sets
-	pshufd xmm4, xmm3, 01Bh
-	paddd xmm4, xmm3
-	pshufd xmm3, xmm4, 0B1h
-	paddd xmm3, xmm4
-	movd r0d, xmm3
-	cmp r0d, 20	; INTER_VARIANCE_SAD_THRESHOLD
-
-	jb near .threshold_exit
-	pshufd xmm0, xmm0, 01Bh
-	pcmpgtd xmm0, xmm1	; iSadBlock > iAverageSad
-	movmskps retrd, xmm0
-	ret
-.threshold_exit:
-	mov retrd, 15
-	ret
-
-;***********************************************************************
-;	uint8_t MdInterAnalysisVaaInfo_sse2( int32_t *pSad8x8 )
-;***********************************************************************
-WELS_EXTERN MdInterAnalysisVaaInfo_sse2
-	%assign push_num 0
-	LOAD_1_PARA
-	movdqa xmm0, [r0]
-	pshufd xmm1, xmm0, 01Bh
-	paddd xmm1, xmm0
-	pshufd xmm2, xmm1, 0B1h
-	paddd xmm1, xmm2
-	psrad xmm1, 02h		; iAverageSad
-	movdqa xmm2, xmm1
-	psrad xmm2, 06h
-	movdqa xmm3, xmm0	; iSadBlock
-	psrad xmm3, 06h
-	psubd xmm3, xmm2
-
-	; to replace pmulld functionality as below
-	movdqa xmm2, xmm3
-	pmuludq xmm2, xmm3
-	pshufd xmm4, xmm3, 0B1h
-	pmuludq xmm4, xmm4
-	movdqa xmm5, xmm2
-	punpckldq xmm5, xmm4
-	punpckhdq xmm2, xmm4
-	punpcklqdq xmm5, xmm2
-
-	pshufd xmm4, xmm5, 01Bh
-	paddd xmm4, xmm5
-	pshufd xmm5, xmm4, 0B1h
-	paddd xmm5, xmm4
-
-	movd r0d, xmm5
-	cmp r0d, 20	; INTER_VARIANCE_SAD_THRESHOLD
-	jb near .threshold_exit
-	pshufd xmm0, xmm0, 01Bh
-	pcmpgtd xmm0, xmm1	; iSadBlock > iAverageSad
-	movmskps retrd, xmm0
-	ret
-.threshold_exit:
-	mov retrd, 15
-	ret
--- /dev/null
+++ b/codec/common/x86/asm_inc.asm
@@ -1,0 +1,599 @@
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  sse2inc.asm
+;*
+;*  Abstract
+;*      macro and constant
+;*
+;*  History
+;*      8/5/2009 Created
+;*
+;*
+;*************************************************************************/
+;***********************************************************************
+; Options, for DEBUG
+;***********************************************************************
+
+%if 1
+	%define MOVDQ movdqa
+%else
+	%define MOVDQ movdqu
+%endif
+
+%if 1
+	%define WELSEMMS	emms
+%else
+	%define WELSEMMS
+%endif
+
+
+;***********************************************************************
+; Macros
+;***********************************************************************
+
+DEFAULT REL
+
+%ifdef WIN64 ; Windows x64 ;************************************
+
+BITS 64
+
+%define arg1 rcx
+%define arg2 rdx
+%define arg3 r8
+%define arg4 r9
+%define arg5 [rsp + push_num*8 + 40]
+%define arg6 [rsp + push_num*8 + 48]
+%define arg7 [rsp + push_num*8 + 56]
+%define arg8 [rsp + push_num*8 + 64]
+%define arg9 [rsp + push_num*8 + 72]
+%define arg10 [rsp + push_num*8 + 80]
+%define arg11 [rsp + push_num*8 + 88]
+%define arg12 [rsp + push_num*8 + 96]
+
+%define r0 rcx
+%define r1 rdx
+%define r2 r8
+%define r3 r9
+%define r4 rax
+%define r5 r10
+%define r6 r11
+%define r7 rsp
+
+%define r0d ecx
+%define r1d edx
+%define r2d r8d
+%define r3d r9d
+%define r4d eax
+%define r5d r10d
+%define r6d r11d
+
+%define r0w  cx
+%define r1w  dx
+%define r2w  r8w
+%define r3w  r9w
+
+%define r0b  cl
+%define r1b  dl
+%define r2b  r8l
+%define r3b  r9l
+
+%define  PUSHRFLAGS     pushfq
+%define  POPRFLAGS      popfq
+%define  retrq          rax
+%define  retrd          eax
+
+%elifdef UNIX64 ; Unix x64 ;************************************
+
+BITS 64
+
+%define arg1 rdi
+%define arg2 rsi
+%define arg3 rdx
+%define arg4 rcx
+%define arg5 r8
+%define arg6 r9
+%define arg7 [rsp + push_num*8 + 8]
+%define arg8 [rsp + push_num*8 + 16]
+%define arg9 [rsp + push_num*8 + 24]
+%define arg10 [rsp + push_num*8 + 32]
+%define arg11 [rsp + push_num*8 + 40]
+%define arg12 [rsp + push_num*8 + 48]
+
+%define r0 rdi
+%define r1 rsi
+%define r2 rdx
+%define r3 rcx
+%define r4 r8
+%define r5 r9
+%define r6 r10
+%define r7 rsp
+
+%define r0d edi
+%define r1d esi
+%define r2d edx
+%define r3d ecx
+%define r4d r8d
+%define r5d r9d
+%define r6d r10d
+
+%define r0w  di
+%define r1w  si
+%define r2w  dx
+%define r3w  cx
+
+%define r0b  dil
+%define r1b  sil
+%define r2b  dl
+%define r3b  cl
+
+%define  PUSHRFLAGS     pushfq
+%define  POPRFLAGS      popfq
+%define  retrq          rax
+%define  retrd          eax
+
+%elifdef X86_32 ; X86_32 ;************************************
+
+BITS 32
+
+%define arg1 [esp + push_num*4 + 4]
+%define arg2 [esp + push_num*4 + 8]
+%define arg3 [esp + push_num*4 + 12]
+%define arg4 [esp + push_num*4 + 16]
+%define arg5 [esp + push_num*4 + 20]
+%define arg6 [esp + push_num*4 + 24]
+%define arg7 [esp + push_num*4 + 28]
+%define arg8 [esp + push_num*4 + 32]
+%define arg9 [esp + push_num*4 + 36]
+%define arg10 [esp + push_num*4 + 40]
+%define arg11 [esp + push_num*4 + 44]
+%define arg12 [esp + push_num*4 + 48]
+
+%define r0 eax
+%define r1 ecx
+%define r2 edx
+%define r3 ebx
+%define r4 esi
+%define r5 edi
+%define r6 ebp
+%define r7 esp
+
+%define r0d eax
+%define r1d ecx
+%define r2d edx
+%define r3d ebx
+%define r4d esi
+%define r5d edi
+%define r6d ebp
+
+%define r0w ax
+%define r1w cx
+%define r2w dx
+%define r3w bx
+
+%define r0b al
+%define r1b cl
+%define r2b dl
+%define r3b bl
+
+%define  PUSHRFLAGS     pushfd
+%define  POPRFLAGS      popfd
+%define  retrq          eax      ; 32 bit mode do not support 64 bits regesters
+%define  retrd          eax
+
+%endif
+
+%macro LOAD_PARA 2
+    mov %1, %2
+%endmacro
+
+%macro LOAD_1_PARA 0
+    %ifdef X86_32
+	mov r0, [esp + push_num*4 + 4]
+    %endif
+%endmacro
+
+%macro LOAD_2_PARA 0
+    %ifdef X86_32
+        mov r0, [esp + push_num*4 + 4]
+        mov r1, [esp + push_num*4 + 8]
+    %endif
+%endmacro
+
+%macro LOAD_3_PARA 0
+    %ifdef X86_32
+        mov r0, [esp + push_num*4 + 4]
+	mov r1, [esp + push_num*4 + 8]
+	mov r2, [esp + push_num*4 + 12]
+    %endif
+%endmacro
+
+%macro LOAD_4_PARA 0
+    %ifdef X86_32
+        push r3
+        %assign  push_num push_num+1
+        mov r0, [esp + push_num*4 + 4]
+        mov r1, [esp + push_num*4 + 8]
+        mov r2, [esp + push_num*4 + 12]
+        mov r3, [esp + push_num*4 + 16]
+    %endif
+%endmacro
+
+%macro LOAD_5_PARA 0
+    %ifdef X86_32
+        push r3
+        push r4
+        %assign  push_num push_num+2
+        mov r0, [esp + push_num*4 + 4]
+        mov r1, [esp + push_num*4 + 8]
+        mov r2, [esp + push_num*4 + 12]
+        mov r3, [esp + push_num*4 + 16]
+        mov r4, [esp + push_num*4 + 20]
+    %elifdef WIN64
+        mov r4, [rsp + push_num*8 + 40]
+    %endif
+%endmacro
+
+%macro LOAD_6_PARA 0
+    %ifdef X86_32
+	push r3
+        push r4
+        push r5
+        %assign  push_num push_num+3
+        mov r0, [esp + push_num*4 + 4]
+        mov r1, [esp + push_num*4 + 8]
+        mov r2, [esp + push_num*4 + 12]
+        mov r3, [esp + push_num*4 + 16]
+        mov r4, [esp + push_num*4 + 20]
+        mov r5, [esp + push_num*4 + 24]
+    %elifdef WIN64
+        mov r4, [rsp + push_num*8 + 40]
+        mov r5, [rsp + push_num*8 + 48]
+    %endif
+%endmacro
+
+%macro LOAD_7_PARA 0
+    %ifdef X86_32
+        push r3
+        push r4
+        push r5
+        push r6
+        %assign  push_num push_num+4
+        mov r0, [esp + push_num*4 + 4]
+        mov r1, [esp + push_num*4 + 8]
+        mov r2, [esp + push_num*4 + 12]
+        mov r3, [esp + push_num*4 + 16]
+        mov r4, [esp + push_num*4 + 20]
+        mov r5, [esp + push_num*4 + 24]
+        mov r6, [esp + push_num*4 + 28]
+    %elifdef WIN64
+        mov r4, [rsp + push_num*8 + 40]
+        mov r5, [rsp + push_num*8 + 48]
+        mov r6, [rsp + push_num*8 + 56]
+    %elifdef UNIX64
+        mov r6, [rsp + push_num*8 + 8]
+    %endif
+%endmacro
+
+
+
+%macro LOAD_4_PARA_POP 0
+    %ifdef X86_32
+	pop r3
+    %endif
+%endmacro
+
+%macro LOAD_5_PARA_POP 0
+    %ifdef X86_32
+        pop r4
+	pop r3
+    %endif
+%endmacro
+
+%macro LOAD_6_PARA_POP 0
+    %ifdef X86_32
+        pop r5
+  	pop r4
+ 	pop r3
+    %endif
+%endmacro
+
+%macro LOAD_7_PARA_POP 0
+    %ifdef X86_32
+        pop r6
+        pop r5
+        pop r4
+        pop r3
+    %endif
+%endmacro
+
+%macro PUSH_XMM 1
+    %ifdef WIN64
+        %assign xmm_num_regs %1
+        %if xmm_num_regs > 6
+            %ifdef push_num
+                %assign push_num push_num+2*(%1-6)
+            %endif
+            sub rsp, 16*(%1 - 6)
+            movdqu [rsp], xmm6
+        %endif
+        %if xmm_num_regs > 7
+            movdqu [rsp+16], xmm7
+        %endif
+        %if xmm_num_regs > 8
+            movdqu [rsp+32], xmm8
+        %endif
+        %if xmm_num_regs > 9
+            movdqu [rsp+48], xmm9
+        %endif
+        %if xmm_num_regs > 10
+            movdqu [rsp+64], xmm10
+        %endif
+        %if xmm_num_regs > 11
+            movdqu [rsp+80], xmm11
+        %endif
+        %if xmm_num_regs > 12
+            movdqu [rsp+96], xmm12
+        %endif
+        %if xmm_num_regs > 13
+            movdqu [rsp+112], xmm13
+        %endif
+        %if xmm_num_regs > 14
+            movdqu [rsp+128], xmm14
+        %endif
+        %if xmm_num_regs > 15
+            movdqu [rsp+144], xmm15
+        %endif
+    %endif
+%endmacro
+
+%macro POP_XMM 0
+    %ifdef WIN64
+        %if xmm_num_regs > 15
+            movdqu xmm15, [rsp+144]
+        %endif
+        %if xmm_num_regs > 14
+            movdqu xmm14, [rsp+128]
+        %endif
+        %if xmm_num_regs > 13
+            movdqu xmm13, [rsp+112]
+        %endif
+        %if xmm_num_regs > 12
+            movdqu xmm12, [rsp+96]
+        %endif
+        %if xmm_num_regs > 11
+            movdqu xmm11, [rsp+80]
+        %endif
+        %if xmm_num_regs > 10
+            movdqu xmm10, [rsp+64]
+        %endif
+        %if xmm_num_regs > 9
+            movdqu xmm9, [rsp+48]
+        %endif
+        %if xmm_num_regs > 8
+            movdqu xmm8, [rsp+32]
+        %endif
+        %if xmm_num_regs > 7
+            movdqu xmm7, [rsp+16]
+        %endif
+        %if xmm_num_regs > 6
+            movdqu xmm6, [rsp]
+            add rsp, 16*(xmm_num_regs - 6)
+        %endif
+    %endif
+%endmacro
+
+%macro SIGN_EXTENSION 2
+    %ifndef X86_32
+            movsxd %1, %2
+    %endif
+%endmacro
+
+%macro SIGN_EXTENSIONW 2
+    %ifndef X86_32
+            movsx %1, %2
+    %endif
+%endmacro
+
+%macro WELS_EXTERN 1
+    ALIGN 16
+    %ifdef PREFIX
+        global _%1
+        %define %1 _%1
+    %else
+        global %1
+    %endif
+    %1:
+%endmacro
+
+%macro WELS_AbsW 2
+	pxor        %2, %2
+    psubw       %2, %1
+    pmaxsw      %1, %2
+%endmacro
+
+%macro MMX_XSwap  4
+    movq		%4, %2
+    punpckh%1   %4, %3
+    punpckl%1   %2, %3
+%endmacro
+
+; pOut mm1, mm4, mm5, mm3
+%macro MMX_Trans4x4W 5
+    MMX_XSwap wd, %1, %2, %5
+    MMX_XSwap wd, %3, %4, %2
+    MMX_XSwap dq, %1, %3, %4
+    MMX_XSwap dq, %5, %2, %3
+%endmacro
+
+;for TRANSPOSE
+%macro SSE2_XSawp 4
+    movdqa      %4, %2
+    punpckl%1   %2, %3
+    punpckh%1   %4, %3
+%endmacro
+
+; in: xmm1, xmm2, xmm3, xmm4  pOut:  xmm1, xmm4, xmm5, mm3
+%macro SSE2_Trans4x4D 5
+    SSE2_XSawp dq,  %1, %2, %5
+    SSE2_XSawp dq,  %3, %4, %2
+    SSE2_XSawp qdq, %1, %3, %4
+    SSE2_XSawp qdq, %5, %2, %3
+%endmacro
+
+;in: xmm0, xmm1, xmm2, xmm3  pOut:  xmm0, xmm1, xmm3, xmm4
+%macro SSE2_TransTwo4x4W 5
+    SSE2_XSawp wd,  %1, %2, %5
+    SSE2_XSawp wd,  %3, %4, %2
+    SSE2_XSawp dq,  %1, %3, %4
+    SSE2_XSawp dq,  %5, %2, %3
+    SSE2_XSawp qdq, %1, %5, %2
+    SSE2_XSawp qdq, %4, %3, %5
+%endmacro
+
+;in:  m1, m2, m3, m4, m5, m6, m7, m8
+;pOut: m5, m3, m4, m8, m6, m2, m7, m1
+%macro SSE2_TransTwo8x8B 9
+	movdqa	%9,	%8
+	SSE2_XSawp bw,  %1, %2, %8
+	SSE2_XSawp bw,  %3, %4, %2
+	SSE2_XSawp bw,  %5, %6, %4
+	movdqa	%6, %9
+	movdqa	%9, %4
+	SSE2_XSawp bw,  %7, %6, %4
+
+	SSE2_XSawp wd,  %1, %3, %6
+	SSE2_XSawp wd,  %8, %2, %3
+	SSE2_XSawp wd,  %5, %7, %2
+	movdqa	%7, %9
+	movdqa	%9, %3
+	SSE2_XSawp wd,  %7, %4, %3
+
+	SSE2_XSawp dq,  %1, %5, %4
+	SSE2_XSawp dq,  %6, %2, %5
+	SSE2_XSawp dq,  %8, %7, %2
+	movdqa	%7, %9
+	movdqa	%9, %5
+	SSE2_XSawp dq,  %7, %3, %5
+
+	SSE2_XSawp qdq,  %1, %8, %3
+	SSE2_XSawp qdq,  %4, %2, %8
+	SSE2_XSawp qdq,  %6, %7, %2
+	movdqa	%7, %9
+	movdqa	%9, %1
+	SSE2_XSawp qdq,  %7, %5, %1
+	movdqa	%5, %9
+%endmacro
+
+;xmm0, xmm6, xmm7, [eax], [ecx]
+;xmm7 = 0, eax = pix1, ecx = pix2, xmm0 save the result
+%macro SSE2_LoadDiff8P 5
+    movq         %1, %4
+    punpcklbw    %1, %3
+    movq         %2, %5
+    punpcklbw    %2, %3
+    psubw        %1, %2
+%endmacro
+
+; m2 = m1 + m2, m1 = m1 - m2
+%macro SSE2_SumSub 3
+	movdqa  %3, %2
+    paddw   %2, %1
+    psubw   %1, %3
+%endmacro
+
+
+%macro butterfly_1to16_sse	3	; xmm? for dst, xmm? for tmp, one byte for pSrc [generic register name: a/b/c/d]
+	mov %3h, %3l
+	movd %1, e%3x		; i.e, 1% = eax (=b0)
+	pshuflw %2, %1, 00h	; ..., b0 b0 b0 b0 b0 b0 b0 b0
+	pshufd %1, %2, 00h	; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0
+%endmacro
+
+;copy a dw into a xmm for 8 times
+%macro  SSE2_Copy8Times 2
+		movd	%1, %2
+		punpcklwd %1, %1
+		pshufd	%1,	%1,	0
+%endmacro
+
+;copy a db into a xmm for 16 times
+%macro  SSE2_Copy16Times 2
+		movd		%1, %2
+		pshuflw		%1, %1, 0
+		punpcklqdq	%1, %1
+		packuswb	%1,	%1
+%endmacro
+
+
+
+;***********************************************************************
+;preprocessor constants
+;***********************************************************************
+;dw 32,32,32,32,32,32,32,32 for xmm
+;dw 32,32,32,32 for mm
+%macro WELS_DW32 1
+	pcmpeqw %1,%1
+	psrlw %1,15
+	psllw %1,5
+%endmacro
+
+;dw 1, 1, 1, 1, 1, 1, 1, 1 for xmm
+;dw 1, 1, 1, 1 for mm
+%macro WELS_DW1 1
+	pcmpeqw %1,%1
+	psrlw %1,15
+%endmacro
+
+;all 0 for xmm and mm
+%macro	WELS_Zero 1
+	pxor %1, %1
+%endmacro
+
+;dd 1, 1, 1, 1 for xmm
+;dd 1, 1 for mm
+%macro WELS_DD1 1
+	pcmpeqw %1,%1
+	psrld %1,31
+%endmacro
+
+;dB 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
+%macro WELS_DB1 1
+	pcmpeqw %1,%1
+	psrlw %1,15
+	packuswb %1,%1
+%endmacro
+
+
+
+
+
+
--- /dev/null
+++ b/codec/common/x86/cpuid.asm
@@ -1,0 +1,212 @@
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*	cpu_mmx.asm
+;*
+;*  Abstract
+;*		verify cpuid feature support and cpuid detection
+;*
+;*  History
+;*      04/29/2009	Created
+;*
+;*************************************************************************/
+
+%include "asm_inc.asm"
+
+;******************************************************************************************
+; Macros
+;******************************************************************************************
+
+
+;******************************************************************************************
+; Code
+;******************************************************************************************
+
+SECTION .text
+
+; refer to "The IA-32 Intel(R) Architecture Software Developers Manual, Volume 2A A-M"
+; section CPUID - CPU Identification
+
+;******************************************************************************************
+;   int32_t WelsCPUIdVerify()
+;******************************************************************************************
+WELS_EXTERN WelsCPUIdVerify
+    push    r1
+    PUSHRFLAGS
+    PUSHRFLAGS
+
+    pop      r1
+    mov      eax, r1d
+    xor      eax, 00200000h
+    xor      eax, r1d
+    POPRFLAGS
+    pop      r1
+    ret
+
+;****************************************************************************************************
+;   void WelsCPUId( int32_t uiIndex, int32_t *pFeatureA, int32_t *pFeatureB, int32_t *pFeatureC, int32_t *pFeatureD )
+;****************************************************************************************************
+%ifdef       WIN64
+
+WELS_EXTERN WelsCPUId
+    push     rbx
+    push     rdx
+
+    mov      eax,     ecx
+    mov      rcx,     [r9]
+    cpuid
+    mov      [r9],    ecx
+    mov      [r8],    ebx
+    mov      rcx,    [rsp + 2*8 + 40]
+    mov      [rcx],   edx
+    pop      rdx
+    mov      [rdx],   eax
+
+    pop      rbx
+    ret
+
+%elifdef     UNIX64
+WELS_EXTERN WelsCPUId
+    push     rbx
+    push     rcx
+    push     rdx
+
+    mov      eax,     edi
+    mov      rcx,     [rcx]
+    cpuid
+    mov      [r8],    edx
+    pop      rdx
+    pop      r8
+    mov      [r8],   ecx
+    mov      [rdx],   ebx
+    mov      [rsi],   eax
+
+    pop      rbx
+    ret
+
+%elifdef     X86_32
+
+WELS_EXTERN WelsCPUId
+    push	ebx
+    push	edi
+
+    mov     eax, [esp+12]	; operating index
+    mov     edi, [esp+24]
+    mov     ecx, [edi]
+    cpuid					; cpuid
+
+    ; processing various information return
+    mov     edi, [esp+16]
+    mov     [edi], eax
+    mov     edi, [esp+20]
+    mov     [edi], ebx
+    mov     edi, [esp+24]
+    mov     [edi], ecx
+    mov     edi, [esp+28]
+    mov     [edi], edx
+
+    pop	    edi
+    pop     ebx
+    ret
+
+%endif
+
+; need call after cpuid=1 and eax, ecx flag got then
+;****************************************************************************************************
+;   int32_t WelsCPUSupportAVX( uint32_t eax, uint32_t ecx )
+;****************************************************************************************************
+WELS_EXTERN WelsCPUSupportAVX
+%ifdef     WIN64
+        mov   eax,    ecx
+        mov   ecx,    edx
+%elifdef   UNIX64
+        mov eax, edi
+        mov ecx, esi
+%else
+        mov eax, [esp+4]
+        mov ecx, [esp+8]
+%endif
+
+        ; refer to detection of AVX addressed in INTEL AVX manual document
+        and ecx, 018000000H
+        cmp ecx, 018000000H             ; check both OSXSAVE and AVX feature flags
+        jne avx_not_supported
+        ; processor supports AVX instructions and XGETBV is enabled by OS
+        mov ecx, 0                              ; specify 0 for XFEATURE_ENABLED_MASK register
+        XGETBV                                  ; result in EDX:EAX
+        and eax, 06H
+        cmp eax, 06H                    ; check OS has enabled both XMM and YMM state support
+        jne avx_not_supported
+        mov eax, 1
+        ret
+avx_not_supported:
+        mov eax, 0
+        ret
+
+
+; need call after cpuid=1 and eax, ecx flag got then
+;****************************************************************************************************
+;   int32_t WelsCPUSupportFMA( uint32_t eax, uint32_t ecx )
+;****************************************************************************************************
+WELS_EXTERN  WelsCPUSupportFMA
+%ifdef     WIN64
+        mov   eax,   ecx
+        mov   ecx,   edx
+%elifdef   UNIX64
+        mov   eax,   edi
+        mov   ecx,   esi
+%else
+	mov eax, [esp+4]
+	mov ecx, [esp+8]
+%endif
+	; refer to detection of FMA addressed in INTEL AVX manual document
+	and ecx, 018001000H
+	cmp ecx, 018001000H		; check OSXSAVE, AVX, FMA feature flags
+	jne fma_not_supported
+	; processor supports AVX,FMA instructions and XGETBV is enabled by OS
+	mov ecx, 0				; specify 0 for XFEATURE_ENABLED_MASK register
+	XGETBV					; result in EDX:EAX
+	and eax, 06H
+	cmp eax, 06H			; check OS has enabled both XMM and YMM state support
+	jne fma_not_supported
+	mov eax, 1
+	ret
+fma_not_supported:
+	mov eax, 0
+	ret
+
+;******************************************************************************************
+;   void WelsEmms()
+;******************************************************************************************
+WELS_EXTERN WelsEmms
+	emms	; empty mmx technology states
+	ret
+
--- /dev/null
+++ b/codec/common/x86/deblock.asm
@@ -1,0 +1,5278 @@
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  deblock.asm
+;*
+;*  Abstract
+;*      edge loop
+;*
+;*  History
+;*      08/07/2009 Created
+;*
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+
+;*******************************************************************************
+; Macros and other preprocessor constants
+;*******************************************************************************
+
+SECTION .rodata align=16
+
+ALIGN   16
+FOUR_16B_SSE2:   dw   4, 4, 4, 4, 4, 4, 4, 4
+
+
+SECTION .text
+
+%ifdef  WIN64
+
+
+WELS_EXTERN   DeblockLumaLt4V_ssse3
+  push        rbp
+  mov         r11,[rsp + 16 + 20h]  ; pTC
+  PUSH_XMM 16
+  sub         rsp,1B0h
+  lea         rbp,[rsp+20h]
+  movd        xmm4,r8d
+  movd        xmm2,r9d
+  mov         qword [rbp+180h],r12
+  mov         r10,rcx
+  movsxd      r12,edx
+  add         edx,edx
+  movsxd      rdx,edx
+  sub         r10,r12
+  movsx       r8d,byte [r11]
+  pxor        xmm3,xmm3
+  punpcklwd   xmm2,xmm2
+  movaps      [rbp+50h],xmm14
+  lea         rax,[r12+r12*2]
+  movdqa      xmm14,[rdx+rcx]
+  neg         rax
+  pshufd      xmm0,xmm2,0
+  movd        xmm2,r8d
+  movsx       edx,byte [r11+1]
+  movsx       r8d,byte [r11+2]
+  movsx       r11d,byte [r11+3]
+  movaps      [rbp+70h],xmm12
+  movd        xmm1,edx
+  movaps      [rbp+80h],xmm11
+  movd        xmm12,r8d
+  movd        xmm11,r11d
+  movdqa      xmm5, [rax+rcx]
+  lea         rax,[r12+r12]
+  punpcklwd   xmm12,xmm12
+  neg         rax
+  punpcklwd   xmm11,xmm11
+  movaps      [rbp],xmm8
+  movdqa      xmm8, [r10]
+  punpcklwd   xmm2,xmm2
+  punpcklwd   xmm1,xmm1
+  punpcklqdq  xmm12,xmm12
+  punpcklqdq  xmm11,xmm11
+  punpcklqdq  xmm2,xmm2
+  punpcklqdq  xmm1,xmm1
+  shufps      xmm12,xmm11,88h
+  movdqa      xmm11,xmm8
+  movaps      [rbp+30h],xmm9
+  movdqa      xmm9,[rcx]
+  shufps      xmm2,xmm1,88h
+  movdqa      xmm1,xmm5
+  punpcklbw   xmm11,xmm3
+  movaps      [rbp+20h],xmm6
+  movaps      [rbp+60h],xmm13
+  movdqa      xmm13,xmm11
+  movaps      [rbp+90h],xmm10
+  movdqa      xmm10,xmm9
+  movdqa      xmm6,[rax+rcx]
+  punpcklbw   xmm1,xmm3
+  movaps      [rbp+0A0h],xmm12
+  psubw       xmm13,xmm1
+  movaps      [rbp+40h],xmm15
+  movdqa      xmm15,xmm14
+  movaps      [rbp+10h],xmm7
+  movdqa      xmm7,xmm6
+  punpcklbw   xmm10,xmm3
+  movdqa      xmm12,[r12+rcx]
+  punpcklbw   xmm7,xmm3
+  punpcklbw   xmm12,xmm3
+  punpcklbw   xmm15,xmm3
+  pabsw       xmm3,xmm13
+  movdqa      xmm13,xmm10
+  psubw       xmm13,xmm15
+  movdqa      [rbp+0F0h],xmm15
+  pabsw       xmm15,xmm13
+  movdqa      xmm13,xmm11
+  movdqa      [rbp+0B0h],xmm1
+  movdqa      xmm1,xmm0
+  pavgw       xmm13,xmm10
+  pcmpgtw     xmm1,xmm3
+  movdqa      [rbp+120h],xmm13
+  movaps      xmm13,xmm2
+  punpcklwd   xmm4,xmm4
+  movdqa      xmm3,xmm0
+  movdqa      [rbp+100h],xmm1
+  psubw       xmm13,xmm1
+  movdqa      xmm1,xmm10
+  pcmpgtw     xmm3,xmm15
+  pshufd      xmm4,xmm4,0
+  psubw       xmm1,xmm11
+  movdqa      [rbp+0D0h],xmm10
+  psubw       xmm13,xmm3
+  movdqa      [rbp+110h],xmm3
+  pabsw       xmm15,xmm1
+  movdqa      xmm3,xmm4
+  psubw       xmm10,xmm12
+  pcmpgtw     xmm3,xmm15
+  pabsw       xmm15,xmm10
+  movdqa      xmm10,xmm0
+  psllw       xmm1,2
+  movdqa      [rbp+0C0h],xmm11
+  psubw       xmm11,xmm7
+  pcmpgtw     xmm10,xmm15
+  pabsw       xmm11,xmm11
+  movdqa      xmm15,xmm0
+  pand        xmm3,xmm10
+  pcmpgtw     xmm15,xmm11
+  movaps      xmm11,xmm2
+  pxor        xmm10,xmm10
+  pand        xmm3,xmm15
+  pcmpgtw     xmm11,xmm10
+  pcmpeqw     xmm10,xmm2
+  por         xmm11,xmm10
+  pand        xmm3,xmm11
+  movdqa      xmm11,xmm7
+  psubw       xmm11,xmm12
+  pxor        xmm15,xmm15
+  paddw       xmm11,xmm1
+  psubw       xmm15,xmm13
+  movdqa      [rbp+0E0h],xmm12
+  paddw       xmm11,[FOUR_16B_SSE2]
+  pxor        xmm12,xmm12
+  psraw       xmm11,3
+  punpckhbw   xmm8,xmm12
+  pmaxsw      xmm15,xmm11
+  punpckhbw   xmm5,xmm12
+  movdqa      xmm11,xmm8
+  pminsw      xmm13,xmm15
+  psubw       xmm11,xmm5
+  punpckhbw   xmm9,xmm12
+  pand        xmm13,xmm3
+  movdqa      [rbp+130h],xmm13
+  pabsw       xmm13,xmm11
+  punpckhbw   xmm14,xmm12
+  movdqa      xmm11,xmm9
+  psubw       xmm11,xmm14
+  movdqa      xmm15,xmm0
+  movdqa      [rbp+140h],xmm14
+  pabsw       xmm14,xmm11
+  movdqa      xmm11,xmm8
+  pcmpgtw     xmm15,xmm14
+  movdqa      xmm1,[r12+rcx]
+  pavgw       xmm11,xmm9
+  movdqa      [rbp+170h],xmm11
+  movdqa      xmm10,xmm9
+  punpckhbw   xmm6,xmm12
+  psubw       xmm10,xmm8
+  punpckhbw   xmm1,xmm12
+  movdqa      xmm12,xmm0
+  movaps      xmm11,[rbp+0A0h]
+  pcmpgtw     xmm12,xmm13
+  movaps      xmm13,xmm11
+  psubw       xmm13,xmm12
+  movdqa      [rbp+160h],xmm15
+  psubw       xmm13,xmm15
+  movdqa      xmm15,xmm9
+  psubw       xmm15,xmm1
+  movdqa      [rbp+150h],xmm12
+  pabsw       xmm12,xmm10
+  pabsw       xmm14,xmm15
+  movdqa      xmm15,xmm8
+  pcmpgtw     xmm4,xmm12
+  movdqa      xmm12,xmm0
+  psubw       xmm15,xmm6
+  pcmpgtw     xmm12,xmm14
+  pabsw       xmm14,xmm15
+  psllw       xmm10,2
+  pcmpgtw     xmm0,xmm14
+  movdqa      xmm14,xmm6
+  psubw       xmm14,xmm1
+  pand        xmm4,xmm12
+  paddw       xmm14,xmm10
+  pand        xmm4,xmm0
+  paddw       xmm14,[FOUR_16B_SSE2]
+  pxor        xmm15,xmm15
+  movaps      xmm12,xmm11
+  psubw       xmm15,xmm13
+  pxor        xmm0,xmm0
+  psraw       xmm14,3
+  pcmpgtw     xmm12,xmm0
+  pcmpeqw     xmm0,xmm11
+  pmaxsw      xmm15,xmm14
+  por         xmm12,xmm0
+  movdqa      xmm0,[rbp+120h]
+  pminsw      xmm13,xmm15
+  movdqa      xmm15,[rbp+0B0h]
+  movdqa      xmm10,xmm7
+  pand        xmm4,xmm12
+  paddw       xmm15,xmm0
+  pxor        xmm12,xmm12
+  paddw       xmm10,xmm7
+  movdqa      xmm14,xmm12
+  psubw       xmm15,xmm10
+  psubw       xmm14,xmm2
+  psraw       xmm15,1
+  pmaxsw      xmm15,xmm14
+  movdqa      xmm10,xmm6
+  pminsw      xmm15,xmm2
+  paddw       xmm10,xmm6
+  pand        xmm15,xmm3
+  psubw       xmm12,xmm11
+  pand        xmm15,[rbp+100h]
+  pand        xmm13,xmm4
+  paddw       xmm7,xmm15
+  paddw       xmm8,xmm13
+  movdqa      xmm15,[rbp+170h]
+  psubw       xmm9,xmm13
+  paddw       xmm5,xmm15
+  psubw       xmm5,xmm10
+  psraw       xmm5,1
+  pmaxsw      xmm5,xmm12
+  pminsw      xmm5,xmm11
+  pand        xmm5,xmm4
+  pand        xmm5,[rbp+150h]
+  paddw       xmm6,xmm5
+  movdqa      xmm5,[rbp+0C0h]
+  packuswb    xmm7,xmm6
+  movdqa      xmm6,[rbp+130h]
+  paddw       xmm5,xmm6
+  packuswb    xmm5,xmm8
+  movdqa      xmm8,[rbp+0D0h]
+  psubw       xmm8,xmm6
+  movdqa      xmm6,[rbp+0F0h]
+  paddw       xmm6,xmm0
+  movdqa      xmm0,[rbp+0E0h]
+  packuswb    xmm8,xmm9
+  movdqa      xmm9,xmm0
+  paddw       xmm9,xmm0
+  psubw       xmm6,xmm9
+  psraw       xmm6,1
+  pmaxsw      xmm14,xmm6
+  pminsw      xmm2,xmm14
+  pand        xmm2,xmm3
+  pand        xmm2,[rbp+110h]
+  paddw       xmm0,xmm2
+  movdqa      xmm2,[rbp+140h]
+  paddw       xmm2,xmm15
+  movdqa      xmm15,xmm1
+  paddw       xmm15,xmm1
+  psubw       xmm2,xmm15
+  psraw       xmm2,1
+  pmaxsw      xmm12,xmm2
+  pminsw      xmm11,xmm12
+  pand        xmm11,xmm4
+  pand        xmm11,[rbp+160h]
+  paddw       xmm1,xmm11
+  movdqa      [rax+rcx],xmm7
+  movdqa      [r10],xmm5
+  packuswb    xmm0,xmm1
+  movdqa      [rcx],xmm8
+  movdqa      [r12+rcx],xmm0
+  mov         r12,qword [rbp+180h]
+  lea         rsp,[rbp+190h]
+  POP_XMM
+  pop         rbp
+  ret
+
+
+WELS_EXTERN   DeblockLumaEq4V_ssse3
+  mov         rax,rsp
+  push        rbx
+  push        rbp
+  push        rsi
+  push        rdi
+  sub         rsp,1D8h
+  movaps      [rax-38h],xmm6
+  movaps      [rax-48h],xmm7
+  movaps      [rax-58h],xmm8
+  pxor        xmm1,xmm1
+  movsxd      r10,edx
+  mov         rbp,rcx
+  mov         r11d,r8d
+  mov         rdx,rcx
+  mov         rdi,rbp
+  mov         rbx,rbp
+  movdqa      xmm5,[rbp]
+  movaps      [rax-68h],xmm9
+  movaps      [rax-78h],xmm10
+  punpcklbw   xmm5,xmm1
+  movaps      [rax-88h],xmm11
+  movaps      [rax-98h],xmm12
+  movaps      [rax-0A8h],xmm13
+  movaps      [rax-0B8h],xmm14
+  movdqa      xmm14,[r10+rbp]
+  movaps      [rax-0C8h],xmm15
+  lea         eax,[r10*4]
+  movsxd      r8,eax
+  lea         eax,[r10+r10*2]
+  movsxd      rcx,eax
+  lea         eax,[r10+r10]
+  sub         rdx,r8
+  punpcklbw   xmm14,xmm1
+  movdqa      [rsp+90h],xmm5
+  movdqa      [rsp+30h],xmm14
+  movsxd      rsi,eax
+  movsx       eax,r11w
+  sub         rdi,rcx
+  sub         rbx,rsi
+  mov         r8,rbp
+  sub         r8,r10
+  movd        xmm0,eax
+  movsx       eax,r9w
+  movdqa      xmm12,[rdi]
+  movdqa      xmm6, [rsi+rbp]
+  movdqa      xmm13,[rbx]
+  punpcklwd   xmm0,xmm0
+  pshufd      xmm11,xmm0,0
+  punpcklbw   xmm13,xmm1
+  punpcklbw   xmm6,xmm1
+  movdqa      xmm8,[r8]
+  movd        xmm0,eax
+  movdqa      xmm10,xmm11
+  mov         eax,2
+  punpcklbw   xmm8,xmm1
+  punpcklbw   xmm12,xmm1
+  cwde
+  punpcklwd   xmm0,xmm0
+  psraw       xmm10,2
+  movdqa      xmm1,xmm8
+  movdqa      [rsp+0F0h],xmm13
+  movdqa      [rsp+0B0h],xmm8
+  pshufd      xmm7,xmm0,0
+  psubw       xmm1,xmm13
+  movdqa      xmm0,xmm5
+  movdqa      xmm4,xmm7
+  movdqa      xmm2,xmm7
+  psubw       xmm0,xmm8
+  pabsw       xmm3,xmm0
+  pabsw       xmm0,xmm1
+  movdqa      xmm1,xmm5
+  movdqa      [rsp+40h],xmm7
+  movdqa      [rsp+60h],xmm6
+  pcmpgtw     xmm4,xmm0
+  psubw       xmm1,xmm14
+  pabsw       xmm0,xmm1
+  pcmpgtw     xmm2,xmm0
+  pand        xmm4,xmm2
+  movdqa      xmm0,xmm11
+  pcmpgtw     xmm0,xmm3
+  pand        xmm4,xmm0
+  movd        xmm0,eax
+  movdqa      [rsp+20h],xmm4
+  punpcklwd   xmm0,xmm0
+  pshufd      xmm2,xmm0,0
+  paddw       xmm10,xmm2
+  movdqa      [rsp+0A0h],xmm2
+  movdqa      xmm15,xmm7
+  pxor        xmm4,xmm4
+  movdqa      xmm0,xmm8
+  psubw       xmm0,xmm12
+  mov         eax,4
+  pabsw       xmm0,xmm0
+  movdqa      xmm1,xmm10
+  cwde
+  pcmpgtw     xmm15,xmm0
+  pcmpgtw     xmm1,xmm3
+  movdqa      xmm3,xmm7
+  movdqa      xmm7,[rdx]
+  movdqa      xmm0,xmm5
+  psubw       xmm0,xmm6
+  pand        xmm15,xmm1
+  punpcklbw   xmm7,xmm4
+  movdqa      xmm9,xmm15
+  pabsw       xmm0,xmm0
+  psllw       xmm7,1
+  pandn       xmm9,xmm12
+  pcmpgtw     xmm3,xmm0
+  paddw       xmm7,xmm12
+  movd        xmm0,eax
+  pand        xmm3,xmm1
+  paddw       xmm7,xmm12
+  punpcklwd   xmm0,xmm0
+  paddw       xmm7,xmm12
+  pshufd      xmm1,xmm0,0
+  paddw       xmm7,xmm13
+  movdqa      xmm0,xmm3
+  pandn       xmm0,xmm6
+  paddw       xmm7,xmm8
+  movdqa      [rsp+70h],xmm1
+  paddw       xmm7,xmm5
+  movdqa      [rsp+120h],xmm0
+  movdqa      xmm0,[rcx+rbp]
+  punpcklbw   xmm0,xmm4
+  paddw       xmm7,xmm1
+  movdqa      xmm4,xmm15
+  psllw       xmm0,1
+  psraw       xmm7,3
+  paddw       xmm0,xmm6
+  pand        xmm7,xmm15
+  paddw       xmm0,xmm6
+  paddw       xmm0,xmm6
+  paddw       xmm0,xmm14
+  movdqa      xmm6,xmm15
+  paddw       xmm0,xmm5
+  pandn       xmm6,xmm13
+  paddw       xmm0,xmm8
+  paddw       xmm0,xmm1
+  psraw       xmm0,3
+  movdqa      xmm1,xmm12
+  paddw       xmm1,xmm13
+  pand        xmm0,xmm3
+  movdqa      [rsp+100h],xmm0
+  movdqa      xmm0,xmm8
+  paddw       xmm0,xmm5
+  paddw       xmm1,xmm0
+  movdqa      xmm0,xmm3
+  paddw       xmm1,xmm2
+  psraw       xmm1,2
+  pandn       xmm0,xmm14
+  pand        xmm4,xmm1
+  movdqa      [rsp+0E0h],xmm0
+  movdqa      xmm0,xmm5
+  paddw       xmm0,xmm8
+  movdqa      xmm1,[rsp+60h]
+  paddw       xmm1,xmm14
+  movdqa      xmm14,xmm3
+  paddw       xmm1,xmm0
+  movdqa      xmm0,xmm8
+  paddw       xmm0,[rsp+30h]
+  paddw       xmm1,xmm2
+  psraw       xmm1,2
+  pand        xmm14,xmm1
+  movdqa      xmm1,xmm13
+  paddw       xmm1,xmm13
+  paddw       xmm1,xmm0
+  paddw       xmm1,xmm2
+  psraw       xmm1,2
+  movdqa      xmm0,[rsp+30h]
+  movdqa      xmm2,xmm13
+  movdqa      xmm5,xmm15
+  paddw       xmm0,[rsp+70h]
+  pandn       xmm5,xmm1
+  paddw       xmm2,xmm8
+  movdqa      xmm8,[rsp+90h]
+  movdqa      xmm1,xmm12
+  paddw       xmm2,xmm8
+  psllw       xmm2,1
+  paddw       xmm2,xmm0
+  paddw       xmm1,xmm2
+  movdqa      xmm0,xmm8
+  movdqa      xmm8,xmm3
+  movdqa      xmm2,[rsp+30h]
+  paddw       xmm0,xmm13
+  psraw       xmm1,3
+  pand        xmm15,xmm1
+  movdqa      xmm1,xmm2
+  paddw       xmm1,xmm2
+  paddw       xmm2,[rsp+90h]
+  paddw       xmm2,[rsp+0B0h]
+  paddw       xmm1,xmm0
+  movdqa      xmm0,xmm13
+  movdqa      xmm13,[r8]
+  paddw       xmm0, [rsp+70h]
+  paddw       xmm1, [rsp+0A0h]
+  psllw       xmm2,1
+  paddw       xmm2,xmm0
+  psraw       xmm1,2
+  movdqa      xmm0, [rdi]
+  pandn       xmm8,xmm1
+  movdqa      xmm1, [rsp+60h]
+  paddw       xmm1,xmm2
+  movdqa      xmm2, [rbx]
+  psraw       xmm1,3
+  pand        xmm3,xmm1
+  movdqa      xmm1, [rbp]
+  movdqa      [rsp+0D0h],xmm3
+  pxor        xmm3,xmm3
+  punpckhbw   xmm0,xmm3
+  punpckhbw   xmm1,xmm3
+  punpckhbw   xmm13,xmm3
+  movdqa      [rsp+0C0h],xmm0
+  movdqa      xmm0,[r10+rbp]
+  movdqa      [rsp],xmm1
+  punpckhbw   xmm0,xmm3
+  punpckhbw   xmm2,xmm3
+  movdqa      [rsp+80h],xmm0
+  movdqa      xmm0,[rsi+rbp]
+  movdqa      [rsp+10h],xmm13
+  punpckhbw   xmm0,xmm3
+  movdqa      [rsp+50h],xmm0
+  movdqa      xmm0,xmm1
+  movdqa      xmm1,xmm13
+  psubw       xmm0,xmm13
+  psubw       xmm1,xmm2
+  pabsw       xmm3,xmm0
+  pabsw       xmm0,xmm1
+  movdqa      xmm1,[rsp]
+  movdqa      xmm13,[rsp+40h]
+  movdqa      [rsp+110h],xmm2
+  psubw       xmm1, [rsp+80h]
+  pcmpgtw     xmm13,xmm0
+  pcmpgtw     xmm11,xmm3
+  pabsw       xmm0,xmm1
+  pcmpgtw     xmm10,xmm3
+  movdqa      xmm1, [rsp+40h]
+  movdqa      xmm2,xmm1
+  movdqa      xmm3,xmm1
+  pcmpgtw     xmm2,xmm0
+  movdqa      xmm0, [rsp+10h]
+  pand        xmm13,xmm2
+  pand        xmm13,xmm11
+  movdqa      xmm11,[rsp+0C0h]
+  psubw       xmm0,xmm11
+  pabsw       xmm0,xmm0
+  pcmpgtw     xmm3,xmm0
+  pand        xmm3,xmm10
+  movdqa      xmm0,[rsp]
+  psubw       xmm0,[rsp+50h]
+  movdqa      xmm2,[rdx]
+  pabsw       xmm0,xmm0
+  por         xmm7,xmm9
+  movdqa      xmm9,[rsp+20h]
+  pcmpgtw     xmm1,xmm0
+  pand        xmm9,xmm7
+  movdqa      xmm7,[rsp+20h]
+  movdqa      xmm0,xmm7
+  pandn       xmm0,xmm12
+  movdqa      xmm12,[rsp+110h]
+  pand        xmm1,xmm10
+  movdqa      xmm10,[rsp+70h]
+  movdqa      [rsp+40h],xmm1
+  movdqa      xmm1,xmm13
+  por         xmm9,xmm0
+  pxor        xmm0,xmm0
+  por         xmm4,xmm6
+  movdqa      xmm6,xmm7
+  punpckhbw   xmm2,xmm0
+  por         xmm15,xmm5
+  movdqa      xmm5,[rsp+20h]
+  movdqa      xmm0,xmm3
+  psllw       xmm2,1
+  pandn       xmm0,xmm11
+  pand        xmm6,xmm4
+  movdqa      xmm4,[rsp]
+  paddw       xmm2,xmm11
+  pand        xmm5,xmm15
+  movdqa      xmm15,[rsp+20h]
+  paddw       xmm2,xmm11
+  paddw       xmm2,xmm11
+  paddw       xmm2,xmm12
+  paddw       xmm2,[rsp+10h]
+  paddw       xmm2,[rsp]
+  paddw       xmm2,xmm10
+  psraw       xmm2,3
+  pand        xmm2,xmm3
+  por         xmm2,xmm0
+  pand        xmm1,xmm2
+  movdqa      xmm0,xmm13
+  movdqa      xmm2,xmm11
+  pandn       xmm0,xmm11
+  paddw       xmm2,xmm12
+  por         xmm1,xmm0
+  packuswb    xmm9,xmm1
+  movdqa      xmm0,xmm7
+  movdqa      xmm7,[rsp+0A0h]
+  pandn       xmm0,[rsp+0F0h]
+  movdqa      xmm1,xmm3
+  por         xmm6,xmm0
+  movdqa      xmm0,[rsp+10h]
+  paddw       xmm0,xmm4
+  paddw       xmm2,xmm0
+  paddw       xmm2,xmm7
+  movdqa      xmm0,xmm3
+  pandn       xmm0,xmm12
+  psraw       xmm2,2
+  pand        xmm1,xmm2
+  por         xmm1,xmm0
+  movdqa      xmm2,xmm13
+  movdqa      xmm0,xmm13
+  pand        xmm2,xmm1
+  pandn       xmm0,xmm12
+  movdqa      xmm1,xmm12
+  paddw       xmm1,[rsp+10h]
+  por         xmm2,xmm0
+  movdqa      xmm0,xmm15
+  pandn       xmm0,[rsp+0B0h]
+  paddw       xmm1,xmm4
+  packuswb    xmm6,xmm2
+  movdqa      xmm2,xmm3
+  psllw       xmm1,1
+  por         xmm5,xmm0
+  movdqa      xmm0,[rsp+80h]
+  paddw       xmm0,xmm10
+  paddw       xmm1,xmm0
+  paddw       xmm11,xmm1
+  psraw       xmm11,3
+  movdqa      xmm1,xmm12
+  pand        xmm2,xmm11
+  paddw       xmm1,xmm12
+  movdqa      xmm11,[rsp+80h]
+  movdqa      xmm0, [rsp+10h]
+  por         xmm14,[rsp+0E0h]
+  paddw       xmm0,xmm11
+  movdqa      xmm4,xmm15
+  paddw       xmm1,xmm0
+  movdqa      xmm0,xmm13
+  paddw       xmm1,xmm7
+  psraw       xmm1,2
+  pandn       xmm3,xmm1
+  por         xmm2,xmm3
+  movdqa      xmm1,xmm13
+  movdqa      xmm3,[rsp+10h]
+  pandn       xmm0,xmm3
+  pand        xmm1,xmm2
+  movdqa      xmm2,xmm11
+  paddw       xmm2,[rsp]
+  por         xmm1,xmm0
+  movdqa      xmm0,[rsp+0D0h]
+  por         xmm0,xmm8
+  paddw       xmm2,xmm3
+  packuswb    xmm5,xmm1
+  movdqa      xmm8,[rsp+40h]
+  movdqa      xmm1,[rsp+50h]
+  movdqa      xmm3,xmm8
+  pand        xmm4,xmm0
+  psllw       xmm2,1
+  movdqa      xmm0,xmm15
+  pandn       xmm0,[rsp+90h]
+  por         xmm4,xmm0
+  movdqa      xmm0,xmm12
+  paddw       xmm0,xmm10
+  paddw       xmm2,xmm0
+  paddw       xmm1,xmm2
+  movdqa      xmm0,[rsp]
+  movdqa      xmm2,xmm11
+  paddw       xmm0,xmm12
+  movdqa      xmm12,[rsp]
+  paddw       xmm2,xmm11
+  paddw       xmm2,xmm0
+  psraw       xmm1,3
+  movdqa      xmm0,xmm8
+  pand        xmm3,xmm1
+  paddw       xmm2,xmm7
+  movdqa      xmm1,xmm13
+  psraw       xmm2,2
+  pandn       xmm0,xmm2
+  por         xmm3,xmm0
+  movdqa      xmm2,[rsp+50h]
+  movdqa      xmm0,xmm13
+  pandn       xmm0,xmm12
+  pand        xmm1,xmm3
+  paddw       xmm2,xmm11
+  movdqa      xmm3,xmm15
+  por         xmm1,xmm0
+  pand        xmm3,xmm14
+  movdqa      xmm14,[rsp+10h]
+  movdqa      xmm0,xmm15
+  pandn       xmm0,[rsp+30h]
+  packuswb    xmm4,xmm1
+  movdqa      xmm1,xmm8
+  por         xmm3,xmm0
+  movdqa      xmm0,xmm12
+  paddw       xmm0,xmm14
+  paddw       xmm2,xmm0
+  paddw       xmm2,xmm7
+  movdqa      xmm0,xmm8
+  pandn       xmm0,xmm11
+  psraw       xmm2,2
+  pand        xmm1,xmm2
+  por         xmm1,xmm0
+  movdqa      xmm2,xmm13
+  movdqa      xmm0,xmm13
+  pandn       xmm0,xmm11
+  pand        xmm2,xmm1
+  movdqa      xmm1,xmm15
+  por         xmm2,xmm0
+  packuswb    xmm3,xmm2
+  movdqa      xmm0,[rsp+100h]
+  por         xmm0,[rsp+120h]
+  pand        xmm1,xmm0
+  movdqa      xmm2,[rcx+rbp]
+  movdqa      xmm7,[rsp+50h]
+  pandn       xmm15,[rsp+60h]
+  lea         r11,[rsp+1D8h]
+  pxor        xmm0,xmm0
+  por         xmm1,xmm15
+  movaps      xmm15,[r11-0A8h]
+  movdqa      [rdi],xmm9
+  movaps      xmm9,[r11-48h]
+  punpckhbw   xmm2,xmm0
+  psllw       xmm2,1
+  paddw       xmm2,xmm7
+  paddw       xmm2,xmm7
+  movdqa      [rbx],xmm6
+  movaps      xmm6,[r11-18h]
+  paddw       xmm2,xmm7
+  paddw       xmm2,xmm11
+  movaps      xmm11,[r11-68h]
+  paddw       xmm2,xmm12
+  movaps      xmm12,[r11-78h]
+  paddw       xmm2,xmm14
+  paddw       xmm2,xmm10
+  psraw       xmm2,3
+  movaps      xmm10,[r11-58h]
+  movaps      xmm14,[r11-98h]
+  movdqa      xmm0,xmm13
+  pand        xmm2,xmm8
+  pandn       xmm8,xmm7
+  pandn       xmm13,xmm7
+  por         xmm2,xmm8
+  movaps      xmm7,[r11-28h]
+  movaps      xmm8,[r11-38h]
+  movdqa      [r8],xmm5
+  pand        xmm0,xmm2
+  por         xmm0,xmm13
+  packuswb    xmm1,xmm0
+  movaps      xmm13,[r11-88h]
+  movdqa      [rbp],xmm4
+  movdqa      [r10+rbp],xmm3
+  movdqa      [rsi+rbp],xmm1
+  mov         rsp,r11
+  pop         rdi
+  pop         rsi
+  pop         rbp
+  pop         rbx
+  ret
+
+
+WELS_EXTERN  DeblockChromaLt4V_ssse3
+  mov         rax,rsp
+  push        rbx
+  push        rdi
+  PUSH_XMM 16
+  sub         rsp,0C8h
+  mov         r10,qword [rax + 30h]  ; pTC
+  pxor        xmm1,xmm1
+  mov         rbx,rcx
+  movsxd      r11,r8d
+  movsx       ecx,byte [r10]
+  movsx       r8d,byte [r10+2]
+  mov         rdi,rdx
+  movq        xmm2,[rbx]
+  movq        xmm9,[r11+rbx]
+  movsx       edx,byte [r10+1]
+  mov         word [rsp+2],cx
+  mov         word [rsp],cx
+  movsx       eax,byte [r10+3]
+  mov         word [rsp+6],dx
+  mov         word [rsp+4],dx
+  movdqa      xmm11,xmm1
+  mov         word [rsp+0Eh],ax
+  mov         word [rsp+0Ch],ax
+  lea         eax,[r11+r11]
+  movsxd      rcx,eax
+  mov         rax,rbx
+  mov         rdx,rdi
+  sub         rax,rcx
+  mov         word [rsp+0Ah],r8w
+  mov         word [rsp+8],r8w
+  movdqa      xmm6,[rsp]
+  movdqa      xmm7,xmm6
+  movq        xmm13, [rax]
+  mov         rax,rdi
+  sub         rax,rcx
+  mov         rcx,rbx
+  pcmpgtw     xmm7,xmm1
+  psubw       xmm11,xmm6
+  sub         rcx,r11
+  sub         rdx,r11
+  movq        xmm0,[rax]
+  movsx       eax,r9w
+  movq        xmm15,[rcx]
+  punpcklqdq  xmm13,xmm0
+  movq        xmm0, [rdx]
+  movdqa      xmm4,xmm13
+  punpcklqdq  xmm15,xmm0
+  movq        xmm0, [rdi]
+  punpcklbw   xmm4,xmm1
+  movdqa      xmm12,xmm15
+  punpcklqdq  xmm2,xmm0
+  movq        xmm0, [r11+rdi]
+  punpcklbw   xmm12,xmm1
+  movdqa      xmm14,xmm2
+  punpcklqdq  xmm9,xmm0
+  punpckhbw   xmm2,xmm1
+  punpcklbw   xmm14,xmm1
+  movd        xmm0,eax
+  movsx       eax,word [rsp + 0C8h + 38h + 160] ; iBeta
+  punpckhbw   xmm13,xmm1
+  punpckhbw   xmm15,xmm1
+  movdqa      xmm3,xmm9
+  movdqa      [rsp+10h],xmm2
+  punpcklwd   xmm0,xmm0
+  punpckhbw   xmm9,xmm1
+  punpcklbw   xmm3,xmm1
+  movdqa      xmm1,xmm14
+  pshufd      xmm10,xmm0,0
+  movd        xmm0,eax
+  mov         eax,4
+  cwde
+  punpcklwd   xmm0,xmm0
+  pshufd      xmm8,xmm0,0
+  movd        xmm0,eax
+  punpcklwd   xmm0,xmm0
+  pshufd      xmm5,xmm0,0
+  psubw       xmm1,xmm12
+  movdqa      xmm2,xmm10
+  lea         r11,[rsp+0C8h]
+  psllw       xmm1,2
+  movdqa      xmm0,xmm4
+  psubw       xmm4,xmm12
+  psubw       xmm0,xmm3
+  psubw       xmm3,xmm14
+  paddw       xmm1,xmm0
+  paddw       xmm1,xmm5
+  movdqa      xmm0,xmm11
+  psraw       xmm1,3
+  pmaxsw      xmm0,xmm1
+  pminsw      xmm6,xmm0
+  movdqa      xmm1,xmm8
+  movdqa      xmm0,xmm12
+  psubw       xmm0,xmm14
+  pabsw       xmm0,xmm0
+  pcmpgtw     xmm2,xmm0
+  pabsw       xmm0,xmm4
+  pcmpgtw     xmm1,xmm0
+  pabsw       xmm0,xmm3
+  movdqa      xmm3,[rsp]
+  pand        xmm2,xmm1
+  movdqa      xmm1,xmm8
+  pcmpgtw     xmm1,xmm0
+  movdqa      xmm0,xmm13
+  pand        xmm2,xmm1
+  psubw       xmm0,xmm9
+  psubw       xmm13,xmm15
+  pand        xmm2,xmm7
+  pand        xmm6,xmm2
+  paddw       xmm12,xmm6
+  psubw       xmm14,xmm6
+  movdqa      xmm2,[rsp+10h]
+  movaps      xmm6,[r11-18h]
+  movdqa      xmm1,xmm2
+  psubw       xmm1,xmm15
+  psubw       xmm9,xmm2
+  psllw       xmm1,2
+  paddw       xmm1,xmm0
+  paddw       xmm1,xmm5
+  movdqa      xmm0,xmm15
+  psubw       xmm0,xmm2
+  psraw       xmm1,3
+  pmaxsw      xmm11,xmm1
+  pabsw       xmm0,xmm0
+  movdqa      xmm1,xmm8
+  pcmpgtw     xmm10,xmm0
+  pabsw       xmm0,xmm13
+  pminsw      xmm3,xmm11
+  movaps      xmm11,[r11-68h]
+  movaps      xmm13,[rsp+40h]
+  pcmpgtw     xmm1,xmm0
+  pabsw       xmm0,xmm9
+  movaps      xmm9, [r11-48h]
+  pand        xmm10,xmm1
+  pcmpgtw     xmm8,xmm0
+  pand        xmm10,xmm8
+  pand        xmm10,xmm7
+  movaps      xmm8,[r11-38h]
+  movaps      xmm7,[r11-28h]
+  pand        xmm3,xmm10
+  paddw       xmm15,xmm3
+  psubw       xmm2,xmm3
+  movaps      xmm10,[r11-58h]
+  packuswb    xmm12,xmm15
+  movaps      xmm15,[rsp+20h]
+  packuswb    xmm14,xmm2
+  movq        [rcx],xmm12
+  movq        [rbx],xmm14
+  psrldq      xmm12,8
+  psrldq      xmm14,8
+  movq        [rdx],xmm12
+  movaps      xmm12,[r11-78h]
+  movq        [rdi],xmm14
+  movaps      xmm14,[rsp+30h]
+  mov         rsp,r11
+  POP_XMM
+  pop         rdi
+  pop         rbx
+  ret
+
+
+WELS_EXTERN   DeblockChromaEq4V_ssse3           ; strong (bS==4) chroma deblock across a horizontal edge; Cb+Cr filtered together. NOTE(review): Win64 ABI path — looks like rcx/rdx = Cb/Cr pointers at the edge, r8 = stride, r9 = alpha, iBeta on stack; confirm against the C prototype
+  mov         rax,rsp
+  push        rbx
+  PUSH_XMM 15                                   ; macro (defined elsewhere): spill xmm6..xmm14 per Win64 callee-save rules — presumably; verify macro definition
+  sub         rsp,90h
+  pxor        xmm1,xmm1                         ; zero register used for all byte->word unpacks below
+  mov         r11,rcx
+  mov         rbx,rdx
+  mov         r10d,r9d
+  movq        xmm13,[r11]
+  lea         eax,[r8+r8]                       ; 2*stride
+  movsxd      r9,eax
+  mov         rax,rcx
+  sub         rax,r9                            ; rcx - 2*stride (p1 row, Cb)
+  movq        xmm14,[rax]
+  mov         rax,rdx
+  sub         rax,r9                            ; rdx - 2*stride (p1 row, Cr)
+  movq        xmm0,[rax]
+  movsxd      rax,r8d
+  sub         rcx,rax                           ; rcx - stride (p0 row, Cb)
+  sub         rdx,rax                           ; rdx - stride (p0 row, Cr)
+  movq        xmm12,[rax+r11]
+  movq        xmm10,[rcx]
+  punpcklqdq  xmm14,xmm0                        ; pack Cb|Cr 8+8 bytes of each row into one xmm
+  movdqa      xmm8,xmm14
+  movq        xmm0,[rdx]
+  punpcklbw   xmm8,xmm1
+  punpckhbw   xmm14,xmm1
+  punpcklqdq  xmm10,xmm0
+  movq        xmm0,[rbx]
+  movdqa      xmm5,xmm10
+  punpcklqdq  xmm13,xmm0
+  movq        xmm0, [rax+rbx]
+  punpcklbw   xmm5,xmm1
+  movsx       eax,r10w                          ; alpha threshold
+  movdqa      xmm9,xmm13
+  punpcklqdq  xmm12,xmm0
+  punpcklbw   xmm9,xmm1
+  punpckhbw   xmm10,xmm1
+  movd        xmm0,eax
+  movsx       eax,word [rsp + 90h + 8h + 28h + 144]   ; iBeta
+  punpckhbw   xmm13,xmm1
+  movdqa      xmm7,xmm12
+  punpcklwd   xmm0,xmm0
+  punpckhbw   xmm12,xmm1
+  pshufd      xmm11,xmm0,0                      ; broadcast alpha across 8 words
+  punpcklbw   xmm7,xmm1
+  movd        xmm0,eax
+  movdqa      xmm1,xmm8
+  psubw       xmm1,xmm5
+  punpcklwd   xmm0,xmm0
+  movdqa      xmm6,xmm11
+  pshufd      xmm3,xmm0,0                       ; broadcast beta across 8 words
+  movdqa      xmm0,xmm5
+  psubw       xmm0,xmm9                         ; start of filter-enable mask: |p0-q0|<alpha, |p1-p0|<beta, |q1-q0|<beta (low halves)
+  movdqa      xmm2,xmm3
+  pabsw       xmm0,xmm0
+  pcmpgtw     xmm6,xmm0
+  pabsw       xmm0,xmm1
+  movdqa      xmm1,xmm3
+  pcmpgtw     xmm2,xmm0
+  pand        xmm6,xmm2
+  movdqa      xmm0,xmm7
+  movdqa      xmm2,xmm3
+  psubw       xmm0,xmm9
+  pabsw       xmm0,xmm0
+  pcmpgtw     xmm1,xmm0
+  pand        xmm6,xmm1                         ; xmm6 = enable mask for the low (first) half
+  movdqa      xmm0,xmm10
+  movdqa      xmm1,xmm14
+  psubw       xmm0,xmm13
+  psubw       xmm1,xmm10
+  pabsw       xmm0,xmm0
+  pcmpgtw     xmm11,xmm0
+  pabsw       xmm0,xmm1
+  pcmpgtw     xmm2,xmm0
+  pand        xmm11,xmm2
+  movdqa      xmm0,xmm12
+  movdqa      xmm4,xmm6
+  movdqa      xmm1,xmm8
+  mov         eax,2                             ; rounding constant for the (sum+2)>>2 averages below
+  cwde
+  paddw       xmm1,xmm8
+  psubw       xmm0,xmm13
+  paddw       xmm1,xmm5
+  pabsw       xmm0,xmm0
+  movdqa      xmm2,xmm14
+  paddw       xmm1,xmm7
+  pcmpgtw     xmm3,xmm0
+  paddw       xmm2,xmm14
+  movd        xmm0,eax
+  pand        xmm11,xmm3                        ; xmm11 = enable mask for the high (second) half
+  paddw       xmm7,xmm7
+  paddw       xmm2,xmm10
+  punpcklwd   xmm0,xmm0
+  paddw       xmm2,xmm12
+  paddw       xmm12,xmm12
+  pshufd      xmm3,xmm0,0                       ; broadcast the +2 rounding term
+  paddw       xmm7,xmm9
+  paddw       xmm12,xmm13
+  movdqa      xmm0,xmm6
+  paddw       xmm1,xmm3
+  pandn       xmm0,xmm5                         ; keep original p0 where the mask is off
+  paddw       xmm7,xmm8
+  psraw       xmm1,2                            ; (2*p1 + p0 + q1 + 2) >> 2 style average — see H.264 8.7 chroma bS==4 filter
+  paddw       xmm12,xmm14
+  paddw       xmm7,xmm3
+  movaps      xmm14,[rsp]
+  pand        xmm4,xmm1
+  paddw       xmm12,xmm3
+  psraw       xmm7,2
+  movdqa      xmm1,xmm11
+  por         xmm4,xmm0                         ; blend filtered/original p0 (low half)
+  psraw       xmm12,2
+  paddw       xmm2,xmm3
+  movdqa      xmm0,xmm11
+  pandn       xmm0,xmm10
+  psraw       xmm2,2
+  pand        xmm1,xmm2
+  por         xmm1,xmm0                         ; blend filtered/original p0 (high half)
+  packuswb    xmm4,xmm1
+  movdqa      xmm0,xmm11
+  movdqa      xmm1,xmm6
+  pand        xmm1,xmm7
+  movaps      xmm7,[rsp+70h]
+  movq        [rcx],xmm4                        ; store filtered p0 row (Cb)
+  pandn       xmm6,xmm9
+  pandn       xmm11,xmm13
+  pand        xmm0,xmm12
+  por         xmm1,xmm6
+  por         xmm0,xmm11
+  psrldq      xmm4,8
+  packuswb    xmm1,xmm0
+  movq        [r11],xmm1                        ; store filtered q0 row (Cb)
+  psrldq      xmm1,8
+  movq        [rdx],xmm4                        ; store filtered p0 row (Cr)
+  lea         r11,[rsp+90h)                     ; NOTE(review): never emitted — see next line for actual text
+  movaps      xmm6,[r11-10h]
+  movaps      xmm8,[r11-30h]
+  movaps      xmm9,[r11-40h]
+  movq        [rbx],xmm1                        ; store filtered q0 row (Cr)
+  movaps      xmm10,[r11-50h]
+  movaps      xmm11,[r11-60h]
+  movaps      xmm12,[r11-70h]
+  movaps      xmm13,[r11-80h]
+  mov         rsp,r11
+  POP_XMM
+  pop         rbx
+  ret
+
+
+
+
+
+WELS_EXTERN   DeblockChromaEq4H_ssse3           ; strong (bS==4) chroma deblock across a VERTICAL edge: gather 4 pixels left/right of the edge for 8 rows (Cb+Cr), transpose, filter as in the V case, transpose back, scatter. NOTE(review): Win64 ABI path — rcx/rdx = Cb/Cr, r8 = stride, r9 = alpha, iBeta on stack; confirm against prototype
+  mov         rax,rsp
+  mov         [rax+20h],rbx                     ; save rbx in caller's shadow space instead of push
+  push        rdi
+  PUSH_XMM 16
+  sub         rsp,140h
+  mov         rdi,rdx
+  lea         eax,[r8*4]                        ; 4*stride
+  movsxd      r10,eax
+  mov         eax,[rcx-2]                       ; gather phase: 4-byte loads of [p1 p0 q0 q1] per row, staged on the stack
+  mov         [rsp+10h],eax
+  lea         rbx,[r10+rdx-2]                   ; Cr rows 4..7 base
+  lea         r11,[r10+rcx-2]                   ; Cb rows 4..7 base
+  movdqa      xmm5,[rsp+10h]
+  movsxd      r10,r8d
+  mov         eax,[r10+rcx-2]
+  lea         rdx,[r10+r10*2]                   ; 3*stride
+  mov         [rsp+20h],eax
+  mov         eax,[rcx+r10*2-2]
+  mov         [rsp+30h],eax
+  mov         eax,[rdx+rcx-2]
+  movdqa      xmm2,[rsp+20h]
+  mov         [rsp+40h],eax
+  mov         eax, [rdi-2]
+  movdqa      xmm4,[rsp+30h]
+  mov         [rsp+50h],eax
+  mov         eax,[r10+rdi-2]
+  movdqa      xmm3,[rsp+40h]
+  mov         [rsp+60h],eax
+  mov         eax,[rdi+r10*2-2]
+  punpckldq   xmm5,[rsp+50h]
+  mov         [rsp+70h],eax
+  mov         eax, [rdx+rdi-2]
+  punpckldq   xmm2, [rsp+60h]
+  mov          [rsp+80h],eax
+  mov         eax,[r11]
+  punpckldq   xmm4, [rsp+70h]
+  mov         [rsp+50h],eax
+  mov         eax,[rbx]
+  punpckldq   xmm3,[rsp+80h]
+  mov         [rsp+60h],eax
+  mov         eax,[r10+r11]
+  movdqa      xmm0, [rsp+50h]
+  punpckldq   xmm0, [rsp+60h]
+  punpcklqdq  xmm5,xmm0
+  movdqa      [rsp+50h],xmm0
+  mov         [rsp+50h],eax
+  mov         eax,[r10+rbx]
+  movdqa      xmm0,[rsp+50h]
+  movdqa      xmm1,xmm5
+  mov         [rsp+60h],eax
+  mov         eax,[r11+r10*2]
+  punpckldq   xmm0, [rsp+60h]
+  punpcklqdq  xmm2,xmm0
+  punpcklbw   xmm1,xmm2                         ; start of 16x8 byte transpose (interleave rows)
+  punpckhbw   xmm5,xmm2
+  movdqa      [rsp+50h],xmm0
+  mov         [rsp+50h],eax
+  mov         eax,[rbx+r10*2]
+  movdqa      xmm0,[rsp+50h]
+  mov         [rsp+60h],eax
+  mov         eax, [rdx+r11]
+  movdqa      xmm15,xmm1
+  punpckldq   xmm0,[rsp+60h]
+  punpcklqdq  xmm4,xmm0
+  movdqa      [rsp+50h],xmm0
+  mov         [rsp+50h],eax
+  mov         eax, [rdx+rbx]
+  movdqa      xmm0,[rsp+50h]
+  mov         [rsp+60h],eax
+  punpckldq   xmm0, [rsp+60h]
+  punpcklqdq  xmm3,xmm0
+  movdqa      xmm0,xmm4
+  punpcklbw   xmm0,xmm3
+  punpckhbw   xmm4,xmm3
+  punpcklwd   xmm15,xmm0
+  punpckhwd   xmm1,xmm0
+  movdqa      xmm0,xmm5
+  movdqa      xmm12,xmm15
+  punpcklwd   xmm0,xmm4
+  punpckhwd   xmm5,xmm4
+  punpckldq   xmm12,xmm0
+  punpckhdq   xmm15,xmm0
+  movdqa      xmm0,xmm1
+  movdqa      xmm11,xmm12
+  punpckldq   xmm0,xmm5
+  punpckhdq   xmm1,xmm5
+  punpcklqdq  xmm11,xmm0                        ; transpose done: xmm11/xmm12/xmm14/xmm15 now hold the p1/p0/q0/q1 lines — presumably in that order; verify
+  punpckhqdq  xmm12,xmm0
+  movsx       eax,r9w                           ; alpha threshold
+  movdqa      xmm14,xmm15
+  punpcklqdq  xmm14,xmm1
+  punpckhqdq  xmm15,xmm1
+  pxor        xmm1,xmm1                         ; zero for byte->word unpacks
+  movd        xmm0,eax
+  movdqa      xmm4,xmm12
+  movdqa      xmm8,xmm11
+  movsx       eax,word [rsp+170h + 160] ; iBeta
+  punpcklwd   xmm0,xmm0
+  punpcklbw   xmm4,xmm1
+  punpckhbw   xmm12,xmm1
+  movdqa      xmm9,xmm14
+  movdqa      xmm7,xmm15
+  movdqa      xmm10,xmm15
+  pshufd      xmm13,xmm0,0                      ; broadcast alpha
+  punpcklbw   xmm9,xmm1
+  punpckhbw   xmm14,xmm1
+  movdqa      xmm6,xmm13
+  movd        xmm0,eax
+  movdqa      [rsp],xmm11
+  mov         eax,2                             ; rounding constant for (sum+2)>>2
+  cwde
+  punpckhbw   xmm11,xmm1
+  punpckhbw   xmm10,xmm1
+  punpcklbw   xmm7,xmm1
+  punpcklwd   xmm0,xmm0
+  punpcklbw   xmm8,xmm1
+  pshufd      xmm3,xmm0,0                       ; broadcast beta
+  movdqa      xmm1,xmm8
+  movdqa      xmm0,xmm4
+  psubw       xmm0,xmm9                         ; filter-enable mask: |p0-q0|<alpha, |p1-p0|<beta, |q1-q0|<beta
+  psubw       xmm1,xmm4
+  movdqa      xmm2,xmm3
+  pabsw       xmm0,xmm0
+  pcmpgtw     xmm6,xmm0
+  pabsw       xmm0,xmm1
+  movdqa      xmm1,xmm3
+  pcmpgtw     xmm2,xmm0
+  pand        xmm6,xmm2
+  movdqa      xmm0,xmm7
+  movdqa      xmm2,xmm3
+  psubw       xmm0,xmm9
+  pabsw       xmm0,xmm0
+  pcmpgtw     xmm1,xmm0
+  pand        xmm6,xmm1                         ; xmm6 = mask, low half
+  movdqa      xmm0,xmm12
+  movdqa      xmm1,xmm11
+  psubw       xmm0,xmm14
+  psubw       xmm1,xmm12
+  movdqa      xmm5,xmm6
+  pabsw       xmm0,xmm0
+  pcmpgtw     xmm13,xmm0
+  pabsw       xmm0,xmm1
+  movdqa      xmm1,xmm8
+  pcmpgtw     xmm2,xmm0
+  paddw       xmm1,xmm8
+  movdqa      xmm0,xmm10
+  pand        xmm13,xmm2
+  psubw       xmm0,xmm14
+  paddw       xmm1,xmm4
+  movdqa      xmm2,xmm11
+  pabsw       xmm0,xmm0
+  paddw       xmm2,xmm11
+  paddw       xmm1,xmm7
+  pcmpgtw     xmm3,xmm0
+  paddw       xmm2,xmm12
+  movd        xmm0,eax
+  pand        xmm13,xmm3                        ; xmm13 = mask, high half
+  paddw       xmm2,xmm10
+  punpcklwd   xmm0,xmm0
+  pshufd      xmm3,xmm0,0
+  movdqa      xmm0,xmm6
+  paddw       xmm1,xmm3
+  pandn       xmm0,xmm4                         ; keep originals where mask is off
+  paddw       xmm2,xmm3
+  psraw       xmm1,2                            ; (2*p1 + p0 + q1 + 2) >> 2 averages, as in the vertical variant
+  pand        xmm5,xmm1
+  por         xmm5,xmm0
+  paddw       xmm7,xmm7
+  paddw       xmm10,xmm10
+  psraw       xmm2,2
+  movdqa      xmm1,xmm13
+  movdqa      xmm0,xmm13
+  pandn       xmm0,xmm12
+  pand        xmm1,xmm2
+  paddw       xmm7,xmm9
+  por         xmm1,xmm0
+  paddw       xmm10,xmm14
+  paddw       xmm7,xmm8
+  movdqa      xmm0,xmm13
+  packuswb    xmm5,xmm1                         ; filtered p0 column, back to bytes
+  paddw       xmm7,xmm3
+  paddw       xmm10,xmm11
+  movdqa      xmm1,xmm6
+  paddw       xmm10,xmm3
+  pandn       xmm6,xmm9
+  psraw       xmm7,2
+  pand        xmm1,xmm7
+  psraw       xmm10,2
+  pandn       xmm13,xmm14
+  pand        xmm0,xmm10
+  por         xmm1,xmm6
+  movdqa      xmm6,[rsp]
+  movdqa      xmm4,xmm6
+  por         xmm0,xmm13
+  punpcklbw   xmm4,xmm5                         ; start of inverse transpose back to row layout
+  punpckhbw   xmm6,xmm5
+  movdqa      xmm3,xmm4
+  packuswb    xmm1,xmm0                         ; filtered q0 column, back to bytes
+  movdqa      xmm0,xmm1
+  punpckhbw   xmm1,xmm15
+  punpcklbw   xmm0,xmm15
+  punpcklwd   xmm3,xmm0
+  punpckhwd   xmm4,xmm0
+  movdqa      xmm0,xmm6
+  movdqa      xmm2,xmm3
+  punpcklwd   xmm0,xmm1
+  punpckhwd   xmm6,xmm1
+  movdqa      xmm1,xmm4
+  punpckldq   xmm2,xmm0
+  punpckhdq   xmm3,xmm0
+  punpckldq   xmm1,xmm6
+  movdqa      xmm0,xmm2
+  punpcklqdq  xmm0,xmm1
+  punpckhdq   xmm4,xmm6
+  punpckhqdq  xmm2,xmm1
+  movdqa      [rsp+10h],xmm0                    ; stage transposed rows, then scatter 4 bytes per row below
+  movdqa      [rsp+60h],xmm2
+  movdqa      xmm0,xmm3
+  mov         eax,[rsp+10h]
+  mov         [rcx-2],eax                       ; scatter phase: write [p1 p0 q0 q1] back per row (Cb rows 0..7, then Cr)
+  mov         eax,[rsp+60h]
+  punpcklqdq  xmm0,xmm4
+  punpckhqdq  xmm3,xmm4
+  mov         [r10+rcx-2],eax
+  movdqa      [rsp+20h],xmm0
+  mov         eax, [rsp+20h]
+  movdqa      [rsp+70h],xmm3
+  mov         [rcx+r10*2-2],eax
+  mov         eax,[rsp+70h]
+  mov         [rdx+rcx-2],eax
+  mov         eax,[rsp+18h]
+  mov         [r11],eax
+  mov         eax,[rsp+68h]
+  mov         [r10+r11],eax
+  mov         eax,[rsp+28h]
+  mov         [r11+r10*2],eax
+  mov         eax,[rsp+78h]
+  mov         [rdx+r11],eax
+  mov         eax,[rsp+14h]
+  mov         [rdi-2],eax
+  mov         eax,[rsp+64h]
+  mov         [r10+rdi-2],eax
+  mov         eax,[rsp+24h]
+  mov         [rdi+r10*2-2],eax
+  mov         eax, [rsp+74h]
+  mov         [rdx+rdi-2],eax
+  mov         eax, [rsp+1Ch]
+  mov         [rbx],eax
+  mov         eax, [rsp+6Ch]
+  mov         [r10+rbx],eax
+  mov         eax,[rsp+2Ch]
+  mov         [rbx+r10*2],eax
+  mov         eax,[rsp+7Ch]
+  mov         [rdx+rbx],eax
+  lea         rsp,[rsp+140h]                    ; epilogue: unwind frame, restore xmm, reload rbx from shadow space
+  POP_XMM
+  mov         rbx, [rsp+28h]
+  pop         rdi
+  ret
+
+
+
+WELS_EXTERN DeblockChromaLt4H_ssse3             ; normal (bS<4, tc-clipped) chroma deblock across a VERTICAL edge: gather+transpose 8 rows of [p1 p0 q0 q1] for Cb+Cr, clip delta to +/-tc, transpose back, scatter. NOTE(review): Win64 ABI — rcx/rdx = Cb/Cr, r8 = stride, r9 = alpha; iBeta and pTC on stack (see annotated loads below); confirm against prototype
+  mov         rax,rsp
+  push        rbx
+  push        rbp
+  push        rsi
+  push        rdi
+  push        r12
+  PUSH_XMM 16
+  sub         rsp,170h
+
+  movsxd      rsi,r8d                           ; rsi = stride
+  lea         eax,[r8*4]
+  mov         r11d,r9d
+  movsxd      r10,eax                           ; r10 = 4*stride (switches to 3*stride below)
+  mov         eax, [rcx-2]                      ; gather phase: 4-byte loads of [p1 p0 q0 q1] per row, staged on the stack
+  mov         r12,rdx
+  mov         [rsp+40h],eax
+  mov         eax, [rsi+rcx-2]
+  lea         rbx,[r10+rcx-2]                   ; Cb rows 4..7 base
+  movdqa      xmm5,[rsp+40h]
+  mov         [rsp+50h],eax
+  mov         eax, [rcx+rsi*2-2]
+  lea         rbp,[r10+rdx-2]                   ; Cr rows 4..7 base
+  movdqa      xmm2, [rsp+50h]
+  mov         [rsp+60h],eax
+  lea         r10,[rsi+rsi*2]                   ; r10 = 3*stride from here on
+  mov         rdi,rcx
+  mov         eax,[r10+rcx-2]
+  movdqa      xmm4,[rsp+60h]
+  mov         [rsp+70h],eax
+  mov         eax,[rdx-2]
+  mov         [rsp+80h],eax
+  mov         eax, [rsi+rdx-2]
+  movdqa      xmm3,[rsp+70h]
+  mov         [rsp+90h],eax
+  mov         eax,[rdx+rsi*2-2]
+  punpckldq   xmm5,[rsp+80h]
+  mov         [rsp+0A0h],eax
+  mov         eax, [r10+rdx-2]
+  punpckldq   xmm2,[rsp+90h]
+  mov         [rsp+0B0h],eax
+  mov         eax, [rbx]
+  punpckldq   xmm4,[rsp+0A0h]
+  mov         [rsp+80h],eax
+  mov         eax,[rbp]
+  punpckldq   xmm3,[rsp+0B0h]
+  mov         [rsp+90h],eax
+  mov         eax,[rsi+rbx]
+  movdqa      xmm0,[rsp+80h]
+  punpckldq   xmm0,[rsp+90h]
+  punpcklqdq  xmm5,xmm0
+  movdqa      [rsp+80h],xmm0
+  mov         [rsp+80h],eax
+  mov         eax,[rsi+rbp]
+  movdqa      xmm0,[rsp+80h]
+  movdqa      xmm1,xmm5
+  mov         [rsp+90h],eax
+  mov         eax,[rbx+rsi*2]
+  punpckldq   xmm0,[rsp+90h]
+  punpcklqdq  xmm2,xmm0
+  punpcklbw   xmm1,xmm2                         ; start of byte transpose (rows -> p1/p0/q0/q1 lines)
+  punpckhbw   xmm5,xmm2
+  movdqa      [rsp+80h],xmm0
+  mov         [rsp+80h],eax
+  mov         eax,[rbp+rsi*2]
+  movdqa      xmm0, [rsp+80h]
+  mov         [rsp+90h],eax
+  mov         eax,[r10+rbx]
+  movdqa      xmm7,xmm1
+  punpckldq   xmm0,[rsp+90h]
+  punpcklqdq  xmm4,xmm0
+  movdqa      [rsp+80h],xmm0
+  mov         [rsp+80h],eax
+  mov         eax, [r10+rbp]
+  movdqa      xmm0,[rsp+80h]
+  mov         [rsp+90h],eax
+  punpckldq   xmm0,[rsp+90h]
+  punpcklqdq  xmm3,xmm0
+  movdqa      xmm0,xmm4
+  punpcklbw   xmm0,xmm3
+  punpckhbw   xmm4,xmm3
+  punpcklwd   xmm7,xmm0
+  punpckhwd   xmm1,xmm0
+  movdqa      xmm0,xmm5
+  movdqa      xmm6,xmm7
+  punpcklwd   xmm0,xmm4
+  punpckhwd   xmm5,xmm4
+  punpckldq   xmm6,xmm0
+  punpckhdq   xmm7,xmm0
+  movdqa      xmm0,xmm1
+  punpckldq   xmm0,xmm5
+  mov         rax, [rsp+1C8h+160]    ; pTC
+  punpckhdq   xmm1,xmm5
+  movdqa      xmm9,xmm6
+  punpckhqdq  xmm6,xmm0
+  punpcklqdq  xmm9,xmm0
+  movdqa      xmm2,xmm7
+  movdqa      xmm13,xmm6
+  movdqa      xmm4,xmm9
+  movdqa      [rsp+10h],xmm9                    ; save untouched p1 line for the inverse transpose
+  punpcklqdq  xmm2,xmm1
+  punpckhqdq  xmm7,xmm1
+  pxor        xmm1,xmm1                         ; zero for unpacks
+  movsx       ecx,byte [rax+3]                  ; tc[3..0] sign-extended from pTC
+  movsx       edx,byte [rax+2]
+  movsx       r8d,byte [rax+1]
+  movsx       r9d,byte [rax]
+  movdqa      xmm10,xmm1
+  movdqa      xmm15,xmm2
+  punpckhbw   xmm2,xmm1
+  punpckhbw   xmm6,xmm1
+  punpcklbw   xmm4,xmm1
+  movsx       eax,r11w                          ; alpha threshold
+  mov         word [rsp+0Eh],cx                 ; build the 8-word tc vector in-place on the stack (2 words per tc entry)
+  mov         word [rsp+0Ch],cx
+  movdqa      xmm3,xmm7
+  movdqa      xmm8,xmm7
+  movdqa      [rsp+20h],xmm7                    ; save untouched q1 line for the inverse transpose
+  punpcklbw   xmm15,xmm1
+  punpcklbw   xmm13,xmm1
+  punpcklbw   xmm3,xmm1
+  mov         word [rsp+0Ah],dx
+  mov         word [rsp+8],dx
+  mov         word [rsp+6],r8w
+  movd        xmm0,eax
+  movdqa      [rsp+30h],xmm6
+  punpckhbw   xmm9,xmm1
+  punpckhbw   xmm8,xmm1
+  punpcklwd   xmm0,xmm0
+  movsx       eax,word [rsp+1C0h+160]   ; iBeta
+  mov         word [rsp+4],r8w
+  mov         word [rsp+2],r9w
+  pshufd      xmm12,xmm0,0                      ; broadcast alpha
+  mov         word [rsp],r9w
+  movd        xmm0,eax
+  mov         eax,4                             ; +4 rounding term for the (delta+4)>>3 formula
+  cwde
+  movdqa      xmm14, [rsp]                      ; xmm14 = tc vector just assembled above
+  movdqa      [rsp],xmm2
+  movdqa      xmm2,xmm12
+  punpcklwd   xmm0,xmm0
+  pshufd      xmm11,xmm0,0                      ; broadcast beta
+  psubw       xmm10,xmm14                       ; xmm10 = -tc (lower clip bound)
+  movd        xmm0,eax
+  movdqa      xmm7,xmm14
+  movdqa      xmm6,xmm14
+  pcmpgtw     xmm7,xmm1                         ; tc>0 mask — rows with tc<=0 are not filtered
+  punpcklwd   xmm0,xmm0
+  pshufd      xmm5,xmm0,0
+  movdqa      xmm0,xmm4
+  movdqa      xmm1,xmm15
+  psubw       xmm4,xmm13
+  psubw       xmm0,xmm3
+  psubw       xmm1,xmm13
+  psubw       xmm3,xmm15
+  psllw       xmm1,2
+  paddw       xmm1,xmm0
+  paddw       xmm1,xmm5
+  movdqa      xmm0,xmm10
+  psraw       xmm1,3                            ; delta = ((q0-p0)*4 + (p1-q1) + 4) >> 3
+  pmaxsw      xmm0,xmm1
+  pminsw      xmm6,xmm0                         ; clip delta to [-tc, tc]
+  movdqa      xmm1,xmm11
+  movdqa      xmm0,xmm13
+  psubw       xmm0,xmm15
+  pabsw       xmm0,xmm0
+  pcmpgtw     xmm2,xmm0                         ; |p0-q0| < alpha
+  pabsw       xmm0,xmm4
+  pcmpgtw     xmm1,xmm0                         ; |p1-p0| < beta
+  pabsw       xmm0,xmm3
+  pand        xmm2,xmm1
+  movdqa      xmm1,xmm11
+  movdqa      xmm3,[rsp+30h]
+  pcmpgtw     xmm1,xmm0                         ; |q1-q0| < beta
+  movdqa      xmm0,xmm9
+  pand        xmm2,xmm1
+  psubw       xmm0,xmm8
+  psubw       xmm9,xmm3
+  pand        xmm2,xmm7                         ; combine with tc>0
+  pand        xmm6,xmm2                         ; zero delta where filtering disabled
+  psubw       xmm15,xmm6                        ; p0 -= delta   (low half)
+  paddw       xmm13,xmm6                        ; q0 += delta — NOTE(review): sign convention inferred; verify which line is p0/q0
+  movdqa      xmm2,[rsp]
+  movdqa      xmm1,xmm2
+  psubw       xmm1,xmm3
+  psubw       xmm8,xmm2
+  psllw       xmm1,2
+  paddw       xmm1,xmm0
+  paddw       xmm1,xmm5
+  movdqa      xmm0,xmm3
+  movdqa      xmm5,[rsp+10h]
+  psubw       xmm0,xmm2
+  psraw       xmm1,3                            ; same delta formula, high half
+  movdqa      xmm4,xmm5
+  pabsw       xmm0,xmm0
+  pmaxsw      xmm10,xmm1
+  movdqa      xmm1,xmm11
+  pcmpgtw     xmm12,xmm0
+  pabsw       xmm0,xmm9
+  pminsw      xmm14,xmm10
+  pcmpgtw     xmm1,xmm0
+  pabsw       xmm0,xmm8
+  pcmpgtw     xmm11,xmm0
+  pand        xmm12,xmm1
+  movdqa      xmm1,[rsp+20h]
+  pand        xmm12,xmm11
+  pand        xmm12,xmm7
+  pand        xmm14,xmm12
+  paddw       xmm3,xmm14
+  psubw       xmm2,xmm14
+  packuswb    xmm13,xmm3                        ; repack filtered lines to bytes
+  packuswb    xmm15,xmm2
+  punpcklbw   xmm4,xmm13                        ; inverse transpose back to row layout
+  punpckhbw   xmm5,xmm13
+  movdqa      xmm0,xmm15
+  punpcklbw   xmm0,xmm1
+  punpckhbw   xmm15,xmm1
+  movdqa      xmm3,xmm4
+  punpcklwd   xmm3,xmm0
+  punpckhwd   xmm4,xmm0
+  movdqa      xmm0,xmm5
+  movdqa      xmm2,xmm3
+  movdqa      xmm1,xmm4
+  punpcklwd   xmm0,xmm15
+  punpckhwd   xmm5,xmm15
+  punpckldq   xmm2,xmm0
+  punpckhdq   xmm3,xmm0
+  punpckldq   xmm1,xmm5
+  movdqa      xmm0,xmm2
+  punpcklqdq  xmm0,xmm1
+  punpckhdq   xmm4,xmm5
+  punpckhqdq  xmm2,xmm1
+  movdqa      [rsp+40h],xmm0                    ; stage rows, then scatter 4 bytes per row below
+  movdqa      xmm0,xmm3
+  movdqa      [rsp+90h],xmm2
+  mov         eax,[rsp+40h]
+  mov         [rdi-2],eax                       ; scatter phase: Cb rows 0..7, then Cr rows
+  mov         eax, [rsp+90h]
+  punpcklqdq  xmm0,xmm4
+  punpckhqdq  xmm3,xmm4
+  mov         [rsi+rdi-2],eax
+  movdqa      [rsp+50h],xmm0
+  mov         eax,[rsp+50h]
+  movdqa      [rsp+0A0h],xmm3
+  mov         [rdi+rsi*2-2],eax
+  mov         eax,[rsp+0A0h]
+  mov         [r10+rdi-2],eax
+  mov         eax,[rsp+48h]
+  mov         [rbx],eax
+  mov         eax,[rsp+98h]
+  mov         [rsi+rbx],eax
+  mov         eax,[rsp+58h]
+  mov         [rbx+rsi*2],eax
+  mov         eax, [rsp+0A8h]
+  mov         [r10+rbx],eax
+  mov         eax, [rsp+44h]
+  mov         [r12-2],eax
+  mov         eax,[rsp+94h]
+  mov         [rsi+r12-2],eax
+  mov         eax,[rsp+54h]
+  mov         [r12+rsi*2-2],eax
+  mov         eax, [rsp+0A4h]
+  mov         [r10+r12-2],eax
+  mov         eax,[rsp+4Ch]
+  mov         [rbp],eax
+  mov         eax,[rsp+9Ch]
+  mov         [rsi+rbp],eax
+  mov         eax, [rsp+5Ch]
+  mov         [rbp+rsi*2],eax
+  mov         eax,[rsp+0ACh]
+  mov         [r10+rbp],eax
+  lea         r11,[rsp+170h]                    ; epilogue: unwind frame, restore xmm and GP registers
+  mov         rsp,r11
+  POP_XMM
+  pop         r12
+  pop         rdi
+  pop         rsi
+  pop         rbp
+  pop         rbx
+  ret
+
+
+
+%elifdef  UNIX64
+
+
+WELS_EXTERN   DeblockLumaLt4V_ssse3             ; UNIX64 (System V) variant: normal (bS<4) luma deblock across a horizontal edge, tc-clipped, with conditional p1/q1 update. rdi = pPix, rsi = stride, edx = alpha, ecx = beta, r8 = pTC (per the annotated mov below)
+  push        rbp
+  mov         r11,r8  ; pTC
+  sub         rsp,1B0h
+  lea         rbp,[rsp+20h]                     ; rbp = scratch frame; xmm6-15 saved here. NOTE(review): SysV does not require preserving xmm regs — presumably kept to mirror the Win64 path
+  movd        xmm4,edx                          ; alpha
+  movd        xmm2,ecx                          ; beta
+  mov         qword [rbp+180h],r12
+  mov         r10,rdi
+  movsxd      r12,esi                           ; r12 = stride
+  add         rsi,rsi
+  movsxd      rdx,esi                           ; rdx = 2*stride
+  sub         r10,r12                           ; r10 = pPix - stride (p0 row)
+  movsx       r8d,byte [r11]                    ; tc[0]
+  pxor        xmm3,xmm3
+  punpcklwd   xmm2,xmm2
+  movaps      [rbp+50h],xmm14
+  lea         rax,[r12+r12*2]                   ; 3*stride
+  movdqa      xmm14,[rdx+rdi]                   ; q2 row
+  neg         rax
+  pshufd      xmm0,xmm2,0                       ; broadcast beta
+  movd        xmm2,r8d
+  movsx       rsi,byte [r11+1]                  ; tc[1]
+  movsx       r8d,byte [r11+2]                  ; tc[2]
+  movsx       r11d,byte [r11+3]                 ; tc[3]
+  movaps      [rbp+70h],xmm12
+  movd        xmm1,esi
+  movaps      [rbp+80h],xmm11
+  movd        xmm12,r8d
+  movd        xmm11,r11d
+  movdqa      xmm5, [rax+rdi]                   ; p2 row (pPix - 3*stride)
+  lea         rax,[r12+r12]
+  punpcklwd   xmm12,xmm12
+  neg         rax                               ; rax = -2*stride
+  punpcklwd   xmm11,xmm11
+  movaps      [rbp],xmm8
+  movdqa      xmm8, [r10]                       ; p0 row
+  punpcklwd   xmm2,xmm2
+  punpcklwd   xmm1,xmm1
+  punpcklqdq  xmm12,xmm12
+  punpcklqdq  xmm11,xmm11
+  punpcklqdq  xmm2,xmm2
+  punpcklqdq  xmm1,xmm1
+  shufps      xmm12,xmm11,88h                   ; interleave tc2/tc3 words into one vector
+  movdqa      xmm11,xmm8
+  movaps      [rbp+30h],xmm9
+  movdqa      xmm9,[rdi]                        ; q0 row
+  shufps      xmm2,xmm1,88h                     ; interleave tc0/tc1 words: xmm2 = tc vector (low half)
+  movdqa      xmm1,xmm5
+  punpcklbw   xmm11,xmm3
+  movaps      [rbp+20h],xmm6
+  movaps      [rbp+60h],xmm13
+  movdqa      xmm13,xmm11
+  movaps      [rbp+90h],xmm10
+  movdqa      xmm10,xmm9
+  movdqa      xmm6,[rax+rdi]                    ; p1 row (pPix - 2*stride)
+  punpcklbw   xmm1,xmm3
+  movaps      [rbp+0A0h],xmm12
+  psubw       xmm13,xmm1                        ; p0 - p2
+  movaps      [rbp+40h],xmm15
+  movdqa      xmm15,xmm14
+  movaps      [rbp+10h],xmm7
+  movdqa      xmm7,xmm6
+  punpcklbw   xmm10,xmm3
+  movdqa      xmm12,[r12+rdi]                   ; q1 row
+  punpcklbw   xmm7,xmm3
+  punpcklbw   xmm12,xmm3
+  punpcklbw   xmm15,xmm3
+  pabsw       xmm3,xmm13                        ; |p2-p0|
+  movdqa      xmm13,xmm10
+  psubw       xmm13,xmm15                       ; q0 - q2
+  movdqa      [rbp+0F0h],xmm15
+  pabsw       xmm15,xmm13                       ; |q2-q0|
+  movdqa      xmm13,xmm11
+  movdqa      [rbp+0B0h],xmm1
+  movdqa      xmm1,xmm0
+  pavgw       xmm13,xmm10                       ; (p0+q0+1)>>1, used for the p1/q1 update terms
+  pcmpgtw     xmm1,xmm3                        ; |p2-p0| < beta  -> extra p1 filtering allowed
+  movdqa      [rbp+120h],xmm13
+  movaps      xmm13,xmm2
+  punpcklwd   xmm4,xmm4
+  movdqa      xmm3,xmm0
+  movdqa      [rbp+100h],xmm1
+  psubw       xmm13,xmm1                        ; tc adjusted: +1 per enabled side (masks are 0/-1)
+  movdqa      xmm1,xmm10
+  pcmpgtw     xmm3,xmm15                       ; |q2-q0| < beta  -> extra q1 filtering allowed
+  pshufd      xmm4,xmm4,0                       ; broadcast alpha
+  psubw       xmm1,xmm11                        ; q0 - p0
+  movdqa      [rbp+0D0h],xmm10
+  psubw       xmm13,xmm3
+  movdqa      [rbp+110h],xmm3
+  pabsw       xmm15,xmm1
+  movdqa      xmm3,xmm4
+  psubw       xmm10,xmm12
+  pcmpgtw     xmm3,xmm15                       ; |p0-q0| < alpha
+  pabsw       xmm15,xmm10                       ; |q0-q1|
+  movdqa      xmm10,xmm0
+  psllw       xmm1,2                            ; 4*(q0-p0)
+  movdqa      [rbp+0C0h],xmm11
+  psubw       xmm11,xmm7
+  pcmpgtw     xmm10,xmm15                      ; |q1-q0| < beta
+  pabsw       xmm11,xmm11
+  movdqa      xmm15,xmm0
+  pand        xmm3,xmm10
+  pcmpgtw     xmm15,xmm11                      ; |p1-p0| < beta
+  movaps      xmm11,xmm2
+  pxor        xmm10,xmm10
+  pand        xmm3,xmm15
+  pcmpgtw     xmm11,xmm10                      ; tc > 0
+  pcmpeqw     xmm10,xmm2                        ; or tc == 0 — NOTE(review): together this is tc >= 0, i.e. tc != -1 (bS==0 rows excluded)
+  por         xmm11,xmm10
+  pand        xmm3,xmm11                        ; xmm3 = final enable mask, low half
+  movdqa      xmm11,xmm7
+  psubw       xmm11,xmm12                       ; p1 - q1
+  pxor        xmm15,xmm15
+  paddw       xmm11,xmm1
+  psubw       xmm15,xmm13                       ; -tc (clip lower bound)
+  movdqa      [rbp+0E0h],xmm12
+  paddw       xmm11,[FOUR_16B_SSE2]
+  pxor        xmm12,xmm12
+  psraw       xmm11,3                           ; delta = (4*(q0-p0) + (p1-q1) + 4) >> 3
+  punpckhbw   xmm8,xmm12                        ; high halves of each row, processed in parallel below
+  pmaxsw      xmm15,xmm11
+  punpckhbw   xmm5,xmm12
+  movdqa      xmm11,xmm8
+  pminsw      xmm13,xmm15                       ; clip delta to [-tc, tc]
+  psubw       xmm11,xmm5
+  punpckhbw   xmm9,xmm12
+  pand        xmm13,xmm3
+  movdqa      [rbp+130h],xmm13                  ; clipped delta, low half
+  pabsw       xmm13,xmm11
+  punpckhbw   xmm14,xmm12
+  movdqa      xmm11,xmm9
+  psubw       xmm11,xmm14
+  movdqa      xmm15,xmm0
+  movdqa      [rbp+140h],xmm14
+  pabsw       xmm14,xmm11
+  movdqa      xmm11,xmm8
+  pcmpgtw     xmm15,xmm14
+  movdqa      xmm1,[r12+rdi]
+  pavgw       xmm11,xmm9
+  movdqa      [rbp+170h],xmm11
+  movdqa      xmm10,xmm9
+  punpckhbw   xmm6,xmm12
+  psubw       xmm10,xmm8
+  punpckhbw   xmm1,xmm12
+  movdqa      xmm12,xmm0
+  movaps      xmm11,[rbp+0A0h]
+  pcmpgtw     xmm12,xmm13
+  movaps      xmm13,xmm11
+  psubw       xmm13,xmm12
+  movdqa      [rbp+160h],xmm15
+  psubw       xmm13,xmm15                       ; adjusted tc, high half
+  movdqa      xmm15,xmm9
+  psubw       xmm15,xmm1
+  movdqa      [rbp+150h],xmm12
+  pabsw       xmm12,xmm10
+  pabsw       xmm14,xmm15
+  movdqa      xmm15,xmm8
+  pcmpgtw     xmm4,xmm12                       ; |p0-q0| < alpha, high half
+  movdqa      xmm12,xmm0
+  psubw       xmm15,xmm6
+  pcmpgtw     xmm12,xmm14
+  pabsw       xmm14,xmm15
+  psllw       xmm10,2
+  pcmpgtw     xmm0,xmm14
+  movdqa      xmm14,xmm6
+  psubw       xmm14,xmm1
+  pand        xmm4,xmm12
+  paddw       xmm14,xmm10
+  pand        xmm4,xmm0
+  paddw       xmm14,[FOUR_16B_SSE2]
+  pxor        xmm15,xmm15
+  movaps      xmm12,xmm11
+  psubw       xmm15,xmm13
+  pxor        xmm0,xmm0
+  psraw       xmm14,3                           ; delta, high half
+  pcmpgtw     xmm12,xmm0
+  pcmpeqw     xmm0,xmm11
+  pmaxsw      xmm15,xmm14
+  por         xmm12,xmm0
+  movdqa      xmm0,[rbp+120h]
+  pminsw      xmm13,xmm15                       ; clipped delta, high half
+  movdqa      xmm15,[rbp+0B0h]
+  movdqa      xmm10,xmm7
+  pand        xmm4,xmm12                        ; xmm4 = final enable mask, high half
+  paddw       xmm15,xmm0
+  pxor        xmm12,xmm12
+  paddw       xmm10,xmm7
+  movdqa      xmm14,xmm12
+  psubw       xmm15,xmm10
+  psubw       xmm14,xmm2
+  psraw       xmm15,1                           ; p1 delta = (p2 + ((p0+q0+1)>>1) - 2*p1) >> 1, then clipped to +/-tc
+  pmaxsw      xmm15,xmm14
+  movdqa      xmm10,xmm6
+  pminsw      xmm15,xmm2
+  paddw       xmm10,xmm6
+  pand        xmm15,xmm3
+  psubw       xmm12,xmm11
+  pand        xmm15,[rbp+100h]                  ; only where |p2-p0|<beta
+  pand        xmm13,xmm4
+  paddw       xmm7,xmm15                        ; p1 += clipped p1-delta
+  paddw       xmm8,xmm13                        ; p0 += delta (high half)
+  movdqa      xmm15,[rbp+170h]
+  psubw       xmm9,xmm13                        ; q0 -= delta (high half)
+  paddw       xmm5,xmm15
+  psubw       xmm5,xmm10                        ; same p1-style update for the high half
+  psraw       xmm5,1
+  pmaxsw      xmm5,xmm12
+  pminsw      xmm5,xmm11
+  pand        xmm5,xmm4
+  pand        xmm5,[rbp+150h]
+  paddw       xmm6,xmm5
+  movdqa      xmm5,[rbp+0C0h]
+  packuswb    xmm7,xmm6                         ; repack updated p1 row
+  movdqa      xmm6,[rbp+130h]
+  paddw       xmm5,xmm6                         ; p0 += delta (low half)
+  packuswb    xmm5,xmm8
+  movdqa      xmm8,[rbp+0D0h]
+  psubw       xmm8,xmm6                         ; q0 -= delta (low half)
+  movdqa      xmm6,[rbp+0F0h]
+  paddw       xmm6,xmm0
+  movdqa      xmm0,[rbp+0E0h]
+  packuswb    xmm8,xmm9                         ; repack updated q0 row
+  movdqa      xmm9,xmm0
+  paddw       xmm9,xmm0
+  psubw       xmm6,xmm9
+  psraw       xmm6,1                            ; q1 delta = (q2 + ((p0+q0+1)>>1) - 2*q1) >> 1, clipped
+  pmaxsw      xmm14,xmm6
+  pminsw      xmm2,xmm14
+  pand        xmm2,xmm3
+  pand        xmm2,[rbp+110h]                   ; only where |q2-q0|<beta
+  paddw       xmm0,xmm2                         ; q1 += clipped q1-delta (low half)
+  movdqa      xmm2,[rbp+140h]
+  paddw       xmm2,xmm15
+  movdqa      xmm15,xmm1
+  paddw       xmm15,xmm1
+  psubw       xmm2,xmm15
+  psraw       xmm2,1
+  pmaxsw      xmm12,xmm2
+  pminsw      xmm11,xmm12
+  pand        xmm11,xmm4
+  pand        xmm11,[rbp+160h]
+  paddw       xmm1,xmm11                        ; q1 += clipped q1-delta (high half)
+  movdqa      [rax+rdi],xmm7                    ; store p1 row
+  movdqa      [r10],xmm5                        ; store p0 row
+  packuswb    xmm0,xmm1
+  movdqa      [rdi],xmm8                        ; store q0 row
+  movdqa      [r12+rdi],xmm0                    ; store q1 row
+  mov         r12,qword [rbp+180h]
+  lea         rsp,[rbp+190h]
+  pop         rbp
+  ret
+
+
+WELS_EXTERN DeblockLumaEq4V_ssse3
+  mov         rax,rsp
+  push        rbx
+  push        rbp
+  mov         r8,   rdx
+  mov         r9,   rcx
+  mov         rcx,  rdi
+  mov         rdx,  rsi
+  sub         rsp,1D8h
+  movaps      [rax-38h],xmm6
+  movaps      [rax-48h],xmm7
+  movaps      [rax-58h],xmm8
+  pxor        xmm1,xmm1
+  movsxd      r10,edx
+  mov         rbp,rcx
+  mov         r11d,r8d
+  mov         rdx,rcx
+  mov         rdi,rbp
+  mov         rbx,rbp
+  movdqa      xmm5,[rbp]
+  movaps      [rax-68h],xmm9
+  movaps      [rax-78h],xmm10
+  punpcklbw   xmm5,xmm1
+  movaps      [rax-88h],xmm11
+  movaps      [rax-98h],xmm12
+  movaps      [rax-0A8h],xmm13
+  movaps      [rax-0B8h],xmm14
+  movdqa      xmm14,[r10+rbp]
+  movaps      [rax-0C8h],xmm15
+  lea         eax,[r10*4]
+  movsxd      r8,eax
+  lea         eax,[r10+r10*2]
+  movsxd      rcx,eax
+  lea         eax,[r10+r10]
+  sub         rdx,r8
+  punpcklbw   xmm14,xmm1
+  movdqa      [rsp+90h],xmm5
+  movdqa      [rsp+30h],xmm14
+  movsxd      rsi,eax
+  movsx       eax,r11w
+  sub         rdi,rcx
+  sub         rbx,rsi
+  mov         r8,rbp
+  sub         r8,r10
+  movd        xmm0,eax
+  movsx       eax,r9w
+  movdqa      xmm12,[rdi]
+  movdqa      xmm6, [rsi+rbp]
+  movdqa      xmm13,[rbx]
+  punpcklwd   xmm0,xmm0
+  pshufd      xmm11,xmm0,0
+  punpcklbw   xmm13,xmm1
+  punpcklbw   xmm6,xmm1
+  movdqa      xmm8,[r8]
+  movd        xmm0,eax
+  movdqa      xmm10,xmm11
+  mov         eax,2
+  punpcklbw   xmm8,xmm1
+  punpcklbw   xmm12,xmm1
+  cwde
+  punpcklwd   xmm0,xmm0
+  psraw       xmm10,2
+  movdqa      xmm1,xmm8
+  movdqa      [rsp+0F0h],xmm13
+  movdqa      [rsp+0B0h],xmm8
+  pshufd      xmm7,xmm0,0
+  psubw       xmm1,xmm13
+  movdqa      xmm0,xmm5
+  movdqa      xmm4,xmm7
+  movdqa      xmm2,xmm7
+  psubw       xmm0,xmm8
+  pabsw       xmm3,xmm0
+  pabsw       xmm0,xmm1
+  movdqa      xmm1,xmm5
+  movdqa      [rsp+40h],xmm7
+  movdqa      [rsp+60h],xmm6
+  pcmpgtw     xmm4,xmm0
+  psubw       xmm1,xmm14
+  pabsw       xmm0,xmm1
+  pcmpgtw     xmm2,xmm0
+  pand        xmm4,xmm2
+  movdqa      xmm0,xmm11
+  pcmpgtw     xmm0,xmm3
+  pand        xmm4,xmm0
+  movd        xmm0,eax
+  movdqa      [rsp+20h],xmm4
+  punpcklwd   xmm0,xmm0
+  pshufd      xmm2,xmm0,0
+  paddw       xmm10,xmm2
+  movdqa      [rsp+0A0h],xmm2
+  movdqa      xmm15,xmm7
+  pxor        xmm4,xmm4
+  movdqa      xmm0,xmm8
+  psubw       xmm0,xmm12
+  mov         eax,4
+  pabsw       xmm0,xmm0
+  movdqa      xmm1,xmm10
+  cwde
+  pcmpgtw     xmm15,xmm0
+  pcmpgtw     xmm1,xmm3
+  movdqa      xmm3,xmm7
+  movdqa      xmm7,[rdx]
+  movdqa      xmm0,xmm5
+  psubw       xmm0,xmm6
+  pand        xmm15,xmm1
+  punpcklbw   xmm7,xmm4
+  movdqa      xmm9,xmm15
+  pabsw       xmm0,xmm0
+  psllw       xmm7,1
+  pandn       xmm9,xmm12
+  pcmpgtw     xmm3,xmm0
+  paddw       xmm7,xmm12
+  movd        xmm0,eax
+  pand        xmm3,xmm1
+  paddw       xmm7,xmm12
+  punpcklwd   xmm0,xmm0
+  paddw       xmm7,xmm12
+  pshufd      xmm1,xmm0,0
+  paddw       xmm7,xmm13
+  movdqa      xmm0,xmm3
+  pandn       xmm0,xmm6
+  paddw       xmm7,xmm8
+  movdqa      [rsp+70h],xmm1
+  paddw       xmm7,xmm5
+  movdqa      [rsp+120h],xmm0
+  movdqa      xmm0,[rcx+rbp]
+  punpcklbw   xmm0,xmm4
+  paddw       xmm7,xmm1
+  movdqa      xmm4,xmm15
+  psllw       xmm0,1
+  psraw       xmm7,3
+  paddw       xmm0,xmm6
+  pand        xmm7,xmm15
+  paddw       xmm0,xmm6
+  paddw       xmm0,xmm6
+  paddw       xmm0,xmm14
+  movdqa      xmm6,xmm15
+  paddw       xmm0,xmm5
+  pandn       xmm6,xmm13
+  paddw       xmm0,xmm8
+  paddw       xmm0,xmm1
+  psraw       xmm0,3
+  movdqa      xmm1,xmm12
+  paddw       xmm1,xmm13
+  pand        xmm0,xmm3
+  movdqa      [rsp+100h],xmm0
+  movdqa      xmm0,xmm8
+  paddw       xmm0,xmm5
+  paddw       xmm1,xmm0
+  movdqa      xmm0,xmm3
+  paddw       xmm1,xmm2
+  psraw       xmm1,2
+  pandn       xmm0,xmm14
+  pand        xmm4,xmm1
+  movdqa      [rsp+0E0h],xmm0
+  movdqa      xmm0,xmm5
+  paddw       xmm0,xmm8
+  movdqa      xmm1,[rsp+60h]
+  paddw       xmm1,xmm14
+  movdqa      xmm14,xmm3
+  paddw       xmm1,xmm0
+  movdqa      xmm0,xmm8
+  paddw       xmm0,[rsp+30h]
+  paddw       xmm1,xmm2
+  psraw       xmm1,2
+  pand        xmm14,xmm1
+  movdqa      xmm1,xmm13
+  paddw       xmm1,xmm13
+  paddw       xmm1,xmm0
+  paddw       xmm1,xmm2
+  psraw       xmm1,2
+  movdqa      xmm0,[rsp+30h]
+  movdqa      xmm2,xmm13
+  movdqa      xmm5,xmm15
+  paddw       xmm0,[rsp+70h]
+  pandn       xmm5,xmm1
+  paddw       xmm2,xmm8
+  movdqa      xmm8,[rsp+90h]
+  movdqa      xmm1,xmm12
+  paddw       xmm2,xmm8
+  psllw       xmm2,1
+  paddw       xmm2,xmm0
+  paddw       xmm1,xmm2
+  movdqa      xmm0,xmm8
+  movdqa      xmm8,xmm3
+  movdqa      xmm2,[rsp+30h]
+  paddw       xmm0,xmm13
+  psraw       xmm1,3
+  pand        xmm15,xmm1
+  movdqa      xmm1,xmm2
+  paddw       xmm1,xmm2
+  paddw       xmm2,[rsp+90h]
+  paddw       xmm2,[rsp+0B0h]
+  paddw       xmm1,xmm0
+  movdqa      xmm0,xmm13
+  movdqa      xmm13,[r8]
+  paddw       xmm0, [rsp+70h]
+  paddw       xmm1, [rsp+0A0h]
+  psllw       xmm2,1
+  paddw       xmm2,xmm0
+  psraw       xmm1,2
+  movdqa      xmm0, [rdi]
+  pandn       xmm8,xmm1
+  movdqa      xmm1, [rsp+60h]
+  paddw       xmm1,xmm2
+  movdqa      xmm2, [rbx]
+  psraw       xmm1,3
+  pand        xmm3,xmm1
+  movdqa      xmm1, [rbp]
+  movdqa      [rsp+0D0h],xmm3
+  pxor        xmm3,xmm3
+  punpckhbw   xmm0,xmm3
+  punpckhbw   xmm1,xmm3
+  punpckhbw   xmm13,xmm3
+  movdqa      [rsp+0C0h],xmm0
+  movdqa      xmm0,[r10+rbp]
+  movdqa      [rsp],xmm1
+  punpckhbw   xmm0,xmm3
+  punpckhbw   xmm2,xmm3
+  movdqa      [rsp+80h],xmm0
+  movdqa      xmm0,[rsi+rbp]
+  movdqa      [rsp+10h],xmm13
+  punpckhbw   xmm0,xmm3
+  movdqa      [rsp+50h],xmm0
+  movdqa      xmm0,xmm1
+  movdqa      xmm1,xmm13
+  psubw       xmm0,xmm13
+  psubw       xmm1,xmm2
+  pabsw       xmm3,xmm0
+  pabsw       xmm0,xmm1
+  movdqa      xmm1,[rsp]
+  movdqa      xmm13,[rsp+40h]
+  movdqa      [rsp+110h],xmm2
+  psubw       xmm1, [rsp+80h]
+  pcmpgtw     xmm13,xmm0
+  pcmpgtw     xmm11,xmm3
+  pabsw       xmm0,xmm1
+  pcmpgtw     xmm10,xmm3
+  movdqa      xmm1, [rsp+40h]
+  movdqa      xmm2,xmm1
+  movdqa      xmm3,xmm1
+  pcmpgtw     xmm2,xmm0
+  movdqa      xmm0, [rsp+10h]
+  pand        xmm13,xmm2
+  pand        xmm13,xmm11
+  movdqa      xmm11,[rsp+0C0h]
+  psubw       xmm0,xmm11
+  pabsw       xmm0,xmm0
+  pcmpgtw     xmm3,xmm0
+  pand        xmm3,xmm10
+  movdqa      xmm0,[rsp]
+  psubw       xmm0,[rsp+50h]
+  movdqa      xmm2,[rdx]
+  pabsw       xmm0,xmm0
+  por         xmm7,xmm9
+  movdqa      xmm9,[rsp+20h]
+  pcmpgtw     xmm1,xmm0
+  pand        xmm9,xmm7
+  movdqa      xmm7,[rsp+20h]
+  movdqa      xmm0,xmm7
+  pandn       xmm0,xmm12
+  movdqa      xmm12,[rsp+110h]
+  pand        xmm1,xmm10
+  movdqa      xmm10,[rsp+70h]
+  movdqa      [rsp+40h],xmm1
+  movdqa      xmm1,xmm13
+  por         xmm9,xmm0
+  pxor        xmm0,xmm0
+  por         xmm4,xmm6
+  movdqa      xmm6,xmm7
+  punpckhbw   xmm2,xmm0
+  por         xmm15,xmm5
+  movdqa      xmm5,[rsp+20h]
+  movdqa      xmm0,xmm3
+  psllw       xmm2,1
+  pandn       xmm0,xmm11
+  pand        xmm6,xmm4
+  movdqa      xmm4,[rsp]
+  paddw       xmm2,xmm11
+  pand        xmm5,xmm15
+  movdqa      xmm15,[rsp+20h]
+  paddw       xmm2,xmm11
+  paddw       xmm2,xmm11
+  paddw       xmm2,xmm12
+  paddw       xmm2,[rsp+10h]
+  paddw       xmm2,[rsp]
+  paddw       xmm2,xmm10
+  psraw       xmm2,3
+  pand        xmm2,xmm3
+  por         xmm2,xmm0
+  pand        xmm1,xmm2
+  movdqa      xmm0,xmm13
+  movdqa      xmm2,xmm11
+  pandn       xmm0,xmm11
+  paddw       xmm2,xmm12
+  por         xmm1,xmm0
+  packuswb    xmm9,xmm1
+  movdqa      xmm0,xmm7
+  movdqa      xmm7,[rsp+0A0h]
+  pandn       xmm0,[rsp+0F0h]
+  movdqa      xmm1,xmm3
+  por         xmm6,xmm0
+  movdqa      xmm0,[rsp+10h]
+  paddw       xmm0,xmm4
+  paddw       xmm2,xmm0
+  paddw       xmm2,xmm7
+  movdqa      xmm0,xmm3
+  pandn       xmm0,xmm12
+  psraw       xmm2,2
+  pand        xmm1,xmm2
+  por         xmm1,xmm0
+  movdqa      xmm2,xmm13
+  movdqa      xmm0,xmm13
+  pand        xmm2,xmm1
+  pandn       xmm0,xmm12
+  movdqa      xmm1,xmm12
+  paddw       xmm1,[rsp+10h]
+  por         xmm2,xmm0
+  movdqa      xmm0,xmm15
+  pandn       xmm0,[rsp+0B0h]
+  paddw       xmm1,xmm4
+  packuswb    xmm6,xmm2
+  movdqa      xmm2,xmm3
+  psllw       xmm1,1
+  por         xmm5,xmm0
+  movdqa      xmm0,[rsp+80h]
+  paddw       xmm0,xmm10
+  paddw       xmm1,xmm0
+  paddw       xmm11,xmm1
+  psraw       xmm11,3
+  movdqa      xmm1,xmm12
+  pand        xmm2,xmm11
+  paddw       xmm1,xmm12
+  movdqa      xmm11,[rsp+80h]
+  movdqa      xmm0, [rsp+10h]
+  por         xmm14,[rsp+0E0h]
+  paddw       xmm0,xmm11
+  movdqa      xmm4,xmm15
+  paddw       xmm1,xmm0
+  movdqa      xmm0,xmm13
+  paddw       xmm1,xmm7
+  psraw       xmm1,2
+  pandn       xmm3,xmm1
+  por         xmm2,xmm3
+  movdqa      xmm1,xmm13
+  movdqa      xmm3,[rsp+10h]
+  pandn       xmm0,xmm3
+  pand        xmm1,xmm2
+  movdqa      xmm2,xmm11
+  paddw       xmm2,[rsp]
+  por         xmm1,xmm0
+  movdqa      xmm0,[rsp+0D0h]
+  por         xmm0,xmm8
+  paddw       xmm2,xmm3
+  packuswb    xmm5,xmm1
+  movdqa      xmm8,[rsp+40h]
+  movdqa      xmm1,[rsp+50h]
+  movdqa      xmm3,xmm8
+  pand        xmm4,xmm0
+  psllw       xmm2,1
+  movdqa      xmm0,xmm15
+  pandn       xmm0,[rsp+90h]
+  por         xmm4,xmm0
+  movdqa      xmm0,xmm12
+  paddw       xmm0,xmm10
+  paddw       xmm2,xmm0
+  paddw       xmm1,xmm2
+  movdqa      xmm0,[rsp]
+  movdqa      xmm2,xmm11
+  paddw       xmm0,xmm12
+  movdqa      xmm12,[rsp]
+  paddw       xmm2,xmm11
+  paddw       xmm2,xmm0
+  psraw       xmm1,3
+  movdqa      xmm0,xmm8
+  pand        xmm3,xmm1
+  paddw       xmm2,xmm7
+  movdqa      xmm1,xmm13
+  psraw       xmm2,2
+  pandn       xmm0,xmm2
+  por         xmm3,xmm0
+  movdqa      xmm2,[rsp+50h]
+  movdqa      xmm0,xmm13
+  pandn       xmm0,xmm12
+  pand        xmm1,xmm3
+  paddw       xmm2,xmm11
+  movdqa      xmm3,xmm15
+  por         xmm1,xmm0
+  pand        xmm3,xmm14
+  movdqa      xmm14,[rsp+10h]
+  movdqa      xmm0,xmm15
+  pandn       xmm0,[rsp+30h]
+  packuswb    xmm4,xmm1
+  movdqa      xmm1,xmm8
+  por         xmm3,xmm0
+  movdqa      xmm0,xmm12
+  paddw       xmm0,xmm14
+  paddw       xmm2,xmm0
+  paddw       xmm2,xmm7
+  movdqa      xmm0,xmm8
+  pandn       xmm0,xmm11
+  psraw       xmm2,2
+  pand        xmm1,xmm2
+  por         xmm1,xmm0
+  movdqa      xmm2,xmm13
+  movdqa      xmm0,xmm13
+  pandn       xmm0,xmm11
+  pand        xmm2,xmm1
+  movdqa      xmm1,xmm15
+  por         xmm2,xmm0
+  packuswb    xmm3,xmm2
+  movdqa      xmm0,[rsp+100h]
+  por         xmm0,[rsp+120h]
+  pand        xmm1,xmm0
+  movdqa      xmm2,[rcx+rbp]
+  movdqa      xmm7,[rsp+50h]
+  pandn       xmm15,[rsp+60h]
+  lea         r11,[rsp+1D8h]
+  pxor        xmm0,xmm0
+  por         xmm1,xmm15
+  movaps      xmm15,[r11-0A8h]
+  movdqa      [rdi],xmm9
+  movaps      xmm9,[r11-48h]
+  punpckhbw   xmm2,xmm0
+  psllw       xmm2,1
+  paddw       xmm2,xmm7
+  paddw       xmm2,xmm7
+  movdqa      [rbx],xmm6
+  movaps      xmm6,[r11-18h]
+  paddw       xmm2,xmm7
+  paddw       xmm2,xmm11
+  movaps      xmm11,[r11-68h]
+  paddw       xmm2,xmm12
+  movaps      xmm12,[r11-78h]
+  paddw       xmm2,xmm14
+  paddw       xmm2,xmm10
+  psraw       xmm2,3
+  movaps      xmm10,[r11-58h]
+  movaps      xmm14,[r11-98h]
+  movdqa      xmm0,xmm13
+  pand        xmm2,xmm8
+  pandn       xmm8,xmm7
+  pandn       xmm13,xmm7
+  por         xmm2,xmm8
+  movaps      xmm7,[r11-28h]
+  movaps      xmm8,[r11-38h]
+  movdqa      [r8],xmm5
+  pand        xmm0,xmm2
+  por         xmm0,xmm13
+  packuswb    xmm1,xmm0
+  movaps      xmm13,[r11-88h]
+  movdqa      [rbp],xmm4
+  movdqa      [r10+rbp],xmm3
+  movdqa      [rsi+rbp],xmm1
+  mov         rsp,r11
+  pop         rbp
+  pop         rbx
+  ret
+
+;***********************************************************************
+; void DeblockChromaLt4V_ssse3(uint8_t * pPixCb, uint8_t * pPixCr,
+;                              int32_t iStride, int32_t iAlpha,
+;                              int32_t iBeta, int8_t * pTC)
+; Vertical-edge chroma deblocking, "normal" (bS < 4, tc-clipped) filter,
+; x86-64 branch.  Cb is processed in the low qword of each xmm, Cr in
+; the high qword, so both planes are filtered in one pass.
+;***********************************************************************
+WELS_EXTERN  DeblockChromaLt4V_ssse3
+  mov         rax,rsp
+  push        rbx
+  push        rbp
+  ; Remap the SysV argument registers (rdi,rsi,rdx,rcx,r8,r9) into the
+  ; layout the body below expects — apparently ported from a Win64
+  ; build: rcx=pPixCb, rdx=pPixCr, r8=iStride, r9=iAlpha, rbp=iBeta,
+  ; r10=pTC.
+  mov         r10,  rdx
+  mov         r11,  rcx
+  mov         rcx,  rdi
+  mov         rdx,  rsi
+  mov         rsi,  r10
+  mov         r10,  r9
+  mov         rbp,  r8
+  mov         r8,   rsi
+  mov         r9,   r11
+  sub         rsp,0C8h
+  pxor        xmm1,xmm1
+  mov         rbx,rcx
+  movsxd      r11,r8d
+  ; Replicate the four per-edge tc bytes from pTC into adjacent word
+  ; pairs at [rsp..rsp+0Fh], forming the 8-lane tc vector loaded below.
+  movsx       ecx,byte [r10]
+  movsx       r8d,byte [r10+2]
+  mov         rdi,rdx
+  movq        xmm2,[rbx]
+  movq        xmm9,[r11+rbx]
+  movsx       edx,byte [r10+1]
+  mov         word [rsp+2],cx
+  mov         word [rsp],cx
+  movsx       eax,byte [r10+3]
+  mov         word [rsp+6],dx
+  mov         word [rsp+4],dx
+  movdqa      xmm11,xmm1
+  mov         word [rsp+0Eh],ax
+  mov         word [rsp+0Ch],ax
+  lea         eax,[r11+r11]
+  movsxd      rcx,eax
+  mov         rax,rbx
+  mov         rdx,rdi
+  sub         rax,rcx
+  mov         word [rsp+0Ah],r8w
+  mov         word [rsp+8],r8w
+  movdqa      xmm6,[rsp]
+  movdqa      xmm7,xmm6
+  movq        xmm13, [rax]
+  mov         rax,rdi
+  sub         rax,rcx
+  mov         rcx,rbx
+  ; xmm7 = mask(tc > 0); xmm11 = -tc (lower clamp bound for the delta).
+  pcmpgtw     xmm7,xmm1
+  psubw       xmm11,xmm6
+  sub         rcx,r11
+  sub         rdx,r11
+  ; Load the two rows above and two rows below the edge for both
+  ; planes and combine them qword-wise (Cb low / Cr high), then unpack
+  ; bytes to words against the zero register xmm1.
+  movq        xmm0,[rax]
+  movsx       eax,r9w
+  movq        xmm15,[rcx]
+  punpcklqdq  xmm13,xmm0
+  movq        xmm0, [rdx]
+  movdqa      xmm4,xmm13
+  punpcklqdq  xmm15,xmm0
+  movq        xmm0, [rdi]
+  punpcklbw   xmm4,xmm1
+  movdqa      xmm12,xmm15
+  punpcklqdq  xmm2,xmm0
+  movq        xmm0, [r11+rdi]
+  punpcklbw   xmm12,xmm1
+  movdqa      xmm14,xmm2
+  punpcklqdq  xmm9,xmm0
+  punpckhbw   xmm2,xmm1
+  punpcklbw   xmm14,xmm1
+  ; Splat iAlpha (xmm10), iBeta (xmm8) and the rounding constant 4
+  ; (xmm5) across all eight word lanes.
+  movd        xmm0,eax
+  mov         eax, ebp ; iBeta
+  punpckhbw   xmm13,xmm1
+  punpckhbw   xmm15,xmm1
+  movdqa      xmm3,xmm9
+  movdqa      [rsp+10h],xmm2
+  punpcklwd   xmm0,xmm0
+  punpckhbw   xmm9,xmm1
+  punpcklbw   xmm3,xmm1
+  movdqa      xmm1,xmm14
+  pshufd      xmm10,xmm0,0
+  movd        xmm0,eax
+  mov         eax,4
+  cwde
+  punpcklwd   xmm0,xmm0
+  pshufd      xmm8,xmm0,0
+  movd        xmm0,eax
+  punpcklwd   xmm0,xmm0
+  pshufd      xmm5,xmm0,0
+  ; delta = ((q0-p0)<<2 + (p1-q1) + 4) >> 3, then clamped into
+  ; [-tc, tc] via pmaxsw/pminsw — the standard H.264 normal-filter
+  ; delta (low halves first).
+  psubw       xmm1,xmm12
+  movdqa      xmm2,xmm10
+  lea         r11,[rsp+0C8h]
+  psllw       xmm1,2
+  movdqa      xmm0,xmm4
+  psubw       xmm4,xmm12
+  psubw       xmm0,xmm3
+  psubw       xmm3,xmm14
+  paddw       xmm1,xmm0
+  paddw       xmm1,xmm5
+  movdqa      xmm0,xmm11
+  psraw       xmm1,3
+  pmaxsw      xmm0,xmm1
+  pminsw      xmm6,xmm0
+  ; Build the per-lane filtering mask: alpha threshold on the
+  ; cross-edge difference, beta on the two neighbour differences,
+  ; qualified by tc > 0 (xmm7).
+  movdqa      xmm1,xmm8
+  movdqa      xmm0,xmm12
+  psubw       xmm0,xmm14
+  pabsw       xmm0,xmm0
+  pcmpgtw     xmm2,xmm0
+  pabsw       xmm0,xmm4
+  pcmpgtw     xmm1,xmm0
+  pabsw       xmm0,xmm3
+  movdqa      xmm3,[rsp]
+  pand        xmm2,xmm1
+  movdqa      xmm1,xmm8
+  pcmpgtw     xmm1,xmm0
+  movdqa      xmm0,xmm13
+  pand        xmm2,xmm1
+  psubw       xmm0,xmm9
+  psubw       xmm13,xmm15
+  pand        xmm2,xmm7
+  pand        xmm6,xmm2
+  ; Apply the masked delta: p0 += delta, q0 -= delta (low halves).
+  paddw       xmm12,xmm6
+  psubw       xmm14,xmm6
+  ; Second pass: the same clip-and-filter applied to the high-order
+  ; (punpckhbw) halves of the rows.
+  movdqa      xmm2,[rsp+10h]
+  movaps      xmm6,[r11-18h]
+  movdqa      xmm1,xmm2
+  psubw       xmm1,xmm15
+  psubw       xmm9,xmm2
+  psllw       xmm1,2
+  paddw       xmm1,xmm0
+  paddw       xmm1,xmm5
+  movdqa      xmm0,xmm15
+  psubw       xmm0,xmm2
+  psraw       xmm1,3
+  pmaxsw      xmm11,xmm1
+  pabsw       xmm0,xmm0
+  movdqa      xmm1,xmm8
+  pcmpgtw     xmm10,xmm0
+  pabsw       xmm0,xmm13
+  pminsw      xmm3,xmm11
+  ; NOTE(review): the movaps loads from [r11-18h]..[r11-78h] and
+  ; [rsp+20h]..[rsp+40h] reload xmm6-xmm15 from stack slots this
+  ; function never wrote — apparently leftover from the Win64 origin
+  ; where xmm6-xmm15 are callee-saved.  Harmless under SysV (all xmm
+  ; registers are caller-saved) but they load uninitialized data.
+  movaps      xmm11,[r11-68h]
+  movaps      xmm13,[rsp+40h]
+  pcmpgtw     xmm1,xmm0
+  pabsw       xmm0,xmm9
+  movaps      xmm9, [r11-48h]
+  pand        xmm10,xmm1
+  pcmpgtw     xmm8,xmm0
+  pand        xmm10,xmm8
+  pand        xmm10,xmm7
+  movaps      xmm8,[r11-38h]
+  movaps      xmm7,[r11-28h]
+  pand        xmm3,xmm10
+  paddw       xmm15,xmm3
+  psubw       xmm2,xmm3
+  movaps      xmm10,[r11-58h]
+  ; Pack the filtered words back to bytes and store the two rows of
+  ; each plane (low qword -> Cb pointers, high qword -> Cr pointers).
+  packuswb    xmm12,xmm15
+  movaps      xmm15,[rsp+20h]
+  packuswb    xmm14,xmm2
+  movq        [rcx],xmm12
+  movq        [rbx],xmm14
+  psrldq      xmm12,8
+  psrldq      xmm14,8
+  movq        [rdx],xmm12
+  movaps      xmm12,[r11-78h]
+  movq        [rdi],xmm14
+  movaps      xmm14,[rsp+30h]
+  mov         rsp,r11
+  pop         rbp
+  pop         rbx
+  ret
+
+;***********************************************************************
+; void DeblockChromaEq4V_ssse3(uint8_t * pPixCb, uint8_t * pPixCr,
+;                              int32_t iStride, int32_t iAlpha,
+;                              int32_t iBeta)
+; Vertical-edge chroma deblocking, "strong" (bS == 4) filter, x86-64
+; branch.  No tc clipping: where the alpha/beta sample conditions hold,
+; p0/q0 are replaced by (2*p1 + p0 + q1 + 2) >> 2 (and symmetrically
+; for q0); otherwise the original sample is kept (pandn/por select).
+; Cb occupies the low qword of each xmm, Cr the high qword.
+;***********************************************************************
+WELS_EXTERN DeblockChromaEq4V_ssse3
+  mov         rax,rsp
+  push        rbx
+  push        rbp
+
+  ; Remap SysV args into the register layout the body expects:
+  ; rcx=pPixCb, rdx=pPixCr, r8=iStride, r9=iAlpha, rbp=iBeta.
+  mov         rbp, r8
+  mov         r8, rdx
+  mov         r9, rcx
+  mov         rcx, rdi
+  mov         rdx, rsi
+
+  sub         rsp,90h
+  pxor        xmm1,xmm1
+  mov         r11,rcx
+  mov         rbx,rdx
+  mov         r10d,r9d
+  ; Load rows p1 (row -2), p0 (row -1), q0 (row 0), q1 (row +1) for
+  ; both planes and merge them qword-wise, then unpack bytes to words.
+  movq        xmm13,[r11]
+  lea         eax,[r8+r8]
+  movsxd      r9,eax
+  mov         rax,rcx
+  sub         rax,r9
+  movq        xmm14,[rax]
+  mov         rax,rdx
+  sub         rax,r9
+  movq        xmm0,[rax]
+  movsxd      rax,r8d
+  sub         rcx,rax
+  sub         rdx,rax
+  movq        xmm12,[rax+r11]
+  movq        xmm10,[rcx]
+  punpcklqdq  xmm14,xmm0
+  movdqa      xmm8,xmm14
+  movq        xmm0,[rdx]
+  punpcklbw   xmm8,xmm1
+  punpckhbw   xmm14,xmm1
+  punpcklqdq  xmm10,xmm0
+  movq        xmm0,[rbx]
+  movdqa      xmm5,xmm10
+  punpcklqdq  xmm13,xmm0
+  movq        xmm0, [rax+rbx]
+  punpcklbw   xmm5,xmm1
+  movsx       eax,r10w
+  movdqa      xmm9,xmm13
+  punpcklqdq  xmm12,xmm0
+  punpcklbw   xmm9,xmm1
+  punpckhbw   xmm10,xmm1
+  ; Splat iAlpha (xmm11) and iBeta (xmm3) across all word lanes.
+  movd        xmm0,eax
+  mov         eax, ebp   ; iBeta
+  punpckhbw   xmm13,xmm1
+  movdqa      xmm7,xmm12
+  punpcklwd   xmm0,xmm0
+  punpckhbw   xmm12,xmm1
+  pshufd      xmm11,xmm0,0
+  punpcklbw   xmm7,xmm1
+  movd        xmm0,eax
+  movdqa      xmm1,xmm8
+  psubw       xmm1,xmm5
+  punpcklwd   xmm0,xmm0
+  movdqa      xmm6,xmm11
+  pshufd      xmm3,xmm0,0
+  ; Build the sample masks: |p0-q0| < alpha and |p1-p0| < beta and
+  ; |q1-q0| < beta, for the low halves (xmm6) and high halves (xmm11).
+  movdqa      xmm0,xmm5
+  psubw       xmm0,xmm9
+  movdqa      xmm2,xmm3
+  pabsw       xmm0,xmm0
+  pcmpgtw     xmm6,xmm0
+  pabsw       xmm0,xmm1
+  movdqa      xmm1,xmm3
+  pcmpgtw     xmm2,xmm0
+  pand        xmm6,xmm2
+  movdqa      xmm0,xmm7
+  movdqa      xmm2,xmm3
+  psubw       xmm0,xmm9
+  pabsw       xmm0,xmm0
+  pcmpgtw     xmm1,xmm0
+  pand        xmm6,xmm1
+  movdqa      xmm0,xmm10
+  movdqa      xmm1,xmm14
+  psubw       xmm0,xmm13
+  psubw       xmm1,xmm10
+  pabsw       xmm0,xmm0
+  pcmpgtw     xmm11,xmm0
+  pabsw       xmm0,xmm1
+  pcmpgtw     xmm2,xmm0
+  pand        xmm11,xmm2
+  movdqa      xmm0,xmm12
+  movdqa      xmm4,xmm6
+  movdqa      xmm1,xmm8
+  ; Strong-filter arithmetic: accumulate 2*outer + inner + opposite,
+  ; add the rounding constant 2 (splatted into xmm3 below), shift
+  ; right by 2, and blend with the unfiltered samples via the masks.
+  mov         eax,2
+  cwde
+  paddw       xmm1,xmm8
+  psubw       xmm0,xmm13
+  paddw       xmm1,xmm5
+  pabsw       xmm0,xmm0
+  movdqa      xmm2,xmm14
+  paddw       xmm1,xmm7
+  pcmpgtw     xmm3,xmm0
+  paddw       xmm2,xmm14
+  movd        xmm0,eax
+  pand        xmm11,xmm3
+  paddw       xmm7,xmm7
+  paddw       xmm2,xmm10
+  punpcklwd   xmm0,xmm0
+  paddw       xmm2,xmm12
+  paddw       xmm12,xmm12
+  pshufd      xmm3,xmm0,0
+  paddw       xmm7,xmm9
+  paddw       xmm12,xmm13
+  movdqa      xmm0,xmm6
+  paddw       xmm1,xmm3
+  pandn       xmm0,xmm5
+  paddw       xmm7,xmm8
+  psraw       xmm1,2
+  paddw       xmm12,xmm14
+  paddw       xmm7,xmm3
+  ;movaps      xmm14,[rsp]
+  pand        xmm4,xmm1
+  paddw       xmm12,xmm3
+  psraw       xmm7,2
+  movdqa      xmm1,xmm11
+  por         xmm4,xmm0
+  psraw       xmm12,2
+  paddw       xmm2,xmm3
+  movdqa      xmm0,xmm11
+  pandn       xmm0,xmm10
+  psraw       xmm2,2
+  pand        xmm1,xmm2
+  por         xmm1,xmm0
+  ; Pack to bytes and store the filtered p0/q0 rows for both planes.
+  packuswb    xmm4,xmm1
+  movdqa      xmm0,xmm11
+  movdqa      xmm1,xmm6
+  pand        xmm1,xmm7
+  movq        [rcx],xmm4
+  pandn       xmm6,xmm9
+  pandn       xmm11,xmm13
+  pand        xmm0,xmm12
+  por         xmm1,xmm6
+  por         xmm0,xmm11
+  psrldq      xmm4,8
+  packuswb    xmm1,xmm0
+  movq        [r11],xmm1
+  psrldq      xmm1,8
+  movq        [rdx],xmm4
+  lea         r11,[rsp+90h]
+  movq        [rbx],xmm1
+  mov         rsp,r11
+  pop         rbp
+  pop         rbx
+  ret
+
+;***********************************************************************
+; void DeblockChromaEq4H_ssse3(uint8_t * pPixCb, uint8_t * pPixCr,
+;                              int32_t iStride, int32_t iAlpha,
+;                              int32_t iBeta)
+; Horizontal-edge chroma deblocking, "strong" (bS == 4) filter, x86-64
+; branch.  The four pixels straddling the vertical edge (p1 p0 | q0 q1)
+; are gathered column-wise with 32-bit loads, transposed into rows,
+; filtered with the same arithmetic as the V variant, transposed back,
+; and scattered out as dwords.
+;***********************************************************************
+WELS_EXTERN DeblockChromaEq4H_ssse3
+  mov         rax,rsp
+  push        rbx
+  push        rbp
+  push        r12
+
+  ; Remap SysV args: rcx=pPixCb, rdx/rdi=pPixCr, r8=iStride,
+  ; r9=iAlpha, rbp=iBeta.
+  mov         rbp,   r8
+  mov         r8,    rdx
+  mov         r9,    rcx
+  mov         rcx,   rdi
+  mov         rdx,   rsi
+  mov         rdi,   rdx
+
+  sub         rsp,140h
+  ; Gather: read a dword (p1 p0 q0 q1) from each of 8 rows per plane
+  ; starting at pPix-2, staging them in the stack scratch area and
+  ; interleaving with punpckldq/punpcklqdq as they arrive.
+  lea         eax,[r8*4]
+  movsxd      r10,eax
+  mov         eax,[rcx-2]
+  mov         [rsp+10h],eax
+  lea         rbx,[r10+rdx-2]
+  lea         r11,[r10+rcx-2]
+
+  movdqa      xmm5,[rsp+10h]
+  movsxd      r10,r8d
+  mov         eax,[r10+rcx-2]
+  lea         rdx,[r10+r10*2]
+  mov         [rsp+20h],eax
+  mov         eax,[rcx+r10*2-2]
+  mov         [rsp+30h],eax
+  mov         eax,[rdx+rcx-2]
+  movdqa      xmm2,[rsp+20h]
+  mov         [rsp+40h],eax
+  mov         eax, [rdi-2]
+  movdqa      xmm4,[rsp+30h]
+  mov         [rsp+50h],eax
+  mov         eax,[r10+rdi-2]
+  movdqa      xmm3,[rsp+40h]
+  mov         [rsp+60h],eax
+  mov         eax,[rdi+r10*2-2]
+  punpckldq   xmm5,[rsp+50h]
+  mov         [rsp+70h],eax
+  mov         eax, [rdx+rdi-2]
+  punpckldq   xmm2, [rsp+60h]
+  mov          [rsp+80h],eax
+  mov         eax,[r11]
+  punpckldq   xmm4, [rsp+70h]
+  mov         [rsp+50h],eax
+  mov         eax,[rbx]
+  punpckldq   xmm3,[rsp+80h]
+  mov         [rsp+60h],eax
+  mov         eax,[r10+r11]
+  movdqa      xmm0, [rsp+50h]
+  punpckldq   xmm0, [rsp+60h]
+  punpcklqdq  xmm5,xmm0
+  movdqa      [rsp+50h],xmm0
+  mov         [rsp+50h],eax
+  mov         eax,[r10+rbx]
+  movdqa      xmm0,[rsp+50h]
+  movdqa      xmm1,xmm5
+  mov         [rsp+60h],eax
+  mov         eax,[r11+r10*2]
+  punpckldq   xmm0, [rsp+60h]
+  punpcklqdq  xmm2,xmm0
+  punpcklbw   xmm1,xmm2
+  punpckhbw   xmm5,xmm2
+  movdqa      [rsp+50h],xmm0
+  mov         [rsp+50h],eax
+  mov         eax,[rbx+r10*2]
+  movdqa      xmm0,[rsp+50h]
+  mov         [rsp+60h],eax
+  mov         eax, [rdx+r11]
+  movdqa      xmm15,xmm1
+  punpckldq   xmm0,[rsp+60h]
+  punpcklqdq  xmm4,xmm0
+  movdqa      [rsp+50h],xmm0
+  mov         [rsp+50h],eax
+  mov         eax, [rdx+rbx]
+  movdqa      xmm0,[rsp+50h]
+  mov         [rsp+60h],eax
+  punpckldq   xmm0, [rsp+60h]
+  punpcklqdq  xmm3,xmm0
+  ; Transpose the gathered bytes so xmm11/xmm12/xmm14/xmm15 end up
+  ; holding the p1 / p0 / q0 / q1 columns as rows.
+  movdqa      xmm0,xmm4
+  punpcklbw   xmm0,xmm3
+  punpckhbw   xmm4,xmm3
+  punpcklwd   xmm15,xmm0
+  punpckhwd   xmm1,xmm0
+  movdqa      xmm0,xmm5
+  movdqa      xmm12,xmm15
+  punpcklwd   xmm0,xmm4
+  punpckhwd   xmm5,xmm4
+  punpckldq   xmm12,xmm0
+  punpckhdq   xmm15,xmm0
+  movdqa      xmm0,xmm1
+  movdqa      xmm11,xmm12
+  punpckldq   xmm0,xmm5
+  punpckhdq   xmm1,xmm5
+  punpcklqdq  xmm11,xmm0
+  punpckhqdq  xmm12,xmm0
+  ; Splat iAlpha (xmm13) and iBeta (xmm3) across the word lanes, and
+  ; unpack the transposed rows from bytes to words.
+  movsx       eax,r9w
+  movdqa      xmm14,xmm15
+  punpcklqdq  xmm14,xmm1
+  punpckhqdq  xmm15,xmm1
+  pxor        xmm1,xmm1
+  movd        xmm0,eax
+  movdqa      xmm4,xmm12
+  movdqa      xmm8,xmm11
+  mov         eax, ebp ; iBeta
+  punpcklwd   xmm0,xmm0
+  punpcklbw   xmm4,xmm1
+  punpckhbw   xmm12,xmm1
+  movdqa      xmm9,xmm14
+  movdqa      xmm7,xmm15
+  movdqa      xmm10,xmm15
+  pshufd      xmm13,xmm0,0
+  punpcklbw   xmm9,xmm1
+  punpckhbw   xmm14,xmm1
+  movdqa      xmm6,xmm13
+  movd        xmm0,eax
+  movdqa      [rsp],xmm11
+  mov         eax,2
+  cwde
+  punpckhbw   xmm11,xmm1
+  punpckhbw   xmm10,xmm1
+  punpcklbw   xmm7,xmm1
+  punpcklwd   xmm0,xmm0
+  punpcklbw   xmm8,xmm1
+  pshufd      xmm3,xmm0,0
+  ; Build the alpha/beta sample masks for the low (xmm6) and high
+  ; (xmm13) halves, exactly as in the vertical Eq4 filter.
+  movdqa      xmm1,xmm8
+  movdqa      xmm0,xmm4
+  psubw       xmm0,xmm9
+  psubw       xmm1,xmm4
+  movdqa      xmm2,xmm3
+  pabsw       xmm0,xmm0
+  pcmpgtw     xmm6,xmm0
+  pabsw       xmm0,xmm1
+  movdqa      xmm1,xmm3
+  pcmpgtw     xmm2,xmm0
+  pand        xmm6,xmm2
+  movdqa      xmm0,xmm7
+  movdqa      xmm2,xmm3
+  psubw       xmm0,xmm9
+  pabsw       xmm0,xmm0
+  pcmpgtw     xmm1,xmm0
+  pand        xmm6,xmm1
+  movdqa      xmm0,xmm12
+  movdqa      xmm1,xmm11
+  psubw       xmm0,xmm14
+  psubw       xmm1,xmm12
+  movdqa      xmm5,xmm6
+  pabsw       xmm0,xmm0
+  pcmpgtw     xmm13,xmm0
+  pabsw       xmm0,xmm1
+  movdqa      xmm1,xmm8
+  pcmpgtw     xmm2,xmm0
+  paddw       xmm1,xmm8
+  movdqa      xmm0,xmm10
+  pand        xmm13,xmm2
+  psubw       xmm0,xmm14
+  paddw       xmm1,xmm4
+  movdqa      xmm2,xmm11
+  pabsw       xmm0,xmm0
+  paddw       xmm2,xmm11
+  paddw       xmm1,xmm7
+  pcmpgtw     xmm3,xmm0
+  paddw       xmm2,xmm12
+  movd        xmm0,eax
+  pand        xmm13,xmm3
+  paddw       xmm2,xmm10
+  punpcklwd   xmm0,xmm0
+  pshufd      xmm3,xmm0,0
+  ; Strong-filter arithmetic: (2*outer + inner + opposite + 2) >> 2,
+  ; blended with the unfiltered samples through the masks.
+  movdqa      xmm0,xmm6
+  paddw       xmm1,xmm3
+  pandn       xmm0,xmm4
+  paddw       xmm2,xmm3
+  psraw       xmm1,2
+  pand        xmm5,xmm1
+  por         xmm5,xmm0
+  paddw       xmm7,xmm7
+  paddw       xmm10,xmm10
+  psraw       xmm2,2
+  movdqa      xmm1,xmm13
+  movdqa      xmm0,xmm13
+  pandn       xmm0,xmm12
+  pand        xmm1,xmm2
+  paddw       xmm7,xmm9
+  por         xmm1,xmm0
+  paddw       xmm10,xmm14
+  paddw       xmm7,xmm8
+  movdqa      xmm0,xmm13
+  packuswb    xmm5,xmm1
+  paddw       xmm7,xmm3
+  paddw       xmm10,xmm11
+  movdqa      xmm1,xmm6
+  paddw       xmm10,xmm3
+  pandn       xmm6,xmm9
+  psraw       xmm7,2
+  pand        xmm1,xmm7
+  psraw       xmm10,2
+  pandn       xmm13,xmm14
+  pand        xmm0,xmm10
+  por         xmm1,xmm6
+  movdqa      xmm6,[rsp]
+  movdqa      xmm4,xmm6
+  por         xmm0,xmm13
+  ; Transpose the filtered rows back to column order for the scatter.
+  punpcklbw   xmm4,xmm5
+  punpckhbw   xmm6,xmm5
+  movdqa      xmm3,xmm4
+  packuswb    xmm1,xmm0
+  movdqa      xmm0,xmm1
+  punpckhbw   xmm1,xmm15
+  punpcklbw   xmm0,xmm15
+  punpcklwd   xmm3,xmm0
+  punpckhwd   xmm4,xmm0
+  movdqa      xmm0,xmm6
+  movdqa      xmm2,xmm3
+  punpcklwd   xmm0,xmm1
+  punpckhwd   xmm6,xmm1
+  movdqa      xmm1,xmm4
+  punpckldq   xmm2,xmm0
+  punpckhdq   xmm3,xmm0
+  punpckldq   xmm1,xmm6
+  movdqa      xmm0,xmm2
+  punpcklqdq  xmm0,xmm1
+  punpckhdq   xmm4,xmm6
+  punpckhqdq  xmm2,xmm1
+  ; Scatter: write each 4-byte column back to its row at pPix-2,
+  ; one dword per row, via the stack staging slots.
+  movdqa      [rsp+10h],xmm0
+  movdqa      [rsp+60h],xmm2
+  movdqa      xmm0,xmm3
+  mov         eax,[rsp+10h]
+  mov         [rcx-2],eax
+  mov         eax,[rsp+60h]
+  punpcklqdq  xmm0,xmm4
+  punpckhqdq  xmm3,xmm4
+  mov         [r10+rcx-2],eax
+  movdqa      [rsp+20h],xmm0
+  mov         eax, [rsp+20h]
+  movdqa      [rsp+70h],xmm3
+  mov         [rcx+r10*2-2],eax
+  mov         eax,[rsp+70h]
+  mov         [rdx+rcx-2],eax
+  mov         eax,[rsp+18h]
+  mov         [r11],eax
+  mov         eax,[rsp+68h]
+  mov         [r10+r11],eax
+  mov         eax,[rsp+28h]
+  mov         [r11+r10*2],eax
+  mov         eax,[rsp+78h]
+  mov         [rdx+r11],eax
+  mov         eax,[rsp+14h]
+  mov         [rdi-2],eax
+  mov         eax,[rsp+64h]
+  mov         [r10+rdi-2],eax
+  mov         eax,[rsp+24h]
+  mov         [rdi+r10*2-2],eax
+  mov         eax, [rsp+74h]
+  mov         [rdx+rdi-2],eax
+  mov         eax, [rsp+1Ch]
+  mov         [rbx],eax
+  mov         eax, [rsp+6Ch]
+  mov         [r10+rbx],eax
+  mov         eax,[rsp+2Ch]
+  mov         [rbx+r10*2],eax
+  mov         eax,[rsp+7Ch]
+  mov         [rdx+rbx],eax
+  lea         r11,[rsp+140h]
+  ; NOTE(review): this load of rbx is immediately overwritten by the
+  ; "pop rbx" below, so it is dead — apparently leftover from the
+  ; Win64 register-save layout this routine was ported from.
+  mov         rbx, [r11+28h]
+  mov         rsp,r11
+  pop         r12
+  pop         rbp
+  pop         rbx
+  ret
+
+
+;***********************************************************************
+; void DeblockChromaLt4H_ssse3(uint8_t * pPixCb, uint8_t * pPixCr,
+;                              int32_t iStride, int32_t iAlpha,
+;                              int32_t iBeta, int8_t * pTC)
+; Horizontal-edge chroma deblocking, "normal" (bS < 4, tc-clipped)
+; filter, x86-64 branch.  Like the Eq4H variant it gathers the four
+; pixels around the edge with dword loads, transposes to rows, filters,
+; transposes back and scatters — but applies the tc-clipped delta of
+; the Lt4 filter instead of the strong filter.
+;***********************************************************************
+WELS_EXTERN DeblockChromaLt4H_ssse3
+  mov         rax,rsp
+  push        rbx
+  push        rbp
+  push        r12
+  push        r13
+  push        r14
+  sub         rsp,170h
+
+  ; Remap SysV args: rdx=pPixCb, rcx=pPixCr, r8=iStride, r9=iAlpha,
+  ; r13=iBeta, r14=pTC.
+  mov         r13,   r8
+  mov         r14,   r9
+  mov         r8,    rdx
+  mov         r9,    rcx
+  mov         rdx,   rdi
+  mov         rcx,   rsi
+
+  ; Gather phase: dword (p1 p0 q0 q1) from each of 8 rows per plane,
+  ; staged on the stack and interleaved as they arrive.
+  movsxd      rsi,r8d
+  lea         eax,[r8*4]
+  mov         r11d,r9d
+  movsxd      r10,eax
+  mov         eax, [rcx-2]
+  mov         r12,rdx
+  mov         [rsp+40h],eax
+  mov         eax, [rsi+rcx-2]
+  lea         rbx,[r10+rcx-2]
+  movdqa      xmm5,[rsp+40h]
+  mov         [rsp+50h],eax
+  mov         eax, [rcx+rsi*2-2]
+  lea         rbp,[r10+rdx-2]
+  movdqa      xmm2, [rsp+50h]
+  mov         [rsp+60h],eax
+  lea         r10,[rsi+rsi*2]
+  mov         rdi,rcx
+  mov         eax,[r10+rcx-2]
+  movdqa      xmm4,[rsp+60h]
+  mov         [rsp+70h],eax
+  mov         eax,[rdx-2]
+  mov         [rsp+80h],eax
+  mov         eax, [rsi+rdx-2]
+  movdqa      xmm3,[rsp+70h]
+  mov         [rsp+90h],eax
+  mov         eax,[rdx+rsi*2-2]
+  punpckldq   xmm5,[rsp+80h]
+  mov         [rsp+0A0h],eax
+  mov         eax, [r10+rdx-2]
+  punpckldq   xmm2,[rsp+90h]
+  mov         [rsp+0B0h],eax
+  mov         eax, [rbx]
+  punpckldq   xmm4,[rsp+0A0h]
+  mov         [rsp+80h],eax
+  mov         eax,[rbp]
+  punpckldq   xmm3,[rsp+0B0h]
+  mov         [rsp+90h],eax
+  mov         eax,[rsi+rbx]
+  movdqa      xmm0,[rsp+80h]
+  punpckldq   xmm0,[rsp+90h]
+  punpcklqdq  xmm5,xmm0
+  movdqa      [rsp+80h],xmm0
+  mov         [rsp+80h],eax
+  mov         eax,[rsi+rbp]
+  movdqa      xmm0,[rsp+80h]
+  movdqa      xmm1,xmm5
+  mov         [rsp+90h],eax
+  mov         eax,[rbx+rsi*2]
+  punpckldq   xmm0,[rsp+90h]
+  punpcklqdq  xmm2,xmm0
+  punpcklbw   xmm1,xmm2
+  punpckhbw   xmm5,xmm2
+  movdqa      [rsp+80h],xmm0
+  mov         [rsp+80h],eax
+  mov         eax,[rbp+rsi*2]
+  movdqa      xmm0, [rsp+80h]
+  mov         [rsp+90h],eax
+  mov         eax,[r10+rbx]
+  movdqa      xmm7,xmm1
+  punpckldq   xmm0,[rsp+90h]
+  punpcklqdq  xmm4,xmm0
+  movdqa      [rsp+80h],xmm0
+  mov         [rsp+80h],eax
+  mov         eax, [r10+rbp]
+  movdqa      xmm0,[rsp+80h]
+  mov         [rsp+90h],eax
+  punpckldq   xmm0,[rsp+90h]
+  punpcklqdq  xmm3,xmm0
+  ; Transpose the gathered bytes so the p1/p0/q0/q1 columns become
+  ; rows (xmm9/xmm6/xmm2/xmm7 after the qdq shuffles below).
+  movdqa      xmm0,xmm4
+  punpcklbw   xmm0,xmm3
+  punpckhbw   xmm4,xmm3
+  punpcklwd   xmm7,xmm0
+  punpckhwd   xmm1,xmm0
+  movdqa      xmm0,xmm5
+  movdqa      xmm6,xmm7
+  punpcklwd   xmm0,xmm4
+  punpckhwd   xmm5,xmm4
+  punpckldq   xmm6,xmm0
+  punpckhdq   xmm7,xmm0
+  movdqa      xmm0,xmm1
+  punpckldq   xmm0,xmm5
+  mov         rax, r14    ; pTC
+  punpckhdq   xmm1,xmm5
+  movdqa      xmm9,xmm6
+  punpckhqdq  xmm6,xmm0
+  punpcklqdq  xmm9,xmm0
+  movdqa      xmm2,xmm7
+  movdqa      xmm13,xmm6
+  movdqa      xmm4,xmm9
+  movdqa      [rsp+10h],xmm9
+  punpcklqdq  xmm2,xmm1
+  punpckhqdq  xmm7,xmm1
+  pxor        xmm1,xmm1
+  ; Replicate the four per-edge tc bytes from pTC into word pairs at
+  ; [rsp..rsp+0Fh], forming the 8-lane tc vector (xmm14 below).
+  movsx       ecx,byte [rax+3]
+  movsx       edx,byte [rax+2]
+  movsx       r8d,byte [rax+1]
+  movsx       r9d,byte [rax]
+  movdqa      xmm10,xmm1
+  movdqa      xmm15,xmm2
+  punpckhbw   xmm2,xmm1
+  punpckhbw   xmm6,xmm1
+  punpcklbw   xmm4,xmm1
+  movsx       eax,r11w
+  mov         word [rsp+0Eh],cx
+  mov         word [rsp+0Ch],cx
+  movdqa      xmm3,xmm7
+  movdqa      xmm8,xmm7
+  movdqa      [rsp+20h],xmm7
+  punpcklbw   xmm15,xmm1
+  punpcklbw   xmm13,xmm1
+  punpcklbw   xmm3,xmm1
+  mov         word [rsp+0Ah],dx
+  mov         word [rsp+8],dx
+  mov         word [rsp+6],r8w
+  movd        xmm0,eax
+  movdqa      [rsp+30h],xmm6
+  punpckhbw   xmm9,xmm1
+  punpckhbw   xmm8,xmm1
+  punpcklwd   xmm0,xmm0
+  mov         eax, r13d   ; iBeta
+  mov         word [rsp+4],r8w
+  mov         word [rsp+2],r9w
+  pshufd      xmm12,xmm0,0
+  mov         word [rsp],r9w
+  ; Splat iAlpha (xmm12), iBeta (xmm11) and the rounding constant 4
+  ; (xmm5); xmm10 = -tc, xmm7 = mask(tc > 0).
+  movd        xmm0,eax
+  mov         eax,4
+  cwde
+  movdqa      xmm14, [rsp]
+  movdqa      [rsp],xmm2
+  movdqa      xmm2,xmm12
+  punpcklwd   xmm0,xmm0
+  pshufd      xmm11,xmm0,0
+  psubw       xmm10,xmm14
+  movd        xmm0,eax
+  movdqa      xmm7,xmm14
+  movdqa      xmm6,xmm14
+  pcmpgtw     xmm7,xmm1
+  punpcklwd   xmm0,xmm0
+  pshufd      xmm5,xmm0,0
+  ; Low halves: delta = ((q0-p0)<<2 + (p1-q1) + 4) >> 3 clamped to
+  ; [-tc, tc], qualified by the alpha/beta masks and tc > 0, then
+  ; p0 += delta, q0 -= delta.
+  movdqa      xmm0,xmm4
+  movdqa      xmm1,xmm15
+  psubw       xmm4,xmm13
+  psubw       xmm0,xmm3
+  psubw       xmm1,xmm13
+  psubw       xmm3,xmm15
+  psllw       xmm1,2
+  paddw       xmm1,xmm0
+  paddw       xmm1,xmm5
+  movdqa      xmm0,xmm10
+  psraw       xmm1,3
+  pmaxsw      xmm0,xmm1
+  pminsw      xmm6,xmm0
+  movdqa      xmm1,xmm11
+  movdqa      xmm0,xmm13
+  psubw       xmm0,xmm15
+  pabsw       xmm0,xmm0
+  pcmpgtw     xmm2,xmm0
+  pabsw       xmm0,xmm4
+  pcmpgtw     xmm1,xmm0
+  pabsw       xmm0,xmm3
+  pand        xmm2,xmm1
+  movdqa      xmm1,xmm11
+  movdqa      xmm3,[rsp+30h]
+  pcmpgtw     xmm1,xmm0
+  movdqa      xmm0,xmm9
+  pand        xmm2,xmm1
+  psubw       xmm0,xmm8
+  psubw       xmm9,xmm3
+  pand        xmm2,xmm7
+  pand        xmm6,xmm2
+  psubw       xmm15,xmm6
+  paddw       xmm13,xmm6
+  ; High halves: the same clip-and-filter applied to the remaining
+  ; eight columns.
+  movdqa      xmm2,[rsp]
+  movdqa      xmm1,xmm2
+  psubw       xmm1,xmm3
+  psubw       xmm8,xmm2
+  psllw       xmm1,2
+  paddw       xmm1,xmm0
+  paddw       xmm1,xmm5
+  movdqa      xmm0,xmm3
+  movdqa      xmm5,[rsp+10h]
+  psubw       xmm0,xmm2
+  psraw       xmm1,3
+  movdqa      xmm4,xmm5
+  pabsw       xmm0,xmm0
+  pmaxsw      xmm10,xmm1
+  movdqa      xmm1,xmm11
+  pcmpgtw     xmm12,xmm0
+  pabsw       xmm0,xmm9
+  pminsw      xmm14,xmm10
+  pcmpgtw     xmm1,xmm0
+  pabsw       xmm0,xmm8
+  pcmpgtw     xmm11,xmm0
+  pand        xmm12,xmm1
+  movdqa      xmm1,[rsp+20h]
+  pand        xmm12,xmm11
+  pand        xmm12,xmm7
+  pand        xmm14,xmm12
+  paddw       xmm3,xmm14
+  psubw       xmm2,xmm14
+  ; Pack the filtered rows back to bytes and transpose them back into
+  ; column order for the dword scatter.
+  packuswb    xmm13,xmm3
+  packuswb    xmm15,xmm2
+  punpcklbw   xmm4,xmm13
+  punpckhbw   xmm5,xmm13
+  movdqa      xmm0,xmm15
+  punpcklbw   xmm0,xmm1
+  punpckhbw   xmm15,xmm1
+  movdqa      xmm3,xmm4
+  punpcklwd   xmm3,xmm0
+  punpckhwd   xmm4,xmm0
+  movdqa      xmm0,xmm5
+  movdqa      xmm2,xmm3
+  movdqa      xmm1,xmm4
+  punpcklwd   xmm0,xmm15
+  punpckhwd   xmm5,xmm15
+  punpckldq   xmm2,xmm0
+  punpckhdq   xmm3,xmm0
+  punpckldq   xmm1,xmm5
+  movdqa      xmm0,xmm2
+  punpcklqdq  xmm0,xmm1
+  punpckhdq   xmm4,xmm5
+  punpckhqdq  xmm2,xmm1
+  ; Scatter: one dword per row back to pPix-2 for both planes.
+  movdqa      [rsp+40h],xmm0
+  movdqa      xmm0,xmm3
+  movdqa      [rsp+90h],xmm2
+  mov         eax,[rsp+40h]
+  mov         [rdi-2],eax
+  mov         eax, [rsp+90h]
+  punpcklqdq  xmm0,xmm4
+  punpckhqdq  xmm3,xmm4
+  mov         [rsi+rdi-2],eax
+  movdqa      [rsp+50h],xmm0
+  mov         eax,[rsp+50h]
+  movdqa      [rsp+0A0h],xmm3
+  mov         [rdi+rsi*2-2],eax
+  mov         eax,[rsp+0A0h]
+  mov         [r10+rdi-2],eax
+  mov         eax,[rsp+48h]
+  mov         [rbx],eax
+  mov         eax,[rsp+98h]
+  mov         [rsi+rbx],eax
+  mov         eax,[rsp+58h]
+  mov         [rbx+rsi*2],eax
+  mov         eax, [rsp+0A8h]
+  mov         [r10+rbx],eax
+  mov         eax, [rsp+44h]
+  mov         [r12-2],eax
+  mov         eax,[rsp+94h]
+  mov         [rsi+r12-2],eax
+  mov         eax,[rsp+54h]
+  mov         [r12+rsi*2-2],eax
+  mov         eax, [rsp+0A4h]
+  mov         [r10+r12-2],eax
+  mov         eax,[rsp+4Ch]
+  mov         [rbp],eax
+  mov         eax,[rsp+9Ch]
+  mov         [rsi+rbp],eax
+  mov         eax, [rsp+5Ch]
+  mov         [rbp+rsi*2],eax
+  mov         eax,[rsp+0ACh]
+  mov         [r10+rbp],eax
+  lea         r11,[rsp+170h]
+  mov         rsp,r11
+  pop         r14
+  pop         r13
+  pop         r12
+  pop         rbp
+  pop         rbx
+  ret
+
+
+
+%elifdef  X86_32
+
+;********************************************************************************
+;  void DeblockChromaEq4V_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+;                             int32_t iAlpha, int32_t iBeta)
+;********************************************************************************
+WELS_EXTERN   DeblockChromaEq4V_ssse3
+  ; x86_32 version of the strong (bS == 4) vertical chroma filter.
+  ; cdecl stack frame; esp is 16-byte aligned for the movdqa scratch
+  ; stores below.  Cb sits in the low qword of each xmm, Cr in the
+  ; high qword.
+  push        ebp
+  mov         ebp,esp
+  and         esp,0FFFFFFF0h
+  sub         esp,68h
+  mov         edx,[ebp+10h]      ;  iStride
+  mov         eax,[ebp+8]        ;  pPixCb
+  mov         ecx,[ebp+0Ch]      ;  pPixCr
+  ; Load rows p1 (row -2), p0 (row -1), q0 (row 0), q1 (row +1) for
+  ; both planes, merging Cb/Cr qword-wise into xmm1..xmm4.
+  movq        xmm4,[ecx]
+  movq        xmm5,[edx+ecx]
+  push        esi
+  push        edi
+  lea         esi,[edx+edx]
+  mov         edi,eax
+  sub         edi,esi
+  movq        xmm1,[edi]
+  mov         edi,ecx
+  sub         edi,esi
+  movq        xmm2,[edi]
+  punpcklqdq  xmm1,xmm2
+  mov         esi,eax
+  sub         esi,edx
+  movq        xmm2,[esi]
+  mov         edi,ecx
+  sub         edi,edx
+  movq        xmm3,[edi]
+  punpcklqdq  xmm2,xmm3
+  movq        xmm3,[eax]
+  punpcklqdq  xmm3,xmm4
+  movq        xmm4,[edx+eax]
+  mov       edx, [ebp + 14h]
+  punpcklqdq  xmm4,xmm5
+  ; Splat iAlpha ([ebp+14h] -> xmm5) and iBeta ([ebp+18h] -> xmm6)
+  ; across all eight word lanes.
+  movd        xmm5,edx
+  mov       edx, [ebp + 18h]
+  pxor        xmm0,xmm0
+  movdqa      xmm6,xmm5
+  punpcklwd   xmm6,xmm5
+  pshufd      xmm5,xmm6,0
+  movd        xmm6,edx
+  movdqa      xmm7,xmm6
+  punpcklwd   xmm7,xmm6
+  pshufd      xmm6,xmm7,0
+  ; Unpack the rows from bytes to words, spilling the unpacked halves
+  ; to the aligned scratch area at [esp+10h..60h].
+  movdqa      xmm7,xmm1
+  punpckhbw   xmm1,xmm0
+  punpcklbw   xmm7,xmm0
+  movdqa      [esp+40h],xmm1
+  movdqa      [esp+60h],xmm7
+  movdqa      xmm7,xmm2
+  punpcklbw   xmm7,xmm0
+  movdqa      [esp+10h],xmm7
+  movdqa      xmm7,xmm3
+  punpcklbw   xmm7,xmm0
+  punpckhbw   xmm3,xmm0
+  movdqa      [esp+50h],xmm7
+  movdqa      xmm7,xmm4
+  punpckhbw   xmm4,xmm0
+  punpckhbw   xmm2,xmm0
+  punpcklbw   xmm7,xmm0
+  movdqa      [esp+30h],xmm3
+  ; Build the sample masks: |p0-q0| < alpha and |p1-p0| < beta and
+  ; |q1-q0| < beta — xmm0 for the low halves, xmm5 for the high.
+  movdqa      xmm3,[esp+10h]
+  movdqa      xmm1,xmm3
+  psubw       xmm1,[esp+50h]
+  pabsw       xmm1,xmm1
+  movdqa      [esp+20h],xmm4
+  movdqa      xmm0,xmm5
+  pcmpgtw     xmm0,xmm1
+  movdqa      xmm1,[esp+60h]
+  psubw       xmm1,xmm3
+  pabsw       xmm1,xmm1
+  movdqa      xmm4,xmm6
+  pcmpgtw     xmm4,xmm1
+  pand        xmm0,xmm4
+  movdqa      xmm1,xmm7
+  psubw       xmm1,[esp+50h]
+  pabsw       xmm1,xmm1
+  movdqa      xmm4,xmm6
+  pcmpgtw     xmm4,xmm1
+  movdqa      xmm1,xmm2
+  psubw       xmm1,[esp+30h]
+  pabsw       xmm1,xmm1
+  pcmpgtw     xmm5,xmm1
+  movdqa      xmm1,[esp+40h]
+  pand        xmm0,xmm4
+  psubw       xmm1,xmm2
+  pabsw       xmm1,xmm1
+  movdqa      xmm4,xmm6
+  pcmpgtw     xmm4,xmm1
+  movdqa      xmm1,[esp+20h]
+  psubw       xmm1,[esp+30h]
+  pand        xmm5,xmm4
+  pabsw       xmm1,xmm1
+  pcmpgtw     xmm6,xmm1
+  pand        xmm5,xmm6
+  ; Splat the rounding constant 2, then compute the strong filter
+  ; (2*outer + inner + opposite + 2) >> 2 for p0 and q0 of both
+  ; planes, blending with the unfiltered samples via pand/pandn/por.
+  mov         edx,2
+  movsx       edx,dx
+  movd        xmm1,edx
+  movdqa      xmm4,xmm1
+  punpcklwd   xmm4,xmm1
+  pshufd      xmm1,xmm4,0
+  movdqa      xmm4,[esp+60h]
+  movdqa      xmm6,xmm4
+  paddw       xmm6,xmm4
+  paddw       xmm6,xmm3
+  paddw       xmm6,xmm7
+  movdqa      [esp+10h],xmm1
+  paddw       xmm6,[esp+10h]
+  psraw       xmm6,2
+  movdqa      xmm4,xmm0
+  pandn       xmm4,xmm3
+  movdqa      xmm3,[esp+40h]
+  movdqa      xmm1,xmm0
+  pand        xmm1,xmm6
+  por         xmm1,xmm4
+  movdqa      xmm6,xmm3
+  paddw       xmm6,xmm3
+  movdqa      xmm3,[esp+10h]
+  paddw       xmm6,xmm2
+  paddw       xmm6,[esp+20h]
+  paddw       xmm6,xmm3
+  psraw       xmm6,2
+  movdqa      xmm4,xmm5
+  pand        xmm4,xmm6
+  movdqa      xmm6,xmm5
+  pandn       xmm6,xmm2
+  por         xmm4,xmm6
+  packuswb    xmm1,xmm4
+  movdqa      xmm4,[esp+50h]
+  movdqa      xmm6,xmm7
+  paddw       xmm6,xmm7
+  paddw       xmm6,xmm4
+  paddw       xmm6,[esp+60h]
+  paddw       xmm6,xmm3
+  psraw       xmm6,2
+  movdqa      xmm2,xmm0
+  pand        xmm2,xmm6
+  pandn       xmm0,xmm4
+  por         xmm2,xmm0
+  movdqa      xmm0,[esp+20h]
+  movdqa      xmm6,xmm0
+  paddw       xmm6,xmm0
+  movdqa      xmm0,[esp+30h]
+  paddw       xmm6,xmm0
+  paddw       xmm6,[esp+40h]
+  movdqa      xmm4,xmm5
+  paddw       xmm6,xmm3
+  ; Store the filtered p0/q0 rows: low qword to the Cb pointers,
+  ; high qword (after psrldq) to the Cr pointers.
+  movq        [esi],xmm1
+  psraw       xmm6,2
+  pand        xmm4,xmm6
+  pandn       xmm5,xmm0
+  por         xmm4,xmm5
+  packuswb    xmm2,xmm4
+  movq        [eax],xmm2
+  psrldq      xmm1,8
+  movq        [edi],xmm1
+  pop         edi
+  psrldq      xmm2,8
+  movq        [ecx],xmm2
+  pop         esi
+  mov         esp,ebp
+  pop         ebp
+  ret
+
+;******************************************************************************
+; void DeblockChromaLt4V_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+;                           int32_t iAlpha, int32_t iBeta, int8_t * pTC);
+;*******************************************************************************
+
+WELS_EXTERN  DeblockChromaLt4V_ssse3
+; Vertical (horizontal-edge) chroma deblocking, bS < 4 case, Cb and Cr
+; filtered together: Cb rows go in the low 8 bytes of each xmm, Cr rows in
+; the high 8 bytes. cdecl args: [ebp+8]=pPixCb, [ebp+0Ch]=pPixCr,
+; [ebp+10h]=iStride, [ebp+14h]=iAlpha, [ebp+18h]=iBeta, [ebp+1Ch]=pTC.
+  push        ebp
+  mov         ebp,esp
+  and         esp,0FFFFFFF0h          ; 16-align esp for movdqa spill slots
+  sub         esp,0E4h
+  push        ebx
+  push        esi
+  ; Expand the four per-edge int8 tc values into an 8-word vector in xmm0
+  ; (each tc duplicated once for Cb and once for Cr, interleaved below).
+  mov         esi, [ebp+1Ch]      ;  pTC
+  movsx       ebx, byte [esi+2]
+  push        edi
+  movsx       di,byte [esi+3]
+  mov         word [esp+0Ch],bx
+  movsx       bx,byte  [esi+1]
+  movsx       esi,byte  [esi]
+  mov         word  [esp+0Eh],si
+  movzx       esi,di
+  movd        xmm1,esi
+  movzx       esi,di
+  movd        xmm2,esi
+  mov         si,word  [esp+0Ch]
+  mov         edx, [ebp + 10h]         ; edx = iStride
+  mov         eax, [ebp + 08h]         ; eax = pPixCb
+  movzx       edi,si
+  movzx       esi,si
+  mov         ecx, [ebp + 0Ch]         ; ecx = pPixCr
+  movd        xmm4,esi
+  movzx       esi,bx
+  movd        xmm5,esi
+  movd        xmm3,edi
+  movzx       esi,bx
+  movd        xmm6,esi
+  mov         si,word [esp+0Eh]
+  movzx       edi,si
+  movzx       esi,si
+  punpcklwd   xmm6,xmm2
+  pxor        xmm0,xmm0
+  movdqa      [esp+40h],xmm0           ; [esp+40h] = zero (reloaded as xmm1)
+  movd        xmm7,edi
+  movd        xmm0,esi
+  ; Load the four rows around the edge: p1 = pix-2*stride, p0 = pix-stride,
+  ; q0 = pix, q1 = pix+stride, packing Cb (low qword) with Cr (high qword).
+  lea         esi,[edx+edx]
+  mov         edi,eax
+  sub         edi,esi
+  punpcklwd   xmm5,xmm1
+  movdqa      xmm1,[esp+40h]
+  punpcklwd   xmm0,xmm4
+  movq        xmm4,[edx+ecx]
+  punpcklwd   xmm7,xmm3
+  movq        xmm3,[eax]
+  punpcklwd   xmm0,xmm6
+  movq        xmm6,[edi]
+  punpcklwd   xmm7,xmm5
+  punpcklwd   xmm0,xmm7                ; xmm0 = tc vector (8 words)
+  mov         edi,ecx
+  sub         edi,esi
+  movdqa      xmm2,xmm1
+  psubw       xmm2,xmm0
+  movdqa      [esp+60h],xmm2           ; [esp+60h] = -tc (lower clip bound)
+  movq        xmm2, [edi]
+  punpcklqdq  xmm6,xmm2                ; xmm6 = p1 row (Cb|Cr)
+  mov         esi,eax
+  sub         esi,edx
+  movq        xmm7,[esi]
+  mov         edi,ecx
+  sub         edi,edx
+  movq        xmm2,[edi]
+  punpcklqdq  xmm7,xmm2                ; xmm7 = p0 row (Cb|Cr)
+  movq        xmm2,[ecx]
+  punpcklqdq  xmm3,xmm2                ; xmm3 = q0 row (Cb|Cr)
+  movq        xmm2,[edx+eax]
+  movsx       edx,word [ebp + 14h]     ; iAlpha
+  punpcklqdq  xmm2,xmm4                ; xmm2 = q1 row (Cb|Cr)
+  movdqa      [esp+0E0h],xmm2
+  ; Broadcast alpha into xmm4 and beta into [esp+50h] as 8 words each.
+  movd        xmm2,edx
+  movsx       edx,word [ebp + 18h]     ; iBeta
+  movdqa      xmm4,xmm2
+  punpcklwd   xmm4,xmm2
+  movd        xmm2,edx
+  movdqa      xmm5,xmm2
+  punpcklwd   xmm5,xmm2
+  pshufd      xmm2,xmm5,0
+  movdqa      [esp+50h],xmm2
+  ; Unpack the byte rows to words: low halves filtered first, high halves
+  ; spilled to [esp+70h..0A0h] for the second pass.
+  movdqa      xmm2,xmm6
+  punpcklbw   xmm2,xmm1
+  movdqa      [esp+0D0h],xmm3
+  pshufd      xmm4,xmm4,0
+  movdqa      [esp+30h],xmm2
+  punpckhbw   xmm6,xmm1
+  movdqa      [esp+80h],xmm6
+  movdqa      xmm6,[esp+0D0h]
+  punpckhbw   xmm6,xmm1
+  movdqa      [esp+70h],xmm6
+  movdqa      xmm6, [esp+0E0h]
+  punpckhbw   xmm6,xmm1
+  movdqa     [esp+90h],xmm6
+  movdqa      xmm5, [esp+0E0h]
+  movdqa      xmm2,xmm7
+  punpckhbw   xmm7,xmm1
+  punpcklbw   xmm5,xmm1
+  movdqa       [esp+0A0h],xmm7
+  punpcklbw   xmm3,xmm1
+  mov         edx,4                    ; rounding constant for the >>3 below
+  punpcklbw   xmm2,xmm1
+  movsx       edx,dx
+  movd        xmm6,edx
+  movdqa      xmm7,xmm6
+  punpcklwd   xmm7,xmm6
+  pshufd      xmm6,xmm7,0
+  movdqa      xmm7,[esp+30h]
+  movdqa      [esp+20h],xmm6           ; [esp+20h] = vector of 4s
+  psubw       xmm7,xmm5
+  movdqa      xmm6,xmm0
+  pcmpgtw     xmm6,xmm1
+  movdqa      xmm1,[esp+60h]
+  movdqa      [esp+40h],xmm6           ; [esp+40h] = mask (tc > 0)
+  ; delta = clip(((q0-p0)<<2 + (p1-q1) + 4) >> 3, -tc, tc)  (low half)
+  movdqa      xmm6,xmm3
+  psubw       xmm6,xmm2
+  psllw       xmm6,2
+  paddw       xmm6,xmm7
+  paddw       xmm6, [esp+20h]
+  movdqa      xmm7, [esp+50h]
+  psraw       xmm6,3
+  pmaxsw      xmm1,xmm6
+  movdqa      [esp+10h],xmm0
+  movdqa      xmm6, [esp+10h]
+  pminsw      xmm6,xmm1
+  movdqa      [esp+10h],xmm6
+  ; Edge-activity mask: |p0-q0|<alpha & |p1-p0|<beta & |q1-q0|<beta.
+  movdqa      xmm1,xmm2
+  psubw       xmm1,xmm3
+  pabsw       xmm1,xmm1
+  movdqa      xmm6,xmm4
+  pcmpgtw     xmm6,xmm1
+  movdqa      xmm1, [esp+30h]
+  psubw       xmm1,xmm2
+  pabsw       xmm1,xmm1
+  pcmpgtw     xmm7,xmm1
+  movdqa      xmm1,[esp+50h]
+  pand        xmm6,xmm7
+  movdqa      xmm7,[esp+50h]
+  psubw       xmm5,xmm3
+  pabsw       xmm5,xmm5
+  pcmpgtw     xmm1,xmm5
+  movdqa      xmm5,[esp+80h]
+  psubw       xmm5,[esp+90h]
+  pand        xmm6,xmm1
+  pand        xmm6,[esp+40h]
+  movdqa      xmm1,[esp+10h]
+  pand        xmm1,xmm6                ; masked delta, low half
+  ; Repeat delta + mask computation on the spilled high halves.
+  movdqa      xmm6,[esp+70h]
+  movdqa      [esp+30h],xmm1
+  movdqa      xmm1,[esp+0A0h]
+  psubw       xmm6,xmm1
+  psllw       xmm6,2
+  paddw       xmm6,xmm5
+  paddw       xmm6,[esp+20h]
+  movdqa      xmm5,[esp+60h]
+  psraw       xmm6,3
+  pmaxsw      xmm5,xmm6
+  pminsw      xmm0,xmm5
+  movdqa      xmm5,[esp+70h]
+  movdqa      xmm6,xmm1
+  psubw       xmm6,xmm5
+  pabsw       xmm6,xmm6
+  pcmpgtw     xmm4,xmm6
+  movdqa      xmm6,[esp+80h]
+  psubw       xmm6,xmm1
+  pabsw       xmm6,xmm6
+  pcmpgtw     xmm7,xmm6
+  movdqa      xmm6,[esp+90h]
+  pand        xmm4,xmm7
+  movdqa      xmm7,[esp+50h]
+  psubw       xmm6,xmm5
+  pabsw       xmm6,xmm6
+  pcmpgtw     xmm7,xmm6
+  pand        xmm4,xmm7
+  pand        xmm4,[esp+40h]
+  pand        xmm0,xmm4                ; masked delta, high half
+  ; Apply p0 += delta, q0 -= delta; pack to bytes and write the two
+  ; filtered rows back (esi/edi = p0 row ptrs, eax/ecx = q0 row ptrs).
+  movdqa      xmm4,[esp+30h]
+  paddw       xmm2,xmm4
+  paddw       xmm1,xmm0
+  packuswb    xmm2,xmm1
+  movq        [esi],xmm2
+  psubw       xmm3,xmm4
+  psubw       xmm5,xmm0
+  packuswb    xmm3,xmm5
+  movq        [eax],xmm3
+  psrldq      xmm2,8
+  movq        [edi],xmm2               ; Cr half of the p0 row
+  pop         edi
+  pop         esi
+  psrldq      xmm3,8
+  movq        [ecx],xmm3               ; Cr half of the q0 row
+  pop         ebx
+  mov         esp,ebp
+  pop         ebp
+  ret
+
+;***************************************************************************
+;  void DeblockChromaEq4H_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+;          int32_t iAlpha, int32_t iBeta)
+;***************************************************************************
+
+WELS_EXTERN     DeblockChromaEq4H_ssse3
+; Horizontal (vertical-edge) chroma deblocking, bS == 4 case. Strategy:
+; gather the 4 pixels either side of the vertical edge from both planes,
+; transpose into row vectors, run the vertical Eq4 filter, transpose back
+; and scatter the results. Args: [ebp+8]=pPixCb, [ebp+0Ch]=pPixCr,
+; [ebp+10h]=iStride, [ebp+14h]=iAlpha, [ebp+18h]=iBeta.
+  push        ebp
+  mov         ebp,esp
+  and         esp,0FFFFFFF0h          ; align for movdqa scratch
+  sub         esp,0C8h
+  mov         ecx,dword [ebp+8]
+  mov         edx,dword [ebp+0Ch]
+  mov         eax,dword [ebp+10h]
+  sub         ecx,2                    ; start 2 cols left of the edge
+  sub         edx,2
+  push        esi
+  lea         esi,[eax+eax*2]          ; esi = 3*stride
+  mov         dword [esp+18h],ecx
+  mov         dword [esp+4],edx
+  lea         ecx,[ecx+eax*4]          ; ptrs for rows 4..7 of each plane
+  lea         edx,[edx+eax*4]
+  lea         eax,[esp+7Ch]            ; 16-aligned 64-byte transpose buffer
+  push        edi
+  mov         dword [esp+14h],esi
+  mov         dword [esp+18h],ecx
+  mov         dword [esp+0Ch],edx
+  mov         dword [esp+10h],eax
+  ; Gather eight 4-byte groups per plane (rows 0..3 of Cb then Cr, then
+  ; rows 4..7) into xmm0..xmm3 via punpckldq/punpcklqdq.
+  mov         esi,dword [esp+1Ch]
+  mov         ecx,dword [ebp+10h]
+  mov         edx,dword [esp+14h]
+  movd        xmm0,dword [esi]
+  movd        xmm1,dword [esi+ecx]
+  movd        xmm2,dword [esi+ecx*2]
+  movd        xmm3,dword [esi+edx]
+  mov         esi,dword  [esp+8]
+  movd        xmm4,dword [esi]
+  movd        xmm5,dword [esi+ecx]
+  movd        xmm6,dword [esi+ecx*2]
+  movd        xmm7,dword [esi+edx]
+  punpckldq   xmm0,xmm4
+  punpckldq   xmm1,xmm5
+  punpckldq   xmm2,xmm6
+  punpckldq   xmm3,xmm7
+  mov         esi,dword [esp+18h]
+  mov         edi,dword [esp+0Ch]
+  movd        xmm4,dword [esi]
+  movd        xmm5,dword [edi]
+  punpckldq   xmm4,xmm5
+  punpcklqdq  xmm0,xmm4
+  movd        xmm4,dword [esi+ecx]
+  movd        xmm5,dword [edi+ecx]
+  punpckldq   xmm4,xmm5
+  punpcklqdq  xmm1,xmm4
+  movd        xmm4,dword [esi+ecx*2]
+  movd        xmm5,dword [edi+ecx*2]
+  punpckldq   xmm4,xmm5
+  punpcklqdq  xmm2,xmm4
+  movd        xmm4,dword [esi+edx]
+  movd        xmm5,dword [edi+edx]
+  punpckldq   xmm4,xmm5
+  punpcklqdq  xmm3,xmm4
+  ; 4x16 byte transpose: after this, the buffer holds p1/p0/q0/q1 as
+  ; four 16-byte row vectors.
+  movdqa      xmm6,xmm0
+  punpcklbw   xmm0,xmm1
+  punpckhbw   xmm6,xmm1
+  movdqa      xmm7,xmm2
+  punpcklbw   xmm2,xmm3
+  punpckhbw   xmm7,xmm3
+  movdqa      xmm4,xmm0
+  movdqa      xmm5,xmm6
+  punpcklwd   xmm0,xmm2
+  punpckhwd   xmm4,xmm2
+  punpcklwd   xmm6,xmm7
+  punpckhwd   xmm5,xmm7
+  movdqa      xmm1,xmm0
+  movdqa      xmm2,xmm4
+  punpckldq   xmm0,xmm6
+  punpckhdq   xmm1,xmm6
+  punpckldq   xmm4,xmm5
+  punpckhdq   xmm2,xmm5
+  movdqa      xmm5,xmm0
+  movdqa      xmm6,xmm1
+  punpcklqdq  xmm0,xmm4
+  punpckhqdq  xmm5,xmm4
+  punpcklqdq  xmm1,xmm2
+  punpckhqdq  xmm6,xmm2
+  mov         edi,dword [esp+10h]
+  movdqa      [edi],xmm0
+  movdqa      [edi+10h],xmm5
+  movdqa      [edi+20h],xmm1
+  movdqa      [edi+30h],xmm6
+  ; Broadcast alpha (xmm1) and beta (xmm2) to 8 words; unpack p1/p0/q0/q1
+  ; low halves to words, spilling the high halves to [esp+30h..70h].
+  movsx       ecx,word [ebp+14h]
+  movsx       edx,word [ebp+18h]
+  movdqa      xmm6,[esp+80h]
+  movdqa      xmm4,[esp+90h]
+  movdqa      xmm5,[esp+0A0h]
+  movdqa      xmm7,[esp+0B0h]
+  pxor        xmm0,xmm0
+  movd        xmm1,ecx
+  movdqa      xmm2,xmm1
+  punpcklwd   xmm2,xmm1
+  pshufd      xmm1,xmm2,0
+  movd        xmm2,edx
+  movdqa      xmm3,xmm2
+  punpcklwd   xmm3,xmm2
+  pshufd      xmm2,xmm3,0
+  movdqa      xmm3,xmm6
+  punpckhbw   xmm6,xmm0
+  movdqa      [esp+60h],xmm6
+  movdqa      xmm6,[esp+90h]
+  punpckhbw   xmm6,xmm0
+  movdqa      [esp+30h],xmm6
+  movdqa      xmm6,[esp+0A0h]
+  punpckhbw   xmm6,xmm0
+  movdqa      [esp+40h],xmm6
+  movdqa      xmm6,[esp+0B0h]
+  punpckhbw   xmm6,xmm0
+  movdqa      [esp+70h],xmm6
+  punpcklbw   xmm7,xmm0
+  punpcklbw   xmm4,xmm0
+  punpcklbw   xmm5,xmm0
+  punpcklbw   xmm3,xmm0
+  movdqa      [esp+50h],xmm7
+  ; Activity masks: xmm0 for the low half, xmm1 for the high half,
+  ; each = (|p0-q0|<alpha) & (|p1-p0|<beta) & (|q1-q0|<beta).
+  movdqa      xmm6,xmm4
+  psubw       xmm6,xmm5
+  pabsw       xmm6,xmm6
+  movdqa      xmm0,xmm1
+  pcmpgtw     xmm0,xmm6
+  movdqa      xmm6,xmm3
+  psubw       xmm6,xmm4
+  pabsw       xmm6,xmm6
+  movdqa      xmm7,xmm2
+  pcmpgtw     xmm7,xmm6
+  movdqa      xmm6,[esp+50h]
+  psubw       xmm6,xmm5
+  pabsw       xmm6,xmm6
+  pand        xmm0,xmm7
+  movdqa      xmm7,xmm2
+  pcmpgtw     xmm7,xmm6
+  movdqa      xmm6,[esp+30h]
+  psubw       xmm6,[esp+40h]
+  pabsw       xmm6,xmm6
+  pcmpgtw     xmm1,xmm6
+  movdqa      xmm6,[esp+60h]
+  psubw       xmm6,[esp+30h]
+  pabsw       xmm6,xmm6
+  pand        xmm0,xmm7
+  movdqa      xmm7,xmm2
+  pcmpgtw     xmm7,xmm6
+  movdqa      xmm6,[esp+70h]
+  psubw       xmm6,[esp+40h]
+  pabsw       xmm6,xmm6
+  pand        xmm1,xmm7
+  pcmpgtw     xmm2,xmm6
+  pand        xmm1,xmm2
+  mov         eax,2                    ; rounding constant for the >>2 below
+  movsx       ecx,ax
+  movd        xmm2,ecx
+  movdqa      xmm6,xmm2
+  punpcklwd   xmm6,xmm2
+  pshufd      xmm2,xmm6,0
+  movdqa      [esp+20h],xmm2           ; vector of 2s
+  ; Eq4 filter: p0' = (2*p1 + p0 + q1 + 2) >> 2 where the mask is set,
+  ; else keep p0; computed for low then high halves, then for q0'.
+  movdqa      xmm2,xmm3
+  paddw       xmm2,xmm3
+  paddw       xmm2,xmm4
+  paddw       xmm2,[esp+50h]
+  paddw       xmm2,[esp+20h]
+  psraw       xmm2,2
+  movdqa      xmm6,xmm0
+  pand        xmm6,xmm2
+  movdqa      xmm2,xmm0
+  pandn       xmm2,xmm4
+  por         xmm6,xmm2
+  movdqa      xmm2,[esp+60h]
+  movdqa      xmm7,xmm2
+  paddw       xmm7,xmm2
+  paddw       xmm7,[esp+30h]
+  paddw       xmm7,[esp+70h]
+  paddw       xmm7,[esp+20h]
+  movdqa      xmm4,xmm1
+  movdqa      xmm2,xmm1
+  pandn       xmm2,[esp+30h]
+  psraw       xmm7,2
+  pand        xmm4,xmm7
+  por         xmm4,xmm2
+  movdqa      xmm2,[esp+50h]
+  packuswb    xmm6,xmm4
+  movdqa      [esp+90h],xmm6           ; filtered p0 row back into buffer
+  movdqa      xmm6,xmm2
+  paddw       xmm6,xmm2
+  movdqa      xmm2,[esp+20h]
+  paddw       xmm6,xmm5
+  paddw       xmm6,xmm3
+  movdqa      xmm4,xmm0
+  pandn       xmm0,xmm5
+  paddw       xmm6,xmm2
+  psraw       xmm6,2
+  pand        xmm4,xmm6
+  por         xmm4,xmm0
+  movdqa      xmm0,[esp+70h]
+  movdqa      xmm5,xmm0
+  paddw       xmm5,xmm0
+  movdqa      xmm0,[esp+40h]
+  paddw       xmm5,xmm0
+  paddw       xmm5,[esp+60h]
+  movdqa      xmm3,xmm1
+  paddw       xmm5,xmm2
+  psraw       xmm5,2
+  pand        xmm3,xmm5
+  pandn       xmm1,xmm0
+  por         xmm3,xmm1
+  packuswb    xmm4,xmm3
+  movdqa      [esp+0A0h],xmm4          ; filtered q0 row back into buffer
+  ; Transpose the buffer back to column order.
+  mov         esi,dword [esp+10h]
+  movdqa      xmm0,[esi]
+  movdqa      xmm1,[esi+10h]
+  movdqa      xmm2,[esi+20h]
+  movdqa      xmm3,[esi+30h]
+  movdqa      xmm6,xmm0
+  punpcklbw   xmm0,xmm1
+  punpckhbw   xmm6,xmm1
+  movdqa      xmm7,xmm2
+  punpcklbw   xmm2,xmm3
+  punpckhbw   xmm7,xmm3
+  movdqa      xmm4,xmm0
+  movdqa      xmm5,xmm6
+  punpcklwd   xmm0,xmm2
+  punpckhwd   xmm4,xmm2
+  punpcklwd   xmm6,xmm7
+  punpckhwd   xmm5,xmm7
+  movdqa      xmm1,xmm0
+  movdqa      xmm2,xmm4
+  punpckldq   xmm0,xmm6
+  punpckhdq   xmm1,xmm6
+  punpckldq   xmm4,xmm5
+  punpckhdq   xmm2,xmm5
+  movdqa      xmm5,xmm0
+  movdqa      xmm6,xmm1
+  punpcklqdq  xmm0,xmm4
+  punpckhqdq  xmm5,xmm4
+  punpcklqdq  xmm1,xmm2
+  punpckhqdq  xmm6,xmm2
+  ; Scatter the 4-byte groups back to the 16 source rows.
+  mov         esi,dword [esp+1Ch]
+  mov         ecx,dword [ebp+10h]
+  mov         edx,dword [esp+14h]
+  mov         edi,dword [esp+8]
+  movd        dword [esi],xmm0
+  movd        dword [esi+ecx],xmm5
+  movd        dword [esi+ecx*2],xmm1
+  movd        dword [esi+edx],xmm6
+  psrldq      xmm0,4
+  psrldq      xmm5,4
+  psrldq      xmm1,4
+  psrldq      xmm6,4
+  mov         esi,dword [esp+18h]
+  movd        dword [edi],xmm0
+  movd        dword [edi+ecx],xmm5
+  movd        dword [edi+ecx*2],xmm1
+  movd        dword [edi+edx],xmm6
+  psrldq      xmm0,4
+  psrldq      xmm5,4
+  psrldq      xmm1,4
+  psrldq      xmm6,4
+  movd        dword [esi],xmm0
+  movd        dword [esi+ecx],xmm5
+  movd        dword [esi+ecx*2],xmm1
+  movd        dword [esi+edx],xmm6
+  psrldq      xmm0,4
+  psrldq      xmm5,4
+  psrldq      xmm1,4
+  psrldq      xmm6,4
+  mov         edi,dword [esp+0Ch]
+  movd        dword [edi],xmm0
+  movd        dword [edi+ecx],xmm5
+  movd        dword [edi+ecx*2],xmm1
+  movd        dword [edi+edx],xmm6
+  pop         edi
+  pop         esi
+  mov         esp,ebp
+  pop         ebp
+  ret
+
+;*******************************************************************************
+;    void DeblockChromaLt4H_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+;                                int32_t iAlpha, int32_t iBeta, int8_t * pTC);
+;*******************************************************************************
+
+WELS_EXTERN  DeblockChromaLt4H_ssse3
+; Horizontal (vertical-edge) chroma deblocking, bS < 4 case. Same
+; gather/transpose/filter/transpose/scatter layout as the Eq4H variant,
+; but the core filter is the tc-clipped Lt4 formula. Args: [ebp+8]=pPixCb,
+; [ebp+0Ch]=pPixCr, [ebp+10h]=iStride, [ebp+14h]=iAlpha, [ebp+18h]=iBeta,
+; [ebp+1Ch]=pTC.
+  push        ebp
+  mov         ebp,esp
+  and         esp,0FFFFFFF0h          ; align for movdqa scratch
+  sub         esp,108h
+  mov         ecx,dword [ebp+8]
+  mov         edx,dword [ebp+0Ch]
+  mov         eax,dword [ebp+10h]
+  sub         ecx,2                    ; start 2 cols left of the edge
+  sub         edx,2
+  push        esi
+  lea         esi,[eax+eax*2]          ; esi = 3*stride
+  mov         dword [esp+10h],ecx
+  mov         dword [esp+4],edx
+  lea         ecx,[ecx+eax*4]
+  lea         edx,[edx+eax*4]
+  lea         eax,[esp+6Ch]            ; 16-aligned transpose buffer
+  push        edi
+  mov         dword [esp+0Ch],esi
+  mov         dword [esp+18h],ecx
+  mov         dword [esp+10h],edx
+  mov         dword [esp+1Ch],eax
+  ; Gather 4-byte groups from all 16 rows (Cb rows 0..7, Cr rows 0..7).
+  mov         esi,dword [esp+14h]
+  mov         ecx,dword [ebp+10h]
+  mov         edx,dword [esp+0Ch]
+  movd        xmm0,dword [esi]
+  movd        xmm1,dword [esi+ecx]
+  movd        xmm2,dword [esi+ecx*2]
+  movd        xmm3,dword [esi+edx]
+  mov         esi,dword [esp+8]
+  movd        xmm4,dword [esi]
+  movd        xmm5,dword [esi+ecx]
+  movd        xmm6,dword [esi+ecx*2]
+  movd        xmm7,dword [esi+edx]
+  punpckldq   xmm0,xmm4
+  punpckldq   xmm1,xmm5
+  punpckldq   xmm2,xmm6
+  punpckldq   xmm3,xmm7
+  mov         esi,dword [esp+18h]
+  mov         edi,dword [esp+10h]
+  movd        xmm4,dword [esi]
+  movd        xmm5,dword [edi]
+  punpckldq   xmm4,xmm5
+  punpcklqdq  xmm0,xmm4
+  movd        xmm4,dword [esi+ecx]
+  movd        xmm5,dword [edi+ecx]
+  punpckldq   xmm4,xmm5
+  punpcklqdq  xmm1,xmm4
+  movd        xmm4,dword [esi+ecx*2]
+  movd        xmm5,dword [edi+ecx*2]
+  punpckldq   xmm4,xmm5
+  punpcklqdq  xmm2,xmm4
+  movd        xmm4,dword [esi+edx]
+  movd        xmm5,dword [edi+edx]
+  punpckldq   xmm4,xmm5
+  punpcklqdq  xmm3,xmm4
+  ; Transpose to get p1/p0/q0/q1 as 16-byte row vectors in the buffer.
+  movdqa      xmm6,xmm0
+  punpcklbw   xmm0,xmm1
+  punpckhbw   xmm6,xmm1
+  movdqa      xmm7,xmm2
+  punpcklbw   xmm2,xmm3
+  punpckhbw   xmm7,xmm3
+  movdqa      xmm4,xmm0
+  movdqa      xmm5,xmm6
+  punpcklwd   xmm0,xmm2
+  punpckhwd   xmm4,xmm2
+  punpcklwd   xmm6,xmm7
+  punpckhwd   xmm5,xmm7
+  movdqa      xmm1,xmm0
+  movdqa      xmm2,xmm4
+  punpckldq   xmm0,xmm6
+  punpckhdq   xmm1,xmm6
+  punpckldq   xmm4,xmm5
+  punpckhdq   xmm2,xmm5
+  movdqa      xmm5,xmm0
+  movdqa      xmm6,xmm1
+  punpcklqdq  xmm0,xmm4
+  punpckhqdq  xmm5,xmm4
+  punpcklqdq  xmm1,xmm2
+  punpckhqdq  xmm6,xmm2
+  mov         edi,dword [esp+1Ch]
+  movdqa      [edi],xmm0
+  movdqa      [edi+10h],xmm5
+  movdqa      [edi+20h],xmm1
+  movdqa      [edi+30h],xmm6
+  ; Expand the four int8 tc values into the 8-word tc vector xmm0.
+  mov         eax,dword [ebp+1Ch]
+  movsx       cx,byte [eax+3]
+  movsx       dx,byte [eax+2]
+  movsx       si,byte [eax+1]
+  movsx       ax,byte [eax]
+  movzx       edi,cx
+  movzx       ecx,cx
+  movd        xmm2,ecx
+  movzx       ecx,dx
+  movzx       edx,dx
+  movd        xmm3,ecx
+  movd        xmm4,edx
+  movzx       ecx,si
+  movzx       edx,si
+  movd        xmm5,ecx
+  pxor        xmm0,xmm0
+  movd        xmm6,edx
+  movzx       ecx,ax
+  movdqa      [esp+60h],xmm0           ; zero vector spill
+  movzx       edx,ax
+  movsx       eax,word [ebp+14h]       ; iAlpha
+  punpcklwd   xmm6,xmm2
+  movd        xmm1,edi
+  movd        xmm7,ecx
+  movsx       ecx,word [ebp+18h]       ; iBeta
+  movd        xmm0,edx
+  punpcklwd   xmm7,xmm3
+  punpcklwd   xmm5,xmm1
+  movdqa      xmm1,[esp+60h]
+  punpcklwd   xmm7,xmm5
+  movdqa      xmm5,[esp+0A0h]
+  punpcklwd   xmm0,xmm4
+  punpcklwd   xmm0,xmm6
+  movdqa      xmm6, [esp+70h]
+  punpcklwd   xmm0,xmm7                ; xmm0 = tc vector
+  movdqa      xmm7,[esp+80h]
+  movdqa      xmm2,xmm1
+  psubw       xmm2,xmm0
+  movdqa      [esp+0D0h],xmm2          ; [esp+0D0h] = -tc
+  ; Broadcast alpha into xmm4, beta into [esp+50h].
+  movd        xmm2,eax
+  movdqa      xmm3,xmm2
+  punpcklwd   xmm3,xmm2
+  pshufd      xmm4,xmm3,0
+  movd        xmm2,ecx
+  movdqa      xmm3,xmm2
+  punpcklwd   xmm3,xmm2
+  pshufd      xmm2,xmm3,0
+  movdqa      xmm3, [esp+90h]
+  movdqa      [esp+50h],xmm2
+  ; Unpack p1/p0/q0/q1 bytes to words; high halves spilled for pass 2.
+  movdqa      xmm2,xmm6
+  punpcklbw   xmm2,xmm1
+  punpckhbw   xmm6,xmm1
+  movdqa      [esp+40h],xmm2
+  movdqa      [esp+0B0h],xmm6
+  movdqa      xmm6,[esp+90h]
+  movdqa      xmm2,xmm7
+  punpckhbw   xmm7,xmm1
+  punpckhbw   xmm6,xmm1
+  punpcklbw   xmm2,xmm1
+  punpcklbw   xmm3,xmm1
+  punpcklbw   xmm5,xmm1
+  movdqa      [esp+0F0h],xmm7
+  movdqa      [esp+0C0h],xmm6
+  movdqa      xmm6, [esp+0A0h]
+  punpckhbw   xmm6,xmm1
+  movdqa      [esp+0E0h],xmm6
+  mov         edx,4                    ; rounding constant for the >>3 below
+  movsx       eax,dx
+  movd        xmm6,eax
+  movdqa      xmm7,xmm6
+  punpcklwd   xmm7,xmm6
+  pshufd      xmm6,xmm7,0
+  movdqa      [esp+30h],xmm6           ; vector of 4s
+  ; delta = clip(((q0-p0)<<2 + (p1-q1) + 4) >> 3, -tc, tc)  (low half)
+  movdqa      xmm7, [esp+40h]
+  psubw       xmm7,xmm5
+  movdqa      xmm6,xmm0
+  pcmpgtw     xmm6,xmm1
+  movdqa      [esp+60h],xmm6           ; mask (tc > 0)
+  movdqa      xmm1, [esp+0D0h]
+  movdqa      xmm6,xmm3
+  psubw       xmm6,xmm2
+  psllw       xmm6,2
+  paddw       xmm6,xmm7
+  paddw       xmm6,[esp+30h]
+  psraw       xmm6,3
+  pmaxsw      xmm1,xmm6
+  movdqa      xmm7,[esp+50h]
+  movdqa      [esp+20h],xmm0
+  movdqa      xmm6, [esp+20h]
+  pminsw      xmm6,xmm1
+  movdqa      [esp+20h],xmm6
+  ; Activity mask: |p0-q0|<alpha & |p1-p0|<beta & |q1-q0|<beta & tc>0.
+  movdqa      xmm6,xmm4
+  movdqa      xmm1,xmm2
+  psubw       xmm1,xmm3
+  pabsw       xmm1,xmm1
+  pcmpgtw     xmm6,xmm1
+  movdqa      xmm1, [esp+40h]
+  psubw       xmm1,xmm2
+  pabsw       xmm1,xmm1
+  pcmpgtw     xmm7,xmm1
+  movdqa      xmm1, [esp+50h]
+  pand        xmm6,xmm7
+  movdqa      xmm7, [esp+50h]
+  psubw       xmm5,xmm3
+  pabsw       xmm5,xmm5
+  pcmpgtw     xmm1,xmm5
+  movdqa      xmm5, [esp+0B0h]
+  psubw       xmm5,[esp+0E0h]
+  pand        xmm6,xmm1
+  pand        xmm6, [esp+60h]
+  movdqa      xmm1, [esp+20h]
+  pand        xmm1,xmm6                ; masked delta, low half
+  ; Pass 2: same delta + mask computation on the spilled high halves.
+  movdqa      xmm6, [esp+0C0h]
+  movdqa      [esp+40h],xmm1
+  movdqa      xmm1, [esp+0F0h]
+  psubw       xmm6,xmm1
+  psllw       xmm6,2
+  paddw       xmm6,xmm5
+  paddw       xmm6, [esp+30h]
+  movdqa      xmm5, [esp+0D0h]
+  psraw       xmm6,3
+  pmaxsw      xmm5,xmm6
+  pminsw      xmm0,xmm5
+  movdqa      xmm5,[esp+0C0h]
+  movdqa      xmm6,xmm1
+  psubw       xmm6,xmm5
+  pabsw       xmm6,xmm6
+  pcmpgtw     xmm4,xmm6
+  movdqa      xmm6,[esp+0B0h]
+  psubw       xmm6,xmm1
+  pabsw       xmm6,xmm6
+  pcmpgtw     xmm7,xmm6
+  movdqa      xmm6, [esp+0E0h]
+  pand        xmm4,xmm7
+  movdqa      xmm7, [esp+50h]
+  psubw       xmm6,xmm5
+  pabsw       xmm6,xmm6
+  pcmpgtw     xmm7,xmm6
+  pand        xmm4,xmm7
+  pand        xmm4,[esp+60h]
+  pand        xmm0,xmm4                ; masked delta, high half
+  ; Apply p0 += delta, q0 -= delta; pack and store into the buffer.
+  movdqa      xmm4, [esp+40h]
+  paddw       xmm2,xmm4
+  paddw       xmm1,xmm0
+  psubw       xmm3,xmm4
+  psubw       xmm5,xmm0
+  packuswb    xmm2,xmm1
+  packuswb    xmm3,xmm5
+  movdqa      [esp+80h],xmm2
+  movdqa      [esp+90h],xmm3
+  ; Transpose back and scatter the 4-byte groups to the 16 rows.
+  mov         esi,dword [esp+1Ch]
+  movdqa      xmm0, [esi]
+  movdqa      xmm1, [esi+10h]
+  movdqa      xmm2, [esi+20h]
+  movdqa      xmm3, [esi+30h]
+  movdqa      xmm6,xmm0
+  punpcklbw   xmm0,xmm1
+  punpckhbw   xmm6,xmm1
+  movdqa      xmm7,xmm2
+  punpcklbw   xmm2,xmm3
+  punpckhbw   xmm7,xmm3
+  movdqa      xmm4,xmm0
+  movdqa      xmm5,xmm6
+  punpcklwd   xmm0,xmm2
+  punpckhwd   xmm4,xmm2
+  punpcklwd   xmm6,xmm7
+  punpckhwd   xmm5,xmm7
+  movdqa      xmm1,xmm0
+  movdqa      xmm2,xmm4
+  punpckldq   xmm0,xmm6
+  punpckhdq   xmm1,xmm6
+  punpckldq   xmm4,xmm5
+  punpckhdq   xmm2,xmm5
+  movdqa      xmm5,xmm0
+  movdqa      xmm6,xmm1
+  punpcklqdq  xmm0,xmm4
+  punpckhqdq  xmm5,xmm4
+  punpcklqdq  xmm1,xmm2
+  punpckhqdq  xmm6,xmm2
+  mov         esi,dword [esp+14h]
+  mov         ecx,dword [ebp+10h]
+  mov         edx,dword [esp+0Ch]
+  mov         edi,dword [esp+8]
+  movd        dword [esi],xmm0
+  movd        dword [esi+ecx],xmm5
+  movd        dword [esi+ecx*2],xmm1
+  movd        dword [esi+edx],xmm6
+  psrldq      xmm0,4
+  psrldq      xmm5,4
+  psrldq      xmm1,4
+  psrldq      xmm6,4
+  mov         esi,dword [esp+18h]
+  movd        dword [edi],xmm0
+  movd        dword [edi+ecx],xmm5
+  movd        dword [edi+ecx*2],xmm1
+  movd        dword [edi+edx],xmm6
+  psrldq      xmm0,4
+  psrldq      xmm5,4
+  psrldq      xmm1,4
+  psrldq      xmm6,4
+  movd        dword [esi],xmm0
+  movd        dword [esi+ecx],xmm5
+  movd        dword [esi+ecx*2],xmm1
+  movd        dword [esi+edx],xmm6
+  psrldq      xmm0,4
+  psrldq      xmm5,4
+  psrldq      xmm1,4
+  psrldq      xmm6,4
+  mov         edi,dword [esp+10h]
+  movd        dword [edi],xmm0
+  movd        dword [edi+ecx],xmm5
+  movd        dword [edi+ecx*2],xmm1
+  movd        dword [edi+edx],xmm6
+  pop         edi
+  pop         esi
+  mov         esp,ebp
+  pop         ebp
+  ret
+
+
+
+;*******************************************************************************
+;    void DeblockLumaLt4V_ssse3(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
+;                                 int32_t iBeta, int8_t * pTC)
+;*******************************************************************************
+
+
+WELS_EXTERN  DeblockLumaLt4V_ssse3
+; Vertical (horizontal-edge) luma deblocking, bS < 4 case, one 16-pixel
+; edge per call. cdecl args: [ebp+8]=pPix, [ebp+12]=iStride,
+; [ebp+16]=iAlpha, [ebp+20]=iBeta, [ebp+24]=pTC. Rows p2..q2 are loaded
+; into a large aligned stack frame; the filter runs on the low 8 pixels
+; first, then on the high 8 (the punpckhbw pass further down).
+    push	ebp
+	mov	ebp, esp
+	and	esp, -16				; fffffff0H  (align for movdqa slots)
+	sub	esp, 420				; 000001a4H
+	mov	eax, dword [ebp+8]
+	mov	ecx, dword [ebp+12]
+
+	pxor	xmm0, xmm0
+	push	ebx
+	mov	edx, dword [ebp+24]
+	movdqa	[esp+424-384], xmm0		; zero vector spill
+	push	esi
+
+	; Load the six rows around the edge: p2..p0 below pPix, q0..q2 at/after.
+	lea	esi, [ecx+ecx*2]
+	push	edi
+	mov	edi, eax
+	sub	edi, esi
+	movdqa	xmm0, [edi]
+
+	lea	esi, [ecx+ecx]
+	movdqa	[esp+432-208], xmm0		; p2 row
+	mov	edi, eax
+	sub	edi, esi
+	movdqa	xmm0, [edi]
+	movdqa	[esp+448-208], xmm0		; p1 row
+
+	mov	ebx, eax
+	sub	ebx, ecx
+	movdqa	xmm0, [ebx]
+	movdqa	[esp+464-208], xmm0		; p0 row
+
+	movdqa	xmm0, [eax]
+
+	add	ecx, eax
+	movdqa	[esp+480-208], xmm0		; q0 row
+	movdqa	xmm0, [ecx]
+	mov	dword [esp+432-404], ecx	; save q1 row pointer
+
+	movsx	ecx, word [ebp+16]		; iAlpha
+	movdqa	[esp+496-208], xmm0		; q1 row
+	movdqa	xmm0, [esi+eax]
+
+	; Expand pTC[0..3] (int8 each covering 4 pixels) into the 8-word tc
+	; vector at [esp+432-400]; broadcast alpha/beta as word vectors.
+	movsx	si, byte [edx]
+	movdqa	[esp+512-208], xmm0		; q2 row
+	movd	xmm0, ecx
+	movsx	ecx, word [ebp+20]		; iBeta
+	movdqa	xmm1, xmm0
+	punpcklwd xmm1, xmm0
+	pshufd	xmm0, xmm1, 0
+	movdqa	[esp+432-112], xmm0		; beta vector (xmm0 path below)
+	movd	xmm0, ecx
+	movsx	cx, byte [edx+1]
+	movdqa	xmm1, xmm0
+	punpcklwd xmm1, xmm0
+	mov	dword [esp+432-408], ebx	; save p0 row pointer
+	movzx	ebx, cx
+	pshufd	xmm0, xmm1, 0
+	movd	xmm1, ebx
+	movzx	ebx, cx
+	movd	xmm2, ebx
+	movzx	ebx, cx
+	movzx	ecx, cx
+	movd	xmm4, ecx
+	movzx	ecx, si
+	movd	xmm5, ecx
+	movzx	ecx, si
+	movd	xmm6, ecx
+	movzx	ecx, si
+	movd	xmm7, ecx
+	movzx	ecx, si
+	movdqa	[esp+432-336], xmm0
+	movd	xmm0, ecx
+
+	movsx	cx, byte [edx+3]
+	movsx	dx, byte [edx+2]
+	movd	xmm3, ebx
+	punpcklwd xmm0, xmm4
+	movzx	esi, cx
+	punpcklwd xmm6, xmm2
+	punpcklwd xmm5, xmm1
+	punpcklwd xmm0, xmm6
+	punpcklwd xmm7, xmm3
+	punpcklwd xmm7, xmm5
+	punpcklwd xmm0, xmm7
+	movdqa	[esp+432-400], xmm0		; tc vector, low 8 pixels
+	movd	xmm0, esi
+	movzx	esi, cx
+	movd	xmm2, esi
+	movzx	esi, cx
+	movzx	ecx, cx
+	movd	xmm4, ecx
+	movzx	ecx, dx
+	movd	xmm3, esi
+	movd	xmm5, ecx
+	punpcklwd xmm5, xmm0
+
+	movdqa	xmm0, [esp+432-384]		; reload zero vector
+	movzx	ecx, dx
+	movd	xmm6, ecx
+	movzx	ecx, dx
+	movzx	edx, dx
+	punpcklwd xmm6, xmm2
+	movd	xmm7, ecx
+	movd	xmm1, edx
+
+	movdqa	xmm2, [esp+448-208]
+	punpcklbw xmm2, xmm0			; xmm2 = p1 words (low half)
+
+	mov	ecx, 4				; rounding constant for >>3
+	movsx	edx, cx
+	punpcklwd xmm7, xmm3
+	punpcklwd xmm7, xmm5
+	movdqa	xmm5, [esp+496-208]
+	movdqa	xmm3, [esp+464-208]
+	punpcklbw xmm5, xmm0
+	movdqa	[esp+432-240], xmm5		; q1 words (low half)
+	movdqa	xmm5, [esp+512-208]
+	punpcklbw xmm5, xmm0
+	movdqa	[esp+432-352], xmm5		; q2 words (low half)
+	punpcklwd xmm1, xmm4
+	movdqa	xmm4, [esp+432-208]
+	punpcklwd xmm1, xmm6
+	movdqa	xmm6, [esp+480-208]
+	punpcklwd xmm1, xmm7			; xmm1 = tc vector, high 8 pixels
+	punpcklbw xmm6, xmm0			; q0 words (low half)
+	punpcklbw xmm3, xmm0			; p0 words (low half)
+	punpcklbw xmm4, xmm0			; p2 words (low half)
+	; Threshold masks for the low half:
+	;  [esp-288] = |p2-p0| < beta, [esp-256] = |q2-q0| < beta,
+	;  tc adjusted per-pixel at [esp-224] (tc0 + small-gap terms).
+	movdqa	xmm7, xmm3
+	psubw	xmm7, xmm4
+	pabsw	xmm7, xmm7
+	movdqa	[esp+432-272], xmm4
+	movdqa	xmm4, [esp+432-336]
+	movdqa	xmm5, xmm4
+	pcmpgtw	xmm5, xmm7
+	movdqa	[esp+432-288], xmm5
+	movdqa	xmm7, xmm6
+	psubw	xmm7, [esp+432-352]
+	pabsw	xmm7, xmm7
+	movdqa	xmm5, xmm4
+	pcmpgtw	xmm5, xmm7
+	movdqa	[esp+432-256], xmm5
+	movdqa	xmm5, xmm3
+	pavgw	xmm5, xmm6
+	movdqa	[esp+432-304], xmm5		; (p0+q0+1)>>1
+	movdqa	xmm5, [esp+432-400]
+	psubw	xmm5, [esp+432-288]
+	psubw	xmm5, [esp+432-256]
+	movdqa	[esp+432-224], xmm5		; adjusted tc (low half)
+	movdqa	xmm5, xmm6
+	psubw	xmm5, xmm3
+	movdqa	[esp+432-32], xmm6
+	psubw	xmm6, [esp+432-240]
+	movdqa	xmm7, xmm5
+	movdqa	[esp+432-384], xmm5		; q0 - p0
+	movdqa	xmm5, [esp+432-112]
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm5, xmm7			; |p0-q0| < alpha
+	pabsw	xmm6, xmm6
+	movdqa	xmm7, xmm4
+	pcmpgtw	xmm7, xmm6			; |q1-q0| < beta
+
+	pand	xmm5, xmm7
+	movdqa	xmm6, xmm3
+	psubw	xmm6, xmm2
+	pabsw	xmm6, xmm6
+	movdqa	xmm7, xmm4
+	pcmpgtw	xmm7, xmm6			; |p1-p0| < beta
+	movdqa	xmm6, [esp+432-400]
+	pand	xmm5, xmm7
+	movdqa	xmm7, xmm6
+	pcmpeqw	xmm6, xmm0
+	pcmpgtw	xmm7, xmm0
+	por	xmm7, xmm6			; tc >= 0
+	pand	xmm5, xmm7
+	movdqa	[esp+432-320], xmm5		; combined edge mask (low half)
+	; delta = clip(((q0-p0)<<2 + (p1-q1) + 4) >> 3, -tc, tc), masked.
+	movd	xmm5, edx
+	movdqa	xmm6, xmm5
+	punpcklwd xmm6, xmm5
+	pshufd	xmm5, xmm6, 0
+	movdqa	[esp+432-336], xmm5		; vector of 4s
+	movdqa	xmm5, [esp+432-224]
+	movdqa	[esp+432-368], xmm5
+	movdqa	xmm6, xmm0
+	psubw	xmm6, xmm5			; -tc
+	movdqa	xmm5, [esp+432-384]
+	psllw	xmm5, 2
+	movdqa	xmm7, xmm2
+	psubw	xmm7, [esp+432-240]
+	paddw	xmm7, xmm5
+	paddw	xmm7, [esp+432-336]
+	movdqa	xmm5, [esp+432-368]
+	psraw	xmm7, 3
+	pmaxsw	xmm6, xmm7
+	pminsw	xmm5, xmm6
+
+	pand	xmm5, [esp+432-320]
+	movdqa	xmm6, [esp+432-400]
+	movdqa	[esp+432-64], xmm5		; final p0/q0 delta (low half)
+	movdqa	[esp+432-384], xmm6
+	; p1 correction: clip(((p2 + avg(p0,q0)) >> 1 - p1), -tc0, tc0),
+	; applied only where |p2-p0| < beta; mirrored for q1 below.
+	movdqa	xmm5, xmm0
+	psubw	xmm5, xmm6
+	movdqa	[esp+432-368], xmm5
+	movdqa	xmm6, xmm5
+	movdqa	xmm5, [esp+432-272]
+	paddw	xmm5, [esp+432-304]
+	movdqa	xmm7, xmm2
+	paddw	xmm7, xmm2
+	psubw	xmm5, xmm7
+	psraw	xmm5, 1
+	pmaxsw	xmm6, xmm5
+	movdqa	xmm5, [esp+432-384]
+	pminsw	xmm5, xmm6
+
+	pand	xmm5, [esp+432-320]
+	pand	xmm5, [esp+432-288]
+	movdqa	xmm6, [esp+432-240]
+	movdqa	[esp+432-96], xmm5		; p1 delta (low half)
+	movdqa	xmm5, [esp+432-352]
+	paddw	xmm5, [esp+432-304]
+	movdqa	xmm7, xmm6
+	paddw	xmm7, xmm6
+	movdqa	xmm6, [esp+432-368]
+	psubw	xmm5, xmm7
+
+	movdqa	xmm7, [esp+496-208]
+	psraw	xmm5, 1
+	pmaxsw	xmm6, xmm5
+	movdqa	xmm5, [esp+432-400]
+	pminsw	xmm5, xmm6
+	pand	xmm5, [esp+432-320]
+	pand	xmm5, [esp+432-256]
+	movdqa	xmm6, [esp+448-208]
+	punpckhbw xmm7, xmm0			; q1 words (high half)
+	movdqa	[esp+432-352], xmm7
+
+	; Second pass: unpack the high 8 pixels of each row and redo the
+	; thresholds / deltas with the high-half tc vector (xmm1).
+	movdqa	xmm7, [esp+512-208]
+	punpckhbw xmm6, xmm0
+	movdqa	[esp+432-48], xmm5		; q1 delta (low half)
+	movdqa	xmm5, [esp+432-208]
+	movdqa	[esp+432-368], xmm6		; p1 words (high half)
+	movdqa	xmm6, [esp+464-208]
+	punpckhbw xmm7, xmm0			; q2 words (high half)
+	punpckhbw xmm5, xmm0			; p2 words (high half)
+	movdqa	[esp+432-384], xmm7
+	punpckhbw xmm6, xmm0			; p0 words (high half)
+	movdqa	[esp+432-400], xmm6
+
+	movdqa	xmm7, [esp+432-400]
+	movdqa	xmm6, [esp+480-208]
+	psubw	xmm7, xmm5
+	movdqa	[esp+432-16], xmm5
+	pabsw	xmm7, xmm7
+	punpckhbw xmm6, xmm0			; q0 words (high half)
+	movdqa	xmm5, xmm4
+	pcmpgtw	xmm5, xmm7
+	movdqa	[esp+432-288], xmm5		; |p2-p0| < beta (high half)
+
+	movdqa	xmm7, xmm6
+	psubw	xmm7, [esp+432-384]
+	pabsw	xmm7, xmm7
+	movdqa	xmm5, xmm4
+	pcmpgtw	xmm5, xmm7
+	movdqa	[esp+432-256], xmm5		; |q2-q0| < beta (high half)
+
+	movdqa	xmm5, [esp+432-400]
+	movdqa	[esp+432-80], xmm6
+	pavgw	xmm5, xmm6
+	movdqa	[esp+432-304], xmm5		; (p0+q0+1)>>1 (high half)
+
+	movdqa	xmm5, xmm1
+	psubw	xmm5, [esp+432-288]
+	psubw	xmm5, [esp+432-256]
+	movdqa	[esp+432-224], xmm5		; adjusted tc (high half)
+	movdqa	xmm5, xmm6
+	psubw	xmm5, [esp+432-400]
+	psubw	xmm6, [esp+432-352]
+	movdqa	[esp+432-272], xmm5		; q0 - p0 (high half)
+	movdqa	xmm7, xmm5
+	movdqa	xmm5, [esp+432-112]
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm5, xmm7			; |p0-q0| < alpha
+	movdqa	xmm7, xmm4
+	pabsw	xmm6, xmm6
+	pcmpgtw	xmm7, xmm6			; |q1-q0| < beta
+	movdqa	xmm6, [esp+432-368]
+
+	pand	xmm5, xmm7
+	movdqa	xmm7, [esp+432-400]
+	psubw	xmm7, xmm6
+	psubw	xmm6, [esp+432-352]
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm4, xmm7			; |p1-p0| < beta
+	pand	xmm5, xmm4
+
+	paddw	xmm2, [esp+432-96]		; p1 += p1-delta (low half)
+	movdqa	xmm4, xmm1
+	pcmpgtw	xmm4, xmm0
+	movdqa	xmm7, xmm1
+	pcmpeqw	xmm7, xmm0
+	por	xmm4, xmm7			; tc >= 0 (high half)
+	pand	xmm5, xmm4
+	movdqa	xmm4, [esp+432-224]
+	movdqa	[esp+432-320], xmm5		; combined edge mask (high half)
+	movdqa	xmm5, [esp+432-272]
+	movdqa	xmm7, xmm0
+	psubw	xmm7, xmm4			; -tc (high half)
+	psubw	xmm0, xmm1
+	psllw	xmm5, 2
+	paddw	xmm6, xmm5
+	paddw	xmm6, [esp+432-336]
+	movdqa	xmm5, [esp+432-368]
+	movdqa	[esp+432-336], xmm0		; -tc0 (high half)
+	psraw	xmm6, 3
+	pmaxsw	xmm7, xmm6
+	pminsw	xmm4, xmm7			; clipped delta (high half)
+	pand	xmm4, [esp+432-320]
+	movdqa	xmm6, xmm0
+	movdqa	xmm0, [esp+432-16]
+	paddw	xmm0, [esp+432-304]
+	movdqa	[esp+432-272], xmm4
+	movdqa	xmm4, [esp+432-368]
+	paddw	xmm4, xmm4
+	psubw	xmm0, xmm4
+
+	movdqa	xmm4, [esp+432-64]
+	psraw	xmm0, 1
+	pmaxsw	xmm6, xmm0			; p1 delta (high half), lower-clipped
+	movdqa	xmm0, [esp+432-400]
+	movdqa	xmm7, xmm1
+	pminsw	xmm7, xmm6
+	movdqa	xmm6, [esp+432-320]
+	pand	xmm7, xmm6
+	pand	xmm7, [esp+432-288]
+	paddw	xmm5, xmm7			; p1 += p1-delta (high half)
+	packuswb xmm2, xmm5			; packed new p1 row
+	movdqa	xmm5, [esp+432-272]
+	paddw	xmm0, xmm5			; p0 += delta (high half)
+	paddw	xmm3, xmm4			; p0 += delta (low half)
+	packuswb xmm3, xmm0			; packed new p0 row
+
+	movdqa	xmm0, [esp+432-32]
+	psubw	xmm0, xmm4			; q0 -= delta (low half)
+	movdqa	xmm4, [esp+432-80]
+	psubw	xmm4, xmm5			; q0 -= delta (high half)
+
+	movdqa	xmm5, [esp+432-240]
+	paddw	xmm5, [esp+432-48]		; q1 += q1-delta (low half)
+	packuswb xmm0, xmm4			; packed new q0 row
+	movdqa	xmm4, [esp+432-384]
+	paddw	xmm4, [esp+432-304]
+	movdqa	[esp+480-208], xmm0
+	movdqa	xmm0, [esp+432-352]
+	movdqa	xmm7, xmm0
+	paddw	xmm0, xmm0
+
+	; Store the four modified rows: p1 (edi), p0 (ecx), q0 (eax), q1 (edx).
+	mov	ecx, dword [esp+432-408]
+
+	mov	edx, dword [esp+432-404]
+	psubw	xmm4, xmm0
+	movdqa	xmm0, [esp+432-336]
+	movdqa	[edi], xmm2			; new p1 row
+	psraw	xmm4, 1
+	pmaxsw	xmm0, xmm4
+	pminsw	xmm1, xmm0			; q1 delta (high half)
+	movdqa	xmm0, [esp+480-208]
+
+	pop	edi
+	pand	xmm1, xmm6
+	pand	xmm1, [esp+428-256]		; esp moved +4 by the pop above
+	movdqa	[ecx], xmm3			; new p0 row
+	paddw	xmm7, xmm1			; q1 += q1-delta (high half)
+	pop	esi
+	packuswb xmm5, xmm7			; packed new q1 row
+	movdqa	[eax], xmm0			; new q0 row
+	movdqa	[edx], xmm5			; new q1 row
+	pop	ebx
+	mov	esp, ebp
+	pop	ebp
+	ret
+
+
+;*******************************************************************************
+;    void DeblockLumaEq4V_ssse3(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
+;                                 int32_t iBeta)
+;*******************************************************************************
+
+
+WELS_EXTERN  DeblockLumaEq4V_ssse3
+
+	push	ebp
+	mov	ebp, esp
+	and	esp, -16				; fffffff0H
+	sub	esp, 628				; 00000274H
+	mov	eax, dword [ebp+8]
+	mov	ecx, dword [ebp+12]
+	push	ebx
+	push	esi
+
+	lea	edx, [ecx*4]
+	pxor	xmm0, xmm0
+	movdqa	xmm2, xmm0
+
+	movdqa	xmm0, [ecx+eax]
+	mov	esi, eax
+	sub	esi, edx
+	movdqa	xmm3, [esi]
+	movdqa	xmm5, [eax]
+	push	edi
+	lea	edi, [ecx+ecx]
+	lea	ebx, [ecx+ecx*2]
+	mov	dword [esp+640-600], edi
+	mov	esi, eax
+	sub	esi, edi
+	movdqa	xmm1, [esi]
+	movdqa	 [esp+720-272], xmm0
+	mov	edi, eax
+	sub	edi, ecx
+	movdqa	xmm4, [edi]
+	add	ecx, eax
+	mov	dword [esp+640-596], ecx
+
+	mov	ecx, dword [esp+640-600]
+	movdqa	xmm0, [ecx+eax]
+	movdqa	 [esp+736-272], xmm0
+
+	movdqa	xmm0, [eax+ebx]
+	mov	edx, eax
+	sub	edx, ebx
+
+	movsx	ebx, word [ebp+16]
+	movdqa	xmm6, [edx]
+	add	ecx, eax
+	movdqa	 [esp+752-272], xmm0
+	movd	xmm0, ebx
+
+	movsx	ebx, word [ebp+20]
+	movdqa	xmm7, xmm0
+	punpcklwd xmm7, xmm0
+	pshufd	xmm0, xmm7, 0
+	movdqa	 [esp+640-320], xmm0
+	movd	xmm0, ebx
+	movdqa	xmm7, xmm0
+	punpcklwd xmm7, xmm0
+	pshufd	xmm0, xmm7, 0
+
+	movdqa	xmm7, [esp+736-272]
+	punpcklbw xmm7, xmm2
+	movdqa	 [esp+640-416], xmm7
+	movdqa	 [esp+640-512], xmm0
+	movdqa	xmm0, xmm1
+	movdqa	 [esp+672-272], xmm1
+	movdqa	xmm1, xmm4
+	movdqa	 [esp+704-272], xmm5
+	punpcklbw xmm5, xmm2
+	punpcklbw xmm1, xmm2
+
+	movdqa	xmm7, xmm5
+	psubw	xmm7, xmm1
+	pabsw	xmm7, xmm7
+	movdqa	 [esp+640-560], xmm7
+	punpcklbw xmm0, xmm2
+	movdqa	 [esp+688-272], xmm4
+	movdqa	xmm4, [esp+720-272]
+	movdqa	 [esp+640-480], xmm0
+
+	movdqa	xmm7, xmm1
+	psubw	xmm7, xmm0
+
+	movdqa	xmm0, [esp+640-512]
+	pabsw	xmm7, xmm7
+	punpcklbw xmm4, xmm2
+	pcmpgtw	xmm0, xmm7
+	movdqa	 [esp+640-384], xmm4
+	movdqa	xmm7, xmm5
+	psubw	xmm7, xmm4
+	movdqa	xmm4, [esp+640-512]
+	movdqa	 [esp+656-272], xmm6
+	punpcklbw xmm6, xmm2
+	pabsw	xmm7, xmm7
+	movdqa	 [esp+640-48], xmm2
+	movdqa	 [esp+640-368], xmm6
+	movdqa	 [esp+640-144], xmm1
+	movdqa	 [esp+640-400], xmm5
+	pcmpgtw	xmm4, xmm7
+	pand	xmm0, xmm4
+	movdqa	xmm4, [esp+640-320]
+	pcmpgtw	xmm4, [esp+640-560]
+	pand	xmm0, xmm4
+
+	mov	ebx, 2
+	movsx	ebx, bx
+	movd	xmm4, ebx
+	movdqa	xmm7, xmm4
+	punpcklwd xmm7, xmm4
+	movdqa	xmm4, [esp+640-320]
+	psraw	xmm4, 2
+	pshufd	xmm7, xmm7, 0
+	paddw	xmm4, xmm7
+	movdqa	 [esp+640-576], xmm4
+	pcmpgtw	xmm4, [esp+640-560]
+	movdqa	 [esp+640-560], xmm4
+
+	movdqa	xmm4, [esp+640-512]
+	movdqa	 [esp+640-624], xmm7
+	movdqa	xmm7, xmm1
+	psubw	xmm7, xmm6
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm4, xmm7
+
+	pand	xmm4, [esp+640-560]
+	movdqa	 [esp+640-544], xmm4
+	movdqa	xmm4, [esp+640-512]
+	movdqa	xmm7, xmm5
+	psubw	xmm7, [esp+640-416]
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm4, xmm7
+
+	pand	xmm4, [esp+640-560]
+	movdqa	 [esp+640-560], xmm4
+
+	movdqa	xmm4, [esp+640-544]
+	pandn	xmm4, xmm6
+	movdqa	 [esp+640-16], xmm4
+	mov	ebx, 4
+	movsx	ebx, bx
+	movd	xmm4, ebx
+	movdqa	xmm7, xmm4
+	punpcklwd xmm7, xmm4
+	movdqa	xmm4, xmm3
+	punpcklbw xmm4, xmm2
+	psllw	xmm4, 1
+	paddw	xmm4, xmm6
+	paddw	xmm4, xmm6
+	paddw	xmm4, xmm6
+	paddw	xmm4, [esp+640-480]
+
+	movdqa	xmm6, [esp+640-560]
+	pshufd	xmm7, xmm7, 0
+	paddw	xmm4, xmm1
+	movdqa	 [esp+640-592], xmm7
+	paddw	xmm4, xmm5
+	paddw	xmm4, xmm7
+	movdqa	xmm7, [esp+640-416]
+	pandn	xmm6, xmm7
+	movdqa	 [esp+640-80], xmm6
+	movdqa	xmm6, [esp+752-272]
+	punpcklbw xmm6, xmm2
+	psllw	xmm6, 1
+	paddw	xmm6, xmm7
+	paddw	xmm6, xmm7
+	paddw	xmm6, xmm7
+	paddw	xmm6, [esp+640-384]
+
+	movdqa	xmm7, [esp+640-480]
+	paddw	xmm6, xmm5
+	paddw	xmm6, xmm1
+	paddw	xmm6, [esp+640-592]
+	psraw	xmm6, 3
+	pand	xmm6, [esp+640-560]
+	movdqa	 [esp+640-112], xmm6
+	movdqa	xmm6, [esp+640-544]
+	pandn	xmm6, xmm7
+	movdqa	 [esp+640-336], xmm6
+	movdqa	xmm6, [esp+640-544]
+	movdqa	 [esp+640-528], xmm6
+	movdqa	xmm6, [esp+640-368]
+	paddw	xmm6, xmm7
+	movdqa	xmm7, xmm1
+	psraw	xmm4, 3
+	pand	xmm4, [esp+640-544]
+	paddw	xmm7, xmm5
+	paddw	xmm6, xmm7
+	paddw	xmm6, [esp+640-624]
+	movdqa	xmm7, [esp+640-528]
+
+	paddw	xmm5, xmm1
+	psraw	xmm6, 2
+	pand	xmm7, xmm6
+
+	movdqa	xmm6, [esp+640-384]
+	movdqa	 [esp+640-64], xmm7
+	movdqa	xmm7, [esp+640-560]
+	pandn	xmm7, xmm6
+	movdqa	 [esp+640-304], xmm7
+	movdqa	xmm7, [esp+640-560]
+	movdqa	 [esp+640-528], xmm7
+	movdqa	xmm7, [esp+640-416]
+	paddw	xmm7, xmm6
+	paddw	xmm7, xmm5
+	paddw	xmm7, [esp+640-624]
+	movdqa	xmm5, [esp+640-528]
+	psraw	xmm7, 2
+	pand	xmm5, xmm7
+	movdqa	 [esp+640-32], xmm5
+
+	movdqa	xmm5, [esp+640-544]
+	movdqa	 [esp+640-528], xmm5
+	movdqa	xmm5, [esp+640-480]
+	movdqa	xmm7, xmm5
+	paddw	xmm7, xmm5
+	movdqa	xmm5, xmm1
+	paddw	xmm5, xmm6
+	paddw	xmm6, [esp+640-592]
+	paddw	xmm7, xmm5
+	paddw	xmm7, [esp+640-624]
+	movdqa	xmm5, [esp+640-528]
+	psraw	xmm7, 2
+	pandn	xmm5, xmm7
+	movdqa	xmm7, [esp+640-480]
+	paddw	xmm7, xmm1
+	paddw	xmm7, [esp+640-400]
+	movdqa	xmm1, [esp+640-544]
+	movdqa	 [esp+640-352], xmm5
+	movdqa	xmm5, [esp+640-368]
+	psllw	xmm7, 1
+	paddw	xmm7, xmm6
+	paddw	xmm5, xmm7
+
+	movdqa	xmm7, [esp+640-400]
+	psraw	xmm5, 3
+	pand	xmm1, xmm5
+	movdqa	xmm5, [esp+640-480]
+	movdqa	 [esp+640-96], xmm1
+	movdqa	xmm1, [esp+640-560]
+	movdqa	 [esp+640-528], xmm1
+	movdqa	xmm1, [esp+640-384]
+	movdqa	xmm6, xmm1
+	paddw	xmm6, xmm1
+	paddw	xmm1, [esp+640-400]
+	paddw	xmm1, [esp+640-144]
+	paddw	xmm7, xmm5
+	paddw	xmm5, [esp+640-592]
+	paddw	xmm6, xmm7
+	paddw	xmm6, [esp+640-624]
+	movdqa	xmm7, [esp+640-528]
+	psraw	xmm6, 2
+	psllw	xmm1, 1
+	paddw	xmm1, xmm5
+
+	movdqa	xmm5, [esp+656-272]
+	pandn	xmm7, xmm6
+	movdqa	xmm6, [esp+640-416]
+	paddw	xmm6, xmm1
+	movdqa	xmm1, [esp+640-560]
+	psraw	xmm6, 3
+	pand	xmm1, xmm6
+
+	movdqa	xmm6, [esp+704-272]
+	movdqa	 [esp+640-128], xmm1
+	movdqa	xmm1, [esp+672-272]
+	punpckhbw xmm1, xmm2
+	movdqa	 [esp+640-448], xmm1
+	movdqa	xmm1, [esp+688-272]
+	punpckhbw xmm1, xmm2
+	punpckhbw xmm6, xmm2
+	movdqa	 [esp+640-288], xmm7
+	punpckhbw xmm5, xmm2
+	movdqa	 [esp+640-496], xmm1
+	movdqa	 [esp+640-432], xmm6
+
+	movdqa	xmm7, [esp+720-272]
+	punpckhbw xmm7, xmm2
+	movdqa	 [esp+640-464], xmm7
+
+	movdqa	xmm7, [esp+736-272]
+	punpckhbw xmm7, xmm2
+	movdqa	 [esp+640-528], xmm7
+
+	movdqa	xmm7, xmm6
+
+	psubw	xmm6, [esp+640-464]
+	psubw	xmm7, xmm1
+	pabsw	xmm7, xmm7
+	movdqa	 [esp+640-560], xmm7
+	por	xmm4, [esp+640-16]
+	pabsw	xmm6, xmm6
+	movdqa	xmm7, xmm1
+	psubw	xmm7, [esp+640-448]
+
+	movdqa	xmm1, [esp+640-512]
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm1, xmm7
+	movdqa	xmm7, [esp+640-512]
+	pcmpgtw	xmm7, xmm6
+	movdqa	xmm6, [esp+640-320]
+	pand	xmm1, xmm7
+	movdqa	xmm7, [esp+640-560]
+	pcmpgtw	xmm6, xmm7
+	pand	xmm1, xmm6
+
+	movdqa	xmm6, [esp+640-576]
+	pcmpgtw	xmm6, xmm7
+
+	movdqa	xmm7, [esp+640-496]
+	punpckhbw xmm3, xmm2
+	movdqa	 [esp+640-560], xmm6
+	movdqa	xmm6, [esp+640-512]
+	psubw	xmm7, xmm5
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm6, xmm7
+
+	pand	xmm6, [esp+640-560]
+	movdqa	xmm7, [esp+640-432]
+	psubw	xmm7, [esp+640-528]
+
+	psllw	xmm3, 1
+	movdqa	 [esp+640-544], xmm6
+	movdqa	xmm6, [esp+640-512]
+
+	movdqa	xmm2, [esp+640-544]
+	paddw	xmm3, xmm5
+	paddw	xmm3, xmm5
+	paddw	xmm3, xmm5
+	paddw	xmm3, [esp+640-448]
+	paddw	xmm3, [esp+640-496]
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm6, xmm7
+	pand	xmm6, [esp+640-560]
+	movdqa	 [esp+640-560], xmm6
+
+	movdqa	xmm6, xmm0
+	pand	xmm6, xmm4
+	movdqa	xmm4, xmm0
+	pandn	xmm4, [esp+640-368]
+	por	xmm6, xmm4
+	movdqa	xmm4, [esp+640-432]
+	paddw	xmm3, xmm4
+	paddw	xmm3, [esp+640-592]
+	psraw	xmm3, 3
+	pand	xmm3, xmm2
+	pandn	xmm2, xmm5
+	por	xmm3, xmm2
+	movdqa	xmm7, xmm1
+	pand	xmm7, xmm3
+	movdqa	xmm3, [esp+640-64]
+	por	xmm3, [esp+640-336]
+	movdqa	xmm2, xmm1
+	pandn	xmm2, xmm5
+	por	xmm7, xmm2
+
+	movdqa	xmm2, xmm0
+	pand	xmm2, xmm3
+	movdqa	xmm3, xmm0
+	pandn	xmm3, [esp+640-480]
+	por	xmm2, xmm3
+	packuswb xmm6, xmm7
+	movdqa	 [esp+640-336], xmm2
+	movdqa	 [esp+656-272], xmm6
+	movdqa	xmm6, [esp+640-544]
+	movdqa	xmm2, xmm5
+	paddw	xmm2, [esp+640-448]
+	movdqa	xmm3, xmm1
+	movdqa	xmm7, [esp+640-496]
+	paddw	xmm7, xmm4
+	paddw	xmm2, xmm7
+	paddw	xmm2, [esp+640-624]
+	movdqa	xmm7, [esp+640-544]
+	psraw	xmm2, 2
+	pand	xmm6, xmm2
+	movdqa	xmm2, [esp+640-448]
+	pandn	xmm7, xmm2
+	por	xmm6, xmm7
+	pand	xmm3, xmm6
+	movdqa	xmm6, xmm1
+	pandn	xmm6, xmm2
+	paddw	xmm2, [esp+640-496]
+	paddw	xmm2, xmm4
+	por	xmm3, xmm6
+	movdqa	xmm6, [esp+640-336]
+	packuswb xmm6, xmm3
+	psllw	xmm2, 1
+	movdqa	 [esp+672-272], xmm6
+	movdqa	xmm6, [esp+640-96]
+	por	xmm6, [esp+640-352]
+
+	movdqa	xmm3, xmm0
+	pand	xmm3, xmm6
+	movdqa	xmm6, xmm0
+	pandn	xmm6, [esp+640-144]
+	por	xmm3, xmm6
+	movdqa	xmm6, [esp+640-544]
+	movdqa	 [esp+640-352], xmm3
+	movdqa	xmm3, [esp+640-464]
+	paddw	xmm3, [esp+640-592]
+	paddw	xmm2, xmm3
+	movdqa	xmm3, [esp+640-448]
+	paddw	xmm5, xmm2
+	movdqa	xmm2, [esp+640-496]
+	psraw	xmm5, 3
+	pand	xmm6, xmm5
+	movdqa	xmm5, [esp+640-464]
+	paddw	xmm2, xmm5
+	paddw	xmm5, [esp+640-432]
+	movdqa	xmm4, xmm3
+	paddw	xmm4, xmm3
+	paddw	xmm4, xmm2
+	paddw	xmm4, [esp+640-624]
+	movdqa	xmm2, [esp+640-544]
+	paddw	xmm3, [esp+640-592]
+	psraw	xmm4, 2
+	pandn	xmm2, xmm4
+	por	xmm6, xmm2
+	movdqa	xmm7, xmm1
+	pand	xmm7, xmm6
+	movdqa	xmm6, [esp+640-496]
+	movdqa	xmm2, xmm1
+	pandn	xmm2, xmm6
+	por	xmm7, xmm2
+	movdqa	xmm2, [esp+640-352]
+	packuswb xmm2, xmm7
+	movdqa	 [esp+688-272], xmm2
+	movdqa	xmm2, [esp+640-128]
+	por	xmm2, [esp+640-288]
+
+	movdqa	xmm4, xmm0
+	pand	xmm4, xmm2
+	paddw	xmm5, xmm6
+	movdqa	xmm2, xmm0
+	pandn	xmm2, [esp+640-400]
+	por	xmm4, xmm2
+	movdqa	xmm2, [esp+640-528]
+	psllw	xmm5, 1
+	paddw	xmm5, xmm3
+	movdqa	xmm3, [esp+640-560]
+	paddw	xmm2, xmm5
+	psraw	xmm2, 3
+	movdqa	 [esp+640-288], xmm4
+	movdqa	xmm4, [esp+640-560]
+	pand	xmm4, xmm2
+	movdqa	xmm2, [esp+640-464]
+	movdqa	xmm5, xmm2
+	paddw	xmm5, xmm2
+	movdqa	xmm2, [esp+640-432]
+	paddw	xmm2, [esp+640-448]
+	movdqa	xmm7, xmm1
+	paddw	xmm5, xmm2
+	paddw	xmm5, [esp+640-624]
+	movdqa	xmm6, [esp+640-560]
+	psraw	xmm5, 2
+	pandn	xmm3, xmm5
+	por	xmm4, xmm3
+	movdqa	xmm3, [esp+640-32]
+	por	xmm3, [esp+640-304]
+	pand	xmm7, xmm4
+	movdqa	xmm4, [esp+640-432]
+	movdqa	xmm5, [esp+640-464]
+	movdqa	xmm2, xmm1
+	pandn	xmm2, xmm4
+	paddw	xmm4, [esp+640-496]
+	por	xmm7, xmm2
+	movdqa	xmm2, [esp+640-288]
+	packuswb xmm2, xmm7
+	movdqa	 [esp+704-272], xmm2
+
+	movdqa	xmm2, xmm0
+	pand	xmm2, xmm3
+	movdqa	xmm3, xmm0
+	pandn	xmm3, [esp+640-384]
+	por	xmm2, xmm3
+	movdqa	 [esp+640-304], xmm2
+	movdqa	xmm2, [esp+640-528]
+	movdqa	xmm3, xmm2
+	paddw	xmm3, [esp+640-464]
+	paddw	xmm3, xmm4
+	paddw	xmm3, [esp+640-624]
+	psraw	xmm3, 2
+	pand	xmm6, xmm3
+	movdqa	xmm3, [esp+640-560]
+	movdqa	xmm4, xmm3
+	pandn	xmm4, xmm5
+	por	xmm6, xmm4
+	movdqa	xmm7, xmm1
+	pand	xmm7, xmm6
+	movdqa	xmm6, [esp+640-304]
+	movdqa	xmm4, xmm1
+	pandn	xmm4, xmm5
+	por	xmm7, xmm4
+
+	movdqa	xmm4, xmm0
+	pandn	xmm0, [esp+640-416]
+	packuswb xmm6, xmm7
+	movdqa	xmm7, [esp+640-112]
+	por	xmm7, [esp+640-80]
+	pand	xmm4, xmm7
+	por	xmm4, xmm0
+	movdqa	xmm0, [esp+752-272]
+	punpckhbw xmm0, [esp+640-48]
+	psllw	xmm0, 1
+	paddw	xmm0, xmm2
+	paddw	xmm0, xmm2
+	paddw	xmm0, xmm2
+	paddw	xmm0, xmm5
+	paddw	xmm0, [esp+640-432]
+	paddw	xmm0, [esp+640-496]
+	paddw	xmm0, [esp+640-592]
+	psraw	xmm0, 3
+	pand	xmm0, xmm3
+	movdqa	xmm7, xmm1
+	pandn	xmm3, xmm2
+	por	xmm0, xmm3
+	pand	xmm7, xmm0
+
+	movdqa	xmm0, [esp+656-272]
+	movdqa	 [edx], xmm0
+
+	movdqa	xmm0, [esp+672-272]
+
+	mov	edx, dword [esp+640-596]
+	movdqa	 [esi], xmm0
+	movdqa	xmm0, [esp+688-272]
+	movdqa	 [edi], xmm0
+	movdqa	xmm0, [esp+704-272]
+
+	pop	edi
+	pandn	xmm1, xmm2
+	movdqa	 [eax], xmm0
+	por	xmm7, xmm1
+	pop	esi
+	packuswb xmm4, xmm7
+	movdqa	 [edx], xmm6
+	movdqa	 [ecx], xmm4
+	pop	ebx
+	mov	esp, ebp
+	pop	ebp
+	ret
+
+%endif
+
+
+
+;********************************************************************************
+;
+;   void DeblockLumaTransposeH2V_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pDst);
+;
+;********************************************************************************
+
+WELS_EXTERN  DeblockLumaTransposeH2V_sse2
+    push     r3
+    push     r4
+    push     r5
+
+%assign   push_num   3
+    LOAD_3_PARA
+    PUSH_XMM 8
+
+    SIGN_EXTENSION   r1, r1d
+
+    mov      r5,    r7
+    mov      r3,    r7
+    and      r3,    0Fh
+    sub      r7,    r3
+    sub      r7,    10h
+
+    lea      r3,    [r0 + r1 * 8]
+    lea      r4,    [r1 * 3]
+
+    movq    xmm0,  [r0]
+    movq    xmm7,  [r3]
+    punpcklqdq   xmm0,  xmm7
+    movq    xmm1,  [r0 + r1]
+    movq    xmm7,  [r3 + r1]
+    punpcklqdq   xmm1,  xmm7
+    movq    xmm2,  [r0 + r1*2]
+    movq    xmm7,  [r3 + r1*2]
+    punpcklqdq   xmm2,  xmm7
+    movq    xmm3,  [r0 + r4]
+    movq    xmm7,  [r3 + r4]
+    punpcklqdq   xmm3,  xmm7
+
+    lea     r0,   [r0 + r1 * 4]
+    lea     r3,   [r3 + r1 * 4]
+    movq    xmm4,  [r0]
+    movq    xmm7,  [r3]
+    punpcklqdq   xmm4,  xmm7
+    movq    xmm5,  [r0 + r1]
+    movq    xmm7,  [r3 + r1]
+    punpcklqdq   xmm5,  xmm7
+    movq    xmm6,  [r0 + r1*2]
+    movq    xmm7,  [r3 + r1*2]
+    punpcklqdq   xmm6,  xmm7
+
+    movdqa  [r7],   xmm0
+    movq    xmm7,  [r0 + r4]
+    movq    xmm0,  [r3 + r4]
+    punpcklqdq   xmm7,  xmm0
+    movdqa  xmm0,   [r7]
+
+    SSE2_TransTwo8x8B  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r7]
+    ;pOut: m5, m3, m4, m8, m6, m2, m7, m1
+
+    movdqa  [r2],    xmm4
+    movdqa  [r2 + 10h],  xmm2
+    movdqa  [r2 + 20h],  xmm3
+    movdqa  [r2 + 30h],  xmm7
+    movdqa  [r2 + 40h],  xmm5
+    movdqa  [r2 + 50h],  xmm1
+    movdqa  [r2 + 60h],  xmm6
+    movdqa  [r2 + 70h],  xmm0
+
+    mov     r7,   r5
+    POP_XMM
+    pop     r5
+    pop     r4
+    pop     r3
+    ret
+
+
+;*******************************************************************************************
+;
+;   void DeblockLumaTransposeV2H_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pSrc);
+;
+;*******************************************************************************************
+
+WELS_EXTERN   DeblockLumaTransposeV2H_sse2
+    push     r3
+    push     r4
+
+%assign  push_num 2
+    LOAD_3_PARA
+    PUSH_XMM 8
+
+    SIGN_EXTENSION   r1, r1d
+
+    mov      r4,    r7
+    mov      r3,    r7
+    and      r3,    0Fh
+    sub      r7,    r3
+    sub      r7,    10h
+
+    movdqa   xmm0,   [r2]
+    movdqa   xmm1,   [r2 + 10h]
+    movdqa   xmm2,   [r2 + 20h]
+    movdqa   xmm3,   [r2 + 30h]
+    movdqa   xmm4,   [r2 + 40h]
+    movdqa   xmm5,   [r2 + 50h]
+    movdqa   xmm6,   [r2 + 60h]
+    movdqa   xmm7,   [r2 + 70h]
+
+    SSE2_TransTwo8x8B  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r7]
+    ;pOut: m5, m3, m4, m8, m6, m2, m7, m1
+
+    lea      r2,   [r1 * 3]
+
+    movq     [r0],  xmm4
+    movq     [r0 + r1],  xmm2
+    movq     [r0 + r1*2],  xmm3
+    movq     [r0 + r2],  xmm7
+
+    lea      r0,   [r0 + r1*4]
+    movq     [r0],  xmm5
+    movq     [r0 + r1],  xmm1
+    movq     [r0 + r1*2],  xmm6
+    movq     [r0 + r2],  xmm0
+
+    psrldq    xmm4,   8
+    psrldq    xmm2,   8
+    psrldq    xmm3,   8
+    psrldq    xmm7,   8
+    psrldq    xmm5,   8
+    psrldq    xmm1,   8
+    psrldq    xmm6,   8
+    psrldq    xmm0,   8
+
+    lea       r0,  [r0 + r1*4]
+    movq     [r0],  xmm4
+    movq     [r0 + r1],  xmm2
+    movq     [r0 + r1*2],  xmm3
+    movq     [r0 + r2],  xmm7
+
+    lea      r0,   [r0 + r1*4]
+    movq     [r0],  xmm5
+    movq     [r0 + r1],  xmm1
+    movq     [r0 + r1*2],  xmm6
+    movq     [r0 + r2],  xmm0
+
+
+    mov      r7,   r4
+    POP_XMM
+    pop      r4
+    pop      r3
+    ret
+
--- /dev/null
+++ b/codec/common/x86/expand_picture.asm
@@ -1,0 +1,728 @@
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  expand_picture.asm
+;*
+;*  Abstract
+;*      mmxext/sse for expand_frame
+;*
+;*  History
+;*      09/25/2009 Created
+;*
+;*
+;*************************************************************************/
+
+%include "asm_inc.asm"
+
+
+
+;***********************************************************************
+; Macros and other preprocessor constants
+;***********************************************************************
+
+;***********************************************************************
+; Code
+;***********************************************************************
+
+
+
+SECTION .text
+
+
+;;;;;;;expanding result;;;;;;;
+
+;aaaa|attttttttttttttttb|bbbb
+;aaaa|attttttttttttttttb|bbbb
+;aaaa|attttttttttttttttb|bbbb
+;aaaa|attttttttttttttttb|bbbb
+;----------------------------
+;aaaa|attttttttttttttttb|bbbb
+;llll|l                r|rrrr
+;llll|l                r|rrrr
+;llll|l                r|rrrr
+;llll|l                r|rrrr
+;llll|l                r|rrrr
+;cccc|ceeeeeeeeeeeeeeeed|dddd
+;----------------------------
+;cccc|ceeeeeeeeeeeeeeeed|dddd
+;cccc|ceeeeeeeeeeeeeeeed|dddd
+;cccc|ceeeeeeeeeeeeeeeed|dddd
+;cccc|ceeeeeeeeeeeeeeeed|dddd
+
+%macro mov_line_8x4_mmx		3	; dst, stride, mm?
+	movq [%1], %3
+	movq [%1+%2], %3
+	lea %1, [%1+2*%2]
+	movq [%1], %3
+	movq [%1+%2], %3
+	lea %1, [%1+2*%2]
+%endmacro
+
+%macro mov_line_end8x4_mmx		3	; dst, stride, mm?
+	movq [%1], %3
+	movq [%1+%2], %3
+	lea %1, [%1+2*%2]
+	movq [%1], %3
+	movq [%1+%2], %3
+	lea %1, [%1+%2]
+%endmacro
+
+%macro mov_line_16x4_sse2	4	; dst, stride, xmm?, u/a
+	movdq%4 [%1], %3 		; top(bottom)_0
+	movdq%4 [%1+%2], %3		; top(bottom)_1
+	lea %1, [%1+2*%2]
+	movdq%4 [%1], %3 		; top(bottom)_2
+	movdq%4 [%1+%2], %3		; top(bottom)_3
+	lea %1, [%1+2*%2]
+%endmacro
+
+%macro mov_line_end16x4_sse2	4	; dst, stride, xmm?, u/a
+	movdq%4 [%1], %3 		; top(bottom)_0
+	movdq%4 [%1+%2], %3		; top(bottom)_1
+	lea %1, [%1+2*%2]
+	movdq%4 [%1], %3 		; top(bottom)_2
+	movdq%4 [%1+%2], %3		; top(bottom)_3
+	lea %1, [%1+%2]
+%endmacro
+
+%macro mov_line_32x4_sse2	3	; dst, stride, xmm?
+	movdqa [%1], %3 		; top(bottom)_0
+	movdqa [%1+16], %3 		; top(bottom)_0
+	movdqa [%1+%2], %3		; top(bottom)_1
+	movdqa [%1+%2+16], %3		; top(bottom)_1
+	lea %1, [%1+2*%2]
+	movdqa [%1], %3 		; top(bottom)_2
+	movdqa [%1+16], %3 		; top(bottom)_2
+	movdqa [%1+%2], %3		; top(bottom)_3
+	movdqa [%1+%2+16], %3		; top(bottom)_3
+	lea %1, [%1+2*%2]
+%endmacro
+
+%macro mov_line_end32x4_sse2	3	; dst, stride, xmm?
+	movdqa [%1], %3 		; top(bottom)_0
+	movdqa [%1+16], %3 		; top(bottom)_0
+	movdqa [%1+%2], %3		; top(bottom)_1
+	movdqa [%1+%2+16], %3		; top(bottom)_1
+	lea %1, [%1+2*%2]
+	movdqa [%1], %3 		; top(bottom)_2
+	movdqa [%1+16], %3 		; top(bottom)_2
+	movdqa [%1+%2], %3		; top(bottom)_3
+	movdqa [%1+%2+16], %3		; top(bottom)_3
+	lea %1, [%1+%2]
+%endmacro
+
+%macro exp_top_bottom_sse2	1	; iPaddingSize [luma(32)/chroma(16)]
+    ;r2 [width/16(8)]
+    ;r0 [pSrc +0], r5 [pSrc -width] r1[-stride], 32(16) ;top
+    ;r3 [pSrc +(h-1)*stride], r4 [pSrc + (h+31)*stride],32(16); bottom
+
+%if %1 == 32		; for luma
+	sar r2, 04h 	; width / 16(8) pixels
+.top_bottom_loops:
+	; top
+	movdqa xmm0, [r0]		; first line of picture pData
+	mov_line_16x4_sse2 r5, r1, xmm0, a	; dst, stride, xmm?
+	mov_line_16x4_sse2 r5, r1, xmm0, a
+	mov_line_16x4_sse2 r5, r1, xmm0, a
+	mov_line_16x4_sse2 r5, r1, xmm0, a
+	mov_line_16x4_sse2 r5, r1, xmm0, a	; dst, stride, xmm?
+	mov_line_16x4_sse2 r5, r1, xmm0, a
+	mov_line_16x4_sse2 r5, r1, xmm0, a
+	mov_line_end16x4_sse2 r5, r1, xmm0, a
+
+	; bottom
+	movdqa xmm1, [r3] 		; last line of picture pData
+	mov_line_16x4_sse2 r4, r1, xmm1, a	; dst, stride, xmm?
+	mov_line_16x4_sse2 r4, r1, xmm1, a
+	mov_line_16x4_sse2 r4, r1, xmm1, a
+	mov_line_16x4_sse2 r4, r1, xmm1, a
+	mov_line_16x4_sse2 r4, r1, xmm1, a	; dst, stride, xmm?
+	mov_line_16x4_sse2 r4, r1, xmm1, a
+	mov_line_16x4_sse2 r4, r1, xmm1, a
+	mov_line_end16x4_sse2 r4, r1, xmm1, a
+
+	lea r0, [r0+16]		; top pSrc
+	lea r5, [r5+16]		; top dst
+	lea r3, [r3+16]		; bottom pSrc
+	lea r4, [r4+16]		; bottom dst
+	neg r1 			; positive/negative stride need for next loop?
+
+	dec r2
+	jnz near .top_bottom_loops
+%elif %1 == 16	; for chroma ??
+	mov r6, r2
+	sar r2, 04h 	; (width / 16) pixels
+.top_bottom_loops:
+	; top
+	movdqa xmm0, [r0]		; first line of picture pData
+	mov_line_16x4_sse2 r5, r1, xmm0, a	; dst, stride, xmm?
+	mov_line_16x4_sse2 r5, r1, xmm0, a
+	mov_line_16x4_sse2 r5, r1, xmm0, a
+	mov_line_end16x4_sse2 r5, r1, xmm0, a
+
+	; bottom
+	movdqa xmm1, [r3] 		; last line of picture pData
+	mov_line_16x4_sse2 r4, r1, xmm1, a	; dst, stride, xmm?
+	mov_line_16x4_sse2 r4, r1, xmm1, a
+	mov_line_16x4_sse2 r4, r1, xmm1, a
+	mov_line_end16x4_sse2 r4, r1, xmm1, a
+
+	lea r0, [r0+16]		; top pSrc
+	lea r5, [r5+16]		; top dst
+	lea r3, [r3+16]		; bottom pSrc
+	lea r4, [r4+16]		; bottom dst
+	neg r1 			; positive/negative stride need for next loop?
+
+	dec r2
+	jnz near .top_bottom_loops
+
+	; for remaining 8 bytes
+	and r6, 0fh		; any 8 bytes left?
+	test r6, r6
+	jz near .to_be_continued	; nothing left, exit here
+
+	; top
+	movq mm0, [r0]		; remaining 8 bytes
+	mov_line_8x4_mmx r5, r1, mm0	; dst, stride, mm?
+	mov_line_8x4_mmx r5, r1, mm0	; dst, stride, mm?
+	mov_line_8x4_mmx r5, r1, mm0	; dst, stride, mm?
+	mov_line_end8x4_mmx r5, r1, mm0	; dst, stride, mm?
+	; bottom
+	movq mm1, [r3]
+	mov_line_8x4_mmx r4, r1, mm1	; dst, stride, mm?
+	mov_line_8x4_mmx r4, r1, mm1	; dst, stride, mm?
+	mov_line_8x4_mmx r4, r1, mm1	; dst, stride, mm?
+	mov_line_end8x4_mmx r4, r1, mm1	; dst, stride, mm?
+	WELSEMMS
+
+.to_be_continued:
+%endif
+%endmacro
+
+%macro exp_left_right_sse2	2	; iPaddingSize [luma(32)/chroma(16)], u/a
+    ;r6 [height]
+    ;r0 [pSrc+0]  r5[pSrc-32] r1[stride]
+    ;r3 [pSrc+(w-1)] r4[pSrc+w]
+
+%if %1 == 32		; for luma
+.left_right_loops:
+	; left
+	movzx r2d, byte [r0]		; pixel pData for left border
+	SSE2_Copy16Times	xmm0, r2d				; dst, tmp, pSrc [generic register name: a/b/c/d]
+	movdqa [r5], xmm0
+	movdqa [r5+16], xmm0
+
+	; right
+	movzx r2d, byte [r3]
+	SSE2_Copy16Times	xmm1, r2d				; dst, tmp, pSrc [generic register name: a/b/c/d]
+	movdqa [r4], xmm1
+	movdqa [r4+16], xmm1
+
+	lea r0, [r0+r1]		; left pSrc
+	lea r5, [r5+r1]		; left dst
+	lea r3, [r3+r1]		; right pSrc
+	lea r4, [r4+r1]		; right dst
+
+	dec r6
+	jnz near .left_right_loops
+%elif %1 == 16	; for chroma ??
+.left_right_loops:
+	; left
+	movzx r2d, byte [r0]		; pixel pData for left border
+	SSE2_Copy16Times	xmm0, r2d				; dst, tmp, pSrc [generic register name: a/b/c/d]
+	movdqa [r5], xmm0
+
+	; right
+	movzx r2d, byte [r3]
+	SSE2_Copy16Times	xmm1, r2d				; dst, tmp, pSrc [generic register name: a/b/c/d]
+	movdq%2 [r4], xmm1								; might not be aligned 16 bytes in case chroma planes
+
+	lea r0, [r0+r1]		; left pSrc
+	lea r5, [r5+r1]		; left dst
+	lea r3, [r3+r1]		; right pSrc
+	lea r4, [r4+r1]		; right dst
+
+	dec r6
+	jnz near .left_right_loops
+%endif
+%endmacro
+
+%macro exp_cross_sse2	2	; iPaddingSize [luma(32)/chroma(16)], u/a
+	; top-left: (x)mm3, top-right: (x)mm4, bottom-left: (x)mm5, bottom-right: (x)mm6
+	; edi: TL, ebp: TR, eax: BL, ebx: BR, ecx, -stride
+    ;r3:TL ,r4:TR,r5:BL,r6:BR r1:-stride
+%if %1 == 32		; luma
+	; TL
+	mov_line_32x4_sse2	r3, r1, xmm3	; dst, stride, xmm?
+	mov_line_32x4_sse2	r3, r1, xmm3	; dst, stride, xmm?
+	mov_line_32x4_sse2	r3, r1, xmm3	; dst, stride, xmm?
+	mov_line_32x4_sse2	r3, r1, xmm3	; dst, stride, xmm?
+	mov_line_32x4_sse2	r3, r1, xmm3	; dst, stride, xmm?
+	mov_line_32x4_sse2	r3, r1, xmm3	; dst, stride, xmm?
+	mov_line_32x4_sse2	r3, r1, xmm3	; dst, stride, xmm?
+	mov_line_end32x4_sse2	r3, r1, xmm3	; dst, stride, xmm?
+
+	; TR
+	mov_line_32x4_sse2	r4, r1, xmm4	; dst, stride, xmm?
+	mov_line_32x4_sse2	r4, r1, xmm4	; dst, stride, xmm?
+	mov_line_32x4_sse2	r4, r1, xmm4	; dst, stride, xmm?
+	mov_line_32x4_sse2	r4, r1, xmm4	; dst, stride, xmm?
+	mov_line_32x4_sse2	r4, r1, xmm4	; dst, stride, xmm?
+	mov_line_32x4_sse2	r4, r1, xmm4	; dst, stride, xmm?
+	mov_line_32x4_sse2	r4, r1, xmm4	; dst, stride, xmm?
+	mov_line_end32x4_sse2	r4, r1, xmm4	; dst, stride, xmm?
+
+	; BL
+	mov_line_32x4_sse2	r5, r1, xmm5	; dst, stride, xmm?
+	mov_line_32x4_sse2	r5, r1, xmm5	; dst, stride, xmm?
+	mov_line_32x4_sse2	r5, r1, xmm5	; dst, stride, xmm?
+	mov_line_32x4_sse2	r5, r1, xmm5	; dst, stride, xmm?
+	mov_line_32x4_sse2	r5, r1, xmm5	; dst, stride, xmm?
+	mov_line_32x4_sse2	r5, r1, xmm5	; dst, stride, xmm?
+	mov_line_32x4_sse2	r5, r1, xmm5	; dst, stride, xmm?
+	mov_line_end32x4_sse2	r5, r1, xmm5	; dst, stride, xmm?
+
+	; BR
+	mov_line_32x4_sse2	r6, r1, xmm6	; dst, stride, xmm?
+	mov_line_32x4_sse2	r6, r1, xmm6	; dst, stride, xmm?
+	mov_line_32x4_sse2	r6, r1, xmm6	; dst, stride, xmm?
+	mov_line_32x4_sse2	r6, r1, xmm6	; dst, stride, xmm?
+	mov_line_32x4_sse2	r6, r1, xmm6	; dst, stride, xmm?
+	mov_line_32x4_sse2	r6, r1, xmm6	; dst, stride, xmm?
+	mov_line_32x4_sse2	r6, r1, xmm6	; dst, stride, xmm?
+	mov_line_end32x4_sse2	r6, r1, xmm6	; dst, stride, xmm?
+%elif %1 == 16	; chroma
+	; TL
+	mov_line_16x4_sse2	r3, r1, xmm3, a	; dst, stride, xmm?
+	mov_line_16x4_sse2	r3, r1, xmm3, a	; dst, stride, xmm?
+	mov_line_16x4_sse2	r3, r1, xmm3, a	; dst, stride, xmm?
+	mov_line_end16x4_sse2	r3, r1, xmm3, a	; dst, stride, xmm?
+
+	; TR
+	mov_line_16x4_sse2	r4, r1, xmm4, %2	; dst, stride, xmm?
+	mov_line_16x4_sse2	r4, r1, xmm4, %2	; dst, stride, xmm?
+	mov_line_16x4_sse2	r4, r1, xmm4, %2	; dst, stride, xmm?
+	mov_line_end16x4_sse2 r4, r1, xmm4, %2	; dst, stride, xmm?
+
+	; BL
+	mov_line_16x4_sse2	r5, r1, xmm5, a	; dst, stride, xmm?
+	mov_line_16x4_sse2	r5, r1, xmm5, a	; dst, stride, xmm?
+	mov_line_16x4_sse2	r5, r1, xmm5, a	; dst, stride, xmm?
+	mov_line_end16x4_sse2	r5, r1, xmm5, a	; dst, stride, xmm?
+
+	; BR
+	mov_line_16x4_sse2	r6, r1, xmm6, %2	; dst, stride, xmm?
+	mov_line_16x4_sse2	r6, r1, xmm6, %2	; dst, stride, xmm?
+	mov_line_16x4_sse2	r6, r1, xmm6, %2	; dst, stride, xmm?
+	mov_line_end16x4_sse2	r6, r1, xmm6, %2	; dst, stride, xmm?
+%endif
+%endmacro
+
+;***********************************************************************----------------
+; void ExpandPictureLuma_sse2(	uint8_t *pDst,
+;									const int32_t iStride,
+;									const int32_t iWidth,
+;									const int32_t iHeight	);
+;***********************************************************************----------------
+WELS_EXTERN ExpandPictureLuma_sse2
+
+    push r4
+    push r5
+    push r6
+
+    %assign push_num 3
+    LOAD_4_PARA
+    PUSH_XMM 7
+
+    SIGN_EXTENSION r1, r1d
+    SIGN_EXTENSION r2, r2d
+    SIGN_EXTENSION r3, r3d
+
+    ;also prepare for cross border pData top-left:xmm3
+
+    movzx r6d,byte[r0]
+    SSE2_Copy16Times xmm3,r6d         ;xmm3: pSrc[0]
+
+    neg r1
+    lea r5,[r0+r1]              ;last line of top border r5= dst top  pSrc[-stride]
+    neg r1
+
+    push r3
+
+
+    dec r3                      ;h-1
+    imul r3,r1                  ;(h-1)*stride
+    lea  r3,[r0+r3]             ;pSrc[(h-1)*stride]  r3 = src bottom
+
+    mov r6,r1                    ;r6 = stride
+    sal r6,05h                   ;r6 = 32*stride
+    lea r4,[r3+r6]               ;r4 = dst bottom
+
+    ;also prepare for cross border data: bottom-left with xmm5,bottom-right xmm6
+
+    movzx r6d,byte [r3]             ;bottom-left
+    SSE2_Copy16Times xmm5,r6d
+
+    lea r6,[r3+r2-1]
+    movzx r6d,byte [r6]
+    SSE2_Copy16Times xmm6,r6d ;bottom-right
+
+    neg r1  ;r1 = -stride
+
+    push r0
+    push r1
+    push r2
+
+    exp_top_bottom_sse2 32
+
+	; for both left and right border
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+    pop r2
+    pop r1
+    pop r0
+
+    lea r5,[r0-32]                          ;left border dst  luma =32 chroma = -16
+
+    lea r3,[r0+r2-1]                        ;right border src
+    lea r4,[r3+1]                           ;right border dst
+
+    ;prepare for cross border data: top-right with xmm4
+     movzx r6d,byte [r3]                         ;top-right
+     SSE2_Copy16Times xmm4,r6d
+
+    neg r1   ;r1 = stride
+
+
+    pop r6  ;  r6 = height
+
+
+
+    push r0
+    push r1
+    push r2
+    push r6
+
+    exp_left_right_sse2  32,a
+
+    pop r6
+    pop r2
+    pop r1
+    pop r0
+
+	; for cross border [top-left, top-right, bottom-left, bottom-right]
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+    ; have done xmm3,..,xmm6 cross pData initialization above, perform padding as below, To be continued..
+
+    neg r1  ;r1 = -stride
+    lea r3,[r0-32]
+    lea r3,[r3+r1]    ;last line of top-left border
+
+    lea r4,[r0+r2]    ;psrc +width
+    lea r4,[r4+r1]    ;psrc +width -stride
+
+
+    neg r1  ;r1 = stride
+    add r6,32         ;height +32(16) ,luma = 32, chroma = 16
+    imul r6,r1
+
+    lea r5,[r3+r6]    ;last line of bottom-left border
+    lea r6,[r4+r6]    ;last line of bottom-right border
+
+    neg r1 ; r1 = -stride
+
+    ; for left & right border expanding
+    exp_cross_sse2 32,a
+
+    POP_XMM
+    LOAD_4_PARA_POP
+
+    pop r6
+    pop r5
+    pop r4
+
+    %assign push_num 0
+
+
+	ret
+
+;***********************************************************************----------------
+; void ExpandPictureChromaAlign_sse2(	uint8_t *pDst,
+;										const int32_t iStride,
+;										const int32_t iWidth,
+;										const int32_t iHeight	);
+;***********************************************************************----------------
+WELS_EXTERN ExpandPictureChromaAlign_sse2
+
+    push r4
+    push r5
+    push r6
+
+    %assign push_num 3
+    LOAD_4_PARA
+    PUSH_XMM 7
+
+    SIGN_EXTENSION r1,r1d
+    SIGN_EXTENSION r2,r2d
+    SIGN_EXTENSION r3,r3d
+
+    ;also prepare for cross border pData top-left:xmm3
+
+    movzx r6d,byte [r0]
+    SSE2_Copy16Times xmm3,r6d         ;xmm3: pSrc[0]
+
+    neg r1
+    lea r5,[r0+r1]              ;last line of top border r5= dst top  pSrc[-stride]
+    neg r1
+
+    push r3
+
+
+    dec r3                      ;h-1
+    imul r3,r1                  ;(h-1)*stride
+    lea  r3,[r0+r3]             ;pSrc[(h-1)*stride]  r3 = src bottom
+
+    mov r6,r1                    ;r6 = stride
+    sal r6,04h                   ;r6 = 32*stride
+    lea r4,[r3+r6]               ;r4 = dst bottom
+
+    ;also prepare for cross border data: bottom-left with xmm5,bottom-right xmm6
+
+    movzx r6d,byte [r3]             ;bottom-left
+    SSE2_Copy16Times xmm5,r6d
+
+    lea r6,[r3+r2-1]
+    movzx r6d,byte [r6]
+    SSE2_Copy16Times xmm6,r6d ;bottom-right
+
+    neg r1  ;r1 = -stride
+
+    push r0
+    push r1
+    push r2
+
+    exp_top_bottom_sse2 16
+
+	; for both left and right border
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+    pop r2
+    pop r1
+    pop r0
+
+    lea r5,[r0-16]                          ;left border dst  luma =32 chroma = -16
+
+    lea r3,[r0+r2-1]                        ;right border src
+    lea r4,[r3+1]                           ;right border dst
+
+    ;prepare for cross border data: top-right with xmm4
+    movzx r6d,byte [r3]                         ;top-right
+    SSE2_Copy16Times xmm4,r6d
+
+    neg r1   ;r1 = stride
+
+
+    pop r6  ;  r6 = height
+
+
+
+    push r0
+    push r1
+    push r2
+	push r6
+    exp_left_right_sse2 16,a
+
+    pop r6
+    pop r2
+    pop r1
+    pop r0
+
+	; for cross border [top-left, top-right, bottom-left, bottom-right]
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+    ; have done xmm3,..,xmm6 cross pData initialization above, perform padding as below, To be continued..
+
+    neg r1  ;r1 = -stride
+    lea r3,[r0-16]
+    lea r3,[r3+r1]    ;last line of top-left border
+
+    lea r4,[r0+r2]    ;psrc +width
+    lea r4,[r4+r1]    ;psrc +width -stride
+
+
+    neg r1  ;r1 = stride
+    add r6,16         ;height +32(16) ,luma = 32, chroma = 16
+    imul r6,r1
+
+    lea r5,[r3+r6]    ;last line of bottom-left border
+    lea r6,[r4+r6]    ;last line of bottom-right border
+
+    neg r1 ; r1 = -stride
+
+    ; for left & right border expanding
+    exp_cross_sse2 16,a
+
+    POP_XMM
+    LOAD_4_PARA_POP
+
+    pop r6
+    pop r5
+    pop r4
+
+    %assign push_num 0
+
+
+	ret
+
+;***********************************************************************----------------
+; void ExpandPictureChromaUnalign_sse2(	uint8_t *pDst,
+;										const int32_t iStride,
+;										const int32_t iWidth,
+;										const int32_t iHeight	);
+;***********************************************************************----------------
+WELS_EXTERN ExpandPictureChromaUnalign_sse2
+	push r4
+    push r5
+    push r6
+
+    %assign push_num 3
+    LOAD_4_PARA
+    PUSH_XMM 7
+
+    SIGN_EXTENSION r1,r1d
+    SIGN_EXTENSION r2,r2d
+    SIGN_EXTENSION r3,r3d
+
+    ;also prepare for cross border pData top-left:xmm3
+
+    movzx r6d,byte [r0]
+    SSE2_Copy16Times xmm3,r6d         ;xmm3: pSrc[0]
+
+    neg r1
+    lea r5,[r0+r1]              ;last line of top border r5= dst top  pSrc[-stride]
+    neg r1
+
+    push r3
+
+
+    dec r3                      ;h-1
+    imul r3,r1                  ;(h-1)*stride
+    lea  r3,[r0+r3]             ;pSrc[(h-1)*stride]  r3 = src bottom
+
+    mov r6,r1                    ;r6 = stride
+    sal r6,04h                   ;r6 = 32*stride
+    lea r4,[r3+r6]               ;r4 = dst bottom
+
+    ;also prepare for cross border data: bottom-left with xmm5,bottom-right xmm6
+
+    movzx r6d,byte [r3]             ;bottom-left
+    SSE2_Copy16Times xmm5,r6d
+
+    lea r6,[r3+r2-1]
+    movzx r6d,byte [r6]
+    SSE2_Copy16Times xmm6,r6d ;bottom-right
+
+    neg r1  ;r1 = -stride
+
+    push r0
+    push r1
+    push r2
+
+    exp_top_bottom_sse2 16
+
+	; for both left and right border
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+    pop r2
+    pop r1
+    pop r0
+
+    lea r5,[r0-16]                          ;left border dst  luma =32 chroma = -16
+
+    lea r3,[r0+r2-1]                        ;right border src
+    lea r4,[r3+1]                           ;right border dst
+
+    ;prepare for cross border data: top-right with xmm4
+    movzx r6d,byte [r3]                         ;top-right
+    SSE2_Copy16Times xmm4,r6d
+
+    neg r1   ;r1 = stride
+
+
+    pop r6  ;  r6 = height
+
+
+
+    push r0
+    push r1
+    push r2
+	push r6
+    exp_left_right_sse2 16,u
+
+    pop r6
+    pop r2
+    pop r1
+    pop r0
+
+	; for cross border [top-left, top-right, bottom-left, bottom-right]
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+    ; have done xmm3,..,xmm6 cross pData initialization above, perform padding as below, To be continued..
+
+    neg r1  ;r1 = -stride
+    lea r3,[r0-16]
+    lea r3,[r3+r1]    ;last line of top-left border
+
+    lea r4,[r0+r2]    ;psrc +width
+    lea r4,[r4+r1]    ;psrc +width -stride
+
+
+    neg r1  ;r1 = stride
+    add r6,16         ;height +32(16) ,luma = 32, chroma = 16
+    imul r6,r1
+
+    lea r5,[r3+r6]    ;last line of bottom-left border
+    lea r6,[r4+r6]    ;last line of bottom-right border
+
+    neg r1 ; r1 = -stride
+
+    ; for left & right border expanding
+    exp_cross_sse2 16,u
+
+    POP_XMM
+    LOAD_4_PARA_POP
+
+    pop r6
+    pop r5
+    pop r4
+
+    %assign push_num 0
+
+
+	ret
--- /dev/null
+++ b/codec/common/x86/mb_copy.asm
@@ -1,0 +1,581 @@
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  mb_copy.asm
+;*
+;*  Abstract
+;*      mb_copy and mb_copy1
+;*
+;*  History
+;*      15/09/2009 Created
+;*		12/28/2009 Modified with larger throughput
+;*		12/29/2011 Tuned WelsCopy16x16NotAligned_sse2, added UpdateMbMv_sse2 WelsCopy16x8NotAligned_sse2,
+;*				   WelsCopy16x8_mmx, WelsCopy8x16_mmx etc;
+;*
+;*
+;*********************************************************************************************/
+%include "asm_inc.asm"
+
+;***********************************************************************
+; Macros and other preprocessor constants
+;***********************************************************************
+
+;***********************************************************************
+; Code
+;***********************************************************************
+
+SECTION .text
+
+
+;***********************************************************************
+; void WelsCopy16x16_sse2(	uint8_t* Dst,
+;							int32_t  iStrideD,
+;							uint8_t* Src,
+;							int32_t  iStrideS )
+;***********************************************************************
+WELS_EXTERN WelsCopy16x16_sse2
+
+	push r4
+	push r5
+	%assign  push_num 2
+    LOAD_4_PARA
+    PUSH_XMM 8
+
+	lea r4, [r1+2*r1]	;ebx, [eax+2*eax]	; x3
+	lea r5, [r3+2*r3]	;edx, [ecx+2*ecx]	; x3
+
+	movdqa xmm0, [r2]
+	movdqa xmm1, [r2+r3]
+	movdqa xmm2, [r2+2*r3]
+	movdqa xmm3, [r2+r5]
+	lea r2, [r2+4*r3]
+	movdqa xmm4, [r2]
+	movdqa xmm5, [r2+r3]
+	movdqa xmm6, [r2+2*r3]
+	movdqa xmm7, [r2+r5]
+	lea r2, [r2+4*r3]
+
+	movdqa [r0], xmm0
+	movdqa [r0+r1], xmm1
+	movdqa [r0+2*r1], xmm2
+	movdqa [r0+r4], xmm3
+	lea r0, [r0+4*r1]
+	movdqa [r0], xmm4
+	movdqa [r0+r1], xmm5
+	movdqa [r0+2*r1], xmm6
+	movdqa [r0+r4], xmm7
+	lea r0, [r0+4*r1]
+
+	movdqa xmm0, [r2]
+	movdqa xmm1, [r2+r3]
+	movdqa xmm2, [r2+2*r3]
+	movdqa xmm3, [r2+r5]
+	lea r2, [r2+4*r3]
+	movdqa xmm4, [r2]
+	movdqa xmm5, [r2+r3]
+	movdqa xmm6, [r2+2*r3]
+	movdqa xmm7, [r2+r5]
+
+	movdqa [r0], xmm0
+	movdqa [r0+r1], xmm1
+	movdqa [r0+2*r1], xmm2
+	movdqa [r0+r4], xmm3
+	lea r0, [r0+4*r1]
+	movdqa [r0], xmm4
+	movdqa [r0+r1], xmm5
+	movdqa [r0+2*r1], xmm6
+	movdqa [r0+r4], xmm7
+	POP_XMM
+	LOAD_4_PARA_POP
+	pop r5
+	pop r4
+	ret
+
+;***********************************************************************
+; void WelsCopy16x16NotAligned_sse2(	uint8_t* Dst,
+;							int32_t  iStrideD,
+;							uint8_t* Src,
+;							int32_t  iStrideS )
+;***********************************************************************
+; dst can be align with 16 bytes, but not sure about pSrc, 12/29/2011
+WELS_EXTERN WelsCopy16x16NotAligned_sse2
+	push r4
+	push r5
+	%assign  push_num 2
+    LOAD_4_PARA
+    PUSH_XMM 8
+
+	lea r4, [r1+2*r1]	;ebx, [eax+2*eax]	; x3
+	lea r5, [r3+2*r3]	;edx, [ecx+2*ecx]	; x3
+
+	movdqu xmm0, [r2]
+	movdqu xmm1, [r2+r3]
+	movdqu xmm2, [r2+2*r3]
+	movdqu xmm3, [r2+r5]
+	lea r2, [r2+4*r3]
+	movdqu xmm4, [r2]
+	movdqu xmm5, [r2+r3]
+	movdqu xmm6, [r2+2*r3]
+	movdqu xmm7, [r2+r5]
+	lea r2, [r2+4*r3]
+
+	movdqa [r0], xmm0
+	movdqa [r0+r1], xmm1
+	movdqa [r0+2*r1], xmm2
+	movdqa [r0+r4], xmm3
+	lea r0, [r0+4*r1]
+	movdqa [r0], xmm4
+	movdqa [r0+r1], xmm5
+	movdqa [r0+2*r1], xmm6
+	movdqa [r0+r4], xmm7
+	lea r0, [r0+4*r1]
+
+	movdqu xmm0, [r2]
+	movdqu xmm1, [r2+r3]
+	movdqu xmm2, [r2+2*r3]
+	movdqu xmm3, [r2+r5]
+	lea r2, [r2+4*r3]
+	movdqu xmm4, [r2]
+	movdqu xmm5, [r2+r3]
+	movdqu xmm6, [r2+2*r3]
+	movdqu xmm7, [r2+r5]
+
+	movdqa [r0], xmm0
+	movdqa [r0+r1], xmm1
+	movdqa [r0+2*r1], xmm2
+	movdqa [r0+r4], xmm3
+	lea r0, [r0+4*r1]
+	movdqa [r0], xmm4
+	movdqa [r0+r1], xmm5
+	movdqa [r0+2*r1], xmm6
+	movdqa [r0+r4], xmm7
+	POP_XMM
+	LOAD_4_PARA_POP
+	pop r5
+	pop r4
+	ret
+
+; , 12/29/2011
+;***********************************************************************
+; void WelsCopy16x8NotAligned_sse2(uint8_t* Dst,
+;							int32_t  iStrideD,
+;							uint8_t* Src,
+;							int32_t  iStrideS )
+;***********************************************************************
+WELS_EXTERN WelsCopy16x8NotAligned_sse2
+	push r4
+	push r5
+	%assign  push_num 2
+    LOAD_4_PARA
+    PUSH_XMM 8
+
+	lea r4, [r1+2*r1]	;ebx, [eax+2*eax]	; x3
+	lea r5, [r3+2*r3]	;edx, [ecx+2*ecx]	; x3
+
+	movdqu xmm0, [r2]
+	movdqu xmm1, [r2+r3]
+	movdqu xmm2, [r2+2*r3]
+	movdqu xmm3, [r2+r5]
+	lea r2, [r2+4*r3]
+	movdqu xmm4, [r2]
+	movdqu xmm5, [r2+r3]
+	movdqu xmm6, [r2+2*r3]
+	movdqu xmm7, [r2+r5]
+
+	movdqa [r0], xmm0
+	movdqa [r0+r1], xmm1
+	movdqa [r0+2*r1], xmm2
+	movdqa [r0+r4], xmm3
+	lea r0, [r0+4*r1]
+	movdqa [r0], xmm4
+	movdqa [r0+r1], xmm5
+	movdqa [r0+2*r1], xmm6
+	movdqa [r0+r4], xmm7
+	POP_XMM
+	LOAD_4_PARA_POP
+	pop r5
+	pop r4
+	ret
+
+
+;***********************************************************************
+; void WelsCopy8x16_mmx(uint8_t* Dst,
+;                       int32_t  iStrideD,
+;                       uint8_t* Src,
+;                       int32_t  iStrideS )
+;***********************************************************************
+WELS_EXTERN WelsCopy8x16_mmx
+	%assign  push_num 0
+    LOAD_4_PARA
+
+	movq mm0, [r2]
+	movq mm1, [r2+r3]
+	lea r2, [r2+2*r3]
+	movq mm2, [r2]
+	movq mm3, [r2+r3]
+	lea r2, [r2+2*r3]
+	movq mm4, [r2]
+	movq mm5, [r2+r3]
+	lea r2, [r2+2*r3]
+	movq mm6, [r2]
+	movq mm7, [r2+r3]
+	lea r2, [r2+2*r3]
+
+	movq [r0], mm0
+	movq [r0+r1], mm1
+	lea r0, [r0+2*r1]
+	movq [r0], mm2
+	movq [r0+r1], mm3
+	lea r0, [r0+2*r1]
+	movq [r0], mm4
+	movq [r0+r1], mm5
+	lea r0, [r0+2*r1]
+	movq [r0], mm6
+	movq [r0+r1], mm7
+	lea r0, [r0+2*r1]
+
+	movq mm0, [r2]
+	movq mm1, [r2+r3]
+	lea r2, [r2+2*r3]
+	movq mm2, [r2]
+	movq mm3, [r2+r3]
+	lea r2, [r2+2*r3]
+	movq mm4, [r2]
+	movq mm5, [r2+r3]
+	lea r2, [r2+2*r3]
+	movq mm6, [r2]
+	movq mm7, [r2+r3]
+
+	movq [r0], mm0
+	movq [r0+r1], mm1
+	lea r0, [r0+2*r1]
+	movq [r0], mm2
+	movq [r0+r1], mm3
+	lea r0, [r0+2*r1]
+	movq [r0], mm4
+	movq [r0+r1], mm5
+	lea r0, [r0+2*r1]
+	movq [r0], mm6
+	movq [r0+r1], mm7
+
+	WELSEMMS
+	LOAD_4_PARA_POP
+	ret
+
+;***********************************************************************
+; void WelsCopy8x8_mmx(  uint8_t* Dst,
+;                        int32_t  iStrideD,
+;                        uint8_t* Src,
+;                        int32_t  iStrideS )
+;***********************************************************************
+WELS_EXTERN WelsCopy8x8_mmx
+	push r4
+	%assign  push_num 1
+    LOAD_4_PARA
+	lea r4, [r3+2*r3]	;edx, [ebx+2*ebx]
+
+	; to prefetch next loop
+	prefetchnta [r2+2*r3]
+	prefetchnta [r2+r4]
+	movq mm0, [r2]
+	movq mm1, [r2+r3]
+	lea r2, [r2+2*r3]
+	; to prefetch next loop
+	prefetchnta [r2+2*r3]
+	prefetchnta [r2+r4]
+	movq mm2, [r2]
+	movq mm3, [r2+r3]
+	lea r2, [r2+2*r3]
+	; to prefetch next loop
+	prefetchnta [r2+2*r3]
+	prefetchnta [r2+r4]
+	movq mm4, [r2]
+	movq mm5, [r2+r3]
+	lea r2, [r2+2*r3]
+	movq mm6, [r2]
+	movq mm7, [r2+r3]
+
+	movq [r0], mm0
+	movq [r0+r1], mm1
+	lea r0, [r0+2*r1]
+	movq [r0], mm2
+	movq [r0+r1], mm3
+	lea r0, [r0+2*r1]
+	movq [r0], mm4
+	movq [r0+r1], mm5
+	lea r0, [r0+2*r1]
+	movq [r0], mm6
+	movq [r0+r1], mm7
+
+	WELSEMMS
+	LOAD_4_PARA_POP
+	pop r4
+	ret
+
+; (dunhuang@cisco), 12/21/2011
+;***********************************************************************
+; void UpdateMbMv_sse2( SMVUnitXY *pMvBuffer, const SMVUnitXY sMv )
+;***********************************************************************
+WELS_EXTERN UpdateMbMv_sse2
+
+    %assign  push_num 0
+    LOAD_2_PARA
+
+	movd xmm0, r1d	; _mv
+	pshufd xmm1, xmm0, $00
+	movdqa [r0     ], xmm1
+	movdqa [r0+0x10], xmm1
+	movdqa [r0+0x20], xmm1
+	movdqa [r0+0x30], xmm1
+	ret
+
+;*******************************************************************************
+; Macros and other preprocessor constants
+;*******************************************************************************
+
+;*******************************************************************************
+; Code
+;*******************************************************************************
+
+SECTION .text
+
+
+
+
+;*******************************************************************************
+; void PixelAvgWidthEq4_mmx( uint8_t *pDst,  int iDstStride,
+;                           uint8_t *pSrcA, int iSrcAStride,
+;                           uint8_t *pSrcB, int iSrcBStride,
+;                           int iHeight );
+;*******************************************************************************
+WELS_EXTERN PixelAvgWidthEq4_mmx
+
+    %assign  push_num 0
+    LOAD_7_PARA
+
+	SIGN_EXTENSION	r1, r1d
+	SIGN_EXTENSION	r3, r3d
+	SIGN_EXTENSION	r5, r5d
+	SIGN_EXTENSION	r6, r6d
+
+ALIGN 4
+.height_loop:
+	movd        mm0, [r4]
+    pavgb       mm0, [r2]
+    movd        [r0], mm0
+
+    dec         r6
+    lea         r0, [r0+r1]
+    lea         r2, [r2+r3]
+    lea         r4, [r4+r5]
+    jne         .height_loop
+
+	WELSEMMS
+	LOAD_7_PARA_POP
+    ret
+
+
+;*******************************************************************************
+; void PixelAvgWidthEq8_mmx( uint8_t *pDst,  int iDstStride,
+;                           uint8_t *pSrcA, int iSrcAStride,
+;                           uint8_t *pSrcB, int iSrcBStride,
+;                           int iHeight );
+;*******************************************************************************
+WELS_EXTERN PixelAvgWidthEq8_mmx
+    %assign  push_num 0
+    LOAD_7_PARA
+
+	SIGN_EXTENSION	r1, r1d
+	SIGN_EXTENSION	r3, r3d
+	SIGN_EXTENSION	r5, r5d
+	SIGN_EXTENSION	r6, r6d
+
+ALIGN 4
+.height_loop:
+	movq        mm0, [r2]
+    pavgb       mm0, [r4]
+    movq        [r0], mm0
+    movq        mm0, [r2+r3]
+    pavgb       mm0, [r4+r5]
+    movq		[r0+r1], mm0
+
+    lea			r2,  [r2+2*r3]
+    lea			r4,  [r4+2*r5]
+    lea			r0,  [r0+2*r1]
+
+    sub         r6, 2
+    jnz         .height_loop
+
+	WELSEMMS
+	LOAD_7_PARA_POP
+    ret
+
+
+
+;*******************************************************************************
+; void PixelAvgWidthEq16_sse2( uint8_t *pDst,  int iDstStride,
+;                          uint8_t *pSrcA, int iSrcAStride,
+;                          uint8_t *pSrcB, int iSrcBStride,
+;                          int iHeight );
+;*******************************************************************************
+WELS_EXTERN PixelAvgWidthEq16_sse2
+
+    %assign  push_num 0
+    LOAD_7_PARA
+	SIGN_EXTENSION	r1, r1d
+	SIGN_EXTENSION	r3, r3d
+	SIGN_EXTENSION	r5, r5d
+	SIGN_EXTENSION	r6, r6d
+ALIGN 4
+.height_loop:
+	movdqu      xmm0, [r2]
+	movdqu	    xmm1, [r4]
+	pavgb	    xmm0, xmm1
+	;pavgb       xmm0, [r4]
+    movdqu      [r0], xmm0
+
+	movdqu      xmm0, [r2+r3]
+	movdqu      xmm1, [r4+r5]
+	pavgb	    xmm0, xmm1
+    movdqu      [r0+r1], xmm0
+
+	movdqu      xmm0, [r2+2*r3]
+	movdqu       xmm1, [r4+2*r5]
+	pavgb	    xmm0, xmm1
+    movdqu      [r0+2*r1], xmm0
+
+    lea         r2, [r2+2*r3]
+    lea			r4, [r4+2*r5]
+    lea			r0, [r0+2*r1]
+
+	movdqu      xmm0, [r2+r3]
+	movdqu      xmm1, [r4+r5]
+	pavgb	    xmm0, xmm1
+    movdqu      [r0+r1], xmm0
+
+    lea         r2, [r2+2*r3]
+    lea			r4, [r4+2*r5]
+    lea			r0, [r0+2*r1]
+
+    sub         r6, 4
+    jne         .height_loop
+
+	WELSEMMS
+	LOAD_7_PARA_POP
+    ret
+
+;*******************************************************************************
+;  void McCopyWidthEq4_mmx( uint8_t *pSrc, int iSrcStride,
+;                          uint8_t *pDst, int iDstStride, int iHeight )
+;*******************************************************************************
+WELS_EXTERN McCopyWidthEq4_mmx
+    push	r5
+    %assign  push_num 1
+    LOAD_5_PARA
+
+	SIGN_EXTENSION	r1, r1d
+	SIGN_EXTENSION	r3, r3d
+	SIGN_EXTENSION	r4, r4d
+
+ALIGN 4
+.height_loop:
+	mov r5d, [r0]
+	mov [r2], r5d
+
+	add r0, r1
+	add r2, r3
+	dec r4
+	jnz .height_loop
+	WELSEMMS
+    LOAD_5_PARA_POP
+    pop	   r5
+    ret
+
+;*******************************************************************************
+;   void McCopyWidthEq8_mmx( uint8_t *pSrc, int iSrcStride,
+;                           uint8_t *pDst, int iDstStride, int iHeight )
+;*******************************************************************************
+WELS_EXTERN McCopyWidthEq8_mmx
+    %assign  push_num 0
+    LOAD_5_PARA
+
+	SIGN_EXTENSION	r1, r1d
+	SIGN_EXTENSION	r3, r3d
+	SIGN_EXTENSION	r4, r4d
+
+ALIGN 4
+.height_loop:
+	movq mm0, [r0]
+	movq [r2], mm0
+	add r0, r1
+	add r2, r3
+	dec r4
+	jnz .height_loop
+
+	WELSEMMS
+	LOAD_5_PARA_POP
+    ret
+
+
+;*******************************************************************************
+;   void McCopyWidthEq16_sse2( uint8_t *pSrc, int iSrcStride, uint8_t *pDst, int iDstStride, int iHeight )
+;*******************************************************************************
+;read unaligned memory
+%macro SSE_READ_UNA 2
+	movq	%1, [%2]
+	movhps	%1,	[%2+8]
+%endmacro
+
+;write unaligned memory
+%macro SSE_WRITE_UNA 2
+	movq	[%1],	%2
+	movhps	[%1+8], %2
+%endmacro
+WELS_EXTERN McCopyWidthEq16_sse2
+    %assign  push_num 0
+    LOAD_5_PARA
+	SIGN_EXTENSION	r1, r1d
+	SIGN_EXTENSION	r3, r3d
+	SIGN_EXTENSION	r4, r4d
+ALIGN 4
+.height_loop:
+    SSE_READ_UNA	xmm0, r0
+    SSE_READ_UNA	xmm1, r0+r1
+    SSE_WRITE_UNA	r2, xmm0
+    SSE_WRITE_UNA	r2+r3, xmm1
+
+	sub		r4,	2
+    lea     r0, [r0+r1*2]
+    lea     r2, [r2+r3*2]
+    jnz     .height_loop
+
+	LOAD_5_PARA_POP
+    ret
--- /dev/null
+++ b/codec/common/x86/mc_chroma.asm
@@ -1,0 +1,293 @@
+;*!
+;* \copy
+;*     Copyright (c)  2004-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  mc_chroma.asm
+;*
+;*  Abstract
+;*      mmx motion compensation for chroma
+;*
+;*  History
+;*      10/13/2004 Created
+;*
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+
+;***********************************************************************
+; Local Data (Read Only)
+;***********************************************************************
+
+SECTION .rodata align=16
+
+;***********************************************************************
+; Various memory constants (trigonometric values or rounding values)
+;***********************************************************************
+
+ALIGN 16
+h264_d0x20_sse2:
+	dw 32,32,32,32,32,32,32,32
+ALIGN 16
+h264_d0x20_mmx:
+	dw 32,32,32,32
+
+
+;=============================================================================
+; Code
+;=============================================================================
+
+SECTION .text
+
+;*******************************************************************************
+; void McChromaWidthEq4_mmx( const uint8_t *src,
+;							int32_t iSrcStride,
+;							uint8_t *pDst,
+;							int32_t iDstStride,
+;							const uint8_t *pABCD,
+;							int32_t iHeigh );
+;*******************************************************************************
+WELS_EXTERN McChromaWidthEq4_mmx
+	%assign  push_num 0
+	LOAD_6_PARA
+	SIGN_EXTENSION	r1, r1d
+	SIGN_EXTENSION	r3, r3d
+	SIGN_EXTENSION	r5, r5d
+
+	movd mm3, [r4];	[eax]
+	WELS_Zero mm7
+	punpcklbw mm3, mm3
+	movq      mm4, mm3
+	punpcklwd mm3, mm3
+	punpckhwd mm4, mm4
+
+	movq	  mm5, mm3
+	punpcklbw mm3, mm7
+	punpckhbw mm5, mm7
+
+	movq	  mm6, mm4
+	punpcklbw mm4, mm7
+	punpckhbw mm6, mm7
+
+	lea r4, [r0 + r1] ;lea ebx, [esi + eax]
+	movd mm0, [r0]
+	movd mm1, [r0+1]
+	punpcklbw mm0, mm7
+	punpcklbw mm1, mm7
+.xloop:
+
+	pmullw mm0, mm3
+	pmullw mm1, mm5
+	paddw  mm0, mm1
+
+	movd  mm1, [r4]
+	punpcklbw mm1, mm7
+	movq mm2, mm1
+	pmullw mm1, mm4
+	paddw mm0, mm1
+
+	movd mm1, [r4+1]
+	punpcklbw mm1, mm7
+	movq mm7, mm1
+	pmullw mm1,mm6
+	paddw mm0, mm1
+	movq mm1,mm7
+
+	paddw mm0, [h264_d0x20_mmx]
+	psrlw mm0, 6
+
+	WELS_Zero mm7
+	packuswb mm0, mm7
+	movd [r2], mm0
+
+	movq mm0, mm2
+
+	lea r2, [r2 + r3]
+	lea r4, [r4 + r1]
+
+	dec r5
+	jnz near .xloop
+	WELSEMMS
+	LOAD_6_PARA_POP
+	ret
+
+
+;*******************************************************************************
+; void McChromaWidthEq8_sse2( const uint8_t *pSrc,
+;						int32_t iSrcStride,
+;						uint8_t *pDst,
+;						int32_t iDstStride,
+;						const uint8_t *pABCD,
+;						int32_t iheigh );
+;*******************************************************************************
+WELS_EXTERN McChromaWidthEq8_sse2
+	%assign  push_num 0
+	LOAD_6_PARA
+	PUSH_XMM 8
+	SIGN_EXTENSION	r1, r1d
+	SIGN_EXTENSION	r3, r3d
+	SIGN_EXTENSION	r5, r5d
+
+	movd xmm3, [r4]
+	WELS_Zero xmm7
+	punpcklbw  xmm3, xmm3
+	punpcklwd  xmm3, xmm3
+
+	movdqa	   xmm4, xmm3
+	punpckldq  xmm3, xmm3
+	punpckhdq  xmm4, xmm4
+	movdqa     xmm5, xmm3
+	movdqa	   xmm6, xmm4
+
+	punpcklbw  xmm3, xmm7
+	punpckhbw  xmm5, xmm7
+	punpcklbw  xmm4, xmm7
+	punpckhbw  xmm6, xmm7
+
+	lea r4, [r0 + r1] ;lea ebx, [esi + eax]
+	movq xmm0, [r0]
+	movq xmm1, [r0+1]
+	punpcklbw xmm0, xmm7
+	punpcklbw xmm1, xmm7
+.xloop:
+
+	pmullw xmm0, xmm3
+	pmullw xmm1, xmm5
+	paddw  xmm0, xmm1
+
+	movq  xmm1, [r4]
+	punpcklbw xmm1, xmm7
+	movdqa xmm2, xmm1
+	pmullw xmm1, xmm4
+	paddw xmm0, xmm1
+
+	movq xmm1, [r4+1]
+	punpcklbw xmm1, xmm7
+	movdqa xmm7, xmm1
+	pmullw xmm1, xmm6
+	paddw xmm0, xmm1
+	movdqa xmm1,xmm7
+
+	paddw xmm0, [h264_d0x20_sse2]
+	psrlw xmm0, 6
+
+	WELS_Zero xmm7
+	packuswb xmm0, xmm7
+	movq [r2], xmm0
+
+	movdqa xmm0, xmm2
+
+	lea r2, [r2 + r3]
+	lea r4, [r4 + r1]
+
+	dec r5
+	jnz near .xloop
+
+	POP_XMM
+	LOAD_6_PARA_POP
+
+	ret
+
+
+
+
+;***********************************************************************
+; void McChromaWidthEq8_ssse3( const uint8_t *pSrc,
+;						 int32_t iSrcStride,
+;                        uint8_t *pDst,
+;                        int32_t iDstStride,
+;                        const uint8_t *pABCD,
+;					     int32_t iHeigh);
+;***********************************************************************
+WELS_EXTERN McChromaWidthEq8_ssse3
+	%assign  push_num 0
+	LOAD_6_PARA
+	PUSH_XMM 8
+	SIGN_EXTENSION	r1, r1d
+	SIGN_EXTENSION	r3, r3d
+	SIGN_EXTENSION	r5, r5d
+
+    pxor      xmm7, xmm7
+    movd   xmm5, [r4]
+    punpcklwd xmm5, xmm5
+    punpckldq xmm5, xmm5
+    movdqa    xmm6, xmm5
+    punpcklqdq xmm5, xmm5
+    punpckhqdq xmm6, xmm6
+
+    sub r2, r3 ;sub esi, edi
+    sub r2, r3
+	movdqa xmm7, [h264_d0x20_sse2]
+
+	movdqu xmm0, [r0]
+	movdqa xmm1, xmm0
+	psrldq xmm1, 1
+	punpcklbw xmm0, xmm1
+
+.hloop_chroma:
+	lea	r2, [r2+2*r3]
+
+	movdqu xmm2, [r0+r1]
+	movdqa xmm3, xmm2
+	psrldq xmm3, 1
+	punpcklbw xmm2, xmm3
+	movdqa      xmm4, xmm2
+
+    pmaddubsw  xmm0, xmm5
+    pmaddubsw  xmm2, xmm6
+    paddw      xmm0, xmm2
+    paddw      xmm0, xmm7
+	psrlw      xmm0, 6
+    packuswb   xmm0, xmm0
+    movq       [r2],xmm0
+
+    lea r0, [r0+2*r1]
+    movdqu xmm2, [r0]
+    movdqa xmm3, xmm2
+    psrldq xmm3, 1
+    punpcklbw xmm2, xmm3
+    movdqa      xmm0, xmm2
+
+    pmaddubsw  xmm4, xmm5
+    pmaddubsw  xmm2, xmm6
+    paddw      xmm4, xmm2
+    paddw      xmm4, xmm7
+	psrlw      xmm4, 6
+    packuswb   xmm4, xmm4
+    movq       [r2+r3],xmm4
+
+	sub r5, 2
+	jnz .hloop_chroma
+
+	POP_XMM
+	LOAD_6_PARA_POP
+
+	ret
+
+
--- /dev/null
+++ b/codec/common/x86/mc_luma.asm
@@ -1,0 +1,1164 @@
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  mc_luma.asm
+;*
+;*  Abstract
+;*      sse2 motion compensation
+;*
+;*  History
+;*      17/08/2009 Created
+;*
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+
+;*******************************************************************************
+; Local Data (Read Only)
+;*******************************************************************************
+SECTION .rodata align=16
+
+;*******************************************************************************
+; Various memory constants (trigonometric values or rounding values)
+;*******************************************************************************
+
+ALIGN 16
+h264_w0x10:
+	dw 16, 16, 16, 16
+ALIGN 16
+h264_w0x10_1:
+	dw 16, 16, 16, 16, 16, 16, 16, 16
+ALIGN 16
+h264_mc_hc_32:
+	dw 32, 32, 32, 32, 32, 32, 32, 32
+
+
+;*******************************************************************************
+; Code
+;*******************************************************************************
+
+SECTION .text
+
+
+
+;*******************************************************************************
+; void McHorVer20WidthEq4_mmx( const uint8_t *pSrc,
+;                       int iSrcStride,
+;						uint8_t *pDst,
+;						int iDstStride,
+;						int iHeight)
+;*******************************************************************************
+WELS_EXTERN McHorVer20WidthEq4_mmx
+    %assign  push_num 0
+    LOAD_5_PARA
+	SIGN_EXTENSION	r1, r1d
+	SIGN_EXTENSION	r3, r3d
+	SIGN_EXTENSION	r4, r4d
+
+	sub r0, 2
+	WELS_Zero mm7
+	movq mm6, [h264_w0x10]
+.height_loop:
+	movd mm0, [r0]
+	punpcklbw mm0, mm7
+	movd mm1, [r0+5]
+	punpcklbw mm1, mm7
+	movd mm2, [r0+1]
+	punpcklbw mm2, mm7
+	movd mm3, [r0+4]
+	punpcklbw mm3, mm7
+	movd mm4, [r0+2]
+	punpcklbw mm4, mm7
+	movd mm5, [r0+3]
+	punpcklbw mm5, mm7
+
+	paddw mm2, mm3
+	paddw mm4, mm5
+	psllw mm4, 2
+	psubw mm4, mm2
+	paddw mm0, mm1
+	paddw mm0, mm4
+	psllw mm4, 2
+	paddw mm0, mm4
+	paddw mm0, mm6
+	psraw mm0, 5
+	packuswb mm0, mm7
+	movd [r2], mm0
+
+	add r0, r1
+	add r2, r3
+	dec r4
+	jnz .height_loop
+
+	WELSEMMS
+	LOAD_5_PARA_POP
+	ret
+
+;*******************************************************************************
+; Macros and other preprocessor constants
+;*******************************************************************************
+
+
+%macro SSE_LOAD_8P 3
+	movq %1, %3
+	punpcklbw %1, %2
+%endmacro
+
+%macro FILTER_HV_W8 9
+	paddw	%1, %6
+	movdqa	%8, %3
+	movdqa	%7, %2
+	paddw	%1, [h264_w0x10_1]
+	paddw	%8, %4
+	paddw	%7, %5
+	psllw	%8, 2
+	psubw	%8, %7
+	paddw	%1, %8
+	psllw	%8, 2
+	paddw	%1, %8
+	psraw   %1, 5
+	WELS_Zero %8
+	packuswb %1, %8
+	movq    %9, %1
+%endmacro
+
+;*******************************************************************************
+; Code
+;*******************************************************************************
+
+SECTION .text
+
+;***********************************************************************
+; void McHorVer22Width8HorFirst_sse2(const int16_t *pSrc,
+;                       int16_t iSrcStride,
+;						uint8_t *pDst,
+;						int32_t iDstStride
+;						int32_t iHeight
+;                       )
+;***********************************************************************
+WELS_EXTERN McHorVer22Width8HorFirst_sse2
+	%assign  push_num 0
+    LOAD_5_PARA
+    PUSH_XMM 8
+	SIGN_EXTENSION	r1, r1d
+	SIGN_EXTENSION	r3, r3d
+	SIGN_EXTENSION	r4, r4d
+	pxor xmm7, xmm7
+
+	sub r0, r1				;;;;;;;;need 5 more lines.
+	sub r0, r1
+
+.yloop_width_8:
+	movq xmm0, [r0]
+	punpcklbw xmm0, xmm7
+	movq xmm1, [r0+5]
+	punpcklbw xmm1, xmm7
+	movq xmm2, [r0+1]
+	punpcklbw xmm2, xmm7
+	movq xmm3, [r0+4]
+	punpcklbw xmm3, xmm7
+	movq xmm4, [r0+2]
+	punpcklbw xmm4, xmm7
+	movq xmm5, [r0+3]
+	punpcklbw xmm5, xmm7
+
+	paddw xmm2, xmm3
+	paddw xmm4, xmm5
+	psllw xmm4, 2
+	psubw xmm4, xmm2
+	paddw xmm0, xmm1
+	paddw xmm0, xmm4
+	psllw xmm4, 2
+	paddw xmm0, xmm4
+	movdqa [r2], xmm0
+
+	add r0, r1
+	add r2, r3
+	dec r4
+	jnz .yloop_width_8
+	POP_XMM
+	LOAD_5_PARA_POP
+	ret
+
+;*******************************************************************************
+; void McHorVer20WidthEq8_sse2(  const uint8_t *pSrc,
+;                       int iSrcStride,
+;												uint8_t *pDst,
+;												int iDstStride,
+;												int iHeight,
+;                      );
+;*******************************************************************************
+WELS_EXTERN McHorVer20WidthEq8_sse2
+	%assign  push_num 0
+    LOAD_5_PARA
+    PUSH_XMM 8
+	SIGN_EXTENSION	r1, r1d
+	SIGN_EXTENSION	r3, r3d
+	SIGN_EXTENSION	r4, r4d
+	lea r0, [r0-2]            ;pSrc -= 2;
+
+	pxor xmm7, xmm7
+	movdqa xmm6, [h264_w0x10_1]
+.y_loop:
+	movq xmm0, [r0]
+	punpcklbw xmm0, xmm7
+	movq xmm1, [r0+5]
+	punpcklbw xmm1, xmm7
+	movq xmm2, [r0+1]
+	punpcklbw xmm2, xmm7
+	movq xmm3, [r0+4]
+	punpcklbw xmm3, xmm7
+	movq xmm4, [r0+2]
+	punpcklbw xmm4, xmm7
+	movq xmm5, [r0+3]
+	punpcklbw xmm5, xmm7
+
+	paddw xmm2, xmm3
+	paddw xmm4, xmm5
+	psllw xmm4, 2
+	psubw xmm4, xmm2
+	paddw xmm0, xmm1
+	paddw xmm0, xmm4
+	psllw xmm4, 2
+	paddw xmm0, xmm4
+	paddw xmm0, xmm6
+	psraw xmm0, 5
+
+	packuswb xmm0, xmm7
+	movq [r2], xmm0
+
+	lea r2, [r2+r3]
+	lea r0, [r0+r1]
+	dec r4
+	jnz near .y_loop
+
+	POP_XMM
+	LOAD_5_PARA_POP
+	ret
+
+;*******************************************************************************
+; void McHorVer20WidthEq16_sse2(  const uint8_t *pSrc,
+;                       int iSrcStride,
+;												uint8_t *pDst,
+;												int iDstStride,
+;												int iHeight,
+;                      );
+;*******************************************************************************
+WELS_EXTERN McHorVer20WidthEq16_sse2
+	%assign  push_num 0
+    LOAD_5_PARA
+    PUSH_XMM 8
+	SIGN_EXTENSION	r1, r1d
+	SIGN_EXTENSION	r3, r3d
+	SIGN_EXTENSION	r4, r4d
+	lea r0, [r0-2]            ;pSrc -= 2;
+
+	pxor xmm7, xmm7
+	movdqa xmm6, [h264_w0x10_1]
+.y_loop:
+
+	movq xmm0, [r0]
+	punpcklbw xmm0, xmm7
+	movq xmm1, [r0+5]
+	punpcklbw xmm1, xmm7
+	movq xmm2, [r0+1]
+	punpcklbw xmm2, xmm7
+	movq xmm3, [r0+4]
+	punpcklbw xmm3, xmm7
+	movq xmm4, [r0+2]
+	punpcklbw xmm4, xmm7
+	movq xmm5, [r0+3]
+	punpcklbw xmm5, xmm7
+
+	paddw xmm2, xmm3
+	paddw xmm4, xmm5
+	psllw xmm4, 2
+	psubw xmm4, xmm2
+	paddw xmm0, xmm1
+	paddw xmm0, xmm4
+	psllw xmm4, 2
+	paddw xmm0, xmm4
+	paddw xmm0, xmm6
+	psraw xmm0, 5
+	packuswb xmm0, xmm7
+	movq [r2], xmm0
+
+	movq xmm0, [r0+8]
+	punpcklbw xmm0, xmm7
+	movq xmm1, [r0+5+8]
+	punpcklbw xmm1, xmm7
+	movq xmm2, [r0+1+8]
+	punpcklbw xmm2, xmm7
+	movq xmm3, [r0+4+8]
+	punpcklbw xmm3, xmm7
+	movq xmm4, [r0+2+8]
+	punpcklbw xmm4, xmm7
+	movq xmm5, [r0+3+8]
+	punpcklbw xmm5, xmm7
+
+	paddw xmm2, xmm3
+	paddw xmm4, xmm5
+	psllw xmm4, 2
+	psubw xmm4, xmm2
+	paddw xmm0, xmm1
+	paddw xmm0, xmm4
+	psllw xmm4, 2
+	paddw xmm0, xmm4
+	paddw xmm0, xmm6
+	psraw xmm0, 5
+	packuswb xmm0, xmm7
+	movq [r2+8], xmm0
+
+	lea r2, [r2+r3]
+	lea r0, [r0+r1]
+	dec r4
+	jnz near .y_loop
+
+	POP_XMM
+	LOAD_5_PARA_POP
+	ret
+
+
+;*******************************************************************************
+; void McHorVer02WidthEq8_sse2( const uint8_t *pSrc,
+;                       int iSrcStride,
+;                       uint8_t *pDst,
+;                       int iDstStride,
+;                       int iHeight )
+;*******************************************************************************
+WELS_EXTERN McHorVer02WidthEq8_sse2
+	%assign  push_num 0
+    LOAD_5_PARA
+    PUSH_XMM 8
+	SIGN_EXTENSION	r1, r1d
+	SIGN_EXTENSION	r3, r3d
+	SIGN_EXTENSION	r4, r4d
+	sub r0, r1
+	sub r0, r1
+
+	WELS_Zero xmm7
+
+	SSE_LOAD_8P xmm0, xmm7, [r0]
+	SSE_LOAD_8P xmm1, xmm7, [r0+r1]
+	lea r0, [r0+2*r1]
+	SSE_LOAD_8P xmm2, xmm7, [r0]
+	SSE_LOAD_8P xmm3, xmm7, [r0+r1]
+	lea r0, [r0+2*r1]
+	SSE_LOAD_8P xmm4, xmm7, [r0]
+	SSE_LOAD_8P xmm5, xmm7, [r0+r1]
+
+.start:
+	FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+	dec r4
+	jz near .xx_exit
+
+	lea r0, [r0+2*r1]
+	SSE_LOAD_8P xmm6, xmm7, [r0]
+	FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
+	dec r4
+	jz near .xx_exit
+
+	lea r2, [r2+2*r3]
+	SSE_LOAD_8P xmm7, xmm0, [r0+r1]
+	FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
+	dec r4
+	jz near .xx_exit
+
+	lea r0, [r0+2*r1]
+	SSE_LOAD_8P xmm0, xmm1, [r0]
+	FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
+	dec r4
+	jz near .xx_exit
+
+	lea r2, [r2+2*r3]
+	SSE_LOAD_8P xmm1, xmm2, [r0+r1]
+	FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
+	dec r4
+	jz near .xx_exit
+
+	lea r0, [r0+2*r1]
+	SSE_LOAD_8P xmm2, xmm3, [r0]
+	FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
+	dec r4
+	jz near .xx_exit
+
+	lea r2, [r2+2*r3]
+	SSE_LOAD_8P xmm3, xmm4, [r0+r1]
+	FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
+	dec r4
+	jz near .xx_exit
+
+	lea r0, [r0+2*r1]
+	SSE_LOAD_8P xmm4, xmm5, [r0]
+	FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
+	dec r4
+	jz near .xx_exit
+
+	lea r2, [r2+2*r3]
+	SSE_LOAD_8P xmm5, xmm6, [r0+r1]
+	jmp near .start
+
+.xx_exit:
+	POP_XMM
+	LOAD_5_PARA_POP
+	ret
+
+;***********************************************************************
+; Code
+;***********************************************************************
+
+SECTION .text
+
+
+
+;***********************************************************************
+; void McHorVer02Height9Or17_sse2(	const uint8_t *pSrc,
+;                       int32_t iSrcStride,
+;                       uint8_t *pDst,
+;                       int32_t iDstStride,
+;						int32_t iWidth,
+;                       int32_t iHeight )
+;***********************************************************************
+WELS_EXTERN McHorVer02Height9Or17_sse2
+	%assign  push_num 0
+    LOAD_6_PARA
+    PUSH_XMM 8
+	SIGN_EXTENSION	r1, r1d
+	SIGN_EXTENSION	r3, r3d
+	SIGN_EXTENSION	r4, r4d
+	SIGN_EXTENSION	r5, r5d
+
+%ifndef X86_32
+	push r12
+	push r13
+	push r14
+	mov  r12, r0
+	mov	 r13, r2
+	mov	 r14, r5
+%endif
+
+	shr r4, 3
+	sub r0, r1
+	sub r0, r1
+
+.xloop:
+	WELS_Zero xmm7
+	SSE_LOAD_8P xmm0, xmm7, [r0]
+	SSE_LOAD_8P xmm1, xmm7, [r0+r1]
+	lea r0, [r0+2*r1]
+	SSE_LOAD_8P xmm2, xmm7, [r0]
+	SSE_LOAD_8P xmm3, xmm7, [r0+r1]
+	lea r0, [r0+2*r1]
+	SSE_LOAD_8P xmm4, xmm7, [r0]
+	SSE_LOAD_8P xmm5, xmm7, [r0+r1]
+
+	FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+	dec r5
+	lea r0, [r0+2*r1]
+	SSE_LOAD_8P xmm6, xmm7, [r0]
+	movdqa xmm0,xmm1
+	movdqa xmm1,xmm2
+	movdqa xmm2,xmm3
+	movdqa xmm3,xmm4
+	movdqa xmm4,xmm5
+	movdqa xmm5,xmm6
+	add r2, r3
+	sub r0, r1
+
+.start:
+	FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r0, [r0+2*r1]
+	SSE_LOAD_8P xmm6, xmm7, [r0]
+	FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r2, [r2+2*r3]
+	SSE_LOAD_8P xmm7, xmm0, [r0+r1]
+	FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r0, [r0+2*r1]
+	SSE_LOAD_8P xmm0, xmm1, [r0]
+	FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r2, [r2+2*r3]
+	SSE_LOAD_8P xmm1, xmm2, [r0+r1]
+	FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r0, [r0+2*r1]
+	SSE_LOAD_8P xmm2, xmm3, [r0]
+	FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r2, [r2+2*r3]
+	SSE_LOAD_8P xmm3, xmm4, [r0+r1]
+	FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r0, [r0+2*r1]
+	SSE_LOAD_8P xmm4, xmm5, [r0]
+	FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r2, [r2+2*r3]
+	SSE_LOAD_8P xmm5, xmm6, [r0+r1]
+	jmp near .start
+
+.x_loop_dec:
+	dec r4
+	jz  near .xx_exit
+%ifdef X86_32
+	mov	r0, arg1
+	mov r2, arg3
+	mov r5, arg6
+%else
+	mov r0, r12
+	mov r2, r13
+	mov r5, r14
+%endif
+	sub r0, r1
+	sub r0, r1
+	add r0, 8
+	add r2, 8
+	jmp near .xloop
+
+.xx_exit:
+%ifndef X86_32
+	pop r14
+	pop r13
+	pop r12
+%endif
+	POP_XMM
+	LOAD_6_PARA_POP
+	ret
+
+
+;***********************************************************************
+; void McHorVer20Width9Or17_sse2(		const uint8_t *pSrc,
+;                       int32_t iSrcStride,
+;						uint8_t *pDst,
+;						int32_t iDstStride,
+;						int32_t iWidth,
+;						int32_t iHeight
+;                      );
+;***********************************************************************
+WELS_EXTERN McHorVer20Width9Or17_sse2
+	%assign  push_num 0
+    LOAD_6_PARA
+    PUSH_XMM 8
+	SIGN_EXTENSION	r1, r1d
+	SIGN_EXTENSION	r3, r3d
+	SIGN_EXTENSION	r4, r4d
+	SIGN_EXTENSION	r5, r5d
+	sub r0, 2
+	pxor xmm7, xmm7
+
+	cmp r4, 9
+	jne near .width_17
+
+.yloop_width_9:
+	movq xmm0, [r0]
+	punpcklbw xmm0, xmm7
+	movq xmm1, [r0+5]
+	punpcklbw xmm1, xmm7
+	movq xmm2, [r0+1]
+	punpcklbw xmm2, xmm7
+	movq xmm3, [r0+4]
+	punpcklbw xmm3, xmm7
+	movq xmm4, [r0+2]
+	punpcklbw xmm4, xmm7
+	movq xmm5, [r0+3]
+	punpcklbw xmm5, xmm7
+
+	movdqa xmm7, xmm2
+	paddw   xmm7, xmm3
+	movdqa xmm6, xmm4
+	paddw   xmm6, xmm5
+	psllw xmm6, 2
+	psubw xmm6, xmm7
+	paddw xmm0, xmm1
+	paddw xmm0, xmm6
+	psllw xmm6, 2
+	paddw xmm0, xmm6
+	paddw xmm0, [h264_w0x10_1]
+	psraw  xmm0, 5
+	packuswb xmm0, xmm0
+	movd [r2], xmm0
+
+	pxor  xmm7, xmm7
+	movq xmm0, [r0+6]
+	punpcklbw xmm0, xmm7
+
+	paddw xmm4, xmm1
+	paddw xmm5, xmm3
+	psllw xmm5, 2
+	psubw xmm5, xmm4
+	paddw xmm2, xmm0
+	paddw xmm2, xmm5
+	psllw xmm5, 2
+	paddw xmm2, xmm5
+	paddw xmm2, [h264_w0x10_1]
+	psraw  xmm2, 5
+	packuswb xmm2, xmm2
+	movq [r2+1], xmm2
+
+	add r0, r1
+	add r2, r3
+	dec r5
+	jnz .yloop_width_9
+	POP_XMM
+	LOAD_6_PARA_POP
+	ret
+
+
+.width_17:
+.yloop_width_17:
+	movq xmm0, [r0]
+	punpcklbw xmm0, xmm7
+	movq xmm1, [r0+5]
+	punpcklbw xmm1, xmm7
+	movq xmm2, [r0+1]
+	punpcklbw xmm2, xmm7
+	movq xmm3, [r0+4]
+	punpcklbw xmm3, xmm7
+	movq xmm4, [r0+2]
+	punpcklbw xmm4, xmm7
+	movq xmm5, [r0+3]
+	punpcklbw xmm5, xmm7
+
+	paddw xmm2, xmm3
+	paddw xmm4, xmm5
+	psllw xmm4, 2
+	psubw xmm4, xmm2
+	paddw xmm0, xmm1
+	paddw xmm0, xmm4
+	psllw xmm4, 2
+	paddw xmm0, xmm4
+	paddw xmm0, [h264_w0x10_1]
+	psraw  xmm0, 5
+	packuswb xmm0, xmm0
+	movq [r2], xmm0
+
+	movq xmm0, [r0+8]
+	punpcklbw xmm0, xmm7
+	movq xmm1, [r0+5+8]
+	punpcklbw xmm1, xmm7
+	movq xmm2, [r0+1+8]
+	punpcklbw xmm2, xmm7
+	movq xmm3, [r0+4+8]
+	punpcklbw xmm3, xmm7
+	movq xmm4, [r0+2+8]
+	punpcklbw xmm4, xmm7
+	movq xmm5, [r0+3+8]
+	punpcklbw xmm5, xmm7
+
+	movdqa xmm7, xmm2
+	paddw   xmm7, xmm3
+	movdqa xmm6, xmm4
+	paddw   xmm6, xmm5
+	psllw xmm6, 2
+	psubw xmm6, xmm7
+	paddw xmm0, xmm1
+	paddw xmm0, xmm6
+	psllw xmm6, 2
+	paddw xmm0, xmm6
+	paddw xmm0, [h264_w0x10_1]
+	psraw  xmm0, 5
+	packuswb xmm0, xmm0
+	movd [r2+8], xmm0
+
+
+	pxor  xmm7, xmm7
+	movq xmm0, [r0+6+8]
+	punpcklbw xmm0, xmm7
+
+	paddw xmm4, xmm1
+	paddw xmm5, xmm3
+	psllw xmm5, 2
+	psubw xmm5, xmm4
+	paddw xmm2, xmm0
+	paddw xmm2, xmm5
+	psllw xmm5, 2
+	paddw xmm2, xmm5
+	paddw xmm2, [h264_w0x10_1]
+	psraw  xmm2, 5
+	packuswb xmm2, xmm2
+	movq [r2+9], xmm2
+	add r0, r1
+	add r2, r3
+	dec r5
+	jnz .yloop_width_17
+	POP_XMM
+	LOAD_6_PARA_POP
+	ret
+
+
+
+;***********************************************************************
+;void McHorVer22HorFirst_sse2
+;							(const uint8_t *pSrc,
+;							int32_t iSrcStride,
+;							uint8_t * pTap,
+;							int32_t iTapStride,
+;							int32_t iWidth,int32_t iHeight);
+;***********************************************************************
+WELS_EXTERN McHorVer22HorFirst_sse2
+	%assign  push_num 0
+    LOAD_6_PARA
+    PUSH_XMM 8
+	SIGN_EXTENSION	r1, r1d
+	SIGN_EXTENSION	r3, r3d
+	SIGN_EXTENSION	r4, r4d
+	SIGN_EXTENSION	r5, r5d
+	pxor xmm7, xmm7
+	sub r0, r1				;;;;;;;;need more 5 lines.
+	sub r0, r1
+
+	cmp r4, 9
+	jne near .width_17
+
+.yloop_width_9:
+	movq xmm0, [r0]
+	punpcklbw xmm0, xmm7
+	movq xmm1, [r0+5]
+	punpcklbw xmm1, xmm7
+	movq xmm2, [r0+1]
+	punpcklbw xmm2, xmm7
+	movq xmm3, [r0+4]
+	punpcklbw xmm3, xmm7
+	movq xmm4, [r0+2]
+	punpcklbw xmm4, xmm7
+	movq xmm5, [r0+3]
+	punpcklbw xmm5, xmm7
+
+	movdqa xmm7, xmm2
+	paddw   xmm7, xmm3
+	movdqa xmm6, xmm4
+	paddw   xmm6, xmm5
+	psllw xmm6, 2
+	psubw xmm6, xmm7
+	paddw xmm0, xmm1
+	paddw xmm0, xmm6
+	psllw xmm6, 2
+	paddw xmm0, xmm6
+	movd [r2], xmm0
+
+	pxor  xmm7, xmm7
+	movq xmm0, [r0+6]
+	punpcklbw xmm0, xmm7
+
+	paddw xmm4, xmm1
+	paddw xmm5, xmm3
+	psllw xmm5, 2
+	psubw xmm5, xmm4
+	paddw xmm2, xmm0
+	paddw xmm2, xmm5
+	psllw xmm5, 2
+	paddw xmm2, xmm5
+	movq [r2+2], xmm2
+	movhps [r2+2+8], xmm2
+
+	add r0, r1
+	add r2, r3
+	dec r5
+	jnz .yloop_width_9
+	POP_XMM
+	LOAD_6_PARA_POP
+	ret
+
+
+.width_17:
+.yloop_width_17:
+	movq xmm0, [r0]
+	punpcklbw xmm0, xmm7
+	movq xmm1, [r0+5]
+	punpcklbw xmm1, xmm7
+	movq xmm2, [r0+1]
+	punpcklbw xmm2, xmm7
+	movq xmm3, [r0+4]
+	punpcklbw xmm3, xmm7
+	movq xmm4, [r0+2]
+	punpcklbw xmm4, xmm7
+	movq xmm5, [r0+3]
+	punpcklbw xmm5, xmm7
+
+	paddw xmm2, xmm3
+	paddw xmm4, xmm5
+	psllw xmm4, 2
+	psubw xmm4, xmm2
+	paddw xmm0, xmm1
+	paddw xmm0, xmm4
+	psllw xmm4, 2
+	paddw xmm0, xmm4
+	movdqa [r2], xmm0
+
+	movq xmm0, [r0+8]
+	punpcklbw xmm0, xmm7
+	movq xmm1, [r0+5+8]
+	punpcklbw xmm1, xmm7
+	movq xmm2, [r0+1+8]
+	punpcklbw xmm2, xmm7
+	movq xmm3, [r0+4+8]
+	punpcklbw xmm3, xmm7
+	movq xmm4, [r0+2+8]
+	punpcklbw xmm4, xmm7
+	movq xmm5, [r0+3+8]
+	punpcklbw xmm5, xmm7
+
+	movdqa xmm7, xmm2
+	paddw   xmm7, xmm3
+	movdqa xmm6, xmm4
+	paddw   xmm6, xmm5
+	psllw xmm6, 2
+	psubw xmm6, xmm7
+	paddw xmm0, xmm1
+	paddw xmm0, xmm6
+	psllw xmm6, 2
+	paddw xmm0, xmm6
+	movd [r2+16], xmm0
+
+
+	pxor  xmm7, xmm7
+	movq xmm0, [r0+6+8]
+	punpcklbw xmm0, xmm7
+
+	paddw xmm4, xmm1
+	paddw xmm5, xmm3
+	psllw xmm5, 2
+	psubw xmm5, xmm4
+	paddw xmm2, xmm0
+	paddw xmm2, xmm5
+	psllw xmm5, 2
+	paddw xmm2, xmm5
+	movq [r2+18], xmm2
+	movhps [r2+18+8], xmm2
+
+	add r0, r1
+	add r2, r3
+	dec r5
+	jnz .yloop_width_17
+	POP_XMM
+	LOAD_6_PARA_POP
+	ret
+
+
+%macro FILTER_VER 9
+	paddw  %1, %6
+	movdqa %7, %2
+	movdqa %8, %3
+
+
+	paddw %7, %5
+	paddw %8, %4
+
+	psubw  %1, %7
+	psraw   %1, 2
+	paddw  %1, %8
+	psubw  %1, %7
+	psraw   %1, 2
+	paddw  %8, %1
+	paddw  %8, [h264_mc_hc_32]
+	psraw   %8, 6
+	packuswb %8, %8
+	movq %9, %8
+%endmacro
+;***********************************************************************
+;void McHorVer22Width8VerLastAlign_sse2(
+;											const uint8_t *pTap,
+;											int32_t iTapStride,
+;											uint8_t * pDst,
+;											int32_t iDstStride,
+;											int32_t iWidth,
+;											int32_t iHeight);
+;***********************************************************************
+
+WELS_EXTERN McHorVer22Width8VerLastAlign_sse2
+	%assign  push_num 0
+    LOAD_6_PARA
+    PUSH_XMM 8
+	SIGN_EXTENSION	r1, r1d
+	SIGN_EXTENSION	r3, r3d
+	SIGN_EXTENSION	r4, r4d
+	SIGN_EXTENSION	r5, r5d
+%ifndef X86_32
+	push r12
+	push r13
+	push r14
+	mov  r12, r0
+	mov	 r13, r2
+	mov	 r14, r5
+%endif
+
+	shr r4, 3
+
+.width_loop:
+	movdqa xmm0, [r0]
+	movdqa xmm1, [r0+r1]
+	lea r0, [r0+2*r1]
+	movdqa xmm2, [r0]
+	movdqa xmm3, [r0+r1]
+	lea r0, [r0+2*r1]
+	movdqa xmm4, [r0]
+	movdqa xmm5, [r0+r1]
+
+	FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+	dec r5
+	lea r0, [r0+2*r1]
+	movdqa xmm6, [r0]
+
+	movdqa xmm0, xmm1
+	movdqa xmm1, xmm2
+	movdqa xmm2, xmm3
+	movdqa xmm3, xmm4
+	movdqa xmm4, xmm5
+	movdqa xmm5, xmm6
+
+	add r2, r3
+	sub r0, r1
+
+.start:
+	FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r0, [r0+2*r1]
+	movdqa xmm6, [r0]
+	FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[r2+r3]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r2, [r2+2*r3]
+	movdqa xmm7, [r0+r1]
+	FILTER_VER  xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r0, [r0+2*r1]
+	movdqa xmm0, [r0]
+	FILTER_VER  xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[r2+r3]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r2, [r2+2*r3]
+	movdqa xmm1, [r0+r1]
+	FILTER_VER  xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[r2]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r0, [r0+2*r1]
+	movdqa xmm2, [r0]
+	FILTER_VER  xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[r2+r3]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r2, [r2+2*r3]
+	movdqa xmm3, [r0+r1]
+	FILTER_VER  xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[r2]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r0, [r0+2*r1]
+	movdqa xmm4, [r0]
+	FILTER_VER  xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [r2+r3]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r2, [r2+2*r3]
+	movdqa xmm5, [r0+r1]
+	jmp near .start
+
+.x_loop_dec:
+	dec r4
+	jz near .exit
+%ifdef X86_32
+	mov	r0, arg1
+	mov r2, arg3
+	mov r5, arg6
+%else
+	mov r0, r12
+	mov r2, r13
+	mov r5, r14
+%endif
+	add r0, 16
+	add r2, 8
+	jmp .width_loop
+
+.exit:
+%ifndef X86_32
+	pop r14
+	pop r13
+	pop r12
+%endif
+	POP_XMM
+	LOAD_6_PARA_POP
+	ret
+
+;***********************************************************************
+;void McHorVer22Width8VerLastUnAlign_sse2(
+;											const uint8_t *pTap,
+;											int32_t iTapStride,
+;											uint8_t * pDst,
+;											int32_t iDstStride,
+;											int32_t iWidth,
+;											int32_t iHeight);
+;***********************************************************************
+
+WELS_EXTERN McHorVer22Width8VerLastUnAlign_sse2
+	%assign  push_num 0
+    LOAD_6_PARA
+    PUSH_XMM 8
+	SIGN_EXTENSION	r1, r1d
+	SIGN_EXTENSION	r3, r3d
+	SIGN_EXTENSION	r4, r4d
+	SIGN_EXTENSION	r5, r5d
+%ifndef X86_32
+	push r12
+	push r13
+	push r14
+	mov  r12, r0
+	mov	 r13, r2
+	mov	 r14, r5
+%endif
+	shr r4, 3
+
+.width_loop:
+	movdqu xmm0, [r0]
+	movdqu xmm1, [r0+r1]
+	lea r0, [r0+2*r1]
+	movdqu xmm2, [r0]
+	movdqu xmm3, [r0+r1]
+	lea r0, [r0+2*r1]
+	movdqu xmm4, [r0]
+	movdqu xmm5, [r0+r1]
+
+	FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+	dec r5
+	lea r0, [r0+2*r1]
+	movdqu xmm6, [r0]
+
+	movdqa xmm0, xmm1
+	movdqa xmm1, xmm2
+	movdqa xmm2, xmm3
+	movdqa xmm3, xmm4
+	movdqa xmm4, xmm5
+	movdqa xmm5, xmm6
+
+	add r2, r3
+	sub r0, r1
+
+.start:
+	FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r0, [r0+2*r1]
+	movdqu xmm6, [r0]
+	FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[r2+r3]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r2, [r2+2*r3]
+	movdqu xmm7, [r0+r1]
+	FILTER_VER  xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r0, [r0+2*r1]
+	movdqu xmm0, [r0]
+	FILTER_VER  xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[r2+r3]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r2, [r2+2*r3]
+	movdqu xmm1, [r0+r1]
+	FILTER_VER  xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[r2]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r0, [r0+2*r1]
+	movdqu xmm2, [r0]
+	FILTER_VER  xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[r2+r3]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r2, [r2+2*r3]
+	movdqu xmm3, [r0+r1]
+	FILTER_VER  xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[r2]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r0, [r0+2*r1]
+	movdqu xmm4, [r0]
+	FILTER_VER  xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [r2+r3]
+	dec r5
+	jz near .x_loop_dec
+
+	lea r2, [r2+2*r3]
+	movdqu xmm5, [r0+r1]
+	jmp near .start
+
+.x_loop_dec:
+	dec r4
+	jz near .exit
+%ifdef X86_32
+	mov	r0, arg1
+	mov r2, arg3
+	mov r5, arg6
+%else
+	mov r0, r12
+	mov r2, r13
+	mov r5, r14
+%endif
+	add r0, 16
+	add r2, 8
+	jmp .width_loop
+
+.exit:
+%ifndef X86_32
+	pop r14
+	pop r13
+	pop r12
+%endif
+	POP_XMM
+	LOAD_6_PARA_POP
+	ret
--- /dev/null
+++ b/codec/common/x86/satd_sad.asm
@@ -1,0 +1,2184 @@
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  satd_sad.asm
+;*
+;*  Abstract
+;*      WelsSampleSatd4x4_sse2
+;*      WelsSampleSatd8x8_sse2
+;*      WelsSampleSatd16x8_sse2
+;*      WelsSampleSatd8x16_sse2
+;*      WelsSampleSatd16x16_sse2
+;*
+;*      WelsSampleSad16x8_sse2
+;*      WelsSampleSad16x16_sse2
+;*
+;*  History
+;*      8/5/2009 Created
+;*     24/9/2009 modified
+;*
+;*
+;*************************************************************************/
+
+%include "asm_inc.asm"
+
+;***********************************************************************
+; Data
+;***********************************************************************
+SECTION .rodata align=16
+
+align 16
+HSumSubDB1:   db 1,1,1,1,1,1,1,1,1,-1,1,-1,1,-1,1,-1
+align 16
+HSumSubDW1:   dw 1,-1,1,-1,1,-1,1,-1
+align 16
+PDW1:  dw 1,1,1,1,1,1,1,1
+align 16
+PDQ2:  dw 2,0,0,0,2,0,0,0
+align 16
+HSwapSumSubDB1:   times 2 db 1, 1, 1, 1, 1, -1, 1, -1
+
+;***********************************************************************
+; Code
+;***********************************************************************
+SECTION .text
+
+;***********************************************************************
+;
+;Pixel_satd_wxh_sse2 BEGIN
+;
+;***********************************************************************
+%macro MMX_DW_1_2REG 2
+      pxor %1, %1
+      pcmpeqw %2, %2
+      psubw %1, %2
+%endmacro
+
+%macro  SSE2_SumWHorizon1 2
+	movdqa      %2, %1
+	psrldq      %2, 8
+	paddusw     %1, %2
+	movdqa      %2, %1
+	psrldq      %2, 4
+	paddusw     %1, %2
+	movdqa      %2, %1
+	psrldq      %2, 2
+	paddusw     %1, %2
+%endmacro
+
+%macro SSE2_HDMTwo4x4 5 ;in: xmm1,xmm2,xmm3,xmm4  pOut: xmm4,xmm2,xmm1,xmm3
+   SSE2_SumSub %1, %2, %5
+   SSE2_SumSub %3, %4, %5
+   SSE2_SumSub %2, %4, %5
+   SSE2_SumSub %1, %3, %5
+%endmacro
+
+%macro SSE2_SumAbs4 7
+	WELS_AbsW %1, %3
+	WELS_AbsW %2, %3
+	WELS_AbsW %4, %6
+	WELS_AbsW %5, %6
+	paddusw       %1, %2
+	paddusw       %4, %5
+	paddusw       %7, %1
+	paddusw       %7, %4
+%endmacro
+
+%macro  SSE2_SumWHorizon 3
+	movhlps		%2, %1			; x2 = xx xx xx xx d7 d6 d5 d4
+	paddw		%1, %2			; x1 = xx xx xx xx d37 d26 d15 d04
+	punpcklwd	%1, %3			; x1 =  d37  d26 d15 d04
+	movhlps		%2, %1			; x2 = xxxx xxxx d37 d26
+	paddd		%1, %2			; x1 = xxxx xxxx d1357 d0246
+	pshuflw		%2, %1, 0x4e	; x2 = xxxx xxxx d0246 d1357
+	paddd		%1, %2			; x1 = xxxx xxxx xxxx  d01234567
+%endmacro
+
+%macro SSE2_GetSatd8x8 0
+	SSE2_LoadDiff8P    xmm0,xmm4,xmm7,[r0],[r2]
+	SSE2_LoadDiff8P    xmm1,xmm5,xmm7,[r0+r1],[r2+r3]
+	lea                 r0, [r0+2*r1]
+	lea                 r2, [r2+2*r3]
+	SSE2_LoadDiff8P    xmm2,xmm4,xmm7,[r0],[r2]
+	SSE2_LoadDiff8P    xmm3,xmm5,xmm7,[r0+r1],[r2+r3]
+
+	SSE2_HDMTwo4x4       xmm0,xmm1,xmm2,xmm3,xmm4
+	SSE2_TransTwo4x4W     xmm3,xmm1,xmm0,xmm2,xmm4
+	SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm4,xmm5
+	SSE2_SumAbs4         xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
+
+	lea					r0,    [r0+2*r1]
+    lea					r2,    [r2+2*r3]
+	SSE2_LoadDiff8P    xmm0,xmm4,xmm7,[r0],[r2]
+	SSE2_LoadDiff8P    xmm1,xmm5,xmm7,[r0+r1],[r2+r3]
+	lea                 r0, [r0+2*r1]
+	lea                 r2, [r2+2*r3]
+	SSE2_LoadDiff8P    xmm2,xmm4,xmm7,[r0],[r2]
+	SSE2_LoadDiff8P    xmm3,xmm5,xmm7,[r0+r1],[r2+r3]
+
+	SSE2_HDMTwo4x4       xmm0,xmm1,xmm2,xmm3,xmm4
+	SSE2_TransTwo4x4W     xmm3,xmm1,xmm0,xmm2,xmm4
+	SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm4,xmm5
+	SSE2_SumAbs4         xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
+%endmacro
+
+;***********************************************************************
+;
+;int32_t WelsSampleSatd4x4_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSatd4x4_sse2
+	%assign  push_num 0
+	LOAD_4_PARA
+	PUSH_XMM 8
+	SIGN_EXTENSION r1, r1d
+	SIGN_EXTENSION r3, r3d
+    movd      xmm0, [r0]
+    movd      xmm1, [r0+r1]
+    lea       r0 , [r0+2*r1]
+    movd      xmm2, [r0]
+    movd      xmm3, [r0+r1]
+    punpckldq xmm0, xmm2
+    punpckldq xmm1, xmm3
+
+    movd      xmm4, [r2]
+    movd      xmm5, [r2+r3]
+    lea       r2 , [r2+2*r3]
+    movd      xmm6, [r2]
+    movd      xmm7, [r2+r3]
+    punpckldq xmm4, xmm6
+    punpckldq xmm5, xmm7
+
+    pxor      xmm6, xmm6
+    punpcklbw xmm0, xmm6
+    punpcklbw xmm1, xmm6
+    punpcklbw xmm4, xmm6
+    punpcklbw xmm5, xmm6
+
+    psubw     xmm0, xmm4
+    psubw     xmm1, xmm5
+
+    movdqa    xmm2, xmm0
+    paddw     xmm0, xmm1
+    psubw     xmm2, xmm1
+    SSE2_XSawp qdq, xmm0, xmm2, xmm3
+
+    movdqa     xmm4, xmm0
+    paddw      xmm0, xmm3
+    psubw      xmm4, xmm3
+
+    movdqa         xmm2, xmm0
+    punpcklwd      xmm0, xmm4
+    punpckhwd      xmm4, xmm2
+
+	SSE2_XSawp     dq,  xmm0, xmm4, xmm3
+	SSE2_XSawp     qdq, xmm0, xmm3, xmm5
+
+    movdqa         xmm7, xmm0
+    paddw          xmm0, xmm5
+    psubw          xmm7, xmm5
+
+	SSE2_XSawp     qdq,  xmm0, xmm7, xmm1
+
+    movdqa         xmm2, xmm0
+    paddw          xmm0, xmm1
+    psubw          xmm2, xmm1
+
+    WELS_AbsW  xmm0, xmm3
+    paddusw        xmm6, xmm0
+	WELS_AbsW  xmm2, xmm4
+    paddusw        xmm6, xmm2
+    SSE2_SumWHorizon1  xmm6, xmm4
+	movd           retrd,  xmm6
+    and            retrd,  0xffff
+    shr            retrd,  1
+	POP_XMM
+	LOAD_4_PARA_POP
+	ret
+
+ ;***********************************************************************
+ ;
+ ;int32_t WelsSampleSatd8x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
+ ;
+ ;***********************************************************************
+WELS_EXTERN WelsSampleSatd8x8_sse2
+	%assign  push_num 0
+	LOAD_4_PARA
+	PUSH_XMM 8
+	SIGN_EXTENSION r1, r1d
+	SIGN_EXTENSION r3, r3d
+	pxor   xmm6,   xmm6
+    pxor   xmm7,   xmm7
+    SSE2_GetSatd8x8
+    psrlw   xmm6,  1
+	SSE2_SumWHorizon   xmm6,xmm4,xmm7
+	movd    retrd,   xmm6
+	POP_XMM
+	LOAD_4_PARA_POP
+	ret
+
+ ;***********************************************************************
+ ;
+ ;int32_t WelsSampleSatd8x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
+ ;
+ ;***********************************************************************
+WELS_EXTERN WelsSampleSatd8x16_sse2
+	 %assign  push_num 0
+	 LOAD_4_PARA
+	 PUSH_XMM 8
+	 SIGN_EXTENSION r1, r1d
+	 SIGN_EXTENSION r3, r3d
+	 pxor   xmm6,   xmm6
+     pxor   xmm7,   xmm7
+
+	 SSE2_GetSatd8x8
+     lea    r0,    [r0+2*r1]
+     lea    r2,    [r2+2*r3]
+	 SSE2_GetSatd8x8
+
+	 psrlw   xmm6,  1
+	 SSE2_SumWHorizon   xmm6,xmm4,xmm7
+	 movd    retrd,   xmm6
+	 POP_XMM
+	 LOAD_4_PARA_POP
+	 ret
+
+;***********************************************************************
+;
+;int32_t WelsSampleSatd16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSatd16x8_sse2
+	%assign  push_num 0
+	LOAD_4_PARA
+	PUSH_XMM 8
+	SIGN_EXTENSION r1, r1d
+	SIGN_EXTENSION r3, r3d
+	push r0
+	push r2
+	pxor   xmm6,   xmm6
+    pxor   xmm7,   xmm7
+
+	SSE2_GetSatd8x8
+
+	pop r2
+	pop r0
+    add    r0,    8
+    add    r2,    8
+	SSE2_GetSatd8x8
+
+	psrlw   xmm6,  1
+	SSE2_SumWHorizon   xmm6,xmm4,xmm7
+	movd    retrd,   xmm6
+	POP_XMM
+	LOAD_4_PARA_POP
+	ret
+
+;***********************************************************************
+;
+;int32_t WelsSampleSatd16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSatd16x16_sse2
+	%assign  push_num 0
+	LOAD_4_PARA
+	PUSH_XMM 8
+	SIGN_EXTENSION r1, r1d
+	SIGN_EXTENSION r3, r3d
+	push r0
+	push r2
+	pxor   xmm6,   xmm6
+    pxor   xmm7,   xmm7
+
+	SSE2_GetSatd8x8
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	SSE2_GetSatd8x8
+
+	pop r2
+	pop r0
+	add    r0,    8
+	add    r2,    8
+
+	SSE2_GetSatd8x8
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	SSE2_GetSatd8x8
+
+ ; each column sum of SATD is necessarily even, so we don't lose any precision by shifting first.
+    psrlw   xmm6,  1
+	SSE2_SumWHorizon   xmm6,xmm4,xmm7
+	movd    retrd,   xmm6
+	POP_XMM
+	LOAD_4_PARA_POP
+	ret
+
+;***********************************************************************
+;
+;Pixel_satd_wxh_sse2 END
+;
+;***********************************************************************
+
+;***********************************************************************
+;
+;Pixel_satd_intra_sse2 BEGIN
+;
+;***********************************************************************
+
+%macro SSE41_I16x16Get8WSumSub 3 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3
+	pmaddubsw    %1, xmm5
+	movdqa       %2, %1
+	pmaddwd      %1, xmm7
+	pmaddwd      %2, xmm6
+	movdqa       %3, %1
+	punpckldq    %1, %2
+	punpckhdq    %2, %3
+	movdqa       %3, %1
+	punpcklqdq   %1, %2
+	punpckhqdq   %3, %2
+	paddd        xmm4, %1 ;for dc
+	paddd        xmm4, %3 ;for dc
+	packssdw     %1, %3
+	psllw        %1, 2
+%endmacro
+%macro SSE41_ChromaGet8WSumSub 4 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3 : %4 tempsse2
+	pmaddubsw    %1, xmm5
+	movdqa       %2, %1
+	pmaddwd      %1, xmm7
+	pmaddwd      %2, xmm6
+	movdqa       %3, %1
+	punpckldq    %1, %2
+	punpckhdq    %2, %3
+	movdqa       %3, %1
+	punpcklqdq   %1, %2
+	punpckhqdq   %3, %2
+;    paddd        xmm4, %1 ;for dc
+;	 paddd        xmm4, %3 ;for dc
+	movdqa       %4, %1
+	punpcklqdq   %4, %3
+	packssdw     %1, %3
+	psllw        %1, 2
+%endmacro
+
+%macro SSE41_GetX38x4SatdDec 0
+	pxor        xmm7,   xmm7
+	movq        xmm0,   [eax]
+	movq        xmm1,   [eax+ebx]
+	lea         eax,    [eax+2*ebx]
+	movq        xmm2,   [eax]
+	movq        xmm3,   [eax+ebx]
+	lea         eax,    [eax+2*ebx]
+	punpcklbw   xmm0,   xmm7
+	punpcklbw   xmm1,   xmm7
+	punpcklbw   xmm2,   xmm7
+	punpcklbw   xmm3,   xmm7
+	SSE2_HDMTwo4x4       xmm0,xmm1,xmm2,xmm3,xmm7
+	SSE2_TransTwo4x4W     xmm3,xmm1,xmm0,xmm2,xmm7
+	SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm7,xmm0 ;pOut xmm7,xmm1,xmm3,xmm2
+	;doesn't need another transpose
+%endmacro
+%macro SSE41_GetX38x4SatdV 2
+	pxor        xmm0,   xmm0
+	pinsrw      xmm0,   word[esi+%2],   0
+	pinsrw      xmm0,   word[esi+%2+8], 4
+	psubsw      xmm0,   xmm7
+	pabsw       xmm0,   xmm0
+	paddw       xmm4,   xmm0
+	pxor        xmm0,   xmm0
+	pinsrw      xmm0,   word[esi+%2+2],  0
+	pinsrw      xmm0,   word[esi+%2+10], 4
+	psubsw      xmm0,   xmm1
+	pabsw       xmm0,   xmm0
+	paddw       xmm4,   xmm0
+	pxor        xmm0,   xmm0
+	pinsrw      xmm0,   word[esi+%2+4],  0
+	pinsrw      xmm0,   word[esi+%2+12], 4
+	psubsw      xmm0,   xmm3
+	pabsw       xmm0,   xmm0
+	paddw       xmm4,   xmm0
+	pxor        xmm0,   xmm0
+	pinsrw      xmm0,   word[esi+%2+6],  0
+	pinsrw      xmm0,   word[esi+%2+14], 4
+	psubsw      xmm0,   xmm2
+	pabsw       xmm0,   xmm0
+	paddw       xmm4,   xmm0
+%endmacro
+%macro SSE41_GetX38x4SatdH  3
+	movq        xmm0,   [esi+%3+8*%1]
+	punpcklqdq  xmm0,   xmm0
+	psubsw      xmm0,   xmm7
+	pabsw       xmm0,   xmm0
+	paddw       xmm5,   xmm0
+	pabsw       xmm1,   xmm1
+	pabsw       xmm2,   xmm2
+	pabsw       xmm3,   xmm3
+	paddw       xmm2,   xmm1;for DC
+	paddw       xmm2,   xmm3;for DC
+	paddw       xmm5,   xmm2
+%endmacro
+%macro SSE41_I16X16GetX38x4SatdDC 0
+	pxor        xmm0,   xmm0
+	movq2dq     xmm0,   mm4
+	punpcklqdq  xmm0,   xmm0
+	psubsw      xmm0,   xmm7
+	pabsw       xmm0,   xmm0
+	paddw       xmm6,   xmm0
+	paddw       xmm6,   xmm2
+%endmacro
+%macro SSE41_ChromaGetX38x4SatdDC 1
+	shl         %1,     4
+	movdqa      xmm0,   [esi+32+%1]
+	psubsw      xmm0,   xmm7
+	pabsw       xmm0,   xmm0
+	paddw       xmm6,   xmm0
+	paddw       xmm6,   xmm2
+%endmacro
+%macro SSE41_I16x16GetX38x4Satd 2
+	SSE41_GetX38x4SatdDec
+	SSE41_GetX38x4SatdV   %1, %2
+	SSE41_GetX38x4SatdH   %1, %2, 32
+	SSE41_I16X16GetX38x4SatdDC
+%endmacro
+%macro SSE41_ChromaGetX38x4Satd 2
+	SSE41_GetX38x4SatdDec
+	SSE41_GetX38x4SatdV   %1, %2
+	SSE41_GetX38x4SatdH   %1, %2, 16
+	SSE41_ChromaGetX38x4SatdDC %1
+%endmacro
+%macro SSE41_HSum8W 3
+	pmaddwd     %1, %2
+	movhlps     %3, %1
+	paddd       %1, %3
+	pshuflw     %3, %1,0Eh
+	paddd       %1, %3
+%endmacro
+
+
+%ifdef X86_32
+WELS_EXTERN WelsIntra16x16Combined3Satd_sse41
+	push   ebx
+	push   esi
+	push   edi
+	mov    ecx,    [esp+16]
+	mov    edx,    [esp+20]
+	mov    eax,    [esp+24]
+	mov    ebx,    [esp+28]
+	mov    esi,    [esp+40] ;temp_satd
+	pxor        xmm4,   xmm4
+	movdqa      xmm5,   [HSumSubDB1]
+	movdqa      xmm6,   [HSumSubDW1]
+	movdqa      xmm7,   [PDW1]
+	sub         ecx,    edx
+	movdqu 		xmm0,   [ecx]
+	movhlps		xmm1,   xmm0
+	punpcklqdq  xmm0,   xmm0
+	punpcklqdq  xmm1,   xmm1
+	SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
+	SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
+	movdqa      [esi],  xmm0 ;V
+	movdqa      [esi+16], xmm1
+	add         ecx,    edx
+	pinsrb      xmm0,   byte[ecx-1], 0
+	pinsrb      xmm0,   byte[ecx+edx-1], 1
+	lea         ecx,    [ecx+2*edx]
+	pinsrb      xmm0,   byte[ecx-1],     2
+	pinsrb      xmm0,   byte[ecx+edx-1], 3
+	lea         ecx,    [ecx+2*edx]
+	pinsrb      xmm0,   byte[ecx-1],     4
+	pinsrb      xmm0,   byte[ecx+edx-1], 5
+	lea         ecx,    [ecx+2*edx]
+	pinsrb      xmm0,   byte[ecx-1],     6
+	pinsrb      xmm0,   byte[ecx+edx-1], 7
+	lea         ecx,    [ecx+2*edx]
+	pinsrb      xmm0,   byte[ecx-1],     8
+	pinsrb      xmm0,   byte[ecx+edx-1], 9
+	lea         ecx,    [ecx+2*edx]
+	pinsrb      xmm0,   byte[ecx-1],     10
+	pinsrb      xmm0,   byte[ecx+edx-1], 11
+	lea         ecx,    [ecx+2*edx]
+	pinsrb      xmm0,   byte[ecx-1],     12
+	pinsrb      xmm0,   byte[ecx+edx-1], 13
+	lea         ecx,    [ecx+2*edx]
+	pinsrb      xmm0,   byte[ecx-1],     14
+	pinsrb      xmm0,   byte[ecx+edx-1], 15
+	movhlps		xmm1,   xmm0
+	punpcklqdq  xmm0,   xmm0
+	punpcklqdq  xmm1,   xmm1
+	SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
+	SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
+	movdqa      [esi+32], xmm0 ;H
+	movdqa      [esi+48], xmm1
+	movd        ecx,    xmm4 ;dc
+	add         ecx,    16   ;(sum+16)
+	shr         ecx,    5    ;((sum+16)>>5)
+	shl         ecx,    4    ;
+	movd        mm4,    ecx  ; mm4 copy DC
+	pxor        xmm4,   xmm4 ;V
+	pxor        xmm5,   xmm5 ;H
+	pxor        xmm6,   xmm6 ;DC
+	mov         ecx,    0
+	mov         edi,    0
+.loop16x16_get_satd:
+.loopStart1:
+	SSE41_I16x16GetX38x4Satd ecx, edi
+	inc          ecx
+	cmp         ecx, 4
+	jl          .loopStart1
+	cmp         edi, 16
+	je          .loop16x16_get_satd_end
+	mov         eax, [esp+24]
+	add         eax, 8
+	mov         ecx, 0
+	add         edi, 16
+	jmp         .loop16x16_get_satd
+ .loop16x16_get_satd_end:
+	MMX_DW_1_2REG    xmm0, xmm1
+	psrlw       xmm4, 1 ;/2
+	psrlw       xmm5, 1 ;/2
+	psrlw       xmm6, 1 ;/2
+	SSE41_HSum8W     xmm4, xmm0, xmm1
+	SSE41_HSum8W     xmm5, xmm0, xmm1
+	SSE41_HSum8W     xmm6, xmm0, xmm1
+
+	; comparing order: DC H V
+	movd      ebx, xmm6 ;DC
+	movd      edi, xmm5 ;H
+	movd      ecx, xmm4 ;V
+	mov      edx, [esp+36]
+	shl       edx, 1
+	add       edi, edx
+	add       ebx, edx
+	mov       edx, [esp+32]
+	cmp       ebx, edi
+	jge near   not_dc_16x16
+	cmp        ebx, ecx
+	jge near   not_dc_h_16x16
+
+	; for DC mode
+	mov       dword[edx], 2;I16_PRED_DC
+	mov       eax, ebx
+	jmp near return_satd_intra_16x16_x3
+not_dc_16x16:
+	; for H mode
+	cmp       edi, ecx
+	jge near   not_dc_h_16x16
+	mov       dword[edx], 1;I16_PRED_H
+	mov       eax, edi
+	jmp near return_satd_intra_16x16_x3
+not_dc_h_16x16:
+	; for V mode
+	mov       dword[edx], 0;I16_PRED_V
+	mov       eax, ecx
+return_satd_intra_16x16_x3:
+	WELSEMMS
+	pop         edi
+	pop         esi
+	pop         ebx
+ret
+
+; Compute the three chroma 8x8 intra-prediction SATD costs (V / H / DC) for
+; one plane.  On entry: ecx = current reconstructed sample ptr, edx = stride,
+; esi = scratch buffer.  Layout written to the scratch buffer:
+;   [esi]     vertical (top-row) Hadamard sum/sub terms
+;   [esi+16]  horizontal (left-column, gathered byte-by-byte via pinsrb) terms
+;   [esi+32]  per-4x4-quadrant DC values: (sum+2)>>2 and (sum1+sum2+4)>>3,
+;   [esi+48]  pre-scaled by 16 to stay in the SATD domain
+; The trailing loop accumulates the V/H/DC SATD of the two 8x4 halves into
+; xmm4/xmm5/xmm6 via SSE41_ChromaGetX38x4Satd.
+; NOTE(review): label loop_chroma_satdx3_cb_cr is not %%-local, so this macro
+; can only be expanded once per translation unit - confirm before reusing.
+%macro SSE41_ChromaGetX38x8Satd 0
+	movdqa      xmm5,   [HSumSubDB1]
+	movdqa      xmm6,   [HSumSubDW1]
+	movdqa      xmm7,   [PDW1]
+	sub         ecx,    edx
+	movq 		xmm0,   [ecx]
+	punpcklqdq  xmm0,   xmm0
+	SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm4
+	movdqa      [esi],  xmm0 ;V
+	add         ecx,    edx
+	pinsrb      xmm0,   byte[ecx-1], 0
+	pinsrb      xmm0,   byte[ecx+edx-1], 1
+	lea         ecx,    [ecx+2*edx]
+	pinsrb      xmm0,   byte[ecx-1],     2
+	pinsrb      xmm0,   byte[ecx+edx-1], 3
+	lea         ecx,    [ecx+2*edx]
+	pinsrb      xmm0,   byte[ecx-1],     4
+	pinsrb      xmm0,   byte[ecx+edx-1], 5
+	lea         ecx,    [ecx+2*edx]
+	pinsrb      xmm0,   byte[ecx-1],     6
+	pinsrb      xmm0,   byte[ecx+edx-1], 7
+	punpcklqdq  xmm0,   xmm0
+	SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm1
+	movdqa      [esi+16], xmm0 ;H
+;(sum+2)>>2
+	movdqa      xmm6,   [PDQ2]
+	movdqa      xmm5,   xmm4
+	punpckhqdq  xmm5,   xmm1
+	paddd       xmm5,   xmm6
+	psrld       xmm5,   2
+;(sum1+sum2+4)>>3
+	paddd       xmm6,   xmm6
+	paddd       xmm4,   xmm1
+	paddd       xmm4,   xmm6
+	psrld       xmm4,   3
+;satd *16
+	pslld       xmm5,   4
+	pslld       xmm4,   4
+;temp satd
+	movdqa      xmm6,   xmm4
+	punpcklqdq  xmm4,   xmm5
+	psllq       xmm4,   32
+	psrlq       xmm4,   32
+	movdqa      [esi+32], xmm4
+	punpckhqdq  xmm5,   xmm6
+	psllq       xmm5,   32
+	psrlq       xmm5,   32
+	movdqa      [esi+48], xmm5
+
+	pxor        xmm4,   xmm4 ;V
+	pxor        xmm5,   xmm5 ;H
+	pxor        xmm6,   xmm6 ;DC
+	mov         ecx,    0
+loop_chroma_satdx3_cb_cr:
+	SSE41_ChromaGetX38x4Satd ecx, 0
+	inc             ecx
+	cmp             ecx, 2
+	jl              loop_chroma_satdx3_cb_cr
+%endmacro
+
+; Spill one 128-bit SSE register (%1) into two 64-bit MMX registers
+; (%2 = low qword, %3 = high qword).  Clobbers %1 (movhlps duplicates
+; its high half into the low half).  Inverse of MMXReg2SSE.
+%macro SSEReg2MMX 3
+	movdq2q     %2, %1
+	movhlps     %1, %1
+	movdq2q     %3, %1
+%endmacro
+; Re-assemble two 64-bit MMX registers (%3 = low qword, %4 = high qword)
+; into one 128-bit SSE register %1, using %2 as scratch.  Inverse of
+; SSEReg2MMX; used to carry Cb-plane accumulators across the Cr pass.
+%macro MMXReg2SSE 4
+	movq2dq     %1, %3
+	movq2dq     %2, %4
+	punpcklqdq  %1, %2
+%endmacro
+;for reduce the code size of WelsIntraChroma8x8Combined3Satd_sse41
+
+; Pick the cheapest of the three chroma 8x8 intra modes (DC/H/V) by SATD,
+; processing both chroma planes in one call (32-bit cdecl, stack args).
+; Stack layout after the three pushes (esp+16 onward):
+;   +16 ptr plane-0 reconstructed neighbours   +20 its stride
+;   +24 ptr plane-0 encoding (source) block    +28 source stride
+;   +32 out: best mode index                   +36 lambda (mode-bit cost)
+;   +40 temp SATD scratch buffer
+;   +44/+48 presumably the second (Cr) plane recon/source ptrs - TODO confirm
+; Returns the winning SATD cost (plus lambda penalty for H/DC) in eax.
+; Plane-0 accumulators are parked in MMX regs between the two passes
+; (SSEReg2MMX/MMXReg2SSE) to survive the second SSE41_ChromaGetX38x8Satd.
+WELS_EXTERN WelsIntraChroma8x8Combined3Satd_sse41
+	push   ebx
+	push   esi
+	push   edi
+	mov    ecx,    [esp+16]
+	mov    edx,    [esp+20]
+	mov    eax,    [esp+24]
+	mov    ebx,    [esp+28]
+	mov    esi,    [esp+40] ;temp_satd
+	xor    edi,    edi
+loop_chroma_satdx3:
+	SSE41_ChromaGetX38x8Satd
+	cmp             edi, 1
+	je              loop_chroma_satdx3end
+	inc             edi
+	SSEReg2MMX  xmm4, mm0,mm1
+	SSEReg2MMX  xmm5, mm2,mm3
+	SSEReg2MMX  xmm6, mm5,mm6
+	mov         ecx,  [esp+44]
+	mov         eax,  [esp+48]
+	jmp         loop_chroma_satdx3
+loop_chroma_satdx3end:
+	MMXReg2SSE  xmm0, xmm3, mm0, mm1
+	MMXReg2SSE  xmm1, xmm3, mm2, mm3
+	MMXReg2SSE  xmm2, xmm3, mm5, mm6
+
+	; combine Cb + Cr partial SATDs
+	paddw       xmm4, xmm0
+	paddw       xmm5, xmm1
+	paddw       xmm6, xmm2
+
+	MMX_DW_1_2REG    xmm0, xmm1
+	psrlw       xmm4, 1 ;/2
+	psrlw       xmm5, 1 ;/2
+	psrlw       xmm6, 1 ;/2
+	SSE41_HSum8W     xmm4, xmm0, xmm1
+	SSE41_HSum8W     xmm5, xmm0, xmm1
+	SSE41_HSum8W     xmm6, xmm0, xmm1
+	; comparing order: DC H V
+	movd      ebx, xmm6 ;DC
+	movd      edi, xmm5 ;H
+	movd      ecx, xmm4 ;V
+	mov       edx, [esp+36]
+	shl       edx, 1
+	; H and V carry a 2*lambda mode-cost penalty before comparison
+	add       edi, edx
+	add       ecx, edx
+	mov       edx, [esp+32]
+	cmp       ebx, edi
+	jge near   not_dc_8x8
+	cmp        ebx, ecx
+	jge near   not_dc_h_8x8
+
+	; for DC mode
+	mov       dword[edx], 0;I8_PRED_DC
+	mov       eax, ebx
+	jmp near return_satd_intra_8x8_x3
+not_dc_8x8:
+	; for H mode
+	cmp       edi, ecx
+	jge near   not_dc_h_8x8
+	mov       dword[edx], 1;I8_PRED_H
+	mov       eax, edi
+	jmp near return_satd_intra_8x8_x3
+not_dc_h_8x8:
+	; for V mode
+	mov       dword[edx], 2;I8_PRED_V
+	mov       eax, ecx
+return_satd_intra_8x8_x3:
+	WELSEMMS
+	pop         edi
+	pop         esi
+	pop         ebx
+ret
+
+
+;***********************************************************************
+;
+;Pixel_satd_intra_sse2 END
+;
+;***********************************************************************
+; Accumulate the three intra-prediction SADs for one 16-byte source row.
+; %1 = memory slot holding the H-prediction value (a dword splatted to 16
+;      bytes via pshufb with xmm1 = zero shuffle mask; written back to %1),
+; %2 = source row.
+; xmm4 += SAD(src, DC pred in xmm7); xmm2 += SAD(src, V pred in xmm5);
+; xmm3 += SAD(src, H pred).  Clobbers xmm0/xmm6.
+%macro SSSE3_Get16BSadHVDC 2
+  movd        xmm6,%1
+  pshufb      xmm6,xmm1
+  movdqa      %1,  xmm6
+  movdqa      xmm0,%2
+  psadbw      xmm0,xmm7
+  paddw       xmm4,xmm0
+  movdqa      xmm0,%2
+  psadbw      xmm0,xmm5
+  paddw       xmm2,xmm0
+  psadbw      xmm6,%2
+  paddw       xmm3,xmm6
+%endmacro
+; Load one left-neighbour byte (%1) into scratch reg %2, store it to the
+; H-prediction slot %3, and add it into the running DC sum %4.
+%macro WelsAddDCValue 4
+    movzx   %2, byte %1
+    mov    %3, %2
+    add     %4, %2
+%endmacro
+
+;***********************************************************************
+;
+;Pixel_sad_intra_ssse3 BEGIN
+;
+;***********************************************************************
+; Pick the cheapest of the three luma 16x16 intra modes (DC/H/V) by SAD
+; (32-bit cdecl, stack args).  Stack layout after the three pushes:
+;   +16 ptr reconstructed neighbours   +20 recon stride
+;   +24 ptr source block               +28 source stride
+;   +32 out: best mode index           +36 lambda
+;   +40 16x16 prediction scratch buffer (16-byte rows, also receives the
+;       winning DC or V prediction before return)
+; Returns the winning SAD (H/DC carry a 2*lambda penalty) in eax.
+WELS_EXTERN WelsIntra16x16Combined3Sad_ssse3
+	push   ebx
+	push   esi
+	push   edi
+	mov    ecx,    [esp+16]
+	mov    edx,    [esp+20]
+	mov    edi,    [esp+40] ;temp_sad
+	sub    ecx,    edx
+	; top row -> V prediction (xmm5); its byte sum seeds the DC accumulator
+    movdqa      xmm5,[ecx]
+    pxor        xmm0,xmm0
+    psadbw      xmm0,xmm5
+    movhlps     xmm1,xmm0
+    paddw       xmm0,xmm1
+    movd        eax,xmm0
+
+	; gather the 16 left-neighbour bytes: each one is stored into its row's
+	; H-prediction slot and added into the DC sum in eax
+    add         ecx,edx
+    lea         ebx, [edx+2*edx]
+    WelsAddDCValue [ecx-1      ], esi, [edi   ], eax
+    WelsAddDCValue [ecx-1+edx  ], esi, [edi+16], eax
+    WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
+    WelsAddDCValue [ecx-1+ebx  ], esi, [edi+48], eax
+    lea         ecx, [ecx+4*edx]
+    add         edi, 64
+    WelsAddDCValue [ecx-1      ], esi, [edi   ], eax
+    WelsAddDCValue [ecx-1+edx  ], esi, [edi+16], eax
+    WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
+    WelsAddDCValue [ecx-1+ebx  ], esi, [edi+48], eax
+    lea         ecx, [ecx+4*edx]
+    add         edi, 64
+    WelsAddDCValue [ecx-1      ], esi, [edi   ], eax
+    WelsAddDCValue [ecx-1+edx  ], esi, [edi+16], eax
+    WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
+    WelsAddDCValue [ecx-1+ebx  ], esi, [edi+48], eax
+    lea         ecx, [ecx+4*edx]
+    add         edi, 64
+    WelsAddDCValue [ecx-1      ], esi, [edi   ], eax
+    WelsAddDCValue [ecx-1+edx  ], esi, [edi+16], eax
+    WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
+    WelsAddDCValue [ecx-1+ebx  ], esi, [edi+48], eax
+    sub        edi, 192
+	; DC value = (sum + 16) >> 5, splatted across xmm7
+    add         eax,10h
+    shr         eax,5
+    movd        xmm7,eax
+    pxor        xmm1,xmm1
+    pshufb      xmm7,xmm1
+    pxor        xmm4,xmm4
+    pxor        xmm3,xmm3
+    pxor        xmm2,xmm2
+;sad begin
+	mov    eax,    [esp+24]
+	mov    ebx,    [esp+28]
+    lea         esi, [ebx+2*ebx]
+    SSSE3_Get16BSadHVDC [edi], [eax]
+    SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
+    SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
+    SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
+    add         edi, 64
+    lea         eax, [eax+4*ebx]
+    SSSE3_Get16BSadHVDC [edi], [eax]
+    SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
+    SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
+    SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
+    add         edi, 64
+    lea         eax, [eax+4*ebx]
+    SSSE3_Get16BSadHVDC [edi], [eax]
+    SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
+    SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
+    SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
+    add         edi, 64
+    lea         eax, [eax+4*ebx]
+    SSSE3_Get16BSadHVDC [edi], [eax]
+    SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
+    SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
+    SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
+
+	; fold the psadbw partials: xmm3 ends with H in dword1, V in dword0
+    pslldq      xmm3,4
+    por         xmm3,xmm2
+    movhlps     xmm1,xmm3
+    paddw       xmm3,xmm1
+    movhlps     xmm0,xmm4
+    paddw       xmm4,xmm0
+; comparing order: DC H V
+	movd        ebx, xmm4 ;DC
+	movd        ecx, xmm3 ;V
+	psrldq      xmm3, 4
+	movd        esi, xmm3 ;H
+	mov         eax, [esp+36] ;lamda
+	shl         eax, 1
+	add         esi, eax
+	add         ebx, eax
+	mov         edx, [esp+32]
+	cmp         ebx, esi
+	jge near   not_dc_16x16_sad
+	cmp        ebx, ecx
+	jge near   not_dc_h_16x16_sad
+	; for DC mode
+	mov       dword[edx], 2;I16_PRED_DC
+	mov       eax, ebx
+	; fill the prediction buffer with the splatted DC value
+    sub        edi, 192
+%assign x 0
+%rep 16
+    movdqa    [edi+16*x], xmm7
+%assign x x+1
+%endrep
+	jmp near return_sad_intra_16x16_x3
+not_dc_16x16_sad:
+	; for H mode
+	cmp       esi, ecx
+	jge near   not_dc_h_16x16_sad
+	mov       dword[edx], 1;I16_PRED_H
+	mov       eax, esi
+	jmp near return_sad_intra_16x16_x3
+not_dc_h_16x16_sad:
+	; for V mode
+	mov       dword[edx], 0;I16_PRED_V
+	mov       eax, ecx
+	; fill the prediction buffer with the top-row (V) prediction
+    sub       edi, 192
+%assign x 0
+%rep 16
+    movdqa    [edi+16*x], xmm5
+%assign x x+1
+%endrep
+return_sad_intra_16x16_x3:
+	pop    edi
+	pop    esi
+	pop    ebx
+	ret
+%endif
+;***********************************************************************
+;
+;Pixel_sad_intra_ssse3 END
+;
+;***********************************************************************
+;***********************************************************************
+;
+;Pixel_satd_wxh_sse41 BEGIN
+;
+;***********************************************************************
+
+;SSE4.1
+%macro SSE41_GetSatd8x4 0
+	movq             xmm0, [r0]
+	punpcklqdq       xmm0, xmm0
+	pmaddubsw        xmm0, xmm7
+	movq             xmm1, [r0+r1]
+	punpcklqdq       xmm1, xmm1
+	pmaddubsw        xmm1, xmm7
+	movq             xmm2, [r2]
+	punpcklqdq       xmm2, xmm2
+	pmaddubsw        xmm2, xmm7
+	movq             xmm3, [r2+r3]
+	punpcklqdq       xmm3, xmm3
+	pmaddubsw        xmm3, xmm7
+	psubsw           xmm0, xmm2
+	psubsw           xmm1, xmm3
+	movq             xmm2, [r0+2*r1]
+	punpcklqdq       xmm2, xmm2
+	pmaddubsw        xmm2, xmm7
+	movq             xmm3, [r0+r4]
+	punpcklqdq       xmm3, xmm3
+	pmaddubsw        xmm3, xmm7
+	movq             xmm4, [r2+2*r3]
+	punpcklqdq       xmm4, xmm4
+	pmaddubsw        xmm4, xmm7
+	movq             xmm5, [r2+r5]
+	punpcklqdq       xmm5, xmm5
+	pmaddubsw        xmm5, xmm7
+	psubsw           xmm2, xmm4
+	psubsw           xmm3, xmm5
+	SSE2_HDMTwo4x4   xmm0, xmm1, xmm2, xmm3, xmm4
+	pabsw            xmm0, xmm0
+	pabsw            xmm2, xmm2
+	pabsw            xmm1, xmm1
+	pabsw            xmm3, xmm3
+	movdqa           xmm4, xmm3
+	pblendw          xmm3, xmm1, 0xAA
+	pslld            xmm1, 16
+	psrld            xmm4, 16
+	por              xmm1, xmm4
+	pmaxuw           xmm1, xmm3
+	paddw            xmm6, xmm1
+	movdqa           xmm4, xmm0
+	pblendw          xmm0, xmm2, 0xAA
+	pslld            xmm2, 16
+	psrld            xmm4, 16
+	por              xmm2, xmm4
+	pmaxuw           xmm0, xmm2
+	paddw            xmm6, xmm0
+%endmacro
+
+; Horizontally sum the eight words of %2 into the GPR %1.
+; %3/%4 are scratch XMM regs; MMX_DW_1_2REG fills %3 with 0x0001 words so
+; pmaddwd widens the word lanes to dwords before the two fold steps.
+; Clobbers %2, %3, %4.
+%macro SSSE3_SumWHorizon 4 ;eax, srcSSE, tempSSE, tempSSE
+	MMX_DW_1_2REG    %3, %4
+	pmaddwd     %2, %3
+	movhlps     %4, %2
+	paddd       %2, %4
+	pshuflw     %4, %2,0Eh
+	paddd       %2, %4
+	movd		%1, %2
+%endmacro
+;***********************************************************************
+;
+;int32_t WelsSampleSatd4x4_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
+;
+;***********************************************************************
+; SATD of a 4x4 block.  Args (x86inc r0..r3): src ptr, src stride,
+; ref ptr, ref stride.  Returns the SATD in retrd.
+; Each 4-byte row is splatted with shufps so two rows share a register;
+; pmaddubsw against [HSwapSumSubDB1] performs the horizontal butterfly,
+; the add/sub + unpack sequence the vertical one, and the pblendw/shift
+; shuffle interleaves the lanes before pabsw/pmaxsw and the final
+; horizontal word sum.
+WELS_EXTERN WelsSampleSatd4x4_sse41
+	%assign  push_num 0
+	LOAD_4_PARA
+	PUSH_XMM 8
+	SIGN_EXTENSION r1, r1d
+	SIGN_EXTENSION r3, r3d
+	movdqa      xmm4,[HSwapSumSubDB1]
+	movd        xmm2,[r2]
+	movd        xmm5,[r2+r3]
+	shufps      xmm2,xmm5,0
+	movd        xmm3,[r2+r3*2]
+	lea         r2, [r3*2+r2]
+	movd        xmm5,[r2+r3]
+	shufps      xmm3,xmm5,0
+	movd        xmm0,[r0]
+	movd        xmm5,[r0+r1]
+	shufps      xmm0,xmm5,0
+	movd        xmm1,[r0+r1*2]
+	lea         r0, [r1*2+r0]
+	movd        xmm5,[r0+r1]
+	shufps      xmm1,xmm5,0
+	pmaddubsw   xmm0,xmm4
+	pmaddubsw   xmm1,xmm4
+	pmaddubsw   xmm2,xmm4
+	pmaddubsw   xmm3,xmm4
+	psubw       xmm0,xmm2
+	psubw       xmm1,xmm3
+	movdqa      xmm2,xmm0
+	paddw       xmm0,xmm1
+	psubw       xmm1,xmm2
+	movdqa      xmm2,xmm0
+	punpcklqdq  xmm0,xmm1
+	punpckhqdq  xmm2,xmm1
+	movdqa      xmm1,xmm0
+	paddw       xmm0,xmm2
+	psubw       xmm2,xmm1
+	movdqa      xmm1,xmm0
+	pblendw     xmm0,xmm2,0AAh
+	pslld       xmm2,16
+	psrld       xmm1,16
+	por         xmm2,xmm1
+	pabsw       xmm0,xmm0
+	pabsw       xmm2,xmm2
+	pmaxsw      xmm0,xmm2
+	SSSE3_SumWHorizon retrd, xmm0, xmm5, xmm7
+	POP_XMM
+	LOAD_4_PARA_POP
+	ret
+
+;***********************************************************************
+;
+;int32_t WelsSampleSatd8x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
+;
+;***********************************************************************
+; SATD of an 8x8 block: two SSE41_GetSatd8x4 passes accumulated in xmm6,
+; then a horizontal word sum into retrd.  Args (r0..r3): src ptr/stride,
+; ref ptr/stride; r4/r5 hold 3*stride for the 8x4 helper.
+; On X86_32, r4/r5 are not parameter registers and must be saved manually.
+WELS_EXTERN WelsSampleSatd8x8_sse41
+%ifdef X86_32
+	push  r4
+	push  r5
+%endif
+	%assign  push_num 2
+	LOAD_4_PARA
+	PUSH_XMM 8
+	SIGN_EXTENSION r1, r1d
+	SIGN_EXTENSION r3, r3d
+	movdqa      xmm7, [HSumSubDB1]
+	lea         r4,  [r1+r1*2]
+	lea         r5,  [r3+r3*2]
+	pxor		xmm6, xmm6
+	SSE41_GetSatd8x4
+	lea			r0,	 [r0+4*r1]
+	lea			r2,  [r2+4*r3]
+	SSE41_GetSatd8x4
+	SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
+	POP_XMM
+	LOAD_4_PARA_POP
+%ifdef X86_32
+	pop  r5
+	pop  r4
+%endif
+	ret
+
+;***********************************************************************
+;
+;int32_t WelsSampleSatd8x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
+;
+;***********************************************************************
+; SATD of an 8x16 block: four SSE41_GetSatd8x4 iterations (r6 = loop
+; counter) accumulated in xmm6, then a horizontal word sum into retrd.
+; Args (r0..r3): src ptr/stride, ref ptr/stride; r4/r5 hold 3*stride.
+WELS_EXTERN WelsSampleSatd8x16_sse41
+%ifdef X86_32
+	push  r4
+	push  r5
+	push  r6
+%endif
+	%assign  push_num 3
+	LOAD_4_PARA
+	PUSH_XMM 8
+	SIGN_EXTENSION r1, r1d
+	SIGN_EXTENSION r3, r3d
+	movdqa      xmm7, [HSumSubDB1]
+	lea         r4,  [r1+r1*2]
+	lea         r5,  [r3+r3*2]
+	pxor        xmm6, xmm6
+	mov         r6,    0
+loop_get_satd_8x16:
+	SSE41_GetSatd8x4
+	lea			r0,  [r0+4*r1]
+	lea			r2,  [r2+4*r3]
+	inc         r6
+	cmp         r6,  4
+	jl          loop_get_satd_8x16
+	SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
+	POP_XMM
+	LOAD_4_PARA_POP
+%ifdef X86_32
+	pop  r6
+	pop  r5
+	pop  r4
+%endif
+	ret
+
+;***********************************************************************
+;
+;int32_t WelsSampleSatd16x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
+;
+;***********************************************************************
+; SATD of a 16x8 block, processed as two 8-wide columns: the base pointers
+; are saved on the stack, the left 8x8 is accumulated, then the pointers
+; are restored and advanced by 8 for the right 8x8.  Sum ends in retrd.
+WELS_EXTERN WelsSampleSatd16x8_sse41
+%ifdef X86_32
+	push  r4
+	push  r5
+%endif
+	%assign  push_num 2
+	LOAD_4_PARA
+	PUSH_XMM 8
+	SIGN_EXTENSION r1, r1d
+	SIGN_EXTENSION r3, r3d
+	push  r0
+	push  r2
+
+	movdqa      xmm7, [HSumSubDB1]
+	lea         r4,  [r1+r1*2]
+	lea         r5,  [r3+r3*2]
+	pxor		xmm6,   xmm6
+	SSE41_GetSatd8x4
+	lea			r0,  [r0+4*r1]
+	lea			r2,  [r2+4*r3]
+	SSE41_GetSatd8x4
+
+	; restore base pointers, step 8 columns right
+	pop  r2
+	pop  r0
+	add			r0,    8
+	add			r2,    8
+	SSE41_GetSatd8x4
+	lea			r0,  [r0+4*r1]
+	lea			r2,  [r2+4*r3]
+	SSE41_GetSatd8x4
+	SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
+	POP_XMM
+	LOAD_4_PARA_POP
+%ifdef X86_32
+	pop  r5
+	pop  r4
+%endif
+	ret
+
+;***********************************************************************
+;
+;int32_t WelsSampleSatd16x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
+;
+;***********************************************************************
+
+; SATD of a 16x16 block, processed as two 8x16 column halves (left loop,
+; pointer restore + 8-byte advance, right loop; r6 = loop counter).
+; Accumulates in xmm6, horizontal word sum into retrd.
+WELS_EXTERN WelsSampleSatd16x16_sse41
+%ifdef X86_32
+	push  r4
+	push  r5
+	push  r6
+%endif
+	%assign  push_num 3
+	LOAD_4_PARA
+	PUSH_XMM 8
+	SIGN_EXTENSION r1, r1d
+	SIGN_EXTENSION r3, r3d
+
+	push  r0
+	push  r2
+
+	movdqa      xmm7, [HSumSubDB1]
+	lea         r4,  [r1+r1*2]
+	lea         r5,  [r3+r3*2]
+	pxor		xmm6,   xmm6
+	mov         r6,    0
+loop_get_satd_16x16_left:
+	SSE41_GetSatd8x4
+	lea			r0,  [r0+4*r1]
+	lea			r2,  [r2+4*r3]
+	inc         r6
+	cmp         r6,  4
+	jl          loop_get_satd_16x16_left
+
+	; restore base pointers, step 8 columns right
+	pop  r2
+	pop  r0
+	add			r0,    8
+	add			r2,    8
+	mov         r6,    0
+loop_get_satd_16x16_right:
+	SSE41_GetSatd8x4
+	lea			r0,  [r0+4*r1]
+	lea			r2,  [r2+4*r3]
+	inc         r6
+	cmp         r6,  4
+	jl          loop_get_satd_16x16_right
+	SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
+	POP_XMM
+	LOAD_4_PARA_POP
+%ifdef X86_32
+	pop  r6
+	pop  r5
+	pop  r4
+%endif
+	ret
+
+;***********************************************************************
+;
+;Pixel_satd_wxh_sse41 END
+;
+;***********************************************************************
+
+;***********************************************************************
+;
+;Pixel_sad_wxh_sse2 BEGIN
+;
+;***********************************************************************
+
+; Advance r0/r2 by two rows, then accumulate the SAD of the next two
+; 16-byte rows into xmm0.  r0 rows are loaded with MOVDQ (must be
+; 16-byte aligned), r2 rows with movdqu.  Clobbers xmm1/xmm2.
+%macro SSE2_GetSad2x16 0
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movdqu xmm1,   [r2]
+	MOVDQ  xmm2,   [r0];[eax] must aligned 16
+	psadbw xmm1,   xmm2
+	paddw  xmm0,   xmm1
+	movdqu xmm1,   [r2+r3]
+	MOVDQ  xmm2,   [r0+r1]
+	psadbw xmm1,   xmm2
+	paddw  xmm0,   xmm1
+%endmacro
+
+
+; Accumulate the SAD of four consecutive 16-byte rows into xmm7 without
+; advancing the pointers.  Requires r4 = 3*r1, r5 = 3*r3; r0 rows must be
+; 16-byte aligned (MOVDQ), r2 rows may be unaligned.  Clobbers xmm0-xmm2.
+%macro SSE2_GetSad4x16 0
+	movdqu xmm0,   [r2]
+	MOVDQ  xmm2,   [r0]
+	psadbw xmm0,   xmm2
+	paddw  xmm7,   xmm0
+	movdqu xmm1,   [r2+r3]
+	MOVDQ  xmm2,   [r0+r1]
+	psadbw xmm1,   xmm2
+	paddw  xmm7,   xmm1
+	movdqu xmm1,   [r2+2*r3]
+	MOVDQ  xmm2,   [r0+2*r1];[eax] must aligned 16
+	psadbw xmm1,   xmm2
+	paddw  xmm7,   xmm1
+	movdqu xmm1,   [r2+r5]
+	MOVDQ  xmm2,   [r0+r4]
+	psadbw xmm1,   xmm2
+	paddw  xmm7,   xmm1
+%endmacro
+
+
+; Accumulate the SAD of four 8-byte rows into xmm6.  Rows are paired into
+; 128-bit registers (movq low half, movhps high half); advances r0/r2 by
+; two rows as a side effect.  Clobbers xmm0-xmm3.
+%macro SSE2_GetSad8x4 0
+	movq   xmm0,   [r0]
+	movq   xmm1,   [r0+r1]
+	lea    r0,     [r0+2*r1]
+	movhps xmm0,   [r0]
+	movhps xmm1,   [r0+r1]
+
+	movq   xmm2,   [r2]
+	movq   xmm3,   [r2+r3]
+	lea    r2,     [r2+2*r3]
+	movhps xmm2,   [r2]
+	movhps xmm3,   [r2+r3]
+	psadbw xmm0,   xmm2
+	psadbw xmm1,   xmm3
+	paddw  xmm6,   xmm0
+	paddw  xmm6,   xmm1
+%endmacro
+
+;***********************************************************************
+;
+;int32_t WelsSampleSad16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t )
+;First parameter can align to 16 bytes,
+;In wels, the third parameter can't align to 16 bytes.
+;
+;***********************************************************************
+; SAD of a 16x16 block: four SSE2_GetSad4x16 passes accumulated in xmm7,
+; then the two psadbw qword partials are folded and returned in retrd.
+; Args (r0..r3): src ptr/stride (src must be 16-byte aligned), ref
+; ptr/stride (may be unaligned); r4/r5 hold 3*stride.
+WELS_EXTERN WelsSampleSad16x16_sse2
+%ifdef X86_32
+	push  r4
+	push  r5
+%endif
+
+	%assign  push_num 2
+	LOAD_4_PARA
+	PUSH_XMM 8
+	SIGN_EXTENSION r1, r1d
+	SIGN_EXTENSION r3, r3d
+	lea r4, [3*r1]
+	lea r5, [3*r3]
+
+	pxor   xmm7,   xmm7
+	SSE2_GetSad4x16
+	lea	   r0,  [r0+4*r1]
+	lea	   r2,  [r2+4*r3]
+	SSE2_GetSad4x16
+	lea	   r0,  [r0+4*r1]
+	lea	   r2,  [r2+4*r3]
+	SSE2_GetSad4x16
+	lea	   r0,  [r0+4*r1]
+	lea	   r2,  [r2+4*r3]
+	SSE2_GetSad4x16
+	movhlps xmm0, xmm7
+	paddw xmm0, xmm7
+	movd retrd, xmm0
+	POP_XMM
+	LOAD_4_PARA_POP
+%ifdef X86_32
+	pop  r5
+	pop  r4
+%endif
+	ret
+
+;***********************************************************************
+;
+;int32_t WelsSampleSad16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t )
+;First parameter can align to 16 bytes,
+;In wels, the third parameter can't align to 16 bytes.
+;
+;***********************************************************************
+; SAD of a 16x8 block: the first two rows are handled inline to seed xmm0,
+; then three SSE2_GetSad2x16 passes cover the remaining six rows.  The
+; two psadbw partials are folded and returned in retrd.  Src (r0) must be
+; 16-byte aligned; ref (r2) may be unaligned.
+WELS_EXTERN WelsSampleSad16x8_sse2
+	%assign  push_num 0
+	LOAD_4_PARA
+	SIGN_EXTENSION r1, r1d
+	SIGN_EXTENSION r3, r3d
+	movdqu xmm0,   [r2]
+	MOVDQ  xmm2,   [r0]
+	psadbw xmm0,   xmm2
+	movdqu xmm1,   [r2+r3]
+	MOVDQ  xmm2,   [r0+r1]
+	psadbw xmm1,   xmm2
+	paddw  xmm0,   xmm1
+
+	SSE2_GetSad2x16
+	SSE2_GetSad2x16
+	SSE2_GetSad2x16
+
+	movhlps     xmm1, xmm0
+	paddw       xmm0, xmm1
+	movd        retrd,  xmm0
+	LOAD_4_PARA_POP
+	ret
+
+
+
+; SAD of an 8x16 block: four SSE2_GetSad8x4 passes (each consumes four
+; rows and advances by two, hence the extra 2-row lea between passes)
+; accumulated in xmm6; partials folded into retrd.
+WELS_EXTERN WelsSampleSad8x16_sse2
+	%assign  push_num 0
+	LOAD_4_PARA
+	PUSH_XMM 7
+	SIGN_EXTENSION r1, r1d
+	SIGN_EXTENSION r3, r3d
+    pxor   xmm6,   xmm6
+
+	SSE2_GetSad8x4
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+    SSE2_GetSad8x4
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	SSE2_GetSad8x4
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+    SSE2_GetSad8x4
+
+    movhlps    xmm0, xmm6
+	paddw      xmm0, xmm6
+	movd       retrd,  xmm0
+	POP_XMM
+	LOAD_4_PARA_POP
+	ret
+
+
+; Test whether a %2-byte-wide access at address %1 straddles a %3-byte
+; cache line; leaves flags for the caller's conditional branch (jle taken
+; when no split handling is needed).  Clobbers %1.
+%macro CACHE_SPLIT_CHECK 3 ; address, width, cacheline
+and    %1,  0x1f|(%3>>1)
+cmp    %1,  (32-%2)|(%3>>1)
+%endmacro
+
+; SAD of an 8x8 block with a cache-line-split fast path.
+; If the ref pointer's 8-byte load would straddle a 64-byte cache line,
+; the split path aligns the ref pointer down to 8 bytes and reads two
+; aligned qwords per row ([r2] and [r5] = r2+8), then recombines them
+; with psrlq/psllq shift counts held in xmm5/xmm6 (misalignment*8 bits).
+; Otherwise falls through to two plain SSE2_GetSad8x4 passes.
+; Returns the SAD in retrd.
+WELS_EXTERN WelsSampleSad8x8_sse21
+	%assign  push_num 0
+	mov		r2,  arg3
+	push	r2
+	CACHE_SPLIT_CHECK r2, 8, 64
+	jle    near   .pixel_sad_8x8_nsplit
+	pop		r2
+%ifdef X86_32
+	push	r3
+	push	r4
+	push	r5
+%endif
+	%assign  push_num 3
+	PUSH_XMM 8
+	mov		r0,  arg1
+	mov		r1,  arg2
+	SIGN_EXTENSION r1, r1d
+    pxor   xmm7,   xmm7
+
+    ;ecx r2, edx r4, edi r5
+
+	; r5 = misalignment (0..7), r2 aligned down; shift counts in bits
+    mov    r5,    r2
+    and    r5,    0x07
+    sub    r2,    r5
+    mov    r4,    8
+    sub    r4,    r5
+
+    shl    r5,    3
+    shl    r4,    3
+    movd   xmm5,   r5d
+    movd   xmm6,   r4d
+	mov    r5,    8
+	add    r5,    r2
+    mov    r3,    arg4
+	SIGN_EXTENSION r3, r3d
+	; rows 0-1
+    movq   xmm0,   [r0]
+	movhps xmm0,   [r0+r1]
+
+	movq   xmm1,   [r2]
+	movq   xmm2,   [r5]
+	movhps xmm1,   [r2+r3]
+	movhps xmm2,   [r5+r3]
+	psrlq  xmm1,   xmm5
+	psllq  xmm2,   xmm6
+	por    xmm1,   xmm2
+
+	psadbw xmm0,   xmm1
+	paddw  xmm7,   xmm0
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	lea    r5,    [r5+2*r3]
+
+	; rows 2-3
+    movq   xmm0,   [r0]
+	movhps xmm0,   [r0+r1]
+
+	movq   xmm1,   [r2]
+	movq   xmm2,   [r5]
+	movhps xmm1,   [r2+r3]
+	movhps xmm2,   [r5+r3]
+	psrlq  xmm1,   xmm5
+	psllq  xmm2,   xmm6
+	por    xmm1,   xmm2
+
+	psadbw xmm0,   xmm1
+	paddw  xmm7,   xmm0
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	lea    r5,    [r5+2*r3]
+
+	; rows 4-5
+    movq   xmm0,   [r0]
+	movhps xmm0,   [r0+r1]
+
+	movq   xmm1,   [r2]
+	movq   xmm2,   [r5]
+	movhps xmm1,   [r2+r3]
+	movhps xmm2,   [r5+r3]
+	psrlq  xmm1,   xmm5
+	psllq  xmm2,   xmm6
+	por    xmm1,   xmm2
+
+	psadbw xmm0,   xmm1
+	paddw  xmm7,   xmm0
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	lea    r5,    [r5+2*r3]
+
+	; rows 6-7
+    movq   xmm0,   [r0]
+	movhps xmm0,   [r0+r1]
+
+	movq   xmm1,   [r2]
+	movq   xmm2,   [r5]
+	movhps xmm1,   [r2+r3]
+	movhps xmm2,   [r5+r3]
+	psrlq  xmm1,   xmm5
+	psllq  xmm2,   xmm6
+	por    xmm1,   xmm2
+
+	psadbw xmm0,   xmm1
+	paddw  xmm7,   xmm0
+
+    movhlps    xmm0, xmm7
+	paddw      xmm0, xmm7
+	movd       retrd,  xmm0
+	POP_XMM
+%ifdef X86_32
+	pop	 r5
+	pop	 r4
+	pop	 r3
+%endif
+	jmp        .return
+
+.pixel_sad_8x8_nsplit:
+
+	pop r2
+	%assign  push_num 0
+	LOAD_4_PARA
+	PUSH_XMM 7
+	SIGN_EXTENSION r1, r1d
+	SIGN_EXTENSION r3, r3d
+	pxor   xmm6,   xmm6
+	SSE2_GetSad8x4
+    lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+    SSE2_GetSad8x4
+    movhlps    xmm0, xmm6
+	paddw      xmm0, xmm6
+	movd       retrd,  xmm0
+	POP_XMM
+	LOAD_4_PARA_POP
+.return:
+	ret
+
+
+;***********************************************************************
+;
+;Pixel_sad_wxh_sse2 END
+;
+;***********************************************************************
+
+
+;***********************************************************************
+;
+;Pixel_sad_4_wxh_sse2 BEGIN
+;
+;***********************************************************************
+
+
+; One step of the four-candidate SAD pipeline for 16-wide rows.
+; %1 = src row one line above, %2 = current src row, %3 = src row one line
+; below, %4 = ref row (destroyed), %5 = ref row address (for the +/-1
+; horizontal neighbours).  Accumulates:
+;   xmm5 += SAD(%1, ref)      ; candidate ref+stride
+;   xmm4 += SAD(ref, %3)      ; candidate ref-stride
+;   xmm6 += SAD(ref-1, %2)    ; candidate ref-1
+;   xmm7 += SAD(ref+1, %2)    ; candidate ref+1
+%macro SSE2_Get4LW16Sad 5 ;s-1l, s, s+1l, d, address
+	psadbw %1,   %4
+	paddw  xmm5, %1
+	psadbw %4,   %3
+	paddw  xmm4, %4
+	movdqu %4,   [%5-1]
+	psadbw %4,   %2
+	paddw  xmm6, %4
+	movdqu %4,   [%5+1]
+	psadbw %4,   %2
+	paddw  xmm7, %4
+%endmacro
+; Compute the four neighbouring-candidate SADs of a 16x16 block in one
+; pass: ref-stride (xmm4), ref+stride (xmm5), ref-1 (xmm6), ref+1 (xmm7).
+; Args (r0..r4): src ptr/stride (src 16-byte aligned), ref ptr/stride,
+; out ptr.  Src rows are kept live across iterations in a rotating
+; xmm0/xmm1/xmm2 pipeline so each row is loaded once; SSE2_Get4LW16Sad
+; consumes one (above, current, below) triple per call.  The four folded
+; sums are packed and stored as four dwords at [r4].
+WELS_EXTERN WelsSampleSadFour16x16_sse2
+	%assign  push_num 0
+	LOAD_5_PARA
+	PUSH_XMM 8
+	SIGN_EXTENSION r1, r1d
+	SIGN_EXTENSION r3, r3d
+	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
+	pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
+	pxor   xmm6,   xmm6    ;sad pRefMb-1
+	pxor   xmm7,   xmm7    ;sad pRefMb+1
+	; prologue: rows 0-1 handled inline (no row-above yet)
+	movdqa xmm0,   [r0]
+	sub    r2,    r3
+	movdqu xmm3,   [r2]
+	psadbw xmm3,   xmm0
+	paddw  xmm4,   xmm3
+
+	movdqa xmm1,   [r0+r1]
+	movdqu xmm3,   [r2+r3]
+	psadbw xmm3,   xmm1
+	paddw  xmm4,   xmm3
+
+	movdqu xmm2,   [r2+r3-1]
+	psadbw xmm2,   xmm0
+	paddw  xmm6,   xmm2
+
+	movdqu xmm3,   [r2+r3+1]
+	psadbw xmm3,   xmm0
+	paddw  xmm7,   xmm3
+
+	; steady state: rotate the src-row pipeline two rows per block
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movdqa xmm2,   [r0]
+	movdqu xmm3,   [r2]
+	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
+	movdqa xmm0,   [r0+r1]
+	movdqu xmm3,   [r2+r3]
+	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movdqa xmm1,   [r0]
+	movdqu xmm3,   [r2]
+	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
+	movdqa xmm2,   [r0+r1]
+	movdqu xmm3,   [r2+r3]
+	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movdqa xmm0,   [r0]
+	movdqu xmm3,   [r2]
+	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
+	movdqa xmm1,   [r0+r1]
+	movdqu xmm3,   [r2+r3]
+	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movdqa xmm2,   [r0]
+	movdqu xmm3,   [r2]
+	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
+	movdqa xmm0,   [r0+r1]
+	movdqu xmm3,   [r2+r3]
+	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movdqa xmm1,   [r0]
+	movdqu xmm3,   [r2]
+	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
+	movdqa xmm2,   [r0+r1]
+	movdqu xmm3,   [r2+r3]
+	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movdqa xmm0,   [r0]
+	movdqu xmm3,   [r2]
+	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
+	movdqa xmm1,   [r0+r1]
+	movdqu xmm3,   [r2+r3]
+	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movdqa xmm2,   [r0]
+	movdqu xmm3,   [r2]
+	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
+	movdqa xmm0,   [r0+r1]
+	movdqu xmm3,   [r2+r3]
+	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
+	; epilogue: last src rows vs rows below the block
+	lea    r2,    [r2+2*r3]
+	movdqu xmm3,   [r2]
+	psadbw xmm2,   xmm3
+	paddw xmm5,   xmm2
+
+	movdqu xmm2,   [r2-1]
+	psadbw xmm2,   xmm0
+	paddw xmm6,   xmm2
+
+	movdqu xmm3,   [r2+1]
+	psadbw xmm3,   xmm0
+	paddw xmm7,   xmm3
+
+	movdqu xmm3,   [r2+r3]
+	psadbw xmm0,   xmm3
+	paddw xmm5,   xmm0
+
+	; fold each accumulator's two qword partials, pack as 4 dwords -> [r4]
+	movhlps    xmm0, xmm4
+	paddw      xmm4, xmm0
+	movhlps    xmm0, xmm5
+	paddw      xmm5, xmm0
+	movhlps    xmm0, xmm6
+	paddw      xmm6, xmm0
+	movhlps    xmm0, xmm7
+	paddw      xmm7, xmm0
+	punpckldq  xmm4, xmm5
+	punpckldq  xmm6, xmm7
+	punpcklqdq xmm4, xmm6
+	movdqa     [r4],xmm4
+	POP_XMM
+	LOAD_5_PARA_POP
+	ret
+
+
+; Four-candidate SAD for a 16x8 block (same pipeline as the 16x16
+; variant, three steady-state 2-row blocks instead of seven).
+; Args (r0..r4): src ptr/stride (src aligned), ref ptr/stride, out ptr.
+; Results packed as 4 dwords at [r4]: -stride, +stride, -1, +1.
+WELS_EXTERN WelsSampleSadFour16x8_sse2
+	%assign  push_num 0
+	LOAD_5_PARA
+	PUSH_XMM 8
+	SIGN_EXTENSION r1, r1d
+	SIGN_EXTENSION r3, r3d
+	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
+	pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
+	pxor   xmm6,   xmm6    ;sad pRefMb-1
+	pxor   xmm7,   xmm7    ;sad pRefMb+1
+	; prologue: rows 0-1
+	movdqa xmm0,   [r0]
+	sub    r2,    r3
+	movdqu xmm3,   [r2]
+	psadbw xmm3,   xmm0
+	paddw xmm4,   xmm3
+
+	movdqa xmm1,   [r0+r1]
+	movdqu xmm3,   [r2+r3]
+	psadbw xmm3,   xmm1
+	paddw xmm4,   xmm3
+
+	movdqu xmm2,   [r2+r3-1]
+	psadbw xmm2,   xmm0
+	paddw xmm6,   xmm2
+
+	movdqu xmm3,   [r2+r3+1]
+	psadbw xmm3,   xmm0
+	paddw xmm7,   xmm3
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movdqa xmm2,   [r0]
+	movdqu xmm3,   [r2]
+	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
+	movdqa xmm0,   [r0+r1]
+	movdqu xmm3,   [r2+r3]
+	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movdqa xmm1,   [r0]
+	movdqu xmm3,   [r2]
+	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
+	movdqa xmm2,   [r0+r1]
+	movdqu xmm3,   [r2+r3]
+	SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movdqa xmm0,   [r0]
+	movdqu xmm3,   [r2]
+	SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
+	movdqa xmm1,   [r0+r1]
+	movdqu xmm3,   [r2+r3]
+	SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
+	; epilogue: last src rows vs rows below the block
+	lea    r2,    [r2+2*r3]
+	movdqu xmm3,   [r2]
+	psadbw xmm0,   xmm3
+	paddw xmm5,   xmm0
+
+	movdqu xmm0,   [r2-1]
+	psadbw xmm0,   xmm1
+	paddw xmm6,   xmm0
+
+	movdqu xmm3,   [r2+1]
+	psadbw xmm3,   xmm1
+	paddw xmm7,   xmm3
+
+	movdqu xmm3,   [r2+r3]
+	psadbw xmm1,   xmm3
+	paddw xmm5,   xmm1
+
+	; fold partials, pack as 4 dwords -> [r4]
+	movhlps    xmm0, xmm4
+	paddw      xmm4, xmm0
+	movhlps    xmm0, xmm5
+	paddw      xmm5, xmm0
+	movhlps    xmm0, xmm6
+	paddw      xmm6, xmm0
+	movhlps    xmm0, xmm7
+	paddw      xmm7, xmm0
+	punpckldq  xmm4, xmm5
+	punpckldq  xmm6, xmm7
+	punpcklqdq xmm4, xmm6
+	movdqa     [r4],xmm4
+	POP_XMM
+	LOAD_5_PARA_POP
+	ret
+
+; Four-candidate SAD for an 8x16 block.  Rows are paired two-per-register
+; (movq low half, movhps high half); each unrolled 2-row step accumulates
+; the ref-1 (xmm6), ref+1 (xmm7), ref+stride (xmm5) and ref-stride (xmm4)
+; SADs for the current row pair.  Args (r0..r4): src ptr/stride, ref
+; ptr/stride, out ptr.  Results packed as 4 dwords at [r4]:
+; -stride, +stride, -1, +1.
+WELS_EXTERN WelsSampleSadFour8x16_sse2
+	%assign  push_num 0
+	LOAD_5_PARA
+	PUSH_XMM 8
+	SIGN_EXTENSION r1, r1d
+	SIGN_EXTENSION r3, r3d
+	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
+	pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
+	pxor   xmm6,   xmm6    ;sad pRefMb-1
+	pxor   xmm7,   xmm7    ;sad pRefMb+1
+	; row pair 0-1
+	movq   xmm0,   [r0]
+	movhps xmm0,   [r0+r1]
+	sub    r2,    r3
+	movq   xmm3,   [r2]
+	movhps xmm3,   [r2+r3]
+	psadbw xmm3,   xmm0
+	paddw  xmm4,   xmm3
+
+	movq   xmm1,  [r2+r3-1]
+	movq   xmm3,  [r2+r3+1]
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movhps xmm1,  [r2-1]
+	movhps xmm3,  [r2+1]
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+
+	movq   xmm3,  [r2]
+	movhps xmm3,  [r2+r3]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+
+	; row pair 2-3
+	movq   xmm0,  [r0]
+	movhps xmm0,  [r0+r1]
+	psadbw xmm3,  xmm0
+	paddw  xmm4,  xmm3
+
+	movq   xmm1,  [r2+r3-1]
+	movq   xmm3,  [r2+r3+1]
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movhps xmm1,  [r2-1]
+	movhps xmm3,  [r2+1]
+
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+
+	movq   xmm3,  [r2]
+	movhps xmm3,  [r2+r3]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+
+	; row pair 4-5
+	movq   xmm0,  [r0]
+	movhps xmm0,  [r0+r1]
+	psadbw xmm3,  xmm0
+	paddw  xmm4,  xmm3
+
+	movq   xmm1,  [r2+r3-1]
+	movq   xmm3,  [r2+r3+1]
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movhps xmm1,  [r2-1]
+	movhps xmm3,  [r2+1]
+
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+
+	movq   xmm3,  [r2]
+	movhps xmm3,  [r2+r3]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+
+	; row pair 6-7
+	movq   xmm0,  [r0]
+	movhps xmm0,  [r0+r1]
+	psadbw xmm3,  xmm0
+	paddw  xmm4,  xmm3
+
+	movq   xmm1,  [r2+r3-1]
+	movq   xmm3,  [r2+r3+1]
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movhps xmm1,  [r2-1]
+	movhps xmm3,  [r2+1]
+
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+
+	movq   xmm3,  [r2]
+	movhps xmm3,  [r2+r3]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+
+	; row pair 8-9
+	movq   xmm0,  [r0]
+	movhps xmm0,  [r0+r1]
+	psadbw xmm3,  xmm0
+	paddw  xmm4,  xmm3
+
+	movq   xmm1,  [r2+r3-1]
+	movq   xmm3,  [r2+r3+1]
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movhps xmm1,  [r2-1]
+	movhps xmm3,  [r2+1]
+
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+
+	movq   xmm3,  [r2]
+	movhps xmm3,  [r2+r3]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+
+	; row pair 10-11
+	movq   xmm0,  [r0]
+	movhps xmm0,  [r0+r1]
+	psadbw xmm3,  xmm0
+	paddw  xmm4,  xmm3
+
+	movq   xmm1,  [r2+r3-1]
+	movq   xmm3,  [r2+r3+1]
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movhps xmm1,  [r2-1]
+	movhps xmm3,  [r2+1]
+
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+
+	movq   xmm3,  [r2]
+	movhps xmm3,  [r2+r3]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+
+	; row pair 12-13
+	movq   xmm0,  [r0]
+	movhps xmm0,  [r0+r1]
+	psadbw xmm3,  xmm0
+	paddw  xmm4,  xmm3
+
+	movq   xmm1,  [r2+r3-1]
+	movq   xmm3,  [r2+r3+1]
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movhps xmm1,  [r2-1]
+	movhps xmm3,  [r2+1]
+
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+
+	movq   xmm3,  [r2]
+	movhps xmm3,  [r2+r3]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+
+	; row pair 14-15
+	movq   xmm0,  [r0]
+	movhps xmm0,  [r0+r1]
+	psadbw xmm3,  xmm0
+	paddw  xmm4,  xmm3
+
+	movq   xmm1,  [r2+r3-1]
+	movq   xmm3,  [r2+r3+1]
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movhps xmm1,  [r2-1]
+	movhps xmm3,  [r2+1]
+
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+
+	movq   xmm3,  [r2]
+	movhps xmm3,  [r2+r3]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+
+	; fold partials, pack as 4 dwords -> [r4]
+	movhlps    xmm0, xmm4
+	paddw      xmm4, xmm0
+	movhlps    xmm0, xmm5
+	paddw      xmm5, xmm0
+	movhlps    xmm0, xmm6
+	paddw      xmm6, xmm0
+	movhlps    xmm0, xmm7
+	paddw      xmm7, xmm0
+	punpckldq  xmm4, xmm5
+	punpckldq  xmm6, xmm7
+	punpcklqdq xmm4, xmm6
+	movdqa     [r4],xmm4
+	POP_XMM
+	LOAD_5_PARA_POP
+	ret
+
+
+; Four-candidate SAD for an 8x8 block (same row-pair scheme as the 8x16
+; variant, four pairs instead of eight).  Args (r0..r4): src ptr/stride,
+; ref ptr/stride, out ptr.  Results packed as 4 dwords at [r4]:
+; -stride, +stride, -1, +1.
+WELS_EXTERN WelsSampleSadFour8x8_sse2
+	%assign  push_num 0
+	LOAD_5_PARA
+	PUSH_XMM 8
+	SIGN_EXTENSION r1, r1d
+	SIGN_EXTENSION r3, r3d
+	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
+	pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
+	pxor   xmm6,   xmm6    ;sad pRefMb-1
+	pxor   xmm7,   xmm7    ;sad pRefMb+1
+	; row pair 0-1
+	movq   xmm0,   [r0]
+	movhps xmm0,   [r0+r1]
+	sub    r2,    r3
+	movq   xmm3,   [r2]
+	movhps xmm3,   [r2+r3]
+	psadbw xmm3,   xmm0
+	paddw  xmm4,   xmm3
+
+	movq   xmm1,  [r2+r3-1]
+	movq   xmm3,  [r2+r3+1]
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movhps xmm1,  [r2-1]
+	movhps xmm3,  [r2+1]
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+
+	movq   xmm3,  [r2]
+	movhps xmm3,  [r2+r3]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+
+	; row pair 2-3
+	movq   xmm0,  [r0]
+	movhps xmm0,  [r0+r1]
+	psadbw xmm3,  xmm0
+	paddw  xmm4,  xmm3
+
+	movq   xmm1,  [r2+r3-1]
+	movq   xmm3,  [r2+r3+1]
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movhps xmm1,  [r2-1]
+	movhps xmm3,  [r2+1]
+
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+
+	movq   xmm3,  [r2]
+	movhps xmm3,  [r2+r3]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+
+	; row pair 4-5
+	movq   xmm0,  [r0]
+	movhps xmm0,  [r0+r1]
+	psadbw xmm3,  xmm0
+	paddw  xmm4,  xmm3
+
+	movq   xmm1,  [r2+r3-1]
+	movq   xmm3,  [r2+r3+1]
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movhps xmm1,  [r2-1]
+	movhps xmm3,  [r2+1]
+
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+
+	movq   xmm3,  [r2]
+	movhps xmm3,  [r2+r3]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+
+	; row pair 6-7
+	movq   xmm0,  [r0]
+	movhps xmm0,  [r0+r1]
+	psadbw xmm3,  xmm0
+	paddw  xmm4,  xmm3
+
+
+	movq   xmm1,  [r2+r3-1]
+	movq   xmm3,  [r2+r3+1]
+
+	lea    r0,    [r0+2*r1]
+	lea    r2,    [r2+2*r3]
+	movhps xmm1,  [r2-1]
+	movhps xmm3,  [r2+1]
+
+	psadbw xmm1,  xmm0
+	paddw  xmm6,  xmm1
+	psadbw xmm3,  xmm0
+	paddw  xmm7,  xmm3
+
+	movq   xmm3,  [r2]
+	movhps xmm3,  [r2+r3]
+	psadbw xmm0,  xmm3
+	paddw  xmm5,  xmm0
+
+	; fold partials, pack as 4 dwords -> [r4]
+	movhlps    xmm0, xmm4
+	paddw      xmm4, xmm0
+	movhlps    xmm0, xmm5
+	paddw      xmm5, xmm0
+	movhlps    xmm0, xmm6
+	paddw      xmm6, xmm0
+	movhlps    xmm0, xmm7
+	paddw      xmm7, xmm0
+	punpckldq  xmm4, xmm5
+	punpckldq  xmm6, xmm7
+	punpcklqdq xmm4, xmm6
+	movdqa     [r4],xmm4
+	POP_XMM
+	LOAD_5_PARA_POP
+	ret
+
+WELS_EXTERN WelsSampleSadFour4x4_sse2
+	%assign  push_num 0
+	LOAD_5_PARA
+	SIGN_EXTENSION r1, r1d
+	SIGN_EXTENSION r3, r3d
+	movd   xmm0,   [r0]
+	movd   xmm1,   [r0+r1]
+	lea        r0,    [r0+2*r1]
+	movd       xmm2,   [r0]
+	movd       xmm3,   [r0+r1]
+	punpckldq  xmm0, xmm1
+	punpckldq  xmm2, xmm3
+	punpcklqdq xmm0, xmm2
+	sub        r2,  r3
+	movd       xmm1, [r2]
+	movd       xmm2, [r2+r3]
+	punpckldq  xmm1, xmm2
+	movd       xmm2, [r2+r3-1]
+	movd       xmm3, [r2+r3+1]
+
+	lea        r2,  [r2+2*r3]
+
+	movd       xmm4, [r2]
+	movd       xmm5, [r2-1]
+	punpckldq  xmm2, xmm5
+	movd       xmm5, [r2+1]
+	punpckldq  xmm3, xmm5
+
+	movd       xmm5, [r2+r3]
+	punpckldq  xmm4, xmm5
+
+	punpcklqdq xmm1, xmm4 ;-L
+
+	movd       xmm5, [r2+r3-1]
+	movd       xmm6, [r2+r3+1]
+
+	lea        r2,  [r2+2*r3]
+	movd       xmm7, [r2-1]
+	punpckldq  xmm5, xmm7
+	punpcklqdq xmm2, xmm5 ;-1
+	movd       xmm7, [r2+1]
+	punpckldq  xmm6, xmm7
+	punpcklqdq xmm3, xmm6 ;+1
+	movd       xmm6, [r2]
+	movd       xmm7, [r2+r3]
+	punpckldq  xmm6, xmm7
+	punpcklqdq xmm4, xmm6 ;+L
+	psadbw     xmm1, xmm0
+	psadbw     xmm2, xmm0
+	psadbw     xmm3, xmm0
+	psadbw     xmm4, xmm0
+
+	movhlps    xmm0, xmm1
+	paddw      xmm1, xmm0
+	movhlps    xmm0, xmm2
+	paddw      xmm2, xmm0
+	movhlps    xmm0, xmm3
+	paddw      xmm3, xmm0
+	movhlps    xmm0, xmm4
+	paddw      xmm4, xmm0
+	punpckldq  xmm1, xmm4
+	punpckldq  xmm2, xmm3
+	punpcklqdq xmm1, xmm2
+	movdqa     [r4],xmm1
+	LOAD_5_PARA_POP
+	ret
+
+;***********************************************************************
+;
+;Pixel_sad_4_wxh_sse2 END
+;
+;***********************************************************************
+
+;***********************************************************************
+;   int32_t WelsSampleSad4x4_mmx (uint8_t *, int32_t, uint8_t *, int32_t )
+;***********************************************************************
+WELS_EXTERN WelsSampleSad4x4_mmx
+    %assign  push_num 0
+	LOAD_4_PARA
+	SIGN_EXTENSION r1, r1d
+	SIGN_EXTENSION r3, r3d
+	movd	  mm0, [r0]
+	movd	  mm1, [r0+r1]
+	punpckldq mm0, mm1
+
+	movd      mm3, [r2]
+	movd      mm4, [r2+r3]
+	punpckldq mm3, mm4
+	psadbw    mm0, mm3
+
+	lea       r0, [r0+2*r1]
+	lea       r2, [r2+2*r3]
+
+	movd      mm1, [r0]
+	movd      mm2, [r0+r1]
+	punpckldq mm1, mm2
+
+	movd      mm3, [r2]
+	movd      mm4, [r2+r3]
+	punpckldq mm3, mm4
+	psadbw    mm1, mm3
+	paddw     mm0, mm1
+
+    movd      retrd, mm0
+
+	WELSEMMS
+    LOAD_4_PARA_POP
+    ret
--- /dev/null
+++ b/codec/common/x86/vaa.asm
@@ -1,0 +1,411 @@
+;*!
+;* \copy
+;*     Copyright (c)  2010-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*	vaa.asm
+;*
+;*	Abstract
+;*      sse2 for pVaa routines
+;*
+;*  History
+;*      04/14/2010	Created
+;*		06/07/2010	Added AnalysisVaaInfoIntra_sse2(ssse3)
+;*		06/10/2010	Tune rc_sad_frame_sse2 and got about 40% improvement
+;*		08/11/2010	Added abs_difference_mbrow_sse2 & sum_sqrsum_mbrow_sse2
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+
+
+;***********************************************************************
+; Macros and other preprocessor constants
+;***********************************************************************
+
+; by comparing it outperforms than phaddw(SSSE3) sets
+%macro SUM_WORD_8x2_SSE2	2	; dst(pSrc), tmp
+	; @sum_8x2 begin
+	pshufd %2, %1, 04Eh	; 01001110 B
+	paddw %1, %2
+	pshuflw %2, %1, 04Eh	; 01001110 B
+	paddw %1, %2
+	pshuflw %2, %1, 0B1h	; 10110001 B
+	paddw %1, %2
+	; end of @sum_8x2
+%endmacro	; END of SUM_WORD_8x2_SSE2
+
+
+%macro VAA_AVG_BLOCK_SSE2 6 ; dst, t0, t1, t2, t3, t4
+	movdqa %1, [r0    ]	; line 0
+	movdqa %2, [r0+r1]	; line 1
+	movdqa %3, %1
+	punpcklbw %1, xmm7
+	punpckhbw %3, xmm7
+	movdqa %4, %2
+	punpcklbw %4, xmm7
+	punpckhbw %2, xmm7
+	paddw %1, %4
+	paddw %2, %3
+	movdqa %3, [r0+r2]	; line 2
+	movdqa %4, [r0+r3]	; line 3
+	movdqa %5, %3
+	punpcklbw %3, xmm7
+	punpckhbw %5, xmm7
+	movdqa %6, %4
+	punpcklbw %6, xmm7
+	punpckhbw %4, xmm7
+	paddw %3, %6
+	paddw %4, %5
+	paddw %1, %3	; block 0, 1
+	paddw %2, %4	; block 2, 3
+	pshufd %3, %1, 0B1h
+	pshufd %4, %2, 0B1h
+	paddw %1, %3
+	paddw %2, %4
+	movdqa %3, %1
+	movdqa %4, %2
+	pshuflw %5, %1, 0B1h
+	pshufhw %6, %3, 0B1h
+	paddw %1, %5
+	paddw %3, %6
+	pshuflw %5, %2, 0B1h
+	pshufhw %6, %4, 0B1h
+	paddw %2, %5
+	paddw %4, %6
+	punpcklwd %1, %2
+	punpckhwd %3, %4
+	punpcklwd %1, %3
+	psraw %1, $04
+%endmacro
+
+%macro VAA_AVG_BLOCK_SSSE3 6 ; dst, t0, t1, t2, t3, t4
+	movdqa %1, [r0    ]	; line 0
+	movdqa %2, [r0+r1]	; line 1
+	movdqa %3, %1
+	punpcklbw %1, xmm7
+	punpckhbw %3, xmm7
+	movdqa %4, %2
+	punpcklbw %4, xmm7
+	punpckhbw %2, xmm7
+	paddw %1, %4
+	paddw %2, %3
+	movdqa %3, [r0+r2]	; line 2
+	movdqa %4, [r0+r3]	; line 3
+	movdqa %5, %3
+	punpcklbw %3, xmm7
+	punpckhbw %5, xmm7
+	movdqa %6, %4
+	punpcklbw %6, xmm7
+	punpckhbw %4, xmm7
+	paddw %3, %6
+	paddw %4, %5
+	paddw %1, %3	; block 0, 1
+	paddw %2, %4	; block 2, 3
+	phaddw %1, %2	; block[0]: 0-15, 16-31; block[1]: 32-47, 48-63; ..
+	phaddw %1, xmm7	; block[0]: 0-15; block[1]: 16-31; block[2]: 32-47; block[3]: 48-63; ....
+	psraw %1, $04
+%endmacro
+
+
+
+;***********************************************************************
+; Code
+;***********************************************************************
+
+SECTION .text
+
+; , 6/7/2010
+
+;***********************************************************************
+;	int32_t AnalysisVaaInfoIntra_sse2(	uint8_t *pDataY, const int32_t iLineSize );
+;***********************************************************************
+WELS_EXTERN AnalysisVaaInfoIntra_sse2
+
+    %assign push_num 0
+    LOAD_2_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r1,r1d
+
+%ifdef X86_32
+    push r3
+    push r4
+    push r5
+    push r6
+    %assign push_num push_num+4
+%endif
+
+    mov  r5,r7
+    and  r5,0fh
+    sub  r7,r5
+    sub  r7,32
+
+
+    mov r2,r1
+    sal r2,$01   ;r2 = 2*iLineSize
+    mov r3,r2
+    add r3,r1   ;r3 = 3*iLineSize
+
+    mov r4,r2
+    sal r4,$01   ;r4 = 4*iLineSize
+
+	pxor xmm7, xmm7
+
+	; loops
+	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+	movq [r7], xmm0
+
+	lea r0, [r0+r4]
+	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+	movq [r7+8], xmm0
+
+	lea r0, [r0+r4]
+	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+	movq [r7+16], xmm0
+
+	lea r0, [r0+r4]
+	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+	movq [r7+24], xmm0
+
+	movdqa xmm0, [r7]		; block 0~7
+	movdqa xmm1, [r7+16]	; block 8~15
+	movdqa xmm2, xmm0
+	paddw xmm0, xmm1
+	SUM_WORD_8x2_SSE2 xmm0, xmm3
+
+	pmullw xmm1, xmm1
+	pmullw xmm2, xmm2
+	movdqa xmm3, xmm1
+	movdqa xmm4, xmm2
+	punpcklwd xmm1, xmm7
+	punpckhwd xmm3, xmm7
+	punpcklwd xmm2, xmm7
+	punpckhwd xmm4, xmm7
+	paddd xmm1, xmm2
+	paddd xmm3, xmm4
+	paddd xmm1, xmm3
+	pshufd xmm2, xmm1, 01Bh
+	paddd xmm1, xmm2
+	pshufd xmm2, xmm1, 0B1h
+	paddd xmm1, xmm2
+
+
+
+	movd r2d, xmm0
+	and r2, 0ffffh		; effective low work truncated
+	mov r3, r2
+	imul r2, r3
+	sar r2, $04
+	movd retrd, xmm1
+	sub retrd, r2d
+
+	add r7,32
+	add r7,r5
+
+%ifdef X86_32
+	pop r6
+	pop r5
+	pop r4
+	pop r3
+%endif
+	POP_XMM
+
+	ret
+
+;***********************************************************************
+;	int32_t AnalysisVaaInfoIntra_ssse3(	uint8_t *pDataY, const int32_t iLineSize );
+;***********************************************************************
+WELS_EXTERN AnalysisVaaInfoIntra_ssse3
+
+    %assign push_num 0
+    LOAD_2_PARA
+    PUSH_XMM 8
+    SIGN_EXTENSION r1,r1d
+
+%ifdef X86_32
+    push r3
+    push r4
+    push r5
+    push r6
+    %assign push_num push_num+4
+%endif
+
+    mov  r5,r7
+    and  r5,0fh
+    sub  r7,r5
+    sub  r7,32
+
+
+    mov r2,r1
+    sal r2,$01   ;r2 = 2*iLineSize
+    mov r3,r2
+    add r3,r1   ;r3 = 3*iLineSize
+
+    mov r4,r2
+    sal r4,$01   ;r4 = 4*iLineSize
+
+	pxor xmm7, xmm7
+
+	; loops
+	VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+    movq [r7],xmm0
+
+	lea r0,[r0+r4]
+	VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
+    movq [r7+8],xmm1
+
+
+	lea r0,[r0+r4]
+	VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+    movq [r7+16],xmm0
+
+	lea r0,[r0+r4]
+	VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
+    movq [r7+24],xmm1
+
+
+	movdqa xmm0,[r7]
+	movdqa xmm1,[r7+16]
+	movdqa xmm2, xmm0
+	paddw xmm0, xmm1
+	SUM_WORD_8x2_SSE2 xmm0, xmm3	; better performance than that of phaddw sets
+
+	pmullw xmm1, xmm1
+	pmullw xmm2, xmm2
+	movdqa xmm3, xmm1
+	movdqa xmm4, xmm2
+	punpcklwd xmm1, xmm7
+	punpckhwd xmm3, xmm7
+	punpcklwd xmm2, xmm7
+	punpckhwd xmm4, xmm7
+	paddd xmm1, xmm2
+	paddd xmm3, xmm4
+	paddd xmm1, xmm3
+	pshufd xmm2, xmm1, 01Bh
+	paddd xmm1, xmm2
+	pshufd xmm2, xmm1, 0B1h
+	paddd xmm1, xmm2
+
+
+    movd r2d, xmm0
+    and r2, 0ffffh          ; effective low work truncated
+    mov r3, r2
+    imul r2, r3
+    sar r2, $04
+    movd retrd, xmm1
+	sub retrd, r2d
+
+	add r7,32
+	add r7,r5
+%ifdef X86_32
+	pop r6
+	pop r5
+	pop r4
+	pop r3
+%endif
+	POP_XMM
+
+	ret
+
+;***********************************************************************
+;	uint8_t MdInterAnalysisVaaInfo_sse41( int32_t *pSad8x8 )
+;***********************************************************************
+WELS_EXTERN MdInterAnalysisVaaInfo_sse41
+	%assign push_num 0
+	LOAD_1_PARA
+	movdqa xmm0,[r0]
+	pshufd xmm1, xmm0, 01Bh
+	paddd xmm1, xmm0
+	pshufd xmm2, xmm1, 0B1h
+	paddd xmm1, xmm2
+	psrad xmm1, 02h		; iAverageSad
+	movdqa xmm2, xmm1
+	psrad xmm2, 06h
+	movdqa xmm3, xmm0	; iSadBlock
+	psrad xmm3, 06h
+	psubd xmm3, xmm2
+	pmulld xmm3, xmm3	; [comment]: pmulld from SSE4.1 instruction sets
+	pshufd xmm4, xmm3, 01Bh
+	paddd xmm4, xmm3
+	pshufd xmm3, xmm4, 0B1h
+	paddd xmm3, xmm4
+	movd r0d, xmm3
+	cmp r0d, 20	; INTER_VARIANCE_SAD_THRESHOLD
+
+	jb near .threshold_exit
+	pshufd xmm0, xmm0, 01Bh
+	pcmpgtd xmm0, xmm1	; iSadBlock > iAverageSad
+	movmskps retrd, xmm0
+	ret
+.threshold_exit:
+	mov retrd, 15
+	ret
+
+;***********************************************************************
+;	uint8_t MdInterAnalysisVaaInfo_sse2( int32_t *pSad8x8 )
+;***********************************************************************
+WELS_EXTERN MdInterAnalysisVaaInfo_sse2
+	%assign push_num 0
+	LOAD_1_PARA
+	movdqa xmm0, [r0]
+	pshufd xmm1, xmm0, 01Bh
+	paddd xmm1, xmm0
+	pshufd xmm2, xmm1, 0B1h
+	paddd xmm1, xmm2
+	psrad xmm1, 02h		; iAverageSad
+	movdqa xmm2, xmm1
+	psrad xmm2, 06h
+	movdqa xmm3, xmm0	; iSadBlock
+	psrad xmm3, 06h
+	psubd xmm3, xmm2
+
+	; to replace pmulld functionality as below
+	movdqa xmm2, xmm3
+	pmuludq xmm2, xmm3
+	pshufd xmm4, xmm3, 0B1h
+	pmuludq xmm4, xmm4
+	movdqa xmm5, xmm2
+	punpckldq xmm5, xmm4
+	punpckhdq xmm2, xmm4
+	punpcklqdq xmm5, xmm2
+
+	pshufd xmm4, xmm5, 01Bh
+	paddd xmm4, xmm5
+	pshufd xmm5, xmm4, 0B1h
+	paddd xmm5, xmm4
+
+	movd r0d, xmm5
+	cmp r0d, 20	; INTER_VARIANCE_SAD_THRESHOLD
+	jb near .threshold_exit
+	pshufd xmm0, xmm0, 01Bh
+	pcmpgtd xmm0, xmm1	; iSadBlock > iAverageSad
+	movmskps retrd, xmm0
+	ret
+.threshold_exit:
+	mov retrd, 15
+	ret
--- a/codec/processing/build/win32/WelsVP_2008.vcproj
+++ b/codec/processing/build/win32/WelsVP_2008.vcproj
@@ -52,7 +52,7 @@
 			<Tool
 				Name="VCCLCompilerTool"
 				Optimization="0"
-				AdditionalIncludeDirectories="../../../common/;../../interface;../../src/common"
+				AdditionalIncludeDirectories="../../../common/inc;../../interface;../../src/common"
 				PreprocessorDefinitions="WIN32;_DEBUG;_WINDOWS;_USRDLL;WELSVP_EXPORTS;X86_ASM"
 				MinimalRebuild="true"
 				BasicRuntimeChecks="3"
@@ -137,7 +137,7 @@
 			<Tool
 				Name="VCCLCompilerTool"
 				Optimization="0"
-				AdditionalIncludeDirectories="../../../common/;../../interface;../../src/common"
+				AdditionalIncludeDirectories="../../../common/inc;../../interface;../../src/common"
 				PreprocessorDefinitions="WIN64;_DEBUG;_WINDOWS;_USRDLL;WELSVP_EXPORTS;X86_ASM"
 				MinimalRebuild="true"
 				BasicRuntimeChecks="3"
@@ -224,7 +224,7 @@
 				Optimization="3"
 				EnableIntrinsicFunctions="false"
 				FavorSizeOrSpeed="1"
-				AdditionalIncludeDirectories="../../../common/;../../interface;../../src/common"
+				AdditionalIncludeDirectories="../../../common/inc/;../../interface;../../src/common"
 				PreprocessorDefinitions="WIN32;NDEBUG;_WINDOWS;_USRDLL;WELSVP_EXPORTS;X86_ASM"
 				RuntimeLibrary="0"
 				EnableFunctionLevelLinking="false"
@@ -314,7 +314,7 @@
 				Optimization="3"
 				EnableIntrinsicFunctions="false"
 				FavorSizeOrSpeed="1"
-				AdditionalIncludeDirectories="../../../common/;../../interface;../../src/common"
+				AdditionalIncludeDirectories="../../../common/inc/;../../interface;../../src/common"
 				PreprocessorDefinitions="WIN64;NDEBUG;_WINDOWS;_USRDLL;WELSVP_EXPORTS;X86_ASM"
 				RuntimeLibrary="0"
 				EnableFunctionLevelLinking="false"
@@ -380,7 +380,7 @@
 			UniqueIdentifier="{4FC737F1-C7A5-4376-A066-2A32D752A2FF}"
 			>
 			<File
-				RelativePath="..\..\..\common\cpu.cpp"
+				RelativePath="..\..\..\common\src\cpu.cpp"
 				>
 			</File>
 			<File
@@ -388,7 +388,7 @@
 				>
 			</File>
 			<File
-				RelativePath="..\..\..\common\WelsThreadLib.cpp"
+				RelativePath="..\..\..\common\src\WelsThreadLib.cpp"
 				>
 			</File>
 			<File
@@ -446,7 +446,7 @@
 			Name="Header Files"
 			>
 			<File
-				RelativePath="..\..\..\common\cpu.h"
+				RelativePath="..\..\..\common\inc\cpu.h"
 				>
 			</File>
 			<File
@@ -454,7 +454,7 @@
 				>
 			</File>
 			<File
-				RelativePath="..\..\..\common\WelsThreadLib.h"
+				RelativePath="..\..\..\common\inc\WelsThreadLib.h"
 				>
 			</File>
 			<File
@@ -474,7 +474,7 @@
 			Name="ASM"
 			>
 			<File
-				RelativePath="..\..\..\common\cpuid.asm"
+				RelativePath="..\..\..\common\x86\cpuid.asm"
 				>
 				<FileConfiguration
 					Name="Debug|Win32"
@@ -490,7 +490,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -508,7 +508,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -521,7 +521,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX  -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX  -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -530,7 +530,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -539,7 +539,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX  -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX  -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -548,7 +548,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -561,7 +561,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -570,7 +570,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -579,7 +579,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -588,13 +588,13 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
 			</File>
 			<File
-				RelativePath="..\..\..\common\satd_sad.asm"
+				RelativePath="..\..\..\common\x86\satd_sad.asm"
 				>
 				<FileConfiguration
 					Name="Debug|Win32"
@@ -601,7 +601,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX  -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX  -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -610,7 +610,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -619,7 +619,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX  -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX  -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -628,7 +628,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -641,7 +641,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -650,7 +650,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -659,7 +659,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>
@@ -668,7 +668,7 @@
 					>
 					<Tool
 						Name="VCCustomBuildTool"
-						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
+						CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/x86/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)&#x0D;&#x0A;"
 						Outputs="$(IntDir)\$(InputName).obj"
 					/>
 				</FileConfiguration>