ref: ec84f4bcc90d6aa447860eba8235420edc79e41f
parent: 3958118bf03c92aa547dbe3c77c5557ed4ad944b
author: volvet <[email protected]>
date: Fri Jan 3 09:49:45 EST 2014
resolve conflict: build files enable x86 asm for 64-bit targets and reference codec/processing
--- a/Makefile
+++ b/Makefile
@@ -4,6 +4,7 @@
CP=cp
ROOTDIR=$(PWD)
+
ifeq (,$(wildcard ./gtest))
HAVE_GTEST=No
else
@@ -13,20 +14,22 @@
# Configurations
ifeq ($(BUILDTYPE), Release)
CFLAGS += -O3
-ifneq ($(ENABLE64BIT), Yes)
USE_ASM = Yes
-endif
else
CFLAGS = -g
USE_ASM = No
endif
+# ENABLE64BIT picks -m64/-m32 for the C and link flags plus the matching asm define.
ifeq ($(ENABLE64BIT), Yes)
CFLAGS += -m64
LDFLAGS += -m64
+ASMFLAGS += -DUNIX64
else
CFLAGS += -m32
LDFLAGS += -m32
+ASMFLAGS += -DX86_32
endif
+
include build/platform-$(UNAME).mk
ifeq ($(USE_ASM),Yes)
@@ -40,7 +43,8 @@
#### No user-serviceable parts below this line
INCLUDES = -Icodec/api/svc -Icodec/common -Igtest/include
-ASM_INCLUDES = -Iprocessing/src/asm/
+# Assembler include path; now codec/common/ (was processing/src/asm/).
+ASM_INCLUDES = -Icodec/common/
COMMON_INCLUDES = \
-Icodec/decoder/core/inc
@@ -83,7 +87,7 @@
include codec/common/targets.mk
include codec/decoder/targets.mk
include codec/encoder/targets.mk
-include processing/targets.mk
+include codec/processing/targets.mk
include codec/console/dec/targets.mk
include codec/console/enc/targets.mk
--- a/build/mktargets.sh
+++ b/build/mktargets.sh
@@ -2,7 +2,7 @@
(cd codec/decoder; python ../../build/mktargets.py --directory codec/decoder --library decoder --exclude StdAfx.cpp)
(cd codec/encoder; python ../../build/mktargets.py --directory codec/encoder --library encoder --exclude DllEntry.cpp)
(cd codec/common; python ../../build/mktargets.py --directory codec/common --library common)
-(cd processing; python ../build/mktargets.py --directory processing --library processing --exclude wels_process.cpp --exclude WelsVideoProcessor.cpp)
+(cd codec/processing && python ../../build/mktargets.py --directory codec/processing --library processing --exclude wels_process.cpp --exclude WelsVideoProcessor.cpp) # '&&': do not run the generator if the cd fails
(cd codec/console/dec; python ../../../build/mktargets.py --directory codec/console/dec --binary h264dec --exclude dec_console.h --exclude load_bundle_functions.cpp)
(cd codec/console/enc; python ../../../build/mktargets.py --directory codec/console/enc --binary h264enc --exclude enc_console.h --exclude bundlewelsenc.cpp)
--- a/build/platform-darwin.mk
+++ b/build/platform-darwin.mk
@@ -1,5 +1,11 @@
-USE_ASM = No # We don't have ASM working on Mac yet
+# Darwin platform settings: nasm assembler plus compiler, linker and assembler flags.
ASM = nasm
CFLAGS += -Werror -fPIC
LDFLAGS += -lpthread
-ASMFLAGS += -f macho --prefix _ -DNOPREFIX
+ASMFLAGS += --prefix _ -DNOPREFIX
+ifeq ($(ENABLE64BIT), Yes) # nasm output format follows the word size: macho64 if 64-bit, otherwise macho
+ASMFLAGS += -f macho64
+else
+ASMFLAGS += -f macho
+endif
+
--- a/build/platform-linux.mk
+++ b/build/platform-linux.mk
@@ -1,5 +1,10 @@
ASM = nasm
CFLAGS += -Werror -fPIC -DLINUX -D__NO_CTYPE
LDFLAGS += -lpthread
-ASMFLAGS += -f elf -DNOPREFIX
+ASMFLAGS += -DNOPREFIX
+ifeq ($(ENABLE64BIT), Yes) # nasm output format follows the word size: elf64 if 64-bit, otherwise elf32
+ASMFLAGS += -f elf64
+else
+ASMFLAGS += -f elf32
+endif
--- a/codec/build/win32/dec/WelsDecCore.vcproj
+++ b/codec/build/win32/dec/WelsDecCore.vcproj
@@ -349,44 +349,6 @@
Filter="*.asm;*.inc"
>
<File
- RelativePath="..\..\..\decoder\core\asm\asm_inc.asm"
- >
- <FileConfiguration
- Name="Release|Win32"
- >
- <Tool
- Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
- Outputs="$(IntDir)\$(InputName).obj"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Release|x64"
- ExcludedFromBuild="true"
- >
- <Tool
- Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
- Outputs="$(IntDir)\$(InputName).obj"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Debug|Win32"
- >
- <Tool
- Name="VCCustomBuildTool"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Debug|x64"
- ExcludedFromBuild="true"
- >
- <Tool
- Name="VCCustomBuildTool"
- />
- </FileConfiguration>
- </File>
- <File
RelativePath="..\..\..\decoder\core\asm\block_add.asm"
>
<FileConfiguration
@@ -394,17 +356,16 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
Name="Release|x64"
- ExcludedFromBuild="true"
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -413,23 +374,22 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
Name="Debug|x64"
- ExcludedFromBuild="true"
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
</File>
<File
- RelativePath="..\..\..\decoder\core\asm\cpuid.asm"
+ RelativePath="..\..\..\common\cpuid.asm"
>
<FileConfiguration
Name="Release|Win32"
@@ -436,17 +396,16 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
Name="Release|x64"
- ExcludedFromBuild="true"
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -455,17 +414,16 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
Name="Debug|x64"
- ExcludedFromBuild="true"
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -478,17 +436,16 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
Name="Release|x64"
- ExcludedFromBuild="true"
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -497,23 +454,22 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
Name="Debug|x64"
- ExcludedFromBuild="true"
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
</File>
<File
- RelativePath="..\..\..\decoder\core\asm\deblock.asm"
+ RelativePath="..\..\..\common\deblock.asm"
>
<FileConfiguration
Name="Release|Win32"
@@ -520,17 +476,16 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
Name="Release|x64"
- ExcludedFromBuild="true"
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -539,23 +494,22 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
Name="Debug|x64"
- ExcludedFromBuild="true"
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
</File>
<File
- RelativePath="..\..\..\decoder\core\asm\expand_picture.asm"
+ RelativePath="..\..\..\common\expand_picture.asm"
>
<FileConfiguration
Name="Release|Win32"
@@ -562,17 +516,16 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
Name="Release|x64"
- ExcludedFromBuild="true"
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -581,17 +534,16 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
Name="Debug|x64"
- ExcludedFromBuild="true"
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -604,17 +556,16 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
Name="Release|x64"
- ExcludedFromBuild="true"
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -623,23 +574,22 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
Name="Debug|x64"
- ExcludedFromBuild="true"
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
</File>
<File
- RelativePath="..\..\..\decoder\core\asm\mb_copy.asm"
+ RelativePath="..\..\..\common\mb_copy.asm"
>
<FileConfiguration
Name="Release|Win32"
@@ -646,17 +596,16 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
Name="Release|x64"
- ExcludedFromBuild="true"
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -665,23 +614,22 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
Name="Debug|x64"
- ExcludedFromBuild="true"
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
</File>
<File
- RelativePath="..\..\..\decoder\core\asm\mc_chroma.asm"
+ RelativePath="..\..\..\common\mc_chroma.asm"
>
<FileConfiguration
Name="Release|Win32"
@@ -688,17 +636,16 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
Name="Release|x64"
- ExcludedFromBuild="true"
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -707,23 +654,22 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
Name="Debug|x64"
- ExcludedFromBuild="true"
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
</File>
<File
- RelativePath="..\..\..\decoder\core\asm\mc_luma.asm"
+ RelativePath="..\..\..\common\mc_luma.asm"
>
<FileConfiguration
Name="Release|Win32"
@@ -730,17 +676,16 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
Name="Release|x64"
- ExcludedFromBuild="true"
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -749,59 +694,16 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
Name="Debug|x64"
- ExcludedFromBuild="true"
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
- Outputs="$(IntDir)\$(InputName).obj"
- />
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\..\..\decoder\core\asm\memzero.asm"
- >
- <FileConfiguration
- Name="Release|Win32"
- >
- <Tool
- Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
- Outputs="$(IntDir)\$(InputName).obj"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Release|x64"
- ExcludedFromBuild="true"
- >
- <Tool
- Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
- Outputs="$(IntDir)\$(InputName).obj"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Debug|Win32"
- >
- <Tool
- Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
- Outputs="$(IntDir)\$(InputName).obj"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Debug|x64"
- ExcludedFromBuild="true"
- >
- <Tool
- Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
--- a/codec/build/win32/dec/WelsDecCore_2010.vcxproj
+++ b/codec/build/win32/dec/WelsDecCore_2010.vcxproj
@@ -94,8 +94,8 @@
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<InlineFunctionExpansion>OnlyExplicitInline</InlineFunctionExpansion>
- <AdditionalIncludeDirectories>..\..\..\decoder\core\inc;..\..\..\common\inc;..\..\..\api\svc;..\..\..\hwDecoder\core\inc;..\..\..\hwDecoder\dxva\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
- <PreprocessorDefinitions>WIN32;NDEBUG;_LIB;HAVE_CACHE_LINE_ALIGN;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <AdditionalIncludeDirectories>..\..\..\decoder\core\inc;..\..\..\common;..\..\..\api\svc;..\..\..\hwDecoder\core\inc;..\..\..\hwDecoder\dxva\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+ <PreprocessorDefinitions>WIN32;NDEBUG;X86_ASM;_LIB;HAVE_CACHE_LINE_ALIGN;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<StringPooling>true</StringPooling>
<RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
<FunctionLevelLinking>true</FunctionLevelLinking>
@@ -125,8 +125,8 @@
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<InlineFunctionExpansion>OnlyExplicitInline</InlineFunctionExpansion>
- <AdditionalIncludeDirectories>..\..\..\decoder\core\inc;..\..\..\common\inc;..\..\..\api\svc;..\..\..\hwDecoder\core\inc;..\..\..\hwDecoder\dxva\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
- <PreprocessorDefinitions>WIN64;NDEBUG;_LIB;HAVE_CACHE_LINE_ALIGN;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <AdditionalIncludeDirectories>..\..\..\decoder\core\inc;..\..\..\common;..\..\..\api\svc;..\..\..\hwDecoder\core\inc;..\..\..\hwDecoder\dxva\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+ <PreprocessorDefinitions>WIN64;NDEBUG;X86_ASM;_LIB;HAVE_CACHE_LINE_ALIGN;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<StringPooling>true</StringPooling>
<RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
<FunctionLevelLinking>true</FunctionLevelLinking>
@@ -151,11 +151,15 @@
<SuppressStartupBanner>true</SuppressStartupBanner>
<OutputFile>$(OutDir)\WelsDecCore.bsc</OutputFile>
</Bscmake>
+ <CustomBuild>
+ <Outputs>$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Command>nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64 -DWIN64 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ </CustomBuild>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<ClCompile>
<Optimization>Disabled</Optimization>
- <AdditionalIncludeDirectories>..\..\..\decoder\core\inc;..\..\..\common\inc;..\..\..\api\svc;..\..\..\hwDecoder\core\inc;..\..\..\hwDecoder\dxva\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+ <AdditionalIncludeDirectories>..\..\..\decoder\core\inc;..\..\..\common;..\..\..\api\svc;..\..\..\hwDecoder\core\inc;..\..\..\hwDecoder\dxva\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<PreprocessorDefinitions>WIN32;_DEBUG;_LIB;X86_ASM;HAVE_CACHE_LINE_ALIGN;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<MinimalRebuild>true</MinimalRebuild>
<BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
@@ -184,7 +188,7 @@
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<ClCompile>
<Optimization>Disabled</Optimization>
- <AdditionalIncludeDirectories>..\..\..\decoder\core\inc;..\..\..\common\inc;..\..\..\api\svc;..\..\..\hwDecoder\core\inc;..\..\..\hwDecoder\dxva\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+ <AdditionalIncludeDirectories>..\..\..\decoder\core\inc;..\..\..\common;..\..\..\api\svc;..\..\..\hwDecoder\core\inc;..\..\..\hwDecoder\dxva\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<PreprocessorDefinitions>WIN64;_DEBUG;_LIB;HAVE_CACHE_LINE_ALIGN;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
<RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
@@ -208,176 +212,45 @@
<SuppressStartupBanner>true</SuppressStartupBanner>
<OutputFile>$(OutDir)\WelsDecCore.bsc</OutputFile>
</Bscmake>
+ <CustomBuild> <!-- Default NASM rule for CustomBuild items covered by this ItemDefinitionGroup: each item is assembled to $(IntDir)%(Filename).obj. -->
+ <Command>nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Outputs>$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ </CustomBuild> <!-- win64 default carries no -DPREFIX, matching the per-file Release|x64 commands. NOTE(review): presumably PREFIX makes asm_inc.asm prepend '_' to exported symbols; confirm. -->
</ItemDefinitionGroup>
<ItemGroup>
- <CustomBuild Include="..\..\..\decoder\core\asm\asm_inc.asm">
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
- </CustomBuild>
<CustomBuild Include="..\..\..\decoder\core\asm\block_add.asm">
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command> <!-- win64: no -DPREFIX, same as this item's Release|x64 command. NOTE(review): presumably PREFIX adds a leading '_' to symbols; confirm in asm_inc.asm. -->
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</CustomBuild>
- <CustomBuild Include="..\..\..\decoder\core\asm\cpuid.asm">
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
- </CustomBuild>
<CustomBuild Include="..\..\..\decoder\core\asm\dct.asm">
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command> <!-- win64: no -DPREFIX, same as this item's Release|x64 command. NOTE(review): presumably PREFIX adds a leading '_' to symbols; confirm in asm_inc.asm. -->
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</CustomBuild>
- <CustomBuild Include="..\..\..\decoder\core\asm\deblock.asm">
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
- </CustomBuild>
- <CustomBuild Include="..\..\..\decoder\core\asm\expand_picture.asm">
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
- </CustomBuild>
<CustomBuild Include="..\..\..\decoder\core\asm\intra_pred.asm">
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command> <!-- win64: no -DPREFIX, same as this item's Release|x64 command. NOTE(review): presumably PREFIX adds a leading '_' to symbols; confirm in asm_inc.asm. -->
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</CustomBuild>
- <CustomBuild Include="..\..\..\decoder\core\asm\mb_copy.asm">
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
- </CustomBuild>
- <CustomBuild Include="..\..\..\decoder\core\asm\mc_chroma.asm">
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
- </CustomBuild>
- <CustomBuild Include="..\..\..\decoder\core\asm\mc_luma.asm">
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
- </CustomBuild>
- <CustomBuild Include="..\..\..\decoder\core\asm\memzero.asm">
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
- </CustomBuild>
</ItemGroup>
<ItemGroup>
+ <ClInclude Include="..\..\..\common\logging.h" />
<ClInclude Include="..\..\..\decoder\core\inc\as264_common.h" />
<ClInclude Include="..\..\..\decoder\core\inc\au_parser.h" />
<ClInclude Include="..\..\..\decoder\core\inc\bit_stream.h" />
@@ -419,6 +292,7 @@
<ClInclude Include="..\..\..\decoder\core\inc\wels_const.h" />
</ItemGroup>
<ItemGroup>
+ <ClCompile Include="..\..\..\common\logging.cpp" />
<ClCompile Include="..\..\..\decoder\core\src\au_parser.cpp" />
<ClCompile Include="..\..\..\decoder\core\src\bit_stream.cpp" />
<ClCompile Include="..\..\..\decoder\core\src\cpu.cpp" />
@@ -440,6 +314,68 @@
<ClCompile Include="..\..\..\decoder\core\src\decode_slice.cpp" />
<ClCompile Include="..\..\..\decoder\core\src\decoder_core.cpp" />
<ClCompile Include="..\..\..\decoder\core\src\utils.cpp" />
+ </ItemGroup>
+ <ItemGroup>
+ <CustomBuild Include="..\..\..\common\cpuid.asm"> <!-- cpuid.asm from ..\..\..\common, assembled with NASM; one Command per configuration/platform. -->
+ <FileType>Document</FileType>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command> <!-- win64: no -DPREFIX, same as Release|x64 above. NOTE(review): presumably PREFIX adds a leading '_' to symbols; confirm in asm_inc.asm. -->
+ </CustomBuild>
+ <CustomBuild Include="..\..\..\common\deblock.asm"> <!-- deblock.asm from ..\..\..\common, assembled with NASM; one Command per configuration/platform. -->
+ <FileType>Document</FileType>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command> <!-- win64: no -DPREFIX, same as Release|x64 above. NOTE(review): presumably PREFIX adds a leading '_' to symbols; confirm in asm_inc.asm. -->
+ </CustomBuild>
+ <CustomBuild Include="..\..\..\common\expand_picture.asm"> <!-- expand_picture.asm from ..\..\..\common, assembled with NASM; one Command per configuration/platform. -->
+ <FileType>Document</FileType>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command> <!-- win64: no -DPREFIX, same as Release|x64 above. NOTE(review): presumably PREFIX adds a leading '_' to symbols; confirm in asm_inc.asm. -->
+ </CustomBuild>
+ <CustomBuild Include="..\..\..\common\mb_copy.asm"> <!-- mb_copy.asm from ..\..\..\common, assembled with NASM; one Command per configuration/platform. -->
+ <FileType>Document</FileType>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command> <!-- win64: no -DPREFIX, same as Release|x64 above. NOTE(review): presumably PREFIX adds a leading '_' to symbols; confirm in asm_inc.asm. -->
+ </CustomBuild>
+ <CustomBuild Include="..\..\..\common\mc_chroma.asm"> <!-- mc_chroma.asm from ..\..\..\common, assembled with NASM; one Command per configuration/platform. -->
+ <FileType>Document</FileType>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command> <!-- win64: no -DPREFIX, same as Release|x64 above. NOTE(review): presumably PREFIX adds a leading '_' to symbols; confirm in asm_inc.asm. -->
+ </CustomBuild>
+ <CustomBuild Include="..\..\..\common\mc_luma.asm"> <!-- mc_luma.asm from ..\..\..\common, assembled with NASM; one Command per configuration/platform. -->
+ <FileType>Document</FileType>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command> <!-- win64: no -DPREFIX, same as Release|x64 above. NOTE(review): presumably PREFIX adds a leading '_' to symbols; confirm in asm_inc.asm. -->
+ </CustomBuild>
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
--- a/codec/build/win32/dec/WelsDecCore_2010.vcxproj.filters
+++ b/codec/build/win32/dec/WelsDecCore_2010.vcxproj.filters
@@ -64,6 +64,9 @@
<ClCompile Include="..\..\..\decoder\core\src\utils.cpp">
<Filter>sources</Filter>
</ClCompile>
+ <ClCompile Include="..\..\..\common\logging.cpp">
+ <Filter>sources</Filter>
+ </ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\..\..\decoder\core\inc\as264_common.h">
@@ -183,39 +186,36 @@
<ClInclude Include="..\..\..\decoder\core\inc\wels_common_basis.h">
<Filter>headers</Filter>
</ClInclude>
+ <ClInclude Include="..\..\..\common\logging.h">
+ <Filter>headers</Filter>
+ </ClInclude>
</ItemGroup>
<ItemGroup>
- <CustomBuild Include="..\..\..\decoder\core\asm\asm_inc.asm">
- <Filter>ASM</Filter>
- </CustomBuild>
<CustomBuild Include="..\..\..\decoder\core\asm\block_add.asm">
<Filter>ASM</Filter>
</CustomBuild>
- <CustomBuild Include="..\..\..\decoder\core\asm\cpuid.asm">
- <Filter>ASM</Filter>
- </CustomBuild>
<CustomBuild Include="..\..\..\decoder\core\asm\dct.asm">
<Filter>ASM</Filter>
</CustomBuild>
- <CustomBuild Include="..\..\..\decoder\core\asm\deblock.asm">
+ <CustomBuild Include="..\..\..\decoder\core\asm\intra_pred.asm">
<Filter>ASM</Filter>
</CustomBuild>
- <CustomBuild Include="..\..\..\decoder\core\asm\expand_picture.asm">
+ <CustomBuild Include="..\..\..\common\mc_luma.asm">
<Filter>ASM</Filter>
</CustomBuild>
- <CustomBuild Include="..\..\..\decoder\core\asm\intra_pred.asm">
+ <CustomBuild Include="..\..\..\common\mc_chroma.asm">
<Filter>ASM</Filter>
</CustomBuild>
- <CustomBuild Include="..\..\..\decoder\core\asm\mb_copy.asm">
+ <CustomBuild Include="..\..\..\common\mb_copy.asm">
<Filter>ASM</Filter>
</CustomBuild>
- <CustomBuild Include="..\..\..\decoder\core\asm\mc_chroma.asm">
+ <CustomBuild Include="..\..\..\common\expand_picture.asm">
<Filter>ASM</Filter>
</CustomBuild>
- <CustomBuild Include="..\..\..\decoder\core\asm\mc_luma.asm">
+ <CustomBuild Include="..\..\..\common\deblock.asm">
<Filter>ASM</Filter>
</CustomBuild>
- <CustomBuild Include="..\..\..\decoder\core\asm\memzero.asm">
+ <CustomBuild Include="..\..\..\common\cpuid.asm">
<Filter>ASM</Filter>
</CustomBuild>
</ItemGroup>
--- a/codec/build/win32/dec/WelsDecPlus_2010.vcxproj
+++ b/codec/build/win32/dec/WelsDecPlus_2010.vcxproj
@@ -107,7 +107,7 @@
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<InlineFunctionExpansion>OnlyExplicitInline</InlineFunctionExpansion>
- <AdditionalIncludeDirectories>..\..\..\decoder\plus\inc;..\..\..\decoder\core\inc;..\..\..\api\svc;..\..\..\common;..\..\..\hwDecoder\plus\inc;..\..\..\hwDecoder\core\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+ <AdditionalIncludeDirectories>..\..\..\common;..\..\..\decoder\plus\inc;..\..\..\decoder\core\inc;..\..\..\api\svc;..\..\..\hwDecoder\plus\inc;..\..\..\hwDecoder\core\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> <!-- ..\..\..\common is listed once, at the front of the include path. -->
<PreprocessorDefinitions>WIN32;NDEBUG;_WINDOWS;_USRDLL;WELSDECPLUS_EXPORTS;HAVE_CACHE_LINE_ALIGN;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<StringPooling>true</StringPooling>
<RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
@@ -156,7 +156,7 @@
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<InlineFunctionExpansion>OnlyExplicitInline</InlineFunctionExpansion>
- <AdditionalIncludeDirectories>..\..\..\decoder\plus\inc;..\..\..\decoder\core\inc;..\..\..\api\svc;..\..\..\common;..\..\..\hwDecoder\plus\inc;..\..\..\hwDecoder\core\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+ <AdditionalIncludeDirectories>..\..\..\common;..\..\..\decoder\plus\inc;..\..\..\decoder\core\inc;..\..\..\api\svc;..\..\..\hwDecoder\plus\inc;..\..\..\hwDecoder\core\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> <!-- ..\..\..\common is listed once, at the front of the include path. -->
<PreprocessorDefinitions>WIN32;NDEBUG;_WINDOWS;_USRDLL;WELSDECPLUS_EXPORTS;HAVE_CACHE_LINE_ALIGN;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<StringPooling>true</StringPooling>
<RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
@@ -204,7 +204,7 @@
</Midl>
<ClCompile>
<Optimization>Disabled</Optimization>
- <AdditionalIncludeDirectories>..\..\..\decoder\plus\inc;..\..\..\decoder\core\inc;..\..\..\api\svc;..\..\..\common;..\..\..\hwDecoder\plus\inc;..\..\..\hwDecoder\core\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+ <AdditionalIncludeDirectories>..\..\..\common;..\..\..\decoder\plus\inc;..\..\..\decoder\core\inc;..\..\..\api\svc;..\..\..\hwDecoder\plus\inc;..\..\..\hwDecoder\core\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> <!-- ..\..\..\common is listed once, at the front of the include path. -->
<PreprocessorDefinitions>WIN32;_DEBUG;_WINDOWS;_USRDLL;WELSDECPLUS_EXPORTS;HAVE_CACHE_LINE_ALIGN;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<MinimalRebuild>true</MinimalRebuild>
<BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
--- a/codec/build/win32/dec/decConsole_2010.vcxproj
+++ b/codec/build/win32/dec/decConsole_2010.vcxproj
@@ -102,7 +102,7 @@
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<InlineFunctionExpansion>OnlyExplicitInline</InlineFunctionExpansion>
- <AdditionalIncludeDirectories>..\..\..\console\dec\inc;..\..\..\api\svc;..\..\..\common;..\..\..\encoder\core\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+ <AdditionalIncludeDirectories>..\..\..\common;..\..\..\console\dec\inc;..\..\..\api\svc;..\..\..\encoder\core\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> <!-- ..\..\..\common is listed once, at the front of the include path. -->
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<StringPooling>true</StringPooling>
<RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
@@ -144,7 +144,7 @@
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<InlineFunctionExpansion>OnlyExplicitInline</InlineFunctionExpansion>
- <AdditionalIncludeDirectories>..\..\..\console\dec\inc;..\..\..\api\svc;..\..\..\common;..\..\..\encoder\core\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+ <AdditionalIncludeDirectories>..\..\..\common;..\..\..\console\dec\inc;..\..\..\api\svc;..\..\..\encoder\core\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories> <!-- ..\..\..\common is listed once, at the front of the include path. -->
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<StringPooling>true</StringPooling>
<RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
@@ -227,7 +227,7 @@
</Midl>
<ClCompile>
<Optimization>Disabled</Optimization>
- <AdditionalIncludeDirectories>..\..\..\console\dec\inc;..\..\..\api\svc;..\..\..\common;..\..\..\encoder\core\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+ <AdditionalIncludeDirectories>..\..\..\common;..\..\..\console\dec\inc;..\..\..\api\svc;..\..\..\common;..\..\..\encoder\core\inc;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<PreprocessorDefinitions>WIN64;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
<RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
--- a/codec/build/win32/enc/WelsEncCore.vcproj
+++ b/codec/build/win32/enc/WelsEncCore.vcproj
@@ -53,7 +53,7 @@
Name="VCCLCompilerTool"
Optimization="0"
AdditionalIncludeDirectories="..\..\..\encoder\core\inc,..\..\..\api\svc,..\..\..\WelsThreadLib\api;"
- PreprocessorDefinitions="WIN32;_DEBUG;_LIB;WELS_SVC;ENCODER_CORE;X86_ASM;HAVE_CACHE_LINE_ALIGN;MT_ENABLED;"
+ PreprocessorDefinitions="WIN32;_DEBUG;_LIB;WELS_SVC;ENCODER_CORE;X86_ASM;HAVE_CACHE_LINE_ALIGN;MT_ENABLED"
MinimalRebuild="true"
BasicRuntimeChecks="3"
RuntimeLibrary="3"
@@ -101,9 +101,9 @@
/>
</Configuration>
<Configuration
- Name="Release|Win32"
- OutputDirectory=".\..\..\..\..\bin\win32\Release"
- IntermediateDirectory=".\..\..\..\obj\encoder\core\Release"
+ Name="Debug|x64"
+ OutputDirectory=".\..\..\..\..\bin\win64\Debug"
+ IntermediateDirectory=".\..\..\..\obj\encoder\core\Debug"
ConfigurationType="4"
InheritedPropertySheets="$(VCInstallDir)VCProjectDefaults\UpgradeFromVC60.vsprops"
UseOfMFC="0"
@@ -127,22 +127,20 @@
/>
<Tool
Name="VCMIDLTool"
+ TargetEnvironment="3"
/>
<Tool
Name="VCCLCompilerTool"
- Optimization="3"
- InlineFunctionExpansion="2"
- FavorSizeOrSpeed="1"
- WholeProgramOptimization="true"
- AdditionalIncludeDirectories="..\..\..\encoder\core\inc,..\..\..\api\svc,..\..\..\WelsThreadLib\api"
- PreprocessorDefinitions="WIN32;NDEBUG;_LIB;WELS_SVC;ENCODER_CORE;X86_ASM;HAVE_CACHE_LINE_ALIGN;MT_ENABLED;"
- StringPooling="true"
- RuntimeLibrary="2"
- EnableFunctionLevelLinking="true"
- PrecompiledHeaderFile=".\..\..\..\obj\encoder\core\Release/WelsEncCore.pch"
- AssemblerListingLocation=".\..\..\..\obj\encoder\core\Release/"
- ObjectFile=".\..\..\..\obj\encoder\core\Release/"
- ProgramDataBaseFileName=".\..\..\..\obj\encoder\core\Release/"
+ Optimization="0"
+ AdditionalIncludeDirectories="..\..\..\encoder\core\inc,..\..\..\api\svc,..\..\..\WelsThreadLib\api;"
+ PreprocessorDefinitions="WIN64;_DEBUG;_LIB;WELS_SVC;ENCODER_CORE;HAVE_CACHE_LINE_ALIGN;X86_ASM;MT_ENABLED"
+ MinimalRebuild="true"
+ BasicRuntimeChecks="3"
+ RuntimeLibrary="3"
+ PrecompiledHeaderFile=".\..\..\..\obj\encoder\core\Debug/WelsEncCore.pch"
+ AssemblerListingLocation=".\..\..\..\obj\encoder\core\Debug/"
+ ObjectFile=".\..\..\..\obj\encoder\core\Debug/"
+ ProgramDataBaseFileName=".\..\..\..\obj\encoder\core\Debug/"
WarningLevel="3"
SuppressStartupBanner="true"
DebugInformationFormat="3"
@@ -152,7 +150,7 @@
/>
<Tool
Name="VCResourceCompilerTool"
- PreprocessorDefinitions="NDEBUG"
+ PreprocessorDefinitions="_DEBUG"
Culture="1033"
/>
<Tool
@@ -160,7 +158,6 @@
/>
<Tool
Name="VCLibrarianTool"
- AdditionalOptions="/LTCG"
OutputFile="$(OutDir)\welsecore.lib"
SuppressStartupBanner="true"
/>
@@ -184,9 +181,9 @@
/>
</Configuration>
<Configuration
- Name="Debug|x64"
- OutputDirectory=".\..\..\..\..\bin\win64\Debug"
- IntermediateDirectory=".\..\..\..\obj\encoder\core\Debug"
+ Name="Release|Win32"
+ OutputDirectory=".\..\..\..\..\bin\win32\Release"
+ IntermediateDirectory=".\..\..\..\obj\encoder\core\Release"
ConfigurationType="4"
InheritedPropertySheets="$(VCInstallDir)VCProjectDefaults\UpgradeFromVC60.vsprops"
UseOfMFC="0"
@@ -210,20 +207,22 @@
/>
<Tool
Name="VCMIDLTool"
- TargetEnvironment="3"
/>
<Tool
Name="VCCLCompilerTool"
- Optimization="0"
- AdditionalIncludeDirectories="..\..\..\encoder\core\inc,..\..\..\api\svc,..\..\..\WelsThreadLib\api;"
- PreprocessorDefinitions="WIN64;_DEBUG;_LIB;WELS_SVC;ENCODER_CORE;HAVE_CACHE_LINE_ALIGN;MT_ENABLED"
- MinimalRebuild="true"
- BasicRuntimeChecks="3"
- RuntimeLibrary="3"
- PrecompiledHeaderFile=".\..\..\..\obj\encoder\core\Debug/WelsEncCore.pch"
- AssemblerListingLocation=".\..\..\..\obj\encoder\core\Debug/"
- ObjectFile=".\..\..\..\obj\encoder\core\Debug/"
- ProgramDataBaseFileName=".\..\..\..\obj\encoder\core\Debug/"
+ Optimization="3"
+ InlineFunctionExpansion="2"
+ FavorSizeOrSpeed="1"
+ WholeProgramOptimization="true"
+ AdditionalIncludeDirectories="..\..\..\encoder\core\inc,..\..\..\api\svc,..\..\..\WelsThreadLib\api"
+ PreprocessorDefinitions="WIN32;NDEBUG;_LIB;WELS_SVC;ENCODER_CORE;X86_ASM;HAVE_CACHE_LINE_ALIGN;MT_ENABLED;"
+ StringPooling="true"
+ RuntimeLibrary="2"
+ EnableFunctionLevelLinking="true"
+ PrecompiledHeaderFile=".\..\..\..\obj\encoder\core\Release/WelsEncCore.pch"
+ AssemblerListingLocation=".\..\..\..\obj\encoder\core\Release/"
+ ObjectFile=".\..\..\..\obj\encoder\core\Release/"
+ ProgramDataBaseFileName=".\..\..\..\obj\encoder\core\Release/"
WarningLevel="3"
SuppressStartupBanner="true"
DebugInformationFormat="3"
@@ -233,7 +232,7 @@
/>
<Tool
Name="VCResourceCompilerTool"
- PreprocessorDefinitions="_DEBUG"
+ PreprocessorDefinitions="NDEBUG"
Culture="1033"
/>
<Tool
@@ -241,6 +240,7 @@
/>
<Tool
Name="VCLibrarianTool"
+ AdditionalOptions="/LTCG"
OutputFile="$(OutDir)\welsecore.lib"
SuppressStartupBanner="true"
/>
@@ -299,7 +299,7 @@
FavorSizeOrSpeed="1"
WholeProgramOptimization="true"
AdditionalIncludeDirectories="..\..\..\encoder\core\inc,..\..\..\api\svc,..\..\..\WelsThreadLib\api"
- PreprocessorDefinitions="WIN64;NDEBUG;_LIB;WELS_SVC;ENCODER_CORE;HAVE_CACHE_LINE_ALIGN;MT_ENABLED"
+ PreprocessorDefinitions="WIN64;NDEBUG;_LIB;WELS_SVC;ENCODER_CORE;HAVE_CACHE_LINE_ALIGN;MT_ENABLED;X86_ASM"
StringPooling="true"
RuntimeLibrary="2"
EnableFunctionLevelLinking="true"
@@ -368,7 +368,7 @@
/>
</FileConfiguration>
<FileConfiguration
- Name="Release|Win32"
+ Name="Debug|x64"
>
<Tool
Name="VCCLCompilerTool"
@@ -377,7 +377,7 @@
/>
</FileConfiguration>
<FileConfiguration
- Name="Debug|x64"
+ Name="Release|Win32"
>
<Tool
Name="VCCLCompilerTool"
@@ -408,7 +408,7 @@
/>
</FileConfiguration>
<FileConfiguration
- Name="Release|Win32"
+ Name="Debug|x64"
>
<Tool
Name="VCCLCompilerTool"
@@ -417,7 +417,7 @@
/>
</FileConfiguration>
<FileConfiguration
- Name="Debug|x64"
+ Name="Release|Win32"
>
<Tool
Name="VCCLCompilerTool"
@@ -448,7 +448,7 @@
/>
</FileConfiguration>
<FileConfiguration
- Name="Release|Win32"
+ Name="Debug|x64"
>
<Tool
Name="VCCLCompilerTool"
@@ -457,7 +457,7 @@
/>
</FileConfiguration>
<FileConfiguration
- Name="Debug|x64"
+ Name="Release|Win32"
>
<Tool
Name="VCCLCompilerTool"
@@ -488,7 +488,7 @@
/>
</FileConfiguration>
<FileConfiguration
- Name="Release|Win32"
+ Name="Debug|x64"
>
<Tool
Name="VCCLCompilerTool"
@@ -497,7 +497,7 @@
/>
</FileConfiguration>
<FileConfiguration
- Name="Debug|x64"
+ Name="Release|Win32"
>
<Tool
Name="VCCLCompilerTool"
@@ -528,7 +528,7 @@
/>
</FileConfiguration>
<FileConfiguration
- Name="Release|Win32"
+ Name="Debug|x64"
>
<Tool
Name="VCCLCompilerTool"
@@ -537,7 +537,7 @@
/>
</FileConfiguration>
<FileConfiguration
- Name="Debug|x64"
+ Name="Release|Win32"
>
<Tool
Name="VCCLCompilerTool"
@@ -568,21 +568,21 @@
/>
</FileConfiguration>
<FileConfiguration
- Name="Release|Win32"
+ Name="Debug|x64"
>
<Tool
Name="VCCLCompilerTool"
AdditionalIncludeDirectories=""
- PreprocessorDefinitions=""
+ PreprocessorDefinitions="OUPUT_REF_PIC"
/>
</FileConfiguration>
<FileConfiguration
- Name="Debug|x64"
+ Name="Release|Win32"
>
<Tool
Name="VCCLCompilerTool"
AdditionalIncludeDirectories=""
- PreprocessorDefinitions="OUPUT_REF_PIC"
+ PreprocessorDefinitions=""
/>
</FileConfiguration>
<FileConfiguration
@@ -608,7 +608,7 @@
/>
</FileConfiguration>
<FileConfiguration
- Name="Release|Win32"
+ Name="Debug|x64"
>
<Tool
Name="VCCLCompilerTool"
@@ -617,7 +617,7 @@
/>
</FileConfiguration>
<FileConfiguration
- Name="Debug|x64"
+ Name="Release|Win32"
>
<Tool
Name="VCCLCompilerTool"
@@ -648,7 +648,7 @@
/>
</FileConfiguration>
<FileConfiguration
- Name="Release|Win32"
+ Name="Debug|x64"
>
<Tool
Name="VCCLCompilerTool"
@@ -657,7 +657,7 @@
/>
</FileConfiguration>
<FileConfiguration
- Name="Debug|x64"
+ Name="Release|Win32"
>
<Tool
Name="VCCLCompilerTool"
@@ -688,7 +688,7 @@
/>
</FileConfiguration>
<FileConfiguration
- Name="Release|Win32"
+ Name="Debug|x64"
>
<Tool
Name="VCCLCompilerTool"
@@ -697,7 +697,7 @@
/>
</FileConfiguration>
<FileConfiguration
- Name="Debug|x64"
+ Name="Release|Win32"
>
<Tool
Name="VCCLCompilerTool"
@@ -728,7 +728,7 @@
/>
</FileConfiguration>
<FileConfiguration
- Name="Release|Win32"
+ Name="Debug|x64"
>
<Tool
Name="VCCLCompilerTool"
@@ -737,7 +737,7 @@
/>
</FileConfiguration>
<FileConfiguration
- Name="Debug|x64"
+ Name="Release|Win32"
>
<Tool
Name="VCCLCompilerTool"
@@ -768,7 +768,7 @@
/>
</FileConfiguration>
<FileConfiguration
- Name="Release|Win32"
+ Name="Debug|x64"
>
<Tool
Name="VCCLCompilerTool"
@@ -777,7 +777,7 @@
/>
</FileConfiguration>
<FileConfiguration
- Name="Debug|x64"
+ Name="Release|Win32"
>
<Tool
Name="VCCLCompilerTool"
@@ -808,7 +808,7 @@
/>
</FileConfiguration>
<FileConfiguration
- Name="Release|Win32"
+ Name="Debug|x64"
>
<Tool
Name="VCCLCompilerTool"
@@ -817,7 +817,7 @@
/>
</FileConfiguration>
<FileConfiguration
- Name="Debug|x64"
+ Name="Release|Win32"
>
<Tool
Name="VCCLCompilerTool"
@@ -852,7 +852,7 @@
/>
</FileConfiguration>
<FileConfiguration
- Name="Release|Win32"
+ Name="Debug|x64"
>
<Tool
Name="VCCLCompilerTool"
@@ -861,7 +861,7 @@
/>
</FileConfiguration>
<FileConfiguration
- Name="Debug|x64"
+ Name="Release|Win32"
>
<Tool
Name="VCCLCompilerTool"
@@ -892,7 +892,7 @@
/>
</FileConfiguration>
<FileConfiguration
- Name="Release|Win32"
+ Name="Debug|x64"
>
<Tool
Name="VCCLCompilerTool"
@@ -901,7 +901,7 @@
/>
</FileConfiguration>
<FileConfiguration
- Name="Debug|x64"
+ Name="Release|Win32"
>
<Tool
Name="VCCLCompilerTool"
@@ -932,7 +932,7 @@
/>
</FileConfiguration>
<FileConfiguration
- Name="Release|Win32"
+ Name="Debug|x64"
>
<Tool
Name="VCCLCompilerTool"
@@ -941,7 +941,7 @@
/>
</FileConfiguration>
<FileConfiguration
- Name="Debug|x64"
+ Name="Release|Win32"
>
<Tool
Name="VCCLCompilerTool"
@@ -972,7 +972,7 @@
/>
</FileConfiguration>
<FileConfiguration
- Name="Release|Win32"
+ Name="Debug|x64"
>
<Tool
Name="VCCLCompilerTool"
@@ -981,7 +981,7 @@
/>
</FileConfiguration>
<FileConfiguration
- Name="Debug|x64"
+ Name="Release|Win32"
>
<Tool
Name="VCCLCompilerTool"
@@ -1012,7 +1012,7 @@
/>
</FileConfiguration>
<FileConfiguration
- Name="Release|Win32"
+ Name="Debug|x64"
>
<Tool
Name="VCCLCompilerTool"
@@ -1021,7 +1021,7 @@
/>
</FileConfiguration>
<FileConfiguration
- Name="Debug|x64"
+ Name="Release|Win32"
>
<Tool
Name="VCCLCompilerTool"
@@ -1052,7 +1052,7 @@
/>
</FileConfiguration>
<FileConfiguration
- Name="Release|Win32"
+ Name="Debug|x64"
>
<Tool
Name="VCCLCompilerTool"
@@ -1061,7 +1061,7 @@
/>
</FileConfiguration>
<FileConfiguration
- Name="Debug|x64"
+ Name="Release|Win32"
>
<Tool
Name="VCCLCompilerTool"
@@ -1096,7 +1096,7 @@
/>
</FileConfiguration>
<FileConfiguration
- Name="Release|Win32"
+ Name="Debug|x64"
>
<Tool
Name="VCCLCompilerTool"
@@ -1105,7 +1105,7 @@
/>
</FileConfiguration>
<FileConfiguration
- Name="Debug|x64"
+ Name="Release|Win32"
>
<Tool
Name="VCCLCompilerTool"
@@ -1140,7 +1140,7 @@
/>
</FileConfiguration>
<FileConfiguration
- Name="Release|Win32"
+ Name="Debug|x64"
>
<Tool
Name="VCCLCompilerTool"
@@ -1149,7 +1149,7 @@
/>
</FileConfiguration>
<FileConfiguration
- Name="Debug|x64"
+ Name="Release|Win32"
>
<Tool
Name="VCCLCompilerTool"
@@ -1180,7 +1180,7 @@
/>
</FileConfiguration>
<FileConfiguration
- Name="Release|Win32"
+ Name="Debug|x64"
>
<Tool
Name="VCCLCompilerTool"
@@ -1189,7 +1189,7 @@
/>
</FileConfiguration>
<FileConfiguration
- Name="Debug|x64"
+ Name="Release|Win32"
>
<Tool
Name="VCCLCompilerTool"
@@ -1220,7 +1220,7 @@
/>
</FileConfiguration>
<FileConfiguration
- Name="Release|Win32"
+ Name="Debug|x64"
>
<Tool
Name="VCCLCompilerTool"
@@ -1229,7 +1229,7 @@
/>
</FileConfiguration>
<FileConfiguration
- Name="Debug|x64"
+ Name="Release|Win32"
>
<Tool
Name="VCCLCompilerTool"
@@ -1260,7 +1260,7 @@
/>
</FileConfiguration>
<FileConfiguration
- Name="Release|Win32"
+ Name="Debug|x64"
>
<Tool
Name="VCCLCompilerTool"
@@ -1269,7 +1269,7 @@
/>
</FileConfiguration>
<FileConfiguration
- Name="Debug|x64"
+ Name="Release|Win32"
>
<Tool
Name="VCCLCompilerTool"
@@ -1300,7 +1300,7 @@
/>
</FileConfiguration>
<FileConfiguration
- Name="Release|Win32"
+ Name="Debug|x64"
>
<Tool
Name="VCCLCompilerTool"
@@ -1309,7 +1309,7 @@
/>
</FileConfiguration>
<FileConfiguration
- Name="Debug|x64"
+ Name="Release|Win32"
>
<Tool
Name="VCCLCompilerTool"
@@ -1340,7 +1340,7 @@
/>
</FileConfiguration>
<FileConfiguration
- Name="Release|Win32"
+ Name="Debug|x64"
>
<Tool
Name="VCCLCompilerTool"
@@ -1349,7 +1349,7 @@
/>
</FileConfiguration>
<FileConfiguration
- Name="Debug|x64"
+ Name="Release|Win32"
>
<Tool
Name="VCCLCompilerTool"
@@ -1380,7 +1380,7 @@
/>
</FileConfiguration>
<FileConfiguration
- Name="Release|Win32"
+ Name="Debug|x64"
>
<Tool
Name="VCCLCompilerTool"
@@ -1389,7 +1389,7 @@
/>
</FileConfiguration>
<FileConfiguration
- Name="Debug|x64"
+ Name="Release|Win32"
>
<Tool
Name="VCCLCompilerTool"
@@ -1420,7 +1420,7 @@
/>
</FileConfiguration>
<FileConfiguration
- Name="Release|Win32"
+ Name="Debug|x64"
>
<Tool
Name="VCCLCompilerTool"
@@ -1429,7 +1429,7 @@
/>
</FileConfiguration>
<FileConfiguration
- Name="Debug|x64"
+ Name="Release|Win32"
>
<Tool
Name="VCCLCompilerTool"
@@ -1686,7 +1686,7 @@
Filter="*.asm;*.inc"
>
<File
- RelativePath="..\..\..\encoder\core\asm\asm_inc.asm"
+ RelativePath="..\..\..\encoder\core\asm\coeff.asm"
>
<FileConfiguration
Name="Debug|Win32"
@@ -1693,80 +1693,40 @@
>
<Tool
Name="VCCustomBuildTool"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Release|Win32"
- >
- <Tool
- Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
Name="Debug|x64"
- ExcludedFromBuild="true"
>
<Tool
Name="VCCustomBuildTool"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Release|x64"
- ExcludedFromBuild="true"
- >
- <Tool
- Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
- </File>
- <File
- RelativePath="..\..\..\encoder\core\asm\coeff.asm"
- >
<FileConfiguration
- Name="Debug|Win32"
- >
- <Tool
- Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
- Outputs="$(IntDir)\$(InputName).obj"
- />
- </FileConfiguration>
- <FileConfiguration
Name="Release|Win32"
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
- Name="Debug|x64"
- ExcludedFromBuild="true"
- >
- <Tool
- Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
- Outputs="$(IntDir)\$(InputName).obj"
- />
- </FileConfiguration>
- <FileConfiguration
Name="Release|x64"
- ExcludedFromBuild="true"
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
</File>
<File
- RelativePath="..\..\..\encoder\core\asm\cpuid.asm"
+ RelativePath="..\..\..\common\cpuid.asm"
>
<FileConfiguration
Name="Debug|Win32"
@@ -1773,36 +1733,34 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
- Name="Release|Win32"
+ Name="Debug|x64"
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
- Name="Debug|x64"
- ExcludedFromBuild="true"
+ Name="Release|Win32"
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
Name="Release|x64"
- ExcludedFromBuild="true"
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -1815,42 +1773,40 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
- Name="Release|Win32"
+ Name="Debug|x64"
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
- Name="Debug|x64"
- ExcludedFromBuild="true"
+ Name="Release|Win32"
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
Name="Release|x64"
- ExcludedFromBuild="true"
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
</File>
<File
- RelativePath="..\..\..\encoder\core\asm\deblock.asm"
+ RelativePath="..\..\..\common\deblock.asm"
>
<FileConfiguration
Name="Debug|Win32"
@@ -1857,42 +1813,40 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
- Name="Release|Win32"
+ Name="Debug|x64"
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
- Name="Debug|x64"
- ExcludedFromBuild="true"
+ Name="Release|Win32"
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
Name="Release|x64"
- ExcludedFromBuild="true"
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
</File>
<File
- RelativePath="..\..\..\encoder\core\asm\expand_picture.asm"
+ RelativePath="..\..\..\common\expand_picture.asm"
>
<FileConfiguration
Name="Debug|Win32"
@@ -1899,36 +1853,34 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
- Name="Release|Win32"
+ Name="Debug|x64"
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
- Name="Debug|x64"
- ExcludedFromBuild="true"
+ Name="Release|Win32"
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
Name="Release|x64"
- ExcludedFromBuild="true"
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -1941,42 +1893,40 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
- Name="Release|Win32"
+ Name="Debug|x64"
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
- Name="Debug|x64"
- ExcludedFromBuild="true"
+ Name="Release|Win32"
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
Name="Release|x64"
- ExcludedFromBuild="true"
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
</File>
<File
- RelativePath="..\..\..\encoder\core\asm\intra_pred_util.asm"
+ RelativePath="..\..\..\common\mb_copy.asm"
>
<FileConfiguration
Name="Debug|Win32"
@@ -1983,42 +1933,40 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
- Name="Release|Win32"
+ Name="Debug|x64"
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
- Name="Debug|x64"
- ExcludedFromBuild="true"
+ Name="Release|Win32"
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
Name="Release|x64"
- ExcludedFromBuild="true"
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
</File>
<File
- RelativePath="..\..\..\encoder\core\asm\mb_copy.asm"
+ RelativePath="..\..\..\common\mc_chroma.asm"
>
<FileConfiguration
Name="Debug|Win32"
@@ -2025,42 +1973,40 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
- Name="Release|Win32"
+ Name="Debug|x64"
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
- Name="Debug|x64"
- ExcludedFromBuild="true"
+ Name="Release|Win32"
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
Name="Release|x64"
- ExcludedFromBuild="true"
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
</File>
<File
- RelativePath="..\..\..\encoder\core\asm\mc_chroma.asm"
+ RelativePath="..\..\..\common\mc_luma.asm"
>
<FileConfiguration
Name="Debug|Win32"
@@ -2067,78 +2013,34 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
- Name="Release|Win32"
- >
- <Tool
- Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
- Outputs="$(IntDir)\$(InputName).obj"
- />
- </FileConfiguration>
- <FileConfiguration
Name="Debug|x64"
- ExcludedFromBuild="true"
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
- Name="Release|x64"
- ExcludedFromBuild="true"
- >
- <Tool
- Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
- Outputs="$(IntDir)\$(InputName).obj"
- />
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\..\..\encoder\core\asm\mc_luma.asm"
- >
- <FileConfiguration
- Name="Debug|Win32"
- >
- <Tool
- Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
- Outputs="$(IntDir)\$(InputName).obj"
- />
- </FileConfiguration>
- <FileConfiguration
Name="Release|Win32"
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
- Name="Debug|x64"
- ExcludedFromBuild="true"
- >
- <Tool
- Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
- Outputs="$(IntDir)\$(InputName).obj"
- />
- </FileConfiguration>
- <FileConfiguration
Name="Release|x64"
- ExcludedFromBuild="true"
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -2151,36 +2053,34 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
- Name="Release|Win32"
+ Name="Debug|x64"
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
- Name="Debug|x64"
- ExcludedFromBuild="true"
+ Name="Release|Win32"
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
Name="Release|x64"
- ExcludedFromBuild="true"
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -2193,36 +2093,34 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
- Name="Release|Win32"
+ Name="Debug|x64"
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
- Name="Debug|x64"
- ExcludedFromBuild="true"
+ Name="Release|Win32"
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
Name="Release|x64"
- ExcludedFromBuild="true"
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -2235,36 +2133,34 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
- Name="Release|Win32"
+ Name="Debug|x64"
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
- Name="Debug|x64"
- ExcludedFromBuild="true"
+ Name="Release|Win32"
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
Name="Release|x64"
- ExcludedFromBuild="true"
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -2277,42 +2173,40 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
- Name="Release|Win32"
+ Name="Debug|x64"
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
- Name="Debug|x64"
- ExcludedFromBuild="true"
+ Name="Release|Win32"
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
Name="Release|x64"
- ExcludedFromBuild="true"
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
</File>
<File
- RelativePath="..\..\..\encoder\core\asm\vaa.asm"
+ RelativePath="..\..\..\common\vaa.asm"
>
<FileConfiguration
Name="Debug|Win32"
@@ -2319,36 +2213,34 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
- Name="Release|Win32"
+ Name="Debug|x64"
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
- Name="Debug|x64"
- ExcludedFromBuild="true"
+ Name="Release|Win32"
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
<FileConfiguration
Name="Release|x64"
- ExcludedFromBuild="true"
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -O3 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
--- a/codec/build/win32/enc/WelsEncCore_2010.vcxproj
+++ b/codec/build/win32/enc/WelsEncCore_2010.vcxproj
@@ -127,7 +127,7 @@
<ClCompile>
<Optimization>Disabled</Optimization>
<AdditionalIncludeDirectories>..\..\..\encoder\core\inc;..\..\..\api\svc;..\..\..\WelsThreadLib\api;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
- <PreprocessorDefinitions>WIN64;_DEBUG;_LIB;WELS_SVC;ENCODER_CORE;HAVE_CACHE_LINE_ALIGN;MT_ENABLED;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <PreprocessorDefinitions>WIN64;_DEBUG;X86_ASM;_LIB;WELS_SVC;ENCODER_CORE;HAVE_CACHE_LINE_ALIGN;MT_ENABLED;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
<RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
<PrecompiledHeaderOutputFile>.\..\..\..\obj\encoder\core\Debug/WelsEncCore.pch</PrecompiledHeaderOutputFile>
@@ -197,7 +197,7 @@
<FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
<WholeProgramOptimization>true</WholeProgramOptimization>
<AdditionalIncludeDirectories>..\..\..\encoder\core\inc;..\..\..\api\svc;..\..\..\WelsThreadLib\api;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
- <PreprocessorDefinitions>WIN64;NDEBUG;_LIB;WELS_SVC;ENCODER_CORE;HAVE_CACHE_LINE_ALIGN;MT_ENABLED;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <PreprocessorDefinitions>WIN64;NDEBUG;X86_ASM;_LIB;WELS_SVC;ENCODER_CORE;HAVE_CACHE_LINE_ALIGN;MT_ENABLED;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<StringPooling>true</StringPooling>
<RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
<FunctionLevelLinking>true</FunctionLevelLinking>
@@ -565,255 +565,154 @@
<ClInclude Include="..\..\..\encoder\core\inc\wels_preprocess.h" />
</ItemGroup>
<ItemGroup>
- <CustomBuild Include="..\..\..\encoder\core\asm\asm_inc.asm">
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
- </CustomBuild>
<CustomBuild Include="..\..\..\encoder\core\asm\coeff.asm">
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</CustomBuild>
- <CustomBuild Include="..\..\..\encoder\core\asm\cpuid.asm">
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
- </CustomBuild>
<CustomBuild Include="..\..\..\encoder\core\asm\dct.asm">
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</CustomBuild>
- <CustomBuild Include="..\..\..\encoder\core\asm\deblock.asm">
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
+ <CustomBuild Include="..\..\..\encoder\core\asm\intra_pred.asm">
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</CustomBuild>
- <CustomBuild Include="..\..\..\encoder\core\asm\expand_picture.asm">
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
+ <CustomBuild Include="..\..\..\encoder\core\asm\memzero.asm">
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</CustomBuild>
- <CustomBuild Include="..\..\..\encoder\core\asm\intra_pred.asm">
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
+ <CustomBuild Include="..\..\..\encoder\core\asm\quant.asm">
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</CustomBuild>
- <CustomBuild Include="..\..\..\encoder\core\asm\intra_pred_util.asm">
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
+ <CustomBuild Include="..\..\..\encoder\core\asm\satd_sad.asm">
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</CustomBuild>
- <CustomBuild Include="..\..\..\encoder\core\asm\mb_copy.asm">
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
+ <CustomBuild Include="..\..\..\encoder\core\asm\score.asm">
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</CustomBuild>
- <CustomBuild Include="..\..\..\encoder\core\asm\mc_chroma.asm">
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
+ </ItemGroup>
+ <ItemGroup>
+ <CustomBuild Include="..\..\..\common\cpuid.asm">
+ <FileType>Document</FileType>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
</CustomBuild>
- <CustomBuild Include="..\..\..\encoder\core\asm\mc_luma.asm">
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
+ <CustomBuild Include="..\..\..\common\deblock.asm">
+ <FileType>Document</FileType>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
</CustomBuild>
- <CustomBuild Include="..\..\..\encoder\core\asm\memzero.asm">
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
+ <CustomBuild Include="..\..\..\common\expand_picture.asm">
+ <FileType>Document</FileType>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
</CustomBuild>
- <CustomBuild Include="..\..\..\encoder\core\asm\quant.asm">
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
+ <CustomBuild Include="..\..\..\common\mb_copy.asm">
+ <FileType>Document</FileType>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
</CustomBuild>
- <CustomBuild Include="..\..\..\encoder\core\asm\satd_sad.asm">
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
+ <CustomBuild Include="..\..\..\common\mc_chroma.asm">
+ <FileType>Document</FileType>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
</CustomBuild>
- <CustomBuild Include="..\..\..\encoder\core\asm\score.asm">
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
+ <CustomBuild Include="..\..\..\common\mc_luma.asm">
+ <FileType>Document</FileType>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
</CustomBuild>
- <CustomBuild Include="..\..\..\encoder\core\asm\vaa.asm">
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
+ <CustomBuild Include="..\..\..\common\vaa.asm">
+ <FileType>Document</FileType>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -O3 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
</CustomBuild>
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
--- a/codec/build/win32/enc/WelsEncCore_2010.vcxproj.filters
+++ b/codec/build/win32/enc/WelsEncCore_2010.vcxproj.filters
@@ -278,52 +278,46 @@
</ClInclude>
</ItemGroup>
<ItemGroup>
- <CustomBuild Include="..\..\..\encoder\core\asm\asm_inc.asm">
- <Filter>ASM</Filter>
- </CustomBuild>
<CustomBuild Include="..\..\..\encoder\core\asm\coeff.asm">
<Filter>ASM</Filter>
</CustomBuild>
- <CustomBuild Include="..\..\..\encoder\core\asm\cpuid.asm">
- <Filter>ASM</Filter>
- </CustomBuild>
<CustomBuild Include="..\..\..\encoder\core\asm\dct.asm">
<Filter>ASM</Filter>
</CustomBuild>
- <CustomBuild Include="..\..\..\encoder\core\asm\deblock.asm">
+ <CustomBuild Include="..\..\..\encoder\core\asm\intra_pred.asm">
<Filter>ASM</Filter>
</CustomBuild>
- <CustomBuild Include="..\..\..\encoder\core\asm\expand_picture.asm">
+ <CustomBuild Include="..\..\..\encoder\core\asm\memzero.asm">
<Filter>ASM</Filter>
</CustomBuild>
- <CustomBuild Include="..\..\..\encoder\core\asm\intra_pred.asm">
+ <CustomBuild Include="..\..\..\encoder\core\asm\quant.asm">
<Filter>ASM</Filter>
</CustomBuild>
- <CustomBuild Include="..\..\..\encoder\core\asm\intra_pred_util.asm">
+ <CustomBuild Include="..\..\..\encoder\core\asm\satd_sad.asm">
<Filter>ASM</Filter>
</CustomBuild>
- <CustomBuild Include="..\..\..\encoder\core\asm\mb_copy.asm">
+ <CustomBuild Include="..\..\..\encoder\core\asm\score.asm">
<Filter>ASM</Filter>
</CustomBuild>
- <CustomBuild Include="..\..\..\encoder\core\asm\mc_chroma.asm">
+ <CustomBuild Include="..\..\..\common\mc_luma.asm">
<Filter>ASM</Filter>
</CustomBuild>
- <CustomBuild Include="..\..\..\encoder\core\asm\mc_luma.asm">
+ <CustomBuild Include="..\..\..\common\mc_chroma.asm">
<Filter>ASM</Filter>
</CustomBuild>
- <CustomBuild Include="..\..\..\encoder\core\asm\memzero.asm">
+ <CustomBuild Include="..\..\..\common\mb_copy.asm">
<Filter>ASM</Filter>
</CustomBuild>
- <CustomBuild Include="..\..\..\encoder\core\asm\quant.asm">
+ <CustomBuild Include="..\..\..\common\expand_picture.asm">
<Filter>ASM</Filter>
</CustomBuild>
- <CustomBuild Include="..\..\..\encoder\core\asm\satd_sad.asm">
+ <CustomBuild Include="..\..\..\common\deblock.asm">
<Filter>ASM</Filter>
</CustomBuild>
- <CustomBuild Include="..\..\..\encoder\core\asm\score.asm">
+ <CustomBuild Include="..\..\..\common\cpuid.asm">
<Filter>ASM</Filter>
</CustomBuild>
- <CustomBuild Include="..\..\..\encoder\core\asm\vaa.asm">
+ <CustomBuild Include="..\..\..\common\vaa.asm">
<Filter>ASM</Filter>
</CustomBuild>
</ItemGroup>
--- a/codec/build/win32/enc/WelsEncoder_2008.sln
+++ b/codec/build/win32/enc/WelsEncoder_2008.sln
@@ -17,7 +17,7 @@
{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562} = {E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}
EndProjectSection
EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "WelsVP", "..\..\..\..\processing\build\win32\WelsVP_2008.vcproj", "{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}"
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "WelsVP", "..\..\..\processing\build\win32\WelsVP_2008.vcproj", "{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
--- a/codec/build/win32/enc/WelsEncoder_2010.sln
+++ b/codec/build/win32/enc/WelsEncoder_2010.sln
@@ -10,7 +10,7 @@
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "encConsole_2010", "encConsole_2010.vcxproj", "{8509E2A8-2CBD-49E2-B564-3EFF1E927459}"
EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "WelsVP_2010", "..\..\..\..\processing\build\win32\WelsVP_2010.vcxproj", "{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}"
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "WelsVP_2010", "..\..\..\processing\build\win32\WelsVP_2010.vcxproj", "{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
--- /dev/null
+++ b/codec/common/asm_inc.asm
@@ -1,0 +1,509 @@
+;*!
+;* \copy
+;* Copyright (c) 2009-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;* asm_inc.asm
+;*
+;* Abstract
+;* macro and constant
+;*
+;* History
+;* 8/5/2009 Created
+;*
+;*
+;*************************************************************************/
+;***********************************************************************
+; Options, for DEBUG
+;***********************************************************************
+
+%if 1
+ %define MOVDQ movdqa ; active choice: MOVDQ expands to the aligned 128-bit move
+%else
+ %define MOVDQ movdqu ; debug alternative: unaligned 128-bit move (change "%if 1" to "%if 0")
+%endif
+
+%if 1
+ %define WELSEMMS emms ; active choice: WELSEMMS expands to a real emms instruction
+%else
+ %define WELSEMMS
+%endif
+
+
+;***********************************************************************
+; Macros
+;***********************************************************************
+
+DEFAULT REL
+
+%ifdef WIN64 ; Windows x64 ;************************************
+
+BITS 64
+
+%define arg1 rcx ; Windows x64: the first four arguments arrive in rcx, rdx, r8, r9
+%define arg2 rdx
+%define arg3 r8
+%define arg4 r9
+%define arg5 [rsp + push_num*8 + 40] ; stack arguments; 40 = return address (8) + 32-byte shadow space
+%define arg6 [rsp + push_num*8 + 48] ; push_num is not defined in this file - presumably the including file keeps it equal to the number of pushes since entry (confirm)
+%define arg7 [rsp + push_num*8 + 56]
+%define arg8 [rsp + push_num*8 + 64]
+%define arg9 [rsp + push_num*8 + 72]
+%define arg10 [rsp + push_num*8 + 80]
+
+%define r0 rcx ; r0-r3 alias the argument registers, so arg1-arg4 need no load
+%define r1 rdx
+%define r2 r8
+%define r3 r9
+%define r4 rax ; r4-r6 are filled from the stack by LOAD_5_PARA..LOAD_7_PARA
+%define r5 r10
+%define r6 r11
+%define r7 rsp
+
+%define r0d ecx ; 32-bit views of r0-r6
+%define r1d edx
+%define r2d r8d
+%define r3d r9d
+%define r4d eax
+%define r5d r10d
+%define r6d r11d
+
+%define r0w cx ; 16-bit views of r0-r3
+%define r1w dx
+%define r2w r8w
+%define r3w r9w
+
+%define r0b cl ; 8-bit views of r0-r3
+%define r1b dl
+%define r2b r8b ; was r8l: NASM spells the low byte of r8 as r8b
+%define r3b r9b ; was r9l: NASM spells the low byte of r9 as r9b
+
+%define PUSHRFLAGS pushfq
+%define POPRFLAGS popfq
+%define retrq rax ; return-value register, full width
+%define retrd eax ; return-value register, 32-bit view
+
+%elifdef UNIX64 ; Unix x64 ;************************************
+
+BITS 64
+
+%define arg1 rdi ; Unix x64: the first six arguments arrive in rdi, rsi, rdx, rcx, r8, r9
+%define arg2 rsi
+%define arg3 rdx
+%define arg4 rcx
+%define arg5 r8
+%define arg6 r9
+%define arg7 [rsp + push_num*8 + 8] ; stack arguments; 8 = return address
+%define arg8 [rsp + push_num*8 + 16] ; push_num is not defined in this file - presumably the including file keeps it equal to the number of pushes since entry (confirm)
+%define arg9 [rsp + push_num*8 + 24]
+%define arg10 [rsp + push_num*8 + 32]
+
+%define r0 rdi ; r0-r5 alias the six argument registers, so arg1-arg6 need no load
+%define r1 rsi
+%define r2 rdx
+%define r3 rcx
+%define r4 r8
+%define r5 r9
+%define r6 r10 ; filled from the stack by LOAD_7_PARA
+%define r7 rsp
+
+%define r0d edi ; 32-bit views of r0-r6
+%define r1d esi
+%define r2d edx
+%define r3d ecx
+%define r4d r8d
+%define r5d r9d
+%define r6d r10d
+
+%define r0w di ; 16-bit views of r0-r3
+%define r1w si
+%define r2w dx
+%define r3w cx
+
+%define r0b dil ; 8-bit views of r0-r3
+%define r1b sil
+%define r2b dl
+%define r3b cl
+
+%define PUSHRFLAGS pushfq
+%define POPRFLAGS popfq
+%define retrq rax ; return-value register, full width
+%define retrd eax ; return-value register, 32-bit view
+
+%elifdef X86_32 ; X86_32 ;************************************
+
+BITS 32
+
+%define arg1 [esp + push_num*4 + 4] ; every argument is on the stack; 4 = return address
+%define arg2 [esp + push_num*4 + 8] ; push_num is not defined in this file - presumably the including file keeps it equal to the number of pushes since entry (confirm)
+%define arg3 [esp + push_num*4 + 12]
+%define arg4 [esp + push_num*4 + 16]
+%define arg5 [esp + push_num*4 + 20]
+%define arg6 [esp + push_num*4 + 24]
+%define arg7 [esp + push_num*4 + 28]
+%define arg8 [esp + push_num*4 + 32]
+%define arg9 [esp + push_num*4 + 36]
+%define arg10 [esp + push_num*4 + 40]
+
+%define r0 eax ; r0-r2 are loaded by LOAD_n_PARA without being saved
+%define r1 ecx
+%define r2 edx
+%define r3 ebx ; r3-r6 are pushed by LOAD_n_PARA before being overwritten
+%define r4 esi
+%define r5 edi
+%define r6 ebp
+%define r7 esp
+
+%define r0d eax ; in 32-bit mode the "d" names are the same registers as r0-r6
+%define r1d ecx
+%define r2d edx
+%define r3d ebx
+%define r4d esi
+%define r5d edi
+%define r6d ebp
+
+%define r0w ax ; 16-bit views of r0-r3
+%define r1w cx
+%define r2w dx
+%define r3w bx
+
+%define r0b al ; 8-bit views of r0-r3
+%define r1b cl
+%define r2b dl
+%define r3b bl
+
+%define PUSHRFLAGS pushfd
+%define POPRFLAGS popfd
+%define retrq eax ; 32-bit mode has no 64-bit registers, so retrq falls back to eax
+%define retrd eax
+
+%endif
+
+%macro LOAD_PARA 2 ; LOAD_PARA dst, src: plain move of %2 into %1
+ mov %1, %2
+%endmacro
+
+%macro LOAD_1_PARA 0 ; load argument 1 into r0; emits code only when X86_32 is defined
+ %ifdef X86_32
+ mov r0, [esp + push_num*4 + 4]
+ %endif
+%endmacro
+
+%macro LOAD_2_PARA 0 ; load arguments 1-2 into r0-r1; emits code only when X86_32 is defined
+ %ifdef X86_32
+ mov r0, [esp + push_num*4 + 4]
+ mov r1, [esp + push_num*4 + 8]
+ %endif
+%endmacro
+
+%macro LOAD_3_PARA 0 ; load arguments 1-3 into r0-r2; emits code only when X86_32 is defined
+ %ifdef X86_32
+ mov r0, [esp + push_num*4 + 4]
+ mov r1, [esp + push_num*4 + 8]
+ mov r2, [esp + push_num*4 + 12]
+ %endif
+%endmacro
+
+%macro LOAD_4_PARA 0 ; load arguments 1-4 into r0-r3; pair with LOAD_4_PARA_POP
+ %ifdef X86_32
+ push r3 ; save r3 (ebx) before it is overwritten
+ %assign push_num push_num+1 ; account for the push so the offsets below stay correct
+ mov r0, [esp + push_num*4 + 4]
+ mov r1, [esp + push_num*4 + 8]
+ mov r2, [esp + push_num*4 + 12]
+ mov r3, [esp + push_num*4 + 16]
+ %endif
+%endmacro
+
+%macro LOAD_5_PARA 0 ; load arguments 1-5 into r0-r4; pair with LOAD_5_PARA_POP
+ %ifdef X86_32
+ push r3 ; save r3, r4 (ebx, esi) before they are overwritten
+ push r4
+ %assign push_num push_num+2 ; two pushes were just emitted
+ mov r0, [esp + push_num*4 + 4]
+ mov r1, [esp + push_num*4 + 8]
+ mov r2, [esp + push_num*4 + 12]
+ mov r3, [esp + push_num*4 + 16]
+ mov r4, [esp + push_num*4 + 20]
+ %elifdef WIN64
+ mov r4, [rsp + push_num*8 + 40] ; WIN64: only argument 5 is on the stack
+ %endif
+%endmacro
+
+%macro LOAD_6_PARA 0 ; load arguments 1-6 into r0-r5; pair with LOAD_6_PARA_POP
+ %ifdef X86_32
+ push r3 ; save r3-r5 (ebx, esi, edi) before they are overwritten
+ push r4
+ push r5
+ %assign push_num push_num+3 ; three pushes were just emitted
+ mov r0, [esp + push_num*4 + 4]
+ mov r1, [esp + push_num*4 + 8]
+ mov r2, [esp + push_num*4 + 12]
+ mov r3, [esp + push_num*4 + 16]
+ mov r4, [esp + push_num*4 + 20]
+ mov r5, [esp + push_num*4 + 24]
+ %elifdef WIN64
+ mov r4, [rsp + push_num*8 + 40] ; WIN64: arguments 5-6 are on the stack
+ mov r5, [rsp + push_num*8 + 48]
+ %endif
+%endmacro
+
+%macro LOAD_7_PARA 0 ; load arguments 1-7 into r0-r6; pair with LOAD_7_PARA_POP
+ %ifdef X86_32
+ push r3 ; save r3-r6 (ebx, esi, edi, ebp) before they are overwritten
+ push r4
+ push r5
+ push r6
+ %assign push_num push_num+4 ; four pushes were just emitted
+ mov r0, [esp + push_num*4 + 4]
+ mov r1, [esp + push_num*4 + 8]
+ mov r2, [esp + push_num*4 + 12]
+ mov r3, [esp + push_num*4 + 16]
+ mov r4, [esp + push_num*4 + 20]
+ mov r5, [esp + push_num*4 + 24]
+ mov r6, [esp + push_num*4 + 28]
+ %elifdef WIN64
+ mov r4, [rsp + push_num*8 + 40] ; WIN64: arguments 5-7 are on the stack
+ mov r5, [rsp + push_num*8 + 48]
+ mov r6, [rsp + push_num*8 + 56]
+ %elifdef UNIX64
+ mov r6, [rsp + push_num*8 + 8] ; UNIX64: only argument 7 is on the stack
+ %endif
+%endmacro
+
+
+
+%macro LOAD_4_PARA_POP 0 ; undo LOAD_4_PARA: restore r3; push_num is not decremented here
+ %ifdef X86_32
+ pop r3
+ %endif
+%endmacro
+
+%macro LOAD_5_PARA_POP 0 ; undo LOAD_5_PARA: pops in reverse push order; push_num is not decremented here
+ %ifdef X86_32
+ pop r4
+ pop r3
+ %endif
+%endmacro
+
+%macro LOAD_6_PARA_POP 0 ; undo LOAD_6_PARA: pops in reverse push order; push_num is not decremented here
+ %ifdef X86_32
+ pop r5
+ pop r4
+ pop r3
+ %endif
+%endmacro
+
+%macro LOAD_7_PARA_POP 0 ; undo LOAD_7_PARA: pops in reverse push order; push_num is not decremented here
+ %ifdef X86_32
+ pop r6
+ pop r5
+ pop r4
+ pop r3
+ %endif
+%endmacro
+
+%macro SIGN_EXTENTION 2 ; SIGN_EXTENTION dst, src: sign-extend %2 into %1; expands to nothing when X86_32 is defined
+ %ifndef X86_32
+ movsx %1, %2 ; presumably used to widen 32-bit int arguments to 64-bit registers - confirm at call sites
+ %endif
+%endmacro
+
+%macro WELS_EXTERN 1 ; WELS_EXTERN name: export %1 as a global symbol
+ %ifdef PREFIX
+ global _%1 ; with PREFIX defined the exported symbol carries a leading underscore
+ %define %1 _%1 ; later uses of the bare name, including its label, now expand to _name
+ %else
+ global %1
+ %endif
+%endmacro
+
+%macro WELS_AbsW 2 ; %1 = abs(%1) per signed word; %2 is scratch and ends up holding -(%1)
+ pxor %2, %2 ; %2 = 0
+ psubw %2, %1 ; %2 = -%1
+ pmaxsw %1, %2 ; %1 = max(%1, -%1)
+%endmacro
+
+%macro MMX_XSwap 4 ; %1 = unpack suffix (wd/dq); %2 gets the low interleave of %2,%3 and %4 the high interleave
+ movq %4, %2 ; copy %2 first because both unpacks need its original value
+ punpckh%1 %4, %3
+ punpckl%1 %2, %3
+%endmacro
+
+; in: mm1, mm2, mm3, mm4 (%5 = mm5 is scratch)  pOut: mm1, mm4, mm5, mm3
+%macro MMX_Trans4x4W 5 ; 4x4 word transpose; the result is left in %1, %4, %5, %3
+ MMX_XSwap wd, %1, %2, %5
+ MMX_XSwap wd, %3, %4, %2
+ MMX_XSwap dq, %1, %3, %4
+ MMX_XSwap dq, %5, %2, %3
+%endmacro
+
+;for TRANSPOSE
+%macro SSE2_XSawp 4 ; %1 = unpack suffix (bw/wd/dq/qdq); %2 gets the low interleave of %2,%3 and %4 the high interleave
+ movdqa %4, %2 ; copy %2 first because both unpacks need its original value
+ punpckl%1 %2, %3
+ punpckh%1 %4, %3
+%endmacro
+
+; in: xmm1, xmm2, xmm3, xmm4 pOut: xmm1, xmm4, xmm5, xmm3
+%macro SSE2_Trans4x4D 5 ; 4x4 dword transpose; %5 is scratch on entry; the result is left in %1, %4, %5, %3
+ SSE2_XSawp dq, %1, %2, %5
+ SSE2_XSawp dq, %3, %4, %2
+ SSE2_XSawp qdq, %1, %3, %4
+ SSE2_XSawp qdq, %5, %2, %3
+%endmacro
+
+;in: xmm0, xmm1, xmm2, xmm3 pOut: xmm0, xmm1, xmm3, xmm4
+%macro SSE2_TransTwo4x4W 5 ; %5 is scratch on entry; the result is left in %1, %2, %4, %5
+ SSE2_XSawp wd, %1, %2, %5
+ SSE2_XSawp wd, %3, %4, %2
+ SSE2_XSawp dq, %1, %3, %4
+ SSE2_XSawp dq, %5, %2, %3
+ SSE2_XSawp qdq, %1, %5, %2
+ SSE2_XSawp qdq, %4, %3, %5
+%endmacro
+
+;in: m1, m2, m3, m4, m5, m6, m7, m8
+;pOut: m5, m3, m4, m8, m6, m2, m7, m1
+%macro SSE2_TransTwo8x8B 9 ; %1-%8 = data registers, %9 = spill slot used with movdqa (register or memory - confirm alignment at call sites)
+ movdqa %9, %8 ; park %8 so the register can serve as scratch
+ SSE2_XSawp bw, %1, %2, %8 ; stage 1: byte interleave
+ SSE2_XSawp bw, %3, %4, %2
+ SSE2_XSawp bw, %5, %6, %4
+ movdqa %6, %9 ; bring the parked value back
+ movdqa %9, %4 ; park %4, which the next swap overwrites
+ SSE2_XSawp bw, %7, %6, %4
+
+ SSE2_XSawp wd, %1, %3, %6 ; stage 2: word interleave
+ SSE2_XSawp wd, %8, %2, %3
+ SSE2_XSawp wd, %5, %7, %2
+ movdqa %7, %9 ; bring the parked value back
+ movdqa %9, %3 ; park %3, which the next swap overwrites
+ SSE2_XSawp wd, %7, %4, %3
+
+ SSE2_XSawp dq, %1, %5, %4 ; stage 3: dword interleave
+ SSE2_XSawp dq, %6, %2, %5
+ SSE2_XSawp dq, %8, %7, %2
+ movdqa %7, %9 ; bring the parked value back
+ movdqa %9, %5 ; park %5, which the next swap overwrites
+ SSE2_XSawp dq, %7, %3, %5
+
+ SSE2_XSawp qdq, %1, %8, %3 ; stage 4: qword interleave
+ SSE2_XSawp qdq, %4, %2, %8
+ SSE2_XSawp qdq, %6, %7, %2
+ movdqa %7, %9 ; bring the parked value back
+ movdqa %9, %1 ; park %1, which the next swap overwrites
+ SSE2_XSawp qdq, %7, %5, %1
+ movdqa %5, %9 ; final parked value lands in %5
+%endmacro
+
+;example operands: xmm0, xmm6, xmm7, [eax], [ecx]
+;xmm7 = 0, eax = pix1, ecx = pix2, xmm0 save the result
+%macro SSE2_LoadDiff8P 5 ; %1 = result, %2 = scratch, %3 = register the caller has zeroed, %4/%5 = the two 8-byte sources
+ movq %1, %4 ; load 8 bytes from the first source
+ punpcklbw %1, %3 ; widen bytes to words by interleaving with %3
+ movq %2, %5 ; load 8 bytes from the second source
+ punpcklbw %2, %3
+ psubw %1, %2 ; %1 = first - second, per word
+%endmacro
+
+; on exit: %2 = %1 + %2, %1 = %1 - %2 (original values); %3 is scratch
+%macro SSE2_SumSub 3
+ movdqa %3, %2 ; keep the original %2 for the subtraction
+ paddw %2, %1
+ psubw %1, %3
+%endmacro
+
+
+%macro butterfly_1to16_sse 3 ; xmm? for dst, xmm? for tmp, one byte for pSrc [generic register name: a/b/c/d]
+ mov %3h, %3l ; duplicate the low byte into the high byte of the 16-bit register
+ movd %1, e%3x ; i.e, %1 = eax (=b0)
+ pshuflw %2, %1, 00h ; ..., b0 b0 b0 b0 b0 b0 b0 b0
+ pshufd %1, %2, 00h ; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0
+%endmacro
+
+;copy a dw into a xmm for 8 times
+%macro SSE2_Copy8Times 2 ; %1 = destination xmm, %2 = 32-bit source whose low word is replicated
+ movd %1, %2
+ punpcklwd %1, %1 ; the low dword now holds the word twice
+ pshufd %1, %1, 0 ; broadcast that dword to all four lanes
+%endmacro
+
+;copy a db into a xmm for 16 times
+%macro SSE2_Copy16Times 2 ; %1 = destination xmm, %2 = 32-bit source; the pack below saturates, so the low word must be 0..255
+ movd %1, %2
+ pshuflw %1, %1, 0 ; replicate the low word across the low four words
+ punpcklqdq %1, %1 ; and across the high four words
+ packuswb %1, %1 ; narrow the eight words to bytes, twice over, giving 16 bytes
+%endmacro
+
+
+
+;***********************************************************************
+;preprocessor constants
+;***********************************************************************
+;dw 32,32,32,32,32,32,32,32 for xmm
+;dw 32,32,32,32 for mm
+%macro WELS_DW32 1 ; builds the constant in %1 without touching memory
+ pcmpeqw %1,%1 ; all bits set
+ psrlw %1,15 ; each word becomes 1
+ psllw %1,5 ; each word becomes 32
+%endmacro
+
+;dw 1, 1, 1, 1, 1, 1, 1, 1 for xmm
+;dw 1, 1, 1, 1 for mm
+%macro WELS_DW1 1
+ pcmpeqw %1,%1 ; all bits set
+ psrlw %1,15 ; each word becomes 1
+%endmacro
+
+;all 0 for xmm and mm
+%macro WELS_Zero 1
+ pxor %1, %1
+%endmacro
+
+;dd 1, 1, 1, 1 for xmm
+;dd 1, 1 for mm
+%macro WELS_DD1 1
+ pcmpeqw %1,%1 ; all bits set
+ psrld %1,31 ; each dword becomes 1
+%endmacro
+
+;dB 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
+%macro WELS_DB1 1
+ pcmpeqw %1,%1 ; all bits set
+ psrlw %1,15 ; each word becomes 1
+ packuswb %1,%1 ; narrow the words to bytes, giving 1 in every byte
+%endmacro
+
+
+
+
+
+
--- /dev/null
+++ b/codec/common/cpuid.asm
@@ -1,0 +1,220 @@
+;*!
+;* \copy
+;* Copyright (c) 2009-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;* cpuid.asm
+;*
+;* Abstract
+;* verify cpuid feature support and cpuid detection
+;*
+;* History
+;* 04/29/2009 Created
+;*
+;*************************************************************************/
+
+%include "asm_inc.asm"
+
+;******************************************************************************************
+; Macros
+;******************************************************************************************
+
+
+;******************************************************************************************
+; Code
+;******************************************************************************************
+
+SECTION .text
+
+; refer to "The IA-32 Intel(R) Architecture Software Developers Manual, Volume 2A A-M"
+; section CPUID - CPU Identification
+
+WELS_EXTERN WelsCPUIdVerify
+ALIGN 16
+;******************************************************************************************
+; int32_t WelsCPUIdVerify()
+;******************************************************************************************
+WelsCPUIdVerify:
+ push r1 ; preserve r1, used below as scratch
+ PUSHRFLAGS ; copy 1 of the flags: restored unchanged by POPRFLAGS below
+ PUSHRFLAGS ; copy 2 of the flags: popped into r1 for inspection
+
+ pop r1 ; r1 = current flags
+ mov eax, r1d
+ xor eax, 00200000h ; toggle the ID flag (bit 21) in the copy held in eax
+ xor eax, r1d ; NOTE(review): the toggled value is never written back to the flags register and re-read, so eax is 00200000h on every path - confirm whether a real CPUID probe is intended
+ POPRFLAGS ; restore the original flags
+ pop r1
+ ret ; result in eax
+
+WELS_EXTERN WelsCPUId
+ALIGN 16
+;****************************************************************************************************
+; void WelsCPUId( int32_t uiIndex, int32_t *pFeatureA, int32_t *pFeatureB, int32_t *pFeatureC, int32_t *pFeatureD )
+;****************************************************************************************************
+%ifdef WIN64
+
+WelsCPUId:
+ push rbx ; cpuid overwrites ebx; saved here and restored before ret
+ push rdx ; rdx = pFeatureA; cpuid overwrites edx, so park the pointer
+
+ mov eax, ecx ; eax = uiIndex (ecx is left unchanged as cpuid's sub-leaf input)
+ cpuid
+ mov [r9], ecx ; *pFeatureC = ecx
+ mov [r8], ebx ; *pFeatureB = ebx
+ mov rcx, [rsp + 2*8 + 40] ; fifth argument pFeatureD: two pushes (16) + return address and shadow space (40)
+ mov [rcx], edx ; *pFeatureD = edx
+ pop rdx ; recover pFeatureA
+ mov [rdx], eax ; *pFeatureA = eax
+
+ pop rbx
+ ret
+
+%elifdef UNIX64
+WelsCPUId:
+ push rbx ; cpuid overwrites ebx; saved here and restored before ret
+ push rcx ; rcx = pFeatureC; cpuid overwrites ecx, so park the pointer
+ push rdx ; rdx = pFeatureB; cpuid overwrites edx, so park the pointer
+
+ mov eax, edi ; eax = uiIndex
+ cpuid
+ mov [r8], edx ; *pFeatureD = edx (r8 is not touched by cpuid)
+ pop rdx ; rdx = pFeatureB
+ pop r8 ; r8 = pFeatureC (r8 is free now that pFeatureD has been written)
+ mov [r8], ecx ; *pFeatureC = ecx
+ mov [rdx], ebx ; *pFeatureB = ebx
+ mov [rsi], eax ; *pFeatureA = eax
+
+ pop rbx
+ ret
+
+%elifdef X86_32
+
+WelsCPUId:
+ push ebx ; cpuid overwrites ebx; saved here and restored before ret
+ push edi ; edi is used below as the pointer scratch register
+
+ mov eax, [esp+12] ; uiIndex: two pushes (8) + return address (4)
+ cpuid ; cpuid
+
+ ; store each result register through the matching pointer argument
+ mov edi, [esp+16] ; pFeatureA
+ mov [edi], eax
+ mov edi, [esp+20] ; pFeatureB
+ mov [edi], ebx
+ mov edi, [esp+24] ; pFeatureC
+ mov [edi], ecx
+ mov edi, [esp+28] ; pFeatureD
+ mov [edi], edx
+
+ pop edi
+ pop ebx
+ ret
+
+%endif
+
+WELS_EXTERN WelsCPUSupportAVX
+; call only after cpuid leaf 1 has been queried; pass the eax and ecx values it returned
+ALIGN 16
+;****************************************************************************************************
+; int32_t WelsCPUSupportAVX( uint32_t eax, uint32_t ecx )
+;****************************************************************************************************
+WelsCPUSupportAVX:
+%ifdef WIN64
+ mov eax, ecx ; first argument; note eax is overwritten by XGETBV below before it is read
+ mov ecx, edx ; second argument: cpuid(1).ecx feature flags
+%elifdef UNIX64
+ mov eax, edi
+ mov ecx, esi
+%else
+ mov eax, [esp+4] ; stack arguments; this branch is used whenever neither WIN64 nor UNIX64 is defined
+ mov ecx, [esp+8]
+%endif
+
+ ; refer to detection of AVX addressed in INTEL AVX manual document
+ and ecx, 018000000H
+ cmp ecx, 018000000H ; check both OSXSAVE and AVX feature flags
+ jne avx_not_supported
+ ; processor supports AVX instructions and XGETBV is enabled by OS
+ mov ecx, 0 ; specify 0 for XFEATURE_ENABLED_MASK register
+ XGETBV ; result in EDX:EAX
+ and eax, 06H
+ cmp eax, 06H ; check OS has enabled both XMM and YMM state support
+ jne avx_not_supported
+ mov eax, 1 ; return 1: AVX usable
+ ret
+avx_not_supported: ; file-level (non-local) label
+ mov eax, 0 ; return 0: AVX not usable
+ ret
+
+
+WELS_EXTERN WelsCPUSupportFMA
+; call only after cpuid leaf 1 has been queried; pass the eax and ecx values it returned
+ALIGN 16
+;****************************************************************************************************
+; int32_t WelsCPUSupportFMA( uint32_t eax, uint32_t ecx )
+;****************************************************************************************************
+WelsCPUSupportFMA:
+%ifdef WIN64
+ mov eax, ecx ; first argument; note eax is overwritten by XGETBV below before it is read
+ mov ecx, edx ; second argument: cpuid(1).ecx feature flags
+%elifdef UNIX64
+ mov eax, edi
+ mov ecx, esi
+%else
+ mov eax, [esp+4] ; stack arguments; this branch is used whenever neither WIN64 nor UNIX64 is defined
+ mov ecx, [esp+8]
+%endif
+ ; refer to detection of FMA addressed in INTEL AVX manual document
+ and ecx, 018001000H
+ cmp ecx, 018001000H ; check OSXSAVE, AVX, FMA feature flags
+ jne fma_not_supported
+ ; processor supports AVX,FMA instructions and XGETBV is enabled by OS
+ mov ecx, 0 ; specify 0 for XFEATURE_ENABLED_MASK register
+ XGETBV ; result in EDX:EAX
+ and eax, 06H
+ cmp eax, 06H ; check OS has enabled both XMM and YMM state support
+ jne fma_not_supported
+ mov eax, 1 ; return 1: FMA usable
+ ret
+fma_not_supported: ; file-level (non-local) label
+ mov eax, 0 ; return 0: FMA not usable
+ ret
+
+WELS_EXTERN WelsEmms
+ALIGN 16
+;******************************************************************************************
+; void WelsEmms()
+;******************************************************************************************
+WelsEmms:
+ emms ; clear the MMX state so x87 floating-point code can run afterwards
+ ret
+
+
+
--- /dev/null
+++ b/codec/common/deblock.asm
@@ -1,0 +1,5325 @@
+;*!
+;* \copy
+;* Copyright (c) 2009-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;* deblock.asm
+;*
+;* Abstract
+;* edge loop
+;*
+;* History
+;* 08/07/2009 Created
+;*
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+
+;*******************************************************************************
+; Read-only data used by the deblocking routines below
+;*******************************************************************************
+
+; COFF output takes a different section declaration from the other object formats
+%ifndef FORMAT_COFF
+SECTION .rodata align=16
+%else
+SECTION .rodata pData
+%endif
+
+ALIGN 16
+; eight 16-bit words, each 4: the rounding term added before the "psraw ...,3" steps
+FOUR_16B_SSE2: times 8 dw 4
+
+
+SECTION .text
+
+%ifdef WIN64
+
+
+WELS_EXTERN DeblockLumaLt4V_sse2
+
+;*******************************************************************************
+; DeblockLumaLt4V_sse2 (WIN64 variant)
+;   Arguments as consumed by the code below (Microsoft x64 convention):
+;     rcx           base pixel pointer; 16-byte rows at base-3*stride ..
+;                   base+2*stride are loaded with movdqa (must be 16-byte aligned)
+;     edx           stride, sign-extended to 64 bits
+;     r8d           threshold broadcast to 8 words and compared with
+;                   |row(0) - row(-1)| (presumably iAlpha - confirm with caller)
+;     r9d           threshold broadcast to 8 words and compared with the other
+;                   row differences (presumably iBeta - confirm with caller)
+;     5th (stack)   pTC: pointer to four signed bytes
+;   Writes back rows base-2*stride, base-stride, base and base+stride.
+;   xmm6-xmm15 are callee-saved on Win64: they are spilled to [rbp+0..90h]
+;   as they are first needed and reloaded just before returning.
+;   NOTE(review): pabsw is an SSSE3 instruction although the name says sse2.
+;*******************************************************************************
+DeblockLumaLt4V_sse2:
+ push rbp
+ mov r11,[rsp + 16 + 20h] ; pTC: 5th arg sits above saved rbp, return address and 20h home space
+ sub rsp,1B0h
+ lea rbp,[rsp+20h] ; rbp = frame base for XMM save slots and locals
+ movd xmm4,r8d
+ movd xmm2,r9d
+ mov qword [rbp+180h],r12 ; r12 is callee-saved; restored in the epilogue
+ mov r10,rcx
+ movsxd r12,edx ; r12 = stride
+ add edx,edx
+ movsxd rdx,edx ; rdx = 2*stride
+ sub r10,r12 ; r10 = base - stride
+ movsx r8d,byte [r11]
+ pxor xmm3,xmm3
+ punpcklwd xmm2,xmm2
+ movaps [rbp+50h],xmm14
+ lea rax,[r12+r12*2]
+ movdqa xmm14,[rdx+rcx]
+ neg rax ; rax = -3*stride
+ pshufd xmm0,xmm2,0
+ movd xmm2,r8d
+ movsx edx,byte [r11+1]
+ movsx r8d,byte [r11+2]
+ movsx r11d,byte [r11+3]
+ movaps [rbp+70h],xmm12
+ movd xmm1,edx
+ movaps [rbp+80h],xmm11
+ movd xmm12,r8d
+ movd xmm11,r11d
+ movdqa xmm5, [rax+rcx]
+ lea rax,[r12+r12]
+ punpcklwd xmm12,xmm12
+ neg rax ; rax = -2*stride
+ punpcklwd xmm11,xmm11
+ movaps [rbp],xmm8
+ movdqa xmm8, [r10]
+ punpcklwd xmm2,xmm2
+ punpcklwd xmm1,xmm1
+ punpcklqdq xmm12,xmm12
+ punpcklqdq xmm11,xmm11
+ punpcklqdq xmm2,xmm2
+ punpcklqdq xmm1,xmm1
+ shufps xmm12,xmm11,88h
+ movdqa xmm11,xmm8
+ movaps [rbp+30h],xmm9
+ movdqa xmm9,[rcx]
+ shufps xmm2,xmm1,88h
+ movdqa xmm1,xmm5
+ punpcklbw xmm11,xmm3
+ movaps [rbp+20h],xmm6
+ movaps [rbp+60h],xmm13
+ movdqa xmm13,xmm11
+ movaps [rbp+90h],xmm10
+ movdqa xmm10,xmm9
+ movdqa xmm6,[rax+rcx]
+ punpcklbw xmm1,xmm3
+ movaps [rbp+0A0h],xmm12
+ psubw xmm13,xmm1
+ movaps [rbp+40h],xmm15
+ movdqa xmm15,xmm14
+ movaps [rbp+10h],xmm7
+ movdqa xmm7,xmm6
+ punpcklbw xmm10,xmm3
+ movdqa xmm12,[r12+rcx]
+ punpcklbw xmm7,xmm3
+ punpcklbw xmm12,xmm3
+ punpcklbw xmm15,xmm3
+ pabsw xmm3,xmm13
+ movdqa xmm13,xmm10
+ psubw xmm13,xmm15
+ movdqa [rbp+0F0h],xmm15
+ pabsw xmm15,xmm13
+ movdqa xmm13,xmm11
+ movdqa [rbp+0B0h],xmm1
+ movdqa xmm1,xmm0
+ pavgw xmm13,xmm10
+ pcmpgtw xmm1,xmm3
+ movdqa [rbp+120h],xmm13
+ movaps xmm13,xmm2
+ punpcklwd xmm4,xmm4
+ movdqa xmm3,xmm0
+ movdqa [rbp+100h],xmm1
+ psubw xmm13,xmm1
+ movdqa xmm1,xmm10
+ pcmpgtw xmm3,xmm15
+ pshufd xmm4,xmm4,0
+ psubw xmm1,xmm11
+ movdqa [rbp+0D0h],xmm10
+ psubw xmm13,xmm3
+ movdqa [rbp+110h],xmm3
+ pabsw xmm15,xmm1
+ movdqa xmm3,xmm4
+ psubw xmm10,xmm12
+ pcmpgtw xmm3,xmm15
+ pabsw xmm15,xmm10
+ movdqa xmm10,xmm0
+ psllw xmm1,2
+ movdqa [rbp+0C0h],xmm11
+ psubw xmm11,xmm7
+ pcmpgtw xmm10,xmm15
+ pabsw xmm11,xmm11
+ movdqa xmm15,xmm0
+ pand xmm3,xmm10
+ pcmpgtw xmm15,xmm11
+ movaps xmm11,xmm2
+ pxor xmm10,xmm10
+ pand xmm3,xmm15
+ pcmpgtw xmm11,xmm10
+ pcmpeqw xmm10,xmm2
+ por xmm11,xmm10
+ pand xmm3,xmm11
+ movdqa xmm11,xmm7
+ psubw xmm11,xmm12
+ pxor xmm15,xmm15
+ paddw xmm11,xmm1
+ psubw xmm15,xmm13
+ movdqa [rbp+0E0h],xmm12
+ paddw xmm11,[FOUR_16B_SSE2]
+ pxor xmm12,xmm12
+ psraw xmm11,3
+ punpckhbw xmm8,xmm12
+ pmaxsw xmm15,xmm11
+ punpckhbw xmm5,xmm12
+ movdqa xmm11,xmm8
+ pminsw xmm13,xmm15
+ psubw xmm11,xmm5
+ punpckhbw xmm9,xmm12
+ pand xmm13,xmm3
+ movdqa [rbp+130h],xmm13
+ pabsw xmm13,xmm11
+ punpckhbw xmm14,xmm12
+ movdqa xmm11,xmm9
+ psubw xmm11,xmm14
+ movdqa xmm15,xmm0
+ movdqa [rbp+140h],xmm14
+ pabsw xmm14,xmm11
+ movdqa xmm11,xmm8
+ pcmpgtw xmm15,xmm14
+ movdqa xmm1,[r12+rcx]
+ pavgw xmm11,xmm9
+ movdqa [rbp+170h],xmm11
+ movdqa xmm10,xmm9
+ punpckhbw xmm6,xmm12
+ psubw xmm10,xmm8
+ punpckhbw xmm1,xmm12
+ movdqa xmm12,xmm0
+ movaps xmm11,[rbp+0A0h]
+ pcmpgtw xmm12,xmm13
+ movaps xmm13,xmm11
+ psubw xmm13,xmm12
+ movdqa [rbp+160h],xmm15
+ psubw xmm13,xmm15
+ movdqa xmm15,xmm9
+ psubw xmm15,xmm1
+ movdqa [rbp+150h],xmm12
+ pabsw xmm12,xmm10
+ pabsw xmm14,xmm15
+ movdqa xmm15,xmm8
+ pcmpgtw xmm4,xmm12
+ movdqa xmm12,xmm0
+ psubw xmm15,xmm6
+ pcmpgtw xmm12,xmm14
+ pabsw xmm14,xmm15
+ psllw xmm10,2
+ pcmpgtw xmm0,xmm14
+ movdqa xmm14,xmm6
+ psubw xmm14,xmm1
+ pand xmm4,xmm12
+ paddw xmm14,xmm10
+ pand xmm4,xmm0
+ paddw xmm14,[FOUR_16B_SSE2]
+ pxor xmm15,xmm15
+ movaps xmm12,xmm11
+ psubw xmm15,xmm13
+ pxor xmm0,xmm0
+ psraw xmm14,3
+ pcmpgtw xmm12,xmm0
+ pcmpeqw xmm0,xmm11
+ pmaxsw xmm15,xmm14
+ por xmm12,xmm0
+ movdqa xmm0,[rbp+120h]
+ pminsw xmm13,xmm15
+ movdqa xmm15,[rbp+0B0h]
+ movdqa xmm10,xmm7
+ pand xmm4,xmm12
+ paddw xmm15,xmm0
+ pxor xmm12,xmm12
+ paddw xmm10,xmm7
+ movdqa xmm14,xmm12
+ psubw xmm15,xmm10
+ psubw xmm14,xmm2
+ psraw xmm15,1
+ pmaxsw xmm15,xmm14
+ movdqa xmm10,xmm6
+ pminsw xmm15,xmm2
+ paddw xmm10,xmm6
+ pand xmm15,xmm3
+ psubw xmm12,xmm11
+ pand xmm15,[rbp+100h]
+ pand xmm13,xmm4
+ paddw xmm7,xmm15
+ paddw xmm8,xmm13
+ movdqa xmm15,[rbp+170h]
+ psubw xmm9,xmm13
+ paddw xmm5,xmm15
+ psubw xmm5,xmm10
+ psraw xmm5,1
+ pmaxsw xmm5,xmm12
+ pminsw xmm5,xmm11
+ pand xmm5,xmm4
+ pand xmm5,[rbp+150h]
+ paddw xmm6,xmm5
+ movdqa xmm5,[rbp+0C0h]
+ packuswb xmm7,xmm6
+ movdqa xmm6,[rbp+130h]
+ paddw xmm5,xmm6
+ packuswb xmm5,xmm8
+ movdqa xmm8,[rbp+0D0h]
+ psubw xmm8,xmm6
+ movdqa xmm6,[rbp+0F0h]
+ paddw xmm6,xmm0
+ movdqa xmm0,[rbp+0E0h]
+ packuswb xmm8,xmm9
+ movdqa xmm9,xmm0
+ paddw xmm9,xmm0
+ psubw xmm6,xmm9
+ psraw xmm6,1
+ pmaxsw xmm14,xmm6
+ pminsw xmm2,xmm14
+ pand xmm2,xmm3
+ pand xmm2,[rbp+110h]
+ paddw xmm0,xmm2
+ movdqa xmm2,[rbp+140h]
+ paddw xmm2,xmm15
+ movdqa xmm15,xmm1
+ paddw xmm15,xmm1
+ psubw xmm2,xmm15
+ psraw xmm2,1
+ pmaxsw xmm12,xmm2
+ pminsw xmm11,xmm12
+ pand xmm11,xmm4
+ pand xmm11,[rbp+160h]
+ paddw xmm1,xmm11
+ movdqa [rax+rcx],xmm7 ; write row base-2*stride
+ movdqa [r10],xmm5 ; write row base-stride
+ packuswb xmm0,xmm1
+ movdqa [rcx],xmm8 ; write row base
+ movdqa [r12+rcx],xmm0 ; write row base+stride
+ mov r12,qword [rbp+180h]
+ ; reload the callee-saved XMM registers from the slots filled in the prologue
+ movaps xmm6,[rbp+20h]
+ movaps xmm7,[rbp+10h]
+ movaps xmm8,[rbp]
+ movaps xmm9,[rbp+30h]
+ movaps xmm10,[rbp+90h]
+ movaps xmm11,[rbp+80h]
+ movaps xmm12,[rbp+70h]
+ movaps xmm13,[rbp+60h]
+ movaps xmm14,[rbp+50h]
+ movaps xmm15,[rbp+40h]
+ lea rsp,[rbp+190h] ; rbp+190h = rsp+1B0h, i.e. undo the frame allocation
+ pop rbp
+ ret
+
+
+WELS_EXTERN DeblockLumaEq4V_sse2
+
+ALIGN 16
+DeblockLumaEq4V_sse2: ; WIN64: loads the 8 rows base-4*stride .. base+3*stride, rewrites the 6 rows base-3*stride .. base+2*stride
+ mov rax,rsp ; rax = rsp at entry; base address for the XMM save slots
+ push rbx
+ push rbp
+ push rsi
+ push rdi
+ sub rsp,1D8h ; XMM save area plus scratch slots addressed as [rsp+...]
+ movaps [rax-38h],xmm6 ; spill xmm6-xmm15; each is reloaded via r11 near the end
+ movaps [rax-48h],xmm7
+ movaps [rax-58h],xmm8
+ pxor xmm1,xmm1 ; zero register used for byte->word unpacking
+ movsxd r10,edx ; r10 = 2nd argument sign-extended; used as the row stride
+ mov rbp,rcx ; rbp = 1st argument, the base pixel pointer
+ mov r11d,r8d ; r11d = 3rd argument (presumably iAlpha - confirm with caller)
+ mov rdx,rcx
+ mov rdi,rbp
+ mov rbx,rbp
+ movdqa xmm5,[rbp]
+ movaps [rax-68h],xmm9
+ movaps [rax-78h],xmm10
+ punpcklbw xmm5,xmm1
+ movaps [rax-88h],xmm11
+ movaps [rax-98h],xmm12
+ movaps [rax-0A8h],xmm13
+ movaps [rax-0B8h],xmm14
+ movdqa xmm14,[r10+rbp]
+ movaps [rax-0C8h],xmm15
+ lea eax,[r10*4] ; rax is no longer needed as the save-area base from here on
+ movsxd r8,eax
+ lea eax,[r10+r10*2]
+ movsxd rcx,eax ; rcx = 3*stride
+ lea eax,[r10+r10]
+ sub rdx,r8 ; rdx = base - 4*stride
+ punpcklbw xmm14,xmm1
+ movdqa [rsp+90h],xmm5
+ movdqa [rsp+30h],xmm14
+ movsxd rsi,eax ; rsi = 2*stride
+ movsx eax,r11w
+ sub rdi,rcx ; rdi = base - 3*stride
+ sub rbx,rsi ; rbx = base - 2*stride
+ mov r8,rbp
+ sub r8,r10 ; r8 = base - stride
+ movd xmm0,eax
+ movsx eax,r9w ; 4th argument (presumably iBeta - confirm with caller)
+ movdqa xmm12,[rdi]
+ movdqa xmm6, [rsi+rbp]
+ movdqa xmm13,[rbx]
+ punpcklwd xmm0,xmm0
+ pshufd xmm11,xmm0,0 ; xmm11 = 3rd argument replicated into all 8 words
+ punpcklbw xmm13,xmm1
+ punpcklbw xmm6,xmm1
+ movdqa xmm8,[r8]
+ movd xmm0,eax
+ movdqa xmm10,xmm11
+ mov eax,2
+ punpcklbw xmm8,xmm1
+ punpcklbw xmm12,xmm1
+ cwde
+ punpcklwd xmm0,xmm0
+ psraw xmm10,2
+ movdqa xmm1,xmm8
+ movdqa [rsp+0F0h],xmm13
+ movdqa [rsp+0B0h],xmm8
+ pshufd xmm7,xmm0,0 ; xmm7 = 4th argument replicated into all 8 words
+ psubw xmm1,xmm13
+ movdqa xmm0,xmm5
+ movdqa xmm4,xmm7
+ movdqa xmm2,xmm7
+ psubw xmm0,xmm8
+ pabsw xmm3,xmm0
+ pabsw xmm0,xmm1
+ movdqa xmm1,xmm5
+ movdqa [rsp+40h],xmm7
+ movdqa [rsp+60h],xmm6
+ pcmpgtw xmm4,xmm0
+ psubw xmm1,xmm14
+ pabsw xmm0,xmm1
+ pcmpgtw xmm2,xmm0
+ pand xmm4,xmm2
+ movdqa xmm0,xmm11
+ pcmpgtw xmm0,xmm3
+ pand xmm4,xmm0
+ movd xmm0,eax
+ movdqa [rsp+20h],xmm4
+ punpcklwd xmm0,xmm0
+ pshufd xmm2,xmm0,0 ; xmm2 = eight words of 2
+ paddw xmm10,xmm2
+ movdqa [rsp+0A0h],xmm2
+ movdqa xmm15,xmm7
+ pxor xmm4,xmm4
+ movdqa xmm0,xmm8
+ psubw xmm0,xmm12
+ mov eax,4
+ pabsw xmm0,xmm0
+ movdqa xmm1,xmm10
+ cwde
+ pcmpgtw xmm15,xmm0
+ pcmpgtw xmm1,xmm3
+ movdqa xmm3,xmm7
+ movdqa xmm7,[rdx]
+ movdqa xmm0,xmm5
+ psubw xmm0,xmm6
+ pand xmm15,xmm1
+ punpcklbw xmm7,xmm4
+ movdqa xmm9,xmm15
+ pabsw xmm0,xmm0
+ psllw xmm7,1
+ pandn xmm9,xmm12
+ pcmpgtw xmm3,xmm0
+ paddw xmm7,xmm12
+ movd xmm0,eax
+ pand xmm3,xmm1
+ paddw xmm7,xmm12
+ punpcklwd xmm0,xmm0
+ paddw xmm7,xmm12
+ pshufd xmm1,xmm0,0 ; xmm1 = eight words of 4
+ paddw xmm7,xmm13
+ movdqa xmm0,xmm3
+ pandn xmm0,xmm6
+ paddw xmm7,xmm8
+ movdqa [rsp+70h],xmm1
+ paddw xmm7,xmm5
+ movdqa [rsp+120h],xmm0
+ movdqa xmm0,[rcx+rbp]
+ punpcklbw xmm0,xmm4
+ paddw xmm7,xmm1
+ movdqa xmm4,xmm15
+ psllw xmm0,1
+ psraw xmm7,3
+ paddw xmm0,xmm6
+ pand xmm7,xmm15
+ paddw xmm0,xmm6
+ paddw xmm0,xmm6
+ paddw xmm0,xmm14
+ movdqa xmm6,xmm15
+ paddw xmm0,xmm5
+ pandn xmm6,xmm13
+ paddw xmm0,xmm8
+ paddw xmm0,xmm1
+ psraw xmm0,3
+ movdqa xmm1,xmm12
+ paddw xmm1,xmm13
+ pand xmm0,xmm3
+ movdqa [rsp+100h],xmm0
+ movdqa xmm0,xmm8
+ paddw xmm0,xmm5
+ paddw xmm1,xmm0
+ movdqa xmm0,xmm3
+ paddw xmm1,xmm2
+ psraw xmm1,2
+ pandn xmm0,xmm14
+ pand xmm4,xmm1
+ movdqa [rsp+0E0h],xmm0
+ movdqa xmm0,xmm5
+ paddw xmm0,xmm8
+ movdqa xmm1,[rsp+60h]
+ paddw xmm1,xmm14
+ movdqa xmm14,xmm3
+ paddw xmm1,xmm0
+ movdqa xmm0,xmm8
+ paddw xmm0,[rsp+30h]
+ paddw xmm1,xmm2
+ psraw xmm1,2
+ pand xmm14,xmm1
+ movdqa xmm1,xmm13
+ paddw xmm1,xmm13
+ paddw xmm1,xmm0
+ paddw xmm1,xmm2
+ psraw xmm1,2
+ movdqa xmm0,[rsp+30h]
+ movdqa xmm2,xmm13
+ movdqa xmm5,xmm15
+ paddw xmm0,[rsp+70h]
+ pandn xmm5,xmm1
+ paddw xmm2,xmm8
+ movdqa xmm8,[rsp+90h]
+ movdqa xmm1,xmm12
+ paddw xmm2,xmm8
+ psllw xmm2,1
+ paddw xmm2,xmm0
+ paddw xmm1,xmm2
+ movdqa xmm0,xmm8
+ movdqa xmm8,xmm3
+ movdqa xmm2,[rsp+30h]
+ paddw xmm0,xmm13
+ psraw xmm1,3
+ pand xmm15,xmm1
+ movdqa xmm1,xmm2
+ paddw xmm1,xmm2
+ paddw xmm2,[rsp+90h]
+ paddw xmm2,[rsp+0B0h]
+ paddw xmm1,xmm0
+ movdqa xmm0,xmm13
+ movdqa xmm13,[r8]
+ paddw xmm0, [rsp+70h]
+ paddw xmm1, [rsp+0A0h]
+ psllw xmm2,1
+ paddw xmm2,xmm0
+ psraw xmm1,2
+ movdqa xmm0, [rdi]
+ pandn xmm8,xmm1
+ movdqa xmm1, [rsp+60h]
+ paddw xmm1,xmm2
+ movdqa xmm2, [rbx]
+ psraw xmm1,3
+ pand xmm3,xmm1
+ movdqa xmm1, [rbp]
+ movdqa [rsp+0D0h],xmm3
+ pxor xmm3,xmm3 ; zero register for unpacking the high 8 bytes of each row
+ punpckhbw xmm0,xmm3
+ punpckhbw xmm1,xmm3
+ punpckhbw xmm13,xmm3
+ movdqa [rsp+0C0h],xmm0
+ movdqa xmm0,[r10+rbp]
+ movdqa [rsp],xmm1
+ punpckhbw xmm0,xmm3
+ punpckhbw xmm2,xmm3
+ movdqa [rsp+80h],xmm0
+ movdqa xmm0,[rsi+rbp]
+ movdqa [rsp+10h],xmm13
+ punpckhbw xmm0,xmm3
+ movdqa [rsp+50h],xmm0
+ movdqa xmm0,xmm1
+ movdqa xmm1,xmm13
+ psubw xmm0,xmm13
+ psubw xmm1,xmm2
+ pabsw xmm3,xmm0
+ pabsw xmm0,xmm1
+ movdqa xmm1,[rsp]
+ movdqa xmm13,[rsp+40h]
+ movdqa [rsp+110h],xmm2
+ psubw xmm1, [rsp+80h]
+ pcmpgtw xmm13,xmm0
+ pcmpgtw xmm11,xmm3
+ pabsw xmm0,xmm1
+ pcmpgtw xmm10,xmm3
+ movdqa xmm1, [rsp+40h]
+ movdqa xmm2,xmm1
+ movdqa xmm3,xmm1
+ pcmpgtw xmm2,xmm0
+ movdqa xmm0, [rsp+10h]
+ pand xmm13,xmm2
+ pand xmm13,xmm11
+ movdqa xmm11,[rsp+0C0h]
+ psubw xmm0,xmm11
+ pabsw xmm0,xmm0
+ pcmpgtw xmm3,xmm0
+ pand xmm3,xmm10
+ movdqa xmm0,[rsp]
+ psubw xmm0,[rsp+50h]
+ movdqa xmm2,[rdx]
+ pabsw xmm0,xmm0
+ por xmm7,xmm9
+ movdqa xmm9,[rsp+20h]
+ pcmpgtw xmm1,xmm0
+ pand xmm9,xmm7
+ movdqa xmm7,[rsp+20h]
+ movdqa xmm0,xmm7
+ pandn xmm0,xmm12
+ movdqa xmm12,[rsp+110h]
+ pand xmm1,xmm10
+ movdqa xmm10,[rsp+70h]
+ movdqa [rsp+40h],xmm1
+ movdqa xmm1,xmm13
+ por xmm9,xmm0
+ pxor xmm0,xmm0
+ por xmm4,xmm6
+ movdqa xmm6,xmm7
+ punpckhbw xmm2,xmm0
+ por xmm15,xmm5
+ movdqa xmm5,[rsp+20h]
+ movdqa xmm0,xmm3
+ psllw xmm2,1
+ pandn xmm0,xmm11
+ pand xmm6,xmm4
+ movdqa xmm4,[rsp]
+ paddw xmm2,xmm11
+ pand xmm5,xmm15
+ movdqa xmm15,[rsp+20h]
+ paddw xmm2,xmm11
+ paddw xmm2,xmm11
+ paddw xmm2,xmm12
+ paddw xmm2,[rsp+10h]
+ paddw xmm2,[rsp]
+ paddw xmm2,xmm10
+ psraw xmm2,3
+ pand xmm2,xmm3
+ por xmm2,xmm0
+ pand xmm1,xmm2
+ movdqa xmm0,xmm13
+ movdqa xmm2,xmm11
+ pandn xmm0,xmm11
+ paddw xmm2,xmm12
+ por xmm1,xmm0
+ packuswb xmm9,xmm1
+ movdqa xmm0,xmm7
+ movdqa xmm7,[rsp+0A0h]
+ pandn xmm0,[rsp+0F0h]
+ movdqa xmm1,xmm3
+ por xmm6,xmm0
+ movdqa xmm0,[rsp+10h]
+ paddw xmm0,xmm4
+ paddw xmm2,xmm0
+ paddw xmm2,xmm7
+ movdqa xmm0,xmm3
+ pandn xmm0,xmm12
+ psraw xmm2,2
+ pand xmm1,xmm2
+ por xmm1,xmm0
+ movdqa xmm2,xmm13
+ movdqa xmm0,xmm13
+ pand xmm2,xmm1
+ pandn xmm0,xmm12
+ movdqa xmm1,xmm12
+ paddw xmm1,[rsp+10h]
+ por xmm2,xmm0
+ movdqa xmm0,xmm15
+ pandn xmm0,[rsp+0B0h]
+ paddw xmm1,xmm4
+ packuswb xmm6,xmm2
+ movdqa xmm2,xmm3
+ psllw xmm1,1
+ por xmm5,xmm0
+ movdqa xmm0,[rsp+80h]
+ paddw xmm0,xmm10
+ paddw xmm1,xmm0
+ paddw xmm11,xmm1
+ psraw xmm11,3
+ movdqa xmm1,xmm12
+ pand xmm2,xmm11
+ paddw xmm1,xmm12
+ movdqa xmm11,[rsp+80h]
+ movdqa xmm0, [rsp+10h]
+ por xmm14,[rsp+0E0h]
+ paddw xmm0,xmm11
+ movdqa xmm4,xmm15
+ paddw xmm1,xmm0
+ movdqa xmm0,xmm13
+ paddw xmm1,xmm7
+ psraw xmm1,2
+ pandn xmm3,xmm1
+ por xmm2,xmm3
+ movdqa xmm1,xmm13
+ movdqa xmm3,[rsp+10h]
+ pandn xmm0,xmm3
+ pand xmm1,xmm2
+ movdqa xmm2,xmm11
+ paddw xmm2,[rsp]
+ por xmm1,xmm0
+ movdqa xmm0,[rsp+0D0h]
+ por xmm0,xmm8
+ paddw xmm2,xmm3
+ packuswb xmm5,xmm1
+ movdqa xmm8,[rsp+40h]
+ movdqa xmm1,[rsp+50h]
+ movdqa xmm3,xmm8
+ pand xmm4,xmm0
+ psllw xmm2,1
+ movdqa xmm0,xmm15
+ pandn xmm0,[rsp+90h]
+ por xmm4,xmm0
+ movdqa xmm0,xmm12
+ paddw xmm0,xmm10
+ paddw xmm2,xmm0
+ paddw xmm1,xmm2
+ movdqa xmm0,[rsp]
+ movdqa xmm2,xmm11
+ paddw xmm0,xmm12
+ movdqa xmm12,[rsp]
+ paddw xmm2,xmm11
+ paddw xmm2,xmm0
+ psraw xmm1,3
+ movdqa xmm0,xmm8
+ pand xmm3,xmm1
+ paddw xmm2,xmm7
+ movdqa xmm1,xmm13
+ psraw xmm2,2
+ pandn xmm0,xmm2
+ por xmm3,xmm0
+ movdqa xmm2,[rsp+50h]
+ movdqa xmm0,xmm13
+ pandn xmm0,xmm12
+ pand xmm1,xmm3
+ paddw xmm2,xmm11
+ movdqa xmm3,xmm15
+ por xmm1,xmm0
+ pand xmm3,xmm14
+ movdqa xmm14,[rsp+10h]
+ movdqa xmm0,xmm15
+ pandn xmm0,[rsp+30h]
+ packuswb xmm4,xmm1
+ movdqa xmm1,xmm8
+ por xmm3,xmm0
+ movdqa xmm0,xmm12
+ paddw xmm0,xmm14
+ paddw xmm2,xmm0
+ paddw xmm2,xmm7
+ movdqa xmm0,xmm8
+ pandn xmm0,xmm11
+ psraw xmm2,2
+ pand xmm1,xmm2
+ por xmm1,xmm0
+ movdqa xmm2,xmm13
+ movdqa xmm0,xmm13
+ pandn xmm0,xmm11
+ pand xmm2,xmm1
+ movdqa xmm1,xmm15
+ por xmm2,xmm0
+ packuswb xmm3,xmm2
+ movdqa xmm0,[rsp+100h]
+ por xmm0,[rsp+120h]
+ pand xmm1,xmm0
+ movdqa xmm2,[rcx+rbp]
+ movdqa xmm7,[rsp+50h]
+ pandn xmm15,[rsp+60h]
+ lea r11,[rsp+1D8h] ; r11 = rsp as it was right after the four pushes; base for the XMM reloads
+ pxor xmm0,xmm0
+ por xmm1,xmm15
+ movaps xmm15,[r11-0A8h] ; from here on xmm6-xmm15 are reloaded as each becomes dead
+ movdqa [rdi],xmm9 ; write row base-3*stride
+ movaps xmm9,[r11-48h]
+ punpckhbw xmm2,xmm0
+ psllw xmm2,1
+ paddw xmm2,xmm7
+ paddw xmm2,xmm7
+ movdqa [rbx],xmm6 ; write row base-2*stride
+ movaps xmm6,[r11-18h]
+ paddw xmm2,xmm7
+ paddw xmm2,xmm11
+ movaps xmm11,[r11-68h]
+ paddw xmm2,xmm12
+ movaps xmm12,[r11-78h]
+ paddw xmm2,xmm14
+ paddw xmm2,xmm10
+ psraw xmm2,3
+ movaps xmm10,[r11-58h]
+ movaps xmm14,[r11-98h]
+ movdqa xmm0,xmm13
+ pand xmm2,xmm8
+ pandn xmm8,xmm7
+ pandn xmm13,xmm7
+ por xmm2,xmm8
+ movaps xmm7,[r11-28h]
+ movaps xmm8,[r11-38h]
+ movdqa [r8],xmm5 ; write row base-stride
+ pand xmm0,xmm2
+ por xmm0,xmm13
+ packuswb xmm1,xmm0
+ movaps xmm13,[r11-88h]
+ movdqa [rbp],xmm4 ; write row base
+ movdqa [r10+rbp],xmm3 ; write row base+stride
+ movdqa [rsi+rbp],xmm1 ; write row base+2*stride
+ mov rsp,r11 ; drop the frame; the four pushed registers are popped next
+ pop rdi
+ pop rsi
+ pop rbp
+ pop rbx
+ ret
+
+
+WELS_EXTERN DeblockChromaLt4V_sse2
+
+ALIGN 16
+;*******************************************************************************
+; DeblockChromaLt4V_sse2 (WIN64 variant)
+;   Arguments as consumed by the code below (Microsoft x64 convention):
+;     rcx           pointer into the first 8-byte-wide plane
+;     rdx           pointer into the second 8-byte-wide plane
+;     r8d           stride, sign-extended to 64 bits
+;     r9w           threshold broadcast to 8 words (presumably iAlpha - confirm)
+;     5th (stack)   iBeta
+;     6th (stack)   pTC: pointer to four signed bytes
+;   For each plane the rows ptr-2*stride .. ptr+stride are read (8 bytes each)
+;   and the rows ptr-stride and ptr are written back.
+;   xmm6-xmm15 are callee-saved on Win64. The body reloads them from fixed
+;   frame slots once they are dead, so the prologue must fill those slots.
+;   NOTE(review): pabsw is an SSSE3 instruction although the name says sse2.
+;*******************************************************************************
+DeblockChromaLt4V_sse2:
+ mov rax,rsp ; rax = rsp at entry
+ push rbx
+ push rdi
+ sub rsp,0C8h
+ ; spill the callee-saved XMM registers into the slots the body reloads from
+ ; (r11 = rsp+0C8h = rax-10h below, so [r11-18h] is [rax-28h], and so on)
+ movaps [rax-28h],xmm6 ; reloaded from [r11-18h]
+ movaps [rax-38h],xmm7 ; reloaded from [r11-28h]
+ movaps [rax-48h],xmm8 ; reloaded from [r11-38h]
+ movaps [rax-58h],xmm9 ; reloaded from [r11-48h]
+ movaps [rax-68h],xmm10 ; reloaded from [r11-58h]
+ movaps [rax-78h],xmm11 ; reloaded from [r11-68h]
+ movaps [rax-88h],xmm12 ; reloaded from [r11-78h]
+ movaps [rax-98h],xmm13 ; reloaded from [rsp+40h]
+ movaps [rax-0A8h],xmm14 ; reloaded from [rsp+30h]
+ movaps [rax-0B8h],xmm15 ; reloaded from [rsp+20h]
+ mov r10,qword [rax + 30h] ; pTC
+ pxor xmm1,xmm1
+ mov rbx,rcx
+ movsxd r11,r8d ; r11 = stride
+ movsx ecx,byte [r10]
+ movsx r8d,byte [r10+2]
+ mov rdi,rdx
+ movq xmm2,[rbx]
+ movq xmm9,[r11+rbx]
+ movsx edx,byte [r10+1]
+ mov word [rsp+2],cx
+ mov word [rsp],cx
+ movsx eax,byte [r10+3]
+ mov word [rsp+6],dx
+ mov word [rsp+4],dx
+ movdqa xmm11,xmm1
+ mov word [rsp+0Eh],ax
+ mov word [rsp+0Ch],ax
+ lea eax,[r11+r11]
+ movsxd rcx,eax
+ mov rax,rbx
+ mov rdx,rdi
+ sub rax,rcx
+ mov word [rsp+0Ah],r8w
+ mov word [rsp+8],r8w
+ movdqa xmm6,[rsp] ; eight words: each of the four pTC bytes sign-extended and duplicated
+ movdqa xmm7,xmm6
+ movq xmm13, [rax]
+ mov rax,rdi
+ sub rax,rcx
+ mov rcx,rbx
+ pcmpgtw xmm7,xmm1
+ psubw xmm11,xmm6
+ sub rcx,r11
+ sub rdx,r11
+ movq xmm0,[rax]
+ movsx eax,r9w
+ movq xmm15,[rcx]
+ punpcklqdq xmm13,xmm0
+ movq xmm0, [rdx]
+ movdqa xmm4,xmm13
+ punpcklqdq xmm15,xmm0
+ movq xmm0, [rdi]
+ punpcklbw xmm4,xmm1
+ movdqa xmm12,xmm15
+ punpcklqdq xmm2,xmm0
+ movq xmm0, [r11+rdi]
+ punpcklbw xmm12,xmm1
+ movdqa xmm14,xmm2
+ punpcklqdq xmm9,xmm0
+ punpckhbw xmm2,xmm1
+ punpcklbw xmm14,xmm1
+ movd xmm0,eax
+ movsx eax,word [rsp + 0C8h + 38h] ; iBeta
+ punpckhbw xmm13,xmm1
+ punpckhbw xmm15,xmm1
+ movdqa xmm3,xmm9
+ movdqa [rsp+10h],xmm2
+ punpcklwd xmm0,xmm0
+ punpckhbw xmm9,xmm1
+ punpcklbw xmm3,xmm1
+ movdqa xmm1,xmm14
+ pshufd xmm10,xmm0,0
+ movd xmm0,eax
+ mov eax,4
+ cwde
+ punpcklwd xmm0,xmm0
+ pshufd xmm8,xmm0,0
+ movd xmm0,eax
+ punpcklwd xmm0,xmm0
+ pshufd xmm5,xmm0,0
+ psubw xmm1,xmm12
+ movdqa xmm2,xmm10
+ lea r11,[rsp+0C8h] ; r11 = rsp as it was right after the two pushes; base for the XMM reloads
+ psllw xmm1,2
+ movdqa xmm0,xmm4
+ psubw xmm4,xmm12
+ psubw xmm0,xmm3
+ psubw xmm3,xmm14
+ paddw xmm1,xmm0
+ paddw xmm1,xmm5
+ movdqa xmm0,xmm11
+ psraw xmm1,3
+ pmaxsw xmm0,xmm1
+ pminsw xmm6,xmm0
+ movdqa xmm1,xmm8
+ movdqa xmm0,xmm12
+ psubw xmm0,xmm14
+ pabsw xmm0,xmm0
+ pcmpgtw xmm2,xmm0
+ pabsw xmm0,xmm4
+ pcmpgtw xmm1,xmm0
+ pabsw xmm0,xmm3
+ movdqa xmm3,[rsp]
+ pand xmm2,xmm1
+ movdqa xmm1,xmm8
+ pcmpgtw xmm1,xmm0
+ movdqa xmm0,xmm13
+ pand xmm2,xmm1
+ psubw xmm0,xmm9
+ psubw xmm13,xmm15
+ pand xmm2,xmm7
+ pand xmm6,xmm2
+ paddw xmm12,xmm6
+ psubw xmm14,xmm6
+ movdqa xmm2,[rsp+10h]
+ movaps xmm6,[r11-18h]
+ movdqa xmm1,xmm2
+ psubw xmm1,xmm15
+ psubw xmm9,xmm2
+ psllw xmm1,2
+ paddw xmm1,xmm0
+ paddw xmm1,xmm5
+ movdqa xmm0,xmm15
+ psubw xmm0,xmm2
+ psraw xmm1,3
+ pmaxsw xmm11,xmm1
+ pabsw xmm0,xmm0
+ movdqa xmm1,xmm8
+ pcmpgtw xmm10,xmm0
+ pabsw xmm0,xmm13
+ pminsw xmm3,xmm11
+ movaps xmm11,[r11-68h]
+ movaps xmm13,[rsp+40h]
+ pcmpgtw xmm1,xmm0
+ pabsw xmm0,xmm9
+ movaps xmm9, [r11-48h]
+ pand xmm10,xmm1
+ pcmpgtw xmm8,xmm0
+ pand xmm10,xmm8
+ pand xmm10,xmm7
+ movaps xmm8,[r11-38h]
+ movaps xmm7,[r11-28h]
+ pand xmm3,xmm10
+ paddw xmm15,xmm3
+ psubw xmm2,xmm3
+ movaps xmm10,[r11-58h]
+ packuswb xmm12,xmm15
+ movaps xmm15,[rsp+20h]
+ packuswb xmm14,xmm2
+ movq [rcx],xmm12 ; first plane, row ptr-stride
+ movq [rbx],xmm14 ; first plane, row ptr
+ psrldq xmm12,8
+ psrldq xmm14,8
+ movq [rdx],xmm12 ; second plane, row ptr-stride
+ movaps xmm12,[r11-78h]
+ movq [rdi],xmm14 ; second plane, row ptr
+ movaps xmm14,[rsp+30h]
+ mov rsp,r11
+ pop rdi
+ pop rbx
+ ret
+
+
+WELS_EXTERN DeblockChromaEq4V_sse2
+ALIGN 16
+;*******************************************************************************
+; DeblockChromaEq4V_sse2 (WIN64 variant)
+;   Arguments as consumed by the code below (Microsoft x64 convention):
+;     rcx           pointer into the first 8-byte-wide plane
+;     rdx           pointer into the second 8-byte-wide plane
+;     r8d           stride
+;     r9w           threshold broadcast to 8 words (presumably iAlpha - confirm)
+;     5th (stack)   iBeta
+;   For each plane the rows ptr-2*stride .. ptr+stride are read (8 bytes each)
+;   and the rows ptr-stride and ptr are written back.
+;   xmm6-xmm14 are callee-saved on Win64 and used here. The body reloads them
+;   from fixed frame slots, so the prologue must fill those slots first.
+;   NOTE(review): pabsw is an SSSE3 instruction although the name says sse2.
+;*******************************************************************************
+DeblockChromaEq4V_sse2:
+ mov rax,rsp ; rax = rsp at entry
+ push rbx
+ sub rsp,90h
+ ; spill the callee-saved XMM registers into the slots the body reloads from
+ ; (r11 = rsp+90h = rax-8 in the epilogue, so [r11-10h] is [rax-18h], and so on)
+ movaps [rax-18h],xmm6 ; reloaded from [r11-10h]
+ movaps [rax-28h],xmm7 ; reloaded from [rsp+70h]
+ movaps [rax-38h],xmm8 ; reloaded from [r11-30h]
+ movaps [rax-48h],xmm9 ; reloaded from [r11-40h]
+ movaps [rax-58h],xmm10 ; reloaded from [r11-50h]
+ movaps [rax-68h],xmm11 ; reloaded from [r11-60h]
+ movaps [rax-78h],xmm12 ; reloaded from [r11-70h]
+ movaps [rax-88h],xmm13 ; reloaded from [r11-80h]
+ movaps [rax-98h],xmm14 ; reloaded from [rsp]
+ pxor xmm1,xmm1
+ mov r11,rcx
+ mov rbx,rdx
+ mov r10d,r9d
+ movq xmm13,[r11]
+ lea eax,[r8+r8]
+ movsxd r9,eax ; r9 = 2*stride
+ mov rax,rcx
+ sub rax,r9
+ movq xmm14,[rax]
+ mov rax,rdx
+ sub rax,r9
+ movq xmm0,[rax]
+ movsxd rax,r8d ; rax = stride
+ sub rcx,rax
+ sub rdx,rax
+ movq xmm12,[rax+r11]
+ movq xmm10,[rcx]
+ punpcklqdq xmm14,xmm0
+ movdqa xmm8,xmm14
+ movq xmm0,[rdx]
+ punpcklbw xmm8,xmm1
+ punpckhbw xmm14,xmm1
+ punpcklqdq xmm10,xmm0
+ movq xmm0,[rbx]
+ movdqa xmm5,xmm10
+ punpcklqdq xmm13,xmm0
+ movq xmm0, [rax+rbx]
+ punpcklbw xmm5,xmm1
+ movsx eax,r10w
+ movdqa xmm9,xmm13
+ punpcklqdq xmm12,xmm0
+ punpcklbw xmm9,xmm1
+ punpckhbw xmm10,xmm1
+ movd xmm0,eax
+ movsx eax,word [rsp + 90h + 8h + 28h] ; iBeta
+ punpckhbw xmm13,xmm1
+ movdqa xmm7,xmm12
+ punpcklwd xmm0,xmm0
+ punpckhbw xmm12,xmm1
+ pshufd xmm11,xmm0,0
+ punpcklbw xmm7,xmm1
+ movd xmm0,eax
+ movdqa xmm1,xmm8
+ psubw xmm1,xmm5
+ punpcklwd xmm0,xmm0
+ movdqa xmm6,xmm11
+ pshufd xmm3,xmm0,0
+ movdqa xmm0,xmm5
+ psubw xmm0,xmm9
+ movdqa xmm2,xmm3
+ pabsw xmm0,xmm0
+ pcmpgtw xmm6,xmm0
+ pabsw xmm0,xmm1
+ movdqa xmm1,xmm3
+ pcmpgtw xmm2,xmm0
+ pand xmm6,xmm2
+ movdqa xmm0,xmm7
+ movdqa xmm2,xmm3
+ psubw xmm0,xmm9
+ pabsw xmm0,xmm0
+ pcmpgtw xmm1,xmm0
+ pand xmm6,xmm1
+ movdqa xmm0,xmm10
+ movdqa xmm1,xmm14
+ psubw xmm0,xmm13
+ psubw xmm1,xmm10
+ pabsw xmm0,xmm0
+ pcmpgtw xmm11,xmm0
+ pabsw xmm0,xmm1
+ pcmpgtw xmm2,xmm0
+ pand xmm11,xmm2
+ movdqa xmm0,xmm12
+ movdqa xmm4,xmm6
+ movdqa xmm1,xmm8
+ mov eax,2
+ cwde
+ paddw xmm1,xmm8
+ psubw xmm0,xmm13
+ paddw xmm1,xmm5
+ pabsw xmm0,xmm0
+ movdqa xmm2,xmm14
+ paddw xmm1,xmm7
+ pcmpgtw xmm3,xmm0
+ paddw xmm2,xmm14
+ movd xmm0,eax
+ pand xmm11,xmm3
+ paddw xmm7,xmm7
+ paddw xmm2,xmm10
+ punpcklwd xmm0,xmm0
+ paddw xmm2,xmm12
+ paddw xmm12,xmm12
+ pshufd xmm3,xmm0,0
+ paddw xmm7,xmm9
+ paddw xmm12,xmm13
+ movdqa xmm0,xmm6
+ paddw xmm1,xmm3
+ pandn xmm0,xmm5
+ paddw xmm7,xmm8
+ psraw xmm1,2
+ paddw xmm12,xmm14
+ paddw xmm7,xmm3
+ movaps xmm14,[rsp]
+ pand xmm4,xmm1
+ paddw xmm12,xmm3
+ psraw xmm7,2
+ movdqa xmm1,xmm11
+ por xmm4,xmm0
+ psraw xmm12,2
+ paddw xmm2,xmm3
+ movdqa xmm0,xmm11
+ pandn xmm0,xmm10
+ psraw xmm2,2
+ pand xmm1,xmm2
+ por xmm1,xmm0
+ packuswb xmm4,xmm1
+ movdqa xmm0,xmm11
+ movdqa xmm1,xmm6
+ pand xmm1,xmm7
+ movaps xmm7,[rsp+70h]
+ movq [rcx],xmm4 ; first plane, row ptr-stride
+ pandn xmm6,xmm9
+ pandn xmm11,xmm13
+ pand xmm0,xmm12
+ por xmm1,xmm6
+ por xmm0,xmm11
+ psrldq xmm4,8
+ packuswb xmm1,xmm0
+ movq [r11],xmm1 ; first plane, row ptr
+ psrldq xmm1,8
+ movq [rdx],xmm4 ; second plane, row ptr-stride
+ lea r11,[rsp+90h] ; r11 = rsp as it was right after the push; base for the XMM reloads
+ movaps xmm6,[r11-10h]
+ movaps xmm8,[r11-30h]
+ movaps xmm9,[r11-40h]
+ movq [rbx],xmm1 ; second plane, row ptr
+ movaps xmm10,[r11-50h]
+ movaps xmm11,[r11-60h]
+ movaps xmm12,[r11-70h]
+ movaps xmm13,[r11-80h]
+ mov rsp,r11
+ pop rbx
+ ret
+
+
+
+
+
+WELS_EXTERN DeblockChromaEq4H_sse2
+ALIGN 16
+;*******************************************************************************
+; DeblockChromaEq4H_sse2 (WIN64 variant)
+;   Arguments as consumed by the code below (Microsoft x64 convention):
+;     rcx           pointer into the first plane
+;     rdx           pointer into the second plane
+;     r8d           stride
+;     r9w           threshold broadcast to 8 words (presumably iAlpha - confirm)
+;     5th (stack)   iBeta
+;   For 8 consecutive rows of each plane, the 4 bytes at ptr-2 .. ptr+1 are
+;   gathered through stack slots, transposed, filtered, transposed back and
+;   written to the same locations.
+;   xmm6-xmm15 are callee-saved on Win64 and all of them are used here. They
+;   are spilled to [rsp+0A0h .. rsp+13Fh], a part of the 140h-byte frame the
+;   rest of the routine never touches (scratch slots end at rsp+8Fh).
+;   NOTE(review): pabsw is an SSSE3 instruction although the name says sse2.
+;*******************************************************************************
+DeblockChromaEq4H_sse2:
+ mov rax,rsp
+ mov [rax+20h],rbx ; keep rbx in the caller-provided home space
+ push rdi
+ sub rsp,140h
+ ; spill callee-saved XMM registers (rsp is 16-byte aligned here)
+ movaps [rsp+0A0h],xmm6
+ movaps [rsp+0B0h],xmm7
+ movaps [rsp+0C0h],xmm8
+ movaps [rsp+0D0h],xmm9
+ movaps [rsp+0E0h],xmm10
+ movaps [rsp+0F0h],xmm11
+ movaps [rsp+100h],xmm12
+ movaps [rsp+110h],xmm13
+ movaps [rsp+120h],xmm14
+ movaps [rsp+130h],xmm15
+ mov rdi,rdx
+ lea eax,[r8*4]
+ movsxd r10,eax
+ mov eax,[rcx-2]
+ mov [rsp+10h],eax
+ lea rbx,[r10+rdx-2] ; rbx = second plane, row 4, column -2
+ lea r11,[r10+rcx-2] ; r11 = first plane, row 4, column -2
+ movdqa xmm5,[rsp+10h]
+ movsxd r10,r8d ; r10 = stride
+ mov eax,[r10+rcx-2]
+ lea rdx,[r10+r10*2] ; rdx = 3*stride
+ mov [rsp+20h],eax
+ mov eax,[rcx+r10*2-2]
+ mov [rsp+30h],eax
+ mov eax,[rdx+rcx-2]
+ movdqa xmm2,[rsp+20h]
+ mov [rsp+40h],eax
+ mov eax, [rdi-2]
+ movdqa xmm4,[rsp+30h]
+ mov [rsp+50h],eax
+ mov eax,[r10+rdi-2]
+ movdqa xmm3,[rsp+40h]
+ mov [rsp+60h],eax
+ mov eax,[rdi+r10*2-2]
+ punpckldq xmm5,[rsp+50h]
+ mov [rsp+70h],eax
+ mov eax, [rdx+rdi-2]
+ punpckldq xmm2, [rsp+60h]
+ mov [rsp+80h],eax
+ mov eax,[r11]
+ punpckldq xmm4, [rsp+70h]
+ mov [rsp+50h],eax
+ mov eax,[rbx]
+ punpckldq xmm3,[rsp+80h]
+ mov [rsp+60h],eax
+ mov eax,[r10+r11]
+ movdqa xmm0, [rsp+50h]
+ punpckldq xmm0, [rsp+60h]
+ punpcklqdq xmm5,xmm0
+ movdqa [rsp+50h],xmm0
+ mov [rsp+50h],eax
+ mov eax,[r10+rbx]
+ movdqa xmm0,[rsp+50h]
+ movdqa xmm1,xmm5
+ mov [rsp+60h],eax
+ mov eax,[r11+r10*2]
+ punpckldq xmm0, [rsp+60h]
+ punpcklqdq xmm2,xmm0
+ punpcklbw xmm1,xmm2
+ punpckhbw xmm5,xmm2
+ movdqa [rsp+50h],xmm0
+ mov [rsp+50h],eax
+ mov eax,[rbx+r10*2]
+ movdqa xmm0,[rsp+50h]
+ mov [rsp+60h],eax
+ mov eax, [rdx+r11]
+ movdqa xmm15,xmm1
+ punpckldq xmm0,[rsp+60h]
+ punpcklqdq xmm4,xmm0
+ movdqa [rsp+50h],xmm0
+ mov [rsp+50h],eax
+ mov eax, [rdx+rbx]
+ movdqa xmm0,[rsp+50h]
+ mov [rsp+60h],eax
+ punpckldq xmm0, [rsp+60h]
+ punpcklqdq xmm3,xmm0
+ movdqa xmm0,xmm4
+ punpcklbw xmm0,xmm3
+ punpckhbw xmm4,xmm3
+ punpcklwd xmm15,xmm0
+ punpckhwd xmm1,xmm0
+ movdqa xmm0,xmm5
+ movdqa xmm12,xmm15
+ punpcklwd xmm0,xmm4
+ punpckhwd xmm5,xmm4
+ punpckldq xmm12,xmm0
+ punpckhdq xmm15,xmm0
+ movdqa xmm0,xmm1
+ movdqa xmm11,xmm12
+ punpckldq xmm0,xmm5
+ punpckhdq xmm1,xmm5
+ punpcklqdq xmm11,xmm0
+ punpckhqdq xmm12,xmm0
+ movsx eax,r9w
+ movdqa xmm14,xmm15
+ punpcklqdq xmm14,xmm1
+ punpckhqdq xmm15,xmm1
+ pxor xmm1,xmm1
+ movd xmm0,eax
+ movdqa xmm4,xmm12
+ movdqa xmm8,xmm11
+ movsx eax,word [rsp+170h] ; iBeta
+ punpcklwd xmm0,xmm0
+ punpcklbw xmm4,xmm1
+ punpckhbw xmm12,xmm1
+ movdqa xmm9,xmm14
+ movdqa xmm7,xmm15
+ movdqa xmm10,xmm15
+ pshufd xmm13,xmm0,0
+ punpcklbw xmm9,xmm1
+ punpckhbw xmm14,xmm1
+ movdqa xmm6,xmm13
+ movd xmm0,eax
+ movdqa [rsp],xmm11
+ mov eax,2
+ cwde
+ punpckhbw xmm11,xmm1
+ punpckhbw xmm10,xmm1
+ punpcklbw xmm7,xmm1
+ punpcklwd xmm0,xmm0
+ punpcklbw xmm8,xmm1
+ pshufd xmm3,xmm0,0
+ movdqa xmm1,xmm8
+ movdqa xmm0,xmm4
+ psubw xmm0,xmm9
+ psubw xmm1,xmm4
+ movdqa xmm2,xmm3
+ pabsw xmm0,xmm0
+ pcmpgtw xmm6,xmm0
+ pabsw xmm0,xmm1
+ movdqa xmm1,xmm3
+ pcmpgtw xmm2,xmm0
+ pand xmm6,xmm2
+ movdqa xmm0,xmm7
+ movdqa xmm2,xmm3
+ psubw xmm0,xmm9
+ pabsw xmm0,xmm0
+ pcmpgtw xmm1,xmm0
+ pand xmm6,xmm1
+ movdqa xmm0,xmm12
+ movdqa xmm1,xmm11
+ psubw xmm0,xmm14
+ psubw xmm1,xmm12
+ movdqa xmm5,xmm6
+ pabsw xmm0,xmm0
+ pcmpgtw xmm13,xmm0
+ pabsw xmm0,xmm1
+ movdqa xmm1,xmm8
+ pcmpgtw xmm2,xmm0
+ paddw xmm1,xmm8
+ movdqa xmm0,xmm10
+ pand xmm13,xmm2
+ psubw xmm0,xmm14
+ paddw xmm1,xmm4
+ movdqa xmm2,xmm11
+ pabsw xmm0,xmm0
+ paddw xmm2,xmm11
+ paddw xmm1,xmm7
+ pcmpgtw xmm3,xmm0
+ paddw xmm2,xmm12
+ movd xmm0,eax
+ pand xmm13,xmm3
+ paddw xmm2,xmm10
+ punpcklwd xmm0,xmm0
+ pshufd xmm3,xmm0,0
+ movdqa xmm0,xmm6
+ paddw xmm1,xmm3
+ pandn xmm0,xmm4
+ paddw xmm2,xmm3
+ psraw xmm1,2
+ pand xmm5,xmm1
+ por xmm5,xmm0
+ paddw xmm7,xmm7
+ paddw xmm10,xmm10
+ psraw xmm2,2
+ movdqa xmm1,xmm13
+ movdqa xmm0,xmm13
+ pandn xmm0,xmm12
+ pand xmm1,xmm2
+ paddw xmm7,xmm9
+ por xmm1,xmm0
+ paddw xmm10,xmm14
+ paddw xmm7,xmm8
+ movdqa xmm0,xmm13
+ packuswb xmm5,xmm1
+ paddw xmm7,xmm3
+ paddw xmm10,xmm11
+ movdqa xmm1,xmm6
+ paddw xmm10,xmm3
+ pandn xmm6,xmm9
+ psraw xmm7,2
+ pand xmm1,xmm7
+ psraw xmm10,2
+ pandn xmm13,xmm14
+ pand xmm0,xmm10
+ por xmm1,xmm6
+ movdqa xmm6,[rsp]
+ movdqa xmm4,xmm6
+ por xmm0,xmm13
+ punpcklbw xmm4,xmm5
+ punpckhbw xmm6,xmm5
+ movdqa xmm3,xmm4
+ packuswb xmm1,xmm0
+ movdqa xmm0,xmm1
+ punpckhbw xmm1,xmm15
+ punpcklbw xmm0,xmm15
+ punpcklwd xmm3,xmm0
+ punpckhwd xmm4,xmm0
+ movdqa xmm0,xmm6
+ movdqa xmm2,xmm3
+ punpcklwd xmm0,xmm1
+ punpckhwd xmm6,xmm1
+ movdqa xmm1,xmm4
+ punpckldq xmm2,xmm0
+ punpckhdq xmm3,xmm0
+ punpckldq xmm1,xmm6
+ movdqa xmm0,xmm2
+ punpcklqdq xmm0,xmm1
+ punpckhdq xmm4,xmm6
+ punpckhqdq xmm2,xmm1
+ movdqa [rsp+10h],xmm0
+ movdqa [rsp+60h],xmm2
+ movdqa xmm0,xmm3
+ mov eax,[rsp+10h]
+ mov [rcx-2],eax
+ mov eax,[rsp+60h]
+ punpcklqdq xmm0,xmm4
+ punpckhqdq xmm3,xmm4
+ mov [r10+rcx-2],eax
+ movdqa [rsp+20h],xmm0
+ mov eax, [rsp+20h]
+ movdqa [rsp+70h],xmm3
+ mov [rcx+r10*2-2],eax
+ mov eax,[rsp+70h]
+ mov [rdx+rcx-2],eax
+ mov eax,[rsp+18h]
+ mov [r11],eax
+ mov eax,[rsp+68h]
+ mov [r10+r11],eax
+ mov eax,[rsp+28h]
+ mov [r11+r10*2],eax
+ mov eax,[rsp+78h]
+ mov [rdx+r11],eax
+ mov eax,[rsp+14h]
+ mov [rdi-2],eax
+ mov eax,[rsp+64h]
+ mov [r10+rdi-2],eax
+ mov eax,[rsp+24h]
+ mov [rdi+r10*2-2],eax
+ mov eax, [rsp+74h]
+ mov [rdx+rdi-2],eax
+ mov eax, [rsp+1Ch]
+ mov [rbx],eax
+ mov eax, [rsp+6Ch]
+ mov [r10+rbx],eax
+ mov eax,[rsp+2Ch]
+ mov [rbx+r10*2],eax
+ mov eax,[rsp+7Ch]
+ mov [rdx+rbx],eax
+ ; reload the callee-saved XMM registers spilled in the prologue
+ movaps xmm6,[rsp+0A0h]
+ movaps xmm7,[rsp+0B0h]
+ movaps xmm8,[rsp+0C0h]
+ movaps xmm9,[rsp+0D0h]
+ movaps xmm10,[rsp+0E0h]
+ movaps xmm11,[rsp+0F0h]
+ movaps xmm12,[rsp+100h]
+ movaps xmm13,[rsp+110h]
+ movaps xmm14,[rsp+120h]
+ movaps xmm15,[rsp+130h]
+ lea r11,[rsp+140h]
+ mov rbx, [r11+28h] ; same address as [rax+20h] in the prologue
+ mov rsp,r11
+ pop rdi
+ ret
+
+
+
+WELS_EXTERN DeblockChromaLt4H_sse2
+ALIGN 16
+;*******************************************************************************
+; DeblockChromaLt4H_sse2 (WIN64 variant)
+;   Arguments as consumed by the code below (Microsoft x64 convention):
+;     rcx           pointer into the first plane
+;     rdx           pointer into the second plane
+;     r8d           stride
+;     r9w           threshold broadcast to 8 words (presumably iAlpha - confirm)
+;     5th (stack)   iBeta
+;     6th (stack)   pTC: pointer to four signed bytes
+;   For 8 consecutive rows of each plane, the 4 bytes at ptr-2 .. ptr+1 are
+;   gathered through stack slots, transposed, filtered, transposed back and
+;   written to the same locations.
+;   xmm6-xmm15 are callee-saved on Win64 and all of them are used here. They
+;   are spilled to [rsp+0D0h .. rsp+16Fh], a part of the 170h-byte frame the
+;   rest of the routine never touches (scratch slots end at rsp+0BFh).
+;   NOTE(review): pabsw is an SSSE3 instruction although the name says sse2.
+;*******************************************************************************
+DeblockChromaLt4H_sse2:
+ mov rax,rsp
+ push rbx
+ push rbp
+ push rsi
+ push rdi
+ push r12
+ sub rsp,170h
+ ; spill callee-saved XMM registers (rsp is 16-byte aligned here)
+ movaps [rsp+0D0h],xmm6
+ movaps [rsp+0E0h],xmm7
+ movaps [rsp+0F0h],xmm8
+ movaps [rsp+100h],xmm9
+ movaps [rsp+110h],xmm10
+ movaps [rsp+120h],xmm11
+ movaps [rsp+130h],xmm12
+ movaps [rsp+140h],xmm13
+ movaps [rsp+150h],xmm14
+ movaps [rsp+160h],xmm15
+
+ movsxd rsi,r8d ; rsi = stride
+ lea eax,[r8*4]
+ mov r11d,r9d
+ movsxd r10,eax
+ mov eax, [rcx-2]
+ mov r12,rdx
+ mov [rsp+40h],eax
+ mov eax, [rsi+rcx-2]
+ lea rbx,[r10+rcx-2] ; rbx = first plane, row 4, column -2
+ movdqa xmm5,[rsp+40h]
+ mov [rsp+50h],eax
+ mov eax, [rcx+rsi*2-2]
+ lea rbp,[r10+rdx-2] ; rbp = second plane, row 4, column -2
+ movdqa xmm2, [rsp+50h]
+ mov [rsp+60h],eax
+ lea r10,[rsi+rsi*2] ; r10 = 3*stride
+ mov rdi,rcx
+ mov eax,[r10+rcx-2]
+ movdqa xmm4,[rsp+60h]
+ mov [rsp+70h],eax
+ mov eax,[rdx-2]
+ mov [rsp+80h],eax
+ mov eax, [rsi+rdx-2]
+ movdqa xmm3,[rsp+70h]
+ mov [rsp+90h],eax
+ mov eax,[rdx+rsi*2-2]
+ punpckldq xmm5,[rsp+80h]
+ mov [rsp+0A0h],eax
+ mov eax, [r10+rdx-2]
+ punpckldq xmm2,[rsp+90h]
+ mov [rsp+0B0h],eax
+ mov eax, [rbx]
+ punpckldq xmm4,[rsp+0A0h]
+ mov [rsp+80h],eax
+ mov eax,[rbp]
+ punpckldq xmm3,[rsp+0B0h]
+ mov [rsp+90h],eax
+ mov eax,[rsi+rbx]
+ movdqa xmm0,[rsp+80h]
+ punpckldq xmm0,[rsp+90h]
+ punpcklqdq xmm5,xmm0
+ movdqa [rsp+80h],xmm0
+ mov [rsp+80h],eax
+ mov eax,[rsi+rbp]
+ movdqa xmm0,[rsp+80h]
+ movdqa xmm1,xmm5
+ mov [rsp+90h],eax
+ mov eax,[rbx+rsi*2]
+ punpckldq xmm0,[rsp+90h]
+ punpcklqdq xmm2,xmm0
+ punpcklbw xmm1,xmm2
+ punpckhbw xmm5,xmm2
+ movdqa [rsp+80h],xmm0
+ mov [rsp+80h],eax
+ mov eax,[rbp+rsi*2]
+ movdqa xmm0, [rsp+80h]
+ mov [rsp+90h],eax
+ mov eax,[r10+rbx]
+ movdqa xmm7,xmm1
+ punpckldq xmm0,[rsp+90h]
+ punpcklqdq xmm4,xmm0
+ movdqa [rsp+80h],xmm0
+ mov [rsp+80h],eax
+ mov eax, [r10+rbp]
+ movdqa xmm0,[rsp+80h]
+ mov [rsp+90h],eax
+ punpckldq xmm0,[rsp+90h]
+ punpcklqdq xmm3,xmm0
+ movdqa xmm0,xmm4
+ punpcklbw xmm0,xmm3
+ punpckhbw xmm4,xmm3
+ punpcklwd xmm7,xmm0
+ punpckhwd xmm1,xmm0
+ movdqa xmm0,xmm5
+ movdqa xmm6,xmm7
+ punpcklwd xmm0,xmm4
+ punpckhwd xmm5,xmm4
+ punpckldq xmm6,xmm0
+ punpckhdq xmm7,xmm0
+ movdqa xmm0,xmm1
+ punpckldq xmm0,xmm5
+ mov rax, [rsp+1C8h] ; pTC
+ punpckhdq xmm1,xmm5
+ movdqa xmm9,xmm6
+ punpckhqdq xmm6,xmm0
+ punpcklqdq xmm9,xmm0
+ movdqa xmm2,xmm7
+ movdqa xmm13,xmm6
+ movdqa xmm4,xmm9
+ movdqa [rsp+10h],xmm9
+ punpcklqdq xmm2,xmm1
+ punpckhqdq xmm7,xmm1
+ pxor xmm1,xmm1
+ movsx ecx,byte [rax+3]
+ movsx edx,byte [rax+2]
+ movsx r8d,byte [rax+1]
+ movsx r9d,byte [rax]
+ movdqa xmm10,xmm1
+ movdqa xmm15,xmm2
+ punpckhbw xmm2,xmm1
+ punpckhbw xmm6,xmm1
+ punpcklbw xmm4,xmm1
+ movsx eax,r11w
+ mov word [rsp+0Eh],cx
+ mov word [rsp+0Ch],cx
+ movdqa xmm3,xmm7
+ movdqa xmm8,xmm7
+ movdqa [rsp+20h],xmm7
+ punpcklbw xmm15,xmm1
+ punpcklbw xmm13,xmm1
+ punpcklbw xmm3,xmm1
+ mov word [rsp+0Ah],dx
+ mov word [rsp+8],dx
+ mov word [rsp+6],r8w
+ movd xmm0,eax
+ movdqa [rsp+30h],xmm6
+ punpckhbw xmm9,xmm1
+ punpckhbw xmm8,xmm1
+ punpcklwd xmm0,xmm0
+ movsx eax,word [rsp+1C0h] ; iBeta
+ mov word [rsp+4],r8w
+ mov word [rsp+2],r9w
+ pshufd xmm12,xmm0,0
+ mov word [rsp],r9w
+ movd xmm0,eax
+ mov eax,4
+ cwde
+ movdqa xmm14, [rsp] ; eight words: each of the four pTC bytes sign-extended and duplicated
+ movdqa [rsp],xmm2
+ movdqa xmm2,xmm12
+ punpcklwd xmm0,xmm0
+ pshufd xmm11,xmm0,0
+ psubw xmm10,xmm14
+ movd xmm0,eax
+ movdqa xmm7,xmm14
+ movdqa xmm6,xmm14
+ pcmpgtw xmm7,xmm1
+ punpcklwd xmm0,xmm0
+ pshufd xmm5,xmm0,0
+ movdqa xmm0,xmm4
+ movdqa xmm1,xmm15
+ psubw xmm4,xmm13
+ psubw xmm0,xmm3
+ psubw xmm1,xmm13
+ psubw xmm3,xmm15
+ psllw xmm1,2
+ paddw xmm1,xmm0
+ paddw xmm1,xmm5
+ movdqa xmm0,xmm10
+ psraw xmm1,3
+ pmaxsw xmm0,xmm1
+ pminsw xmm6,xmm0
+ movdqa xmm1,xmm11
+ movdqa xmm0,xmm13
+ psubw xmm0,xmm15
+ pabsw xmm0,xmm0
+ pcmpgtw xmm2,xmm0
+ pabsw xmm0,xmm4
+ pcmpgtw xmm1,xmm0
+ pabsw xmm0,xmm3
+ pand xmm2,xmm1
+ movdqa xmm1,xmm11
+ movdqa xmm3,[rsp+30h]
+ pcmpgtw xmm1,xmm0
+ movdqa xmm0,xmm9
+ pand xmm2,xmm1
+ psubw xmm0,xmm8
+ psubw xmm9,xmm3
+ pand xmm2,xmm7
+ pand xmm6,xmm2
+ psubw xmm15,xmm6
+ paddw xmm13,xmm6
+ movdqa xmm2,[rsp]
+ movdqa xmm1,xmm2
+ psubw xmm1,xmm3
+ psubw xmm8,xmm2
+ psllw xmm1,2
+ paddw xmm1,xmm0
+ paddw xmm1,xmm5
+ movdqa xmm0,xmm3
+ movdqa xmm5,[rsp+10h]
+ psubw xmm0,xmm2
+ psraw xmm1,3
+ movdqa xmm4,xmm5
+ pabsw xmm0,xmm0
+ pmaxsw xmm10,xmm1
+ movdqa xmm1,xmm11
+ pcmpgtw xmm12,xmm0
+ pabsw xmm0,xmm9
+ pminsw xmm14,xmm10
+ pcmpgtw xmm1,xmm0
+ pabsw xmm0,xmm8
+ pcmpgtw xmm11,xmm0
+ pand xmm12,xmm1
+ movdqa xmm1,[rsp+20h]
+ pand xmm12,xmm11
+ pand xmm12,xmm7
+ pand xmm14,xmm12
+ paddw xmm3,xmm14
+ psubw xmm2,xmm14
+ packuswb xmm13,xmm3
+ packuswb xmm15,xmm2
+ punpcklbw xmm4,xmm13
+ punpckhbw xmm5,xmm13
+ movdqa xmm0,xmm15
+ punpcklbw xmm0,xmm1
+ punpckhbw xmm15,xmm1
+ movdqa xmm3,xmm4
+ punpcklwd xmm3,xmm0
+ punpckhwd xmm4,xmm0
+ movdqa xmm0,xmm5
+ movdqa xmm2,xmm3
+ movdqa xmm1,xmm4
+ punpcklwd xmm0,xmm15
+ punpckhwd xmm5,xmm15
+ punpckldq xmm2,xmm0
+ punpckhdq xmm3,xmm0
+ punpckldq xmm1,xmm5
+ movdqa xmm0,xmm2
+ punpcklqdq xmm0,xmm1
+ punpckhdq xmm4,xmm5
+ punpckhqdq xmm2,xmm1
+ movdqa [rsp+40h],xmm0
+ movdqa xmm0,xmm3
+ movdqa [rsp+90h],xmm2
+ mov eax,[rsp+40h]
+ mov [rdi-2],eax
+ mov eax, [rsp+90h]
+ punpcklqdq xmm0,xmm4
+ punpckhqdq xmm3,xmm4
+ mov [rsi+rdi-2],eax
+ movdqa [rsp+50h],xmm0
+ mov eax,[rsp+50h]
+ movdqa [rsp+0A0h],xmm3
+ mov [rdi+rsi*2-2],eax
+ mov eax,[rsp+0A0h]
+ mov [r10+rdi-2],eax
+ mov eax,[rsp+48h]
+ mov [rbx],eax
+ mov eax,[rsp+98h]
+ mov [rsi+rbx],eax
+ mov eax,[rsp+58h]
+ mov [rbx+rsi*2],eax
+ mov eax, [rsp+0A8h]
+ mov [r10+rbx],eax
+ mov eax, [rsp+44h]
+ mov [r12-2],eax
+ mov eax,[rsp+94h]
+ mov [rsi+r12-2],eax
+ mov eax,[rsp+54h]
+ mov [r12+rsi*2-2],eax
+ mov eax, [rsp+0A4h]
+ mov [r10+r12-2],eax
+ mov eax,[rsp+4Ch]
+ mov [rbp],eax
+ mov eax,[rsp+9Ch]
+ mov [rsi+rbp],eax
+ mov eax, [rsp+5Ch]
+ mov [rbp+rsi*2],eax
+ mov eax,[rsp+0ACh]
+ mov [r10+rbp],eax
+ ; reload the callee-saved XMM registers spilled in the prologue
+ movaps xmm6,[rsp+0D0h]
+ movaps xmm7,[rsp+0E0h]
+ movaps xmm8,[rsp+0F0h]
+ movaps xmm9,[rsp+100h]
+ movaps xmm10,[rsp+110h]
+ movaps xmm11,[rsp+120h]
+ movaps xmm12,[rsp+130h]
+ movaps xmm13,[rsp+140h]
+ movaps xmm14,[rsp+150h]
+ movaps xmm15,[rsp+160h]
+ lea r11,[rsp+170h]
+ mov rsp,r11
+ pop r12
+ pop rdi
+ pop rsi
+ pop rbp
+ pop rbx
+ ret
+
+
+
+%elifdef UNIX64
+
+
+;********************************************************************************
+; DeblockLumaLt4V_sse2 -- UNIX64 (System V AMD64) variant
+;
+; Register arguments as consumed below:
+;   rdi = pPix      row pointer; rows are accessed with movdqa, so they must
+;                   be 16-byte aligned
+;   esi = iStride   signed 32-bit row pitch in bytes
+;   edx = threshold compared against |[pPix] - [pPix-stride]| (presumably iAlpha)
+;   ecx = threshold compared against the other gradients     (presumably iBeta)
+;   r8  = pTC       four signed bytes; each one is broadcast to four
+;                   neighbouring 16-bit lanes
+;
+; Reads 16 bytes from each of the rows pPix-3*stride .. pPix+2*stride and
+; rewrites the rows pPix-2*stride .. pPix+stride.
+;
+; XMM registers are caller-saved in this ABI, so xmm6-xmm15 are used without
+; being spilled; only r12 and rbp are preserved.
+;********************************************************************************
+WELS_EXTERN DeblockLumaLt4V_sse2
+
+DeblockLumaLt4V_sse2:
+ push rbp
+ mov r11,r8 ; pTC
+ sub rsp,1B0h
+ lea rbp,[rsp+20h] ; rbp = base of the 16-byte aligned scratch area
+ movd xmm4,edx
+ movd xmm2,ecx
+ mov qword [rbp+180h],r12 ; r12 is callee-saved
+ mov r10,rdi
+ movsxd r12,esi ; r12 = stride
+ add rsi,rsi
+ movsxd rdx,esi ; rdx = 2*stride
+ sub r10,r12 ; r10 = pPix - stride
+ movsx r8d,byte [r11] ; pTC[0]
+ pxor xmm3,xmm3
+ punpcklwd xmm2,xmm2
+ lea rax,[r12+r12*2]
+ movdqa xmm14,[rdx+rdi] ; row pPix + 2*stride
+ neg rax ; rax = -3*stride
+ pshufd xmm0,xmm2,0 ; xmm0 = ecx threshold in all 8 words
+ movd xmm2,r8d
+ movsx rsi,byte [r11+1] ; pTC[1]
+ movsx r8d,byte [r11+2] ; pTC[2]
+ movsx r11d,byte [r11+3] ; pTC[3]
+ movd xmm1,esi
+ movd xmm12,r8d
+ movd xmm11,r11d
+ movdqa xmm5, [rax+rdi] ; row pPix - 3*stride
+ lea rax,[r12+r12]
+ punpcklwd xmm12,xmm12
+ neg rax ; rax = -2*stride
+ punpcklwd xmm11,xmm11
+ movdqa xmm8, [r10] ; row pPix - stride
+ punpcklwd xmm2,xmm2
+ punpcklwd xmm1,xmm1
+ punpcklqdq xmm12,xmm12
+ punpcklqdq xmm11,xmm11
+ punpcklqdq xmm2,xmm2
+ punpcklqdq xmm1,xmm1
+ shufps xmm12,xmm11,88h ; xmm12 = pTC[2] x4, pTC[3] x4 (words)
+ movdqa xmm11,xmm8
+ movdqa xmm9,[rdi] ; row pPix
+ shufps xmm2,xmm1,88h ; xmm2 = pTC[0] x4, pTC[1] x4 (words)
+ movdqa xmm1,xmm5
+ punpcklbw xmm11,xmm3
+ movdqa xmm13,xmm11
+ movdqa xmm10,xmm9
+ movdqa xmm6,[rax+rdi] ; row pPix - 2*stride
+ punpcklbw xmm1,xmm3
+ movaps [rbp+0A0h],xmm12 ; spill the upper-half tc vector (reloaded into xmm11 later)
+ psubw xmm13,xmm1
+ movdqa xmm15,xmm14
+ movdqa xmm7,xmm6
+ punpcklbw xmm10,xmm3
+ movdqa xmm12,[r12+rdi] ; row pPix + stride
+ punpcklbw xmm7,xmm3
+ punpcklbw xmm12,xmm3
+ punpcklbw xmm15,xmm3
+ pabsw xmm3,xmm13
+ movdqa xmm13,xmm10
+ psubw xmm13,xmm15
+ movdqa [rbp+0F0h],xmm15
+ pabsw xmm15,xmm13
+ movdqa xmm13,xmm11
+ movdqa [rbp+0B0h],xmm1
+ movdqa xmm1,xmm0
+ pavgw xmm13,xmm10
+ pcmpgtw xmm1,xmm3
+ movdqa [rbp+120h],xmm13
+ movaps xmm13,xmm2
+ punpcklwd xmm4,xmm4
+ movdqa xmm3,xmm0
+ movdqa [rbp+100h],xmm1
+ psubw xmm13,xmm1
+ movdqa xmm1,xmm10
+ pcmpgtw xmm3,xmm15
+ pshufd xmm4,xmm4,0 ; xmm4 = edx threshold in all 8 words
+ psubw xmm1,xmm11
+ movdqa [rbp+0D0h],xmm10
+ psubw xmm13,xmm3
+ movdqa [rbp+110h],xmm3
+ pabsw xmm15,xmm1
+ movdqa xmm3,xmm4
+ psubw xmm10,xmm12
+ pcmpgtw xmm3,xmm15
+ pabsw xmm15,xmm10
+ movdqa xmm10,xmm0
+ psllw xmm1,2
+ movdqa [rbp+0C0h],xmm11
+ psubw xmm11,xmm7
+ pcmpgtw xmm10,xmm15
+ pabsw xmm11,xmm11
+ movdqa xmm15,xmm0
+ pand xmm3,xmm10
+ pcmpgtw xmm15,xmm11
+ movaps xmm11,xmm2
+ pxor xmm10,xmm10
+ pand xmm3,xmm15
+ pcmpgtw xmm11,xmm10
+ pcmpeqw xmm10,xmm2
+ por xmm11,xmm10 ; lanes where tc >= 0
+ pand xmm3,xmm11
+ movdqa xmm11,xmm7
+ psubw xmm11,xmm12
+ pxor xmm15,xmm15
+ paddw xmm11,xmm1
+ psubw xmm15,xmm13
+ movdqa [rbp+0E0h],xmm12
+ paddw xmm11,[FOUR_16B_SSE2]
+ pxor xmm12,xmm12
+ psraw xmm11,3
+ punpckhbw xmm8,xmm12 ; from here on: upper 8 pixels of each row
+ pmaxsw xmm15,xmm11
+ punpckhbw xmm5,xmm12
+ movdqa xmm11,xmm8
+ pminsw xmm13,xmm15
+ psubw xmm11,xmm5
+ punpckhbw xmm9,xmm12
+ pand xmm13,xmm3
+ movdqa [rbp+130h],xmm13
+ pabsw xmm13,xmm11
+ punpckhbw xmm14,xmm12
+ movdqa xmm11,xmm9
+ psubw xmm11,xmm14
+ movdqa xmm15,xmm0
+ movdqa [rbp+140h],xmm14
+ pabsw xmm14,xmm11
+ movdqa xmm11,xmm8
+ pcmpgtw xmm15,xmm14
+ movdqa xmm1,[r12+rdi]
+ pavgw xmm11,xmm9
+ movdqa [rbp+170h],xmm11
+ movdqa xmm10,xmm9
+ punpckhbw xmm6,xmm12
+ psubw xmm10,xmm8
+ punpckhbw xmm1,xmm12
+ movdqa xmm12,xmm0
+ movaps xmm11,[rbp+0A0h] ; reload the upper-half tc vector
+ pcmpgtw xmm12,xmm13
+ movaps xmm13,xmm11
+ psubw xmm13,xmm12
+ movdqa [rbp+160h],xmm15
+ psubw xmm13,xmm15
+ movdqa xmm15,xmm9
+ psubw xmm15,xmm1
+ movdqa [rbp+150h],xmm12
+ pabsw xmm12,xmm10
+ pabsw xmm14,xmm15
+ movdqa xmm15,xmm8
+ pcmpgtw xmm4,xmm12
+ movdqa xmm12,xmm0
+ psubw xmm15,xmm6
+ pcmpgtw xmm12,xmm14
+ pabsw xmm14,xmm15
+ psllw xmm10,2
+ pcmpgtw xmm0,xmm14
+ movdqa xmm14,xmm6
+ psubw xmm14,xmm1
+ pand xmm4,xmm12
+ paddw xmm14,xmm10
+ pand xmm4,xmm0
+ paddw xmm14,[FOUR_16B_SSE2]
+ pxor xmm15,xmm15
+ movaps xmm12,xmm11
+ psubw xmm15,xmm13
+ pxor xmm0,xmm0
+ psraw xmm14,3
+ pcmpgtw xmm12,xmm0
+ pcmpeqw xmm0,xmm11
+ pmaxsw xmm15,xmm14
+ por xmm12,xmm0
+ movdqa xmm0,[rbp+120h]
+ pminsw xmm13,xmm15
+ movdqa xmm15,[rbp+0B0h]
+ movdqa xmm10,xmm7
+ pand xmm4,xmm12
+ paddw xmm15,xmm0
+ pxor xmm12,xmm12
+ paddw xmm10,xmm7
+ movdqa xmm14,xmm12
+ psubw xmm15,xmm10
+ psubw xmm14,xmm2
+ psraw xmm15,1
+ pmaxsw xmm15,xmm14
+ movdqa xmm10,xmm6
+ pminsw xmm15,xmm2
+ paddw xmm10,xmm6
+ pand xmm15,xmm3
+ psubw xmm12,xmm11
+ pand xmm15,[rbp+100h]
+ pand xmm13,xmm4
+ paddw xmm7,xmm15
+ paddw xmm8,xmm13
+ movdqa xmm15,[rbp+170h]
+ psubw xmm9,xmm13
+ paddw xmm5,xmm15
+ psubw xmm5,xmm10
+ psraw xmm5,1
+ pmaxsw xmm5,xmm12
+ pminsw xmm5,xmm11
+ pand xmm5,xmm4
+ pand xmm5,[rbp+150h]
+ paddw xmm6,xmm5
+ movdqa xmm5,[rbp+0C0h]
+ packuswb xmm7,xmm6
+ movdqa xmm6,[rbp+130h]
+ paddw xmm5,xmm6
+ packuswb xmm5,xmm8
+ movdqa xmm8,[rbp+0D0h]
+ psubw xmm8,xmm6
+ movdqa xmm6,[rbp+0F0h]
+ paddw xmm6,xmm0
+ movdqa xmm0,[rbp+0E0h]
+ packuswb xmm8,xmm9
+ movdqa xmm9,xmm0
+ paddw xmm9,xmm0
+ psubw xmm6,xmm9
+ psraw xmm6,1
+ pmaxsw xmm14,xmm6
+ pminsw xmm2,xmm14
+ pand xmm2,xmm3
+ pand xmm2,[rbp+110h]
+ paddw xmm0,xmm2
+ movdqa xmm2,[rbp+140h]
+ paddw xmm2,xmm15
+ movdqa xmm15,xmm1
+ paddw xmm15,xmm1
+ psubw xmm2,xmm15
+ psraw xmm2,1
+ pmaxsw xmm12,xmm2
+ pminsw xmm11,xmm12
+ pand xmm11,xmm4
+ pand xmm11,[rbp+160h]
+ paddw xmm1,xmm11
+ movdqa [rax+rdi],xmm7 ; store row pPix - 2*stride
+ movdqa [r10],xmm5 ; store row pPix - stride
+ packuswb xmm0,xmm1
+ movdqa [rdi],xmm8 ; store row pPix
+ movdqa [r12+rdi],xmm0 ; store row pPix + stride
+ mov r12,qword [rbp+180h]
+ lea rsp,[rbp+190h] ; rbp+190h == rsp before "sub rsp,1B0h"
+ pop rbp
+ ret
+
+
+;********************************************************************************
+; DeblockLumaEq4V_sse2 -- UNIX64 (System V AMD64) variant
+;
+; Register arguments as consumed below:
+;   rdi = pPix      row pointer; rows are accessed with movdqa, so they must
+;                   be 16-byte aligned
+;   esi = iStride   signed 32-bit row pitch in bytes
+;   edx = threshold compared against |[pPix] - [pPix-stride]| (presumably iAlpha)
+;   ecx = threshold compared against the other gradients     (presumably iBeta)
+;
+; Reads 16 bytes from each of the rows pPix-4*stride .. pPix+3*stride and
+; rewrites the rows pPix-3*stride .. pPix+2*stride.
+;
+; The arguments are first moved into the registers the body was written for
+; (rcx, rdx, r8, r9).  XMM registers are caller-saved in this ABI, so
+; xmm6-xmm15 are neither saved nor restored; rbx and rbp are preserved.
+;********************************************************************************
+WELS_EXTERN DeblockLumaEq4V_sse2
+
+ALIGN 16
+DeblockLumaEq4V_sse2:
+ push rbx
+ push rbp
+ mov r8, rdx
+ mov r9, rcx
+ mov rcx, rdi
+ mov rdx, rsi
+ sub rsp,1D8h ; two pushes + 1D8h keep rsp 16-byte aligned for the movdqa spills
+ pxor xmm1,xmm1
+ movsxd r10,edx ; r10 = stride
+ mov rbp,rcx ; rbp = pPix
+ mov r11d,r8d
+ mov rdx,rcx
+ mov rdi,rbp
+ mov rbx,rbp
+ movdqa xmm5,[rbp] ; row pPix
+ punpcklbw xmm5,xmm1
+ movdqa xmm14,[r10+rbp] ; row pPix + stride
+ lea eax,[r10*4]
+ movsxd r8,eax
+ lea eax,[r10+r10*2]
+ movsxd rcx,eax ; rcx = 3*stride
+ lea eax,[r10+r10]
+ sub rdx,r8 ; rdx = pPix - 4*stride
+ punpcklbw xmm14,xmm1
+ movdqa [rsp+90h],xmm5
+ movdqa [rsp+30h],xmm14
+ movsxd rsi,eax ; rsi = 2*stride
+ movsx eax,r11w
+ sub rdi,rcx ; rdi = pPix - 3*stride
+ sub rbx,rsi ; rbx = pPix - 2*stride
+ mov r8,rbp
+ sub r8,r10 ; r8 = pPix - stride
+ movd xmm0,eax
+ movsx eax,r9w
+ movdqa xmm12,[rdi]
+ movdqa xmm6, [rsi+rbp]
+ movdqa xmm13,[rbx]
+ punpcklwd xmm0,xmm0
+ pshufd xmm11,xmm0,0 ; xmm11 = edx threshold in all 8 words
+ punpcklbw xmm13,xmm1
+ punpcklbw xmm6,xmm1
+ movdqa xmm8,[r8]
+ movd xmm0,eax
+ movdqa xmm10,xmm11
+ mov eax,2
+ punpcklbw xmm8,xmm1
+ punpcklbw xmm12,xmm1
+ cwde
+ punpcklwd xmm0,xmm0
+ psraw xmm10,2
+ movdqa xmm1,xmm8
+ movdqa [rsp+0F0h],xmm13
+ movdqa [rsp+0B0h],xmm8
+ pshufd xmm7,xmm0,0 ; xmm7 = ecx threshold in all 8 words
+ psubw xmm1,xmm13
+ movdqa xmm0,xmm5
+ movdqa xmm4,xmm7
+ movdqa xmm2,xmm7
+ psubw xmm0,xmm8
+ pabsw xmm3,xmm0
+ pabsw xmm0,xmm1
+ movdqa xmm1,xmm5
+ movdqa [rsp+40h],xmm7
+ movdqa [rsp+60h],xmm6
+ pcmpgtw xmm4,xmm0
+ psubw xmm1,xmm14
+ pabsw xmm0,xmm1
+ pcmpgtw xmm2,xmm0
+ pand xmm4,xmm2
+ movdqa xmm0,xmm11
+ pcmpgtw xmm0,xmm3
+ pand xmm4,xmm0
+ movd xmm0,eax
+ movdqa [rsp+20h],xmm4
+ punpcklwd xmm0,xmm0
+ pshufd xmm2,xmm0,0 ; xmm2 = 2 in all 8 words
+ paddw xmm10,xmm2 ; xmm10 = (edx threshold >> 2) + 2
+ movdqa [rsp+0A0h],xmm2
+ movdqa xmm15,xmm7
+ pxor xmm4,xmm4
+ movdqa xmm0,xmm8
+ psubw xmm0,xmm12
+ mov eax,4
+ pabsw xmm0,xmm0
+ movdqa xmm1,xmm10
+ cwde
+ pcmpgtw xmm15,xmm0
+ pcmpgtw xmm1,xmm3
+ movdqa xmm3,xmm7
+ movdqa xmm7,[rdx]
+ movdqa xmm0,xmm5
+ psubw xmm0,xmm6
+ pand xmm15,xmm1
+ punpcklbw xmm7,xmm4
+ movdqa xmm9,xmm15
+ pabsw xmm0,xmm0
+ psllw xmm7,1
+ pandn xmm9,xmm12
+ pcmpgtw xmm3,xmm0
+ paddw xmm7,xmm12
+ movd xmm0,eax
+ pand xmm3,xmm1
+ paddw xmm7,xmm12
+ punpcklwd xmm0,xmm0
+ paddw xmm7,xmm12
+ pshufd xmm1,xmm0,0 ; xmm1 = 4 in all 8 words
+ paddw xmm7,xmm13
+ movdqa xmm0,xmm3
+ pandn xmm0,xmm6
+ paddw xmm7,xmm8
+ movdqa [rsp+70h],xmm1
+ paddw xmm7,xmm5
+ movdqa [rsp+120h],xmm0
+ movdqa xmm0,[rcx+rbp] ; row pPix + 3*stride
+ punpcklbw xmm0,xmm4
+ paddw xmm7,xmm1
+ movdqa xmm4,xmm15
+ psllw xmm0,1
+ psraw xmm7,3
+ paddw xmm0,xmm6
+ pand xmm7,xmm15
+ paddw xmm0,xmm6
+ paddw xmm0,xmm6
+ paddw xmm0,xmm14
+ movdqa xmm6,xmm15
+ paddw xmm0,xmm5
+ pandn xmm6,xmm13
+ paddw xmm0,xmm8
+ paddw xmm0,xmm1
+ psraw xmm0,3
+ movdqa xmm1,xmm12
+ paddw xmm1,xmm13
+ pand xmm0,xmm3
+ movdqa [rsp+100h],xmm0
+ movdqa xmm0,xmm8
+ paddw xmm0,xmm5
+ paddw xmm1,xmm0
+ movdqa xmm0,xmm3
+ paddw xmm1,xmm2
+ psraw xmm1,2
+ pandn xmm0,xmm14
+ pand xmm4,xmm1
+ movdqa [rsp+0E0h],xmm0
+ movdqa xmm0,xmm5
+ paddw xmm0,xmm8
+ movdqa xmm1,[rsp+60h]
+ paddw xmm1,xmm14
+ movdqa xmm14,xmm3
+ paddw xmm1,xmm0
+ movdqa xmm0,xmm8
+ paddw xmm0,[rsp+30h]
+ paddw xmm1,xmm2
+ psraw xmm1,2
+ pand xmm14,xmm1
+ movdqa xmm1,xmm13
+ paddw xmm1,xmm13
+ paddw xmm1,xmm0
+ paddw xmm1,xmm2
+ psraw xmm1,2
+ movdqa xmm0,[rsp+30h]
+ movdqa xmm2,xmm13
+ movdqa xmm5,xmm15
+ paddw xmm0,[rsp+70h]
+ pandn xmm5,xmm1
+ paddw xmm2,xmm8
+ movdqa xmm8,[rsp+90h]
+ movdqa xmm1,xmm12
+ paddw xmm2,xmm8
+ psllw xmm2,1
+ paddw xmm2,xmm0
+ paddw xmm1,xmm2
+ movdqa xmm0,xmm8
+ movdqa xmm8,xmm3
+ movdqa xmm2,[rsp+30h]
+ paddw xmm0,xmm13
+ psraw xmm1,3
+ pand xmm15,xmm1
+ movdqa xmm1,xmm2
+ paddw xmm1,xmm2
+ paddw xmm2,[rsp+90h]
+ paddw xmm2,[rsp+0B0h]
+ paddw xmm1,xmm0
+ movdqa xmm0,xmm13
+ movdqa xmm13,[r8]
+ paddw xmm0, [rsp+70h]
+ paddw xmm1, [rsp+0A0h]
+ psllw xmm2,1
+ paddw xmm2,xmm0
+ psraw xmm1,2
+ movdqa xmm0, [rdi]
+ pandn xmm8,xmm1
+ movdqa xmm1, [rsp+60h]
+ paddw xmm1,xmm2
+ movdqa xmm2, [rbx]
+ psraw xmm1,3
+ pand xmm3,xmm1
+ movdqa xmm1, [rbp]
+ movdqa [rsp+0D0h],xmm3
+ pxor xmm3,xmm3
+ punpckhbw xmm0,xmm3 ; from here on: upper 8 pixels of each row
+ punpckhbw xmm1,xmm3
+ punpckhbw xmm13,xmm3
+ movdqa [rsp+0C0h],xmm0
+ movdqa xmm0,[r10+rbp]
+ movdqa [rsp],xmm1
+ punpckhbw xmm0,xmm3
+ punpckhbw xmm2,xmm3
+ movdqa [rsp+80h],xmm0
+ movdqa xmm0,[rsi+rbp]
+ movdqa [rsp+10h],xmm13
+ punpckhbw xmm0,xmm3
+ movdqa [rsp+50h],xmm0
+ movdqa xmm0,xmm1
+ movdqa xmm1,xmm13
+ psubw xmm0,xmm13
+ psubw xmm1,xmm2
+ pabsw xmm3,xmm0
+ pabsw xmm0,xmm1
+ movdqa xmm1,[rsp]
+ movdqa xmm13,[rsp+40h]
+ movdqa [rsp+110h],xmm2
+ psubw xmm1, [rsp+80h]
+ pcmpgtw xmm13,xmm0
+ pcmpgtw xmm11,xmm3
+ pabsw xmm0,xmm1
+ pcmpgtw xmm10,xmm3
+ movdqa xmm1, [rsp+40h]
+ movdqa xmm2,xmm1
+ movdqa xmm3,xmm1
+ pcmpgtw xmm2,xmm0
+ movdqa xmm0, [rsp+10h]
+ pand xmm13,xmm2
+ pand xmm13,xmm11
+ movdqa xmm11,[rsp+0C0h]
+ psubw xmm0,xmm11
+ pabsw xmm0,xmm0
+ pcmpgtw xmm3,xmm0
+ pand xmm3,xmm10
+ movdqa xmm0,[rsp]
+ psubw xmm0,[rsp+50h]
+ movdqa xmm2,[rdx]
+ pabsw xmm0,xmm0
+ por xmm7,xmm9
+ movdqa xmm9,[rsp+20h]
+ pcmpgtw xmm1,xmm0
+ pand xmm9,xmm7
+ movdqa xmm7,[rsp+20h]
+ movdqa xmm0,xmm7
+ pandn xmm0,xmm12
+ movdqa xmm12,[rsp+110h]
+ pand xmm1,xmm10
+ movdqa xmm10,[rsp+70h]
+ movdqa [rsp+40h],xmm1
+ movdqa xmm1,xmm13
+ por xmm9,xmm0
+ pxor xmm0,xmm0
+ por xmm4,xmm6
+ movdqa xmm6,xmm7
+ punpckhbw xmm2,xmm0
+ por xmm15,xmm5
+ movdqa xmm5,[rsp+20h]
+ movdqa xmm0,xmm3
+ psllw xmm2,1
+ pandn xmm0,xmm11
+ pand xmm6,xmm4
+ movdqa xmm4,[rsp]
+ paddw xmm2,xmm11
+ pand xmm5,xmm15
+ movdqa xmm15,[rsp+20h]
+ paddw xmm2,xmm11
+ paddw xmm2,xmm11
+ paddw xmm2,xmm12
+ paddw xmm2,[rsp+10h]
+ paddw xmm2,[rsp]
+ paddw xmm2,xmm10
+ psraw xmm2,3
+ pand xmm2,xmm3
+ por xmm2,xmm0
+ pand xmm1,xmm2
+ movdqa xmm0,xmm13
+ movdqa xmm2,xmm11
+ pandn xmm0,xmm11
+ paddw xmm2,xmm12
+ por xmm1,xmm0
+ packuswb xmm9,xmm1
+ movdqa xmm0,xmm7
+ movdqa xmm7,[rsp+0A0h]
+ pandn xmm0,[rsp+0F0h]
+ movdqa xmm1,xmm3
+ por xmm6,xmm0
+ movdqa xmm0,[rsp+10h]
+ paddw xmm0,xmm4
+ paddw xmm2,xmm0
+ paddw xmm2,xmm7
+ movdqa xmm0,xmm3
+ pandn xmm0,xmm12
+ psraw xmm2,2
+ pand xmm1,xmm2
+ por xmm1,xmm0
+ movdqa xmm2,xmm13
+ movdqa xmm0,xmm13
+ pand xmm2,xmm1
+ pandn xmm0,xmm12
+ movdqa xmm1,xmm12
+ paddw xmm1,[rsp+10h]
+ por xmm2,xmm0
+ movdqa xmm0,xmm15
+ pandn xmm0,[rsp+0B0h]
+ paddw xmm1,xmm4
+ packuswb xmm6,xmm2
+ movdqa xmm2,xmm3
+ psllw xmm1,1
+ por xmm5,xmm0
+ movdqa xmm0,[rsp+80h]
+ paddw xmm0,xmm10
+ paddw xmm1,xmm0
+ paddw xmm11,xmm1
+ psraw xmm11,3
+ movdqa xmm1,xmm12
+ pand xmm2,xmm11
+ paddw xmm1,xmm12
+ movdqa xmm11,[rsp+80h]
+ movdqa xmm0, [rsp+10h]
+ por xmm14,[rsp+0E0h]
+ paddw xmm0,xmm11
+ movdqa xmm4,xmm15
+ paddw xmm1,xmm0
+ movdqa xmm0,xmm13
+ paddw xmm1,xmm7
+ psraw xmm1,2
+ pandn xmm3,xmm1
+ por xmm2,xmm3
+ movdqa xmm1,xmm13
+ movdqa xmm3,[rsp+10h]
+ pandn xmm0,xmm3
+ pand xmm1,xmm2
+ movdqa xmm2,xmm11
+ paddw xmm2,[rsp]
+ por xmm1,xmm0
+ movdqa xmm0,[rsp+0D0h]
+ por xmm0,xmm8
+ paddw xmm2,xmm3
+ packuswb xmm5,xmm1
+ movdqa xmm8,[rsp+40h]
+ movdqa xmm1,[rsp+50h]
+ movdqa xmm3,xmm8
+ pand xmm4,xmm0
+ psllw xmm2,1
+ movdqa xmm0,xmm15
+ pandn xmm0,[rsp+90h]
+ por xmm4,xmm0
+ movdqa xmm0,xmm12
+ paddw xmm0,xmm10
+ paddw xmm2,xmm0
+ paddw xmm1,xmm2
+ movdqa xmm0,[rsp]
+ movdqa xmm2,xmm11
+ paddw xmm0,xmm12
+ movdqa xmm12,[rsp]
+ paddw xmm2,xmm11
+ paddw xmm2,xmm0
+ psraw xmm1,3
+ movdqa xmm0,xmm8
+ pand xmm3,xmm1
+ paddw xmm2,xmm7
+ movdqa xmm1,xmm13
+ psraw xmm2,2
+ pandn xmm0,xmm2
+ por xmm3,xmm0
+ movdqa xmm2,[rsp+50h]
+ movdqa xmm0,xmm13
+ pandn xmm0,xmm12
+ pand xmm1,xmm3
+ paddw xmm2,xmm11
+ movdqa xmm3,xmm15
+ por xmm1,xmm0
+ pand xmm3,xmm14
+ movdqa xmm14,[rsp+10h]
+ movdqa xmm0,xmm15
+ pandn xmm0,[rsp+30h]
+ packuswb xmm4,xmm1
+ movdqa xmm1,xmm8
+ por xmm3,xmm0
+ movdqa xmm0,xmm12
+ paddw xmm0,xmm14
+ paddw xmm2,xmm0
+ paddw xmm2,xmm7
+ movdqa xmm0,xmm8
+ pandn xmm0,xmm11
+ psraw xmm2,2
+ pand xmm1,xmm2
+ por xmm1,xmm0
+ movdqa xmm2,xmm13
+ movdqa xmm0,xmm13
+ pandn xmm0,xmm11
+ pand xmm2,xmm1
+ movdqa xmm1,xmm15
+ por xmm2,xmm0
+ packuswb xmm3,xmm2
+ movdqa xmm0,[rsp+100h]
+ por xmm0,[rsp+120h]
+ pand xmm1,xmm0
+ movdqa xmm2,[rcx+rbp]
+ movdqa xmm7,[rsp+50h]
+ pandn xmm15,[rsp+60h]
+ lea r11,[rsp+1D8h] ; r11 = rsp as it was before "sub rsp,1D8h"
+ pxor xmm0,xmm0
+ por xmm1,xmm15
+ movdqa [rdi],xmm9 ; store row pPix - 3*stride
+ punpckhbw xmm2,xmm0
+ psllw xmm2,1
+ paddw xmm2,xmm7
+ paddw xmm2,xmm7
+ movdqa [rbx],xmm6 ; store row pPix - 2*stride
+ paddw xmm2,xmm7
+ paddw xmm2,xmm11
+ paddw xmm2,xmm12
+ paddw xmm2,xmm14
+ paddw xmm2,xmm10
+ psraw xmm2,3
+ movdqa xmm0,xmm13
+ pand xmm2,xmm8
+ pandn xmm8,xmm7
+ pandn xmm13,xmm7
+ por xmm2,xmm8
+ movdqa [r8],xmm5 ; store row pPix - stride
+ pand xmm0,xmm2
+ por xmm0,xmm13
+ packuswb xmm1,xmm0
+ movdqa [rbp],xmm4 ; store row pPix
+ movdqa [r10+rbp],xmm3 ; store row pPix + stride
+ movdqa [rsi+rbp],xmm1 ; store row pPix + 2*stride
+ mov rsp,r11
+ pop rbp
+ pop rbx
+ ret
+
+;********************************************************************************
+; DeblockChromaLt4V_sse2 -- UNIX64 (System V AMD64) variant
+;
+; Register arguments as consumed below:
+;   rdi = first plane pointer  (pPixCb in the X86_32 prototype of the Eq4V sibling)
+;   rsi = second plane pointer (pPixCr, same caveat)
+;   edx = iStride   signed 32-bit row pitch in bytes
+;   ecx = threshold compared against |row(-stride) - row(0)| (presumably iAlpha)
+;   r8d = iBeta
+;   r9  = pTC       four signed bytes, each duplicated into two 16-bit lanes
+;
+; For each plane, reads 8 bytes from the rows -2*stride, -stride, 0 and +stride
+; and rewrites 8 bytes of the rows -stride and 0.  Both planes are packed into
+; one XMM register (first plane in the low quadword, second in the high one).
+;
+; XMM registers are caller-saved in this ABI, so xmm6-xmm15 are used without
+; save/restore; rbx and rbp are preserved.
+;********************************************************************************
+WELS_EXTERN DeblockChromaLt4V_sse2
+ALIGN 16
+DeblockChromaLt4V_sse2:
+ push rbx
+ push rbp
+ mov r10, rdx
+ mov r11, rcx
+ mov rcx, rdi ; rcx = first plane
+ mov rdx, rsi ; rdx = second plane
+ mov rsi, r10
+ mov r10, r9 ; r10 = pTC
+ mov rbp, r8 ; rbp = iBeta
+ mov r8, rsi ; r8 = stride
+ mov r9, r11 ; r9 = ecx threshold
+ sub rsp,0C8h ; two pushes + 0C8h keep rsp 16-byte aligned for movdqa
+ pxor xmm1,xmm1
+ mov rbx,rcx
+ movsxd r11,r8d ; r11 = stride
+ movsx ecx,byte [r10]
+ movsx r8d,byte [r10+2]
+ mov rdi,rdx
+ movq xmm2,[rbx]
+ movq xmm9,[r11+rbx]
+ movsx edx,byte [r10+1]
+ mov word [rsp+2],cx
+ mov word [rsp],cx
+ movsx eax,byte [r10+3]
+ mov word [rsp+6],dx
+ mov word [rsp+4],dx
+ movdqa xmm11,xmm1
+ mov word [rsp+0Eh],ax
+ mov word [rsp+0Ch],ax
+ lea eax,[r11+r11]
+ movsxd rcx,eax ; rcx = 2*stride
+ mov rax,rbx
+ mov rdx,rdi
+ sub rax,rcx
+ mov word [rsp+0Ah],r8w
+ mov word [rsp+8],r8w
+ movdqa xmm6,[rsp] ; words: tc0,tc0,tc1,tc1,tc2,tc2,tc3,tc3
+ movdqa xmm7,xmm6
+ movq xmm13, [rax]
+ mov rax,rdi
+ sub rax,rcx
+ mov rcx,rbx
+ pcmpgtw xmm7,xmm1 ; lanes where tc > 0
+ psubw xmm11,xmm6 ; xmm11 = -tc
+ sub rcx,r11 ; rcx = first plane - stride
+ sub rdx,r11 ; rdx = second plane - stride
+ movq xmm0,[rax]
+ movsx eax,r9w
+ movq xmm15,[rcx]
+ punpcklqdq xmm13,xmm0
+ movq xmm0, [rdx]
+ movdqa xmm4,xmm13
+ punpcklqdq xmm15,xmm0
+ movq xmm0, [rdi]
+ punpcklbw xmm4,xmm1
+ movdqa xmm12,xmm15
+ punpcklqdq xmm2,xmm0
+ movq xmm0, [r11+rdi]
+ punpcklbw xmm12,xmm1
+ movdqa xmm14,xmm2
+ punpcklqdq xmm9,xmm0
+ punpckhbw xmm2,xmm1
+ punpcklbw xmm14,xmm1
+ movd xmm0,eax
+ mov eax, ebp ; iBeta
+ punpckhbw xmm13,xmm1
+ punpckhbw xmm15,xmm1
+ movdqa xmm3,xmm9
+ movdqa [rsp+10h],xmm2
+ punpcklwd xmm0,xmm0
+ punpckhbw xmm9,xmm1
+ punpcklbw xmm3,xmm1
+ movdqa xmm1,xmm14
+ pshufd xmm10,xmm0,0 ; xmm10 = ecx threshold in all 8 words
+ movd xmm0,eax
+ mov eax,4
+ cwde
+ punpcklwd xmm0,xmm0
+ pshufd xmm8,xmm0,0 ; xmm8 = iBeta in all 8 words
+ movd xmm0,eax
+ punpcklwd xmm0,xmm0
+ pshufd xmm5,xmm0,0 ; xmm5 = 4 in all 8 words
+ psubw xmm1,xmm12
+ movdqa xmm2,xmm10
+ lea r11,[rsp+0C8h] ; r11 = rsp as it was before "sub rsp,0C8h"
+ psllw xmm1,2
+ movdqa xmm0,xmm4
+ psubw xmm4,xmm12
+ psubw xmm0,xmm3
+ psubw xmm3,xmm14
+ paddw xmm1,xmm0
+ paddw xmm1,xmm5
+ movdqa xmm0,xmm11
+ psraw xmm1,3
+ pmaxsw xmm0,xmm1
+ pminsw xmm6,xmm0 ; delta clamped to [-tc, tc]
+ movdqa xmm1,xmm8
+ movdqa xmm0,xmm12
+ psubw xmm0,xmm14
+ pabsw xmm0,xmm0
+ pcmpgtw xmm2,xmm0
+ pabsw xmm0,xmm4
+ pcmpgtw xmm1,xmm0
+ pabsw xmm0,xmm3
+ movdqa xmm3,[rsp]
+ pand xmm2,xmm1
+ movdqa xmm1,xmm8
+ pcmpgtw xmm1,xmm0
+ movdqa xmm0,xmm13
+ pand xmm2,xmm1
+ psubw xmm0,xmm9
+ psubw xmm13,xmm15
+ pand xmm2,xmm7
+ pand xmm6,xmm2
+ paddw xmm12,xmm6
+ psubw xmm14,xmm6
+ movdqa xmm2,[rsp+10h]
+ movdqa xmm1,xmm2
+ psubw xmm1,xmm15
+ psubw xmm9,xmm2
+ psllw xmm1,2
+ paddw xmm1,xmm0
+ paddw xmm1,xmm5
+ movdqa xmm0,xmm15
+ psubw xmm0,xmm2
+ psraw xmm1,3
+ pmaxsw xmm11,xmm1
+ pabsw xmm0,xmm0
+ movdqa xmm1,xmm8
+ pcmpgtw xmm10,xmm0
+ pabsw xmm0,xmm13
+ pminsw xmm3,xmm11
+ pcmpgtw xmm1,xmm0
+ pabsw xmm0,xmm9
+ pand xmm10,xmm1
+ pcmpgtw xmm8,xmm0
+ pand xmm10,xmm8
+ pand xmm10,xmm7
+ pand xmm3,xmm10
+ paddw xmm15,xmm3
+ psubw xmm2,xmm3
+ packuswb xmm12,xmm15
+ packuswb xmm14,xmm2
+ movq [rcx],xmm12 ; first plane, row -stride
+ movq [rbx],xmm14 ; first plane, row 0
+ psrldq xmm12,8
+ psrldq xmm14,8
+ movq [rdx],xmm12 ; second plane, row -stride
+ movq [rdi],xmm14 ; second plane, row 0
+ mov rsp,r11
+ pop rbp
+ pop rbx
+ ret
+
+;********************************************************************************
+; DeblockChromaEq4V_sse2 -- UNIX64 (System V AMD64) variant
+;
+; Register arguments as consumed below:
+;   rdi = pPixCb
+;   rsi = pPixCr
+;   edx = iStride   signed 32-bit row pitch in bytes
+;   ecx = iAlpha    (low 16 bits are broadcast)
+;   r8d = iBeta
+;
+; For each plane, reads 8 bytes from the rows -2*stride, -stride, 0 and +stride
+; and rewrites 8 bytes of the rows -stride and 0.  Cb occupies the low
+; quadword of each vector and Cr the high quadword.
+;
+; The routine keeps everything in registers, so no stack frame is reserved.
+; XMM registers are caller-saved in this ABI; rbx and rbp are preserved.
+;********************************************************************************
+WELS_EXTERN DeblockChromaEq4V_sse2
+ALIGN 16
+DeblockChromaEq4V_sse2:
+ push rbx
+ push rbp
+
+ mov rbp, r8 ; rbp = iBeta
+ mov r8, rdx ; r8 = stride
+ mov r9, rcx ; r9 = iAlpha
+ mov rcx, rdi ; rcx = pPixCb
+ mov rdx, rsi ; rdx = pPixCr
+
+ pxor xmm1,xmm1
+ mov r11,rcx
+ mov rbx,rdx
+ mov r10d,r9d
+ movq xmm13,[r11]
+ lea eax,[r8+r8]
+ movsxd r9,eax ; r9 = 2*stride
+ mov rax,rcx
+ sub rax,r9
+ movq xmm14,[rax]
+ mov rax,rdx
+ sub rax,r9
+ movq xmm0,[rax]
+ movsxd rax,r8d ; rax = stride
+ sub rcx,rax ; rcx = pPixCb - stride
+ sub rdx,rax ; rdx = pPixCr - stride
+ movq xmm12,[rax+r11]
+ movq xmm10,[rcx]
+ punpcklqdq xmm14,xmm0
+ movdqa xmm8,xmm14
+ movq xmm0,[rdx]
+ punpcklbw xmm8,xmm1
+ punpckhbw xmm14,xmm1
+ punpcklqdq xmm10,xmm0
+ movq xmm0,[rbx]
+ movdqa xmm5,xmm10
+ punpcklqdq xmm13,xmm0
+ movq xmm0, [rax+rbx]
+ punpcklbw xmm5,xmm1
+ movsx eax,r10w
+ movdqa xmm9,xmm13
+ punpcklqdq xmm12,xmm0
+ punpcklbw xmm9,xmm1
+ punpckhbw xmm10,xmm1
+ movd xmm0,eax
+ mov eax, ebp ; iBeta
+ punpckhbw xmm13,xmm1
+ movdqa xmm7,xmm12
+ punpcklwd xmm0,xmm0
+ punpckhbw xmm12,xmm1
+ pshufd xmm11,xmm0,0 ; xmm11 = iAlpha in all 8 words
+ punpcklbw xmm7,xmm1
+ movd xmm0,eax
+ movdqa xmm1,xmm8
+ psubw xmm1,xmm5
+ punpcklwd xmm0,xmm0
+ movdqa xmm6,xmm11
+ pshufd xmm3,xmm0,0 ; xmm3 = iBeta in all 8 words
+ movdqa xmm0,xmm5
+ psubw xmm0,xmm9
+ movdqa xmm2,xmm3
+ pabsw xmm0,xmm0
+ pcmpgtw xmm6,xmm0
+ pabsw xmm0,xmm1
+ movdqa xmm1,xmm3
+ pcmpgtw xmm2,xmm0
+ pand xmm6,xmm2
+ movdqa xmm0,xmm7
+ movdqa xmm2,xmm3
+ psubw xmm0,xmm9
+ pabsw xmm0,xmm0
+ pcmpgtw xmm1,xmm0
+ pand xmm6,xmm1 ; xmm6 = filter mask for the Cb half
+ movdqa xmm0,xmm10
+ movdqa xmm1,xmm14
+ psubw xmm0,xmm13
+ psubw xmm1,xmm10
+ pabsw xmm0,xmm0
+ pcmpgtw xmm11,xmm0
+ pabsw xmm0,xmm1
+ pcmpgtw xmm2,xmm0
+ pand xmm11,xmm2
+ movdqa xmm0,xmm12
+ movdqa xmm4,xmm6
+ movdqa xmm1,xmm8
+ mov eax,2
+ cwde
+ paddw xmm1,xmm8
+ psubw xmm0,xmm13
+ paddw xmm1,xmm5
+ pabsw xmm0,xmm0
+ movdqa xmm2,xmm14
+ paddw xmm1,xmm7
+ pcmpgtw xmm3,xmm0
+ paddw xmm2,xmm14
+ movd xmm0,eax
+ pand xmm11,xmm3 ; xmm11 = filter mask for the Cr half
+ paddw xmm7,xmm7
+ paddw xmm2,xmm10
+ punpcklwd xmm0,xmm0
+ paddw xmm2,xmm12
+ paddw xmm12,xmm12
+ pshufd xmm3,xmm0,0 ; xmm3 = 2 in all 8 words (rounding term)
+ paddw xmm7,xmm9
+ paddw xmm12,xmm13
+ movdqa xmm0,xmm6
+ paddw xmm1,xmm3
+ pandn xmm0,xmm5
+ paddw xmm7,xmm8
+ psraw xmm1,2
+ paddw xmm12,xmm14
+ paddw xmm7,xmm3
+ pand xmm4,xmm1
+ paddw xmm12,xmm3
+ psraw xmm7,2
+ movdqa xmm1,xmm11
+ por xmm4,xmm0
+ psraw xmm12,2
+ paddw xmm2,xmm3
+ movdqa xmm0,xmm11
+ pandn xmm0,xmm10
+ psraw xmm2,2
+ pand xmm1,xmm2
+ por xmm1,xmm0
+ packuswb xmm4,xmm1
+ movdqa xmm0,xmm11
+ movdqa xmm1,xmm6
+ pand xmm1,xmm7
+ movq [rcx],xmm4 ; Cb, row -stride
+ pandn xmm6,xmm9
+ pandn xmm11,xmm13
+ pand xmm0,xmm12
+ por xmm1,xmm6
+ por xmm0,xmm11
+ psrldq xmm4,8
+ packuswb xmm1,xmm0
+ movq [r11],xmm1 ; Cb, row 0
+ psrldq xmm1,8
+ movq [rdx],xmm4 ; Cr, row -stride
+ movq [rbx],xmm1 ; Cr, row 0
+ pop rbp
+ pop rbx
+ ret
+
+
+;********************************************************************************
+; DeblockChromaEq4H_sse2 -- UNIX64 (System V AMD64) variant
+;
+; Register arguments as consumed below:
+;   rdi = first plane pointer  (pPixCb in the X86_32 prototype of the Eq4V sibling)
+;   rsi = second plane pointer (pPixCr, same caveat)
+;   edx = iStride   signed 32-bit row pitch in bytes
+;   ecx = threshold whose low 16 bits are broadcast into xmm13 (presumably iAlpha)
+;   r8d = iBeta
+;
+; For each plane, loads the 4 bytes starting at (row - 2) of eight consecutive
+; rows, transposes them so that every column becomes one vector, filters the
+; two middle columns and writes the 4-byte groups back to the same addresses.
+;
+; rbx, rbp and r12 are preserved.  r12 is not used by the body; its push keeps
+; rsp 16-byte aligned after "sub rsp,140h", which the movdqa accesses to the
+; scratch area require.
+;********************************************************************************
+WELS_EXTERN DeblockChromaEq4H_sse2
+ALIGN 16
+DeblockChromaEq4H_sse2:
+ push rbx
+ push rbp
+ push r12
+
+ mov rbp, r8 ; rbp = iBeta
+ mov r8, rdx ; r8 = stride
+ mov r9, rcx ; r9 = ecx threshold
+ mov rcx, rdi ; rcx = first plane
+ mov rdx, rsi
+ mov rdi, rdx ; rdi = second plane
+
+ sub rsp,140h
+ lea eax,[r8*4]
+ movsxd r10,eax ; r10 = 4*stride
+ mov eax,[rcx-2]
+ mov [rsp+10h],eax
+ lea rbx,[r10+rdx-2] ; rbx = second plane, row 4, two bytes left
+ lea r11,[r10+rcx-2] ; r11 = first plane, row 4, two bytes left
+
+ movdqa xmm5,[rsp+10h]
+ movsxd r10,r8d ; r10 = stride
+ mov eax,[r10+rcx-2]
+ lea rdx,[r10+r10*2] ; rdx = 3*stride
+ mov [rsp+20h],eax
+ mov eax,[rcx+r10*2-2]
+ mov [rsp+30h],eax
+ mov eax,[rdx+rcx-2]
+ movdqa xmm2,[rsp+20h]
+ mov [rsp+40h],eax
+ mov eax, [rdi-2]
+ movdqa xmm4,[rsp+30h]
+ mov [rsp+50h],eax
+ mov eax,[r10+rdi-2]
+ movdqa xmm3,[rsp+40h]
+ mov [rsp+60h],eax
+ mov eax,[rdi+r10*2-2]
+ punpckldq xmm5,[rsp+50h]
+ mov [rsp+70h],eax
+ mov eax, [rdx+rdi-2]
+ punpckldq xmm2, [rsp+60h]
+ mov [rsp+80h],eax
+ mov eax,[r11]
+ punpckldq xmm4, [rsp+70h]
+ mov [rsp+50h],eax
+ mov eax,[rbx]
+ punpckldq xmm3,[rsp+80h]
+ mov [rsp+60h],eax
+ mov eax,[r10+r11]
+ movdqa xmm0, [rsp+50h]
+ punpckldq xmm0, [rsp+60h]
+ punpcklqdq xmm5,xmm0
+ movdqa [rsp+50h],xmm0
+ mov [rsp+50h],eax
+ mov eax,[r10+rbx]
+ movdqa xmm0,[rsp+50h]
+ movdqa xmm1,xmm5
+ mov [rsp+60h],eax
+ mov eax,[r11+r10*2]
+ punpckldq xmm0, [rsp+60h]
+ punpcklqdq xmm2,xmm0
+ punpcklbw xmm1,xmm2
+ punpckhbw xmm5,xmm2
+ movdqa [rsp+50h],xmm0
+ mov [rsp+50h],eax
+ mov eax,[rbx+r10*2]
+ movdqa xmm0,[rsp+50h]
+ mov [rsp+60h],eax
+ mov eax, [rdx+r11]
+ movdqa xmm15,xmm1
+ punpckldq xmm0,[rsp+60h]
+ punpcklqdq xmm4,xmm0
+ movdqa [rsp+50h],xmm0
+ mov [rsp+50h],eax
+ mov eax, [rdx+rbx]
+ movdqa xmm0,[rsp+50h]
+ mov [rsp+60h],eax
+ punpckldq xmm0, [rsp+60h]
+ punpcklqdq xmm3,xmm0
+ movdqa xmm0,xmm4
+ punpcklbw xmm0,xmm3
+ punpckhbw xmm4,xmm3
+ punpcklwd xmm15,xmm0
+ punpckhwd xmm1,xmm0
+ movdqa xmm0,xmm5
+ movdqa xmm12,xmm15
+ punpcklwd xmm0,xmm4
+ punpckhwd xmm5,xmm4
+ punpckldq xmm12,xmm0
+ punpckhdq xmm15,xmm0
+ movdqa xmm0,xmm1
+ movdqa xmm11,xmm12
+ punpckldq xmm0,xmm5
+ punpckhdq xmm1,xmm5
+ punpcklqdq xmm11,xmm0 ; xmm11 = column 0 of both planes
+ punpckhqdq xmm12,xmm0 ; xmm12 = column 1
+ movsx eax,r9w
+ movdqa xmm14,xmm15
+ punpcklqdq xmm14,xmm1 ; xmm14 = column 2
+ punpckhqdq xmm15,xmm1 ; xmm15 = column 3
+ pxor xmm1,xmm1
+ movd xmm0,eax
+ movdqa xmm4,xmm12
+ movdqa xmm8,xmm11
+ mov eax, ebp ; iBeta
+ punpcklwd xmm0,xmm0
+ punpcklbw xmm4,xmm1
+ punpckhbw xmm12,xmm1
+ movdqa xmm9,xmm14
+ movdqa xmm7,xmm15
+ movdqa xmm10,xmm15
+ pshufd xmm13,xmm0,0 ; xmm13 = ecx threshold in all 8 words
+ punpcklbw xmm9,xmm1
+ punpckhbw xmm14,xmm1
+ movdqa xmm6,xmm13
+ movd xmm0,eax
+ movdqa [rsp],xmm11 ; keep the untouched column 0 for the write-back
+ mov eax,2
+ cwde
+ punpckhbw xmm11,xmm1
+ punpckhbw xmm10,xmm1
+ punpcklbw xmm7,xmm1
+ punpcklwd xmm0,xmm0
+ punpcklbw xmm8,xmm1
+ pshufd xmm3,xmm0,0 ; xmm3 = iBeta in all 8 words
+ movdqa xmm1,xmm8
+ movdqa xmm0,xmm4
+ psubw xmm0,xmm9
+ psubw xmm1,xmm4
+ movdqa xmm2,xmm3
+ pabsw xmm0,xmm0
+ pcmpgtw xmm6,xmm0
+ pabsw xmm0,xmm1
+ movdqa xmm1,xmm3
+ pcmpgtw xmm2,xmm0
+ pand xmm6,xmm2
+ movdqa xmm0,xmm7
+ movdqa xmm2,xmm3
+ psubw xmm0,xmm9
+ pabsw xmm0,xmm0
+ pcmpgtw xmm1,xmm0
+ pand xmm6,xmm1
+ movdqa xmm0,xmm12
+ movdqa xmm1,xmm11
+ psubw xmm0,xmm14
+ psubw xmm1,xmm12
+ movdqa xmm5,xmm6
+ pabsw xmm0,xmm0
+ pcmpgtw xmm13,xmm0
+ pabsw xmm0,xmm1
+ movdqa xmm1,xmm8
+ pcmpgtw xmm2,xmm0
+ paddw xmm1,xmm8
+ movdqa xmm0,xmm10
+ pand xmm13,xmm2
+ psubw xmm0,xmm14
+ paddw xmm1,xmm4
+ movdqa xmm2,xmm11
+ pabsw xmm0,xmm0
+ paddw xmm2,xmm11
+ paddw xmm1,xmm7
+ pcmpgtw xmm3,xmm0
+ paddw xmm2,xmm12
+ movd xmm0,eax
+ pand xmm13,xmm3
+ paddw xmm2,xmm10
+ punpcklwd xmm0,xmm0
+ pshufd xmm3,xmm0,0 ; xmm3 = 2 in all 8 words (rounding term)
+ movdqa xmm0,xmm6
+ paddw xmm1,xmm3
+ pandn xmm0,xmm4
+ paddw xmm2,xmm3
+ psraw xmm1,2
+ pand xmm5,xmm1
+ por xmm5,xmm0
+ paddw xmm7,xmm7
+ paddw xmm10,xmm10
+ psraw xmm2,2
+ movdqa xmm1,xmm13
+ movdqa xmm0,xmm13
+ pandn xmm0,xmm12
+ pand xmm1,xmm2
+ paddw xmm7,xmm9
+ por xmm1,xmm0
+ paddw xmm10,xmm14
+ paddw xmm7,xmm8
+ movdqa xmm0,xmm13
+ packuswb xmm5,xmm1
+ paddw xmm7,xmm3
+ paddw xmm10,xmm11
+ movdqa xmm1,xmm6
+ paddw xmm10,xmm3
+ pandn xmm6,xmm9
+ psraw xmm7,2
+ pand xmm1,xmm7
+ psraw xmm10,2
+ pandn xmm13,xmm14
+ pand xmm0,xmm10
+ por xmm1,xmm6
+ movdqa xmm6,[rsp]
+ movdqa xmm4,xmm6
+ por xmm0,xmm13
+ punpcklbw xmm4,xmm5
+ punpckhbw xmm6,xmm5
+ movdqa xmm3,xmm4
+ packuswb xmm1,xmm0
+ movdqa xmm0,xmm1
+ punpckhbw xmm1,xmm15
+ punpcklbw xmm0,xmm15
+ punpcklwd xmm3,xmm0
+ punpckhwd xmm4,xmm0
+ movdqa xmm0,xmm6
+ movdqa xmm2,xmm3
+ punpcklwd xmm0,xmm1
+ punpckhwd xmm6,xmm1
+ movdqa xmm1,xmm4
+ punpckldq xmm2,xmm0
+ punpckhdq xmm3,xmm0
+ punpckldq xmm1,xmm6
+ movdqa xmm0,xmm2
+ punpcklqdq xmm0,xmm1
+ punpckhdq xmm4,xmm6
+ punpckhqdq xmm2,xmm1
+ movdqa [rsp+10h],xmm0
+ movdqa [rsp+60h],xmm2
+ movdqa xmm0,xmm3
+ mov eax,[rsp+10h]
+ mov [rcx-2],eax
+ mov eax,[rsp+60h]
+ punpcklqdq xmm0,xmm4
+ punpckhqdq xmm3,xmm4
+ mov [r10+rcx-2],eax
+ movdqa [rsp+20h],xmm0
+ mov eax, [rsp+20h]
+ movdqa [rsp+70h],xmm3
+ mov [rcx+r10*2-2],eax
+ mov eax,[rsp+70h]
+ mov [rdx+rcx-2],eax
+ mov eax,[rsp+18h]
+ mov [r11],eax
+ mov eax,[rsp+68h]
+ mov [r10+r11],eax
+ mov eax,[rsp+28h]
+ mov [r11+r10*2],eax
+ mov eax,[rsp+78h]
+ mov [rdx+r11],eax
+ mov eax,[rsp+14h]
+ mov [rdi-2],eax
+ mov eax,[rsp+64h]
+ mov [r10+rdi-2],eax
+ mov eax,[rsp+24h]
+ mov [rdi+r10*2-2],eax
+ mov eax, [rsp+74h]
+ mov [rdx+rdi-2],eax
+ mov eax, [rsp+1Ch]
+ mov [rbx],eax
+ mov eax, [rsp+6Ch]
+ mov [r10+rbx],eax
+ mov eax,[rsp+2Ch]
+ mov [rbx+r10*2],eax
+ mov eax,[rsp+7Ch]
+ mov [rdx+rbx],eax
+ lea r11,[rsp+140h] ; r11 = rsp as it was before "sub rsp,140h"
+ mov rsp,r11
+ pop r12
+ pop rbp
+ pop rbx
+ ret
+
+
+WELS_EXTERN DeblockChromaLt4H_sse2
+ALIGN 16
+DeblockChromaLt4H_sse2: ; UNIX64 variant: args arrive in rdi, rsi, edx, ecx, r8d, r9 and are moved into the registers the body expects
+ mov rax,rsp ; NOTE(review): rax is overwritten by "lea eax,[r8*4]" below before it is ever read
+ push rbx
+ push rbp
+ push r12
+ push r13
+ push r14
+ sub rsp,170h ; five pushes + 170h keep rsp 16-byte aligned for the movdqa scratch accesses (given an ABI-aligned entry)
+
+ mov r13, r8 ; r13 = 5th argument (used as iBeta below)
+ mov r14, r9 ; r14 = 6th argument (pTC)
+ mov r8, rdx ; r8 = 3rd argument (row stride)
+ mov r9, rcx ; r9 = 4th argument (threshold broadcast into xmm12 below; presumably iAlpha)
+ mov rdx, rdi ; rdx = 1st plane pointer
+ mov rcx, rsi ; rcx = 2nd plane pointer
+
+ movsxd rsi,r8d ; rsi = sign-extended stride
+ lea eax,[r8*4]
+ mov r11d,r9d
+ movsxd r10,eax ; r10 = 4*stride
+ mov eax, [rcx-2] ; each row contributes the 4 bytes starting 2 bytes left of the plane pointer
+ mov r12,rdx
+ mov [rsp+40h],eax
+ mov eax, [rsi+rcx-2]
+ lea rbx,[r10+rcx-2] ; rbx = rcx plane, row 4, two bytes left
+ movdqa xmm5,[rsp+40h]
+ mov [rsp+50h],eax
+ mov eax, [rcx+rsi*2-2]
+ lea rbp,[r10+rdx-2] ; rbp = rdx plane, row 4, two bytes left
+ movdqa xmm2, [rsp+50h]
+ mov [rsp+60h],eax
+ lea r10,[rsi+rsi*2] ; r10 = 3*stride from here on
+ mov rdi,rcx
+ mov eax,[r10+rcx-2]
+ movdqa xmm4,[rsp+60h]
+ mov [rsp+70h],eax
+ mov eax,[rdx-2]
+ mov [rsp+80h],eax
+ mov eax, [rsi+rdx-2]
+ movdqa xmm3,[rsp+70h]
+ mov [rsp+90h],eax
+ mov eax,[rdx+rsi*2-2]
+ punpckldq xmm5,[rsp+80h]
+ mov [rsp+0A0h],eax
+ mov eax, [r10+rdx-2]
+ punpckldq xmm2,[rsp+90h]
+ mov [rsp+0B0h],eax
+ mov eax, [rbx]
+ punpckldq xmm4,[rsp+0A0h]
+ mov [rsp+80h],eax
+ mov eax,[rbp]
+ punpckldq xmm3,[rsp+0B0h]
+ mov [rsp+90h],eax
+ mov eax,[rsi+rbx]
+ movdqa xmm0,[rsp+80h]
+ punpckldq xmm0,[rsp+90h]
+ punpcklqdq xmm5,xmm0 ; xmm5 = rows 0 and 4 of both planes
+ movdqa [rsp+80h],xmm0
+ mov [rsp+80h],eax
+ mov eax,[rsi+rbp]
+ movdqa xmm0,[rsp+80h]
+ movdqa xmm1,xmm5
+ mov [rsp+90h],eax
+ mov eax,[rbx+rsi*2]
+ punpckldq xmm0,[rsp+90h]
+ punpcklqdq xmm2,xmm0 ; xmm2 = rows 1 and 5 of both planes
+ punpcklbw xmm1,xmm2
+ punpckhbw xmm5,xmm2
+ movdqa [rsp+80h],xmm0
+ mov [rsp+80h],eax
+ mov eax,[rbp+rsi*2]
+ movdqa xmm0, [rsp+80h]
+ mov [rsp+90h],eax
+ mov eax,[r10+rbx]
+ movdqa xmm7,xmm1
+ punpckldq xmm0,[rsp+90h]
+ punpcklqdq xmm4,xmm0 ; xmm4 = rows 2 and 6 of both planes
+ movdqa [rsp+80h],xmm0
+ mov [rsp+80h],eax
+ mov eax, [r10+rbp]
+ movdqa xmm0,[rsp+80h]
+ mov [rsp+90h],eax
+ punpckldq xmm0,[rsp+90h]
+ punpcklqdq xmm3,xmm0 ; xmm3 = rows 3 and 7 of both planes
+ movdqa xmm0,xmm4
+ punpcklbw xmm0,xmm3
+ punpckhbw xmm4,xmm3
+ punpcklwd xmm7,xmm0
+ punpckhwd xmm1,xmm0
+ movdqa xmm0,xmm5
+ movdqa xmm6,xmm7
+ punpcklwd xmm0,xmm4
+ punpckhwd xmm5,xmm4
+ punpckldq xmm6,xmm0
+ punpckhdq xmm7,xmm0
+ movdqa xmm0,xmm1
+ punpckldq xmm0,xmm5
+ mov rax, r14 ; pTC
+ punpckhdq xmm1,xmm5
+ movdqa xmm9,xmm6
+ punpckhqdq xmm6,xmm0 ; xmm6 = byte column 1 of all 16 rows
+ punpcklqdq xmm9,xmm0 ; xmm9 = byte column 0 of all 16 rows
+ movdqa xmm2,xmm7
+ movdqa xmm13,xmm6
+ movdqa xmm4,xmm9
+ movdqa [rsp+10h],xmm9 ; column 0 is kept unmodified for the write-back
+ punpcklqdq xmm2,xmm1 ; xmm2 = byte column 2
+ punpckhqdq xmm7,xmm1 ; xmm7 = byte column 3
+ pxor xmm1,xmm1
+ movsx ecx,byte [rax+3]
+ movsx edx,byte [rax+2]
+ movsx r8d,byte [rax+1]
+ movsx r9d,byte [rax]
+ movdqa xmm10,xmm1
+ movdqa xmm15,xmm2
+ punpckhbw xmm2,xmm1
+ punpckhbw xmm6,xmm1
+ punpcklbw xmm4,xmm1
+ movsx eax,r11w ; r11d holds the 4th argument
+ mov word [rsp+0Eh],cx
+ mov word [rsp+0Ch],cx
+ movdqa xmm3,xmm7
+ movdqa xmm8,xmm7
+ movdqa [rsp+20h],xmm7 ; column 3 is kept unmodified for the write-back
+ punpcklbw xmm15,xmm1
+ punpcklbw xmm13,xmm1
+ punpcklbw xmm3,xmm1
+ mov word [rsp+0Ah],dx
+ mov word [rsp+8],dx
+ mov word [rsp+6],r8w
+ movd xmm0,eax
+ movdqa [rsp+30h],xmm6
+ punpckhbw xmm9,xmm1
+ punpckhbw xmm8,xmm1
+ punpcklwd xmm0,xmm0
+ mov eax, r13d ; iBeta
+ mov word [rsp+4],r8w
+ mov word [rsp+2],r9w
+ pshufd xmm12,xmm0,0 ; xmm12 = 4th argument in all 8 words
+ mov word [rsp],r9w
+ movd xmm0,eax
+ mov eax,4
+ cwde
+ movdqa xmm14, [rsp] ; words: tc0,tc0,tc1,tc1,tc2,tc2,tc3,tc3
+ movdqa [rsp],xmm2 ; slot reused to spill the upper half of column 2
+ movdqa xmm2,xmm12
+ punpcklwd xmm0,xmm0
+ pshufd xmm11,xmm0,0 ; xmm11 = iBeta in all 8 words
+ psubw xmm10,xmm14 ; xmm10 = -tc
+ movd xmm0,eax
+ movdqa xmm7,xmm14
+ movdqa xmm6,xmm14
+ pcmpgtw xmm7,xmm1 ; lanes where tc > 0
+ punpcklwd xmm0,xmm0
+ pshufd xmm5,xmm0,0 ; xmm5 = 4 in all 8 words (rounding term)
+ movdqa xmm0,xmm4
+ movdqa xmm1,xmm15
+ psubw xmm4,xmm13
+ psubw xmm0,xmm3
+ psubw xmm1,xmm13
+ psubw xmm3,xmm15
+ psllw xmm1,2
+ paddw xmm1,xmm0
+ paddw xmm1,xmm5
+ movdqa xmm0,xmm10
+ psraw xmm1,3
+ pmaxsw xmm0,xmm1
+ pminsw xmm6,xmm0 ; delta clamped to [-tc, tc]
+ movdqa xmm1,xmm11
+ movdqa xmm0,xmm13
+ psubw xmm0,xmm15
+ pabsw xmm0,xmm0
+ pcmpgtw xmm2,xmm0
+ pabsw xmm0,xmm4
+ pcmpgtw xmm1,xmm0
+ pabsw xmm0,xmm3
+ pand xmm2,xmm1
+ movdqa xmm1,xmm11
+ movdqa xmm3,[rsp+30h]
+ pcmpgtw xmm1,xmm0
+ movdqa xmm0,xmm9
+ pand xmm2,xmm1
+ psubw xmm0,xmm8
+ psubw xmm9,xmm3
+ pand xmm2,xmm7
+ pand xmm6,xmm2
+ psubw xmm15,xmm6
+ paddw xmm13,xmm6
+ movdqa xmm2,[rsp]
+ movdqa xmm1,xmm2
+ psubw xmm1,xmm3
+ psubw xmm8,xmm2
+ psllw xmm1,2
+ paddw xmm1,xmm0
+ paddw xmm1,xmm5
+ movdqa xmm0,xmm3
+ movdqa xmm5,[rsp+10h]
+ psubw xmm0,xmm2
+ psraw xmm1,3
+ movdqa xmm4,xmm5
+ pabsw xmm0,xmm0
+ pmaxsw xmm10,xmm1
+ movdqa xmm1,xmm11
+ pcmpgtw xmm12,xmm0
+ pabsw xmm0,xmm9
+ pminsw xmm14,xmm10
+ pcmpgtw xmm1,xmm0
+ pabsw xmm0,xmm8
+ pcmpgtw xmm11,xmm0
+ pand xmm12,xmm1
+ movdqa xmm1,[rsp+20h]
+ pand xmm12,xmm11
+ pand xmm12,xmm7
+ pand xmm14,xmm12
+ paddw xmm3,xmm14
+ psubw xmm2,xmm14
+ packuswb xmm13,xmm3 ; filtered column 1
+ packuswb xmm15,xmm2 ; filtered column 2
+ punpcklbw xmm4,xmm13 ; transpose the four columns back into per-row 4-byte groups
+ punpckhbw xmm5,xmm13
+ movdqa xmm0,xmm15
+ punpcklbw xmm0,xmm1
+ punpckhbw xmm15,xmm1
+ movdqa xmm3,xmm4
+ punpcklwd xmm3,xmm0
+ punpckhwd xmm4,xmm0
+ movdqa xmm0,xmm5
+ movdqa xmm2,xmm3
+ movdqa xmm1,xmm4
+ punpcklwd xmm0,xmm15
+ punpckhwd xmm5,xmm15
+ punpckldq xmm2,xmm0
+ punpckhdq xmm3,xmm0
+ punpckldq xmm1,xmm5
+ movdqa xmm0,xmm2
+ punpcklqdq xmm0,xmm1
+ punpckhdq xmm4,xmm5
+ punpckhqdq xmm2,xmm1
+ movdqa [rsp+40h],xmm0
+ movdqa xmm0,xmm3
+ movdqa [rsp+90h],xmm2
+ mov eax,[rsp+40h]
+ mov [rdi-2],eax ; rdi = 2nd-argument plane, rows 0..3 follow
+ mov eax, [rsp+90h]
+ punpcklqdq xmm0,xmm4
+ punpckhqdq xmm3,xmm4
+ mov [rsi+rdi-2],eax
+ movdqa [rsp+50h],xmm0
+ mov eax,[rsp+50h]
+ movdqa [rsp+0A0h],xmm3
+ mov [rdi+rsi*2-2],eax
+ mov eax,[rsp+0A0h]
+ mov [r10+rdi-2],eax
+ mov eax,[rsp+48h]
+ mov [rbx],eax ; rbx = same plane, rows 4..7 follow
+ mov eax,[rsp+98h]
+ mov [rsi+rbx],eax
+ mov eax,[rsp+58h]
+ mov [rbx+rsi*2],eax
+ mov eax, [rsp+0A8h]
+ mov [r10+rbx],eax
+ mov eax, [rsp+44h]
+ mov [r12-2],eax ; r12 = 1st-argument plane, rows 0..3 follow
+ mov eax,[rsp+94h]
+ mov [rsi+r12-2],eax
+ mov eax,[rsp+54h]
+ mov [r12+rsi*2-2],eax
+ mov eax, [rsp+0A4h]
+ mov [r10+r12-2],eax
+ mov eax,[rsp+4Ch]
+ mov [rbp],eax ; rbp = same plane, rows 4..7 follow
+ mov eax,[rsp+9Ch]
+ mov [rsi+rbp],eax
+ mov eax, [rsp+5Ch]
+ mov [rbp+rsi*2],eax
+ mov eax,[rsp+0ACh]
+ mov [r10+rbp],eax
+ lea r11,[rsp+170h] ; r11 = rsp as it was before "sub rsp,170h"
+ mov rsp,r11
+ pop r14 ; pops mirror the pushes in reverse order
+ pop r13
+ pop r12
+ pop rbp
+ pop rbx
+ ret
+
+
+
+%elifdef X86_32
+
+;********************************************************************************
+; void DeblockChromaEq4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+; int32_t iAlpha, int32_t iBeta)
+;
+; X86_32 variant. Arguments are read from the stack relative to ebp:
+; [ebp+8]=pPixCb, [ebp+0Ch]=pPixCr, [ebp+10h]=iStride, [ebp+14h]=iAlpha,
+; [ebp+18h]=iBeta. Only the low 16 bits of iAlpha/iBeta are broadcast and used.
+;
+; For each plane 8 bytes are loaded from the rows at -2*iStride, -iStride, 0 and
+; +iStride relative to the pixel pointer; the comments below call these rows
+; p1, p0, q0 and q1. Cb sits in the low qword and Cr in the high qword of each
+; register. Per 16-bit lane:
+;   mask = (iAlpha > |p0-q0|) & (iBeta > |p1-p0|) & (iBeta > |q1-q0|)
+;   p0'  = mask ? (2*p1 + p0 + q1 + 2) >> 2 : p0
+;   q0'  = mask ? (2*q1 + q0 + p1 + 2) >> 2 : q0
+; p0' and q0' are packed with unsigned saturation and written back to the
+; -iStride row and the current row of both planes (8 bytes each).
+;
+; NOTE(review): pabsw is an SSSE3 instruction, so despite the _sse2 suffix this
+; routine needs SSSE3 at run time - confirm that the caller's CPU dispatch
+; checks for it.
+;********************************************************************************
+WELS_EXTERN DeblockChromaEq4V_sse2
+
+ALIGN 16
+DeblockChromaEq4V_sse2:
+; Prologue: 68h bytes of scratch plus the two 4-byte pushes below (70h total)
+; keep esp 16-byte aligned for the movdqa spill slots.
+ push ebp
+ mov ebp,esp
+ and esp,0FFFFFFF0h
+ sub esp,68h
+ mov edx,[ebp+10h] ; iStride
+ mov eax,[ebp+8] ; pPixCb
+ mov ecx,[ebp+0Ch] ; pPixCr
+; xmm4 = Cr q0 row, xmm5 = Cr q1 row (merged into the high qwords later)
+ movq xmm4,[ecx]
+ movq xmm5,[edx+ecx]
+ push esi
+ push edi
+; xmm1 = p1 rows: Cb (pPixCb - 2*iStride) low, Cr (pPixCr - 2*iStride) high
+ lea esi,[edx+edx]
+ mov edi,eax
+ sub edi,esi
+ movq xmm1,[edi]
+ mov edi,ecx
+ sub edi,esi
+ movq xmm2,[edi]
+ punpcklqdq xmm1,xmm2
+; xmm2 = p0 rows; esi/edi keep the Cb/Cr p0 row addresses for the final stores
+ mov esi,eax
+ sub esi,edx
+ movq xmm2,[esi]
+ mov edi,ecx
+ sub edi,edx
+ movq xmm3,[edi]
+ punpcklqdq xmm2,xmm3
+; xmm3 = q0 rows, xmm4 = q1 rows
+ movq xmm3,[eax]
+ punpcklqdq xmm3,xmm4
+ movq xmm4,[edx+eax]
+ mov edx, [ebp + 14h]
+ punpcklqdq xmm4,xmm5
+; xmm5 = iAlpha broadcast to 8 words, xmm6 = iBeta broadcast, xmm0 = 0
+ movd xmm5,edx
+ mov edx, [ebp + 18h]
+ pxor xmm0,xmm0
+ movdqa xmm6,xmm5
+ punpcklwd xmm6,xmm5
+ pshufd xmm5,xmm6,0
+ movd xmm6,edx
+ movdqa xmm7,xmm6
+ punpcklwd xmm7,xmm6
+ pshufd xmm6,xmm7,0
+; Widen bytes to words. Low (Cb) halves: p1->[esp+60h], p0->[esp+10h],
+; q0->[esp+50h], q1->xmm7. High (Cr) halves: p1->[esp+40h], p0->xmm2,
+; q0->[esp+30h], q1->[esp+20h].
+ movdqa xmm7,xmm1
+ punpckhbw xmm1,xmm0
+ punpcklbw xmm7,xmm0
+ movdqa [esp+40h],xmm1
+ movdqa [esp+60h],xmm7
+ movdqa xmm7,xmm2
+ punpcklbw xmm7,xmm0
+ movdqa [esp+10h],xmm7
+ movdqa xmm7,xmm3
+ punpcklbw xmm7,xmm0
+ punpckhbw xmm3,xmm0
+ movdqa [esp+50h],xmm7
+ movdqa xmm7,xmm4
+ punpckhbw xmm4,xmm0
+ punpckhbw xmm2,xmm0
+ punpcklbw xmm7,xmm0
+ movdqa [esp+30h],xmm3
+; Low-half mask -> xmm0: alpha > |p0-q0|, beta > |p1-p0|, beta > |q1-q0|
+ movdqa xmm3,[esp+10h]
+ movdqa xmm1,xmm3
+ psubw xmm1,[esp+50h]
+ pabsw xmm1,xmm1
+ movdqa [esp+20h],xmm4
+ movdqa xmm0,xmm5
+ pcmpgtw xmm0,xmm1
+ movdqa xmm1,[esp+60h]
+ psubw xmm1,xmm3
+ pabsw xmm1,xmm1
+ movdqa xmm4,xmm6
+ pcmpgtw xmm4,xmm1
+ pand xmm0,xmm4
+ movdqa xmm1,xmm7
+ psubw xmm1,[esp+50h]
+ pabsw xmm1,xmm1
+ movdqa xmm4,xmm6
+ pcmpgtw xmm4,xmm1
+; High-half mask -> xmm5 (consumes the alpha and beta registers)
+ movdqa xmm1,xmm2
+ psubw xmm1,[esp+30h]
+ pabsw xmm1,xmm1
+ pcmpgtw xmm5,xmm1
+ movdqa xmm1,[esp+40h]
+ pand xmm0,xmm4
+ psubw xmm1,xmm2
+ pabsw xmm1,xmm1
+ movdqa xmm4,xmm6
+ pcmpgtw xmm4,xmm1
+ movdqa xmm1,[esp+20h]
+ psubw xmm1,[esp+30h]
+ pand xmm5,xmm4
+ pabsw xmm1,xmm1
+ pcmpgtw xmm6,xmm1
+ pand xmm5,xmm6
+; Rounding constant 2 broadcast to 8 words, spilled to [esp+10h]
+; (low p0 is still live in xmm3)
+ mov edx,2
+ movsx edx,dx
+ movd xmm1,edx
+ movdqa xmm4,xmm1
+ punpcklwd xmm4,xmm1
+ pshufd xmm1,xmm4,0
+; p0' low = (2*p1 + p0 + q1 + 2) >> 2, selected by the mask -> xmm1
+ movdqa xmm4,[esp+60h]
+ movdqa xmm6,xmm4
+ paddw xmm6,xmm4
+ paddw xmm6,xmm3
+ paddw xmm6,xmm7
+ movdqa [esp+10h],xmm1
+ paddw xmm6,[esp+10h]
+ psraw xmm6,2
+ movdqa xmm4,xmm0
+ pandn xmm4,xmm3
+ movdqa xmm3,[esp+40h]
+ movdqa xmm1,xmm0
+ pand xmm1,xmm6
+ por xmm1,xmm4
+; p0' high, same formula on the Cr lanes -> xmm4; then pack p0' into xmm1
+ movdqa xmm6,xmm3
+ paddw xmm6,xmm3
+ movdqa xmm3,[esp+10h]
+ paddw xmm6,xmm2
+ paddw xmm6,[esp+20h]
+ paddw xmm6,xmm3
+ psraw xmm6,2
+ movdqa xmm4,xmm5
+ pand xmm4,xmm6
+ movdqa xmm6,xmm5
+ pandn xmm6,xmm2
+ por xmm4,xmm6
+ packuswb xmm1,xmm4
+; q0' low = (2*q1 + q0 + p1 + 2) >> 2, selected by the mask -> xmm2
+ movdqa xmm4,[esp+50h]
+ movdqa xmm6,xmm7
+ paddw xmm6,xmm7
+ paddw xmm6,xmm4
+ paddw xmm6,[esp+60h]
+ paddw xmm6,xmm3
+ psraw xmm6,2
+ movdqa xmm2,xmm0
+ pand xmm2,xmm6
+ pandn xmm0,xmm4
+ por xmm2,xmm0
+; q0' high -> xmm4; the Cb p0' store is interleaved with the arithmetic
+ movdqa xmm0,[esp+20h]
+ movdqa xmm6,xmm0
+ paddw xmm6,xmm0
+ movdqa xmm0,[esp+30h]
+ paddw xmm6,xmm0
+ paddw xmm6,[esp+40h]
+ movdqa xmm4,xmm5
+ paddw xmm6,xmm3
+ movq [esi],xmm1
+ psraw xmm6,2
+ pand xmm4,xmm6
+ pandn xmm5,xmm0
+ por xmm4,xmm5
+ packuswb xmm2,xmm4
+; Stores: [eax] = Cb q0', [edi] = Cr p0', [ecx] = Cr q0'
+ movq [eax],xmm2
+ psrldq xmm1,8
+ movq [edi],xmm1
+ pop edi
+ psrldq xmm2,8
+ movq [ecx],xmm2
+ pop esi
+ mov esp,ebp
+ pop ebp
+ ret
+
+;******************************************************************************
+; void DeblockChromaLt4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+; int32_t iAlpha, int32_t iBeta, int8_t * pTC);
+;
+; X86_32 variant. Arguments are read from the stack relative to ebp:
+; [ebp+08h]=pPixCb, [ebp+0Ch]=pPixCr, [ebp+10h]=iStride, [ebp+14h]=iAlpha,
+; [ebp+18h]=iBeta, [ebp+1Ch]=pTC. Only the low 16 bits of iAlpha/iBeta are used;
+; pTC[0..3] are read as signed bytes and each value covers two adjacent lanes.
+;
+; For each plane 8 bytes are loaded from the rows at -2*iStride, -iStride, 0 and
+; +iStride relative to the pixel pointer; the comments below call these rows
+; p1, p0, q0 and q1. Cb sits in the low qword and Cr in the high qword, and the
+; same tc vector is applied to both halves. Per 16-bit lane:
+;   delta = min(tc, max(-tc, (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3))
+;   mask  = (iAlpha > |p0-q0|) & (iBeta > |p1-p0|) & (iBeta > |q1-q0|) & (tc > 0)
+;   p0'   = p0 + (delta & mask)
+;   q0'   = q0 - (delta & mask)
+; p0' and q0' are packed with unsigned saturation and written back to the
+; -iStride row and the current row of both planes (8 bytes each).
+;
+; NOTE(review): pabsw is an SSSE3 instruction, so despite the _sse2 suffix this
+; routine needs SSSE3 at run time - confirm that the caller's CPU dispatch
+; checks for it.
+;*******************************************************************************
+
+WELS_EXTERN DeblockChromaLt4V_sse2
+
+; Entry aligned like the other routines in this file; the padding follows the
+; previous routine's ret and is never executed.
+ALIGN 16
+DeblockChromaLt4V_sse2:
+; Prologue: 0E4h bytes of scratch plus the three 4-byte pushes below (0F0h
+; total) keep esp 16-byte aligned for the movdqa spill slots.
+ push ebp
+ mov ebp,esp
+ and esp,0FFFFFFF0h
+ sub esp,0E4h
+ push ebx
+ push esi
+; Read the four tc bytes: di = tc[3], word [esp+0Ch] = tc[2], bx = tc[1],
+; word [esp+0Eh] = tc[0] (the two words sit just above the three saved registers)
+ mov esi, [ebp+1Ch] ; pTC
+ movsx ebx, byte [esi+2]
+ push edi
+ movsx di,byte [esi+3]
+ mov word [esp+0Ch],bx
+ movsx bx,byte [esi+1]
+ movsx esi,byte [esi]
+ mov word [esp+0Eh],si
+; Copy each tc word into two xmm registers: xmm1/xmm2 = tc[3],
+; xmm3/xmm4 = tc[2], xmm5/xmm6 = tc[1], xmm7/xmm0 = tc[0]
+ movzx esi,di
+ movd xmm1,esi
+ movzx esi,di
+ movd xmm2,esi
+ mov si,word [esp+0Ch]
+ mov edx, [ebp + 10h]
+ mov eax, [ebp + 08h]
+ movzx edi,si
+ movzx esi,si
+ mov ecx, [ebp + 0Ch]
+ movd xmm4,esi
+ movzx esi,bx
+ movd xmm5,esi
+ movd xmm3,edi
+ movzx esi,bx
+ movd xmm6,esi
+ mov si,word [esp+0Eh]
+ movzx edi,si
+ movzx esi,si
+ punpcklwd xmm6,xmm2
+ pxor xmm0,xmm0
+; [esp+40h] holds a zero vector until it is reused for the tc > 0 mask
+ movdqa [esp+40h],xmm0
+ movd xmm7,edi
+ movd xmm0,esi
+ lea esi,[edx+edx]
+ mov edi,eax
+ sub edi,esi
+ punpcklwd xmm5,xmm1
+ movdqa xmm1,[esp+40h]
+ punpcklwd xmm0,xmm4
+ movq xmm4,[edx+ecx]
+ punpcklwd xmm7,xmm3
+ movq xmm3,[eax]
+ punpcklwd xmm0,xmm6
+ movq xmm6,[edi]
+ punpcklwd xmm7,xmm5
+; xmm0 = tc words {tc0,tc0,tc1,tc1,tc2,tc2,tc3,tc3}
+ punpcklwd xmm0,xmm7
+ mov edi,ecx
+ sub edi,esi
+; [esp+60h] = -tc (lower clamp bound)
+ movdqa xmm2,xmm1
+ psubw xmm2,xmm0
+ movdqa [esp+60h],xmm2
+; xmm6 = p1 rows (Cb low, Cr high)
+ movq xmm2, [edi]
+ punpcklqdq xmm6,xmm2
+; xmm7 = p0 rows; esi/edi keep the Cb/Cr p0 row addresses for the final stores
+ mov esi,eax
+ sub esi,edx
+ movq xmm7,[esi]
+ mov edi,ecx
+ sub edi,edx
+ movq xmm2,[edi]
+ punpcklqdq xmm7,xmm2
+; xmm3 = q0 rows; q1 rows -> [esp+0E0h]
+ movq xmm2,[ecx]
+ punpcklqdq xmm3,xmm2
+ movq xmm2,[edx+eax]
+ movsx edx,word [ebp + 14h]
+ punpcklqdq xmm2,xmm4
+ movdqa [esp+0E0h],xmm2
+; xmm4 = iAlpha broadcast (completed by the pshufd below), [esp+50h] = iBeta
+ movd xmm2,edx
+ movsx edx,word [ebp + 18h]
+ movdqa xmm4,xmm2
+ punpcklwd xmm4,xmm2
+ movd xmm2,edx
+ movdqa xmm5,xmm2
+ punpcklwd xmm5,xmm2
+ pshufd xmm2,xmm5,0
+ movdqa [esp+50h],xmm2
+; Widen bytes to words (xmm1 = 0). Low (Cb) halves: p1->[esp+30h], p0->xmm2,
+; q0->xmm3, q1->xmm5. High (Cr) halves: p1->[esp+80h], p0->[esp+0A0h],
+; q0->[esp+70h], q1->[esp+90h].
+ movdqa xmm2,xmm6
+ punpcklbw xmm2,xmm1
+ movdqa [esp+0D0h],xmm3
+ pshufd xmm4,xmm4,0
+ movdqa [esp+30h],xmm2
+ punpckhbw xmm6,xmm1
+ movdqa [esp+80h],xmm6
+ movdqa xmm6,[esp+0D0h]
+ punpckhbw xmm6,xmm1
+ movdqa [esp+70h],xmm6
+ movdqa xmm6, [esp+0E0h]
+ punpckhbw xmm6,xmm1
+ movdqa [esp+90h],xmm6
+ movdqa xmm5, [esp+0E0h]
+ movdqa xmm2,xmm7
+ punpckhbw xmm7,xmm1
+ punpcklbw xmm5,xmm1
+ movdqa [esp+0A0h],xmm7
+ punpcklbw xmm3,xmm1
+; Rounding constant 4 broadcast to 8 words -> [esp+20h]
+ mov edx,4
+ punpcklbw xmm2,xmm1
+ movsx edx,dx
+ movd xmm6,edx
+ movdqa xmm7,xmm6
+ punpcklwd xmm7,xmm6
+ pshufd xmm6,xmm7,0
+ movdqa xmm7,[esp+30h]
+ movdqa [esp+20h],xmm6
+; xmm7 = p1 - q1 (low half); [esp+40h] = (tc > 0) mask
+ psubw xmm7,xmm5
+ movdqa xmm6,xmm0
+ pcmpgtw xmm6,xmm1
+ movdqa xmm1,[esp+60h]
+ movdqa [esp+40h],xmm6
+; Low-half delta = clamp((((q0-p0)<<2) + (p1-q1) + 4) >> 3, -tc, tc) -> [esp+10h]
+ movdqa xmm6,xmm3
+ psubw xmm6,xmm2
+ psllw xmm6,2
+ paddw xmm6,xmm7
+ paddw xmm6, [esp+20h]
+ movdqa xmm7, [esp+50h]
+ psraw xmm6,3
+ pmaxsw xmm1,xmm6
+ movdqa [esp+10h],xmm0
+ movdqa xmm6, [esp+10h]
+ pminsw xmm6,xmm1
+ movdqa [esp+10h],xmm6
+; Low-half mask -> xmm6: alpha > |p0-q0|, beta > |p1-p0|, beta > |q1-q0|, tc > 0
+ movdqa xmm1,xmm2
+ psubw xmm1,xmm3
+ pabsw xmm1,xmm1
+ movdqa xmm6,xmm4
+ pcmpgtw xmm6,xmm1
+ movdqa xmm1, [esp+30h]
+ psubw xmm1,xmm2
+ pabsw xmm1,xmm1
+ pcmpgtw xmm7,xmm1
+ movdqa xmm1,[esp+50h]
+ pand xmm6,xmm7
+ movdqa xmm7,[esp+50h]
+ psubw xmm5,xmm3
+ pabsw xmm5,xmm5
+ pcmpgtw xmm1,xmm5
+ movdqa xmm5,[esp+80h]
+ psubw xmm5,[esp+90h]
+ pand xmm6,xmm1
+ pand xmm6,[esp+40h]
+; Masked low-half delta -> [esp+30h]
+ movdqa xmm1,[esp+10h]
+ pand xmm1,xmm6
+ movdqa xmm6,[esp+70h]
+ movdqa [esp+30h],xmm1
+; High-half delta with the same formula -> xmm0 (overwrites tc)
+ movdqa xmm1,[esp+0A0h]
+ psubw xmm6,xmm1
+ psllw xmm6,2
+ paddw xmm6,xmm5
+ paddw xmm6,[esp+20h]
+ movdqa xmm5,[esp+60h]
+ psraw xmm6,3
+ pmaxsw xmm5,xmm6
+ pminsw xmm0,xmm5
+; High-half mask -> xmm4 (consumes the alpha register)
+ movdqa xmm5,[esp+70h]
+ movdqa xmm6,xmm1
+ psubw xmm6,xmm5
+ pabsw xmm6,xmm6
+ pcmpgtw xmm4,xmm6
+ movdqa xmm6,[esp+80h]
+ psubw xmm6,xmm1
+ pabsw xmm6,xmm6
+ pcmpgtw xmm7,xmm6
+ movdqa xmm6,[esp+90h]
+ pand xmm4,xmm7
+ movdqa xmm7,[esp+50h]
+ psubw xmm6,xmm5
+ pabsw xmm6,xmm6
+ pcmpgtw xmm7,xmm6
+ pand xmm4,xmm7
+ pand xmm4,[esp+40h]
+ pand xmm0,xmm4
+; p0' = p0 + delta, q0' = q0 - delta; pack with unsigned saturation and store:
+; [esi] = Cb p0', [eax] = Cb q0', [edi] = Cr p0', [ecx] = Cr q0'
+ movdqa xmm4,[esp+30h]
+ paddw xmm2,xmm4
+ paddw xmm1,xmm0
+ packuswb xmm2,xmm1
+ movq [esi],xmm2
+ psubw xmm3,xmm4
+ psubw xmm5,xmm0
+ packuswb xmm3,xmm5
+ movq [eax],xmm3
+ psrldq xmm2,8
+ movq [edi],xmm2
+ pop edi
+ pop esi
+ psrldq xmm3,8
+ movq [ecx],xmm3
+ pop ebx
+ mov esp,ebp
+ pop ebp
+ ret
+
+;***************************************************************************
+; void DeblockChromaEq4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+; int32_t iAlpha, int32_t iBeta)
+;
+; X86_32 variant. Arguments are read from the stack relative to ebp:
+; [ebp+8]=pPixCb, [ebp+0Ch]=pPixCr, [ebp+10h]=iStride, [ebp+14h]=iAlpha,
+; [ebp+18h]=iBeta. Only the low 16 bits of iAlpha/iBeta are used.
+;
+; For each plane, 4 bytes starting at (pointer - 2) are read from 8 consecutive
+; rows. The 16x4 byte block is transposed into four 16-byte vectors, one per
+; column; the comments below call columns -2, -1, 0, +1 p1, p0, q0, q1 (Cb rows
+; in the low qword, Cr rows in the high qword). Per 16-bit lane:
+;   mask = (iAlpha > |p0-q0|) & (iBeta > |p1-p0|) & (iBeta > |q1-q0|)
+;   p0'  = mask ? (2*p1 + p0 + q1 + 2) >> 2 : p0
+;   q0'  = mask ? (2*q1 + q0 + p1 + 2) >> 2 : q0
+; The four columns (p1, p0', q0', q1) are transposed back and all 4 bytes of
+; every row are rewritten.
+;
+; NOTE(review): pabsw is an SSSE3 instruction, so despite the _sse2 suffix this
+; routine needs SSSE3 at run time - confirm that the caller's CPU dispatch
+; checks for it.
+;***************************************************************************
+
+WELS_EXTERN DeblockChromaEq4H_sse2
+
+ALIGN 16
+
+DeblockChromaEq4H_sse2:
+; Prologue: 0C8h bytes of scratch plus the two 4-byte pushes below (0D0h total)
+; keep esp 16-byte aligned for the movdqa accesses.
+ push ebp
+ mov ebp,esp
+ and esp,0FFFFFFF0h
+ sub esp,0C8h
+ mov ecx,dword [ebp+8]
+ mov edx,dword [ebp+0Ch]
+ mov eax,dword [ebp+10h]
+ sub ecx,2
+ sub edx,2
+ push esi
+; Slots written before "push edi" are addressed 4 bytes higher afterwards.
+; Layout once both pushes are done:
+;   [esp+08h]=pPixCr-2            [esp+0Ch]=pPixCr-2+4*iStride
+;   [esp+10h]=&transpose buffer   [esp+14h]=3*iStride
+;   [esp+18h]=pPixCb-2+4*iStride  [esp+1Ch]=pPixCb-2
+; The transpose buffer is the 64 bytes at [esp+80h]..[esp+0BFh].
+ lea esi,[eax+eax*2]
+ mov dword [esp+18h],ecx
+ mov dword [esp+4],edx
+ lea ecx,[ecx+eax*4]
+ lea edx,[edx+eax*4]
+ lea eax,[esp+7Ch]
+ push edi
+ mov dword [esp+14h],esi
+ mov dword [esp+18h],ecx
+ mov dword [esp+0Ch],edx
+ mov dword [esp+10h],eax
+; Gather rows: xmm0..xmm3 get Cb rows 0..3 in dword 0 and Cr rows 0..3 in dword 1
+ mov esi,dword [esp+1Ch]
+ mov ecx,dword [ebp+10h]
+ mov edx,dword [esp+14h]
+ movd xmm0,dword [esi]
+ movd xmm1,dword [esi+ecx]
+ movd xmm2,dword [esi+ecx*2]
+ movd xmm3,dword [esi+edx]
+ mov esi,dword [esp+8]
+ movd xmm4,dword [esi]
+ movd xmm5,dword [esi+ecx]
+ movd xmm6,dword [esi+ecx*2]
+ movd xmm7,dword [esi+edx]
+ punpckldq xmm0,xmm4
+ punpckldq xmm1,xmm5
+ punpckldq xmm2,xmm6
+ punpckldq xmm3,xmm7
+; ... and Cb/Cr rows 4..7 in the high qwords
+ mov esi,dword [esp+18h]
+ mov edi,dword [esp+0Ch]
+ movd xmm4,dword [esi]
+ movd xmm5,dword [edi]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm0,xmm4
+ movd xmm4,dword [esi+ecx]
+ movd xmm5,dword [edi+ecx]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm1,xmm4
+ movd xmm4,dword [esi+ecx*2]
+ movd xmm5,dword [edi+ecx*2]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm2,xmm4
+ movd xmm4,dword [esi+edx]
+ movd xmm5,dword [edi+edx]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm3,xmm4
+; Byte transpose: afterwards xmm0 = p1, xmm5 = p0, xmm1 = q0, xmm6 = q1
+ movdqa xmm6,xmm0
+ punpcklbw xmm0,xmm1
+ punpckhbw xmm6,xmm1
+ movdqa xmm7,xmm2
+ punpcklbw xmm2,xmm3
+ punpckhbw xmm7,xmm3
+ movdqa xmm4,xmm0
+ movdqa xmm5,xmm6
+ punpcklwd xmm0,xmm2
+ punpckhwd xmm4,xmm2
+ punpcklwd xmm6,xmm7
+ punpckhwd xmm5,xmm7
+ movdqa xmm1,xmm0
+ movdqa xmm2,xmm4
+ punpckldq xmm0,xmm6
+ punpckhdq xmm1,xmm6
+ punpckldq xmm4,xmm5
+ punpckhdq xmm2,xmm5
+ movdqa xmm5,xmm0
+ movdqa xmm6,xmm1
+ punpcklqdq xmm0,xmm4
+ punpckhqdq xmm5,xmm4
+ punpcklqdq xmm1,xmm2
+ punpckhqdq xmm6,xmm2
+; Spill the columns: [esp+80h]=p1, [esp+90h]=p0, [esp+0A0h]=q0, [esp+0B0h]=q1
+ mov edi,dword [esp+10h]
+ movdqa [edi],xmm0
+ movdqa [edi+10h],xmm5
+ movdqa [edi+20h],xmm1
+ movdqa [edi+30h],xmm6
+ movsx ecx,word [ebp+14h]
+ movsx edx,word [ebp+18h]
+ movdqa xmm6,[esp+80h]
+ movdqa xmm4,[esp+90h]
+ movdqa xmm5,[esp+0A0h]
+ movdqa xmm7,[esp+0B0h]
+; xmm1 = iAlpha broadcast to 8 words, xmm2 = iBeta broadcast, xmm0 = 0
+ pxor xmm0,xmm0
+ movd xmm1,ecx
+ movdqa xmm2,xmm1
+ punpcklwd xmm2,xmm1
+ pshufd xmm1,xmm2,0
+ movd xmm2,edx
+ movdqa xmm3,xmm2
+ punpcklwd xmm3,xmm2
+ pshufd xmm2,xmm3,0
+; Widen bytes to words. High (Cr) halves: p1->[esp+60h], p0->[esp+30h],
+; q0->[esp+40h], q1->[esp+70h]. Low (Cb) halves: p1->xmm3, p0->xmm4,
+; q0->xmm5, q1->[esp+50h].
+ movdqa xmm3,xmm6
+ punpckhbw xmm6,xmm0
+ movdqa [esp+60h],xmm6
+ movdqa xmm6,[esp+90h]
+ punpckhbw xmm6,xmm0
+ movdqa [esp+30h],xmm6
+ movdqa xmm6,[esp+0A0h]
+ punpckhbw xmm6,xmm0
+ movdqa [esp+40h],xmm6
+ movdqa xmm6,[esp+0B0h]
+ punpckhbw xmm6,xmm0
+ movdqa [esp+70h],xmm6
+ punpcklbw xmm7,xmm0
+ punpcklbw xmm4,xmm0
+ punpcklbw xmm5,xmm0
+ punpcklbw xmm3,xmm0
+ movdqa [esp+50h],xmm7
+; Low-half mask -> xmm0: alpha > |p0-q0|, beta > |p1-p0|, beta > |q1-q0|
+ movdqa xmm6,xmm4
+ psubw xmm6,xmm5
+ pabsw xmm6,xmm6
+ movdqa xmm0,xmm1
+ pcmpgtw xmm0,xmm6
+ movdqa xmm6,xmm3
+ psubw xmm6,xmm4
+ pabsw xmm6,xmm6
+ movdqa xmm7,xmm2
+ pcmpgtw xmm7,xmm6
+ movdqa xmm6,[esp+50h]
+ psubw xmm6,xmm5
+ pabsw xmm6,xmm6
+ pand xmm0,xmm7
+ movdqa xmm7,xmm2
+ pcmpgtw xmm7,xmm6
+; High-half mask -> xmm1 (consumes the alpha and beta registers)
+ movdqa xmm6,[esp+30h]
+ psubw xmm6,[esp+40h]
+ pabsw xmm6,xmm6
+ pcmpgtw xmm1,xmm6
+ movdqa xmm6,[esp+60h]
+ psubw xmm6,[esp+30h]
+ pabsw xmm6,xmm6
+ pand xmm0,xmm7
+ movdqa xmm7,xmm2
+ pcmpgtw xmm7,xmm6
+ movdqa xmm6,[esp+70h]
+ psubw xmm6,[esp+40h]
+ pabsw xmm6,xmm6
+ pand xmm1,xmm7
+ pcmpgtw xmm2,xmm6
+ pand xmm1,xmm2
+; Rounding constant 2 broadcast to 8 words -> [esp+20h]
+ mov eax,2
+ movsx ecx,ax
+ movd xmm2,ecx
+ movdqa xmm6,xmm2
+ punpcklwd xmm6,xmm2
+ pshufd xmm2,xmm6,0
+ movdqa [esp+20h],xmm2
+; p0' low = (2*p1 + p0 + q1 + 2) >> 2, selected by the mask -> xmm6
+ movdqa xmm2,xmm3
+ paddw xmm2,xmm3
+ paddw xmm2,xmm4
+ paddw xmm2,[esp+50h]
+ paddw xmm2,[esp+20h]
+ psraw xmm2,2
+ movdqa xmm6,xmm0
+ pand xmm6,xmm2
+ movdqa xmm2,xmm0
+ pandn xmm2,xmm4
+ por xmm6,xmm2
+; p0' high -> xmm4; the packed p0' column replaces [esp+90h]
+ movdqa xmm2,[esp+60h]
+ movdqa xmm7,xmm2
+ paddw xmm7,xmm2
+ paddw xmm7,[esp+30h]
+ paddw xmm7,[esp+70h]
+ paddw xmm7,[esp+20h]
+ movdqa xmm4,xmm1
+ movdqa xmm2,xmm1
+ pandn xmm2,[esp+30h]
+ psraw xmm7,2
+ pand xmm4,xmm7
+ por xmm4,xmm2
+ movdqa xmm2,[esp+50h]
+ packuswb xmm6,xmm4
+ movdqa [esp+90h],xmm6
+; q0' low = (2*q1 + q0 + p1 + 2) >> 2, selected by the mask -> xmm4
+ movdqa xmm6,xmm2
+ paddw xmm6,xmm2
+ movdqa xmm2,[esp+20h]
+ paddw xmm6,xmm5
+ paddw xmm6,xmm3
+ movdqa xmm4,xmm0
+ pandn xmm0,xmm5
+ paddw xmm6,xmm2
+ psraw xmm6,2
+ pand xmm4,xmm6
+ por xmm4,xmm0
+; q0' high -> xmm3; the packed q0' column replaces [esp+0A0h]
+ movdqa xmm0,[esp+70h]
+ movdqa xmm5,xmm0
+ paddw xmm5,xmm0
+ movdqa xmm0,[esp+40h]
+ paddw xmm5,xmm0
+ paddw xmm5,[esp+60h]
+ movdqa xmm3,xmm1
+ paddw xmm5,xmm2
+ psraw xmm5,2
+ pand xmm3,xmm5
+ pandn xmm1,xmm0
+ por xmm3,xmm1
+ packuswb xmm4,xmm3
+ movdqa [esp+0A0h],xmm4
+; Reload the four columns and transpose back to rows. Afterwards xmm0, xmm5,
+; xmm1, xmm6 hold rows 0,1,2,3 (dword 0 = Cb, dword 1 = Cr) and rows 4,5,6,7
+; (dword 2 = Cb, dword 3 = Cr).
+ mov esi,dword [esp+10h]
+ movdqa xmm0,[esi]
+ movdqa xmm1,[esi+10h]
+ movdqa xmm2,[esi+20h]
+ movdqa xmm3,[esi+30h]
+ movdqa xmm6,xmm0
+ punpcklbw xmm0,xmm1
+ punpckhbw xmm6,xmm1
+ movdqa xmm7,xmm2
+ punpcklbw xmm2,xmm3
+ punpckhbw xmm7,xmm3
+ movdqa xmm4,xmm0
+ movdqa xmm5,xmm6
+ punpcklwd xmm0,xmm2
+ punpckhwd xmm4,xmm2
+ punpcklwd xmm6,xmm7
+ punpckhwd xmm5,xmm7
+ movdqa xmm1,xmm0
+ movdqa xmm2,xmm4
+ punpckldq xmm0,xmm6
+ punpckhdq xmm1,xmm6
+ punpckldq xmm4,xmm5
+ punpckhdq xmm2,xmm5
+ movdqa xmm5,xmm0
+ movdqa xmm6,xmm1
+ punpcklqdq xmm0,xmm4
+ punpckhqdq xmm5,xmm4
+ punpcklqdq xmm1,xmm2
+ punpckhqdq xmm6,xmm2
+; Store 4 bytes per row: Cb rows 0..3, Cr rows 0..3, Cb rows 4..7, Cr rows 4..7,
+; shifting the next dword into place between groups
+ mov esi,dword [esp+1Ch]
+ mov ecx,dword [ebp+10h]
+ mov edx,dword [esp+14h]
+ mov edi,dword [esp+8]
+ movd dword [esi],xmm0
+ movd dword [esi+ecx],xmm5
+ movd dword [esi+ecx*2],xmm1
+ movd dword [esi+edx],xmm6
+ psrldq xmm0,4
+ psrldq xmm5,4
+ psrldq xmm1,4
+ psrldq xmm6,4
+ mov esi,dword [esp+18h]
+ movd dword [edi],xmm0
+ movd dword [edi+ecx],xmm5
+ movd dword [edi+ecx*2],xmm1
+ movd dword [edi+edx],xmm6
+ psrldq xmm0,4
+ psrldq xmm5,4
+ psrldq xmm1,4
+ psrldq xmm6,4
+ movd dword [esi],xmm0
+ movd dword [esi+ecx],xmm5
+ movd dword [esi+ecx*2],xmm1
+ movd dword [esi+edx],xmm6
+ psrldq xmm0,4
+ psrldq xmm5,4
+ psrldq xmm1,4
+ psrldq xmm6,4
+ mov edi,dword [esp+0Ch]
+ movd dword [edi],xmm0
+ movd dword [edi+ecx],xmm5
+ movd dword [edi+ecx*2],xmm1
+ movd dword [edi+edx],xmm6
+ pop edi
+ pop esi
+ mov esp,ebp
+ pop ebp
+ ret
+
+;*******************************************************************************
+; void DeblockChromaLt4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+; int32_t iAlpha, int32_t iBeta, int8_t * pTC);
+;
+; X86_32 variant. Arguments are read from the stack relative to ebp:
+; [ebp+8]=pPixCb, [ebp+0Ch]=pPixCr, [ebp+10h]=iStride, [ebp+14h]=iAlpha,
+; [ebp+18h]=iBeta, [ebp+1Ch]=pTC. Only the low 16 bits of iAlpha/iBeta are used;
+; pTC[0..3] are read as signed bytes and each value covers two adjacent lanes.
+;
+; For each plane, 4 bytes starting at (pointer - 2) are read from 8 consecutive
+; rows. The 16x4 byte block is transposed into four 16-byte vectors, one per
+; column; the comments below call columns -2, -1, 0, +1 p1, p0, q0, q1 (Cb rows
+; in the low qword, Cr rows in the high qword; the same tc vector is applied to
+; both halves). Per 16-bit lane:
+;   delta = min(tc, max(-tc, (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3))
+;   mask  = (iAlpha > |p0-q0|) & (iBeta > |p1-p0|) & (iBeta > |q1-q0|) & (tc > 0)
+;   p0'   = p0 + (delta & mask)
+;   q0'   = q0 - (delta & mask)
+; The four columns (p1, p0', q0', q1) are transposed back and all 4 bytes of
+; every row are rewritten.
+;
+; NOTE(review): pabsw is an SSSE3 instruction, so despite the _sse2 suffix this
+; routine needs SSSE3 at run time - confirm that the caller's CPU dispatch
+; checks for it.
+;*******************************************************************************
+
+WELS_EXTERN DeblockChromaLt4H_sse2
+
+ALIGN 16
+
+DeblockChromaLt4H_sse2:
+; Prologue: 108h bytes of scratch plus the two 4-byte pushes below (110h total)
+; keep esp 16-byte aligned for the movdqa accesses.
+ push ebp
+ mov ebp,esp
+ and esp,0FFFFFFF0h
+ sub esp,108h
+ mov ecx,dword [ebp+8]
+ mov edx,dword [ebp+0Ch]
+ mov eax,dword [ebp+10h]
+ sub ecx,2
+ sub edx,2
+ push esi
+; Slots written before "push edi" are addressed 4 bytes higher afterwards.
+; Layout once both pushes are done:
+;   [esp+08h]=pPixCr-2            [esp+0Ch]=3*iStride
+;   [esp+10h]=pPixCr-2+4*iStride  [esp+14h]=pPixCb-2
+;   [esp+18h]=pPixCb-2+4*iStride  [esp+1Ch]=&transpose buffer
+; The transpose buffer is the 64 bytes at [esp+70h]..[esp+0AFh].
+ lea esi,[eax+eax*2]
+ mov dword [esp+10h],ecx
+ mov dword [esp+4],edx
+ lea ecx,[ecx+eax*4]
+ lea edx,[edx+eax*4]
+ lea eax,[esp+6Ch]
+ push edi
+ mov dword [esp+0Ch],esi
+ mov dword [esp+18h],ecx
+ mov dword [esp+10h],edx
+ mov dword [esp+1Ch],eax
+; Gather rows: xmm0..xmm3 get Cb rows 0..3 in dword 0 and Cr rows 0..3 in dword 1
+ mov esi,dword [esp+14h]
+ mov ecx,dword [ebp+10h]
+ mov edx,dword [esp+0Ch]
+ movd xmm0,dword [esi]
+ movd xmm1,dword [esi+ecx]
+ movd xmm2,dword [esi+ecx*2]
+ movd xmm3,dword [esi+edx]
+ mov esi,dword [esp+8]
+ movd xmm4,dword [esi]
+ movd xmm5,dword [esi+ecx]
+ movd xmm6,dword [esi+ecx*2]
+ movd xmm7,dword [esi+edx]
+ punpckldq xmm0,xmm4
+ punpckldq xmm1,xmm5
+ punpckldq xmm2,xmm6
+ punpckldq xmm3,xmm7
+; ... and Cb/Cr rows 4..7 in the high qwords
+ mov esi,dword [esp+18h]
+ mov edi,dword [esp+10h]
+ movd xmm4,dword [esi]
+ movd xmm5,dword [edi]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm0,xmm4
+ movd xmm4,dword [esi+ecx]
+ movd xmm5,dword [edi+ecx]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm1,xmm4
+ movd xmm4,dword [esi+ecx*2]
+ movd xmm5,dword [edi+ecx*2]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm2,xmm4
+ movd xmm4,dword [esi+edx]
+ movd xmm5,dword [edi+edx]
+ punpckldq xmm4,xmm5
+ punpcklqdq xmm3,xmm4
+; Byte transpose: afterwards xmm0 = p1, xmm5 = p0, xmm1 = q0, xmm6 = q1
+ movdqa xmm6,xmm0
+ punpcklbw xmm0,xmm1
+ punpckhbw xmm6,xmm1
+ movdqa xmm7,xmm2
+ punpcklbw xmm2,xmm3
+ punpckhbw xmm7,xmm3
+ movdqa xmm4,xmm0
+ movdqa xmm5,xmm6
+ punpcklwd xmm0,xmm2
+ punpckhwd xmm4,xmm2
+ punpcklwd xmm6,xmm7
+ punpckhwd xmm5,xmm7
+ movdqa xmm1,xmm0
+ movdqa xmm2,xmm4
+ punpckldq xmm0,xmm6
+ punpckhdq xmm1,xmm6
+ punpckldq xmm4,xmm5
+ punpckhdq xmm2,xmm5
+ movdqa xmm5,xmm0
+ movdqa xmm6,xmm1
+ punpcklqdq xmm0,xmm4
+ punpckhqdq xmm5,xmm4
+ punpcklqdq xmm1,xmm2
+ punpckhqdq xmm6,xmm2
+; Spill the columns: [esp+70h]=p1, [esp+80h]=p0, [esp+90h]=q0, [esp+0A0h]=q1
+ mov edi,dword [esp+1Ch]
+ movdqa [edi],xmm0
+ movdqa [edi+10h],xmm5
+ movdqa [edi+20h],xmm1
+ movdqa [edi+30h],xmm6
+; Read the four tc bytes: cx = tc[3], dx = tc[2], si = tc[1], ax = tc[0]
+ mov eax,dword [ebp+1Ch]
+ movsx cx,byte [eax+3]
+ movsx dx,byte [eax+2]
+ movsx si,byte [eax+1]
+ movsx ax,byte [eax]
+; Copy each tc word into two xmm registers and interleave them into
+; xmm0 = {tc0,tc0,tc1,tc1,tc2,tc2,tc3,tc3}; the loads of the alpha/beta
+; arguments and of the spilled columns are interleaved with this.
+ movzx edi,cx
+ movzx ecx,cx
+ movd xmm2,ecx
+ movzx ecx,dx
+ movzx edx,dx
+ movd xmm3,ecx
+ movd xmm4,edx
+ movzx ecx,si
+ movzx edx,si
+ movd xmm5,ecx
+ pxor xmm0,xmm0
+ movd xmm6,edx
+ movzx ecx,ax
+; [esp+60h] holds a zero vector until it is reused for the tc > 0 mask
+ movdqa [esp+60h],xmm0
+ movzx edx,ax
+ movsx eax,word [ebp+14h]
+ punpcklwd xmm6,xmm2
+ movd xmm1,edi
+ movd xmm7,ecx
+ movsx ecx,word [ebp+18h]
+ movd xmm0,edx
+ punpcklwd xmm7,xmm3
+ punpcklwd xmm5,xmm1
+ movdqa xmm1,[esp+60h]
+ punpcklwd xmm7,xmm5
+ movdqa xmm5,[esp+0A0h]
+ punpcklwd xmm0,xmm4
+ punpcklwd xmm0,xmm6
+ movdqa xmm6, [esp+70h]
+ punpcklwd xmm0,xmm7
+ movdqa xmm7,[esp+80h]
+; [esp+0D0h] = -tc (lower clamp bound)
+ movdqa xmm2,xmm1
+ psubw xmm2,xmm0
+ movdqa [esp+0D0h],xmm2
+; xmm4 = iAlpha broadcast to 8 words, [esp+50h] = iBeta broadcast
+ movd xmm2,eax
+ movdqa xmm3,xmm2
+ punpcklwd xmm3,xmm2
+ pshufd xmm4,xmm3,0
+ movd xmm2,ecx
+ movdqa xmm3,xmm2
+ punpcklwd xmm3,xmm2
+ pshufd xmm2,xmm3,0
+ movdqa xmm3, [esp+90h]
+ movdqa [esp+50h],xmm2
+; Widen bytes to words (xmm1 = 0). Low (Cb) halves: p1->[esp+40h], p0->xmm2,
+; q0->xmm3, q1->xmm5. High (Cr) halves: p1->[esp+0B0h], p0->[esp+0F0h],
+; q0->[esp+0C0h], q1->[esp+0E0h].
+ movdqa xmm2,xmm6
+ punpcklbw xmm2,xmm1
+ punpckhbw xmm6,xmm1
+ movdqa [esp+40h],xmm2
+ movdqa [esp+0B0h],xmm6
+ movdqa xmm6,[esp+90h]
+ movdqa xmm2,xmm7
+ punpckhbw xmm7,xmm1
+ punpckhbw xmm6,xmm1
+ punpcklbw xmm2,xmm1
+ punpcklbw xmm3,xmm1
+ punpcklbw xmm5,xmm1
+ movdqa [esp+0F0h],xmm7
+ movdqa [esp+0C0h],xmm6
+ movdqa xmm6, [esp+0A0h]
+ punpckhbw xmm6,xmm1
+ movdqa [esp+0E0h],xmm6
+; Rounding constant 4 broadcast to 8 words -> [esp+30h]
+ mov edx,4
+ movsx eax,dx
+ movd xmm6,eax
+ movdqa xmm7,xmm6
+ punpcklwd xmm7,xmm6
+ pshufd xmm6,xmm7,0
+ movdqa [esp+30h],xmm6
+; xmm7 = p1 - q1 (low half); [esp+60h] = (tc > 0) mask
+ movdqa xmm7, [esp+40h]
+ psubw xmm7,xmm5
+ movdqa xmm6,xmm0
+ pcmpgtw xmm6,xmm1
+ movdqa [esp+60h],xmm6
+; Low-half delta = clamp((((q0-p0)<<2) + (p1-q1) + 4) >> 3, -tc, tc) -> [esp+20h]
+ movdqa xmm1, [esp+0D0h]
+ movdqa xmm6,xmm3
+ psubw xmm6,xmm2
+ psllw xmm6,2
+ paddw xmm6,xmm7
+ paddw xmm6,[esp+30h]
+ psraw xmm6,3
+ pmaxsw xmm1,xmm6
+ movdqa xmm7,[esp+50h]
+ movdqa [esp+20h],xmm0
+ movdqa xmm6, [esp+20h]
+ pminsw xmm6,xmm1
+ movdqa [esp+20h],xmm6
+; Low-half mask -> xmm6: alpha > |p0-q0|, beta > |p1-p0|, beta > |q1-q0|, tc > 0
+ movdqa xmm6,xmm4
+ movdqa xmm1,xmm2
+ psubw xmm1,xmm3
+ pabsw xmm1,xmm1
+ pcmpgtw xmm6,xmm1
+ movdqa xmm1, [esp+40h]
+ psubw xmm1,xmm2
+ pabsw xmm1,xmm1
+ pcmpgtw xmm7,xmm1
+ movdqa xmm1, [esp+50h]
+ pand xmm6,xmm7
+ movdqa xmm7, [esp+50h]
+ psubw xmm5,xmm3
+ pabsw xmm5,xmm5
+ pcmpgtw xmm1,xmm5
+ movdqa xmm5, [esp+0B0h]
+ psubw xmm5,[esp+0E0h]
+ pand xmm6,xmm1
+ pand xmm6, [esp+60h]
+; Masked low-half delta -> [esp+40h]
+ movdqa xmm1, [esp+20h]
+ pand xmm1,xmm6
+ movdqa xmm6, [esp+0C0h]
+ movdqa [esp+40h],xmm1
+; High-half delta with the same formula -> xmm0 (overwrites tc)
+ movdqa xmm1, [esp+0F0h]
+ psubw xmm6,xmm1
+ psllw xmm6,2
+ paddw xmm6,xmm5
+ paddw xmm6, [esp+30h]
+ movdqa xmm5, [esp+0D0h]
+ psraw xmm6,3
+ pmaxsw xmm5,xmm6
+ pminsw xmm0,xmm5
+; High-half mask -> xmm4 (consumes the alpha register)
+ movdqa xmm5,[esp+0C0h]
+ movdqa xmm6,xmm1
+ psubw xmm6,xmm5
+ pabsw xmm6,xmm6
+ pcmpgtw xmm4,xmm6
+ movdqa xmm6,[esp+0B0h]
+ psubw xmm6,xmm1
+ pabsw xmm6,xmm6
+ pcmpgtw xmm7,xmm6
+ movdqa xmm6, [esp+0E0h]
+ pand xmm4,xmm7
+ movdqa xmm7, [esp+50h]
+ psubw xmm6,xmm5
+ pabsw xmm6,xmm6
+ pcmpgtw xmm7,xmm6
+ pand xmm4,xmm7
+ pand xmm4,[esp+60h]
+ pand xmm0,xmm4
+; p0' = p0 + delta, q0' = q0 - delta; the packed columns replace the p0 and q0
+; entries of the transpose buffer ([esp+80h] and [esp+90h])
+ movdqa xmm4, [esp+40h]
+ paddw xmm2,xmm4
+ paddw xmm1,xmm0
+ psubw xmm3,xmm4
+ psubw xmm5,xmm0
+ packuswb xmm2,xmm1
+ packuswb xmm3,xmm5
+ movdqa [esp+80h],xmm2
+ movdqa [esp+90h],xmm3
+; Reload the four columns and transpose back to rows. Afterwards xmm0, xmm5,
+; xmm1, xmm6 hold rows 0,1,2,3 (dword 0 = Cb, dword 1 = Cr) and rows 4,5,6,7
+; (dword 2 = Cb, dword 3 = Cr).
+ mov esi,dword [esp+1Ch]
+ movdqa xmm0, [esi]
+ movdqa xmm1, [esi+10h]
+ movdqa xmm2, [esi+20h]
+ movdqa xmm3, [esi+30h]
+ movdqa xmm6,xmm0
+ punpcklbw xmm0,xmm1
+ punpckhbw xmm6,xmm1
+ movdqa xmm7,xmm2
+ punpcklbw xmm2,xmm3
+ punpckhbw xmm7,xmm3
+ movdqa xmm4,xmm0
+ movdqa xmm5,xmm6
+ punpcklwd xmm0,xmm2
+ punpckhwd xmm4,xmm2
+ punpcklwd xmm6,xmm7
+ punpckhwd xmm5,xmm7
+ movdqa xmm1,xmm0
+ movdqa xmm2,xmm4
+ punpckldq xmm0,xmm6
+ punpckhdq xmm1,xmm6
+ punpckldq xmm4,xmm5
+ punpckhdq xmm2,xmm5
+ movdqa xmm5,xmm0
+ movdqa xmm6,xmm1
+ punpcklqdq xmm0,xmm4
+ punpckhqdq xmm5,xmm4
+ punpcklqdq xmm1,xmm2
+ punpckhqdq xmm6,xmm2
+; Store 4 bytes per row: Cb rows 0..3, Cr rows 0..3, Cb rows 4..7, Cr rows 4..7,
+; shifting the next dword into place between groups
+ mov esi,dword [esp+14h]
+ mov ecx,dword [ebp+10h]
+ mov edx,dword [esp+0Ch]
+ mov edi,dword [esp+8]
+ movd dword [esi],xmm0
+ movd dword [esi+ecx],xmm5
+ movd dword [esi+ecx*2],xmm1
+ movd dword [esi+edx],xmm6
+ psrldq xmm0,4
+ psrldq xmm5,4
+ psrldq xmm1,4
+ psrldq xmm6,4
+ mov esi,dword [esp+18h]
+ movd dword [edi],xmm0
+ movd dword [edi+ecx],xmm5
+ movd dword [edi+ecx*2],xmm1
+ movd dword [edi+edx],xmm6
+ psrldq xmm0,4
+ psrldq xmm5,4
+ psrldq xmm1,4
+ psrldq xmm6,4
+ movd dword [esi],xmm0
+ movd dword [esi+ecx],xmm5
+ movd dword [esi+ecx*2],xmm1
+ movd dword [esi+edx],xmm6
+ psrldq xmm0,4
+ psrldq xmm5,4
+ psrldq xmm1,4
+ psrldq xmm6,4
+ mov edi,dword [esp+10h]
+ movd dword [edi],xmm0
+ movd dword [edi+ecx],xmm5
+ movd dword [edi+ecx*2],xmm1
+ movd dword [edi+edx],xmm6
+ pop edi
+ pop esi
+ mov esp,ebp
+ pop ebp
+ ret
+
+
+
+;*******************************************************************************
+; void DeblockLumaLt4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
+; int32_t iBeta, int8_t * pTC)
+;
+; X86_32 variant. Arguments are read from the stack relative to ebp:
+; [ebp+8]=pPix, [ebp+12]=iStride, [ebp+16]=iAlpha, [ebp+20]=iBeta, [ebp+24]=pTC.
+; Only the low 16 bits of iAlpha/iBeta are used. pTC[0..3] are read as signed
+; bytes; each value covers four adjacent lanes (pTC[0], pTC[1] for the low
+; 8 pixels, pTC[2], pTC[3] for the high 8 pixels).
+;
+; Sixteen pixels are loaded from each of the six rows at -3, -2, -1, 0, +1, +2
+; times iStride relative to pPix; the comments below call these rows
+; p2, p1, p0, q0, q1, q2. The rows are accessed with movdqa, so every one of
+; those row addresses has to be 16-byte aligned. Per 16-bit lane, with tc0 the
+; expanded pTC value:
+;   ap    = iBeta > |p0 - p2|            aq = iBeta > |q0 - q2|
+;   tc    = tc0 + (ap ? 1 : 0) + (aq ? 1 : 0)
+;   mask  = (iAlpha > |q0-p0|) & (iBeta > |q0-q1|) & (iBeta > |p0-p1|) & (tc0 >= 0)
+;   avg   = (p0 + q0 + 1) >> 1
+;   delta = min(tc, max(-tc, (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3)) & mask
+;   p1'   = p1 + (min(tc0, max(-tc0, (p2 + avg - 2*p1) >> 1)) & mask & ap)
+;   q1'   = q1 + (min(tc0, max(-tc0, (q2 + avg - 2*q1) >> 1)) & mask & aq)
+;   p0'   = p0 + delta                   q0' = q0 - delta
+; The four result rows are packed with unsigned saturation and stored back to
+; the p1, p0, q0 and q1 rows.
+;
+; NOTE(review): pabsw is an SSSE3 instruction, so despite the _sse2 suffix this
+; routine needs SSSE3 at run time - confirm that the caller's CPU dispatch
+; checks for it.
+;*******************************************************************************
+
+
+WELS_EXTERN DeblockLumaLt4V_sse2
+
+ALIGN 16
+
+DeblockLumaLt4V_sse2:
+; Prologue: 420 bytes of scratch plus the three 4-byte pushes below (432 total)
+; keep esp 16-byte aligned. Spill slots are written as [esp+432-N]; the row
+; copies use [esp+K-208] with K = 432..512 in steps of 16 for p2..q2.
+ push ebp
+ mov ebp, esp
+ and esp, -16 ; fffffff0H
+ sub esp, 420 ; 000001a4H
+ mov eax, dword [ebp+8]
+ mov ecx, dword [ebp+12]
+
+ pxor xmm0, xmm0
+ push ebx
+ mov edx, dword [ebp+24]
+; Zero vector; this slot is [esp+432-384] once esi and edi are pushed as well
+ movdqa [esp+424-384], xmm0
+ push esi
+
+; Copy the six pixel rows to the stack; edi ends up as the p1 row address
+ lea esi, [ecx+ecx*2]
+ push edi
+ mov edi, eax
+ sub edi, esi
+ movdqa xmm0, [edi]
+
+ lea esi, [ecx+ecx]
+ movdqa [esp+432-208], xmm0
+ mov edi, eax
+ sub edi, esi
+ movdqa xmm0, [edi]
+ movdqa [esp+448-208], xmm0
+
+ mov ebx, eax
+ sub ebx, ecx
+ movdqa xmm0, [ebx]
+ movdqa [esp+464-208], xmm0
+
+ movdqa xmm0, [eax]
+
+ add ecx, eax
+ movdqa [esp+480-208], xmm0
+ movdqa xmm0, [ecx]
+; [esp+432-404] = address of the q1 row (pPix + iStride)
+ mov dword [esp+432-404], ecx
+
+ movsx ecx, word [ebp+16]
+ movdqa [esp+496-208], xmm0
+ movdqa xmm0, [esi+eax]
+
+ movsx si, byte [edx]
+ movdqa [esp+512-208], xmm0
+; [esp+432-112] = iAlpha broadcast to 8 words
+ movd xmm0, ecx
+ movsx ecx, word [ebp+20]
+ movdqa xmm1, xmm0
+ punpcklwd xmm1, xmm0
+ pshufd xmm0, xmm1, 0
+ movdqa [esp+432-112], xmm0
+ movd xmm0, ecx
+ movsx cx, byte [edx+1]
+ movdqa xmm1, xmm0
+ punpcklwd xmm1, xmm0
+; [esp+432-408] = address of the p0 row (pPix - iStride)
+ mov dword [esp+432-408], ebx
+; Expand si = pTC[0] and cx = pTC[1] into {tc0 x4, tc1 x4} -> [esp+432-400];
+; iBeta broadcast -> [esp+432-336]
+ movzx ebx, cx
+ pshufd xmm0, xmm1, 0
+ movd xmm1, ebx
+ movzx ebx, cx
+ movd xmm2, ebx
+ movzx ebx, cx
+ movzx ecx, cx
+ movd xmm4, ecx
+ movzx ecx, si
+ movd xmm5, ecx
+ movzx ecx, si
+ movd xmm6, ecx
+ movzx ecx, si
+ movd xmm7, ecx
+ movzx ecx, si
+ movdqa [esp+432-336], xmm0
+ movd xmm0, ecx
+
+ movsx cx, byte [edx+3]
+ movsx dx, byte [edx+2]
+ movd xmm3, ebx
+ punpcklwd xmm0, xmm4
+ movzx esi, cx
+ punpcklwd xmm6, xmm2
+ punpcklwd xmm5, xmm1
+ punpcklwd xmm0, xmm6
+ punpcklwd xmm7, xmm3
+ punpcklwd xmm7, xmm5
+ punpcklwd xmm0, xmm7
+ movdqa [esp+432-400], xmm0
+; Expand dx = pTC[2] and cx = pTC[3] into xmm1 = {tc2 x4, tc3 x4} (high half)
+ movd xmm0, esi
+ movzx esi, cx
+ movd xmm2, esi
+ movzx esi, cx
+ movzx ecx, cx
+ movd xmm4, ecx
+ movzx ecx, dx
+ movd xmm3, esi
+ movd xmm5, ecx
+ punpcklwd xmm5, xmm0
+
+; xmm0 = zero vector from here on (until the final high-half section)
+ movdqa xmm0, [esp+432-384]
+ movzx ecx, dx
+ movd xmm6, ecx
+ movzx ecx, dx
+ movzx edx, dx
+ punpcklwd xmm6, xmm2
+ movd xmm7, ecx
+ movd xmm1, edx
+
+; Low halves widened to words: xmm2 = p1, xmm3 = p0, xmm4 = p2, xmm6 = q0,
+; q1 -> [esp+432-240], q2 -> [esp+432-352], p2 -> [esp+432-272]
+ movdqa xmm2, [esp+448-208]
+ punpcklbw xmm2, xmm0
+
+; edx = 4, broadcast later as the rounding constant
+ mov ecx, 4
+ movsx edx, cx
+ punpcklwd xmm7, xmm3
+ punpcklwd xmm7, xmm5
+ movdqa xmm5, [esp+496-208]
+ movdqa xmm3, [esp+464-208]
+ punpcklbw xmm5, xmm0
+ movdqa [esp+432-240], xmm5
+ movdqa xmm5, [esp+512-208]
+ punpcklbw xmm5, xmm0
+ movdqa [esp+432-352], xmm5
+ punpcklwd xmm1, xmm4
+ movdqa xmm4, [esp+432-208]
+ punpcklwd xmm1, xmm6
+ movdqa xmm6, [esp+480-208]
+ punpcklwd xmm1, xmm7
+ punpcklbw xmm6, xmm0
+ punpcklbw xmm3, xmm0
+ punpcklbw xmm4, xmm0
+; ap (low) = beta > |p0 - p2| -> [esp+432-288]; xmm4 becomes beta
+ movdqa xmm7, xmm3
+ psubw xmm7, xmm4
+ pabsw xmm7, xmm7
+ movdqa [esp+432-272], xmm4
+ movdqa xmm4, [esp+432-336]
+ movdqa xmm5, xmm4
+ pcmpgtw xmm5, xmm7
+ movdqa [esp+432-288], xmm5
+; aq (low) = beta > |q0 - q2| -> [esp+432-256]
+ movdqa xmm7, xmm6
+ psubw xmm7, [esp+432-352]
+ pabsw xmm7, xmm7
+ movdqa xmm5, xmm4
+ pcmpgtw xmm5, xmm7
+ movdqa [esp+432-256], xmm5
+; avg (low) = (p0 + q0 + 1) >> 1 -> [esp+432-304]
+ movdqa xmm5, xmm3
+ pavgw xmm5, xmm6
+ movdqa [esp+432-304], xmm5
+; tc (low) = tc0 - ap - aq (the masks are 0 or -1) -> [esp+432-224]
+ movdqa xmm5, [esp+432-400]
+ psubw xmm5, [esp+432-288]
+ psubw xmm5, [esp+432-256]
+ movdqa [esp+432-224], xmm5
+; q0 - p0 (low) -> [esp+432-384]; q0 (low) -> [esp+432-32]
+ movdqa xmm5, xmm6
+ psubw xmm5, xmm3
+ movdqa [esp+432-32], xmm6
+ psubw xmm6, [esp+432-240]
+ movdqa xmm7, xmm5
+ movdqa [esp+432-384], xmm5
+; mask (low): alpha > |q0-p0|, beta > |q0-q1|, beta > |p0-p1|, tc0 >= 0
+; -> [esp+432-320]
+ movdqa xmm5, [esp+432-112]
+ pabsw xmm7, xmm7
+ pcmpgtw xmm5, xmm7
+ pabsw xmm6, xmm6
+ movdqa xmm7, xmm4
+ pcmpgtw xmm7, xmm6
+
+ pand xmm5, xmm7
+ movdqa xmm6, xmm3
+ psubw xmm6, xmm2
+ pabsw xmm6, xmm6
+ movdqa xmm7, xmm4
+ pcmpgtw xmm7, xmm6
+ movdqa xmm6, [esp+432-400]
+ pand xmm5, xmm7
+ movdqa xmm7, xmm6
+ pcmpeqw xmm6, xmm0
+ pcmpgtw xmm7, xmm0
+ por xmm7, xmm6
+ pand xmm5, xmm7
+ movdqa [esp+432-320], xmm5
+; Rounding constant 4 broadcast -> [esp+432-336] (beta stays live in xmm4)
+ movd xmm5, edx
+ movdqa xmm6, xmm5
+ punpcklwd xmm6, xmm5
+ pshufd xmm5, xmm6, 0
+ movdqa [esp+432-336], xmm5
+; delta (low) = clamp((((q0-p0)<<2) + (p1-q1) + 4) >> 3, -tc, tc) & mask
+; -> [esp+432-64]
+ movdqa xmm5, [esp+432-224]
+ movdqa [esp+432-368], xmm5
+ movdqa xmm6, xmm0
+ psubw xmm6, xmm5
+ movdqa xmm5, [esp+432-384]
+ psllw xmm5, 2
+ movdqa xmm7, xmm2
+ psubw xmm7, [esp+432-240]
+ paddw xmm7, xmm5
+ paddw xmm7, [esp+432-336]
+ movdqa xmm5, [esp+432-368]
+ psraw xmm7, 3
+ pmaxsw xmm6, xmm7
+ pminsw xmm5, xmm6
+
+ pand xmm5, [esp+432-320]
+ movdqa xmm6, [esp+432-400]
+ movdqa [esp+432-64], xmm5
+; p1 adjustment (low) = clamp((p2 + avg - 2*p1) >> 1, -tc0, tc0) & mask & ap
+; -> [esp+432-96]
+ movdqa [esp+432-384], xmm6
+ movdqa xmm5, xmm0
+ psubw xmm5, xmm6
+ movdqa [esp+432-368], xmm5
+ movdqa xmm6, xmm5
+ movdqa xmm5, [esp+432-272]
+ paddw xmm5, [esp+432-304]
+ movdqa xmm7, xmm2
+ paddw xmm7, xmm2
+ psubw xmm5, xmm7
+ psraw xmm5, 1
+ pmaxsw xmm6, xmm5
+ movdqa xmm5, [esp+432-384]
+ pminsw xmm5, xmm6
+
+ pand xmm5, [esp+432-320]
+ pand xmm5, [esp+432-288]
+ movdqa xmm6, [esp+432-240]
+ movdqa [esp+432-96], xmm5
+; q1 adjustment (low) = clamp((q2 + avg - 2*q1) >> 1, -tc0, tc0) & mask & aq
+; -> [esp+432-48]
+ movdqa xmm5, [esp+432-352]
+ paddw xmm5, [esp+432-304]
+ movdqa xmm7, xmm6
+ paddw xmm7, xmm6
+ movdqa xmm6, [esp+432-368]
+ psubw xmm5, xmm7
+
+ movdqa xmm7, [esp+496-208]
+ psraw xmm5, 1
+ pmaxsw xmm6, xmm5
+ movdqa xmm5, [esp+432-400]
+ pminsw xmm5, xmm6
+ pand xmm5, [esp+432-320]
+ pand xmm5, [esp+432-256]
+; High halves widened to words: q1 -> [esp+432-352], p1 -> [esp+432-368],
+; q2 -> [esp+432-384], p0 -> [esp+432-400], p2 -> [esp+432-16],
+; q0 -> xmm6 and [esp+432-80]
+ movdqa xmm6, [esp+448-208]
+ punpckhbw xmm7, xmm0
+ movdqa [esp+432-352], xmm7
+
+ movdqa xmm7, [esp+512-208]
+ punpckhbw xmm6, xmm0
+ movdqa [esp+432-48], xmm5
+ movdqa xmm5, [esp+432-208]
+ movdqa [esp+432-368], xmm6
+ movdqa xmm6, [esp+464-208]
+ punpckhbw xmm7, xmm0
+ punpckhbw xmm5, xmm0
+ movdqa [esp+432-384], xmm7
+ punpckhbw xmm6, xmm0
+ movdqa [esp+432-400], xmm6
+
+; ap (high) = beta > |p0 - p2| -> [esp+432-288]
+ movdqa xmm7, [esp+432-400]
+ movdqa xmm6, [esp+480-208]
+ psubw xmm7, xmm5
+ movdqa [esp+432-16], xmm5
+ pabsw xmm7, xmm7
+ punpckhbw xmm6, xmm0
+ movdqa xmm5, xmm4
+ pcmpgtw xmm5, xmm7
+ movdqa [esp+432-288], xmm5
+
+; aq (high) = beta > |q0 - q2| -> [esp+432-256]
+ movdqa xmm7, xmm6
+ psubw xmm7, [esp+432-384]
+ pabsw xmm7, xmm7
+ movdqa xmm5, xmm4
+ pcmpgtw xmm5, xmm7
+ movdqa [esp+432-256], xmm5
+
+; avg (high) -> [esp+432-304]
+ movdqa xmm5, [esp+432-400]
+ movdqa [esp+432-80], xmm6
+ pavgw xmm5, xmm6
+ movdqa [esp+432-304], xmm5
+
+; tc (high) = tc0 - ap - aq -> [esp+432-224]; q0 - p0 (high) -> [esp+432-272]
+ movdqa xmm5, xmm1
+ psubw xmm5, [esp+432-288]
+ psubw xmm5, [esp+432-256]
+ movdqa [esp+432-224], xmm5
+ movdqa xmm5, xmm6
+ psubw xmm5, [esp+432-400]
+ psubw xmm6, [esp+432-352]
+ movdqa [esp+432-272], xmm5
+; mask (high), same four conditions as the low half -> [esp+432-320]
+ movdqa xmm7, xmm5
+ movdqa xmm5, [esp+432-112]
+ pabsw xmm7, xmm7
+ pcmpgtw xmm5, xmm7
+ movdqa xmm7, xmm4
+ pabsw xmm6, xmm6
+ pcmpgtw xmm7, xmm6
+ movdqa xmm6, [esp+432-368]
+
+ pand xmm5, xmm7
+ movdqa xmm7, [esp+432-400]
+ psubw xmm7, xmm6
+ psubw xmm6, [esp+432-352]
+ pabsw xmm7, xmm7
+ pcmpgtw xmm4, xmm7
+ pand xmm5, xmm4
+
+; xmm2 = p1' (low) = p1 + p1 adjustment
+ paddw xmm2, [esp+432-96]
+ movdqa xmm4, xmm1
+ pcmpgtw xmm4, xmm0
+ movdqa xmm7, xmm1
+ pcmpeqw xmm7, xmm0
+ por xmm4, xmm7
+ pand xmm5, xmm4
+ movdqa xmm4, [esp+432-224]
+ movdqa [esp+432-320], xmm5
+; delta (high) -> [esp+432-272]; xmm0 becomes -tc0 (high), also kept in
+; [esp+432-336] after the rounding constant has been consumed
+ movdqa xmm5, [esp+432-272]
+ movdqa xmm7, xmm0
+ psubw xmm7, xmm4
+ psubw xmm0, xmm1
+ psllw xmm5, 2
+ paddw xmm6, xmm5
+ paddw xmm6, [esp+432-336]
+ movdqa xmm5, [esp+432-368]
+ movdqa [esp+432-336], xmm0
+ psraw xmm6, 3
+ pmaxsw xmm7, xmm6
+ pminsw xmm4, xmm7
+ pand xmm4, [esp+432-320]
+; p1 adjustment (high) -> xmm7; p1' packed into xmm2
+ movdqa xmm6, xmm0
+ movdqa xmm0, [esp+432-16]
+ paddw xmm0, [esp+432-304]
+ movdqa [esp+432-272], xmm4
+ movdqa xmm4, [esp+432-368]
+ paddw xmm4, xmm4
+ psubw xmm0, xmm4
+
+ movdqa xmm4, [esp+432-64]
+ psraw xmm0, 1
+ pmaxsw xmm6, xmm0
+ movdqa xmm0, [esp+432-400]
+ movdqa xmm7, xmm1
+ pminsw xmm7, xmm6
+ movdqa xmm6, [esp+432-320]
+ pand xmm7, xmm6
+ pand xmm7, [esp+432-288]
+ paddw xmm5, xmm7
+ packuswb xmm2, xmm5
+; p0' = p0 + delta, packed into xmm3
+ movdqa xmm5, [esp+432-272]
+ paddw xmm0, xmm5
+ paddw xmm3, xmm4
+ packuswb xmm3, xmm0
+
+; q0' = q0 - delta, packed and parked in [esp+480-208]
+ movdqa xmm0, [esp+432-32]
+ psubw xmm0, xmm4
+ movdqa xmm4, [esp+432-80]
+ psubw xmm4, xmm5
+
+; q1' (low) = q1 + q1 adjustment -> xmm5; q1 adjustment (high) -> xmm1
+ movdqa xmm5, [esp+432-240]
+ paddw xmm5, [esp+432-48]
+ packuswb xmm0, xmm4
+ movdqa xmm4, [esp+432-384]
+ paddw xmm4, [esp+432-304]
+ movdqa [esp+480-208], xmm0
+ movdqa xmm0, [esp+432-352]
+ movdqa xmm7, xmm0
+ paddw xmm0, xmm0
+
+; ecx = p0 row address, edx = q1 row address
+ mov ecx, dword [esp+432-408]
+
+ mov edx, dword [esp+432-404]
+ psubw xmm4, xmm0
+ movdqa xmm0, [esp+432-336]
+; Store p1' to the p1 row (edi = pPix - 2*iStride)
+ movdqa [edi], xmm2
+ psraw xmm4, 1
+ pmaxsw xmm0, xmm4
+ pminsw xmm1, xmm0
+ movdqa xmm0, [esp+480-208]
+
+; After this pop the slot offsets are 4 smaller: [esp+428-256] is the aq
+; (high) mask that was stored as [esp+432-256]
+ pop edi
+ pand xmm1, xmm6
+ pand xmm1, [esp+428-256]
+; Store p0', q0' and q1' to their rows
+ movdqa [ecx], xmm3
+ paddw xmm7, xmm1
+ pop esi
+ packuswb xmm5, xmm7
+ movdqa [eax], xmm0
+ movdqa [edx], xmm5
+ pop ebx
+ mov esp, ebp
+ pop ebp
+ ret
+
+
+;*******************************************************************************
+; void DeblockLumaEq4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
+; int32_t iBeta)
+;*******************************************************************************
+
+WELS_EXTERN DeblockLumaEq4V_sse2
+
+ALIGN 16
+
+DeblockLumaEq4V_sse2:
+
+ push ebp
+ mov ebp, esp
+ and esp, -16 ; fffffff0H
+ sub esp, 628 ; 00000274H
+ mov eax, dword [ebp+8]
+ mov ecx, dword [ebp+12]
+ push ebx
+ push esi
+
+ lea edx, [ecx*4]
+ pxor xmm0, xmm0
+ movdqa xmm2, xmm0
+
+ movdqa xmm0, [ecx+eax]
+ mov esi, eax
+ sub esi, edx
+ movdqa xmm3, [esi]
+ movdqa xmm5, [eax]
+ push edi
+ lea edi, [ecx+ecx]
+ lea ebx, [ecx+ecx*2]
+ mov dword [esp+640-600], edi
+ mov esi, eax
+ sub esi, edi
+ movdqa xmm1, [esi]
+ movdqa [esp+720-272], xmm0
+ mov edi, eax
+ sub edi, ecx
+ movdqa xmm4, [edi]
+ add ecx, eax
+ mov dword [esp+640-596], ecx
+
+ mov ecx, dword [esp+640-600]
+ movdqa xmm0, [ecx+eax]
+ movdqa [esp+736-272], xmm0
+
+ movdqa xmm0, [eax+ebx]
+ mov edx, eax
+ sub edx, ebx
+
+ movsx ebx, word [ebp+16]
+ movdqa xmm6, [edx]
+ add ecx, eax
+ movdqa [esp+752-272], xmm0
+ movd xmm0, ebx
+
+ movsx ebx, word [ebp+20]
+ movdqa xmm7, xmm0
+ punpcklwd xmm7, xmm0
+ pshufd xmm0, xmm7, 0
+ movdqa [esp+640-320], xmm0
+ movd xmm0, ebx
+ movdqa xmm7, xmm0
+ punpcklwd xmm7, xmm0
+ pshufd xmm0, xmm7, 0
+
+ movdqa xmm7, [esp+736-272]
+ punpcklbw xmm7, xmm2
+ movdqa [esp+640-416], xmm7
+ movdqa [esp+640-512], xmm0
+ movdqa xmm0, xmm1
+ movdqa [esp+672-272], xmm1
+ movdqa xmm1, xmm4
+ movdqa [esp+704-272], xmm5
+ punpcklbw xmm5, xmm2
+ punpcklbw xmm1, xmm2
+
+ movdqa xmm7, xmm5
+ psubw xmm7, xmm1
+ pabsw xmm7, xmm7
+ movdqa [esp+640-560], xmm7
+ punpcklbw xmm0, xmm2
+ movdqa [esp+688-272], xmm4
+ movdqa xmm4, [esp+720-272]
+ movdqa [esp+640-480], xmm0
+
+ movdqa xmm7, xmm1
+ psubw xmm7, xmm0
+
+ movdqa xmm0, [esp+640-512]
+ pabsw xmm7, xmm7
+ punpcklbw xmm4, xmm2
+ pcmpgtw xmm0, xmm7
+ movdqa [esp+640-384], xmm4
+ movdqa xmm7, xmm5
+ psubw xmm7, xmm4
+ movdqa xmm4, [esp+640-512]
+ movdqa [esp+656-272], xmm6
+ punpcklbw xmm6, xmm2
+ pabsw xmm7, xmm7
+ movdqa [esp+640-48], xmm2
+ movdqa [esp+640-368], xmm6
+ movdqa [esp+640-144], xmm1
+ movdqa [esp+640-400], xmm5
+ pcmpgtw xmm4, xmm7
+ pand xmm0, xmm4
+ movdqa xmm4, [esp+640-320]
+ pcmpgtw xmm4, [esp+640-560]
+ pand xmm0, xmm4
+
+ mov ebx, 2
+ movsx ebx, bx
+ movd xmm4, ebx
+ movdqa xmm7, xmm4
+ punpcklwd xmm7, xmm4
+ movdqa xmm4, [esp+640-320]
+ psraw xmm4, 2
+ pshufd xmm7, xmm7, 0
+ paddw xmm4, xmm7
+ movdqa [esp+640-576], xmm4
+ pcmpgtw xmm4, [esp+640-560]
+ movdqa [esp+640-560], xmm4
+
+ movdqa xmm4, [esp+640-512]
+ movdqa [esp+640-624], xmm7
+ movdqa xmm7, xmm1
+ psubw xmm7, xmm6
+ pabsw xmm7, xmm7
+ pcmpgtw xmm4, xmm7
+
+ pand xmm4, [esp+640-560]
+ movdqa [esp+640-544], xmm4
+ movdqa xmm4, [esp+640-512]
+ movdqa xmm7, xmm5
+ psubw xmm7, [esp+640-416]
+ pabsw xmm7, xmm7
+ pcmpgtw xmm4, xmm7
+
+ pand xmm4, [esp+640-560]
+ movdqa [esp+640-560], xmm4
+
+ movdqa xmm4, [esp+640-544]
+ pandn xmm4, xmm6
+ movdqa [esp+640-16], xmm4
+ mov ebx, 4
+ movsx ebx, bx
+ movd xmm4, ebx
+ movdqa xmm7, xmm4
+ punpcklwd xmm7, xmm4
+ movdqa xmm4, xmm3
+ punpcklbw xmm4, xmm2
+ psllw xmm4, 1
+ paddw xmm4, xmm6
+ paddw xmm4, xmm6
+ paddw xmm4, xmm6
+ paddw xmm4, [esp+640-480]
+
+ movdqa xmm6, [esp+640-560]
+ pshufd xmm7, xmm7, 0
+ paddw xmm4, xmm1
+ movdqa [esp+640-592], xmm7
+ paddw xmm4, xmm5
+ paddw xmm4, xmm7
+ movdqa xmm7, [esp+640-416]
+ pandn xmm6, xmm7
+ movdqa [esp+640-80], xmm6
+ movdqa xmm6, [esp+752-272]
+ punpcklbw xmm6, xmm2
+ psllw xmm6, 1
+ paddw xmm6, xmm7
+ paddw xmm6, xmm7
+ paddw xmm6, xmm7
+ paddw xmm6, [esp+640-384]
+
+ movdqa xmm7, [esp+640-480]
+ paddw xmm6, xmm5
+ paddw xmm6, xmm1
+ paddw xmm6, [esp+640-592]
+ psraw xmm6, 3
+ pand xmm6, [esp+640-560]
+ movdqa [esp+640-112], xmm6
+ movdqa xmm6, [esp+640-544]
+ pandn xmm6, xmm7
+ movdqa [esp+640-336], xmm6
+ movdqa xmm6, [esp+640-544]
+ movdqa [esp+640-528], xmm6
+ movdqa xmm6, [esp+640-368]
+ paddw xmm6, xmm7
+ movdqa xmm7, xmm1
+ psraw xmm4, 3
+ pand xmm4, [esp+640-544]
+ paddw xmm7, xmm5
+ paddw xmm6, xmm7
+ paddw xmm6, [esp+640-624]
+ movdqa xmm7, [esp+640-528]
+
+ paddw xmm5, xmm1
+ psraw xmm6, 2
+ pand xmm7, xmm6
+
+ movdqa xmm6, [esp+640-384]
+ movdqa [esp+640-64], xmm7
+ movdqa xmm7, [esp+640-560]
+ pandn xmm7, xmm6
+ movdqa [esp+640-304], xmm7
+ movdqa xmm7, [esp+640-560]
+ movdqa [esp+640-528], xmm7
+ movdqa xmm7, [esp+640-416]
+ paddw xmm7, xmm6
+ paddw xmm7, xmm5
+ paddw xmm7, [esp+640-624]
+ movdqa xmm5, [esp+640-528]
+ psraw xmm7, 2
+ pand xmm5, xmm7
+ movdqa [esp+640-32], xmm5
+
+ movdqa xmm5, [esp+640-544]
+ movdqa [esp+640-528], xmm5
+ movdqa xmm5, [esp+640-480]
+ movdqa xmm7, xmm5
+ paddw xmm7, xmm5
+ movdqa xmm5, xmm1
+ paddw xmm5, xmm6
+ paddw xmm6, [esp+640-592]
+ paddw xmm7, xmm5
+ paddw xmm7, [esp+640-624]
+ movdqa xmm5, [esp+640-528]
+ psraw xmm7, 2
+ pandn xmm5, xmm7
+ movdqa xmm7, [esp+640-480]
+ paddw xmm7, xmm1
+ paddw xmm7, [esp+640-400]
+ movdqa xmm1, [esp+640-544]
+ movdqa [esp+640-352], xmm5
+ movdqa xmm5, [esp+640-368]
+ psllw xmm7, 1
+ paddw xmm7, xmm6
+ paddw xmm5, xmm7
+
+ movdqa xmm7, [esp+640-400]
+ psraw xmm5, 3
+ pand xmm1, xmm5
+ movdqa xmm5, [esp+640-480]
+ movdqa [esp+640-96], xmm1
+ movdqa xmm1, [esp+640-560]
+ movdqa [esp+640-528], xmm1
+ movdqa xmm1, [esp+640-384]
+ movdqa xmm6, xmm1
+ paddw xmm6, xmm1
+ paddw xmm1, [esp+640-400]
+ paddw xmm1, [esp+640-144]
+ paddw xmm7, xmm5
+ paddw xmm5, [esp+640-592]
+ paddw xmm6, xmm7
+ paddw xmm6, [esp+640-624]
+ movdqa xmm7, [esp+640-528]
+ psraw xmm6, 2
+ psllw xmm1, 1
+ paddw xmm1, xmm5
+
+ movdqa xmm5, [esp+656-272]
+ pandn xmm7, xmm6
+ movdqa xmm6, [esp+640-416]
+ paddw xmm6, xmm1
+ movdqa xmm1, [esp+640-560]
+ psraw xmm6, 3
+ pand xmm1, xmm6
+
+ movdqa xmm6, [esp+704-272]
+ movdqa [esp+640-128], xmm1
+ movdqa xmm1, [esp+672-272]
+ punpckhbw xmm1, xmm2
+ movdqa [esp+640-448], xmm1
+ movdqa xmm1, [esp+688-272]
+ punpckhbw xmm1, xmm2
+ punpckhbw xmm6, xmm2
+ movdqa [esp+640-288], xmm7
+ punpckhbw xmm5, xmm2
+ movdqa [esp+640-496], xmm1
+ movdqa [esp+640-432], xmm6
+
+ movdqa xmm7, [esp+720-272]
+ punpckhbw xmm7, xmm2
+ movdqa [esp+640-464], xmm7
+
+ movdqa xmm7, [esp+736-272]
+ punpckhbw xmm7, xmm2
+ movdqa [esp+640-528], xmm7
+
+ movdqa xmm7, xmm6
+
+ psubw xmm6, [esp+640-464]
+ psubw xmm7, xmm1
+ pabsw xmm7, xmm7
+ movdqa [esp+640-560], xmm7
+ por xmm4, [esp+640-16]
+ pabsw xmm6, xmm6
+ movdqa xmm7, xmm1
+ psubw xmm7, [esp+640-448]
+
+ movdqa xmm1, [esp+640-512]
+ pabsw xmm7, xmm7
+ pcmpgtw xmm1, xmm7
+ movdqa xmm7, [esp+640-512]
+ pcmpgtw xmm7, xmm6
+ movdqa xmm6, [esp+640-320]
+ pand xmm1, xmm7
+ movdqa xmm7, [esp+640-560]
+ pcmpgtw xmm6, xmm7
+ pand xmm1, xmm6
+
+ movdqa xmm6, [esp+640-576]
+ pcmpgtw xmm6, xmm7
+
+ movdqa xmm7, [esp+640-496]
+ punpckhbw xmm3, xmm2
+ movdqa [esp+640-560], xmm6
+ movdqa xmm6, [esp+640-512]
+ psubw xmm7, xmm5
+ pabsw xmm7, xmm7
+ pcmpgtw xmm6, xmm7
+
+ pand xmm6, [esp+640-560]
+ movdqa xmm7, [esp+640-432]
+ psubw xmm7, [esp+640-528]
+
+ psllw xmm3, 1
+ movdqa [esp+640-544], xmm6
+ movdqa xmm6, [esp+640-512]
+
+ movdqa xmm2, [esp+640-544]
+ paddw xmm3, xmm5
+ paddw xmm3, xmm5
+ paddw xmm3, xmm5
+ paddw xmm3, [esp+640-448]
+ paddw xmm3, [esp+640-496]
+ pabsw xmm7, xmm7
+ pcmpgtw xmm6, xmm7
+ pand xmm6, [esp+640-560]
+ movdqa [esp+640-560], xmm6
+
+ movdqa xmm6, xmm0
+ pand xmm6, xmm4
+ movdqa xmm4, xmm0
+ pandn xmm4, [esp+640-368]
+ por xmm6, xmm4
+ movdqa xmm4, [esp+640-432]
+ paddw xmm3, xmm4
+ paddw xmm3, [esp+640-592]
+ psraw xmm3, 3
+ pand xmm3, xmm2
+ pandn xmm2, xmm5
+ por xmm3, xmm2
+ movdqa xmm7, xmm1
+ pand xmm7, xmm3
+ movdqa xmm3, [esp+640-64]
+ por xmm3, [esp+640-336]
+ movdqa xmm2, xmm1
+ pandn xmm2, xmm5
+ por xmm7, xmm2
+
+ movdqa xmm2, xmm0
+ pand xmm2, xmm3
+ movdqa xmm3, xmm0
+ pandn xmm3, [esp+640-480]
+ por xmm2, xmm3
+ packuswb xmm6, xmm7
+ movdqa [esp+640-336], xmm2
+ movdqa [esp+656-272], xmm6
+ movdqa xmm6, [esp+640-544]
+ movdqa xmm2, xmm5
+ paddw xmm2, [esp+640-448]
+ movdqa xmm3, xmm1
+ movdqa xmm7, [esp+640-496]
+ paddw xmm7, xmm4
+ paddw xmm2, xmm7
+ paddw xmm2, [esp+640-624]
+ movdqa xmm7, [esp+640-544]
+ psraw xmm2, 2
+ pand xmm6, xmm2
+ movdqa xmm2, [esp+640-448]
+ pandn xmm7, xmm2
+ por xmm6, xmm7
+ pand xmm3, xmm6
+ movdqa xmm6, xmm1
+ pandn xmm6, xmm2
+ paddw xmm2, [esp+640-496]
+ paddw xmm2, xmm4
+ por xmm3, xmm6
+ movdqa xmm6, [esp+640-336]
+ packuswb xmm6, xmm3
+ psllw xmm2, 1
+ movdqa [esp+672-272], xmm6
+ movdqa xmm6, [esp+640-96]
+ por xmm6, [esp+640-352]
+
+ movdqa xmm3, xmm0
+ pand xmm3, xmm6
+ movdqa xmm6, xmm0
+ pandn xmm6, [esp+640-144]
+ por xmm3, xmm6
+ movdqa xmm6, [esp+640-544]
+ movdqa [esp+640-352], xmm3
+ movdqa xmm3, [esp+640-464]
+ paddw xmm3, [esp+640-592]
+ paddw xmm2, xmm3
+ movdqa xmm3, [esp+640-448]
+ paddw xmm5, xmm2
+ movdqa xmm2, [esp+640-496]
+ psraw xmm5, 3
+ pand xmm6, xmm5
+ movdqa xmm5, [esp+640-464]
+ paddw xmm2, xmm5
+ paddw xmm5, [esp+640-432]
+ movdqa xmm4, xmm3
+ paddw xmm4, xmm3
+ paddw xmm4, xmm2
+ paddw xmm4, [esp+640-624]
+ movdqa xmm2, [esp+640-544]
+ paddw xmm3, [esp+640-592]
+ psraw xmm4, 2
+ pandn xmm2, xmm4
+ por xmm6, xmm2
+ movdqa xmm7, xmm1
+ pand xmm7, xmm6
+ movdqa xmm6, [esp+640-496]
+ movdqa xmm2, xmm1
+ pandn xmm2, xmm6
+ por xmm7, xmm2
+ movdqa xmm2, [esp+640-352]
+ packuswb xmm2, xmm7
+ movdqa [esp+688-272], xmm2
+ movdqa xmm2, [esp+640-128]
+ por xmm2, [esp+640-288]
+
+ movdqa xmm4, xmm0
+ pand xmm4, xmm2
+ paddw xmm5, xmm6
+ movdqa xmm2, xmm0
+ pandn xmm2, [esp+640-400]
+ por xmm4, xmm2
+ movdqa xmm2, [esp+640-528]
+ psllw xmm5, 1
+ paddw xmm5, xmm3
+ movdqa xmm3, [esp+640-560]
+ paddw xmm2, xmm5
+ psraw xmm2, 3
+ movdqa [esp+640-288], xmm4
+ movdqa xmm4, [esp+640-560]
+ pand xmm4, xmm2
+ movdqa xmm2, [esp+640-464]
+ movdqa xmm5, xmm2
+ paddw xmm5, xmm2
+ movdqa xmm2, [esp+640-432]
+ paddw xmm2, [esp+640-448]
+ movdqa xmm7, xmm1
+ paddw xmm5, xmm2
+ paddw xmm5, [esp+640-624]
+ movdqa xmm6, [esp+640-560]
+ psraw xmm5, 2
+ pandn xmm3, xmm5
+ por xmm4, xmm3
+ movdqa xmm3, [esp+640-32]
+ por xmm3, [esp+640-304]
+ pand xmm7, xmm4
+ movdqa xmm4, [esp+640-432]
+ movdqa xmm5, [esp+640-464]
+ movdqa xmm2, xmm1
+ pandn xmm2, xmm4
+ paddw xmm4, [esp+640-496]
+ por xmm7, xmm2
+ movdqa xmm2, [esp+640-288]
+ packuswb xmm2, xmm7
+ movdqa [esp+704-272], xmm2
+
+ movdqa xmm2, xmm0
+ pand xmm2, xmm3
+ movdqa xmm3, xmm0
+ pandn xmm3, [esp+640-384]
+ por xmm2, xmm3
+ movdqa [esp+640-304], xmm2
+ movdqa xmm2, [esp+640-528]
+ movdqa xmm3, xmm2
+ paddw xmm3, [esp+640-464]
+ paddw xmm3, xmm4
+ paddw xmm3, [esp+640-624]
+ psraw xmm3, 2
+ pand xmm6, xmm3
+ movdqa xmm3, [esp+640-560]
+ movdqa xmm4, xmm3
+ pandn xmm4, xmm5
+ por xmm6, xmm4
+ movdqa xmm7, xmm1
+ pand xmm7, xmm6
+ movdqa xmm6, [esp+640-304]
+ movdqa xmm4, xmm1
+ pandn xmm4, xmm5
+ por xmm7, xmm4
+
+ movdqa xmm4, xmm0
+ pandn xmm0, [esp+640-416]
+ packuswb xmm6, xmm7
+ movdqa xmm7, [esp+640-112]
+ por xmm7, [esp+640-80]
+ pand xmm4, xmm7
+ por xmm4, xmm0
+ movdqa xmm0, [esp+752-272]
+ punpckhbw xmm0, [esp+640-48]
+ psllw xmm0, 1
+ paddw xmm0, xmm2
+ paddw xmm0, xmm2
+ paddw xmm0, xmm2
+ paddw xmm0, xmm5
+ paddw xmm0, [esp+640-432]
+ paddw xmm0, [esp+640-496]
+ paddw xmm0, [esp+640-592]
+ psraw xmm0, 3
+ pand xmm0, xmm3
+ movdqa xmm7, xmm1
+ pandn xmm3, xmm2
+ por xmm0, xmm3
+ pand xmm7, xmm0
+
+ movdqa xmm0, [esp+656-272]
+ movdqa [edx], xmm0
+
+ movdqa xmm0, [esp+672-272]
+
+ mov edx, dword [esp+640-596]
+ movdqa [esi], xmm0
+ movdqa xmm0, [esp+688-272]
+ movdqa [edi], xmm0
+ movdqa xmm0, [esp+704-272]
+
+ pop edi
+ pandn xmm1, xmm2
+ movdqa [eax], xmm0
+ por xmm7, xmm1
+ pop esi
+ packuswb xmm4, xmm7
+ movdqa [edx], xmm6
+ movdqa [ecx], xmm4
+ pop ebx
+ mov esp, ebp
+ pop ebp
+ ret
+
+%endif
+
+
+
+;********************************************************************************
+; Gathers 16 rows of 8 bytes from pPixY (rows iStride apart) and stores the rearranged block to pDst.
+; void DeblockLumaTransposeH2V_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pDst);
+; pDst is written as 8 x 16 bytes with movdqa, so it must be 16-byte aligned.
+;********************************************************************************
+
+WELS_EXTERN DeblockLumaTransposeH2V_sse2
+
+ALIGN 16
+
+DeblockLumaTransposeH2V_sse2:
+ push r3 ; r3-r5 are used as scratch below and restored before ret
+ push r4
+ push r5
+
+%assign push_num 3
+ LOAD_3_PARA ; asm_inc.asm macro (not shown); presumably puts pPixY/iStride/pDst in r0/r1/r2 - confirm
+
+ SIGN_EXTENTION r1, r1d ; asm_inc.asm macro; presumably widens the 32-bit stride to pointer width - confirm
+
+ mov r5, r7 ; keep the incoming r7 (presumably the stack-pointer alias) for the epilogue
+ mov r3, r7
+ and r3, 0Fh
+ sub r7, r3 ; round r7 down to a 16-byte boundary ...
+ sub r7, 10h ; ... and reserve one aligned 16-byte scratch slot at [r7]
+
+ lea r3, [r0 + r1 * 8] ; r3 -> source row 8
+ lea r4, [r1 * 3] ; r4 = 3 * stride
+
+ movq xmm0, [r0] ; xmmN = row N (low qword) | row N+8 (high qword); xmm7 is the load temp
+ movq xmm7, [r3]
+ punpcklqdq xmm0, xmm7
+ movq xmm1, [r0 + r1]
+ movq xmm7, [r3 + r1]
+ punpcklqdq xmm1, xmm7
+ movq xmm2, [r0 + r1*2]
+ movq xmm7, [r3 + r1*2]
+ punpcklqdq xmm2, xmm7
+ movq xmm3, [r0 + r4]
+ movq xmm7, [r3 + r4]
+ punpcklqdq xmm3, xmm7
+
+ lea r0, [r0 + r1 * 4] ; r0 -> row 4
+ lea r3, [r3 + r1 * 4] ; r3 -> row 12
+ movq xmm4, [r0]
+ movq xmm7, [r3]
+ punpcklqdq xmm4, xmm7
+ movq xmm5, [r0 + r1]
+ movq xmm7, [r3 + r1]
+ punpcklqdq xmm5, xmm7
+ movq xmm6, [r0 + r1*2]
+ movq xmm7, [r3 + r1*2]
+ punpcklqdq xmm6, xmm7
+
+ movdqa [r7], xmm0 ; park rows 0/8: xmm7 now holds data, so xmm0 serves as the load temp
+ movq xmm7, [r0 + r4]
+ movq xmm0, [r3 + r4]
+ punpcklqdq xmm7, xmm0 ; xmm7 = row 7 | row 15
+ movdqa xmm0, [r7] ; bring rows 0/8 back
+
+ SSE2_TransTwo8x8B xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r7]
+ ;pOut: m5, m3, m4, m8, m6, m2, m7, m1 (macro args m1..m8 = xmm0..xmm7; macro body is in asm_inc.asm, not shown)
+
+ movdqa [r2], xmm4 ; store in the pOut order listed above
+ movdqa [r2 + 10h], xmm2
+ movdqa [r2 + 20h], xmm3
+ movdqa [r2 + 30h], xmm7
+ movdqa [r2 + 40h], xmm5
+ movdqa [r2 + 50h], xmm1
+ movdqa [r2 + 60h], xmm6
+ movdqa [r2 + 70h], xmm0
+
+ mov r7, r5 ; drop the aligned scratch slot before popping
+ pop r5
+ pop r4
+ pop r3
+ ret ; NOTE(review): xmm6/xmm7 are modified and not restored - only matters if an ABI with callee-saved xmm6+ is targeted
+
+
+;*******************************************************************************************
+; Reads 8 x 16 bytes from pSrc (movdqa, so 16-byte aligned) and scatters them as 16 rows of 8 bytes to pPixY.
+; void DeblockLumaTransposeV2H_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pSrc);
+; Rows are written iStride apart; the register rearrangement is done by SSE2_TransTwo8x8B (asm_inc.asm, not shown).
+;*******************************************************************************************
+
+WELS_EXTERN DeblockLumaTransposeV2H_sse2
+
+ALIGN 16
+
+DeblockLumaTransposeV2H_sse2:
+ push r3 ; r3/r4 are used as scratch below and restored before ret
+ push r4
+
+%assign push_num 2
+ LOAD_3_PARA ; asm_inc.asm macro (not shown); presumably puts pPixY/iStride/pSrc in r0/r1/r2 - confirm
+
+ SIGN_EXTENTION r1, r1d ; asm_inc.asm macro; presumably widens the 32-bit stride to pointer width - confirm
+
+ mov r4, r7 ; keep the incoming r7 (presumably the stack-pointer alias) for the epilogue
+ mov r3, r7
+ and r3, 0Fh
+ sub r7, r3 ; round r7 down to a 16-byte boundary ...
+ sub r7, 10h ; ... and reserve one aligned 16-byte scratch slot at [r7]
+
+ movdqa xmm0, [r2] ; load the eight 16-byte source rows
+ movdqa xmm1, [r2 + 10h]
+ movdqa xmm2, [r2 + 20h]
+ movdqa xmm3, [r2 + 30h]
+ movdqa xmm4, [r2 + 40h]
+ movdqa xmm5, [r2 + 50h]
+ movdqa xmm6, [r2 + 60h]
+ movdqa xmm7, [r2 + 70h]
+
+ SSE2_TransTwo8x8B xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r7]
+ ;pOut: m5, m3, m4, m8, m6, m2, m7, m1 (macro args m1..m8 = xmm0..xmm7)
+
+ lea r2, [r1 * 3] ; pSrc is no longer needed: r2 = 3 * stride
+
+ movq [r0], xmm4 ; rows 0-3 <- low qwords, in pOut order
+ movq [r0 + r1], xmm2
+ movq [r0 + r1*2], xmm3
+ movq [r0 + r2], xmm7
+
+ lea r0, [r0 + r1*4] ; rows 4-7
+ movq [r0], xmm5
+ movq [r0 + r1], xmm1
+ movq [r0 + r1*2], xmm6
+ movq [r0 + r2], xmm0
+
+ psrldq xmm4, 8 ; move every high qword down so movq can store it
+ psrldq xmm2, 8
+ psrldq xmm3, 8
+ psrldq xmm7, 8
+ psrldq xmm5, 8
+ psrldq xmm1, 8
+ psrldq xmm6, 8
+ psrldq xmm0, 8
+
+ lea r0, [r0 + r1*4] ; rows 8-11
+ movq [r0], xmm4
+ movq [r0 + r1], xmm2
+ movq [r0 + r1*2], xmm3
+ movq [r0 + r2], xmm7
+
+ lea r0, [r0 + r1*4] ; rows 12-15
+ movq [r0], xmm5
+ movq [r0 + r1], xmm1
+ movq [r0 + r1*2], xmm6
+ movq [r0 + r2], xmm0
+
+
+ mov r7, r4 ; drop the aligned scratch slot before popping
+ pop r4
+ pop r3
+ ret ; NOTE(review): xmm6/xmm7 are modified and not restored - only matters if an ABI with callee-saved xmm6+ is targeted
+
--- /dev/null
+++ b/codec/common/expand_picture.asm
@@ -1,0 +1,740 @@
+;*!
+;* \copy
+;* Copyright (c) 2009-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;* expand_picture.asm
+;*
+;* Abstract
+;* mmxext/sse for expand_frame
+;*
+;* History
+;* 09/25/2009 Created
+;*
+;*
+;*************************************************************************/
+
+%include "asm_inc.asm"
+
+
+
+;***********************************************************************
+; Macros and other preprocessor constants
+;***********************************************************************
+
+;***********************************************************************
+; Local Data (Read Only)
+;***********************************************************************
+
+;SECTION .rodata pData align=16
+
+;***********************************************************************
+; Various memory constants (trigonometric values or rounding values)
+;***********************************************************************
+;%define PADDING_SIZE_ASM 32 ; PADDING_LENGTH
+
+;***********************************************************************
+; Code
+;***********************************************************************
+
+
+
+SECTION .text
+
+WELS_EXTERN ExpandPictureLuma_sse2
+WELS_EXTERN ExpandPictureChromaAlign_sse2 ; for chroma alignment
+WELS_EXTERN ExpandPictureChromaUnalign_sse2 ; for chroma unalignment
+
+;;;;;;;expanding result;;;;;;;
+
+;aaaa|attttttttttttttttb|bbbb
+;aaaa|attttttttttttttttb|bbbb
+;aaaa|attttttttttttttttb|bbbb
+;aaaa|attttttttttttttttb|bbbb
+;----------------------------
+;aaaa|attttttttttttttttb|bbbb
+;llll|l r|rrrr
+;llll|l r|rrrr
+;llll|l r|rrrr
+;llll|l r|rrrr
+;llll|l r|rrrr
+;cccc|ceeeeeeeeeeeeeeeed|dddd
+;----------------------------
+;cccc|ceeeeeeeeeeeeeeeed|dddd
+;cccc|ceeeeeeeeeeeeeeeed|dddd
+;cccc|ceeeeeeeeeeeeeeeed|dddd
+;cccc|ceeeeeeeeeeeeeeeed|dddd
+
+%macro mov_line_8x4_mmx 3 ; store 8 bytes of %3 to 4 rows: %1 = dst reg (advanced), %2 = stride reg, %3 = mm reg
+ movq [%1], %3 ; row 0
+ movq [%1+%2], %3 ; row 1
+ lea %1, [%1+2*%2]
+ movq [%1], %3 ; row 2
+ movq [%1+%2], %3 ; row 3
+ lea %1, [%1+2*%2] ; leave %1 four rows on, ready for the next 4-row group
+%endmacro
+
+%macro mov_line_end8x4_mmx 3 ; as mov_line_8x4_mmx, but for the final group: %1 = dst reg, %2 = stride reg, %3 = mm reg
+ movq [%1], %3 ; row 0
+ movq [%1+%2], %3 ; row 1
+ lea %1, [%1+2*%2]
+ movq [%1], %3 ; row 2
+ movq [%1+%2], %3 ; row 3
+ lea %1, [%1+%2] ; leave %1 on the last row written (3 rows on) instead of one row past it
+%endmacro
+
+%macro mov_line_16x4_sse2 4 ; store 16 bytes of %3 to 4 rows: %1 = dst reg (advanced), %2 = stride reg, %3 = xmm reg, %4 = u/a
+ movdq%4 [%1], %3 ; row 0; %4 selects movdqu (u) or movdqa (a)
+ movdq%4 [%1+%2], %3 ; row 1
+ lea %1, [%1+2*%2]
+ movdq%4 [%1], %3 ; row 2
+ movdq%4 [%1+%2], %3 ; row 3
+ lea %1, [%1+2*%2] ; leave %1 four rows on, ready for the next 4-row group
+%endmacro
+
+%macro mov_line_end16x4_sse2 4 ; as mov_line_16x4_sse2, but for the final group: %1 = dst reg, %2 = stride reg, %3 = xmm reg, %4 = u/a
+ movdq%4 [%1], %3 ; row 0; %4 selects movdqu (u) or movdqa (a)
+ movdq%4 [%1+%2], %3 ; row 1
+ lea %1, [%1+2*%2]
+ movdq%4 [%1], %3 ; row 2
+ movdq%4 [%1+%2], %3 ; row 3
+ lea %1, [%1+%2] ; leave %1 on the last row written (3 rows on) instead of one row past it
+%endmacro
+
+%macro mov_line_32x4_sse2 3 ; store %3 twice (32 bytes) to 4 rows, aligned: %1 = dst reg (advanced), %2 = stride reg, %3 = xmm reg
+ movdqa [%1], %3 ; row 0, bytes 0-15
+ movdqa [%1+16], %3 ; row 0, bytes 16-31
+ movdqa [%1+%2], %3 ; row 1, bytes 0-15
+ movdqa [%1+%2+16], %3 ; row 1, bytes 16-31
+ lea %1, [%1+2*%2]
+ movdqa [%1], %3 ; row 2, bytes 0-15
+ movdqa [%1+16], %3 ; row 2, bytes 16-31
+ movdqa [%1+%2], %3 ; row 3, bytes 0-15
+ movdqa [%1+%2+16], %3 ; row 3, bytes 16-31
+ lea %1, [%1+2*%2] ; leave %1 four rows on, ready for the next 4-row group
+%endmacro
+
+%macro mov_line_end32x4_sse2 3 ; as mov_line_32x4_sse2, but for the final group: %1 = dst reg, %2 = stride reg, %3 = xmm reg
+ movdqa [%1], %3 ; row 0, bytes 0-15
+ movdqa [%1+16], %3 ; row 0, bytes 16-31
+ movdqa [%1+%2], %3 ; row 1, bytes 0-15
+ movdqa [%1+%2+16], %3 ; row 1, bytes 16-31
+ lea %1, [%1+2*%2]
+ movdqa [%1], %3 ; row 2, bytes 0-15
+ movdqa [%1+16], %3 ; row 2, bytes 16-31
+ movdqa [%1+%2], %3 ; row 3, bytes 0-15
+ movdqa [%1+%2+16], %3 ; row 3, bytes 16-31
+ lea %1, [%1+%2] ; leave %1 on the last row written (3 rows on) instead of one row past it
+%endmacro
+
+%macro exp_top_bottom_sse2 1 ; fill top and bottom borders column by column; %1 = iPaddingSize [luma(32)/chroma(16)]
+ ;r2 [width] on entry; shifted down to the count of 16-byte columns below
+ ;r0 [pSrc +0], r5 [pSrc -stride] r1[-stride], 32(16) rows ;top: source row, first dst row
+ ;r3 [pSrc +(h-1)*stride], r4 [pSrc + (h+31(15))*stride],32(16) rows; bottom: source row, first dst row
+
+%if %1 == 32 ; for luma
+ sar r2, 04h ; r2 = width / 16 = number of 16-byte columns
+.top_bottom_loops:
+ ; top
+ movdqa xmm0, [r0] ; first line of picture pData
+ mov_line_16x4_sse2 r5, r1, xmm0, a ; 7 groups of 4 rows + the "end" group = 32 rows
+ mov_line_16x4_sse2 r5, r1, xmm0, a
+ mov_line_16x4_sse2 r5, r1, xmm0, a
+ mov_line_16x4_sse2 r5, r1, xmm0, a
+ mov_line_16x4_sse2 r5, r1, xmm0, a
+ mov_line_16x4_sse2 r5, r1, xmm0, a
+ mov_line_16x4_sse2 r5, r1, xmm0, a
+ mov_line_end16x4_sse2 r5, r1, xmm0, a ; leaves r5 on the last row written
+
+ ; bottom
+ movdqa xmm1, [r3] ; last line of picture pData
+ mov_line_16x4_sse2 r4, r1, xmm1, a ; 7 groups of 4 rows + the "end" group = 32 rows
+ mov_line_16x4_sse2 r4, r1, xmm1, a
+ mov_line_16x4_sse2 r4, r1, xmm1, a
+ mov_line_16x4_sse2 r4, r1, xmm1, a
+ mov_line_16x4_sse2 r4, r1, xmm1, a
+ mov_line_16x4_sse2 r4, r1, xmm1, a
+ mov_line_16x4_sse2 r4, r1, xmm1, a
+ mov_line_end16x4_sse2 r4, r1, xmm1, a ; leaves r4 on the last row written
+
+ lea r0, [r0+16] ; top pSrc
+ lea r5, [r5+16] ; top dst
+ lea r3, [r3+16] ; bottom pSrc
+ lea r4, [r4+16] ; bottom dst
+ neg r1 ; reverse direction: r5/r4 sit on the last row written, so the next column is filled back the other way
+
+ dec r2
+ jnz near .top_bottom_loops ; NOTE(review): count is tested after the body, so width must be >= 16 on entry
+%elif %1 == 16 ; for chroma
+ mov r6, r2 ; keep the full width for the 8-byte remainder test below
+ sar r2, 04h ; (width / 16) pixels
+.top_bottom_loops:
+ ; top
+ movdqa xmm0, [r0] ; first line of picture pData
+ mov_line_16x4_sse2 r5, r1, xmm0, a ; 3 groups of 4 rows + the "end" group = 16 rows
+ mov_line_16x4_sse2 r5, r1, xmm0, a
+ mov_line_16x4_sse2 r5, r1, xmm0, a
+ mov_line_end16x4_sse2 r5, r1, xmm0, a ; leaves r5 on the last row written
+
+ ; bottom
+ movdqa xmm1, [r3] ; last line of picture pData
+ mov_line_16x4_sse2 r4, r1, xmm1, a ; 3 groups of 4 rows + the "end" group = 16 rows
+ mov_line_16x4_sse2 r4, r1, xmm1, a
+ mov_line_16x4_sse2 r4, r1, xmm1, a
+ mov_line_end16x4_sse2 r4, r1, xmm1, a ; leaves r4 on the last row written
+
+ lea r0, [r0+16] ; top pSrc
+ lea r5, [r5+16] ; top dst
+ lea r3, [r3+16] ; bottom pSrc
+ lea r4, [r4+16] ; bottom dst
+ neg r1 ; reverse direction: r5/r4 sit on the last row written, so the next column is filled back the other way
+
+ dec r2
+ jnz near .top_bottom_loops ; NOTE(review): count is tested after the body, so width must be >= 16 on entry
+
+ ; for remaining 8 bytes
+ and r6, 0fh ; r6 = width mod 16; non-zero means one more 8-byte column is copied
+ test r6, r6
+ jz near .to_be_continued ; nothing left, skip the 8-byte column
+
+ ; top
+ movq mm0, [r0] ; remaining 8 bytes of the first line
+ mov_line_8x4_mmx r5, r1, mm0 ; 16 rows, continuing in the direction r1 was left in
+ mov_line_8x4_mmx r5, r1, mm0
+ mov_line_8x4_mmx r5, r1, mm0
+ mov_line_end8x4_mmx r5, r1, mm0
+ ; bottom
+ movq mm1, [r3] ; remaining 8 bytes of the last line
+ mov_line_8x4_mmx r4, r1, mm1 ; 16 rows
+ mov_line_8x4_mmx r4, r1, mm1
+ mov_line_8x4_mmx r4, r1, mm1
+ mov_line_end8x4_mmx r4, r1, mm1
+ WELSEMMS ; asm_inc.asm macro (not shown); presumably leaves MMX state after the mm0/mm1 use - confirm
+
+.to_be_continued:
+%endif
+%endmacro
+
+%macro exp_left_right_sse2 2 ; fill left and right borders row by row; %1 = iPaddingSize [luma(32)/chroma(16)], %2 = u/a
+ ;r6 [height] loop counter, r2 is clobbered as a temporary
+ ;r0 [pSrc+0] r5[pSrc-32(16)] r1[stride] ;left: source pixel, dst
+ ;r3 [pSrc+(w-1)] r4[pSrc+w] ;right: source pixel, dst
+
+%if %1 == 32 ; for luma
+.left_right_loops:
+ ; left
+ movzx r2d, byte [r0] ; pixel pData for left border
+ SSE2_Copy16Times xmm0, r2d ; asm_inc.asm macro (not shown); presumably replicates the byte into all 16 lanes of xmm0
+ movdqa [r5], xmm0 ; 32 border bytes = two aligned 16-byte stores
+ movdqa [r5+16], xmm0
+
+ ; right
+ movzx r2d, byte [r3] ; pixel pData for right border
+ SSE2_Copy16Times xmm1, r2d ; same replication into xmm1
+ movdqa [r4], xmm1 ; aligned stores: pSrc+width must be 16-byte aligned here
+ movdqa [r4+16], xmm1
+
+ lea r0, [r0+r1] ; left pSrc
+ lea r5, [r5+r1] ; left dst
+ lea r3, [r3+r1] ; right pSrc
+ lea r4, [r4+r1] ; right dst
+
+ dec r6 ; one picture row done
+ jnz near .left_right_loops
+%elif %1 == 16 ; for chroma
+.left_right_loops:
+ ; left
+ movzx r2d, byte [r0] ; pixel pData for left border
+ SSE2_Copy16Times xmm0, r2d ; asm_inc.asm macro (not shown); presumably replicates the byte into all 16 lanes of xmm0
+ movdqa [r5], xmm0 ; 16 border bytes, always an aligned store
+
+ ; right
+ movzx r2d, byte [r3] ; pixel pData for right border
+ SSE2_Copy16Times xmm1, r2d ; same replication into xmm1
+ movdq%2 [r4], xmm1 ; might not be aligned 16 bytes in case chroma planes
+
+ lea r0, [r0+r1] ; left pSrc
+ lea r5, [r5+r1] ; left dst
+ lea r3, [r3+r1] ; right pSrc
+ lea r4, [r4+r1] ; right dst
+
+ dec r6 ; one picture row done
+ jnz near .left_right_loops
+%endif
+%endmacro
+
+%macro exp_cross_sse2 2 ; fill the four corner blocks; %1 = iPaddingSize [luma(32)/chroma(16)], %2 = u/a
+ ; fill values: top-left xmm3, top-right xmm4, bottom-left xmm5, bottom-right xmm6
+ ; each corner is %1 bytes wide and %1 rows high, written starting at the given dst and stepping by r1
+ ;r3:TL ,r4:TR,r5:BL,r6:BR r1:-stride
+%if %1 == 32 ; luma
+ ; TL
+ mov_line_32x4_sse2 r3, r1, xmm3 ; 7 groups of 4 rows + the "end" group = 32 rows
+ mov_line_32x4_sse2 r3, r1, xmm3
+ mov_line_32x4_sse2 r3, r1, xmm3
+ mov_line_32x4_sse2 r3, r1, xmm3
+ mov_line_32x4_sse2 r3, r1, xmm3
+ mov_line_32x4_sse2 r3, r1, xmm3
+ mov_line_32x4_sse2 r3, r1, xmm3
+ mov_line_end32x4_sse2 r3, r1, xmm3
+
+ ; TR
+ mov_line_32x4_sse2 r4, r1, xmm4 ; 32 rows
+ mov_line_32x4_sse2 r4, r1, xmm4
+ mov_line_32x4_sse2 r4, r1, xmm4
+ mov_line_32x4_sse2 r4, r1, xmm4
+ mov_line_32x4_sse2 r4, r1, xmm4
+ mov_line_32x4_sse2 r4, r1, xmm4
+ mov_line_32x4_sse2 r4, r1, xmm4
+ mov_line_end32x4_sse2 r4, r1, xmm4
+
+ ; BL
+ mov_line_32x4_sse2 r5, r1, xmm5 ; 32 rows
+ mov_line_32x4_sse2 r5, r1, xmm5
+ mov_line_32x4_sse2 r5, r1, xmm5
+ mov_line_32x4_sse2 r5, r1, xmm5
+ mov_line_32x4_sse2 r5, r1, xmm5
+ mov_line_32x4_sse2 r5, r1, xmm5
+ mov_line_32x4_sse2 r5, r1, xmm5
+ mov_line_end32x4_sse2 r5, r1, xmm5
+
+ ; BR
+ mov_line_32x4_sse2 r6, r1, xmm6 ; 32 rows
+ mov_line_32x4_sse2 r6, r1, xmm6
+ mov_line_32x4_sse2 r6, r1, xmm6
+ mov_line_32x4_sse2 r6, r1, xmm6
+ mov_line_32x4_sse2 r6, r1, xmm6
+ mov_line_32x4_sse2 r6, r1, xmm6
+ mov_line_32x4_sse2 r6, r1, xmm6
+ mov_line_end32x4_sse2 r6, r1, xmm6
+%elif %1 == 16 ; chroma
+ ; TL
+ mov_line_16x4_sse2 r3, r1, xmm3, a ; left-hand corners always use aligned stores; 16 rows
+ mov_line_16x4_sse2 r3, r1, xmm3, a
+ mov_line_16x4_sse2 r3, r1, xmm3, a
+ mov_line_end16x4_sse2 r3, r1, xmm3, a
+
+ ; TR
+ mov_line_16x4_sse2 r4, r1, xmm4, %2 ; right-hand corners use the caller's u/a choice; 16 rows
+ mov_line_16x4_sse2 r4, r1, xmm4, %2
+ mov_line_16x4_sse2 r4, r1, xmm4, %2
+ mov_line_end16x4_sse2 r4, r1, xmm4, %2
+
+ ; BL
+ mov_line_16x4_sse2 r5, r1, xmm5, a ; 16 rows, aligned
+ mov_line_16x4_sse2 r5, r1, xmm5, a
+ mov_line_16x4_sse2 r5, r1, xmm5, a
+ mov_line_end16x4_sse2 r5, r1, xmm5, a
+
+ ; BR
+ mov_line_16x4_sse2 r6, r1, xmm6, %2 ; 16 rows, caller's u/a choice
+ mov_line_16x4_sse2 r6, r1, xmm6, %2
+ mov_line_16x4_sse2 r6, r1, xmm6, %2
+ mov_line_end16x4_sse2 r6, r1, xmm6, %2
+%endif
+%endmacro
+
+ALIGN 16
+;***********************************************************************----------------
+; void ExpandPictureLuma_sse2( uint8_t *pDst,
+; const int32_t iStride,
+; const int32_t iWidth,
+; const int32_t iHeight );
+;***********************************************************************----------------
+ExpandPictureLuma_sse2: ; replicate the picture's edge pixels into a 32-pixel border: top/bottom, left/right, then the 4 corners
+
+ push r4
+ push r5
+ push r6
+
+ %assign push_num 3
+ LOAD_4_PARA ; asm_inc.asm macro (not shown); presumably puts pDst/iStride/iWidth/iHeight in r0/r1/r2/r3 - confirm
+
+ SIGN_EXTENTION r1, r1d ; asm_inc.asm macro; presumably widens each 32-bit argument to pointer width - confirm
+ SIGN_EXTENTION r2, r2d
+ SIGN_EXTENTION r3, r3d
+
+ ;also prepare the corner fill value for top-left: xmm3
+
+ movzx r6d,byte[r0]
+ SSE2_Copy16Times xmm3,r6d ;xmm3: pSrc[0]
+
+ neg r1
+ lea r5,[r0+r1] ;r5 = pSrc - stride: first row of the top border (nearest the picture)
+ neg r1
+
+ push r3 ;save height; popped into r6 before the left/right pass
+
+
+ dec r3 ;h-1
+ imul r3,r1 ;(h-1)*stride
+ lea r3,[r0+r3] ;pSrc[(h-1)*stride] r3 = src bottom
+
+ mov r6,r1 ;r6 = stride
+ sal r6,05h ;r6 = 32*stride
+ lea r4,[r3+r6] ;r4 = dst bottom: row h+31, the farthest bottom border row
+
+ ;also prepare the corner fill values: bottom-left in xmm5, bottom-right in xmm6
+
+ movzx r6d,byte [r3] ;bottom-left
+ SSE2_Copy16Times xmm5,r6d
+
+ lea r6,[r3+r2-1]
+ movzx r6d,byte [r6]
+ SSE2_Copy16Times xmm6,r6d ;bottom-right
+
+ neg r1 ;r1 = -stride
+
+ push r0 ;the macro below advances r0, flips r1 and consumes r2
+ push r1
+ push r2
+
+ exp_top_bottom_sse2 32
+
+ ; for both left and right border
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ pop r2
+ pop r1
+ pop r0
+
+ lea r5,[r0-32] ;left border dst: pSrc - 32
+
+ lea r3,[r0+r2-1] ;right border src
+ lea r4,[r3+1] ;right border dst
+
+ ;prepare the corner fill value for top-right: xmm4
+ movzx r6d,byte [r3] ;top-right
+ SSE2_Copy16Times xmm4,r6d
+
+ neg r1 ;r1 = stride
+
+
+ pop r6 ; r6 = height
+
+
+
+ push r0 ;the macro below advances r0, clobbers r2 and counts r6 down to 0
+ push r1
+ push r2
+ push r6
+
+ exp_left_right_sse2 32,a
+
+ pop r6
+ pop r2
+ pop r1
+ pop r0
+
+ ; for cross border [top-left, top-right, bottom-left, bottom-right]
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; xmm3..xmm6 were loaded with the four corner pixels above; compute the four corner dst pointers and fill
+
+ neg r1 ;r1 = -stride
+ lea r3,[r0-32]
+ lea r3,[r3+r1] ;top-left dst: row -1, column -32
+
+ lea r4,[r0+r2] ;psrc +width
+ lea r4,[r4+r1] ;top-right dst: psrc +width -stride
+
+
+ neg r1 ;r1 = stride
+ add r6,32 ;height + 32
+ imul r6,r1 ;r6 = (height+32)*stride
+
+ lea r5,[r3+r6] ;last line of bottom-left border
+ lea r6,[r4+r6] ;last line of bottom-right border
+
+ neg r1 ; r1 = -stride
+
+ ; fill the four corner blocks, each walking upwards from its start row
+ exp_cross_sse2 32,a
+
+ LOAD_4_PARA_POP ; asm_inc.asm macro (not shown); presumably undoes whatever LOAD_4_PARA saved - confirm
+
+ pop r6
+ pop r5
+ pop r4
+
+ %assign push_num 0
+
+
+ ret
+
+ALIGN 16
+;***********************************************************************----------------
+; void ExpandPictureChromaAlign_sse2( uint8_t *pDst,
+; const int32_t iStride,
+; const int32_t iWidth,
+; const int32_t iHeight );
+;***********************************************************************----------------
+ExpandPictureChromaAlign_sse2: ; replicate edge pixels into a 16-pixel border using aligned stores everywhere
+
+ push r4
+ push r5
+ push r6
+
+ %assign push_num 3
+ LOAD_4_PARA ; asm_inc.asm macro (not shown); presumably puts pDst/iStride/iWidth/iHeight in r0/r1/r2/r3 - confirm
+
+ SIGN_EXTENTION r1,r1d ; asm_inc.asm macro; presumably widens each 32-bit argument to pointer width - confirm
+ SIGN_EXTENTION r2,r2d
+ SIGN_EXTENTION r3,r3d
+
+ ;also prepare the corner fill value for top-left: xmm3
+
+ movzx r6d,byte [r0]
+ SSE2_Copy16Times xmm3,r6d ;xmm3: pSrc[0]
+
+ neg r1
+ lea r5,[r0+r1] ;r5 = pSrc - stride: first row of the top border (nearest the picture)
+ neg r1
+
+ push r3 ;save height; popped into r6 before the left/right pass
+
+
+ dec r3 ;h-1
+ imul r3,r1 ;(h-1)*stride
+ lea r3,[r0+r3] ;pSrc[(h-1)*stride] r3 = src bottom
+
+ mov r6,r1 ;r6 = stride
+ sal r6,04h ;r6 = 16*stride
+ lea r4,[r3+r6] ;r4 = dst bottom: row h+15, the farthest bottom border row
+
+ ;also prepare the corner fill values: bottom-left in xmm5, bottom-right in xmm6
+
+ movzx r6d,byte [r3] ;bottom-left
+ SSE2_Copy16Times xmm5,r6d
+
+ lea r6,[r3+r2-1]
+ movzx r6d,byte [r6]
+ SSE2_Copy16Times xmm6,r6d ;bottom-right
+
+ neg r1 ;r1 = -stride
+
+ push r0 ;the macro below advances r0, flips r1 and consumes r2
+ push r1
+ push r2
+
+ exp_top_bottom_sse2 16
+
+ ; for both left and right border
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ pop r2
+ pop r1
+ pop r0
+
+ lea r5,[r0-16] ;left border dst: pSrc - 16
+
+ lea r3,[r0+r2-1] ;right border src
+ lea r4,[r3+1] ;right border dst
+
+ ;prepare the corner fill value for top-right: xmm4
+ movzx r6d,byte [r3] ;top-right
+ SSE2_Copy16Times xmm4,r6d
+
+ neg r1 ;r1 = stride
+
+
+ pop r6 ; r6 = height
+
+
+
+ push r0 ;the macro below advances r0, clobbers r2 and counts r6 down to 0
+ push r1
+ push r2
+ push r6
+ exp_left_right_sse2 16,a
+
+ pop r6
+ pop r2
+ pop r1
+ pop r0
+
+ ; for cross border [top-left, top-right, bottom-left, bottom-right]
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; xmm3..xmm6 were loaded with the four corner pixels above; compute the four corner dst pointers and fill
+
+ neg r1 ;r1 = -stride
+ lea r3,[r0-16]
+ lea r3,[r3+r1] ;top-left dst: row -1, column -16
+
+ lea r4,[r0+r2] ;psrc +width
+ lea r4,[r4+r1] ;top-right dst: psrc +width -stride
+
+
+ neg r1 ;r1 = stride
+ add r6,16 ;height + 16
+ imul r6,r1 ;r6 = (height+16)*stride
+
+ lea r5,[r3+r6] ;last line of bottom-left border
+ lea r6,[r4+r6] ;last line of bottom-right border
+
+ neg r1 ; r1 = -stride
+
+ ; fill the four corner blocks, each walking upwards from its start row
+ exp_cross_sse2 16,a
+
+ LOAD_4_PARA_POP ; asm_inc.asm macro (not shown); presumably undoes whatever LOAD_4_PARA saved - confirm
+
+ pop r6
+ pop r5
+ pop r4
+
+ %assign push_num 0
+
+
+ ret
+
+ALIGN 16
+;***********************************************************************----------------
+; void ExpandPictureChromaUnalign_sse2( uint8_t *pDst,
+; const int32_t iStride,
+; const int32_t iWidth,
+; const int32_t iHeight );
+;***********************************************************************----------------
+ExpandPictureChromaUnalign_sse2: ; as ExpandPictureChromaAlign_sse2, but right-side stores use movdqu
+ push r4
+ push r5
+ push r6
+
+ %assign push_num 3
+ LOAD_4_PARA ; asm_inc.asm macro (not shown); presumably puts pDst/iStride/iWidth/iHeight in r0/r1/r2/r3 - confirm
+
+ SIGN_EXTENTION r1,r1d ; asm_inc.asm macro; presumably widens each 32-bit argument to pointer width - confirm
+ SIGN_EXTENTION r2,r2d
+ SIGN_EXTENTION r3,r3d
+
+ ;also prepare the corner fill value for top-left: xmm3
+
+ movzx r6d,byte [r0]
+ SSE2_Copy16Times xmm3,r6d ;xmm3: pSrc[0]
+
+ neg r1
+ lea r5,[r0+r1] ;r5 = pSrc - stride: first row of the top border (nearest the picture)
+ neg r1
+
+ push r3 ;save height; popped into r6 before the left/right pass
+
+
+ dec r3 ;h-1
+ imul r3,r1 ;(h-1)*stride
+ lea r3,[r0+r3] ;pSrc[(h-1)*stride] r3 = src bottom
+
+ mov r6,r1 ;r6 = stride
+ sal r6,04h ;r6 = 16*stride
+ lea r4,[r3+r6] ;r4 = dst bottom: row h+15, the farthest bottom border row
+
+ ;also prepare the corner fill values: bottom-left in xmm5, bottom-right in xmm6
+
+ movzx r6d,byte [r3] ;bottom-left
+ SSE2_Copy16Times xmm5,r6d
+
+ lea r6,[r3+r2-1]
+ movzx r6d,byte [r6]
+ SSE2_Copy16Times xmm6,r6d ;bottom-right
+
+ neg r1 ;r1 = -stride
+
+ push r0 ;the macro below advances r0, flips r1 and consumes r2
+ push r1
+ push r2
+
+ exp_top_bottom_sse2 16
+
+ ; for both left and right border
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ pop r2
+ pop r1
+ pop r0
+
+ lea r5,[r0-16] ;left border dst: pSrc - 16
+
+ lea r3,[r0+r2-1] ;right border src
+ lea r4,[r3+1] ;right border dst
+
+ ;prepare the corner fill value for top-right: xmm4
+ movzx r6d,byte [r3] ;top-right
+ SSE2_Copy16Times xmm4,r6d
+
+ neg r1 ;r1 = stride
+
+
+ pop r6 ; r6 = height
+
+
+
+ push r0 ;the macro below advances r0, clobbers r2 and counts r6 down to 0
+ push r1
+ push r2
+ push r6
+ exp_left_right_sse2 16,u
+
+ pop r6
+ pop r2
+ pop r1
+ pop r0
+
+ ; for cross border [top-left, top-right, bottom-left, bottom-right]
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; xmm3..xmm6 were loaded with the four corner pixels above; compute the four corner dst pointers and fill
+
+ neg r1 ;r1 = -stride
+ lea r3,[r0-16]
+ lea r3,[r3+r1] ;top-left dst: row -1, column -16
+
+ lea r4,[r0+r2] ;psrc +width
+ lea r4,[r4+r1] ;top-right dst: psrc +width -stride
+
+
+ neg r1 ;r1 = stride
+ add r6,16 ;height + 16
+ imul r6,r1 ;r6 = (height+16)*stride
+
+ lea r5,[r3+r6] ;last line of bottom-left border
+ lea r6,[r4+r6] ;last line of bottom-right border
+
+ neg r1 ; r1 = -stride
+
+ ; fill the four corner blocks, each walking upwards from its start row
+ exp_cross_sse2 16,u
+
+ LOAD_4_PARA_POP ; asm_inc.asm macro (not shown); presumably undoes whatever LOAD_4_PARA saved - confirm
+
+ pop r6
+ pop r5
+ pop r4
+
+ %assign push_num 0
+
+
+ ret
+
\ No newline at end of file
--- /dev/null
+++ b/codec/common/mb_copy.asm
@@ -1,0 +1,701 @@
+;*!
+;* \copy
+;* Copyright (c) 2009-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;* mb_copy.asm
+;*
+;* Abstract
+;* mb_copy and mb_copy1
+;*
+;* History
+;* 15/09/2009 Created
+;* 12/28/2009 Modified with larger throughput
+;* 12/29/2011 Tuned WelsCopy16x16NotAligned_sse2, added UpdateMbMv_sse2 WelsCopy16x8NotAligned_sse2,
+;* WelsCopy16x8_mmx, WelsCopy8x16_mmx etc;
+;*
+;*
+;*********************************************************************************************/
+%include "asm_inc.asm"
+
+;***********************************************************************
+; Macros and other preprocessor constants
+;***********************************************************************
+
+;***********************************************************************
+; Code
+;***********************************************************************
+
+SECTION .text
+
+WELS_EXTERN WelsCopy16x16_sse2
+WELS_EXTERN WelsCopy16x16NotAligned_sse2
+WELS_EXTERN WelsCopy8x8_mmx
+WELS_EXTERN WelsCopy16x8NotAligned_sse2 ;
+WELS_EXTERN WelsCopy8x16_mmx ;
+WELS_EXTERN UpdateMbMv_sse2 ;
+
+;***********************************************************************
+; void WelsCopy16x16_sse2( uint8_t* Dst, -- r0; rows are written with movdqa (16-byte aligned)
+; int32_t iStrideD, -- r1; bytes between destination rows
+; uint8_t* Src, -- r2; rows are read with movdqa (16-byte aligned)
+; int32_t iStrideS ) -- r3; bytes between source rows
+;***********************************************************************
+ALIGN 16
+WelsCopy16x16_sse2:
+ push r4
+ push r5
+ %assign push_num 2
+ LOAD_4_PARA
+%ifndef X86_32
+ movsx r1, r1d ; strides are int32_t: sign-extend before 64-bit address math,
+ movsx r3, r3d ; matching the PixelAvg*/McCopy* routines later in this file
+%endif
+ lea r4, [r1+2*r1] ; r4 = 3 * iStrideD
+ lea r5, [r3+2*r3] ; r5 = 3 * iStrideS
+
+ movdqa xmm0, [r2] ; rows 0-7: load all eight rows, then store them
+ movdqa xmm1, [r2+r3]
+ movdqa xmm2, [r2+2*r3]
+ movdqa xmm3, [r2+r5]
+ lea r2, [r2+4*r3]
+ movdqa xmm4, [r2]
+ movdqa xmm5, [r2+r3]
+ movdqa xmm6, [r2+2*r3]
+ movdqa xmm7, [r2+r5]
+ lea r2, [r2+4*r3]
+ movdqa [r0], xmm0
+ movdqa [r0+r1], xmm1
+ movdqa [r0+2*r1], xmm2
+ movdqa [r0+r4], xmm3
+ lea r0, [r0+4*r1]
+ movdqa [r0], xmm4
+ movdqa [r0+r1], xmm5
+ movdqa [r0+2*r1], xmm6
+ movdqa [r0+r4], xmm7
+ lea r0, [r0+4*r1]
+
+ movdqa xmm0, [r2] ; rows 8-15: same pattern, no pointer advance after the last row
+ movdqa xmm1, [r2+r3]
+ movdqa xmm2, [r2+2*r3]
+ movdqa xmm3, [r2+r5]
+ lea r2, [r2+4*r3]
+ movdqa xmm4, [r2]
+ movdqa xmm5, [r2+r3]
+ movdqa xmm6, [r2+2*r3]
+ movdqa xmm7, [r2+r5]
+ movdqa [r0], xmm0
+ movdqa [r0+r1], xmm1
+ movdqa [r0+2*r1], xmm2
+ movdqa [r0+r4], xmm3
+ lea r0, [r0+4*r1]
+ movdqa [r0], xmm4
+ movdqa [r0+r1], xmm5
+ movdqa [r0+2*r1], xmm6
+ movdqa [r0+r4], xmm7
+ LOAD_4_PARA_POP
+ pop r5
+ pop r4
+ ret
+
+;***********************************************************************
+; void WelsCopy16x16NotAligned_sse2( uint8_t* Dst,
+; int32_t iStrideD,
+; uint8_t* Src,
+; int32_t iStrideS )
+;***********************************************************************
+ALIGN 16
+; Dst rows are stored with movdqa (16-byte aligned); Src rows are loaded with movdqu (any alignment), 12/29/2011
+WelsCopy16x16NotAligned_sse2:
+ ; Copies a 16x16 byte block as two batches of eight rows.
+ ; Register use below: r0 = Dst, r1 = iStrideD, r2 = Src, r3 = iStrideS,
+ ; r4 = 3 * iStrideD, r5 = 3 * iStrideS (r4/r5 are saved and restored here).
+ ; xmm0-xmm7 are overwritten.
+
+ push r4
+ push r5
+ %assign push_num 2
+ LOAD_4_PARA
+%ifndef X86_32
+ movsx r1, r1d ; strides are int32_t: sign-extend before 64-bit address math,
+ movsx r3, r3d ; matching the PixelAvg*/McCopy* routines later in this file
+%endif
+
+ lea r4, [r1+2*r1] ; r4 = 3 * iStrideD
+ lea r5, [r3+2*r3] ; r5 = 3 * iStrideS
+
+ movdqu xmm0, [r2] ; rows 0-7: unaligned loads
+ movdqu xmm1, [r2+r3]
+ movdqu xmm2, [r2+2*r3]
+ movdqu xmm3, [r2+r5]
+ lea r2, [r2+4*r3]
+ movdqu xmm4, [r2]
+ movdqu xmm5, [r2+r3]
+ movdqu xmm6, [r2+2*r3]
+ movdqu xmm7, [r2+r5]
+ lea r2, [r2+4*r3]
+
+ movdqa [r0], xmm0 ; rows 0-7: aligned stores
+ movdqa [r0+r1], xmm1
+ movdqa [r0+2*r1], xmm2
+ movdqa [r0+r4], xmm3
+ lea r0, [r0+4*r1]
+ movdqa [r0], xmm4
+ movdqa [r0+r1], xmm5
+ movdqa [r0+2*r1], xmm6
+ movdqa [r0+r4], xmm7
+ lea r0, [r0+4*r1]
+
+ movdqu xmm0, [r2] ; rows 8-15: unaligned loads
+ movdqu xmm1, [r2+r3]
+ movdqu xmm2, [r2+2*r3]
+ movdqu xmm3, [r2+r5]
+ lea r2, [r2+4*r3]
+ movdqu xmm4, [r2]
+ movdqu xmm5, [r2+r3]
+ movdqu xmm6, [r2+2*r3]
+ movdqu xmm7, [r2+r5]
+
+ movdqa [r0], xmm0 ; rows 8-15: aligned stores
+ movdqa [r0+r1], xmm1
+ movdqa [r0+2*r1], xmm2
+ movdqa [r0+r4], xmm3
+ lea r0, [r0+4*r1]
+ movdqa [r0], xmm4
+ movdqa [r0+r1], xmm5
+ movdqa [r0+2*r1], xmm6
+ movdqa [r0+r4], xmm7
+ LOAD_4_PARA_POP
+ pop r5
+ pop r4
+ ret
+
+; Added 12/29/2011
+;***********************************************************************
+; void WelsCopy16x8NotAligned_sse2(uint8_t* Dst,
+; int32_t iStrideD,
+; uint8_t* Src,
+; int32_t iStrideS )
+;***********************************************************************
+ALIGN 16
+WelsCopy16x8NotAligned_sse2:
+ ; Copies 16 columns x 8 rows: movdqu loads from Src (any alignment),
+ ; movdqa stores to Dst (rows must be 16-byte aligned).
+ ; Register use below: r0 = Dst, r1 = iStrideD, r2 = Src, r3 = iStrideS,
+ ; r4 = 3 * iStrideD, r5 = 3 * iStrideS (r4/r5 are saved and restored here).
+
+ push r4
+ push r5
+ %assign push_num 2
+ LOAD_4_PARA
+%ifndef X86_32
+ movsx r1, r1d ; strides are int32_t: sign-extend before 64-bit address math,
+ movsx r3, r3d ; matching the PixelAvg*/McCopy* routines later in this file
+%endif
+
+ lea r4, [r1+2*r1] ; r4 = 3 * iStrideD
+ lea r5, [r3+2*r3] ; r5 = 3 * iStrideS
+
+ movdqu xmm0, [r2] ; load source rows 0-7
+ movdqu xmm1, [r2+r3]
+ movdqu xmm2, [r2+2*r3]
+ movdqu xmm3, [r2+r5]
+ lea r2, [r2+4*r3]
+ movdqu xmm4, [r2]
+ movdqu xmm5, [r2+r3]
+ movdqu xmm6, [r2+2*r3]
+ movdqu xmm7, [r2+r5]
+
+ movdqa [r0], xmm0 ; store destination rows 0-7
+ movdqa [r0+r1], xmm1
+ movdqa [r0+2*r1], xmm2
+ movdqa [r0+r4], xmm3
+ lea r0, [r0+4*r1]
+ movdqa [r0], xmm4
+ movdqa [r0+r1], xmm5
+ movdqa [r0+2*r1], xmm6
+ movdqa [r0+r4], xmm7
+ LOAD_4_PARA_POP
+ pop r5
+ pop r4
+ ret
+
+
+;***********************************************************************
+; void WelsCopy8x16_mmx(uint8_t* Dst,
+; int32_t iStrideD,
+; uint8_t* Src,
+; int32_t iStrideS )
+;***********************************************************************
+ALIGN 16
+WelsCopy8x16_mmx:
+ ; Copies 8 columns x 16 rows through mm0-mm7, eight rows per batch.
+ ; Register use below: r0 = Dst, r1 = iStrideD, r2 = Src, r3 = iStrideS.
+
+ %assign push_num 0
+ LOAD_4_PARA
+%ifndef X86_32
+ movsx r1, r1d ; strides are int32_t: sign-extend before 64-bit address math,
+ movsx r3, r3d ; matching the PixelAvg*/McCopy* routines later in this file
+%endif
+
+ movq mm0, [r2] ; load source rows 0-7, two rows per pointer step
+ movq mm1, [r2+r3]
+ lea r2, [r2+2*r3]
+ movq mm2, [r2]
+ movq mm3, [r2+r3]
+ lea r2, [r2+2*r3]
+ movq mm4, [r2]
+ movq mm5, [r2+r3]
+ lea r2, [r2+2*r3]
+ movq mm6, [r2]
+ movq mm7, [r2+r3]
+ lea r2, [r2+2*r3]
+
+ movq [r0], mm0 ; store destination rows 0-7
+ movq [r0+r1], mm1
+ lea r0, [r0+2*r1]
+ movq [r0], mm2
+ movq [r0+r1], mm3
+ lea r0, [r0+2*r1]
+ movq [r0], mm4
+ movq [r0+r1], mm5
+ lea r0, [r0+2*r1]
+ movq [r0], mm6
+ movq [r0+r1], mm7
+ lea r0, [r0+2*r1]
+
+ movq mm0, [r2] ; load source rows 8-15
+ movq mm1, [r2+r3]
+ lea r2, [r2+2*r3]
+ movq mm2, [r2]
+ movq mm3, [r2+r3]
+ lea r2, [r2+2*r3]
+ movq mm4, [r2]
+ movq mm5, [r2+r3]
+ lea r2, [r2+2*r3]
+ movq mm6, [r2]
+ movq mm7, [r2+r3]
+
+ movq [r0], mm0 ; store destination rows 8-15
+ movq [r0+r1], mm1
+ lea r0, [r0+2*r1]
+ movq [r0], mm2
+ movq [r0+r1], mm3
+ lea r0, [r0+2*r1]
+ movq [r0], mm4
+ movq [r0+r1], mm5
+ lea r0, [r0+2*r1]
+ movq [r0], mm6
+ movq [r0+r1], mm7
+
+ WELSEMMS
+ LOAD_4_PARA_POP
+ ret
+
+;***********************************************************************
+; void WelsCopy8x8_mmx( uint8_t* Dst,
+; int32_t iStrideD,
+; uint8_t* Src,
+; int32_t iStrideS )
+;***********************************************************************
+ALIGN 16
+WelsCopy8x8_mmx:
+ ; Copies 8 columns x 8 rows through mm0-mm7, prefetching source rows ahead.
+ ; Register use below: r0 = Dst, r1 = iStrideD, r2 = Src, r3 = iStrideS.
+
+ push r4
+ %assign push_num 1
+ LOAD_4_PARA
+%ifndef X86_32
+ movsx r1, r1d ; strides are int32_t: sign-extend before 64-bit address math,
+ movsx r3, r3d ; matching the PixelAvg*/McCopy* routines later in this file
+%endif
+ lea r4, [r3+2*r3] ; r4 = 3 * iStrideS, used only as a prefetch offset
+
+ ; prefetch the two source rows that the next load pair will read
+ prefetchnta [r2+2*r3]
+ prefetchnta [r2+r4]
+ movq mm0, [r2]
+ movq mm1, [r2+r3]
+ lea r2, [r2+2*r3]
+ ; prefetch the two source rows that the next load pair will read
+ prefetchnta [r2+2*r3]
+ prefetchnta [r2+r4]
+ movq mm2, [r2]
+ movq mm3, [r2+r3]
+ lea r2, [r2+2*r3]
+ ; prefetch the two source rows that the next load pair will read
+ prefetchnta [r2+2*r3]
+ prefetchnta [r2+r4]
+ movq mm4, [r2]
+ movq mm5, [r2+r3]
+ lea r2, [r2+2*r3]
+ movq mm6, [r2]
+ movq mm7, [r2+r3]
+
+ movq [r0], mm0 ; store destination rows 0-7
+ movq [r0+r1], mm1
+ lea r0, [r0+2*r1]
+ movq [r0], mm2
+ movq [r0+r1], mm3
+ lea r0, [r0+2*r1]
+ movq [r0], mm4
+ movq [r0+r1], mm5
+ lea r0, [r0+2*r1]
+ movq [r0], mm6
+ movq [r0+r1], mm7
+
+ WELSEMMS
+ ; unwind in reverse order of the prologue: the LOAD_4_PARA state first,
+ ; then the r4 pushed by this routine
+ LOAD_4_PARA_POP
+ pop r4
+ ret
+
+; (dunhuang@cisco), 12/21/2011
+;***********************************************************************
+; void UpdateMbMv_sse2( SMVUnitXY *pMvBuffer, const SMVUnitXY sMv )
+;***********************************************************************
+ALIGN 16
+UpdateMbMv_sse2:
+
+ %assign push_num 0
+ LOAD_2_PARA
+
+ ; Fills 64 bytes at r0 with sixteen copies of the 32-bit value in r1d.
+ ; movdqa is used, so r0 must be 16-byte aligned. Clobbers xmm0 and xmm1.
+ movd xmm0, r1d ; low dword of xmm0 = sMv (the whole struct is taken as one 32-bit value)
+ pshufd xmm1, xmm0, $0 ; broadcast dword 0 into all four dwords of xmm1
+ movdqa [r0 ], xmm1 ; bytes 0-15
+ movdqa [r0+0x10], xmm1 ; bytes 16-31
+ movdqa [r0+0x20], xmm1 ; bytes 32-47
+ movdqa [r0+0x30], xmm1 ; bytes 48-63
+ ret
+
+;*******************************************************************************
+; Macros and other preprocessor constants
+;*******************************************************************************
+
+;*******************************************************************************
+; Local Data (Read Only)
+;*******************************************************************************
+
+;SECTION .rodata data align=16
+
+;*******************************************************************************
+; Various memory constants (trigonometric values or rounding values)
+;*******************************************************************************
+
+ALIGN 16
+
+;*******************************************************************************
+; Code
+;*******************************************************************************
+
+SECTION .text
+
+WELS_EXTERN PixelAvgWidthEq4_mmx
+WELS_EXTERN PixelAvgWidthEq8_mmx
+WELS_EXTERN PixelAvgWidthEq16_sse2
+
+WELS_EXTERN McCopyWidthEq4_mmx
+WELS_EXTERN McCopyWidthEq8_mmx
+WELS_EXTERN McCopyWidthEq16_sse2
+
+
+ALIGN 16
+;*******************************************************************************
+; void_t PixelAvgWidthEq4_mmx( uint8_t *pDst, int iDstStride,
+; uint8_t *pSrcA, int iSrcAStride,
+; uint8_t *pSrcB, int iSrcBStride,
+; int iHeight ); -- per row: 4 bytes at r0 = pavgb(4 bytes at r2, 4 bytes at r4)
+;*******************************************************************************
+PixelAvgWidthEq4_mmx:
+
+ %assign push_num 0
+ LOAD_7_PARA
+
+%ifndef X86_32
+ movsx r1, r1d ; widen the int arguments used as strides (r1, r3, r5)
+ movsx r3, r3d
+ movsx r5, r5d
+ movsx r6, r6d ; and the row counter
+%endif
+
+ALIGN 4
+.height_loop:
+ movd mm0, [r4] ; 4 bytes from the second source
+ pavgb mm0, [r2] ; rounded byte average; the memory operand is 64 bits, so 8 bytes are read at r2
+ movd [r0], mm0 ; only the low 4 result bytes are stored
+
+ dec r6 ; sets ZF for the jne below; lea does not modify flags
+ lea r0, [r0+r1]
+ lea r2, [r2+r3]
+ lea r4, [r4+r5]
+ jne .height_loop ; do-while: the body runs before the test, so the row count must be >= 1
+
+ WELSEMMS
+ LOAD_7_PARA_POP
+ ret
+
+
+ALIGN 16
+;*******************************************************************************
+; void_t PixelAvgWidthEq8_mmx( uint8_t *pDst, int iDstStride,
+; uint8_t *pSrcA, int iSrcAStride,
+; uint8_t *pSrcB, int iSrcBStride,
+; int iHeight );
+;*******************************************************************************
+PixelAvgWidthEq8_mmx:
+
+ ; Writes the rounded byte average (pavgb) of two 8-byte-wide sources,
+ ; two rows per loop iteration.
+ ; Register use below (r0-r6 are aliases defined outside this file;
+ ; presumably LOAD_7_PARA fills them in prototype order -- confirm):
+
+ ; r0 = destination pointer
+ ; r1 = destination stride
+ ; r2 = first source pointer
+ ; r3 = first source stride
+ ; r4 = second source pointer
+ ; r5 = second source stride
+ ; r6 = remaining rows; must be a positive multiple of 2 or the loop never hits zero
+
+ %assign push_num 0
+ LOAD_7_PARA
+
+%ifndef X86_32
+ movsx r1, r1d ; widen the int arguments to full register width
+ movsx r3, r3d
+ movsx r5, r5d
+ movsx r6, r6d
+%endif
+
+ALIGN 4
+.height_loop:
+ movq mm0, [r2] ; row n
+ pavgb mm0, [r4]
+ movq [r0], mm0
+ movq mm0, [r2+r3] ; row n+1
+ pavgb mm0, [r4+r5]
+ movq [r0+r1], mm0
+
+ lea r2, [r2+2*r3] ; advance all three pointers by two rows
+ lea r4, [r4+2*r5]
+ lea r0, [r0+2*r1]
+
+ sub r6, 2
+ jnz .height_loop
+
+ WELSEMMS
+ LOAD_7_PARA_POP
+ ret
+
+
+
+ALIGN 16
+;*******************************************************************************
+; void_t PixelAvgWidthEq16_sse2( uint8_t *pDst, int iDstStride,
+; uint8_t *pSrcA, int iSrcAStride,
+; uint8_t *pSrcB, int iSrcBStride,
+; int iHeight ); -- four rows per iteration: iHeight must be a positive multiple of 4
+;*******************************************************************************
+PixelAvgWidthEq16_sse2:
+
+ %assign push_num 0
+ LOAD_7_PARA
+%ifndef X86_32
+ movsx r1, r1d ; widen the int arguments to full register width
+ movsx r3, r3d
+ movsx r5, r5d
+ movsx r6, r6d
+%endif
+ALIGN 4
+.height_loop:
+ movdqu xmm0, [r2] ; row n: unaligned loads from both sources
+ movdqu xmm1, [r4]
+ pavgb xmm0, xmm1 ; rounded byte average of 16 pixels
+ ;a memory operand on the pavgb above would need [r4] 16-byte aligned, hence the separate movdqu
+ movdqu [r0], xmm0 ; unaligned store
+
+ movdqu xmm0, [r2+r3] ; row n+1
+ movdqu xmm1, [r4+r5]
+ pavgb xmm0, xmm1
+ movdqu [r0+r1], xmm0
+
+ movdqu xmm0, [r2+2*r3] ; row n+2, addressed before the pointers move
+ movdqu xmm1, [r4+2*r5]
+ pavgb xmm0, xmm1
+ movdqu [r0+2*r1], xmm0
+
+ lea r2, [r2+2*r3] ; pointers now at row n+2
+ lea r4, [r4+2*r5]
+ lea r0, [r0+2*r1]
+
+ movdqu xmm0, [r2+r3] ; row n+3
+ movdqu xmm1, [r4+r5]
+ pavgb xmm0, xmm1
+ movdqu [r0+r1], xmm0
+
+ lea r2, [r2+2*r3] ; pointers now at row n+4
+ lea r4, [r4+2*r5]
+ lea r0, [r0+2*r1]
+
+ sub r6, 4
+ jne .height_loop
+
+ WELSEMMS ; NOTE(review): no mm register is touched above, so this looks unnecessary -- confirm what WELSEMMS expands to
+ LOAD_7_PARA_POP
+ ret
+
+ALIGN 16
+;*******************************************************************************
+; void_t McCopyWidthEq4_mmx( uint8_t *pSrc, int iSrcStride,
+; uint8_t *pDst, int iDstStride, int iHeight )
+;*******************************************************************************
+McCopyWidthEq4_mmx:
+ ; Copies 4 bytes per row for r4 rows, using a 32-bit general-purpose
+ ; register (r5d) rather than an MMX register, despite the _mmx suffix.
+ ; Register use below (r0-r5 are aliases defined outside this file):
+
+
+ ; r0 = source pointer, advanced by r1 each row
+ ; r1 = source stride
+ ; r2 = destination pointer, advanced by r3 each row
+ ; r3 = destination stride
+ ; r4 = remaining rows; the loop is do-while, so it must be >= 1
+
+ push r5
+ %assign push_num 1
+ LOAD_5_PARA
+
+%ifndef X86_32
+ movsx r1, r1d ; widen the int arguments to full register width
+ movsx r3, r3d
+ movsx r4, r4d
+%endif
+
+ALIGN 4
+.height_loop:
+ mov r5d, [r0] ; r5 is scratch only, which is why it is pushed above
+ mov [r2], r5d
+
+ add r0, r1
+ add r2, r3
+ dec r4
+ jnz .height_loop
+ WELSEMMS ; NOTE(review): no mm register is touched above, so this looks unnecessary -- confirm what WELSEMMS expands to
+ LOAD_5_PARA_POP
+ pop r5
+ ret
+
+ALIGN 16
+;*******************************************************************************
+; void_t McCopyWidthEq8_mmx( uint8_t *pSrc, int iSrcStride,
+; uint8_t *pDst, int iDstStride, int iHeight )
+;*******************************************************************************
+McCopyWidthEq8_mmx:
+ ; Copies 8 bytes per row through mm0 for r4 rows.
+ ; Register use below (r0-r4 are aliases defined outside this file):
+ ; r0 = source pointer, advanced by r1 each row
+ ; r1 = source stride
+ ; r2 = destination pointer, advanced by r3 each row
+ ; r3 = destination stride
+ ; r4 = remaining rows; the loop is do-while, so it must be >= 1
+
+ %assign push_num 0
+ LOAD_5_PARA
+
+%ifndef X86_32
+ movsx r1, r1d ; widen the int arguments to full register width
+ movsx r3, r3d
+ movsx r4, r4d
+%endif
+
+ALIGN 4
+.height_loop:
+ movq mm0, [r0] ; one 8-byte row
+ movq [r2], mm0
+ add r0, r1
+ add r2, r3
+ dec r4
+ jnz .height_loop
+
+ WELSEMMS
+ LOAD_5_PARA_POP
+ ret
+
+
+ALIGN 16
+;*******************************************************************************
+; void_t McCopyWidthEq16_sse2( uint8_t *pSrc, int iSrcStride, uint8_t *pDst, int iDstStride, int iHeight )
+;*******************************************************************************
+;SSE_READ_UNA xmm, addr: load 16 bytes from an unaligned address as two 8-byte halves
+%macro SSE_READ_UNA 2
+ movq %1, [%2] ; low 8 bytes (clears the high half)
+ movhps %1, [%2+8] ; high 8 bytes
+%endmacro
+
+;SSE_WRITE_UNA addr, xmm: store 16 bytes to an unaligned address as two 8-byte halves
+%macro SSE_WRITE_UNA 2
+ movq [%1], %2 ; low 8 bytes
+ movhps [%1+8], %2 ; high 8 bytes
+%endmacro
+McCopyWidthEq16_sse2:
+ ; Copies 16 bytes per row, two rows per loop iteration; neither pointer
+ ; needs 16-byte alignment because only 8-byte moves are issued.
+
+ ; Register use below (r0-r4 are aliases defined outside this file):
+ ; r0 = source pointer
+ ; r1 = source stride
+ ; r2 = destination pointer
+ ; r3 = destination stride; r4 = remaining rows, must be a positive multiple of 2
+
+ %assign push_num 0
+ LOAD_5_PARA
+%ifndef X86_32
+ movsx r1, r1d ; widen the int arguments to full register width
+ movsx r3, r3d
+ movsx r4, r4d
+%endif
+ALIGN 4
+.height_loop:
+ SSE_READ_UNA xmm0, r0 ; row n
+ SSE_READ_UNA xmm1, r0+r1 ; row n+1
+ SSE_WRITE_UNA r2, xmm0
+ SSE_WRITE_UNA r2+r3, xmm1
+
+ sub r4, 2 ; sets ZF for the jnz below; the lea's in between leave flags untouched
+ lea r0, [r0+r1*2]
+ lea r2, [r2+r3*2]
+ jnz .height_loop
+
+ LOAD_5_PARA_POP
+ ret
--- /dev/null
+++ b/codec/common/mc_chroma.asm
@@ -1,0 +1,345 @@
+;*!
+;* \copy
+;* Copyright (c) 2004-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;* mc_chroma.asm
+;*
+;* Abstract
+;* mmx motion compensation for chroma
+;*
+;* History
+;* 10/13/2004 Created
+;*
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+
+;***********************************************************************
+; Local Data (Read Only)
+;***********************************************************************
+
+SECTION .rodata align=16
+
+;***********************************************************************
+; Various memory constants (trigonometric values or rounding values)
+;***********************************************************************
+
+ALIGN 16
+h264_d0x20_sse2:
+ dw 32,32,32,32,32,32,32,32
+ALIGN 16
+h264_d0x20_mmx:
+ dw 32,32,32,32
+
+
+;=============================================================================
+; Code
+;=============================================================================
+
+SECTION .text
+
+ALIGN 16
+;*******************************************************************************
+; void McChromaWidthEq4_mmx( uint8_t *src,
+; int32_t iSrcStride,
+; uint8_t *pDst,
+; int32_t iDstStride,
+; uint8_t *pABCD, -- four byte weights, called A,B,C,D below in memory order
+; int32_t iHeigh ); -- row count; the loop is do-while, so it must be >= 1
+;*******************************************************************************
+WELS_EXTERN McChromaWidthEq4_mmx
+McChromaWidthEq4_mmx:
+ ; Per output pixel x of each row:
+ ; dst[x] = (A*cur[x] + B*cur[x+1] + C*next[x] + D*next[x+1] + 32) >> 6
+ ; where cur/next are consecutive source rows; 4 pixels are produced per row.
+
+ %assign push_num 0
+ LOAD_6_PARA
+%ifndef X86_32
+ movsx r1, r1d ; widen the int arguments to full register width
+ movsx r3, r3d
+ movsx r5, r5d
+%endif
+
+ ; Expand the weights into four words each: mm3 = A, mm5 = B, mm4 = C, mm6 = D
+
+ movd mm3, [r4]; bytes A B C D
+ WELS_Zero mm7 ; mm7 serves as the zero operand for the unpacks (WELS_Zero is defined outside this file)
+ punpcklbw mm3, mm3 ; A A B B C C D D
+ movq mm4, mm3
+ punpcklwd mm3, mm3 ; A A A A B B B B
+ punpckhwd mm4, mm4 ; C C C C D D D D
+
+ movq mm5, mm3
+ punpcklbw mm3, mm7 ; mm3 = A as 4 words
+ punpckhbw mm5, mm7 ; mm5 = B as 4 words
+
+ movq mm6, mm4
+ punpcklbw mm4, mm7 ; mm4 = C as 4 words
+ punpckhbw mm6, mm7 ; mm6 = D as 4 words
+
+ ; From here on (r0-r5 are aliases defined outside this file):
+ ; r0 = first source row, r1 = source stride
+ ; r2 = destination pointer, r3 = destination stride
+ ; r4 = pointer to the next source row (its weight-pointer value is no longer needed)
+ ; r5 = remaining rows
+
+ lea r4, [r0 + r1] ; r4 = second source row
+ movd mm0, [r0] ; mm0 = cur[x] as words
+ movd mm1, [r0+1] ; mm1 = cur[x+1] as words
+ punpcklbw mm0, mm7
+ punpcklbw mm1, mm7
+.xloop:
+
+ pmullw mm0, mm3 ; A * cur[x]
+ pmullw mm1, mm5 ; B * cur[x+1]
+ paddw mm0, mm1
+
+ movd mm1, [r4] ; next[x]
+ punpcklbw mm1, mm7
+ movq mm2, mm1 ; keep next[x]: it becomes cur[x] of the following row
+ pmullw mm1, mm4 ; C * next[x]
+ paddw mm0, mm1
+
+ movd mm1, [r4+1] ; next[x+1]
+ punpcklbw mm1, mm7
+ movq mm7, mm1 ; park next[x+1] in mm7 (zero is restored below)
+ pmullw mm1,mm6 ; D * next[x+1]
+ paddw mm0, mm1
+ movq mm1,mm7 ; mm1 = next[x+1], i.e. cur[x+1] of the following row
+
+ paddw mm0, [h264_d0x20_mmx] ; + 32 rounding
+ psrlw mm0, 6
+
+ WELS_Zero mm7 ; mm7 back to its zero role
+ packuswb mm0, mm7 ; words -> bytes with unsigned saturation
+ movd [r2], mm0 ; store 4 pixels
+
+ movq mm0, mm2 ; mm0 = cur[x] of the following row
+
+ lea r2, [r2 + r3]
+ lea r4, [r4 + r1]
+
+ dec r5
+ jnz near .xloop
+ WELSEMMS
+ LOAD_6_PARA_POP
+ ; (no further registers to restore)
+ ;
+ ;
+ ret
+
+
+ALIGN 16
+;*******************************************************************************
+; void McChromaWidthEq8_sse2( uint8_t *pSrc,
+; int32_t iSrcStride,
+; uint8_t *pDst,
+; int32_t iDstStride,
+; uint8_t *pABCD, -- four byte weights, called A,B,C,D below in memory order
+; int32_t iheigh ); -- row count; the loop is do-while, so it must be >= 1
+;*******************************************************************************
+WELS_EXTERN McChromaWidthEq8_sse2
+McChromaWidthEq8_sse2:
+ ; Per output pixel x of each row (8 pixels per row):
+ ; dst[x] = (A*cur[x] + B*cur[x+1] + C*next[x] + D*next[x+1] + 32) >> 6
+ ; where cur/next are consecutive source rows.
+
+ %assign push_num 0
+ LOAD_6_PARA
+%ifndef X86_32
+ movsx r1, r1d ; widen the int arguments to full register width
+ movsx r3, r3d
+ movsx r5, r5d
+%endif
+
+ ; Expand the weights into eight words each: xmm3 = A, xmm5 = B, xmm4 = C, xmm6 = D
+ movd xmm3, [r4] ; bytes A B C D
+ WELS_Zero xmm7 ; xmm7 serves as the zero operand for the unpacks (WELS_Zero is defined outside this file)
+ punpcklbw xmm3, xmm3 ; A A B B C C D D
+ punpcklwd xmm3, xmm3 ; A x4, B x4, C x4, D x4
+
+ movdqa xmm4, xmm3
+ punpckldq xmm3, xmm3 ; A x8, B x8
+ punpckhdq xmm4, xmm4 ; C x8, D x8
+ movdqa xmm5, xmm3
+ movdqa xmm6, xmm4
+
+ punpcklbw xmm3, xmm7 ; xmm3 = A as 8 words
+ punpckhbw xmm5, xmm7 ; xmm5 = B as 8 words
+ punpcklbw xmm4, xmm7 ; xmm4 = C as 8 words
+ punpckhbw xmm6, xmm7 ; xmm6 = D as 8 words
+
+ ; From here on (r0-r5 are aliases defined outside this file):
+ ; r0 = first source row, r1 = source stride
+ ; r2 = destination pointer, r3 = destination stride
+ ; r4 = pointer to the next source row (its weight-pointer value is no longer needed)
+ ; r5 = remaining rows
+
+ lea r4, [r0 + r1] ; r4 = second source row
+ movq xmm0, [r0] ; xmm0 = cur[x] as words
+ movq xmm1, [r0+1] ; xmm1 = cur[x+1] as words
+ punpcklbw xmm0, xmm7
+ punpcklbw xmm1, xmm7
+.xloop:
+
+ pmullw xmm0, xmm3 ; A * cur[x]
+ pmullw xmm1, xmm5 ; B * cur[x+1]
+ paddw xmm0, xmm1
+
+ movq xmm1, [r4] ; next[x]
+ punpcklbw xmm1, xmm7
+ movdqa xmm2, xmm1 ; keep next[x]: it becomes cur[x] of the following row
+ pmullw xmm1, xmm4 ; C * next[x]
+ paddw xmm0, xmm1
+
+ movq xmm1, [r4+1] ; next[x+1]
+ punpcklbw xmm1, xmm7
+ movdqa xmm7, xmm1 ; park next[x+1] in xmm7 (zero is restored below)
+ pmullw xmm1, xmm6 ; D * next[x+1]
+ paddw xmm0, xmm1
+ movdqa xmm1,xmm7 ; xmm1 = next[x+1], i.e. cur[x+1] of the following row
+
+ paddw xmm0, [h264_d0x20_sse2] ; + 32 rounding
+ psrlw xmm0, 6
+
+ WELS_Zero xmm7 ; xmm7 back to its zero role
+ packuswb xmm0, xmm7 ; words -> bytes with unsigned saturation
+ movq [r2], xmm0 ; store 8 pixels
+
+ movdqa xmm0, xmm2 ; xmm0 = cur[x] of the following row
+
+ lea r2, [r2 + r3]
+ lea r4, [r4 + r1]
+
+ dec r5
+ jnz near .xloop
+
+ LOAD_6_PARA_POP
+
+ ; (no further registers to restore)
+ ;
+ ;
+ ret
+
+
+
+
+ALIGN 16
+;***********************************************************************
+; void McChromaWidthEq8_ssse3( uint8_t *pSrc,
+; int32_t iSrcStride,
+; uint8_t *pDst,
+; int32_t iDstStride,
+; uint8_t *pABCD, -- four byte weights, called A,B,C,D below in memory order
+; int32_t iHeigh); -- two rows per iteration: must be a positive multiple of 2
+;***********************************************************************
+WELS_EXTERN McChromaWidthEq8_ssse3
+McChromaWidthEq8_ssse3:
+ ; Per output pixel x of each row (8 pixels per row):
+ ; dst[x] = (A*cur[x] + B*cur[x+1] + C*next[x] + D*next[x+1] + 32) >> 6
+ ; computed with pmaddubsw on byte pairs (cur[x], cur[x+1]) x (A, B) etc.
+ %assign push_num 0
+ LOAD_6_PARA
+%ifndef X86_32
+ movsx r1, r1d ; widen the int arguments to full register width
+ movsx r3, r3d
+ movsx r5, r5d
+%endif
+
+ ; Build the pmaddubsw weight vectors: xmm5 = (A,B) x8, xmm6 = (C,D) x8
+
+ ; (xmm7 needs no clearing here: its first use is the rounding-constant load below)
+ movd xmm5, [r4] ; bytes A B C D
+ punpcklwd xmm5, xmm5 ; AB AB CD CD
+ punpckldq xmm5, xmm5 ; AB x4 | CD x4
+ movdqa xmm6, xmm5
+ punpcklqdq xmm5, xmm5 ; AB x8
+ punpckhqdq xmm6, xmm6 ; CD x8
+
+ ; From here on (r0-r5 are aliases defined outside this file):
+ ; r0 = current source row, r1 = source stride
+ ; r2 = destination pointer, r3 = destination stride
+ ; r5 = remaining rows
+ ; xmm7 = rounding constant (eight words of 32)
+
+ sub r2, r3 ; pre-bias pDst by -2 rows: the loop adds 2 rows before storing
+ sub r2, r3
+ movdqa xmm7, [h264_d0x20_sse2]
+
+ movdqu xmm0, [r0] ; 16 source bytes of the first row
+ movdqa xmm1, xmm0
+ psrldq xmm1, 1 ; the same row shifted by one pixel
+ punpcklbw xmm0, xmm1 ; xmm0 = pairs (cur[x], cur[x+1]) for x = 0..7
+
+.hloop_chroma:
+ lea r2, [r2+2*r3]
+
+ movdqu xmm2, [r0+r1] ; row n+1
+ movdqa xmm3, xmm2
+ psrldq xmm3, 1
+ punpcklbw xmm2, xmm3 ; pairs for row n+1
+ movdqa xmm4, xmm2 ; keep them: row n+1 is "cur" for the second output row
+
+ pmaddubsw xmm0, xmm5 ; A*cur[x] + B*cur[x+1]
+ pmaddubsw xmm2, xmm6 ; C*next[x] + D*next[x+1]
+ paddw xmm0, xmm2
+ paddw xmm0, xmm7 ; + 32 rounding
+ psrlw xmm0, 6
+ packuswb xmm0, xmm0
+ movq [r2],xmm0 ; first output row of this iteration
+
+ lea r0, [r0+2*r1]
+ movdqu xmm2, [r0] ; row n+2
+ movdqa xmm3, xmm2
+ psrldq xmm3, 1
+ punpcklbw xmm2, xmm3 ; pairs for row n+2
+ movdqa xmm0, xmm2 ; carried into the next iteration as its "cur" row
+
+ pmaddubsw xmm4, xmm5 ; A,B applied to row n+1
+ pmaddubsw xmm2, xmm6 ; C,D applied to row n+2
+ paddw xmm4, xmm2
+ paddw xmm4, xmm7 ; + 32 rounding
+ psrlw xmm4, 6
+ packuswb xmm4, xmm4
+ movq [r2+r3],xmm4 ; second output row of this iteration
+
+ sub r5, 2
+ jnz .hloop_chroma
+
+ LOAD_6_PARA_POP
+
+ ; (no further registers to restore)
+ ;
+ ;
+
+ ret
+
+
--- /dev/null
+++ b/codec/common/mc_luma.asm
@@ -1,0 +1,1293 @@
+;*!
+;* \copy
+;* Copyright (c) 2009-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;* mc_luma.asm
+;*
+;* Abstract
+;* sse2 motion compensation
+;*
+;* History
+;* 17/08/2009 Created
+;*
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+
+;*******************************************************************************
+; Local Data (Read Only)
+;*******************************************************************************
+%ifdef FORMAT_COFF
+SECTION .rodata data ; COFF: give the non-standard .rodata section an explicit "data" type
+%else
+SECTION .rodata align=16
+%endif
+
+;*******************************************************************************
+; Various memory constants (trigonometric values or rounding values)
+;*******************************************************************************
+
+ALIGN 16
+h264_w0x10:
+ dw 16, 16, 16, 16
+ALIGN 16
+h264_w0x10_1:
+ dw 16, 16, 16, 16, 16, 16, 16, 16
+ALIGN 16
+h264_mc_hc_32:
+ dw 32, 32, 32, 32, 32, 32, 32, 32
+
+
+;*******************************************************************************
+; Code
+;*******************************************************************************
+
+SECTION .text
+
+WELS_EXTERN McHorVer20WidthEq4_mmx
+
+
+ALIGN 16
+;*******************************************************************************
+; void_t McHorVer20WidthEq4_mmx( uint8_t *pSrc,
+; int iSrcStride,
+; uint8_t *pDst,
+; int iDstStride,
+; int iHeight) -- row count; the loop is do-while, so it must be >= 1
+;*******************************************************************************
+McHorVer20WidthEq4_mmx:
+ ; Horizontal 6-tap filter with taps (1, -5, 20, 20, -5, 1), 4 pixels per row:
+ ; dst[x] = clip8((s[x-2] - 5*s[x-1] + 20*s[x] + 20*s[x+1] - 5*s[x+2] + s[x+3] + 16) >> 5)
+
+ ; Register use below (r0-r4 are aliases defined outside this file):
+ ; r0 = source pointer (moved 2 pixels left before the loop)
+ ; r1 = source stride
+ ; r2 = destination pointer, r3 = destination stride
+ ; r4 = remaining rows
+
+ %assign push_num 0
+ LOAD_5_PARA
+%ifndef X86_32
+ movsx r1, r1d ; widen the int arguments to full register width
+ movsx r3, r3d
+ movsx r4, r4d
+%endif
+
+ sub r0, 2 ; the leftmost tap sits 2 pixels left of the output position
+ WELS_Zero mm7 ; zero operand for the unpacks (WELS_Zero is defined outside this file)
+ movq mm6, [h264_w0x10] ; four words of 16 (rounding)
+.height_loop:
+ movd mm0, [r0] ; tap 0 (weight 1)
+ punpcklbw mm0, mm7
+ movd mm1, [r0+5] ; tap 5 (weight 1)
+ punpcklbw mm1, mm7
+ movd mm2, [r0+1] ; tap 1 (weight -5)
+ punpcklbw mm2, mm7
+ movd mm3, [r0+4] ; tap 4 (weight -5)
+ punpcklbw mm3, mm7
+ movd mm4, [r0+2] ; tap 2 (weight 20)
+ punpcklbw mm4, mm7
+ movd mm5, [r0+3] ; tap 3 (weight 20)
+ punpcklbw mm5, mm7
+
+ paddw mm2, mm3 ; t1 + t4
+ paddw mm4, mm5 ; t2 + t3
+ psllw mm4, 2 ; 4*(t2+t3)
+ psubw mm4, mm2 ; 4*(t2+t3) - (t1+t4)
+ paddw mm0, mm1 ; t0 + t5
+ paddw mm0, mm4 ; + 1 x [4*(t2+t3) - (t1+t4)]
+ psllw mm4, 2 ; 16*(t2+t3) - 4*(t1+t4)
+ paddw mm0, mm4 ; total: t0 + t5 + 20*(t2+t3) - 5*(t1+t4)
+ paddw mm0, mm6 ; + 16 rounding
+ psraw mm0, 5
+ packuswb mm0, mm7 ; saturate to 0..255
+ movd [r2], mm0 ; store 4 pixels
+
+ add r0, r1
+ add r2, r3
+ dec r4
+ jnz .height_loop
+
+ WELSEMMS
+ LOAD_5_PARA_POP
+ ret
+
+;*******************************************************************************
+; Macros and other preprocessor constants
+;*******************************************************************************
+
+
+%macro SSE_LOAD_8P 3
+ movq %1, %3 ; SSE_LOAD_8P dst, zero, mem: load 8 bytes from mem ...
+ punpcklbw %1, %2 ; ... and widen them to 8 words (zero-extended when %2 holds zero)
+%endmacro
+
+%macro FILTER_HV_W8 9
+ paddw %1, %6 ; args 1-6: six input rows as words (t0..t5); 7,8: scratch; 9: 8-byte destination
+ movdqa %8, %3
+ movdqa %7, %2
+ paddw %1, [h264_w0x10_1] ; t0 + t5 + 16 (rounding)
+ paddw %8, %4 ; t2 + t3
+ paddw %7, %5 ; t1 + t4
+ psllw %8, 2 ; 4*(t2+t3)
+ psubw %8, %7 ; 4*(t2+t3) - (t1+t4)
+ paddw %1, %8
+ psllw %8, 2 ; 16*(t2+t3) - 4*(t1+t4)
+ paddw %1, %8 ; total: t0 + t5 + 20*(t2+t3) - 5*(t1+t4) + 16
+ psraw %1, 5
+ WELS_Zero %8 ; scratch is reused as the zero operand for the pack (WELS_Zero is defined outside this file)
+ packuswb %1, %8 ; saturate to 0..255
+ movq %9, %1 ; store 8 pixels; %1, %7 and %8 no longer hold their input values
+%endmacro
+
+;*******************************************************************************
+; Code
+;*******************************************************************************
+
+SECTION .text
+WELS_EXTERN McHorVer22Width8HorFirst_sse2
+WELS_EXTERN McHorVer02WidthEq8_sse2
+WELS_EXTERN McHorVer20WidthEq8_sse2
+WELS_EXTERN McHorVer20WidthEq16_sse2
+
+ALIGN 16
+;***********************************************************************
+; void_t McHorVer22Width8HorFirst_sse2(int16_t *pSrc, -- NOTE(review): read below as 8-bit samples (movq + punpcklbw)
+; int16_t iSrcStride, -- NOTE(review): sign-extended from 32 bits below
+; uint8_t *pDst, -- NOTE(review): receives 8 x 16-bit words per row via movdqa
+; int32_t iDstStride
+; int32_t iHeight
+; )
+;***********************************************************************
+McHorVer22Width8HorFirst_sse2:
+ ; Horizontal pass of a two-pass filter: applies the 6 taps
+ ; (1, -5, 20, 20, -5, 1) to 8 pixels per row and stores the raw 16-bit
+ ; sums, with no rounding, shift or clipping.
+ ; Register use below (r0-r4 are aliases defined outside this file):
+ ; r0 = source pointer (moved up two rows before the loop; no horizontal adjustment is made here)
+ ; r1 = source stride
+ ; r2 = destination pointer, must be 16-byte aligned on every row (movdqa)
+ ; r3 = destination stride in bytes; r4 = remaining rows (do-while, so >= 1)
+
+ %assign push_num 0
+ LOAD_5_PARA
+%ifndef X86_32
+ movsx r1, r1d ; widen the int arguments to full register width
+ movsx r3, r3d
+ movsx r4, r4d
+%endif
+ pxor xmm7, xmm7 ; zero operand for the unpacks
+
+ sub r0, r1 ; start two rows above pSrc (original note: "need more 5 lines")
+ sub r0, r1
+
+.yloop_width_8:
+ movq xmm0, [r0] ; tap 0 (weight 1)
+ punpcklbw xmm0, xmm7
+ movq xmm1, [r0+5] ; tap 5 (weight 1)
+ punpcklbw xmm1, xmm7
+ movq xmm2, [r0+1] ; tap 1 (weight -5)
+ punpcklbw xmm2, xmm7
+ movq xmm3, [r0+4] ; tap 4 (weight -5)
+ punpcklbw xmm3, xmm7
+ movq xmm4, [r0+2] ; tap 2 (weight 20)
+ punpcklbw xmm4, xmm7
+ movq xmm5, [r0+3] ; tap 3 (weight 20)
+ punpcklbw xmm5, xmm7
+
+ paddw xmm2, xmm3 ; t1 + t4
+ paddw xmm4, xmm5 ; t2 + t3
+ psllw xmm4, 2 ; 4*(t2+t3)
+ psubw xmm4, xmm2 ; 4*(t2+t3) - (t1+t4)
+ paddw xmm0, xmm1 ; t0 + t5
+ paddw xmm0, xmm4
+ psllw xmm4, 2 ; 16*(t2+t3) - 4*(t1+t4)
+ paddw xmm0, xmm4 ; total: t0 + t5 + 20*(t2+t3) - 5*(t1+t4)
+ movdqa [r2], xmm0 ; store eight 16-bit intermediate sums
+
+ add r0, r1
+ add r2, r3
+ dec r4
+ jnz .yloop_width_8
+ LOAD_5_PARA_POP
+ ret
+
+ALIGN 16
+;*******************************************************************************
+; void_t McHorVer20WidthEq8_sse2( uint8_t *pSrc,
+; int iSrcStride,
+; uint8_t *pDst,
+; int iDstStride,
+; int iHeight, -- row count; the loop is do-while, so it must be >= 1
+; );
+;*******************************************************************************
+McHorVer20WidthEq8_sse2:
+ ; Horizontal 6-tap filter with taps (1, -5, 20, 20, -5, 1), 8 pixels per row:
+ ; dst[x] = clip8((s[x-2] - 5*s[x-1] + 20*s[x] + 20*s[x+1] - 5*s[x+2] + s[x+3] + 16) >> 5)
+
+ ; Register use below (r0-r4 are aliases defined outside this file):
+ ; r0 = source pointer (moved 2 pixels left before the loop)
+ ; r1 = source stride
+ ; r2 = destination pointer, r3 = destination stride
+ ; r4 = remaining rows
+
+ %assign push_num 0
+ LOAD_5_PARA
+%ifndef X86_32
+ movsx r1, r1d ; widen the int arguments to full register width
+ movsx r3, r3d
+ movsx r4, r4d
+%endif
+ lea r0, [r0-2] ;pSrc -= 2;
+
+ pxor xmm7, xmm7 ; zero operand for the unpacks
+ movdqa xmm6, [h264_w0x10_1] ; eight words of 16 (rounding)
+.y_loop:
+ movq xmm0, [r0] ; tap 0 (weight 1)
+ punpcklbw xmm0, xmm7
+ movq xmm1, [r0+5] ; tap 5 (weight 1)
+ punpcklbw xmm1, xmm7
+ movq xmm2, [r0+1] ; tap 1 (weight -5)
+ punpcklbw xmm2, xmm7
+ movq xmm3, [r0+4] ; tap 4 (weight -5)
+ punpcklbw xmm3, xmm7
+ movq xmm4, [r0+2] ; tap 2 (weight 20)
+ punpcklbw xmm4, xmm7
+ movq xmm5, [r0+3] ; tap 3 (weight 20)
+ punpcklbw xmm5, xmm7
+
+ paddw xmm2, xmm3 ; t1 + t4
+ paddw xmm4, xmm5 ; t2 + t3
+ psllw xmm4, 2 ; 4*(t2+t3)
+ psubw xmm4, xmm2 ; 4*(t2+t3) - (t1+t4)
+ paddw xmm0, xmm1 ; t0 + t5
+ paddw xmm0, xmm4
+ psllw xmm4, 2 ; 16*(t2+t3) - 4*(t1+t4)
+ paddw xmm0, xmm4 ; total: t0 + t5 + 20*(t2+t3) - 5*(t1+t4)
+ paddw xmm0, xmm6 ; + 16 rounding
+ psraw xmm0, 5
+
+ packuswb xmm0, xmm7 ; saturate to 0..255
+ movq [r2], xmm0 ; store 8 pixels
+
+ lea r2, [r2+r3]
+ lea r0, [r0+r1]
+ dec r4
+ jnz near .y_loop
+
+ LOAD_5_PARA_POP
+ ret
+
+ALIGN 16
+;*******************************************************************************
+; void_t McHorVer20WidthEq16_sse2( uint8_t *pSrc,
+; int iSrcStride,
+; uint8_t *pDst,
+; int iDstStride,
+; int iHeight, -- row count; the loop is do-while, so it must be >= 1
+; );
+;*******************************************************************************
+McHorVer20WidthEq16_sse2:
+ ; Horizontal 6-tap filter with taps (1, -5, 20, 20, -5, 1), 16 pixels per
+ ; row, done as two independent 8-pixel halves:
+ ; dst[x] = clip8((s[x-2] - 5*s[x-1] + 20*s[x] + 20*s[x+1] - 5*s[x+2] + s[x+3] + 16) >> 5)
+ ; Register use below (r0-r4 are aliases defined outside this file):
+ ; r0 = source pointer (moved 2 pixels left before the loop), r1 = source stride
+ ; r2 = destination pointer, r3 = destination stride
+ ; r4 = remaining rows
+
+ %assign push_num 0
+ LOAD_5_PARA
+%ifndef X86_32
+ movsx r1, r1d ; widen the int arguments to full register width
+ movsx r3, r3d
+ movsx r4, r4d
+%endif
+ lea r0, [r0-2] ;pSrc -= 2;
+
+ pxor xmm7, xmm7 ; zero operand for the unpacks
+ movdqa xmm6, [h264_w0x10_1] ; eight words of 16 (rounding)
+.y_loop:
+
+ movq xmm0, [r0] ; left half: tap 0 (weight 1)
+ punpcklbw xmm0, xmm7
+ movq xmm1, [r0+5] ; tap 5 (weight 1)
+ punpcklbw xmm1, xmm7
+ movq xmm2, [r0+1] ; tap 1 (weight -5)
+ punpcklbw xmm2, xmm7
+ movq xmm3, [r0+4] ; tap 4 (weight -5)
+ punpcklbw xmm3, xmm7
+ movq xmm4, [r0+2] ; tap 2 (weight 20)
+ punpcklbw xmm4, xmm7
+ movq xmm5, [r0+3] ; tap 3 (weight 20)
+ punpcklbw xmm5, xmm7
+
+ paddw xmm2, xmm3 ; t1 + t4
+ paddw xmm4, xmm5 ; t2 + t3
+ psllw xmm4, 2 ; 4*(t2+t3)
+ psubw xmm4, xmm2 ; 4*(t2+t3) - (t1+t4)
+ paddw xmm0, xmm1 ; t0 + t5
+ paddw xmm0, xmm4
+ psllw xmm4, 2 ; 16*(t2+t3) - 4*(t1+t4)
+ paddw xmm0, xmm4 ; total: t0 + t5 + 20*(t2+t3) - 5*(t1+t4)
+ paddw xmm0, xmm6 ; + 16 rounding
+ psraw xmm0, 5
+ packuswb xmm0, xmm7 ; saturate to 0..255
+ movq [r2], xmm0 ; store pixels 0-7
+
+ movq xmm0, [r0+8] ; right half: same taps, 8 bytes further on
+ punpcklbw xmm0, xmm7
+ movq xmm1, [r0+5+8]
+ punpcklbw xmm1, xmm7
+ movq xmm2, [r0+1+8]
+ punpcklbw xmm2, xmm7
+ movq xmm3, [r0+4+8]
+ punpcklbw xmm3, xmm7
+ movq xmm4, [r0+2+8]
+ punpcklbw xmm4, xmm7
+ movq xmm5, [r0+3+8]
+ punpcklbw xmm5, xmm7
+
+ paddw xmm2, xmm3 ; t1 + t4
+ paddw xmm4, xmm5 ; t2 + t3
+ psllw xmm4, 2
+ psubw xmm4, xmm2
+ paddw xmm0, xmm1 ; t0 + t5
+ paddw xmm0, xmm4
+ psllw xmm4, 2
+ paddw xmm0, xmm4 ; total: t0 + t5 + 20*(t2+t3) - 5*(t1+t4)
+ paddw xmm0, xmm6 ; + 16 rounding
+ psraw xmm0, 5
+ packuswb xmm0, xmm7 ; saturate to 0..255
+ movq [r2+8], xmm0 ; store pixels 8-15
+
+ lea r2, [r2+r3]
+ lea r0, [r0+r1]
+ dec r4
+ jnz near .y_loop
+
+ LOAD_5_PARA_POP
+ ret
+
+
+;*******************************************************************************
+; void_t McHorVer02WidthEq8_sse2( uint8_t *pSrc,
+; int iSrcStride,
+; uint8_t *pDst,
+; int iDstStride,
+; int iHeight ) -- vertical filter over six consecutive source rows, 8 pixels wide
+;*******************************************************************************
+ALIGN 16
+McHorVer02WidthEq8_sse2:
+ ;push esi ; (old x86-32 prologue, kept commented out for reference)
+ ;push edi
+ ;mov esi, [esp + 12] ;pSrc
+ ;mov edx, [esp + 16] ;iSrcStride
+ ;mov edi, [esp + 20] ;pDst
+ ;mov eax, [esp + 24] ;iDstStride
+ ;mov ecx, [esp + 28] ;iHeight
+
+ %assign push_num 0
+ LOAD_5_PARA ; macro defined outside this chunk; presumably binds the 5 arguments to r0..r4 in declaration order
+%ifndef X86_32
+ movsx r1, r1d ; non-32-bit builds: sign-extend the int arguments
+ movsx r3, r3d
+ movsx r4, r4d
+%endif
+ sub r0, r1 ; start two rows above pSrc
+ sub r0, r1
+
+ WELS_Zero xmm7 ; macro defined outside this chunk; presumably clears xmm7
+
+ SSE_LOAD_8P xmm0, xmm7, [r0] ; prime the window with six rows in xmm0..xmm5 (macro defined outside this chunk)
+ SSE_LOAD_8P xmm1, xmm7, [r0+r1]
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm2, xmm7, [r0]
+ SSE_LOAD_8P xmm3, xmm7, [r0+r1]
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm4, xmm7, [r0]
+ SSE_LOAD_8P xmm5, xmm7, [r0+r1]
+
+.start: ; unrolled 8x: the register roles rotate by one each step, so no register copies are needed
+ FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2] ; six row registers, two scratch registers, destination
+ dec r4 ; r4 counts the remaining output rows
+ jz near .xx_exit
+
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm6, xmm7, [r0] ; NOTE(review): relies on FILTER_HV_W8 leaving its 8th operand usable as the zero register - confirm against the macro
+ FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
+ dec r4
+ jz near .xx_exit
+
+ lea r2, [r2+2*r3] ; destination advances two rows per pair of steps
+ SSE_LOAD_8P xmm7, xmm0, [r0+r1]
+ FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
+ dec r4
+ jz near .xx_exit
+
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm0, xmm1, [r0]
+ FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
+ dec r4
+ jz near .xx_exit
+
+ lea r2, [r2+2*r3]
+ SSE_LOAD_8P xmm1, xmm2, [r0+r1]
+ FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
+ dec r4
+ jz near .xx_exit
+
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm2, xmm3, [r0]
+ FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
+ dec r4
+ jz near .xx_exit
+
+ lea r2, [r2+2*r3]
+ SSE_LOAD_8P xmm3, xmm4, [r0+r1]
+ FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
+ dec r4
+ jz near .xx_exit
+
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm4, xmm5, [r0]
+ FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
+ dec r4
+ jz near .xx_exit
+
+ lea r2, [r2+2*r3]
+ SSE_LOAD_8P xmm5, xmm6, [r0+r1] ; after 8 steps the rotation is back to xmm0..xmm5, matching .start
+ jmp near .start
+
+.xx_exit:
+ LOAD_5_PARA_POP ; counterpart of LOAD_5_PARA (defined outside this chunk)
+ ret
+
+;***********************************************************************
+; Code -- every symbol named in the WELS_EXTERN list here is defined further down in this file
+;***********************************************************************
+
+SECTION .text
+
+WELS_EXTERN McHorVer20Width9Or17_sse2
+WELS_EXTERN McHorVer02Height9Or17_sse2
+WELS_EXTERN McHorVer22Width8VerLastAlign_sse2
+WELS_EXTERN McHorVer22Width8VerLastUnAlign_sse2
+WELS_EXTERN McHorVer22HorFirst_sse2
+
+
+;***********************************************************************
+; void McHorVer02Height9Or17_sse2( uint8_t *pSrc,
+; int32_t iSrcStride,
+; uint8_t *pDst,
+; int32_t iDstStride,
+; int32_t iWidth,
+; int32_t iHeight ) -- vertical filter, processed in (iWidth >> 3) columns of 8 pixels
+;***********************************************************************
+ALIGN 16
+McHorVer02Height9Or17_sse2:
+ ;push esi ; (old x86-32 prologue, kept commented out for reference)
+ ;push edi
+ ;push ebx
+
+ ;mov esi, [esp + 16]
+ ;mov edx, [esp + 20]
+ ;mov edi, [esp + 24]
+ ;mov eax, [esp + 28]
+ ;mov ecx, [esp + 36]
+ ;mov ebx, [esp + 32]
+
+ %assign push_num 0
+ LOAD_6_PARA ; macro defined outside this chunk; presumably binds the 6 arguments to r0..r5 in declaration order
+%ifndef X86_32
+ movsx r1, r1d ; non-32-bit builds: sign-extend the int arguments
+ movsx r3, r3d
+ movsx r4, r4d
+ movsx r5, r5d
+%endif
+
+%ifndef X86_32
+ push r12 ; r12..r14 keep the original pSrc / pDst / iHeight so each column pass can restart from them
+ push r13
+ push r14
+ mov r12, r0
+ mov r13, r2
+ mov r14, r5
+%endif
+
+ shr r4, 3 ; r4 = number of 8-pixel columns
+ sub r0, r1 ; start two rows above pSrc
+ sub r0, r1
+
+.xloop: ; one pass per 8-pixel column
+ WELS_Zero xmm7
+ SSE_LOAD_8P xmm0, xmm7, [r0] ; prime the window with six rows in xmm0..xmm5
+ SSE_LOAD_8P xmm1, xmm7, [r0+r1]
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm2, xmm7, [r0]
+ SSE_LOAD_8P xmm3, xmm7, [r0+r1]
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm4, xmm7, [r0]
+ SSE_LOAD_8P xmm5, xmm7, [r0+r1]
+
+ FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2] ; first output row is handled outside the unrolled loop
+ dec r5 ; no zero test here: at least one more row is assumed
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm6, xmm7, [r0] ; fetch the next source row
+ movdqa xmm0,xmm1 ; shift the six-row window down by one row
+ movdqa xmm1,xmm2
+ movdqa xmm2,xmm3
+ movdqa xmm3,xmm4
+ movdqa xmm4,xmm5
+ movdqa xmm5,xmm6
+ add r2, r3
+ sub r0, r1 ; restore the invariant used at .start: xmm4 holds row [r0], xmm5 holds row [r0+r1]
+
+.start: ; unrolled 8x: the register roles rotate by one each step
+ FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+ dec r5 ; r5 counts the remaining output rows of this column
+ jz near .x_loop_dec
+
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm6, xmm7, [r0]
+ FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r2, [r2+2*r3]
+ SSE_LOAD_8P xmm7, xmm0, [r0+r1]
+ FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm0, xmm1, [r0]
+ FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r2, [r2+2*r3]
+ SSE_LOAD_8P xmm1, xmm2, [r0+r1]
+ FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm2, xmm3, [r0]
+ FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r2, [r2+2*r3]
+ SSE_LOAD_8P xmm3, xmm4, [r0+r1]
+ FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r0, [r0+2*r1]
+ SSE_LOAD_8P xmm4, xmm5, [r0]
+ FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r2, [r2+2*r3]
+ SSE_LOAD_8P xmm5, xmm6, [r0+r1] ; rotation is back to xmm0..xmm5, matching .start
+ jmp near .start
+
+.x_loop_dec:
+ dec r4 ; one 8-pixel column finished
+ jz near .xx_exit
+ ;mov esi, [esp + 16] ; (old x86-32 reload, kept for reference)
+ ;mov edi, [esp + 24]
+ ;mov ecx, [esp + 36]
+%ifdef X86_32
+ mov r0, arg1 ; arg1/arg3/arg6 are defined outside this chunk; presumably the original stack arguments
+ mov r2, arg3
+ mov r5, arg6
+%else
+ mov r0, r12 ; reload the original pSrc / pDst / iHeight saved at entry
+ mov r2, r13
+ mov r5, r14
+%endif
+ sub r0, r1
+ sub r0, r1
+ add r0, 8 ; NOTE(review): the saved base pointers are never advanced, so every later pass starts at column 8 - correct only for at most two columns
+ add r2, 8
+ jmp near .xloop
+
+.xx_exit:
+%ifndef X86_32
+ pop r14
+ pop r13
+ pop r12
+%endif
+ LOAD_6_PARA_POP ; counterpart of LOAD_6_PARA (defined outside this chunk)
+ ret
+
+
+ALIGN 16
+;***********************************************************************
+; void McHorVer20Width9Or17_sse2( uint8_t *pSrc,
+; int32_t iSrcStride,
+; uint8_t *pDst,
+; int32_t iDstStride,
+; int32_t iWidth,
+; int32_t iHeight
+; ); -- horizontal 6-tap filter writing 9 pixels per row when iWidth == 9, otherwise 17
+;***********************************************************************
+McHorVer20Width9Or17_sse2:
+ ;push esi ; (old x86-32 prologue, kept commented out for reference)
+ ;push edi
+ ;push ebx
+ ;mov esi, [esp+16]
+ ;mov eax, [esp+20]
+ ;mov edi, [esp+24]
+ ;mov edx, [esp+28]
+ ;mov ecx, [esp+32]
+ ;mov ebx, [esp+36]
+
+ %assign push_num 0
+ LOAD_6_PARA ; macro defined outside this chunk; presumably binds the 6 arguments to r0..r5 in declaration order
+%ifndef X86_32
+ movsx r1, r1d ; non-32-bit builds: sign-extend the int arguments
+ movsx r3, r3d
+ movsx r4, r4d
+ movsx r5, r5d
+%endif
+ sub r0, 2 ; the six taps p0..p5 are read at [r0]..[r0+5]
+ pxor xmm7, xmm7 ; xmm7 = 0 for byte-to-word widening
+
+ cmp r4, 9
+ jne near .width_17 ; any width other than 9 takes the 17-wide path
+
+.yloop_width_9:
+ movq xmm0, [r0] ; p0
+ punpcklbw xmm0, xmm7
+ movq xmm1, [r0+5] ; p5
+ punpcklbw xmm1, xmm7
+ movq xmm2, [r0+1] ; p1
+ punpcklbw xmm2, xmm7
+ movq xmm3, [r0+4] ; p4
+ punpcklbw xmm3, xmm7
+ movq xmm4, [r0+2] ; p2
+ punpcklbw xmm4, xmm7
+ movq xmm5, [r0+3] ; p3
+ punpcklbw xmm5, xmm7
+
+ movdqa xmm7, xmm2 ; work on copies (xmm7, xmm6) so xmm1..xmm5 survive for the second, shifted filter
+ paddw xmm7, xmm3 ; p1+p4
+ movdqa xmm6, xmm4
+ paddw xmm6, xmm5 ; p2+p3
+ psllw xmm6, 2
+ psubw xmm6, xmm7 ; 4*(p2+p3) - (p1+p4)
+ paddw xmm0, xmm1 ; p0+p5
+ paddw xmm0, xmm6
+ psllw xmm6, 2
+ paddw xmm0, xmm6 ; p0+p5 - 5*(p1+p4) + 20*(p2+p3)
+ paddw xmm0, [h264_w0x10_1] ; constant added before the >>5 (table defined outside this chunk)
+ psraw xmm0, 5
+ packuswb xmm0, xmm0
+ movd [r2], xmm0 ; only output pixels 0..3 are stored from this pass
+
+ pxor xmm7, xmm7 ; xmm7 was used as scratch above; make it zero again
+ movq xmm0, [r0+6] ; extra tap for the window shifted right by one pixel
+ punpcklbw xmm0, xmm7
+
+ paddw xmm4, xmm1 ; shifted window: taps [r0+2] + [r0+5]
+ paddw xmm5, xmm3 ; taps [r0+3] + [r0+4]
+ psllw xmm5, 2
+ psubw xmm5, xmm4
+ paddw xmm2, xmm0 ; taps [r0+1] + [r0+6]
+ paddw xmm2, xmm5
+ psllw xmm5, 2
+ paddw xmm2, xmm5
+ paddw xmm2, [h264_w0x10_1]
+ psraw xmm2, 5
+ packuswb xmm2, xmm2
+ movq [r2+1], xmm2 ; output pixels 1..8 (overlaps the 4 bytes stored above)
+
+ add r0, r1 ; next source row
+ add r2, r3 ; next destination row
+ dec r5 ; r5 counts the remaining rows
+ jnz .yloop_width_9
+ LOAD_6_PARA_POP
+ ret
+
+
+.width_17:
+.yloop_width_17:
+ movq xmm0, [r0] ; output pixels 0..7: same tap layout as above
+ punpcklbw xmm0, xmm7
+ movq xmm1, [r0+5]
+ punpcklbw xmm1, xmm7
+ movq xmm2, [r0+1]
+ punpcklbw xmm2, xmm7
+ movq xmm3, [r0+4]
+ punpcklbw xmm3, xmm7
+ movq xmm4, [r0+2]
+ punpcklbw xmm4, xmm7
+ movq xmm5, [r0+3]
+ punpcklbw xmm5, xmm7
+
+ paddw xmm2, xmm3
+ paddw xmm4, xmm5
+ psllw xmm4, 2
+ psubw xmm4, xmm2
+ paddw xmm0, xmm1
+ paddw xmm0, xmm4
+ psllw xmm4, 2
+ paddw xmm0, xmm4
+ paddw xmm0, [h264_w0x10_1]
+ psraw xmm0, 5
+ packuswb xmm0, xmm0
+ movq [r2], xmm0 ; store output pixels 0..7
+
+ movq xmm0, [r0+8] ; output pixels 8..16 reuse the 9-wide scheme at offset 8
+ punpcklbw xmm0, xmm7
+ movq xmm1, [r0+5+8]
+ punpcklbw xmm1, xmm7
+ movq xmm2, [r0+1+8]
+ punpcklbw xmm2, xmm7
+ movq xmm3, [r0+4+8]
+ punpcklbw xmm3, xmm7
+ movq xmm4, [r0+2+8]
+ punpcklbw xmm4, xmm7
+ movq xmm5, [r0+3+8]
+ punpcklbw xmm5, xmm7
+
+ movdqa xmm7, xmm2 ; copies again, so xmm1..xmm5 survive for the shifted filter
+ paddw xmm7, xmm3
+ movdqa xmm6, xmm4
+ paddw xmm6, xmm5
+ psllw xmm6, 2
+ psubw xmm6, xmm7
+ paddw xmm0, xmm1
+ paddw xmm0, xmm6
+ psllw xmm6, 2
+ paddw xmm0, xmm6
+ paddw xmm0, [h264_w0x10_1]
+ psraw xmm0, 5
+ packuswb xmm0, xmm0
+ movd [r2+8], xmm0 ; output pixels 8..11
+
+
+ pxor xmm7, xmm7 ; restore the zero register
+ movq xmm0, [r0+6+8]
+ punpcklbw xmm0, xmm7
+
+ paddw xmm4, xmm1
+ paddw xmm5, xmm3
+ psllw xmm5, 2
+ psubw xmm5, xmm4
+ paddw xmm2, xmm0
+ paddw xmm2, xmm5
+ psllw xmm5, 2
+ paddw xmm2, xmm5
+ paddw xmm2, [h264_w0x10_1]
+ psraw xmm2, 5
+ packuswb xmm2, xmm2
+ movq [r2+9], xmm2 ; output pixels 9..16
+ add r0, r1
+ add r2, r3
+ dec r5
+ jnz .yloop_width_17
+ LOAD_6_PARA_POP
+ ret
+
+
+
+ALIGN 16
+;***********************************************************************
+;void McHorVer22HorFirst_sse2
+; (uint8_t *pSrc,
+; int32_t iSrcStride,
+; uint8_t * pTap,
+; int32_t iTapStride,
+; int32_t iWidth,int32_t iHeight); -- horizontal 6-tap sums stored as 16-bit words (no rounding, shift or clipping)
+;***********************************************************************
+McHorVer22HorFirst_sse2:
+ ;push esi ; (old x86-32 prologue, kept commented out for reference)
+ ;push edi
+ ;push ebx
+ ;mov esi, [esp+16]
+ ;mov eax, [esp+20]
+ ;mov edi, [esp+24]
+ ;mov edx, [esp+28]
+ ;mov ecx, [esp+32]
+ ;mov ebx, [esp+36]
+
+ %assign push_num 0
+ LOAD_6_PARA ; macro defined outside this chunk; presumably binds the 6 arguments to r0..r5 in declaration order
+%ifndef X86_32
+ movsx r1, r1d ; non-32-bit builds: sign-extend the int arguments
+ movsx r3, r3d
+ movsx r4, r4d
+ movsx r5, r5d
+%endif
+ pxor xmm7, xmm7
+ sub r0, r1 ;need 5 more lines: start two rows above pSrc (presumably for the later vertical pass)
+ sub r0, r1 ; note: no horizontal offset is applied here; taps are read at [r0]..[r0+5]
+
+ cmp r4, 9
+ jne near .width_17 ; any width other than 9 takes the 17-wide path
+
+.yloop_width_9:
+ movq xmm0, [r0] ; p0
+ punpcklbw xmm0, xmm7
+ movq xmm1, [r0+5] ; p5
+ punpcklbw xmm1, xmm7
+ movq xmm2, [r0+1] ; p1
+ punpcklbw xmm2, xmm7
+ movq xmm3, [r0+4] ; p4
+ punpcklbw xmm3, xmm7
+ movq xmm4, [r0+2] ; p2
+ punpcklbw xmm4, xmm7
+ movq xmm5, [r0+3] ; p3
+ punpcklbw xmm5, xmm7
+
+ movdqa xmm7, xmm2 ; work on copies so xmm1..xmm5 survive for the shifted filter below
+ paddw xmm7, xmm3
+ movdqa xmm6, xmm4
+ paddw xmm6, xmm5
+ psllw xmm6, 2
+ psubw xmm6, xmm7
+ paddw xmm0, xmm1
+ paddw xmm0, xmm6
+ psllw xmm6, 2
+ paddw xmm0, xmm6 ; p0+p5 - 5*(p1+p4) + 20*(p2+p3), kept as words
+ movd [r2], xmm0 ; 4 bytes = tap words 0..1
+
+ pxor xmm7, xmm7 ; restore the zero register
+ movq xmm0, [r0+6] ; extra tap for the window shifted right by one pixel
+ punpcklbw xmm0, xmm7
+
+ paddw xmm4, xmm1
+ paddw xmm5, xmm3
+ psllw xmm5, 2
+ psubw xmm5, xmm4
+ paddw xmm2, xmm0
+ paddw xmm2, xmm5
+ psllw xmm5, 2
+ paddw xmm2, xmm5
+ movq [r2+2], xmm2 ; tap words 1..4
+ movhps [r2+2+8], xmm2 ; tap words 5..8 (upper half of xmm2)
+
+ add r0, r1 ; next source row
+ add r2, r3 ; next tap row
+ dec r5 ; r5 counts the remaining rows
+ jnz .yloop_width_9
+ LOAD_6_PARA_POP
+ ret
+
+
+.width_17:
+.yloop_width_17:
+ movq xmm0, [r0] ; tap words 0..7: same tap layout as above
+ punpcklbw xmm0, xmm7
+ movq xmm1, [r0+5]
+ punpcklbw xmm1, xmm7
+ movq xmm2, [r0+1]
+ punpcklbw xmm2, xmm7
+ movq xmm3, [r0+4]
+ punpcklbw xmm3, xmm7
+ movq xmm4, [r0+2]
+ punpcklbw xmm4, xmm7
+ movq xmm5, [r0+3]
+ punpcklbw xmm5, xmm7
+
+ paddw xmm2, xmm3
+ paddw xmm4, xmm5
+ psllw xmm4, 2
+ psubw xmm4, xmm2
+ paddw xmm0, xmm1
+ paddw xmm0, xmm4
+ psllw xmm4, 2
+ paddw xmm0, xmm4
+ movdqa [r2], xmm0 ; aligned 16-byte store: each tap row must start on a 16-byte boundary
+
+ movq xmm0, [r0+8] ; remaining words reuse the 9-wide scheme at pixel offset 8
+ punpcklbw xmm0, xmm7
+ movq xmm1, [r0+5+8]
+ punpcklbw xmm1, xmm7
+ movq xmm2, [r0+1+8]
+ punpcklbw xmm2, xmm7
+ movq xmm3, [r0+4+8]
+ punpcklbw xmm3, xmm7
+ movq xmm4, [r0+2+8]
+ punpcklbw xmm4, xmm7
+ movq xmm5, [r0+3+8]
+ punpcklbw xmm5, xmm7
+
+ movdqa xmm7, xmm2
+ paddw xmm7, xmm3
+ movdqa xmm6, xmm4
+ paddw xmm6, xmm5
+ psllw xmm6, 2
+ psubw xmm6, xmm7
+ paddw xmm0, xmm1
+ paddw xmm0, xmm6
+ psllw xmm6, 2
+ paddw xmm0, xmm6
+ movd [r2+16], xmm0 ; tap words 8..9
+
+
+ pxor xmm7, xmm7 ; restore the zero register
+ movq xmm0, [r0+6+8]
+ punpcklbw xmm0, xmm7
+
+ paddw xmm4, xmm1
+ paddw xmm5, xmm3
+ psllw xmm5, 2
+ psubw xmm5, xmm4
+ paddw xmm2, xmm0
+ paddw xmm2, xmm5
+ psllw xmm5, 2
+ paddw xmm2, xmm5
+ movq [r2+18], xmm2 ; tap words 9..12
+ movhps [r2+18+8], xmm2 ; tap words 13..16
+
+ add r0, r1
+ add r2, r3
+ dec r5
+ jnz .yloop_width_17
+ LOAD_6_PARA_POP
+ ret
+
+
+%macro FILTER_VER 9 ; %1..%6 = six rows of 16-bit taps (a..f), %7/%8 = scratch, %9 = 8-byte destination; %1 is destroyed
+ paddw %1, %6 ; a+f
+ movdqa %7, %2
+ movdqa %8, %3
+
+
+ paddw %7, %5 ; b+e
+ paddw %8, %4 ; c+d
+
+ psubw %1, %7 ; (a+f) - (b+e)
+ psraw %1, 2
+ paddw %1, %8
+ psubw %1, %7 ; ((a+f)-(b+e))>>2 + (c+d) - (b+e)
+ psraw %1, 2
+ paddw %8, %1 ; staged form of (a+f - 5*(b+e) + 20*(c+d)) / 16 that stays within 16-bit words
+ paddw %8, [h264_mc_hc_32] ; constant added before the >>6 (table defined outside this chunk)
+ psraw %8, 6
+ packuswb %8, %8 ; saturate to unsigned bytes
+ movq %9, %8 ; store 8 output pixels
+%endmacro
+;***********************************************************************
+;void McHorVer22Width8VerLastAlign_sse2(
+; uint8_t *pTap,
+; int32_t iTapStride,
+; uint8_t * pDst,
+; int32_t iDstStride,
+; int32_t iWidth,
+; int32_t iHeight); -- vertical pass over 16-bit taps; tap rows are read with aligned loads (movdqa)
+;***********************************************************************
+
+ McHorVer22Width8VerLastAlign_sse2:
+ ;push esi ; (old x86-32 prologue, kept commented out for reference)
+ ;push edi
+ ;push ebx
+ ;push ebp
+
+ ;mov esi, [esp+20]
+ ;mov eax, [esp+24]
+ ;mov edi, [esp+28]
+ ;mov edx, [esp+32]
+ ;mov ebx, [esp+36]
+ ;mov ecx, [esp+40]
+
+ %assign push_num 0
+ LOAD_6_PARA ; macro defined outside this chunk; presumably binds the 6 arguments to r0..r5 in declaration order
+%ifndef X86_32
+ movsx r1, r1d ; non-32-bit builds: sign-extend the int arguments
+ movsx r3, r3d
+ movsx r4, r4d
+ movsx r5, r5d
+%endif
+%ifndef X86_32
+ push r12 ; r12..r14 keep the original pTap / pDst / iHeight for the next column pass
+ push r13
+ push r14
+ mov r12, r0
+ mov r13, r2
+ mov r14, r5
+%endif
+
+ shr r4, 3 ; r4 = number of 8-pixel columns
+
+.width_loop: ; one pass per 8-pixel column
+ movdqa xmm0, [r0] ; prime the window with six tap rows in xmm0..xmm5
+ movdqa xmm1, [r0+r1]
+ lea r0, [r0+2*r1]
+ movdqa xmm2, [r0]
+ movdqa xmm3, [r0+r1]
+ lea r0, [r0+2*r1]
+ movdqa xmm4, [r0]
+ movdqa xmm5, [r0+r1]
+
+ FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2] ; first output row is handled outside the unrolled loop
+ dec r5 ; no zero test here: at least one more row is assumed
+ lea r0, [r0+2*r1]
+ movdqa xmm6, [r0] ; fetch the next tap row
+
+ movdqa xmm0, xmm1 ; shift the six-row window down by one row
+ movdqa xmm1, xmm2
+ movdqa xmm2, xmm3
+ movdqa xmm3, xmm4
+ movdqa xmm4, xmm5
+ movdqa xmm5, xmm6
+
+ add r2, r3
+ sub r0, r1 ; restore the invariant used at .start: xmm4 holds row [r0], xmm5 holds row [r0+r1]
+
+.start: ; unrolled 8x: the register roles rotate by one each step
+ FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+ dec r5 ; r5 counts the remaining output rows of this column
+ jz near .x_loop_dec
+
+ lea r0, [r0+2*r1]
+ movdqa xmm6, [r0]
+ FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[r2+r3]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r2, [r2+2*r3]
+ movdqa xmm7, [r0+r1]
+ FILTER_VER xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r0, [r0+2*r1]
+ movdqa xmm0, [r0]
+ FILTER_VER xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[r2+r3]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r2, [r2+2*r3]
+ movdqa xmm1, [r0+r1]
+ FILTER_VER xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[r2]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r0, [r0+2*r1]
+ movdqa xmm2, [r0]
+ FILTER_VER xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[r2+r3]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r2, [r2+2*r3]
+ movdqa xmm3, [r0+r1]
+ FILTER_VER xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[r2]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r0, [r0+2*r1]
+ movdqa xmm4, [r0]
+ FILTER_VER xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [r2+r3]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r2, [r2+2*r3]
+ movdqa xmm5, [r0+r1] ; rotation is back to xmm0..xmm5, matching .start
+ jmp near .start
+
+.x_loop_dec:
+ dec r4 ; one 8-pixel column finished
+ jz near .exit
+ ;mov esi, [esp+20] ; (old x86-32 reload, kept for reference)
+ ;mov edi, [esp+28]
+ ;mov ecx, [esp+40]
+%ifdef X86_32
+ mov r0, arg1 ; arg1/arg3/arg6 are defined outside this chunk; presumably the original stack arguments
+ mov r2, arg3
+ mov r5, arg6
+%else
+ mov r0, r12 ; reload the original pTap / pDst / iHeight saved at entry
+ mov r2, r13
+ mov r5, r14
+%endif
+ add r0, 16 ; 8 taps * 2 bytes; NOTE(review): the base is never advanced, so this is correct only for at most two columns
+ add r2, 8 ; 8 output pixels
+ jmp .width_loop
+
+.exit:
+%ifndef X86_32
+ pop r14
+ pop r13
+ pop r12
+%endif
+ LOAD_6_PARA_POP ; counterpart of LOAD_6_PARA (defined outside this chunk)
+ ret
+
+;***********************************************************************
+;void McHorVer22Width8VerLastUnAlign_sse2(
+; uint8_t *pTap,
+; int32_t iTapStride,
+; uint8_t * pDst,
+; int32_t iDstStride,
+; int32_t iWidth,
+; int32_t iHeight); -- same as the Align variant, but tap rows are read with unaligned loads (movdqu)
+;***********************************************************************
+
+ McHorVer22Width8VerLastUnAlign_sse2:
+ ;push esi ; (old x86-32 prologue, kept commented out for reference)
+ ;push edi
+ ;push ebx
+ ;push ebp
+
+ ;mov esi, [esp+20]
+ ;mov eax, [esp+24]
+ ;mov edi, [esp+28]
+ ;mov edx, [esp+32]
+ ;mov ebx, [esp+36]
+ ;mov ecx, [esp+40]
+
+ %assign push_num 0
+ LOAD_6_PARA ; macro defined outside this chunk; presumably binds the 6 arguments to r0..r5 in declaration order
+%ifndef X86_32
+ movsx r1, r1d ; non-32-bit builds: sign-extend the int arguments
+ movsx r3, r3d
+ movsx r4, r4d
+ movsx r5, r5d
+%endif
+%ifndef X86_32
+ push r12 ; r12..r14 keep the original pTap / pDst / iHeight for the next column pass
+ push r13
+ push r14
+ mov r12, r0
+ mov r13, r2
+ mov r14, r5
+%endif
+ shr r4, 3 ; r4 = number of 8-pixel columns
+
+.width_loop: ; one pass per 8-pixel column
+ movdqu xmm0, [r0] ; prime the window with six tap rows in xmm0..xmm5
+ movdqu xmm1, [r0+r1]
+ lea r0, [r0+2*r1]
+ movdqu xmm2, [r0]
+ movdqu xmm3, [r0+r1]
+ lea r0, [r0+2*r1]
+ movdqu xmm4, [r0]
+ movdqu xmm5, [r0+r1]
+
+ FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2] ; first output row is handled outside the unrolled loop
+ dec r5 ; no zero test here: at least one more row is assumed
+ lea r0, [r0+2*r1]
+ movdqu xmm6, [r0] ; fetch the next tap row
+
+ movdqa xmm0, xmm1 ; shift the six-row window down by one row
+ movdqa xmm1, xmm2
+ movdqa xmm2, xmm3
+ movdqa xmm3, xmm4
+ movdqa xmm4, xmm5
+ movdqa xmm5, xmm6
+
+ add r2, r3
+ sub r0, r1 ; restore the invariant used at .start: xmm4 holds row [r0], xmm5 holds row [r0+r1]
+
+.start: ; unrolled 8x: the register roles rotate by one each step
+ FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
+ dec r5 ; r5 counts the remaining output rows of this column
+ jz near .x_loop_dec
+
+ lea r0, [r0+2*r1]
+ movdqu xmm6, [r0]
+ FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[r2+r3]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r2, [r2+2*r3]
+ movdqu xmm7, [r0+r1]
+ FILTER_VER xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r0, [r0+2*r1]
+ movdqu xmm0, [r0]
+ FILTER_VER xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[r2+r3]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r2, [r2+2*r3]
+ movdqu xmm1, [r0+r1]
+ FILTER_VER xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[r2]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r0, [r0+2*r1]
+ movdqu xmm2, [r0]
+ FILTER_VER xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[r2+r3]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r2, [r2+2*r3]
+ movdqu xmm3, [r0+r1]
+ FILTER_VER xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[r2]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r0, [r0+2*r1]
+ movdqu xmm4, [r0]
+ FILTER_VER xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [r2+r3]
+ dec r5
+ jz near .x_loop_dec
+
+ lea r2, [r2+2*r3]
+ movdqu xmm5, [r0+r1] ; rotation is back to xmm0..xmm5, matching .start
+ jmp near .start
+
+.x_loop_dec:
+ dec r4 ; one 8-pixel column finished
+ jz near .exit
+ ;mov esi, [esp+20] ; (old x86-32 reload, kept for reference)
+ ;mov edi, [esp+28]
+ ;mov ecx, [esp+40]
+%ifdef X86_32
+ mov r0, arg1 ; arg1/arg3/arg6 are defined outside this chunk; presumably the original stack arguments
+ mov r2, arg3
+ mov r5, arg6
+%else
+ mov r0, r12 ; reload the original pTap / pDst / iHeight saved at entry
+ mov r2, r13
+ mov r5, r14
+%endif
+ add r0, 16 ; 8 taps * 2 bytes; NOTE(review): the base is never advanced, so this is correct only for at most two columns
+ add r2, 8 ; 8 output pixels
+ jmp .width_loop
+
+.exit:
+%ifndef X86_32
+ pop r14
+ pop r13
+ pop r12
+%endif
+ LOAD_6_PARA_POP ; counterpart of LOAD_6_PARA (defined outside this chunk)
+ ret
\ No newline at end of file
--- a/codec/common/targets.mk
+++ b/codec/common/targets.mk
@@ -6,6 +6,14 @@
COMMON_OBJS += $(COMMON_CPP_SRCS:.cpp=.o)
ifeq ($(USE_ASM), Yes)
COMMON_ASM_SRCS=\
+ $(COMMON_SRCDIR)/./asm_inc.asm\
+ $(COMMON_SRCDIR)/./cpuid.asm\
+ $(COMMON_SRCDIR)/./deblock.asm\
+ $(COMMON_SRCDIR)/./expand_picture.asm\
+ $(COMMON_SRCDIR)/./mb_copy.asm\
+ $(COMMON_SRCDIR)/./mc_chroma.asm\
+ $(COMMON_SRCDIR)/./mc_luma.asm\
+ $(COMMON_SRCDIR)/./vaa.asm\
COMMON_OBJS += $(COMMON_ASM_SRCS:.asm=.o)
endif
@@ -13,6 +21,30 @@
OBJS += $(COMMON_OBJS)
$(COMMON_SRCDIR)/./logging.o: $(COMMON_SRCDIR)/./logging.cpp
$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(COMMON_CFLAGS) $(COMMON_INCLUDES) -c -o $(COMMON_SRCDIR)/./logging.o $(COMMON_SRCDIR)/./logging.cpp
+
+$(COMMON_SRCDIR)/./asm_inc.o: $(COMMON_SRCDIR)/./asm_inc.asm # asm_inc.asm is also pulled into vaa.asm via %include; here it is additionally assembled as its own object
+ $(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(COMMON_ASMFLAGS) $(COMMON_ASM_INCLUDES) -o $(COMMON_SRCDIR)/./asm_inc.o $(COMMON_SRCDIR)/./asm_inc.asm
+
+$(COMMON_SRCDIR)/./cpuid.o: $(COMMON_SRCDIR)/./cpuid.asm # one explicit rule per .asm source; all of them share the same $(ASM) command line
+ $(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(COMMON_ASMFLAGS) $(COMMON_ASM_INCLUDES) -o $(COMMON_SRCDIR)/./cpuid.o $(COMMON_SRCDIR)/./cpuid.asm
+
+$(COMMON_SRCDIR)/./deblock.o: $(COMMON_SRCDIR)/./deblock.asm
+ $(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(COMMON_ASMFLAGS) $(COMMON_ASM_INCLUDES) -o $(COMMON_SRCDIR)/./deblock.o $(COMMON_SRCDIR)/./deblock.asm
+
+$(COMMON_SRCDIR)/./expand_picture.o: $(COMMON_SRCDIR)/./expand_picture.asm
+ $(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(COMMON_ASMFLAGS) $(COMMON_ASM_INCLUDES) -o $(COMMON_SRCDIR)/./expand_picture.o $(COMMON_SRCDIR)/./expand_picture.asm
+
+$(COMMON_SRCDIR)/./mb_copy.o: $(COMMON_SRCDIR)/./mb_copy.asm
+ $(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(COMMON_ASMFLAGS) $(COMMON_ASM_INCLUDES) -o $(COMMON_SRCDIR)/./mb_copy.o $(COMMON_SRCDIR)/./mb_copy.asm
+
+$(COMMON_SRCDIR)/./mc_chroma.o: $(COMMON_SRCDIR)/./mc_chroma.asm
+ $(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(COMMON_ASMFLAGS) $(COMMON_ASM_INCLUDES) -o $(COMMON_SRCDIR)/./mc_chroma.o $(COMMON_SRCDIR)/./mc_chroma.asm
+
+$(COMMON_SRCDIR)/./mc_luma.o: $(COMMON_SRCDIR)/./mc_luma.asm
+ $(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(COMMON_ASMFLAGS) $(COMMON_ASM_INCLUDES) -o $(COMMON_SRCDIR)/./mc_luma.o $(COMMON_SRCDIR)/./mc_luma.asm
+
+$(COMMON_SRCDIR)/./vaa.o: $(COMMON_SRCDIR)/./vaa.asm
+ $(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(COMMON_ASMFLAGS) $(COMMON_ASM_INCLUDES) -o $(COMMON_SRCDIR)/./vaa.o $(COMMON_SRCDIR)/./vaa.asm
$(LIBPREFIX)common.$(LIBSUFFIX): $(COMMON_OBJS)
rm -f $(LIBPREFIX)common.$(LIBSUFFIX)
--- /dev/null
+++ b/codec/common/vaa.asm
@@ -1,0 +1,425 @@
+;*!
+;* \copy
+;* Copyright (c) 2010-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;* vaa.asm
+;*
+;* Abstract
+;* sse2 for pVaa routines
+;*
+;* History
+;* 04/14/2010 Created
+;* 06/07/2010 Added AnalysisVaaInfoIntra_sse2(ssse3)
+;* 06/10/2010 Tune rc_sad_frame_sse2 and got about 40% improvement
+;* 08/11/2010 Added abs_difference_mbrow_sse2 & sum_sqrsum_mbrow_sse2
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+
+
+;***********************************************************************
+; Macros and other preprocessor constants
+;***********************************************************************
+
+; horizontal sum of the 8 words of %1 into word 0; by comparison this outperforms the phaddw (SSSE3) sequence
+%macro SUM_WORD_8x2_SSE2 2 ; dst(pSrc), tmp
+ ; @sum_8x2 begin
+ pshufd %2, %1, 04Eh ; 01001110 B -- swap the two 64-bit halves
+ paddw %1, %2
+ pshuflw %2, %1, 04Eh ; 01001110 B -- swap the two word pairs of the low half
+ paddw %1, %2
+ pshuflw %2, %1, 0B1h ; 10110001 B -- swap adjacent words of the low half
+ paddw %1, %2 ; word 0 now holds the sum of all 8 input words
+ ; end of @sum_8x2
+%endmacro ; END of SUM_WORD_8x2_SSE2
+
+
+%macro VAA_AVG_BLOCK_SSE2 6 ; dst, t0, t1, t2, t3, t4 -- expects r0 = 16-byte-aligned row, r1/r2/r3 = 1x/2x/3x stride, xmm7 = 0
+ movdqa %1, [r0 ] ; line 0
+ movdqa %2, [r0+r1] ; line 1
+ movdqa %3, %1
+ punpcklbw %1, xmm7 ; line 0, columns 0..7 as words
+ punpckhbw %3, xmm7 ; line 0, columns 8..15 as words
+ movdqa %4, %2
+ punpcklbw %4, xmm7
+ punpckhbw %2, xmm7
+ paddw %1, %4 ; columns 0..7: line 0 + line 1
+ paddw %2, %3 ; columns 8..15: line 0 + line 1
+ movdqa %3, [r0+r2] ; line 2
+ movdqa %4, [r0+r3] ; line 3
+ movdqa %5, %3
+ punpcklbw %3, xmm7
+ punpckhbw %5, xmm7
+ movdqa %6, %4
+ punpcklbw %6, xmm7
+ punpckhbw %4, xmm7
+ paddw %3, %6
+ paddw %4, %5
+ paddw %1, %3 ; block 0, 1
+ paddw %2, %4 ; block 2, 3
+ pshufd %3, %1, 0B1h ; swap adjacent dwords, then add: pairs of column sums two columns apart
+ pshufd %4, %2, 0B1h
+ paddw %1, %3
+ paddw %2, %4
+ movdqa %3, %1
+ movdqa %4, %2
+ pshuflw %5, %1, 0B1h ; swap adjacent words in the low half
+ pshufhw %6, %3, 0B1h ; swap adjacent words in the high half
+ paddw %1, %5 ; low half of %1 = sum of block 0 (4 rows x columns 0..3)
+ paddw %3, %6 ; high half of %3 = sum of block 1 (columns 4..7)
+ pshuflw %5, %2, 0B1h
+ pshufhw %6, %4, 0B1h
+ paddw %2, %5 ; low half of %2 = sum of block 2 (columns 8..11)
+ paddw %4, %6 ; high half of %4 = sum of block 3 (columns 12..15)
+ punpcklwd %1, %2 ; interleave so the low four words end up as blocks 0,1,2,3
+ punpckhwd %3, %4
+ punpcklwd %1, %3
+ psraw %1, $4 ; divide by 16 = 4x4 block average
+%endmacro
+
+%macro VAA_AVG_BLOCK_SSSE3 6 ; dst, t0, t1, t2, t3, t4 -- same inputs as the SSE2 variant; %5/%6 only used as scratch for lines 2/3
+ movdqa %1, [r0 ] ; line 0
+ movdqa %2, [r0+r1] ; line 1
+ movdqa %3, %1
+ punpcklbw %1, xmm7 ; line 0, columns 0..7 as words
+ punpckhbw %3, xmm7 ; line 0, columns 8..15 as words
+ movdqa %4, %2
+ punpcklbw %4, xmm7
+ punpckhbw %2, xmm7
+ paddw %1, %4 ; columns 0..7: line 0 + line 1
+ paddw %2, %3 ; columns 8..15: line 0 + line 1
+ movdqa %3, [r0+r2] ; line 2
+ movdqa %4, [r0+r3] ; line 3
+ movdqa %5, %3
+ punpcklbw %3, xmm7
+ punpckhbw %5, xmm7
+ movdqa %6, %4
+ punpcklbw %6, xmm7
+ punpckhbw %4, xmm7
+ paddw %3, %6
+ paddw %4, %5
+ paddw %1, %3 ; block 0, 1
+ paddw %2, %4 ; block 2, 3
+ phaddw %1, %2 ; block[0]: 0-15, 16-31; block[1]: 32-47, 48-63; ..
+ phaddw %1, xmm7 ; block[0]: 0-15; block[1]: 16-31; block[2]: 32-47; block[3]: 48-63; ....
+ psraw %1, $4 ; divide by 16 = 4x4 block average; the upper four words are zero
+%endmacro
+
+
+
+;***********************************************************************
+; Local Data (Read Only)
+;***********************************************************************
+
+;SECTION .rodata align=16
+
+;ALIGN 16
+;pack1_8x2:
+; dw 1, 1, 1, 1, 1, 1, 1, 1
+
+;***********************************************************************
+; Code
+;***********************************************************************
+
+SECTION .text
+
+; , 6/7/2010
+
+WELS_EXTERN AnalysisVaaInfoIntra_sse2
+;***********************************************************************
+; int32_t AnalysisVaaInfoIntra_sse2( uint8_t *pDataY, const int32_t iLineSize ); -- returns sum(avg^2) - (sum(avg))^2/16 over the sixteen 4x4 block averages
+;***********************************************************************
+ALIGN 16
+AnalysisVaaInfoIntra_sse2:
+
+ %assign push_num 0
+ LOAD_2_PARA ; macro defined outside this chunk; presumably binds pDataY to r0 and iLineSize to r1
+ SIGN_EXTENTION r1,r1d ; macro defined outside this chunk; presumably sign-extends the 32-bit stride
+
+%ifdef X86_32
+ push r3 ; 32-bit builds only: save the extra registers used below
+ push r4
+ push r5
+ push r6
+ %assign push_num push_num+4
+%endif
+
+ mov r5,r7 ; r7 is used as the scratch-buffer base (presumably the stack-pointer alias)
+ and r5,0fh ; r5 = misalignment of r7 relative to 16 bytes
+ sub r7,r5 ; round r7 down to a 16-byte boundary
+ sub r7,32 ; reserve 32 bytes: sixteen 16-bit block averages
+
+
+ mov r2,r1
+ sal r2,$1 ;r2 = 2*iLineSize
+ mov r3,r2
+ add r3,r1 ;r3 = 3*iLineSize
+
+ mov r4,r2
+ sal r4,$1 ;r4 = 4*iLineSize
+
+ pxor xmm7, xmm7 ; zero register required by VAA_AVG_BLOCK_SSE2
+
+ ; loops
+ VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+ movq [r7], xmm0 ; averages of blocks 0..3 (rows 0..3)
+
+ lea r0, [r0+r4] ; advance four rows
+ VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+ movq [r7+8], xmm0 ; blocks 4..7
+
+ lea r0, [r0+r4]
+ VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+ movq [r7+16], xmm0 ; blocks 8..11
+
+ lea r0, [r0+r4]
+ VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+ movq [r7+24], xmm0 ; blocks 12..15
+
+ movdqa xmm0, [r7] ; block 0~7
+ movdqa xmm1, [r7+16] ; block 8~15
+ movdqa xmm2, xmm0
+ paddw xmm0, xmm1
+ SUM_WORD_8x2_SSE2 xmm0, xmm3 ; word 0 of xmm0 = sum of all 16 block averages
+
+ pmullw xmm1, xmm1 ; squares of blocks 8..15 (low 16 bits of each product)
+ pmullw xmm2, xmm2 ; squares of blocks 0..7
+ movdqa xmm3, xmm1
+ movdqa xmm4, xmm2
+ punpcklwd xmm1, xmm7 ; widen the squares to dwords before summing
+ punpckhwd xmm3, xmm7
+ punpcklwd xmm2, xmm7
+ punpckhwd xmm4, xmm7
+ paddd xmm1, xmm2
+ paddd xmm3, xmm4
+ paddd xmm1, xmm3
+ pshufd xmm2, xmm1, 01Bh ; reverse the dword order, then add
+ paddd xmm1, xmm2
+ pshufd xmm2, xmm1, 0B1h ; swap adjacent dwords, then add: every lane = sum of squares
+ paddd xmm1, xmm2
+
+
+
+ movd r2d, xmm0
+ and r2, 0ffffh ; keep only the low word (the 16-bit sum of averages)
+ mov r3, r2
+ imul r2, r3 ; sum^2
+ sar r2, $4 ; sum^2 / 16
+ movd retrd, xmm1 ; sum of squares
+ sub retrd, r2d ; return value = sum of squares - sum^2/16
+
+ add r7,32 ; release the scratch buffer
+ add r7,r5 ; undo the alignment adjustment
+
+%ifdef X86_32
+ pop r6
+ pop r5
+ pop r4
+ pop r3
+%endif
+
+ ret
+
+WELS_EXTERN AnalysisVaaInfoIntra_ssse3
+;***********************************************************************
+; int32_t AnalysisVaaInfoIntra_ssse3( uint8_t *pDataY, const int32_t iLineSize ); -- same result as the SSE2 variant; block averages use phaddw
+;***********************************************************************
+ALIGN 16
+AnalysisVaaInfoIntra_ssse3:
+
+ %assign push_num 0
+ LOAD_2_PARA ; macro defined outside this chunk; presumably binds pDataY to r0 and iLineSize to r1
+ SIGN_EXTENTION r1,r1d ; macro defined outside this chunk; presumably sign-extends the 32-bit stride
+
+%ifdef X86_32
+ push r3 ; 32-bit builds only: save the extra registers used below
+ push r4
+ push r5
+ push r6
+ %assign push_num push_num+4
+%endif
+
+ mov r5,r7 ; r7 is used as the scratch-buffer base (presumably the stack-pointer alias)
+ and r5,0fh ; r5 = misalignment of r7 relative to 16 bytes
+ sub r7,r5 ; round r7 down to a 16-byte boundary
+ sub r7,32 ; reserve 32 bytes: sixteen 16-bit block averages
+
+
+ mov r2,r1
+ sal r2,$1 ;r2 = 2*iLineSize
+ mov r3,r2
+ add r3,r1 ;r3 = 3*iLineSize
+
+ mov r4,r2
+ sal r4,$1 ;r4 = 4*iLineSize
+
+ pxor xmm7, xmm7 ; zero register required by VAA_AVG_BLOCK_SSSE3
+
+ ; loops
+ VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+ movq [r7],xmm0 ; averages of blocks 0..3 (rows 0..3)
+
+ lea r0,[r0+r4] ; advance four rows
+ VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6 ; alternate register set (xmm1..xmm6)
+ movq [r7+8],xmm1 ; blocks 4..7
+
+
+ lea r0,[r0+r4]
+ VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+ movq [r7+16],xmm0 ; blocks 8..11
+
+ lea r0,[r0+r4]
+ VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
+ movq [r7+24],xmm1 ; blocks 12..15
+
+
+ movdqa xmm0,[r7] ; blocks 0..7
+ movdqa xmm1,[r7+16] ; blocks 8..15
+ movdqa xmm2, xmm0
+ paddw xmm0, xmm1
+ SUM_WORD_8x2_SSE2 xmm0, xmm3 ; better performance than that of phaddw sets
+
+ pmullw xmm1, xmm1 ; squares of blocks 8..15 (low 16 bits of each product)
+ pmullw xmm2, xmm2 ; squares of blocks 0..7
+ movdqa xmm3, xmm1
+ movdqa xmm4, xmm2
+ punpcklwd xmm1, xmm7 ; widen the squares to dwords before summing
+ punpckhwd xmm3, xmm7
+ punpcklwd xmm2, xmm7
+ punpckhwd xmm4, xmm7
+ paddd xmm1, xmm2
+ paddd xmm3, xmm4
+ paddd xmm1, xmm3
+ pshufd xmm2, xmm1, 01Bh ; reverse the dword order, then add
+ paddd xmm1, xmm2
+ pshufd xmm2, xmm1, 0B1h ; swap adjacent dwords, then add: every lane = sum of squares
+ paddd xmm1, xmm2
+
+
+ movd r2d, xmm0
+ and r2, 0ffffh ; keep only the low word (the 16-bit sum of averages)
+ mov r3, r2
+ imul r2, r3 ; sum^2
+ sar r2, $4 ; sum^2 / 16
+ movd retrd, xmm1 ; sum of squares
+ sub retrd, r2d ; return value = sum of squares - sum^2/16
+
+ add r7,32 ; release the scratch buffer
+ add r7,r5 ; undo the alignment adjustment
+%ifdef X86_32
+ pop r6
+ pop r5
+ pop r4
+ pop r3
+%endif
+
+ ret
+
+WELS_EXTERN MdInterAnalysisVaaInfo_sse41
+;***********************************************************************
+; uint8_t MdInterAnalysisVaaInfo_sse41( int32_t *pSad8x8 ) -- returns 15 if the SAD spread is below the threshold, else a 4-bit "SAD > average" mask
+;***********************************************************************
+ALIGN 16
+MdInterAnalysisVaaInfo_sse41:
+ %assign push_num 0
+ LOAD_1_PARA ; macro defined outside this chunk; presumably binds pSad8x8 to r0
+ movdqa xmm0,[r0] ; four 32-bit SADs; aligned load, so pSad8x8 must be 16-byte aligned
+ pshufd xmm1, xmm0, 01Bh ; reverse the lane order
+ paddd xmm1, xmm0
+ pshufd xmm2, xmm1, 0B1h ; swap adjacent lanes
+ paddd xmm1, xmm2 ; every lane = sum of the four SADs
+ psrad xmm1, 02h ; iAverageSad
+ movdqa xmm2, xmm1
+ psrad xmm2, 06h ; iAverageSad >> 6
+ movdqa xmm3, xmm0 ; iSadBlock
+ psrad xmm3, 06h ; iSadBlock >> 6
+ psubd xmm3, xmm2 ; per-block difference from the average (both scaled by >>6)
+ pmulld xmm3, xmm3 ; [comment]: pmulld from SSE4.1 instruction sets
+ pshufd xmm4, xmm3, 01Bh
+ paddd xmm4, xmm3
+ pshufd xmm3, xmm4, 0B1h
+ paddd xmm3, xmm4 ; every lane = sum of the four squared differences
+ movd r0d, xmm3
+ cmp r0d, 20 ; INTER_VARIANCE_SAD_THRESHOLD
+
+ jb near .threshold_exit ; unsigned compare: below the threshold -> return 15
+ pshufd xmm0, xmm0, 01Bh ; reverse the lane order, so mask bit 0 corresponds to pSad8x8[3]
+ pcmpgtd xmm0, xmm1 ; iSadBlock > iAverageSad
+ movmskps retrd, xmm0 ; one bit per lane from the compare result
+ ret
+.threshold_exit:
+ mov retrd, 15 ; all four mask bits set
+ ret
+
+WELS_EXTERN MdInterAnalysisVaaInfo_sse2
+;***********************************************************************
+; uint8_t MdInterAnalysisVaaInfo_sse2( int32_t *pSad8x8 ) -- same computation as the SSE4.1 variant, with pmulld emulated via pmuludq
+;***********************************************************************
+ALIGN 16
+MdInterAnalysisVaaInfo_sse2:
+ %assign push_num 0
+ LOAD_1_PARA ; macro defined outside this chunk; presumably binds pSad8x8 to r0
+ movdqa xmm0, [r0] ; four 32-bit SADs; aligned load, so pSad8x8 must be 16-byte aligned
+ pshufd xmm1, xmm0, 01Bh ; reverse the lane order
+ paddd xmm1, xmm0
+ pshufd xmm2, xmm1, 0B1h ; swap adjacent lanes
+ paddd xmm1, xmm2 ; every lane = sum of the four SADs
+ psrad xmm1, 02h ; iAverageSad
+ movdqa xmm2, xmm1
+ psrad xmm2, 06h ; iAverageSad >> 6
+ movdqa xmm3, xmm0 ; iSadBlock
+ psrad xmm3, 06h ; iSadBlock >> 6
+ psubd xmm3, xmm2 ; per-block difference d0..d3 from the average
+
+ ; to replace pmulld functionality as below
+ movdqa xmm2, xmm3
+ pmuludq xmm2, xmm3 ; 64-bit products d0*d0 and d2*d2
+ pshufd xmm4, xmm3, 0B1h ; bring d1 and d3 into the even lanes
+ pmuludq xmm4, xmm4 ; 64-bit products d1*d1 and d3*d3
+ movdqa xmm5, xmm2
+ punpckldq xmm5, xmm4 ; low dwords of d0^2 and d1^2 now sit in lanes 0 and 1
+ punpckhdq xmm2, xmm4 ; low dwords of d2^2 and d3^2 now sit in lanes 0 and 1
+ punpcklqdq xmm5, xmm2 ; xmm5 = low 32 bits of d0^2, d1^2, d2^2, d3^2
+
+ pshufd xmm4, xmm5, 01Bh
+ paddd xmm4, xmm5
+ pshufd xmm5, xmm4, 0B1h
+ paddd xmm5, xmm4 ; every lane = sum of the four squared differences
+
+ movd r0d, xmm5
+ cmp r0d, 20 ; INTER_VARIANCE_SAD_THRESHOLD
+ jb near .threshold_exit ; unsigned compare: below the threshold -> return 15
+ pshufd xmm0, xmm0, 01Bh ; reverse the lane order, so mask bit 0 corresponds to pSad8x8[3]
+ pcmpgtd xmm0, xmm1 ; iSadBlock > iAverageSad
+ movmskps retrd, xmm0 ; one bit per lane from the compare result
+ ret
+.threshold_exit:
+ mov retrd, 15 ; all four mask bits set
+ ret
--- a/codec/decoder/core/asm/asm_inc.asm
+++ /dev/null
@@ -1,235 +1,0 @@
-;*!
-;* \copy
-;* Copyright (c) 2009-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* sse2inc.asm
-;*
-;* Abstract
-;* macro and constant
-;*
-;* History
-;* 8/5/2009 Created
-;*
-;*
-;*************************************************************************/
-;***********************************************************************
-; Options, for DEBUG
-;***********************************************************************
-
-%if 1
- %define MOVDQ movdqa
-%else
- %define MOVDQ movdqu
-%endif
-
-%if 1
- %define WELSEMMS emms
-%else
- %define WELSEMMS
-%endif
-
-BITS 32
-
-;***********************************************************************
-; Macros
-;***********************************************************************
-
-%macro WELS_EXTERN 1
- %ifdef PREFIX
- global _%1
- %define %1 _%1
- %else
- global %1
- %endif
-%endmacro
-
-%macro WELS_AbsW 2
- pxor %2, %2
- psubw %2, %1
- pmaxsw %1, %2
-%endmacro
-
-%macro MMX_XSwap 4
- movq %4, %2
- punpckh%1 %4, %3
- punpckl%1 %2, %3
-%endmacro
-
-; pOut mm1, mm4, mm5, mm3
-%macro MMX_Trans4x4W 5
- MMX_XSwap wd, %1, %2, %5
- MMX_XSwap wd, %3, %4, %2
- MMX_XSwap dq, %1, %3, %4
- MMX_XSwap dq, %5, %2, %3
-%endmacro
-
-;for TRANSPOSE
-%macro SSE2_XSawp 4
- movdqa %4, %2
- punpckl%1 %2, %3
- punpckh%1 %4, %3
-%endmacro
-
-; in: xmm1, xmm2, xmm3, xmm4 pOut: xmm1, xmm4, xmm5, mm3
-%macro SSE2_Trans4x4D 5
- SSE2_XSawp dq, %1, %2, %5
- SSE2_XSawp dq, %3, %4, %2
- SSE2_XSawp qdq, %1, %3, %4
- SSE2_XSawp qdq, %5, %2, %3
-%endmacro
-
-;in: xmm0, xmm1, xmm2, xmm3 pOut: xmm0, xmm1, xmm3, xmm4
-%macro SSE2_TransTwo4x4W 5
- SSE2_XSawp wd, %1, %2, %5
- SSE2_XSawp wd, %3, %4, %2
- SSE2_XSawp dq, %1, %3, %4
- SSE2_XSawp dq, %5, %2, %3
- SSE2_XSawp qdq, %1, %5, %2
- SSE2_XSawp qdq, %4, %3, %5
-%endmacro
-
-;in: m1, m2, m3, m4, m5, m6, m7, m8
-;pOut: m5, m3, m4, m8, m6, m2, m7, m1
-%macro SSE2_TransTwo8x8B 9
- movdqa %9, %8
- SSE2_XSawp bw, %1, %2, %8
- SSE2_XSawp bw, %3, %4, %2
- SSE2_XSawp bw, %5, %6, %4
- movdqa %6, %9
- movdqa %9, %4
- SSE2_XSawp bw, %7, %6, %4
-
- SSE2_XSawp wd, %1, %3, %6
- SSE2_XSawp wd, %8, %2, %3
- SSE2_XSawp wd, %5, %7, %2
- movdqa %7, %9
- movdqa %9, %3
- SSE2_XSawp wd, %7, %4, %3
-
- SSE2_XSawp dq, %1, %5, %4
- SSE2_XSawp dq, %6, %2, %5
- SSE2_XSawp dq, %8, %7, %2
- movdqa %7, %9
- movdqa %9, %5
- SSE2_XSawp dq, %7, %3, %5
-
- SSE2_XSawp qdq, %1, %8, %3
- SSE2_XSawp qdq, %4, %2, %8
- SSE2_XSawp qdq, %6, %7, %2
- movdqa %7, %9
- movdqa %9, %1
- SSE2_XSawp qdq, %7, %5, %1
- movdqa %5, %9
-%endmacro
-
-;xmm0, xmm6, xmm7, [eax], [ecx]
-;xmm7 = 0, eax = pix1, ecx = pix2, xmm0 save the result
-%macro SSE2_LoadDiff8P 5
- movq %1, %4
- punpcklbw %1, %3
- movq %2, %5
- punpcklbw %2, %3
- psubw %1, %2
-%endmacro
-
-; m2 = m1 + m2, m1 = m1 - m2
-%macro SSE2_SumSub 3
- movdqa %3, %2
- paddw %2, %1
- psubw %1, %3
-%endmacro
-
-
-%macro butterfly_1to16_sse 3 ; xmm? for dst, xmm? for tmp, one byte for pSrc [generic register name: a/b/c/d]
- mov %3h, %3l
- movd %1, e%3x ; i.e, 1% = eax (=b0)
- pshuflw %2, %1, 00h ; ..., b0 b0 b0 b0 b0 b0 b0 b0
- pshufd %1, %2, 00h ; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0
-%endmacro
-
-;copy a dw into a xmm for 8 times
-%macro SSE2_Copy8Times 2
- movd %1, %2
- punpcklwd %1, %1
- pshufd %1, %1, 0
-%endmacro
-
-;copy a db into a xmm for 16 times
-%macro SSE2_Copy16Times 2
- movd %1, %2
- pshuflw %1, %1, 0
- punpcklqdq %1, %1
- packuswb %1, %1
-%endmacro
-
-
-
-;***********************************************************************
-;preprocessor constants
-;***********************************************************************
-;dw 32,32,32,32,32,32,32,32 for xmm
-;dw 32,32,32,32 for mm
-%macro WELS_DW32 1
- pcmpeqw %1,%1
- psrlw %1,15
- psllw %1,5
-%endmacro
-
-;dw 1, 1, 1, 1, 1, 1, 1, 1 for xmm
-;dw 1, 1, 1, 1 for mm
-%macro WELS_DW1 1
- pcmpeqw %1,%1
- psrlw %1,15
-%endmacro
-
-;all 0 for xmm and mm
-%macro WELS_Zero 1
- pxor %1, %1
-%endmacro
-
-;dd 1, 1, 1, 1 for xmm
-;dd 1, 1 for mm
-%macro WELS_DD1 1
- pcmpeqw %1,%1
- psrld %1,31
-%endmacro
-
-;dB 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
-%macro WELS_DB1 1
- pcmpeqw %1,%1
- psrlw %1,15
- packuswb %1,%1
-%endmacro
-
-
-
-
-
-
--- a/codec/decoder/core/asm/block_add.asm
+++ b/codec/decoder/core/asm/block_add.asm
@@ -42,263 +42,7 @@
%include "asm_inc.asm"
-BITS 32
-
;*******************************************************************************
-; Macros and other preprocessor constants
-;*******************************************************************************
-
-%macro BLOCK_ADD_16_SSE2 4
- movdqa xmm0, [%2]
- movdqa xmm1, [%3]
- movdqa xmm2, [%3+10h]
- movdqa xmm6, xmm0
-
- punpcklbw xmm0, xmm7
- punpckhbw xmm6, xmm7
-
- paddw xmm0, xmm1
- paddw xmm6, xmm2
-
- packuswb xmm0, xmm6
- movdqa [%1], xmm0
-
- lea %2, [%2+%4]
- lea %3, [%3+%4*2]
- lea %1, [%1+%4]
-%endmacro
-
-%macro BLOCK_ADD_8_MMXEXT 4
- movq mm0, [%2]
- movq mm1, [%3]
- movq mm2, [%3+08h]
- movq mm6, mm0
-
- punpcklbw mm0, mm7
- punpckhbw mm6, mm7
-
- paddw mm0, mm1
- paddw mm6, mm2
-
- packuswb mm0, mm6
- movq [%1], mm0
-
- lea %2, [%2+%4]
- lea %3, [%3+%4*2]
- lea %1, [%1+%4]
-%endmacro
-
-
-%macro BLOCK_ADD_16_STRIDE_SSE2 5
- movdqa xmm0, [%2]
- movdqa xmm1, [%3]
- movdqa xmm2, [%3+10h]
- movdqa xmm6, xmm0
-
- punpcklbw xmm0, xmm7
- punpckhbw xmm6, xmm7
-
- paddw xmm0, xmm1
- paddw xmm6, xmm2
-
- packuswb xmm0, xmm6
- movdqa [%1], xmm0
-
- lea %2, [%2+%4]
- lea %3, [%3+%5*2]
- lea %1, [%1+%4]
-%endmacro
-
-
-%macro BLOCK_ADD_8_STRIDE_MMXEXT 5
- movq mm0, [%2]
- movq mm1, [%3]
- movq mm2, [%3+08h]
- movq mm6, mm0
-
- punpcklbw mm0, mm7
- punpckhbw mm6, mm7
-
- paddw mm0, mm1
- paddw mm6, mm2
-
- packuswb mm0, mm6
- movq [%1], mm0
-
- lea %2, [%2+%4]
- lea %3, [%3+%5*2]
- lea %1, [%1+%4]
-%endmacro
-
-%macro BLOCK_ADD_8_STRIDE_2_LINES_SSE2 5
- movdqa xmm1, [%3]
- movq xmm0, [%2]
- punpcklbw xmm0, xmm7
- paddw xmm0, xmm1
- packuswb xmm0, xmm7
- movq [%1], xmm0
-
- movdqa xmm3, [%3+%5*2]
- movq xmm2, [%2+%4]
- punpcklbw xmm2, xmm7
- paddw xmm2, xmm3
- packuswb xmm2, xmm7
- movq [%1+%4], xmm2
-
- lea %1, [%1+%4*2]
- lea %2, [%2+%4*2]
- lea %3, [%3+%5*4]
-%endmacro
-
-%macro CHECK_DATA_16_ZERO_SSE4 3
- mov eax, 0h
- movdqa xmm0, [%1]
- movdqa xmm1, [%1+10h]
- mov ebx, [ecx]
-
- por xmm0, xmm1
- ptest xmm7, xmm0
- cmovae eax, %3
-
- add %1, 20h
- add ecx, 04h
- mov byte [%2+ebx], al
-%endmacro
-
-%macro CHECK_RS_4x4_BLOCK_2_ZERO_SSE4 5
- movdqa xmm0, [%1]
- movdqa xmm1, [%1+%3]
- movdqa xmm2, [%1+%3*2]
- movdqa xmm3, [%1+%4]
-
- mov eax, 0h
- mov ebx, 0h
- movdqa xmm4, xmm0
- movdqa xmm5, xmm2
-
- punpcklqdq xmm0, xmm1
- punpckhqdq xmm4, xmm1
- punpcklqdq xmm2, xmm3
- punpckhqdq xmm5, xmm3
-
- por xmm0, xmm2
- por xmm4, xmm5
-
- ptest xmm7, xmm0
- cmovae eax, %5
- ptest xmm7, xmm4
- cmovae ebx, %5
-
- mov byte [%2], al
- mov byte [%2+1], bl
-%endmacro
-
-%macro DATA_COPY_16x2_SSE2 3
- movdqa xmm0, [%1]
- movdqa xmm1, [%1+10h]
- movdqa xmm2, [%1+%3]
- movdqa xmm3, [%1+%3+10h]
-
- movdqa [%2], xmm0
- movdqa [%2+10h], xmm1
- movdqa [%2+20h], xmm2
- movdqa [%2+30h], xmm3
-
- lea %1, [%1+%3*2]
- lea %2, [%2+40h]
-%endmacro
-
-
-%macro DATA_COPY_8x4_SSE2 4
- movdqa xmm0, [%1]
- movdqa xmm1, [%1+%3]
- movdqa xmm2, [%1+%3*2]
- movdqa xmm3, [%1+%4]
-
- movdqa [%2], xmm0
- movdqa [%2+10h], xmm1
- movdqa [%2+20h], xmm2
- movdqa [%2+30h], xmm3
-
- lea %1, [%1+%3*4]
- lea %2, [%2+40h]
-%endmacro
-
-
-%macro CHECK_DATA_16_ZERO_SSE2 3
- mov eax, 0h
- movdqa xmm0, [%1]
- movdqa xmm1, [%1+10h]
- mov ebx, [ecx]
-
- pcmpeqw xmm0, xmm7
- pcmpeqw xmm1, xmm7
- packsswb xmm0, xmm1
- pmovmskb edx, xmm0
- sub edx, 0ffffh
-
- cmovb eax, ebp
- add ecx, 4
- add %1, 20h
- mov byte [%2+ebx], al
-%endmacro
-
-
-
-%macro CHECK_RS_4x4_BLOCK_2_ZERO_SSE2 5
- movdqa xmm0, [%1]
- movdqa xmm1, [%1 + %3]
- movdqa xmm2, [%1 + %3*2]
- movdqa xmm3, [%1 + %4]
-
- movdqa xmm4, xmm0
- movdqa xmm5, xmm2
-
- punpcklqdq xmm0, xmm1
- punpckhqdq xmm4, xmm1
- punpcklqdq xmm2, xmm3
- punpckhqdq xmm5, xmm3
-
- pcmpeqw xmm0, xmm7
- pcmpeqw xmm2, xmm7
- pcmpeqw xmm4, xmm7
- pcmpeqw xmm5, xmm7
-
- packsswb xmm0, xmm2
- packsswb xmm4, xmm5
- pmovmskb eax, xmm0
- pmovmskb ebx, xmm4
-
- sub eax, 0ffffh
- mov eax, 0
- cmovb eax, %5
- sub ebx, 0ffffh
- mov ebx, 0
- cmovb ebx, %5
- mov byte [%2], al
- mov byte [%2+1], bl
-%endmacro
-
-;*******************************************************************************
-; Data
-;*******************************************************************************
-
-%ifdef FORMAT_COFF
-SECTION .rodata data
-%else
-SECTION .rodata align=16
-%endif
-
-ALIGN 16
-SubMbScanIdx:
- dd 0x0, 0x1, 0x4, 0x5,
- dd 0x2, 0x3, 0x6, 0x7,
- dd 0x8, 0x9, 0xc, 0xd,
- dd 0xa, 0xb, 0xe, 0xf,
- dd 0x10, 0x11, 0x14, 0x15,
- dd 0x12, 0x13, 0x16, 0x17,
-
-;*******************************************************************************
; Code
;*******************************************************************************
@@ -312,71 +56,77 @@
; void_t WelsResBlockZero16x16_sse2(int16_t* pBlock,int32_t iStride)
;*******************************************************************************
WelsResBlockZero16x16_sse2:
- push esi
+ ; Writes zeros to 16 rows of 32 bytes each (two 16-byte stores per row), starting at pBlock.
+ %assign push_num 0
+ LOAD_2_PARA ; presumably loads pBlock into r0 and iStride into r1 (macro defined outside this chunk -- confirm)
+ %ifndef X86_32
+ movsx r1, r1d ; builds without X86_32: sign-extend the int32_t stride to the full register width
+ %endif
+ ; No register is pushed in this routine, which is why push_num is 0.
+ ; NOTE(review): xmm7 is callee-saved under the Windows x64 convention -- confirm this is not built for WIN64.
+ ; Double the stride -- presumably converting int16_t elements to bytes (confirm with callers).
+ lea r1, [r1*2]
+ ; r2 = 3 * row pitch, so four consecutive rows are r0, r0+r1, r0+r1*2 and r0+r2.
+ lea r2, [r1*3]
- mov esi, [esp+08h]
- mov ecx, [esp+0ch]
- lea ecx, [ecx*2]
- lea eax, [ecx*3]
-
pxor xmm7, xmm7
; four lines
- movdqa [esi], xmm7
- movdqa [esi+10h], xmm7
+ movdqa [r0], xmm7 ; row 0, bytes 0..15 (aligned store: the address must be 16-byte aligned)
+ movdqa [r0+10h], xmm7 ; row 0, bytes 16..31
- movdqa [esi+ecx], xmm7
- movdqa [esi+ecx+10h], xmm7
+ movdqa [r0+r1], xmm7 ; row 1
+ movdqa [r0+r1+10h], xmm7
- movdqa [esi+ecx*2], xmm7
- movdqa [esi+ecx*2+10h], xmm7
+ movdqa [r0+r1*2], xmm7 ; row 2
+ movdqa [r0+r1*2+10h], xmm7
- movdqa [esi+eax], xmm7
- movdqa [esi+eax+10h], xmm7
+ movdqa [r0+r2], xmm7 ; row 3
+ movdqa [r0+r2+10h], xmm7
; four lines
- lea esi, [esi+ecx*4]
- movdqa [esi], xmm7
- movdqa [esi+10h], xmm7
+ lea r0, [r0+r1*4] ; advance four rows (rows 4..7 follow)
+ movdqa [r0], xmm7
+ movdqa [r0+10h], xmm7
- movdqa [esi+ecx], xmm7
- movdqa [esi+ecx+10h], xmm7
+ movdqa [r0+r1], xmm7
+ movdqa [r0+r1+10h], xmm7
- movdqa [esi+ecx*2], xmm7
- movdqa [esi+ecx*2+10h], xmm7
+ movdqa [r0+r1*2], xmm7
+ movdqa [r0+r1*2+10h], xmm7
- movdqa [esi+eax], xmm7
- movdqa [esi+eax+10h], xmm7
+ movdqa [r0+r2], xmm7
+ movdqa [r0+r2+10h], xmm7
; four lines
- lea esi, [esi+ecx*4]
- movdqa [esi], xmm7
- movdqa [esi+10h], xmm7
+ lea r0, [r0+r1*4] ; advance four rows (rows 8..11 follow)
+ movdqa [r0], xmm7
+ movdqa [r0+10h], xmm7
- movdqa [esi+ecx], xmm7
- movdqa [esi+ecx+10h], xmm7
+ movdqa [r0+r1], xmm7
+ movdqa [r0+r1+10h], xmm7
- movdqa [esi+ecx*2], xmm7
- movdqa [esi+ecx*2+10h], xmm7
+ movdqa [r0+r1*2], xmm7
+ movdqa [r0+r1*2+10h], xmm7
- movdqa [esi+eax], xmm7
- movdqa [esi+eax+10h], xmm7
+ movdqa [r0+r2], xmm7
+ movdqa [r0+r2+10h], xmm7
; four lines
- lea esi, [esi+ecx*4]
- movdqa [esi], xmm7
- movdqa [esi+10h], xmm7
+ lea r0, [r0+r1*4] ; advance four rows (rows 12..15 follow)
+ movdqa [r0], xmm7
+ movdqa [r0+10h], xmm7
- movdqa [esi+ecx], xmm7
- movdqa [esi+ecx+10h], xmm7
+ movdqa [r0+r1], xmm7
+ movdqa [r0+r1+10h], xmm7
- movdqa [esi+ecx*2], xmm7
- movdqa [esi+ecx*2+10h], xmm7
+ movdqa [r0+r1*2], xmm7
+ movdqa [r0+r1*2+10h], xmm7
- movdqa [esi+eax], xmm7
- movdqa [esi+eax+10h], xmm7
+ movdqa [r0+r2], xmm7
+ movdqa [r0+r2+10h], xmm7
- pop esi
+ ; Nothing was pushed on entry, so there is nothing to restore before returning.
ret
@@ -387,27 +137,31 @@
; void_t WelsResBlockZero8x8_sse2(int16_t * pBlock, int32_t iStride)
;*******************************************************************************
WelsResBlockZero8x8_sse2:
- push esi
+ ; Writes zeros to 8 rows of 16 bytes each (one 16-byte store per row), starting at pBlock.
+ %assign push_num 0
+ LOAD_2_PARA ; presumably loads pBlock into r0 and iStride into r1 (macro defined outside this chunk -- confirm)
+ %ifndef X86_32
+ movsx r1, r1d ; builds without X86_32: sign-extend the int32_t stride to the full register width
+ %endif
+ ; No register is pushed in this routine, which is why push_num is 0.
+ ; The stride is doubled below -- presumably int16_t elements to bytes (confirm with callers).
+ lea r1, [r1*2] ; r1 = row pitch used for addressing
+ lea r2, [r1*3] ; r2 = 3 * row pitch, so four consecutive rows are r0, r0+r1, r0+r1*2 and r0+r2
- mov esi, [esp+08h]
- mov ecx, [esp+0ch]
- lea ecx, [ecx*2]
- lea eax, [ecx*3]
-
pxor xmm7, xmm7
- movdqa [esi], xmm7
- movdqa [esi+ecx], xmm7
- movdqa [esi+ecx*2], xmm7
- movdqa [esi+eax], xmm7
+ movdqa [r0], xmm7 ; row 0 (aligned store: the address must be 16-byte aligned)
+ movdqa [r0+r1], xmm7 ; row 1
+ movdqa [r0+r1*2], xmm7 ; row 2
+ movdqa [r0+r2], xmm7 ; row 3
- lea esi, [esi+ecx*4]
- movdqa [esi], xmm7
- movdqa [esi+ecx], xmm7
- movdqa [esi+ecx*2], xmm7
- movdqa [esi+eax], xmm7
+ lea r0, [r0+r1*4] ; advance four rows
+ movdqa [r0], xmm7 ; row 4
+ movdqa [r0+r1], xmm7 ; row 5
+ movdqa [r0+r1*2], xmm7 ; row 6
+ movdqa [r0+r2], xmm7 ; row 7
- pop esi
+ ; Nothing was pushed on entry, so there is nothing to restore before returning.
ret
--- a/codec/decoder/core/asm/cpuid.asm
+++ /dev/null
@@ -1,169 +1,0 @@
-;*!
-;* \copy
-;* Copyright (c) 2009-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* cpu_mmx.asm
-;*
-;* Abstract
-;* verify cpuid feature support and cpuid detection
-;*
-;* History
-;* 04/29/2009 Created
-;*
-;*************************************************************************/
-
-bits 32
-
-;******************************************************************************************
-; Macros
-;******************************************************************************************
-
-%macro WELS_EXTERN 1
- %ifdef PREFIX
- global _%1
- %define %1 _%1
- %else
- global %1
- %endif
-%endmacro
-
-;******************************************************************************************
-; Code
-;******************************************************************************************
-
-SECTION .text
-
-; refer to "The IA-32 Intel(R) Architecture Software Developers Manual, Volume 2A A-M"
-; section CPUID - CPU Identification
-
-WELS_EXTERN WelsCPUIdVerify
-ALIGN 16
-;******************************************************************************************
-; int32_t WelsCPUIdVerify()
-;******************************************************************************************
-WelsCPUIdVerify:
- pushfd ; decrease the SP by 4 and load EFLAGS register onto stack, pushfd 32 bit and pushf for 16 bit
- pushfd ; need push 2 EFLAGS, one for processing and the another one for storing purpose
- pop ecx ; get EFLAGS to bit manipulation
- mov eax, ecx ; store into ecx followed
- xor eax, 00200000h ; get ID flag (bit 21) of EFLAGS to directly indicate cpuid support or not
- xor eax, ecx ; get the ID flag bitwise, eax - 0: not support; otherwise: support
- popfd ; store back EFLAGS and keep unchanged for system
- ret
-
-WELS_EXTERN WelsCPUId
-ALIGN 16
-;****************************************************************************************************
-; void WelsCPUId( int32_t index, int32_t *uiFeatureA, int32_t *uiFeatureB, int32_t *uiFeatureC, int32_t *uiFeatureD )
-;****************************************************************************************************
-WelsCPUId:
- push ebx
- push edi
-
- mov eax, [esp+12] ; operating index
- cpuid ; cpuid
-
- ; processing various information return
- mov edi, [esp+16]
- mov [edi], eax
- mov edi, [esp+20]
- mov [edi], ebx
- mov edi, [esp+24]
- mov [edi], ecx
- mov edi, [esp+28]
- mov [edi], edx
-
- pop edi
- pop ebx
- ret
-
-WELS_EXTERN WelsCPUSupportAVX
-; need call after cpuid=1 and eax, ecx flag got then
-ALIGN 16
-;****************************************************************************************************
-; int32_t WelsCPUSupportAVX( uint32_t eax, uint32_t ecx )
-;****************************************************************************************************
-WelsCPUSupportAVX:
- mov eax, [esp+4]
- mov ecx, [esp+8]
-
- ; refer to detection of AVX addressed in INTEL AVX manual document
- and ecx, 018000000H
- cmp ecx, 018000000H ; check both OSXSAVE and AVX feature flags
- jne avx_not_supported
- ; processor supports AVX instructions and XGETBV is enabled by OS
- mov ecx, 0 ; specify 0 for XFEATURE_ENABLED_MASK register
- XGETBV ; result in EDX:EAX
- and eax, 06H
- cmp eax, 06H ; check OS has enabled both XMM and YMM state support
- jne avx_not_supported
- mov eax, 1
- ret
-avx_not_supported:
- mov eax, 0
- ret
-
-WELS_EXTERN WelsCPUSupportFMA
-; need call after cpuid=1 and eax, ecx flag got then
-ALIGN 16
-;****************************************************************************************************
-; int32_t WelsCPUSupportFMA( uint32_t eax, uint32_t ecx )
-;****************************************************************************************************
-WelsCPUSupportFMA:
- mov eax, [esp+4]
- mov ecx, [esp+8]
-
- ; refer to detection of FMA addressed in INTEL AVX manual document
- and ecx, 018001000H
- cmp ecx, 018001000H ; check OSXSAVE, AVX, FMA feature flags
- jne fma_not_supported
- ; processor supports AVX,FMA instructions and XGETBV is enabled by OS
- mov ecx, 0 ; specify 0 for XFEATURE_ENABLED_MASK register
- XGETBV ; result in EDX:EAX
- and eax, 06H
- cmp eax, 06H ; check OS has enabled both XMM and YMM state support
- jne fma_not_supported
- mov eax, 1
- ret
-fma_not_supported:
- mov eax, 0
- ret
-
-WELS_EXTERN WelsEmms
-ALIGN 16
-;******************************************************************************************
-; void WelsEmms()
-;******************************************************************************************
-WelsEmms:
- emms ; empty mmx technology states
- ret
-
-
-
--- a/codec/decoder/core/asm/dct.asm
+++ b/codec/decoder/core/asm/dct.asm
@@ -42,8 +42,6 @@
%include "asm_inc.asm"
-BITS 32
-
;*******************************************************************************
; Macros and other preprocessor constants
;*******************************************************************************
@@ -93,20 +91,16 @@
;*******************************************************************************
IdctResAddPred_mmx:
+ %assign push_num 0
+ LOAD_3_PARA ; presumably r0 = pPred, r1 = kiStride, r2 = pRs, matching the removed esp+4/+8/+12 defines (confirm)
+ %ifndef X86_32
+ movsx r1, r1d ; builds without X86_32: sign-extend the 32-bit stride to the full register width
+ %endif
+ movq mm0, [r2+ 0] ; mm0..mm3 = 32 consecutive bytes read from r2, 8 bytes per register
+ movq mm1, [r2+ 8]
+ movq mm2, [r2+16]
+ movq mm3, [r2+24]
-%define pushsize 0
-%define pPred esp+pushsize+4
-%define kiStride esp+pushsize+8
-%define pRs esp+pushsize+12
-
- mov eax, [pRs ]
- mov edx, [pPred ]
- mov ecx, [kiStride]
- movq mm0, [eax+ 0]
- movq mm1, [eax+ 8]
- movq mm2, [eax+16]
- movq mm3, [eax+24]
-
MMX_Trans4x4W mm0, mm1, mm2, mm3, mm4
MMX_IDCT mm1, mm2, mm3, mm4, mm0, mm6
MMX_Trans4x4W mm1, mm3, mm0, mm4, mm2
@@ -115,15 +109,12 @@
WELS_Zero mm7
WELS_DW32 mm6
- MMX_StoreDiff4P mm3, mm0, mm6, mm7, [edx]
- MMX_StoreDiff4P mm4, mm0, mm6, mm7, [edx+ecx]
- lea edx, [edx+2*ecx]
- MMX_StoreDiff4P mm1, mm0, mm6, mm7, [edx]
- MMX_StoreDiff4P mm2, mm0, mm6, mm7, [edx+ecx]
+ MMX_StoreDiff4P mm3, mm0, mm6, mm7, [r0] ; row at r0 (MMX_StoreDiff4P is defined outside this chunk)
+ MMX_StoreDiff4P mm4, mm0, mm6, mm7, [r0+r1] ; next row, r1 bytes further on
+ lea r0, [r0+2*r1] ; advance r0 by two rows
+ MMX_StoreDiff4P mm1, mm0, mm6, mm7, [r0] ; third row
+ MMX_StoreDiff4P mm2, mm0, mm6, mm7, [r0+r1] ; fourth row
-%undef pushsize
-%undef pPred
-%undef kiStride
-%undef pRs
+
emms
ret
--- a/codec/decoder/core/asm/deblock.asm
+++ /dev/null
@@ -1,2113 +1,0 @@
-;*!
-;* \copy
-;* Copyright (c) 2009-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* deblock.asm
-;*
-;* Abstract
-;* edge loop
-;*
-;* History
-;* 08/07/2009 Created
-;*
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-BITS 32
-
-;*******************************************************************************
-; Macros and other preprocessor constants
-;*******************************************************************************
-
-%ifdef FORMAT_COFF
-SECTION .rodata pData
-%else
-SECTION .rodata align=16
-%endif
-
-SECTION .text
-
-;********************************************************************************
-; void DeblockChromaEq4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
-; int32_t iAlpha, int32_t iBeta)
-;********************************************************************************
-WELS_EXTERN DeblockChromaEq4V_sse2
-
-ALIGN 16
-DeblockChromaEq4V_sse2:
- push ebp
- mov ebp,esp
- and esp,0FFFFFFF0h
- sub esp,68h
- mov edx,[ebp+10h] ; iStride
- mov eax,[ebp+8] ; pPixCb
- mov ecx,[ebp+0Ch] ; pPixCr
- movq xmm4,[ecx]
- movq xmm5,[edx+ecx]
- push esi
- push edi
- lea esi,[edx+edx]
- mov edi,eax
- sub edi,esi
- movq xmm1,[edi]
- mov edi,ecx
- sub edi,esi
- movq xmm2,[edi]
- punpcklqdq xmm1,xmm2
- mov esi,eax
- sub esi,edx
- movq xmm2,[esi]
- mov edi,ecx
- sub edi,edx
- movq xmm3,[edi]
- punpcklqdq xmm2,xmm3
- movq xmm3,[eax]
- punpcklqdq xmm3,xmm4
- movq xmm4,[edx+eax]
- mov edx, [ebp + 14h]
- punpcklqdq xmm4,xmm5
- movd xmm5,edx
- mov edx, [ebp + 18h]
- pxor xmm0,xmm0
- movdqa xmm6,xmm5
- punpcklwd xmm6,xmm5
- pshufd xmm5,xmm6,0
- movd xmm6,edx
- movdqa xmm7,xmm6
- punpcklwd xmm7,xmm6
- pshufd xmm6,xmm7,0
- movdqa xmm7,xmm1
- punpckhbw xmm1,xmm0
- punpcklbw xmm7,xmm0
- movdqa [esp+40h],xmm1
- movdqa [esp+60h],xmm7
- movdqa xmm7,xmm2
- punpcklbw xmm7,xmm0
- movdqa [esp+10h],xmm7
- movdqa xmm7,xmm3
- punpcklbw xmm7,xmm0
- punpckhbw xmm3,xmm0
- movdqa [esp+50h],xmm7
- movdqa xmm7,xmm4
- punpckhbw xmm4,xmm0
- punpckhbw xmm2,xmm0
- punpcklbw xmm7,xmm0
- movdqa [esp+30h],xmm3
- movdqa xmm3,[esp+10h]
- movdqa xmm1,xmm3
- psubw xmm1,[esp+50h]
- pabsw xmm1,xmm1
- movdqa [esp+20h],xmm4
- movdqa xmm0,xmm5
- pcmpgtw xmm0,xmm1
- movdqa xmm1,[esp+60h]
- psubw xmm1,xmm3
- pabsw xmm1,xmm1
- movdqa xmm4,xmm6
- pcmpgtw xmm4,xmm1
- pand xmm0,xmm4
- movdqa xmm1,xmm7
- psubw xmm1,[esp+50h]
- pabsw xmm1,xmm1
- movdqa xmm4,xmm6
- pcmpgtw xmm4,xmm1
- movdqa xmm1,xmm2
- psubw xmm1,[esp+30h]
- pabsw xmm1,xmm1
- pcmpgtw xmm5,xmm1
- movdqa xmm1,[esp+40h]
- pand xmm0,xmm4
- psubw xmm1,xmm2
- pabsw xmm1,xmm1
- movdqa xmm4,xmm6
- pcmpgtw xmm4,xmm1
- movdqa xmm1,[esp+20h]
- psubw xmm1,[esp+30h]
- pand xmm5,xmm4
- pabsw xmm1,xmm1
- pcmpgtw xmm6,xmm1
- pand xmm5,xmm6
- mov edx,2
- movsx edx,dx
- movd xmm1,edx
- movdqa xmm4,xmm1
- punpcklwd xmm4,xmm1
- pshufd xmm1,xmm4,0
- movdqa xmm4,[esp+60h]
- movdqa xmm6,xmm4
- paddw xmm6,xmm4
- paddw xmm6,xmm3
- paddw xmm6,xmm7
- movdqa [esp+10h],xmm1
- paddw xmm6,[esp+10h]
- psraw xmm6,2
- movdqa xmm4,xmm0
- pandn xmm4,xmm3
- movdqa xmm3,[esp+40h]
- movdqa xmm1,xmm0
- pand xmm1,xmm6
- por xmm1,xmm4
- movdqa xmm6,xmm3
- paddw xmm6,xmm3
- movdqa xmm3,[esp+10h]
- paddw xmm6,xmm2
- paddw xmm6,[esp+20h]
- paddw xmm6,xmm3
- psraw xmm6,2
- movdqa xmm4,xmm5
- pand xmm4,xmm6
- movdqa xmm6,xmm5
- pandn xmm6,xmm2
- por xmm4,xmm6
- packuswb xmm1,xmm4
- movdqa xmm4,[esp+50h]
- movdqa xmm6,xmm7
- paddw xmm6,xmm7
- paddw xmm6,xmm4
- paddw xmm6,[esp+60h]
- paddw xmm6,xmm3
- psraw xmm6,2
- movdqa xmm2,xmm0
- pand xmm2,xmm6
- pandn xmm0,xmm4
- por xmm2,xmm0
- movdqa xmm0,[esp+20h]
- movdqa xmm6,xmm0
- paddw xmm6,xmm0
- movdqa xmm0,[esp+30h]
- paddw xmm6,xmm0
- paddw xmm6,[esp+40h]
- movdqa xmm4,xmm5
- paddw xmm6,xmm3
- movq [esi],xmm1
- psraw xmm6,2
- pand xmm4,xmm6
- pandn xmm5,xmm0
- por xmm4,xmm5
- packuswb xmm2,xmm4
- movq [eax],xmm2
- psrldq xmm1,8
- movq [edi],xmm1
- pop edi
- psrldq xmm2,8
- movq [ecx],xmm2
- pop esi
- mov esp,ebp
- pop ebp
- ret
-
-;******************************************************************************
-; void DeblockChromaLt4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
-; int32_t iAlpha, int32_t iBeta, int8_t * pTC);
-;*******************************************************************************
-
-WELS_EXTERN DeblockChromaLt4V_sse2
-
-DeblockChromaLt4V_sse2:
- push ebp
- mov ebp,esp
- and esp,0FFFFFFF0h
- sub esp,0E4h
- push ebx
- push esi
- mov esi, [ebp+1Ch] ; pTC
- movsx ebx, byte [esi+2]
- push edi
- movsx di,byte [esi+3]
- mov word [esp+0Ch],bx
- movsx bx,byte [esi+1]
- movsx esi,byte [esi]
- mov word [esp+0Eh],si
- movzx esi,di
- movd xmm1,esi
- movzx esi,di
- movd xmm2,esi
- mov si,word [esp+0Ch]
- mov edx, [ebp + 10h]
- mov eax, [ebp + 08h]
- movzx edi,si
- movzx esi,si
- mov ecx, [ebp + 0Ch]
- movd xmm4,esi
- movzx esi,bx
- movd xmm5,esi
- movd xmm3,edi
- movzx esi,bx
- movd xmm6,esi
- mov si,word [esp+0Eh]
- movzx edi,si
- movzx esi,si
- punpcklwd xmm6,xmm2
- pxor xmm0,xmm0
- movdqa [esp+40h],xmm0
- movd xmm7,edi
- movd xmm0,esi
- lea esi,[edx+edx]
- mov edi,eax
- sub edi,esi
- punpcklwd xmm5,xmm1
- movdqa xmm1,[esp+40h]
- punpcklwd xmm0,xmm4
- movq xmm4,[edx+ecx]
- punpcklwd xmm7,xmm3
- movq xmm3,[eax]
- punpcklwd xmm0,xmm6
- movq xmm6,[edi]
- punpcklwd xmm7,xmm5
- punpcklwd xmm0,xmm7
- mov edi,ecx
- sub edi,esi
- movdqa xmm2,xmm1
- psubw xmm2,xmm0
- movdqa [esp+60h],xmm2
- movq xmm2, [edi]
- punpcklqdq xmm6,xmm2
- mov esi,eax
- sub esi,edx
- movq xmm7,[esi]
- mov edi,ecx
- sub edi,edx
- movq xmm2,[edi]
- punpcklqdq xmm7,xmm2
- movq xmm2,[ecx]
- punpcklqdq xmm3,xmm2
- movq xmm2,[edx+eax]
- movsx edx,word [ebp + 14h]
- punpcklqdq xmm2,xmm4
- movdqa [esp+0E0h],xmm2
- movd xmm2,edx
- movsx edx,word [ebp + 18h]
- movdqa xmm4,xmm2
- punpcklwd xmm4,xmm2
- movd xmm2,edx
- movdqa xmm5,xmm2
- punpcklwd xmm5,xmm2
- pshufd xmm2,xmm5,0
- movdqa [esp+50h],xmm2
- movdqa xmm2,xmm6
- punpcklbw xmm2,xmm1
- movdqa [esp+0D0h],xmm3
- pshufd xmm4,xmm4,0
- movdqa [esp+30h],xmm2
- punpckhbw xmm6,xmm1
- movdqa [esp+80h],xmm6
- movdqa xmm6,[esp+0D0h]
- punpckhbw xmm6,xmm1
- movdqa [esp+70h],xmm6
- movdqa xmm6, [esp+0E0h]
- punpckhbw xmm6,xmm1
- movdqa [esp+90h],xmm6
- movdqa xmm5, [esp+0E0h]
- movdqa xmm2,xmm7
- punpckhbw xmm7,xmm1
- punpcklbw xmm5,xmm1
- movdqa [esp+0A0h],xmm7
- punpcklbw xmm3,xmm1
- mov edx,4
- punpcklbw xmm2,xmm1
- movsx edx,dx
- movd xmm6,edx
- movdqa xmm7,xmm6
- punpcklwd xmm7,xmm6
- pshufd xmm6,xmm7,0
- movdqa xmm7,[esp+30h]
- movdqa [esp+20h],xmm6
- psubw xmm7,xmm5
- movdqa xmm6,xmm0
- pcmpgtw xmm6,xmm1
- movdqa xmm1,[esp+60h]
- movdqa [esp+40h],xmm6
- movdqa xmm6,xmm3
- psubw xmm6,xmm2
- psllw xmm6,2
- paddw xmm6,xmm7
- paddw xmm6, [esp+20h]
- movdqa xmm7, [esp+50h]
- psraw xmm6,3
- pmaxsw xmm1,xmm6
- movdqa [esp+10h],xmm0
- movdqa xmm6, [esp+10h]
- pminsw xmm6,xmm1
- movdqa [esp+10h],xmm6
- movdqa xmm1,xmm2
- psubw xmm1,xmm3
- pabsw xmm1,xmm1
- movdqa xmm6,xmm4
- pcmpgtw xmm6,xmm1
- movdqa xmm1, [esp+30h]
- psubw xmm1,xmm2
- pabsw xmm1,xmm1
- pcmpgtw xmm7,xmm1
- movdqa xmm1,[esp+50h]
- pand xmm6,xmm7
- movdqa xmm7,[esp+50h]
- psubw xmm5,xmm3
- pabsw xmm5,xmm5
- pcmpgtw xmm1,xmm5
- movdqa xmm5,[esp+80h]
- psubw xmm5,[esp+90h]
- pand xmm6,xmm1
- pand xmm6,[esp+40h]
- movdqa xmm1,[esp+10h]
- pand xmm1,xmm6
- movdqa xmm6,[esp+70h]
- movdqa [esp+30h],xmm1
- movdqa xmm1,[esp+0A0h]
- psubw xmm6,xmm1
- psllw xmm6,2
- paddw xmm6,xmm5
- paddw xmm6,[esp+20h]
- movdqa xmm5,[esp+60h]
- psraw xmm6,3
- pmaxsw xmm5,xmm6
- pminsw xmm0,xmm5
- movdqa xmm5,[esp+70h]
- movdqa xmm6,xmm1
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- pcmpgtw xmm4,xmm6
- movdqa xmm6,[esp+80h]
- psubw xmm6,xmm1
- pabsw xmm6,xmm6
- pcmpgtw xmm7,xmm6
- movdqa xmm6,[esp+90h]
- pand xmm4,xmm7
- movdqa xmm7,[esp+50h]
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- pcmpgtw xmm7,xmm6
- pand xmm4,xmm7
- pand xmm4,[esp+40h]
- pand xmm0,xmm4
- movdqa xmm4,[esp+30h]
- paddw xmm2,xmm4
- paddw xmm1,xmm0
- packuswb xmm2,xmm1
- movq [esi],xmm2
- psubw xmm3,xmm4
- psubw xmm5,xmm0
- packuswb xmm3,xmm5
- movq [eax],xmm3
- psrldq xmm2,8
- movq [edi],xmm2
- pop edi
- pop esi
- psrldq xmm3,8
- movq [ecx],xmm3
- pop ebx
- mov esp,ebp
- pop ebp
- ret
-
-;***************************************************************************
-; void DeblockChromaEq4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
-; int32_t iAlpha, int32_t iBeta)
-;
-; Filters the four bytes at [pPix-2 .. pPix+1] of 8 consecutive rows in both
-; planes (pPixCb and pPixCr) in one pass.
-;
-; Stack arguments (32-bit, read relative to ebp):
-;   [ebp+8]   pPixCb     [ebp+0Ch] pPixCr     [ebp+10h] iStride
-;   [ebp+14h] iAlpha     [ebp+18h] iBeta   (only the low 16 bits of each are read)
-;
-; Let c0..c3 be the byte columns at offsets -2, -1, 0, +1 from pPix
-; (presumably p1, p0, q0, q1 in H.264 notation -- confirm against the C caller).
-; Where iAlpha > |c1-c2| and iBeta > |c0-c1| and iBeta > |c3-c2|:
-;   c1' = (2*c0 + c1 + c3 + 2) >> 2
-;   c2' = (2*c3 + c2 + c0 + 2) >> 2
-; elsewhere c1 and c2 are kept. All four bytes of every row are written back.
-;***************************************************************************
-
-WELS_EXTERN DeblockChromaEq4H_sse2
-
-ALIGN 16
-
-DeblockChromaEq4H_sse2:
- ; Prologue: 16-byte align esp and reserve 0C8h bytes of locals.
- push ebp
- mov ebp,esp
- and esp,0FFFFFFF0h
- sub esp,0C8h
- mov ecx,dword [ebp+8]
- mov edx,dword [ebp+0Ch]
- mov eax,dword [ebp+10h]
- ; Start two bytes to the left of each plane pointer.
- sub ecx,2
- sub edx,2
- push esi
- lea esi,[eax+eax*2]
- mov dword [esp+18h],ecx
- mov dword [esp+4],edx
- lea ecx,[ecx+eax*4]
- lea edx,[edx+eax*4]
- lea eax,[esp+7Ch]
- push edi
- ; Local slots from here on (after both pushes):
- ;   [esp+1Ch] pPixCb-2            [esp+8]   pPixCr-2
- ;   [esp+18h] pPixCb-2+4*iStride  [esp+0Ch] pPixCr-2+4*iStride
- ;   [esp+14h] 3*iStride           [esp+10h] address of the 64-byte scratch (= esp+80h)
- mov dword [esp+14h],esi
- mov dword [esp+18h],ecx
- mov dword [esp+0Ch],edx
- mov dword [esp+10h],eax
- mov esi,dword [esp+1Ch]
- mov ecx,dword [ebp+10h]
- mov edx,dword [esp+14h]
- ; Gather one dword per row: xmm0..xmm3 each receive row k of
- ; Cb rows 0-3, Cr rows 0-3, Cb rows 4-7 and Cr rows 4-7 (in that dword order).
- movd xmm0,dword [esi]
- movd xmm1,dword [esi+ecx]
- movd xmm2,dword [esi+ecx*2]
- movd xmm3,dword [esi+edx]
- mov esi,dword [esp+8]
- movd xmm4,dword [esi]
- movd xmm5,dword [esi+ecx]
- movd xmm6,dword [esi+ecx*2]
- movd xmm7,dword [esi+edx]
- punpckldq xmm0,xmm4
- punpckldq xmm1,xmm5
- punpckldq xmm2,xmm6
- punpckldq xmm3,xmm7
- mov esi,dword [esp+18h]
- mov edi,dword [esp+0Ch]
- movd xmm4,dword [esi]
- movd xmm5,dword [edi]
- punpckldq xmm4,xmm5
- punpcklqdq xmm0,xmm4
- movd xmm4,dword [esi+ecx]
- movd xmm5,dword [edi+ecx]
- punpckldq xmm4,xmm5
- punpcklqdq xmm1,xmm4
- movd xmm4,dword [esi+ecx*2]
- movd xmm5,dword [edi+ecx*2]
- punpckldq xmm4,xmm5
- punpcklqdq xmm2,xmm4
- movd xmm4,dword [esi+edx]
- movd xmm5,dword [edi+edx]
- punpckldq xmm4,xmm5
- punpcklqdq xmm3,xmm4
- ; Byte/word/dword/qword interleave: afterwards xmm0, xmm5, xmm1, xmm6 hold
- ; columns c0, c1, c2, c3 (low 8 bytes = Cb rows 0-7, high 8 bytes = Cr rows 0-7).
- movdqa xmm6,xmm0
- punpcklbw xmm0,xmm1
- punpckhbw xmm6,xmm1
- movdqa xmm7,xmm2
- punpcklbw xmm2,xmm3
- punpckhbw xmm7,xmm3
- movdqa xmm4,xmm0
- movdqa xmm5,xmm6
- punpcklwd xmm0,xmm2
- punpckhwd xmm4,xmm2
- punpcklwd xmm6,xmm7
- punpckhwd xmm5,xmm7
- movdqa xmm1,xmm0
- movdqa xmm2,xmm4
- punpckldq xmm0,xmm6
- punpckhdq xmm1,xmm6
- punpckldq xmm4,xmm5
- punpckhdq xmm2,xmm5
- movdqa xmm5,xmm0
- movdqa xmm6,xmm1
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm5,xmm4
- punpcklqdq xmm1,xmm2
- punpckhqdq xmm6,xmm2
- ; Park the four columns in the scratch buffer ([esp+80h] .. [esp+0B0h]).
- mov edi,dword [esp+10h]
- movdqa [edi],xmm0
- movdqa [edi+10h],xmm5
- movdqa [edi+20h],xmm1
- movdqa [edi+30h],xmm6
- movsx ecx,word [ebp+14h]
- movsx edx,word [ebp+18h]
- movdqa xmm6,[esp+80h]
- movdqa xmm4,[esp+90h]
- movdqa xmm5,[esp+0A0h]
- movdqa xmm7,[esp+0B0h]
- pxor xmm0,xmm0
- ; xmm1 = iAlpha in all eight words, xmm2 = iBeta in all eight words.
- movd xmm1,ecx
- movdqa xmm2,xmm1
- punpcklwd xmm2,xmm1
- pshufd xmm1,xmm2,0
- movd xmm2,edx
- movdqa xmm3,xmm2
- punpcklwd xmm3,xmm2
- pshufd xmm2,xmm3,0
- ; Widen bytes to words. High (Cr) halves are spilled:
- ;   [esp+60h]=c0, [esp+30h]=c1, [esp+40h]=c2, [esp+70h]=c3.
- ; Low (Cb) halves: xmm3=c0, xmm4=c1, xmm5=c2, xmm7=c3 (also kept at [esp+50h]).
- movdqa xmm3,xmm6
- punpckhbw xmm6,xmm0
- movdqa [esp+60h],xmm6
- movdqa xmm6,[esp+90h]
- punpckhbw xmm6,xmm0
- movdqa [esp+30h],xmm6
- movdqa xmm6,[esp+0A0h]
- punpckhbw xmm6,xmm0
- movdqa [esp+40h],xmm6
- movdqa xmm6,[esp+0B0h]
- punpckhbw xmm6,xmm0
- movdqa [esp+70h],xmm6
- punpcklbw xmm7,xmm0
- punpcklbw xmm4,xmm0
- punpcklbw xmm5,xmm0
- punpcklbw xmm3,xmm0
- movdqa [esp+50h],xmm7
- ; Low-half mask in xmm0: alpha > |c1-c2| & beta > |c0-c1| & beta > |c3-c2|.
- movdqa xmm6,xmm4
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- movdqa xmm0,xmm1
- pcmpgtw xmm0,xmm6
- movdqa xmm6,xmm3
- psubw xmm6,xmm4
- pabsw xmm6,xmm6
- movdqa xmm7,xmm2
- pcmpgtw xmm7,xmm6
- movdqa xmm6,[esp+50h]
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- pand xmm0,xmm7
- movdqa xmm7,xmm2
- pcmpgtw xmm7,xmm6
- ; High-half mask in xmm1, built from the spilled words with the same three tests.
- movdqa xmm6,[esp+30h]
- psubw xmm6,[esp+40h]
- pabsw xmm6,xmm6
- pcmpgtw xmm1,xmm6
- movdqa xmm6,[esp+60h]
- psubw xmm6,[esp+30h]
- pabsw xmm6,xmm6
- pand xmm0,xmm7
- movdqa xmm7,xmm2
- pcmpgtw xmm7,xmm6
- movdqa xmm6,[esp+70h]
- psubw xmm6,[esp+40h]
- pabsw xmm6,xmm6
- pand xmm1,xmm7
- pcmpgtw xmm2,xmm6
- pand xmm1,xmm2
- ; [esp+20h] = rounding constant 2 in all eight words.
- mov eax,2
- movsx ecx,ax
- movd xmm2,ecx
- movdqa xmm6,xmm2
- punpcklwd xmm6,xmm2
- pshufd xmm2,xmm6,0
- movdqa [esp+20h],xmm2
- ; c1' = mask ? (2*c0 + c1 + c3 + 2) >> 2 : c1   (low half -> xmm6, high half -> xmm4)
- movdqa xmm2,xmm3
- paddw xmm2,xmm3
- paddw xmm2,xmm4
- paddw xmm2,[esp+50h]
- paddw xmm2,[esp+20h]
- psraw xmm2,2
- movdqa xmm6,xmm0
- pand xmm6,xmm2
- movdqa xmm2,xmm0
- pandn xmm2,xmm4
- por xmm6,xmm2
- movdqa xmm2,[esp+60h]
- movdqa xmm7,xmm2
- paddw xmm7,xmm2
- paddw xmm7,[esp+30h]
- paddw xmm7,[esp+70h]
- paddw xmm7,[esp+20h]
- movdqa xmm4,xmm1
- movdqa xmm2,xmm1
- pandn xmm2,[esp+30h]
- psraw xmm7,2
- pand xmm4,xmm7
- por xmm4,xmm2
- movdqa xmm2,[esp+50h]
- ; Repack to bytes and overwrite column c1 in the scratch buffer.
- packuswb xmm6,xmm4
- movdqa [esp+90h],xmm6
- ; c2' = mask ? (2*c3 + c2 + c0 + 2) >> 2 : c2   (low half -> xmm4, high half -> xmm3)
- movdqa xmm6,xmm2
- paddw xmm6,xmm2
- movdqa xmm2,[esp+20h]
- paddw xmm6,xmm5
- paddw xmm6,xmm3
- movdqa xmm4,xmm0
- pandn xmm0,xmm5
- paddw xmm6,xmm2
- psraw xmm6,2
- pand xmm4,xmm6
- por xmm4,xmm0
- movdqa xmm0,[esp+70h]
- movdqa xmm5,xmm0
- paddw xmm5,xmm0
- movdqa xmm0,[esp+40h]
- paddw xmm5,xmm0
- paddw xmm5,[esp+60h]
- movdqa xmm3,xmm1
- paddw xmm5,xmm2
- psraw xmm5,2
- pand xmm3,xmm5
- pandn xmm1,xmm0
- por xmm3,xmm1
- ; Repack to bytes and overwrite column c2 in the scratch buffer.
- packuswb xmm4,xmm3
- movdqa [esp+0A0h],xmm4
- ; Reload the four columns and run the same interleave sequence to turn
- ; columns back into per-row dwords.
- mov esi,dword [esp+10h]
- movdqa xmm0,[esi]
- movdqa xmm1,[esi+10h]
- movdqa xmm2,[esi+20h]
- movdqa xmm3,[esi+30h]
- movdqa xmm6,xmm0
- punpcklbw xmm0,xmm1
- punpckhbw xmm6,xmm1
- movdqa xmm7,xmm2
- punpcklbw xmm2,xmm3
- punpckhbw xmm7,xmm3
- movdqa xmm4,xmm0
- movdqa xmm5,xmm6
- punpcklwd xmm0,xmm2
- punpckhwd xmm4,xmm2
- punpcklwd xmm6,xmm7
- punpckhwd xmm5,xmm7
- movdqa xmm1,xmm0
- movdqa xmm2,xmm4
- punpckldq xmm0,xmm6
- punpckhdq xmm1,xmm6
- punpckldq xmm4,xmm5
- punpckhdq xmm2,xmm5
- movdqa xmm5,xmm0
- movdqa xmm6,xmm1
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm5,xmm4
- punpcklqdq xmm1,xmm2
- punpckhqdq xmm6,xmm2
- ; xmm0/xmm5/xmm1/xmm6 now carry rows k, k+1, k+2, k+3; their four dwords are,
- ; in order: Cb rows 0-3, Cr rows 0-3, Cb rows 4-7, Cr rows 4-7.
- mov esi,dword [esp+1Ch]
- mov ecx,dword [ebp+10h]
- mov edx,dword [esp+14h]
- mov edi,dword [esp+8]
- ; Cb rows 0-3
- movd dword [esi],xmm0
- movd dword [esi+ecx],xmm5
- movd dword [esi+ecx*2],xmm1
- movd dword [esi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- mov esi,dword [esp+18h]
- ; Cr rows 0-3
- movd dword [edi],xmm0
- movd dword [edi+ecx],xmm5
- movd dword [edi+ecx*2],xmm1
- movd dword [edi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- ; Cb rows 4-7
- movd dword [esi],xmm0
- movd dword [esi+ecx],xmm5
- movd dword [esi+ecx*2],xmm1
- movd dword [esi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- mov edi,dword [esp+0Ch]
- ; Cr rows 4-7
- movd dword [edi],xmm0
- movd dword [edi+ecx],xmm5
- movd dword [edi+ecx*2],xmm1
- movd dword [edi+edx],xmm6
- ; Epilogue: restore callee-saved registers and the caller's stack frame.
- pop edi
- pop esi
- mov esp,ebp
- pop ebp
- ret
-
-;*******************************************************************************
-; void DeblockChromaLt4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
-; int32_t iAlpha, int32_t iBeta, int8_t * pTC);
-;
-; Filters the four bytes at [pPix-2 .. pPix+1] of 8 consecutive rows in both
-; planes (pPixCb and pPixCr) in one pass, using a tc-clamped correction.
-;
-; Stack arguments (32-bit, read relative to ebp):
-;   [ebp+8]   pPixCb     [ebp+0Ch] pPixCr     [ebp+10h] iStride
-;   [ebp+14h] iAlpha     [ebp+18h] iBeta   (only the low 16 bits of each are read)
-;   [ebp+1Ch] pTC        four signed bytes; pTC[i] is applied to rows 2*i and 2*i+1
-;                        of both planes
-;
-; Let c0..c3 be the byte columns at offsets -2, -1, 0, +1 from pPix
-; (presumably p1, p0, q0, q1 in H.264 notation -- confirm against the C caller).
-;   delta = clamp((4*(c2 - c1) + (c0 - c3) + 4) >> 3, -tc, tc)
-;   c1' = saturate_u8(c1 + delta)      c2' = saturate_u8(c2 - delta)
-; delta is forced to 0 unless iAlpha > |c1-c2|, iBeta > |c0-c1|,
-; iBeta > |c3-c2| and tc > 0. All four bytes of every row are written back.
-;*******************************************************************************
-
-WELS_EXTERN DeblockChromaLt4H_sse2
-
-ALIGN 16
-
-DeblockChromaLt4H_sse2:
- ; Prologue: 16-byte align esp and reserve 108h bytes of locals.
- push ebp
- mov ebp,esp
- and esp,0FFFFFFF0h
- sub esp,108h
- mov ecx,dword [ebp+8]
- mov edx,dword [ebp+0Ch]
- mov eax,dword [ebp+10h]
- ; Start two bytes to the left of each plane pointer.
- sub ecx,2
- sub edx,2
- push esi
- lea esi,[eax+eax*2]
- mov dword [esp+10h],ecx
- mov dword [esp+4],edx
- lea ecx,[ecx+eax*4]
- lea edx,[edx+eax*4]
- lea eax,[esp+6Ch]
- push edi
- ; Local slots from here on (after both pushes):
- ;   [esp+14h] pPixCb-2            [esp+8]   pPixCr-2
- ;   [esp+18h] pPixCb-2+4*iStride  [esp+10h] pPixCr-2+4*iStride
- ;   [esp+0Ch] 3*iStride           [esp+1Ch] address of the 64-byte scratch (= esp+70h)
- mov dword [esp+0Ch],esi
- mov dword [esp+18h],ecx
- mov dword [esp+10h],edx
- mov dword [esp+1Ch],eax
- mov esi,dword [esp+14h]
- mov ecx,dword [ebp+10h]
- mov edx,dword [esp+0Ch]
- ; Gather one dword per row: xmm0..xmm3 each receive row k of
- ; Cb rows 0-3, Cr rows 0-3, Cb rows 4-7 and Cr rows 4-7 (in that dword order).
- movd xmm0,dword [esi]
- movd xmm1,dword [esi+ecx]
- movd xmm2,dword [esi+ecx*2]
- movd xmm3,dword [esi+edx]
- mov esi,dword [esp+8]
- movd xmm4,dword [esi]
- movd xmm5,dword [esi+ecx]
- movd xmm6,dword [esi+ecx*2]
- movd xmm7,dword [esi+edx]
- punpckldq xmm0,xmm4
- punpckldq xmm1,xmm5
- punpckldq xmm2,xmm6
- punpckldq xmm3,xmm7
- mov esi,dword [esp+18h]
- mov edi,dword [esp+10h]
- movd xmm4,dword [esi]
- movd xmm5,dword [edi]
- punpckldq xmm4,xmm5
- punpcklqdq xmm0,xmm4
- movd xmm4,dword [esi+ecx]
- movd xmm5,dword [edi+ecx]
- punpckldq xmm4,xmm5
- punpcklqdq xmm1,xmm4
- movd xmm4,dword [esi+ecx*2]
- movd xmm5,dword [edi+ecx*2]
- punpckldq xmm4,xmm5
- punpcklqdq xmm2,xmm4
- movd xmm4,dword [esi+edx]
- movd xmm5,dword [edi+edx]
- punpckldq xmm4,xmm5
- punpcklqdq xmm3,xmm4
- ; Byte/word/dword/qword interleave: afterwards xmm0, xmm5, xmm1, xmm6 hold
- ; columns c0, c1, c2, c3 (low 8 bytes = Cb rows 0-7, high 8 bytes = Cr rows 0-7).
- movdqa xmm6,xmm0
- punpcklbw xmm0,xmm1
- punpckhbw xmm6,xmm1
- movdqa xmm7,xmm2
- punpcklbw xmm2,xmm3
- punpckhbw xmm7,xmm3
- movdqa xmm4,xmm0
- movdqa xmm5,xmm6
- punpcklwd xmm0,xmm2
- punpckhwd xmm4,xmm2
- punpcklwd xmm6,xmm7
- punpckhwd xmm5,xmm7
- movdqa xmm1,xmm0
- movdqa xmm2,xmm4
- punpckldq xmm0,xmm6
- punpckhdq xmm1,xmm6
- punpckldq xmm4,xmm5
- punpckhdq xmm2,xmm5
- movdqa xmm5,xmm0
- movdqa xmm6,xmm1
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm5,xmm4
- punpcklqdq xmm1,xmm2
- punpckhqdq xmm6,xmm2
- ; Park the four columns in the scratch buffer ([esp+70h] .. [esp+0A0h]).
- mov edi,dword [esp+1Ch]
- movdqa [edi],xmm0
- movdqa [edi+10h],xmm5
- movdqa [edi+20h],xmm1
- movdqa [edi+30h],xmm6
- ; Read the four signed tc bytes and expand them so that xmm0 ends up as the
- ; words tc0,tc0,tc1,tc1,tc2,tc2,tc3,tc3 (one tc value per pair of rows).
- mov eax,dword [ebp+1Ch]
- movsx cx,byte [eax+3]
- movsx dx,byte [eax+2]
- movsx si,byte [eax+1]
- movsx ax,byte [eax]
- movzx edi,cx
- movzx ecx,cx
- movd xmm2,ecx
- movzx ecx,dx
- movzx edx,dx
- movd xmm3,ecx
- movd xmm4,edx
- movzx ecx,si
- movzx edx,si
- movd xmm5,ecx
- pxor xmm0,xmm0
- movd xmm6,edx
- movzx ecx,ax
- movdqa [esp+60h],xmm0
- movzx edx,ax
- movsx eax,word [ebp+14h]
- punpcklwd xmm6,xmm2
- movd xmm1,edi
- movd xmm7,ecx
- movsx ecx,word [ebp+18h]
- movd xmm0,edx
- punpcklwd xmm7,xmm3
- punpcklwd xmm5,xmm1
- movdqa xmm1,[esp+60h]
- punpcklwd xmm7,xmm5
- movdqa xmm5,[esp+0A0h]
- punpcklwd xmm0,xmm4
- punpcklwd xmm0,xmm6
- movdqa xmm6, [esp+70h]
- punpcklwd xmm0,xmm7
- movdqa xmm7,[esp+80h]
- ; [esp+0D0h] = -tc (xmm1 is all zero here).
- movdqa xmm2,xmm1
- psubw xmm2,xmm0
- movdqa [esp+0D0h],xmm2
- ; xmm4 = iAlpha in all eight words; [esp+50h] = iBeta in all eight words.
- movd xmm2,eax
- movdqa xmm3,xmm2
- punpcklwd xmm3,xmm2
- pshufd xmm4,xmm3,0
- movd xmm2,ecx
- movdqa xmm3,xmm2
- punpcklwd xmm3,xmm2
- pshufd xmm2,xmm3,0
- movdqa xmm3, [esp+90h]
- movdqa [esp+50h],xmm2
- ; Widen bytes to words.
- ;   Low (Cb) halves:  [esp+40h]=c0, xmm2=c1, xmm3=c2, xmm5=c3
- ;   High (Cr) halves: [esp+0B0h]=c0, [esp+0F0h]=c1, [esp+0C0h]=c2, [esp+0E0h]=c3
- movdqa xmm2,xmm6
- punpcklbw xmm2,xmm1
- punpckhbw xmm6,xmm1
- movdqa [esp+40h],xmm2
- movdqa [esp+0B0h],xmm6
- movdqa xmm6,[esp+90h]
- movdqa xmm2,xmm7
- punpckhbw xmm7,xmm1
- punpckhbw xmm6,xmm1
- punpcklbw xmm2,xmm1
- punpcklbw xmm3,xmm1
- punpcklbw xmm5,xmm1
- movdqa [esp+0F0h],xmm7
- movdqa [esp+0C0h],xmm6
- movdqa xmm6, [esp+0A0h]
- punpckhbw xmm6,xmm1
- movdqa [esp+0E0h],xmm6
- ; [esp+30h] = rounding constant 4 in all eight words.
- mov edx,4
- movsx eax,dx
- movd xmm6,eax
- movdqa xmm7,xmm6
- punpcklwd xmm7,xmm6
- pshufd xmm6,xmm7,0
- movdqa [esp+30h],xmm6
- movdqa xmm7, [esp+40h]
- psubw xmm7,xmm5
- ; [esp+60h] = (tc > 0) mask; strictly greater, so rows with tc == 0 are left unchanged.
- movdqa xmm6,xmm0
- pcmpgtw xmm6,xmm1
- movdqa [esp+60h],xmm6
- movdqa xmm1, [esp+0D0h]
- ; Low half: delta = clamp((4*(c2-c1) + (c0-c3) + 4) >> 3, -tc, tc) -> [esp+20h]
- movdqa xmm6,xmm3
- psubw xmm6,xmm2
- psllw xmm6,2
- paddw xmm6,xmm7
- paddw xmm6,[esp+30h]
- psraw xmm6,3
- pmaxsw xmm1,xmm6
- movdqa xmm7,[esp+50h]
- movdqa [esp+20h],xmm0
- movdqa xmm6, [esp+20h]
- pminsw xmm6,xmm1
- movdqa [esp+20h],xmm6
- ; Low-half mask: alpha > |c1-c2| & beta > |c0-c1| & beta > |c3-c2| & tc > 0.
- movdqa xmm6,xmm4
- movdqa xmm1,xmm2
- psubw xmm1,xmm3
- pabsw xmm1,xmm1
- pcmpgtw xmm6,xmm1
- movdqa xmm1, [esp+40h]
- psubw xmm1,xmm2
- pabsw xmm1,xmm1
- pcmpgtw xmm7,xmm1
- movdqa xmm1, [esp+50h]
- pand xmm6,xmm7
- movdqa xmm7, [esp+50h]
- psubw xmm5,xmm3
- pabsw xmm5,xmm5
- pcmpgtw xmm1,xmm5
- movdqa xmm5, [esp+0B0h]
- psubw xmm5,[esp+0E0h]
- pand xmm6,xmm1
- pand xmm6, [esp+60h]
- movdqa xmm1, [esp+20h]
- pand xmm1,xmm6
- movdqa xmm6, [esp+0C0h]
- ; [esp+40h] now holds the masked low-half delta (the c0 words are no longer needed).
- movdqa [esp+40h],xmm1
- movdqa xmm1, [esp+0F0h]
- ; High half: same delta computation and clamp, using the same tc vector (xmm0).
- psubw xmm6,xmm1
- psllw xmm6,2
- paddw xmm6,xmm5
- paddw xmm6, [esp+30h]
- movdqa xmm5, [esp+0D0h]
- psraw xmm6,3
- pmaxsw xmm5,xmm6
- pminsw xmm0,xmm5
- movdqa xmm5,[esp+0C0h]
- ; High-half mask, same four conditions as for the low half.
- movdqa xmm6,xmm1
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- pcmpgtw xmm4,xmm6
- movdqa xmm6,[esp+0B0h]
- psubw xmm6,xmm1
- pabsw xmm6,xmm6
- pcmpgtw xmm7,xmm6
- movdqa xmm6, [esp+0E0h]
- pand xmm4,xmm7
- movdqa xmm7, [esp+50h]
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- pcmpgtw xmm7,xmm6
- pand xmm4,xmm7
- pand xmm4,[esp+60h]
- pand xmm0,xmm4
- movdqa xmm4, [esp+40h]
- ; c1' = c1 + delta, c2' = c2 - delta; packuswb saturates to 0..255.
- paddw xmm2,xmm4
- paddw xmm1,xmm0
- psubw xmm3,xmm4
- psubw xmm5,xmm0
- packuswb xmm2,xmm1
- packuswb xmm3,xmm5
- ; Overwrite columns c1 and c2 in the scratch buffer.
- movdqa [esp+80h],xmm2
- movdqa [esp+90h],xmm3
- ; Reload the four columns and run the same interleave sequence to turn
- ; columns back into per-row dwords.
- mov esi,dword [esp+1Ch]
- movdqa xmm0, [esi]
- movdqa xmm1, [esi+10h]
- movdqa xmm2, [esi+20h]
- movdqa xmm3, [esi+30h]
- movdqa xmm6,xmm0
- punpcklbw xmm0,xmm1
- punpckhbw xmm6,xmm1
- movdqa xmm7,xmm2
- punpcklbw xmm2,xmm3
- punpckhbw xmm7,xmm3
- movdqa xmm4,xmm0
- movdqa xmm5,xmm6
- punpcklwd xmm0,xmm2
- punpckhwd xmm4,xmm2
- punpcklwd xmm6,xmm7
- punpckhwd xmm5,xmm7
- movdqa xmm1,xmm0
- movdqa xmm2,xmm4
- punpckldq xmm0,xmm6
- punpckhdq xmm1,xmm6
- punpckldq xmm4,xmm5
- punpckhdq xmm2,xmm5
- movdqa xmm5,xmm0
- movdqa xmm6,xmm1
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm5,xmm4
- punpcklqdq xmm1,xmm2
- punpckhqdq xmm6,xmm2
- ; xmm0/xmm5/xmm1/xmm6 now carry rows k, k+1, k+2, k+3; their four dwords are,
- ; in order: Cb rows 0-3, Cr rows 0-3, Cb rows 4-7, Cr rows 4-7.
- mov esi,dword [esp+14h]
- mov ecx,dword [ebp+10h]
- mov edx,dword [esp+0Ch]
- mov edi,dword [esp+8]
- ; Cb rows 0-3
- movd dword [esi],xmm0
- movd dword [esi+ecx],xmm5
- movd dword [esi+ecx*2],xmm1
- movd dword [esi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- mov esi,dword [esp+18h]
- ; Cr rows 0-3
- movd dword [edi],xmm0
- movd dword [edi+ecx],xmm5
- movd dword [edi+ecx*2],xmm1
- movd dword [edi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- ; Cb rows 4-7
- movd dword [esi],xmm0
- movd dword [esi+ecx],xmm5
- movd dword [esi+ecx*2],xmm1
- movd dword [esi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- mov edi,dword [esp+10h]
- ; Cr rows 4-7
- movd dword [edi],xmm0
- movd dword [edi+ecx],xmm5
- movd dword [edi+ecx*2],xmm1
- movd dword [edi+edx],xmm6
- ; Epilogue: restore callee-saved registers and the caller's stack frame.
- pop edi
- pop esi
- mov esp,ebp
- pop ebp
- ret
-
-
-
-;*******************************************************************************
-; void DeblockLumaLt4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
-; int32_t iBeta, int8_t * pTC)
-;
-; Filters 16 horizontally adjacent pixels across the row boundary at pPix.
-; Six 16-byte rows are loaded with movdqa (so every row address must be
-; 16-byte aligned) and four of them are written back.
-;
-; Stack arguments (32-bit, read relative to ebp):
-;   [ebp+8]  pPix     [ebp+12] iStride
-;   [ebp+16] iAlpha   [ebp+20] iBeta   (only the low 16 bits of each are read)
-;   [ebp+24] pTC      four signed bytes; pTC[i] is applied to pixels 4*i .. 4*i+3
-;
-; Row names used in the comments below (by offset from pPix, in units of iStride):
-;   p2 = -3, p1 = -2, p0 = -1, q0 = 0, q1 = +1, q2 = +2
-;
-; Per pixel, with avg = (p0 + q0 + 1) >> 1, ap = (iBeta > |p0-p2|),
-; aq = (iBeta > |q0-q2|) and tc' = tc + ap + aq:
-;   delta = clamp((4*(q0-p0) + (p1-q1) + 4) >> 3, -tc', tc')
-;   p0' = p0 + delta                      q0' = q0 - delta
-;   p1' = p1 + (ap ? clamp((p2 + avg - 2*p1) >> 1, -tc, tc) : 0)
-;   q1' = q1 + (aq ? clamp((q2 + avg - 2*q1) >> 1, -tc, tc) : 0)
-; All corrections are zero unless iAlpha > |q0-p0|, iBeta > |q0-q1|,
-; iBeta > |p0-p1| and tc >= 0. Results are saturated to 0..255.
-;*******************************************************************************
-
-
-WELS_EXTERN DeblockLumaLt4V_sse2
-
-ALIGN 16
-
-DeblockLumaLt4V_sse2:
- ; Prologue: 16-byte align esp and reserve 420 bytes; together with the three
- ; register pushes below this gives the 432-byte frame used in the
- ; [esp+432-N] style offsets.
- push ebp
- mov ebp, esp
- and esp, -16 ; fffffff0H
- sub esp, 420 ; 000001a4H
- mov eax, dword [ebp+8]
- mov ecx, dword [ebp+12]
-
- pxor xmm0, xmm0
- push ebx
- mov edx, dword [ebp+24]
- ; Keep an all-zero vector on the stack (offset is relative to esp after one push).
- movdqa [esp+424-384], xmm0
- push esi
-
- ; Load the six rows and spill them as bytes:
- ;   [esp+432-208]=p2  [esp+448-208]=p1  [esp+464-208]=p0
- ;   [esp+480-208]=q0  [esp+496-208]=q1  [esp+512-208]=q2
- lea esi, [ecx+ecx*2]
- push edi
- mov edi, eax
- sub edi, esi
- movdqa xmm0, [edi]
-
- lea esi, [ecx+ecx]
- movdqa [esp+432-208], xmm0
- ; edi = address of row p1; it is kept until the final store.
- mov edi, eax
- sub edi, esi
- movdqa xmm0, [edi]
- movdqa [esp+448-208], xmm0
-
- ; ebx = address of row p0 (saved to [esp+432-408] further down).
- mov ebx, eax
- sub ebx, ecx
- movdqa xmm0, [ebx]
- movdqa [esp+464-208], xmm0
-
- movdqa xmm0, [eax]
-
- ; ecx = address of row q1, saved at [esp+432-404].
- add ecx, eax
- movdqa [esp+480-208], xmm0
- movdqa xmm0, [ecx]
- mov dword [esp+432-404], ecx
-
- movsx ecx, word [ebp+16]
- movdqa [esp+496-208], xmm0
- movdqa xmm0, [esi+eax]
-
- movsx si, byte [edx]
- movdqa [esp+512-208], xmm0
- ; [esp+432-112] = iAlpha in all eight words.
- movd xmm0, ecx
- movsx ecx, word [ebp+20]
- movdqa xmm1, xmm0
- punpcklwd xmm1, xmm0
- pshufd xmm0, xmm1, 0
- movdqa [esp+432-112], xmm0
- ; [esp+432-336] = iBeta in all eight words (stored a few lines below).
- movd xmm0, ecx
- movsx cx, byte [edx+1]
- movdqa xmm1, xmm0
- punpcklwd xmm1, xmm0
- mov dword [esp+432-408], ebx
- ; Expand pTC[0] (si) and pTC[1] (cx) into words: four copies of tc0
- ; followed by four copies of tc1 -> [esp+432-400] (tc for pixels 0-7).
- movzx ebx, cx
- pshufd xmm0, xmm1, 0
- movd xmm1, ebx
- movzx ebx, cx
- movd xmm2, ebx
- movzx ebx, cx
- movzx ecx, cx
- movd xmm4, ecx
- movzx ecx, si
- movd xmm5, ecx
- movzx ecx, si
- movd xmm6, ecx
- movzx ecx, si
- movd xmm7, ecx
- movzx ecx, si
- movdqa [esp+432-336], xmm0
- movd xmm0, ecx
-
- movsx cx, byte [edx+3]
- movsx dx, byte [edx+2]
- movd xmm3, ebx
- punpcklwd xmm0, xmm4
- movzx esi, cx
- punpcklwd xmm6, xmm2
- punpcklwd xmm5, xmm1
- punpcklwd xmm0, xmm6
- punpcklwd xmm7, xmm3
- punpcklwd xmm7, xmm5
- punpcklwd xmm0, xmm7
- movdqa [esp+432-400], xmm0
- ; Same expansion for pTC[2] (dx) and pTC[3] (cx): xmm1 ends up as four copies
- ; of tc2 followed by four copies of tc3 (tc for pixels 8-15).
- movd xmm0, esi
- movzx esi, cx
- movd xmm2, esi
- movzx esi, cx
- movzx ecx, cx
- movd xmm4, ecx
- movzx ecx, dx
- movd xmm3, esi
- movd xmm5, ecx
- punpcklwd xmm5, xmm0
-
- ; xmm0 = all-zero vector, used as the unpack partner and as the zero operand.
- movdqa xmm0, [esp+432-384]
- movzx ecx, dx
- movd xmm6, ecx
- movzx ecx, dx
- movzx edx, dx
- punpcklwd xmm6, xmm2
- movd xmm7, ecx
- movd xmm1, edx
-
- ; ---- pixels 0-7: widen the low eight bytes of each row to words ----
- ;   xmm2=p1  xmm3=p0  xmm6=q0  [esp+432-272]=p2
- ;   [esp+432-240]=q1  [esp+432-352]=q2
- movdqa xmm2, [esp+448-208]
- punpcklbw xmm2, xmm0
-
- mov ecx, 4
- movsx edx, cx
- punpcklwd xmm7, xmm3
- punpcklwd xmm7, xmm5
- movdqa xmm5, [esp+496-208]
- movdqa xmm3, [esp+464-208]
- punpcklbw xmm5, xmm0
- movdqa [esp+432-240], xmm5
- movdqa xmm5, [esp+512-208]
- punpcklbw xmm5, xmm0
- movdqa [esp+432-352], xmm5
- punpcklwd xmm1, xmm4
- movdqa xmm4, [esp+432-208]
- punpcklwd xmm1, xmm6
- movdqa xmm6, [esp+480-208]
- punpcklwd xmm1, xmm7
- punpcklbw xmm6, xmm0
- punpcklbw xmm3, xmm0
- punpcklbw xmm4, xmm0
- ; [esp+432-288] = ap mask (beta > |p0-p2|); xmm4 becomes the beta vector.
- movdqa xmm7, xmm3
- psubw xmm7, xmm4
- pabsw xmm7, xmm7
- movdqa [esp+432-272], xmm4
- movdqa xmm4, [esp+432-336]
- movdqa xmm5, xmm4
- pcmpgtw xmm5, xmm7
- movdqa [esp+432-288], xmm5
- ; [esp+432-256] = aq mask (beta > |q0-q2|).
- movdqa xmm7, xmm6
- psubw xmm7, [esp+432-352]
- pabsw xmm7, xmm7
- movdqa xmm5, xmm4
- pcmpgtw xmm5, xmm7
- movdqa [esp+432-256], xmm5
- ; [esp+432-304] = avg = (p0 + q0 + 1) >> 1.
- movdqa xmm5, xmm3
- pavgw xmm5, xmm6
- movdqa [esp+432-304], xmm5
- ; [esp+432-224] = tc' = tc - ap - aq (masks are -1 when set, so this adds 1 each).
- movdqa xmm5, [esp+432-400]
- psubw xmm5, [esp+432-288]
- psubw xmm5, [esp+432-256]
- movdqa [esp+432-224], xmm5
- ; Filter mask: alpha > |q0-p0| & beta > |q0-q1| & beta > |p0-p1| & tc >= 0
- ; -> [esp+432-320]. (q0-p0) is kept at [esp+432-384], q0 at [esp+432-32].
- movdqa xmm5, xmm6
- psubw xmm5, xmm3
- movdqa [esp+432-32], xmm6
- psubw xmm6, [esp+432-240]
- movdqa xmm7, xmm5
- movdqa [esp+432-384], xmm5
- movdqa xmm5, [esp+432-112]
- pabsw xmm7, xmm7
- pcmpgtw xmm5, xmm7
- pabsw xmm6, xmm6
- movdqa xmm7, xmm4
- pcmpgtw xmm7, xmm6
-
- pand xmm5, xmm7
- movdqa xmm6, xmm3
- psubw xmm6, xmm2
- pabsw xmm6, xmm6
- movdqa xmm7, xmm4
- pcmpgtw xmm7, xmm6
- movdqa xmm6, [esp+432-400]
- pand xmm5, xmm7
- movdqa xmm7, xmm6
- pcmpeqw xmm6, xmm0
- pcmpgtw xmm7, xmm0
- por xmm7, xmm6
- pand xmm5, xmm7
- movdqa [esp+432-320], xmm5
- ; [esp+432-336] is reused for the rounding constant 4 (beta stays in xmm4).
- movd xmm5, edx
- movdqa xmm6, xmm5
- punpcklwd xmm6, xmm5
- pshufd xmm5, xmm6, 0
- movdqa [esp+432-336], xmm5
- ; delta = clamp((4*(q0-p0) + (p1-q1) + 4) >> 3, -tc', tc') & mask -> [esp+432-64]
- movdqa xmm5, [esp+432-224]
- movdqa [esp+432-368], xmm5
- movdqa xmm6, xmm0
- psubw xmm6, xmm5
- movdqa xmm5, [esp+432-384]
- psllw xmm5, 2
- movdqa xmm7, xmm2
- psubw xmm7, [esp+432-240]
- paddw xmm7, xmm5
- paddw xmm7, [esp+432-336]
- movdqa xmm5, [esp+432-368]
- psraw xmm7, 3
- pmaxsw xmm6, xmm7
- pminsw xmm5, xmm6
-
- pand xmm5, [esp+432-320]
- movdqa xmm6, [esp+432-400]
- movdqa [esp+432-64], xmm5
- ; p1 correction: clamp((p2 + avg - 2*p1) >> 1, -tc, tc) & mask & ap -> [esp+432-96]
- ; ([esp+432-384] = tc, [esp+432-368] = -tc for this and the next step)
- movdqa [esp+432-384], xmm6
- movdqa xmm5, xmm0
- psubw xmm5, xmm6
- movdqa [esp+432-368], xmm5
- movdqa xmm6, xmm5
- movdqa xmm5, [esp+432-272]
- paddw xmm5, [esp+432-304]
- movdqa xmm7, xmm2
- paddw xmm7, xmm2
- psubw xmm5, xmm7
- psraw xmm5, 1
- pmaxsw xmm6, xmm5
- movdqa xmm5, [esp+432-384]
- pminsw xmm5, xmm6
-
- pand xmm5, [esp+432-320]
- pand xmm5, [esp+432-288]
- movdqa xmm6, [esp+432-240]
- movdqa [esp+432-96], xmm5
- ; q1 correction: clamp((q2 + avg - 2*q1) >> 1, -tc, tc) & mask & aq -> [esp+432-48]
- movdqa xmm5, [esp+432-352]
- paddw xmm5, [esp+432-304]
- movdqa xmm7, xmm6
- paddw xmm7, xmm6
- movdqa xmm6, [esp+432-368]
- psubw xmm5, xmm7
-
- movdqa xmm7, [esp+496-208]
- psraw xmm5, 1
- pmaxsw xmm6, xmm5
- movdqa xmm5, [esp+432-400]
- pminsw xmm5, xmm6
- pand xmm5, [esp+432-320]
- pand xmm5, [esp+432-256]
- ; ---- pixels 8-15: widen the high eight bytes of each row to words ----
- ;   [esp+432-352]=q1  [esp+432-368]=p1  [esp+432-384]=q2
- ;   [esp+432-400]=p0  [esp+432-16]=p2   xmm6/[esp+432-80]=q0
- ; tc for this half is xmm1; the temporaries above are reused for the same roles.
- movdqa xmm6, [esp+448-208]
- punpckhbw xmm7, xmm0
- movdqa [esp+432-352], xmm7
-
- movdqa xmm7, [esp+512-208]
- punpckhbw xmm6, xmm0
- movdqa [esp+432-48], xmm5
- movdqa xmm5, [esp+432-208]
- movdqa [esp+432-368], xmm6
- movdqa xmm6, [esp+464-208]
- punpckhbw xmm7, xmm0
- punpckhbw xmm5, xmm0
- movdqa [esp+432-384], xmm7
- punpckhbw xmm6, xmm0
- movdqa [esp+432-400], xmm6
-
- ; ap mask for the high half -> [esp+432-288]
- movdqa xmm7, [esp+432-400]
- movdqa xmm6, [esp+480-208]
- psubw xmm7, xmm5
- movdqa [esp+432-16], xmm5
- pabsw xmm7, xmm7
- punpckhbw xmm6, xmm0
- movdqa xmm5, xmm4
- pcmpgtw xmm5, xmm7
- movdqa [esp+432-288], xmm5
-
- ; aq mask for the high half -> [esp+432-256]
- movdqa xmm7, xmm6
- psubw xmm7, [esp+432-384]
- pabsw xmm7, xmm7
- movdqa xmm5, xmm4
- pcmpgtw xmm5, xmm7
- movdqa [esp+432-256], xmm5
-
- ; avg for the high half -> [esp+432-304]
- movdqa xmm5, [esp+432-400]
- movdqa [esp+432-80], xmm6
- pavgw xmm5, xmm6
- movdqa [esp+432-304], xmm5
-
- ; tc' for the high half -> [esp+432-224]; (q0-p0) -> [esp+432-272]
- movdqa xmm5, xmm1
- psubw xmm5, [esp+432-288]
- psubw xmm5, [esp+432-256]
- movdqa [esp+432-224], xmm5
- movdqa xmm5, xmm6
- psubw xmm5, [esp+432-400]
- psubw xmm6, [esp+432-352]
- movdqa [esp+432-272], xmm5
- ; High-half filter mask (same four conditions) -> [esp+432-320]
- movdqa xmm7, xmm5
- movdqa xmm5, [esp+432-112]
- pabsw xmm7, xmm7
- pcmpgtw xmm5, xmm7
- movdqa xmm7, xmm4
- pabsw xmm6, xmm6
- pcmpgtw xmm7, xmm6
- movdqa xmm6, [esp+432-368]
-
- pand xmm5, xmm7
- movdqa xmm7, [esp+432-400]
- psubw xmm7, xmm6
- psubw xmm6, [esp+432-352]
- pabsw xmm7, xmm7
- pcmpgtw xmm4, xmm7
- pand xmm5, xmm4
-
- ; xmm2 = low-half p1' (p1 plus its correction), still as words.
- paddw xmm2, [esp+432-96]
- movdqa xmm4, xmm1
- pcmpgtw xmm4, xmm0
- movdqa xmm7, xmm1
- pcmpeqw xmm7, xmm0
- por xmm4, xmm7
- pand xmm5, xmm4
- movdqa xmm4, [esp+432-224]
- movdqa [esp+432-320], xmm5
- ; High-half delta, clamped to [-tc', tc'] and masked -> [esp+432-272].
- ; xmm0 stops being zero here: it becomes -tc and is saved at [esp+432-336].
- movdqa xmm5, [esp+432-272]
- movdqa xmm7, xmm0
- psubw xmm7, xmm4
- psubw xmm0, xmm1
- psllw xmm5, 2
- paddw xmm6, xmm5
- paddw xmm6, [esp+432-336]
- movdqa xmm5, [esp+432-368]
- movdqa [esp+432-336], xmm0
- psraw xmm6, 3
- pmaxsw xmm7, xmm6
- pminsw xmm4, xmm7
- pand xmm4, [esp+432-320]
- ; High-half p1 correction -> xmm7, then added to p1 (xmm5).
- movdqa xmm6, xmm0
- movdqa xmm0, [esp+432-16]
- paddw xmm0, [esp+432-304]
- movdqa [esp+432-272], xmm4
- movdqa xmm4, [esp+432-368]
- paddw xmm4, xmm4
- psubw xmm0, xmm4
-
- movdqa xmm4, [esp+432-64]
- psraw xmm0, 1
- pmaxsw xmm6, xmm0
- movdqa xmm0, [esp+432-400]
- movdqa xmm7, xmm1
- pminsw xmm7, xmm6
- movdqa xmm6, [esp+432-320]
- pand xmm7, xmm6
- pand xmm7, [esp+432-288]
- paddw xmm5, xmm7
- ; xmm2 = new p1 row (16 bytes), xmm3 = new p0 row.
- packuswb xmm2, xmm5
- movdqa xmm5, [esp+432-272]
- paddw xmm0, xmm5
- paddw xmm3, xmm4
- packuswb xmm3, xmm0
-
- ; xmm0 = new q0 row (q0 - delta for both halves), parked at [esp+480-208].
- movdqa xmm0, [esp+432-32]
- psubw xmm0, xmm4
- movdqa xmm4, [esp+432-80]
- psubw xmm4, xmm5
-
- movdqa xmm5, [esp+432-240]
- paddw xmm5, [esp+432-48]
- packuswb xmm0, xmm4
- movdqa xmm4, [esp+432-384]
- paddw xmm4, [esp+432-304]
- movdqa [esp+480-208], xmm0
- movdqa xmm0, [esp+432-352]
- movdqa xmm7, xmm0
- paddw xmm0, xmm0
-
- ; ecx = address of row p0
- mov ecx, dword [esp+432-408]
-
- ; edx = address of row q1
- mov edx, dword [esp+432-404]
- psubw xmm4, xmm0
- movdqa xmm0, [esp+432-336]
- ; Store new p1 row (edi still points at pPix - 2*iStride).
- movdqa [edi], xmm2
- psraw xmm4, 1
- pmaxsw xmm0, xmm4
- pminsw xmm1, xmm0
- movdqa xmm0, [esp+480-208]
-
- pop edi
- pand xmm1, xmm6
- ; esp moved by 4 after the pop above, hence 428 instead of 432:
- ; this is still the aq mask slot ([esp+432-256] before the pop).
- pand xmm1, [esp+428-256]
- ; Store new p0 row.
- movdqa [ecx], xmm3
- paddw xmm7, xmm1
- pop esi
- packuswb xmm5, xmm7
- ; Store new q0 row and new q1 row.
- movdqa [eax], xmm0
- movdqa [edx], xmm5
- pop ebx
- mov esp, ebp
- pop ebp
- ret
-
-
-;*******************************************************************************
-; void DeblockLumaEq4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
-; int32_t iBeta)
-;
-; Filters 16 horizontally adjacent pixels across the row boundary at pPix.
-; Eight 16-byte rows are loaded with movdqa (so every row address must be
-; 16-byte aligned) and six of them are written back.
-;
-; Stack arguments (32-bit, read relative to ebp):
-;   [ebp+8]  pPix     [ebp+12] iStride
-;   [ebp+16] iAlpha   [ebp+20] iBeta   (only the low 16 bits of each are read)
-;
-; Row names used in the comments below (by offset from pPix, in units of iStride):
-;   p3 = -4, p2 = -3, p1 = -2, p0 = -1, q0 = 0, q1 = +1, q2 = +2, q3 = +3
-;
-; Per pixel:
-;   mask   = iAlpha > |q0-p0|  &  iBeta > |p0-p1|  &  iBeta > |q0-q1|
-;   small  = ((iAlpha >> 2) + 2) > |q0-p0|
-;   pmask  = small & iBeta > |p0-p2|       qmask = small & iBeta > |q0-q2|
-; Where mask is clear every pixel is kept. Where mask is set:
-;   pmask set:   p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
-;                p1' = (p2 + p1 + p0 + q0 + 2) >> 2
-;                p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
-;   pmask clear: p0' = (2*p1 + p0 + q1 + 2) >> 2,  p1 and p2 kept
-; and the mirrored formulas (p <-> q, qmask) for q0', q1', q2'.
-; These formulas were traced for pixels 0-7; the code for pixels 8-15 follows
-; the same pattern -- NOTE(review): confirm the high half against the C reference.
-;*******************************************************************************
-
-WELS_EXTERN DeblockLumaEq4V_sse2
-
-ALIGN 16
-
-DeblockLumaEq4V_sse2:
-
- ; Prologue: 16-byte align esp and reserve 628 bytes; together with the three
- ; register pushes this gives the 640-byte frame used in the [esp+640-N] offsets.
- push ebp
- mov ebp, esp
- and esp, -16 ; fffffff0H
- sub esp, 628 ; 00000274H
- mov eax, dword [ebp+8]
- mov ecx, dword [ebp+12]
- push ebx
- push esi
-
- lea edx, [ecx*4]
- ; xmm2 = all-zero vector for the rest of the low-half code.
- pxor xmm0, xmm0
- movdqa xmm2, xmm0
-
- ; Load the eight rows. Byte copies are spilled to:
- ;   [esp+656-272]=p2  [esp+672-272]=p1  [esp+688-272]=p0  [esp+704-272]=q0
- ;   [esp+720-272]=q1  [esp+736-272]=q2  [esp+752-272]=q3   (p3 stays in xmm3)
- ; The first four slots are later reused for the filtered p2', p1', p0', q0' rows.
- movdqa xmm0, [ecx+eax]
- mov esi, eax
- sub esi, edx
- movdqa xmm3, [esi]
- movdqa xmm5, [eax]
- push edi
- lea edi, [ecx+ecx]
- lea ebx, [ecx+ecx*2]
- mov dword [esp+640-600], edi
- ; esi = address of row p1 (kept until the final store)
- mov esi, eax
- sub esi, edi
- movdqa xmm1, [esi]
- movdqa [esp+720-272], xmm0
- ; edi = address of row p0 (kept until the final store)
- mov edi, eax
- sub edi, ecx
- movdqa xmm4, [edi]
- ; [esp+640-596] = address of row q1
- add ecx, eax
- mov dword [esp+640-596], ecx
-
- mov ecx, dword [esp+640-600]
- movdqa xmm0, [ecx+eax]
- movdqa [esp+736-272], xmm0
-
- movdqa xmm0, [eax+ebx]
- ; edx = address of row p2 (kept until the final store)
- mov edx, eax
- sub edx, ebx
-
- movsx ebx, word [ebp+16]
- movdqa xmm6, [edx]
- ; ecx = address of row q2 (kept until the final store)
- add ecx, eax
- movdqa [esp+752-272], xmm0
- ; [esp+640-320] = iAlpha in all eight words.
- movd xmm0, ebx
-
- movsx ebx, word [ebp+20]
- movdqa xmm7, xmm0
- punpcklwd xmm7, xmm0
- pshufd xmm0, xmm7, 0
- movdqa [esp+640-320], xmm0
- ; [esp+640-512] = iBeta in all eight words.
- movd xmm0, ebx
- movdqa xmm7, xmm0
- punpcklwd xmm7, xmm0
- pshufd xmm0, xmm7, 0
-
- ; ---- pixels 0-7: widen the low eight bytes of each row to words ----
- ;   [esp+640-480]=p1  [esp+640-144]=p0  [esp+640-400]=q0  [esp+640-384]=q1
- ;   [esp+640-368]=p2  [esp+640-416]=q2   (xmm1=p0, xmm5=q0, xmm6=p2 in registers)
- movdqa xmm7, [esp+736-272]
- punpcklbw xmm7, xmm2
- movdqa [esp+640-416], xmm7
- movdqa [esp+640-512], xmm0
- movdqa xmm0, xmm1
- movdqa [esp+672-272], xmm1
- movdqa xmm1, xmm4
- movdqa [esp+704-272], xmm5
- punpcklbw xmm5, xmm2
- punpcklbw xmm1, xmm2
-
- ; [esp+640-560] = |q0-p0| (temporarily)
- movdqa xmm7, xmm5
- psubw xmm7, xmm1
- pabsw xmm7, xmm7
- movdqa [esp+640-560], xmm7
- punpcklbw xmm0, xmm2
- movdqa [esp+688-272], xmm4
- movdqa xmm4, [esp+720-272]
- movdqa [esp+640-480], xmm0
-
- ; xmm0 = mask = beta > |p0-p1| & beta > |q0-q1| & alpha > |q0-p0|
- movdqa xmm7, xmm1
- psubw xmm7, xmm0
-
- movdqa xmm0, [esp+640-512]
- pabsw xmm7, xmm7
- punpcklbw xmm4, xmm2
- pcmpgtw xmm0, xmm7
- movdqa [esp+640-384], xmm4
- movdqa xmm7, xmm5
- psubw xmm7, xmm4
- movdqa xmm4, [esp+640-512]
- movdqa [esp+656-272], xmm6
- punpcklbw xmm6, xmm2
- pabsw xmm7, xmm7
- ; [esp+640-48] keeps a zero vector for the last unpack near the end.
- movdqa [esp+640-48], xmm2
- movdqa [esp+640-368], xmm6
- movdqa [esp+640-144], xmm1
- movdqa [esp+640-400], xmm5
- pcmpgtw xmm4, xmm7
- pand xmm0, xmm4
- movdqa xmm4, [esp+640-320]
- pcmpgtw xmm4, [esp+640-560]
- pand xmm0, xmm4
-
- ; [esp+640-624] = constant 2; [esp+640-576] = (alpha >> 2) + 2;
- ; [esp+640-560] = small = ((alpha >> 2) + 2) > |q0-p0|
- mov ebx, 2
- movsx ebx, bx
- movd xmm4, ebx
- movdqa xmm7, xmm4
- punpcklwd xmm7, xmm4
- movdqa xmm4, [esp+640-320]
- psraw xmm4, 2
- pshufd xmm7, xmm7, 0
- paddw xmm4, xmm7
- movdqa [esp+640-576], xmm4
- pcmpgtw xmm4, [esp+640-560]
- movdqa [esp+640-560], xmm4
-
- ; [esp+640-544] = pmask = small & beta > |p0-p2|
- movdqa xmm4, [esp+640-512]
- movdqa [esp+640-624], xmm7
- movdqa xmm7, xmm1
- psubw xmm7, xmm6
- pabsw xmm7, xmm7
- pcmpgtw xmm4, xmm7
-
- pand xmm4, [esp+640-560]
- movdqa [esp+640-544], xmm4
- ; [esp+640-560] = qmask = small & beta > |q0-q2|
- movdqa xmm4, [esp+640-512]
- movdqa xmm7, xmm5
- psubw xmm7, [esp+640-416]
- pabsw xmm7, xmm7
- pcmpgtw xmm4, xmm7
-
- pand xmm4, [esp+640-560]
- movdqa [esp+640-560], xmm4
-
- ; [esp+640-16] = ~pmask & p2 (p2 kept where the strong filter is off)
- movdqa xmm4, [esp+640-544]
- pandn xmm4, xmm6
- movdqa [esp+640-16], xmm4
- ; [esp+640-592] = constant 4
- mov ebx, 4
- movsx ebx, bx
- movd xmm4, ebx
- movdqa xmm7, xmm4
- punpcklwd xmm7, xmm4
- ; xmm4 = 2*p3 + 3*p2 + p1 + p0 + q0 + 4 (shifted and masked further down)
- movdqa xmm4, xmm3
- punpcklbw xmm4, xmm2
- psllw xmm4, 1
- paddw xmm4, xmm6
- paddw xmm4, xmm6
- paddw xmm4, xmm6
- paddw xmm4, [esp+640-480]
-
- movdqa xmm6, [esp+640-560]
- pshufd xmm7, xmm7, 0
- paddw xmm4, xmm1
- movdqa [esp+640-592], xmm7
- paddw xmm4, xmm5
- paddw xmm4, xmm7
- ; [esp+640-80] = ~qmask & q2
- movdqa xmm7, [esp+640-416]
- pandn xmm6, xmm7
- movdqa [esp+640-80], xmm6
- ; [esp+640-112] = qmask & ((2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3)
- movdqa xmm6, [esp+752-272]
- punpcklbw xmm6, xmm2
- psllw xmm6, 1
- paddw xmm6, xmm7
- paddw xmm6, xmm7
- paddw xmm6, xmm7
- paddw xmm6, [esp+640-384]
-
- movdqa xmm7, [esp+640-480]
- paddw xmm6, xmm5
- paddw xmm6, xmm1
- paddw xmm6, [esp+640-592]
- psraw xmm6, 3
- pand xmm6, [esp+640-560]
- movdqa [esp+640-112], xmm6
- ; [esp+640-336] = ~pmask & p1
- movdqa xmm6, [esp+640-544]
- pandn xmm6, xmm7
- movdqa [esp+640-336], xmm6
- ; [esp+640-64] = pmask & ((p2 + p1 + p0 + q0 + 2) >> 2);
- ; xmm4 = pmask & ((2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3)
- movdqa xmm6, [esp+640-544]
- movdqa [esp+640-528], xmm6
- movdqa xmm6, [esp+640-368]
- paddw xmm6, xmm7
- movdqa xmm7, xmm1
- psraw xmm4, 3
- pand xmm4, [esp+640-544]
- paddw xmm7, xmm5
- paddw xmm6, xmm7
- paddw xmm6, [esp+640-624]
- movdqa xmm7, [esp+640-528]
-
- paddw xmm5, xmm1
- psraw xmm6, 2
- pand xmm7, xmm6
-
- movdqa xmm6, [esp+640-384]
- movdqa [esp+640-64], xmm7
- ; [esp+640-304] = ~qmask & q1
- movdqa xmm7, [esp+640-560]
- pandn xmm7, xmm6
- movdqa [esp+640-304], xmm7
- ; [esp+640-32] = qmask & ((q2 + q1 + q0 + p0 + 2) >> 2)
- movdqa xmm7, [esp+640-560]
- movdqa [esp+640-528], xmm7
- movdqa xmm7, [esp+640-416]
- paddw xmm7, xmm6
- paddw xmm7, xmm5
- paddw xmm7, [esp+640-624]
- movdqa xmm5, [esp+640-528]
- psraw xmm7, 2
- pand xmm5, xmm7
- movdqa [esp+640-32], xmm5
-
- ; [esp+640-352] = ~pmask & ((2*p1 + p0 + q1 + 2) >> 2)
- movdqa xmm5, [esp+640-544]
- movdqa [esp+640-528], xmm5
- movdqa xmm5, [esp+640-480]
- movdqa xmm7, xmm5
- paddw xmm7, xmm5
- movdqa xmm5, xmm1
- paddw xmm5, xmm6
- paddw xmm6, [esp+640-592]
- paddw xmm7, xmm5
- paddw xmm7, [esp+640-624]
- movdqa xmm5, [esp+640-528]
- psraw xmm7, 2
- pandn xmm5, xmm7
- ; [esp+640-96] = pmask & ((p2 + 2*(p1 + p0 + q0) + q1 + 4) >> 3)
- movdqa xmm7, [esp+640-480]
- paddw xmm7, xmm1
- paddw xmm7, [esp+640-400]
- movdqa xmm1, [esp+640-544]
- movdqa [esp+640-352], xmm5
- movdqa xmm5, [esp+640-368]
- psllw xmm7, 1
- paddw xmm7, xmm6
- paddw xmm5, xmm7
-
- movdqa xmm7, [esp+640-400]
- psraw xmm5, 3
- pand xmm1, xmm5
- movdqa xmm5, [esp+640-480]
- movdqa [esp+640-96], xmm1
- ; [esp+640-288] = ~qmask & ((2*q1 + q0 + p1 + 2) >> 2)
- ; [esp+640-128] = qmask & ((q2 + 2*(q1 + q0 + p0) + p1 + 4) >> 3)
- movdqa xmm1, [esp+640-560]
- movdqa [esp+640-528], xmm1
- movdqa xmm1, [esp+640-384]
- movdqa xmm6, xmm1
- paddw xmm6, xmm1
- paddw xmm1, [esp+640-400]
- paddw xmm1, [esp+640-144]
- paddw xmm7, xmm5
- paddw xmm5, [esp+640-592]
- paddw xmm6, xmm7
- paddw xmm6, [esp+640-624]
- movdqa xmm7, [esp+640-528]
- psraw xmm6, 2
- psllw xmm1, 1
- paddw xmm1, xmm5
-
- movdqa xmm5, [esp+656-272]
- pandn xmm7, xmm6
- movdqa xmm6, [esp+640-416]
- paddw xmm6, xmm1
- movdqa xmm1, [esp+640-560]
- psraw xmm6, 3
- pand xmm1, xmm6
-
- ; ---- pixels 8-15: widen the high eight bytes of each row to words ----
- ;   [esp+640-448]=p1  [esp+640-496]=p0  [esp+640-432]=q0  [esp+640-464]=q1
- ;   [esp+640-528]=q2  xmm5=p2           (p3 is widened into xmm3 further down)
- movdqa xmm6, [esp+704-272]
- movdqa [esp+640-128], xmm1
- movdqa xmm1, [esp+672-272]
- punpckhbw xmm1, xmm2
- movdqa [esp+640-448], xmm1
- movdqa xmm1, [esp+688-272]
- punpckhbw xmm1, xmm2
- punpckhbw xmm6, xmm2
- movdqa [esp+640-288], xmm7
- punpckhbw xmm5, xmm2
- movdqa [esp+640-496], xmm1
- movdqa [esp+640-432], xmm6
-
- movdqa xmm7, [esp+720-272]
- punpckhbw xmm7, xmm2
- movdqa [esp+640-464], xmm7
-
- movdqa xmm7, [esp+736-272]
- punpckhbw xmm7, xmm2
- movdqa [esp+640-528], xmm7
-
- movdqa xmm7, xmm6
-
- psubw xmm6, [esp+640-464]
- psubw xmm7, xmm1
- pabsw xmm7, xmm7
- movdqa [esp+640-560], xmm7
- ; xmm4 = low-half p2 candidate: strong result where pmask, p2 elsewhere.
- por xmm4, [esp+640-16]
- pabsw xmm6, xmm6
- movdqa xmm7, xmm1
- psubw xmm7, [esp+640-448]
-
- ; xmm1 = high-half mask (same three tests as xmm0 for the low half).
- movdqa xmm1, [esp+640-512]
- pabsw xmm7, xmm7
- pcmpgtw xmm1, xmm7
- movdqa xmm7, [esp+640-512]
- pcmpgtw xmm7, xmm6
- movdqa xmm6, [esp+640-320]
- pand xmm1, xmm7
- movdqa xmm7, [esp+640-560]
- pcmpgtw xmm6, xmm7
- pand xmm1, xmm6
-
- ; High-half pmask -> [esp+640-544], high-half qmask -> [esp+640-560].
- movdqa xmm6, [esp+640-576]
- pcmpgtw xmm6, xmm7
-
- movdqa xmm7, [esp+640-496]
- punpckhbw xmm3, xmm2
- movdqa [esp+640-560], xmm6
- movdqa xmm6, [esp+640-512]
- psubw xmm7, xmm5
- pabsw xmm7, xmm7
- pcmpgtw xmm6, xmm7
-
- pand xmm6, [esp+640-560]
- movdqa xmm7, [esp+640-432]
- psubw xmm7, [esp+640-528]
-
- psllw xmm3, 1
- movdqa [esp+640-544], xmm6
- movdqa xmm6, [esp+640-512]
-
- ; xmm2 stops being zero here (it becomes the high-half pmask).
- movdqa xmm2, [esp+640-544]
- paddw xmm3, xmm5
- paddw xmm3, xmm5
- paddw xmm3, xmm5
- paddw xmm3, [esp+640-448]
- paddw xmm3, [esp+640-496]
- pabsw xmm7, xmm7
- pcmpgtw xmm6, xmm7
- pand xmm6, [esp+640-560]
- movdqa [esp+640-560], xmm6
-
- ; Select by mask (filtered where set, original elsewhere), pack both halves,
- ; and park the new p2 row at [esp+656-272].
- movdqa xmm6, xmm0
- pand xmm6, xmm4
- movdqa xmm4, xmm0
- pandn xmm4, [esp+640-368]
- por xmm6, xmm4
- movdqa xmm4, [esp+640-432]
- paddw xmm3, xmm4
- paddw xmm3, [esp+640-592]
- psraw xmm3, 3
- pand xmm3, xmm2
- pandn xmm2, xmm5
- por xmm3, xmm2
- movdqa xmm7, xmm1
- pand xmm7, xmm3
- movdqa xmm3, [esp+640-64]
- por xmm3, [esp+640-336]
- movdqa xmm2, xmm1
- pandn xmm2, xmm5
- por xmm7, xmm2
-
- movdqa xmm2, xmm0
- pand xmm2, xmm3
- movdqa xmm3, xmm0
- pandn xmm3, [esp+640-480]
- por xmm2, xmm3
- packuswb xmm6, xmm7
- movdqa [esp+640-336], xmm2
- movdqa [esp+656-272], xmm6
- ; New p1 row -> [esp+672-272]
- movdqa xmm6, [esp+640-544]
- movdqa xmm2, xmm5
- paddw xmm2, [esp+640-448]
- movdqa xmm3, xmm1
- movdqa xmm7, [esp+640-496]
- paddw xmm7, xmm4
- paddw xmm2, xmm7
- paddw xmm2, [esp+640-624]
- movdqa xmm7, [esp+640-544]
- psraw xmm2, 2
- pand xmm6, xmm2
- movdqa xmm2, [esp+640-448]
- pandn xmm7, xmm2
- por xmm6, xmm7
- pand xmm3, xmm6
- movdqa xmm6, xmm1
- pandn xmm6, xmm2
- paddw xmm2, [esp+640-496]
- paddw xmm2, xmm4
- por xmm3, xmm6
- movdqa xmm6, [esp+640-336]
- packuswb xmm6, xmm3
- psllw xmm2, 1
- movdqa [esp+672-272], xmm6
- ; New p0 row -> [esp+688-272]
- movdqa xmm6, [esp+640-96]
- por xmm6, [esp+640-352]
-
- movdqa xmm3, xmm0
- pand xmm3, xmm6
- movdqa xmm6, xmm0
- pandn xmm6, [esp+640-144]
- por xmm3, xmm6
- movdqa xmm6, [esp+640-544]
- movdqa [esp+640-352], xmm3
- movdqa xmm3, [esp+640-464]
- paddw xmm3, [esp+640-592]
- paddw xmm2, xmm3
- movdqa xmm3, [esp+640-448]
- paddw xmm5, xmm2
- movdqa xmm2, [esp+640-496]
- psraw xmm5, 3
- pand xmm6, xmm5
- movdqa xmm5, [esp+640-464]
- paddw xmm2, xmm5
- paddw xmm5, [esp+640-432]
- movdqa xmm4, xmm3
- paddw xmm4, xmm3
- paddw xmm4, xmm2
- paddw xmm4, [esp+640-624]
- movdqa xmm2, [esp+640-544]
- paddw xmm3, [esp+640-592]
- psraw xmm4, 2
- pandn xmm2, xmm4
- por xmm6, xmm2
- movdqa xmm7, xmm1
- pand xmm7, xmm6
- movdqa xmm6, [esp+640-496]
- movdqa xmm2, xmm1
- pandn xmm2, xmm6
- por xmm7, xmm2
- movdqa xmm2, [esp+640-352]
- packuswb xmm2, xmm7
- movdqa [esp+688-272], xmm2
- ; New q0 row -> [esp+704-272]
- movdqa xmm2, [esp+640-128]
- por xmm2, [esp+640-288]
-
- movdqa xmm4, xmm0
- pand xmm4, xmm2
- paddw xmm5, xmm6
- movdqa xmm2, xmm0
- pandn xmm2, [esp+640-400]
- por xmm4, xmm2
- movdqa xmm2, [esp+640-528]
- psllw xmm5, 1
- paddw xmm5, xmm3
- movdqa xmm3, [esp+640-560]
- paddw xmm2, xmm5
- psraw xmm2, 3
- movdqa [esp+640-288], xmm4
- movdqa xmm4, [esp+640-560]
- pand xmm4, xmm2
- movdqa xmm2, [esp+640-464]
- movdqa xmm5, xmm2
- paddw xmm5, xmm2
- movdqa xmm2, [esp+640-432]
- paddw xmm2, [esp+640-448]
- movdqa xmm7, xmm1
- paddw xmm5, xmm2
- paddw xmm5, [esp+640-624]
- movdqa xmm6, [esp+640-560]
- psraw xmm5, 2
- pandn xmm3, xmm5
- por xmm4, xmm3
- movdqa xmm3, [esp+640-32]
- por xmm3, [esp+640-304]
- pand xmm7, xmm4
- movdqa xmm4, [esp+640-432]
- movdqa xmm5, [esp+640-464]
- movdqa xmm2, xmm1
- pandn xmm2, xmm4
- paddw xmm4, [esp+640-496]
- por xmm7, xmm2
- movdqa xmm2, [esp+640-288]
- packuswb xmm2, xmm7
- movdqa [esp+704-272], xmm2
-
- ; New q1 row -> xmm6 (packed a few lines below)
- movdqa xmm2, xmm0
- pand xmm2, xmm3
- movdqa xmm3, xmm0
- pandn xmm3, [esp+640-384]
- por xmm2, xmm3
- movdqa [esp+640-304], xmm2
- movdqa xmm2, [esp+640-528]
- movdqa xmm3, xmm2
- paddw xmm3, [esp+640-464]
- paddw xmm3, xmm4
- paddw xmm3, [esp+640-624]
- psraw xmm3, 2
- pand xmm6, xmm3
- movdqa xmm3, [esp+640-560]
- movdqa xmm4, xmm3
- pandn xmm4, xmm5
- por xmm6, xmm4
- movdqa xmm7, xmm1
- pand xmm7, xmm6
- movdqa xmm6, [esp+640-304]
- movdqa xmm4, xmm1
- pandn xmm4, xmm5
- por xmm7, xmm4
-
- ; New q2 row -> xmm4 (low half here; the high half is finished and packed
- ; during the epilogue below)
- movdqa xmm4, xmm0
- pandn xmm0, [esp+640-416]
- packuswb xmm6, xmm7
- movdqa xmm7, [esp+640-112]
- por xmm7, [esp+640-80]
- pand xmm4, xmm7
- por xmm4, xmm0
- movdqa xmm0, [esp+752-272]
- punpckhbw xmm0, [esp+640-48]
- psllw xmm0, 1
- paddw xmm0, xmm2
- paddw xmm0, xmm2
- paddw xmm0, xmm2
- paddw xmm0, xmm5
- paddw xmm0, [esp+640-432]
- paddw xmm0, [esp+640-496]
- paddw xmm0, [esp+640-592]
- psraw xmm0, 3
- pand xmm0, xmm3
- movdqa xmm7, xmm1
- pandn xmm3, xmm2
- por xmm0, xmm3
- pand xmm7, xmm0
-
- ; Write back: p2 row (edx), p1 row (esi), p0 row (edi), q0 row (eax),
- ; q1 row (edx reloaded from [esp+640-596]) and q2 row (ecx).
- movdqa xmm0, [esp+656-272]
- movdqa [edx], xmm0
-
- movdqa xmm0, [esp+672-272]
-
- mov edx, dword [esp+640-596]
- movdqa [esi], xmm0
- movdqa xmm0, [esp+688-272]
- movdqa [edi], xmm0
- movdqa xmm0, [esp+704-272]
-
- pop edi
- pandn xmm1, xmm2
- movdqa [eax], xmm0
- por xmm7, xmm1
- pop esi
- packuswb xmm4, xmm7
- movdqa [edx], xmm6
- movdqa [ecx], xmm4
- pop ebx
- mov esp, ebp
- pop ebp
- ret
-
-
-;********************************************************************************
-;
-; void DeblockLumaTransposeH2V_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pDst);
-;
-; Reads 8 bytes from each of 16 consecutive rows starting at pPixY, runs them
-; through the SSE2_TransTwo8x8B macro (defined outside this chunk) and writes
-; the eight resulting 16-byte vectors to pDst[0 .. 127].
-; pDst is written with movdqa and therefore must be 16-byte aligned; the source
-; rows are read with movq and need no alignment.
-; Presumably this turns 8 pixel columns x 16 rows into 8 rows of 16 pixels --
-; confirm against the macro definition.
-;
-; Note the non-standard frame: ebx is pushed before ebp is set up, so the
-; arguments live at [ebp+0Ch] (pPixY), [ebp+10h] (iStride) and [ebp+14h] (pDst).
-;
-;********************************************************************************
-
-WELS_EXTERN DeblockLumaTransposeH2V_sse2
-
-ALIGN 16
-
-DeblockLumaTransposeH2V_sse2:
- push ebp
- push ebx
- mov ebp, esp
- ; Align esp and reserve one 16-byte scratch slot at [esp].
- and esp,0FFFFFFF0h
- sub esp, 10h
-
- ; eax = rows 0.., edx = rows 8.., ebx = 3*iStride
- mov eax, [ebp + 0Ch]
- mov ecx, [ebp + 10h]
- lea edx, [eax + ecx * 8]
- lea ebx, [ecx*3]
-
- ; xmm(k) = row k in the low qword, row k+8 in the high qword (k = 0..3)
- movq xmm0, [eax]
- movq xmm7, [edx]
- punpcklqdq xmm0, xmm7
- movq xmm1, [eax + ecx]
- movq xmm7, [edx + ecx]
- punpcklqdq xmm1, xmm7
- movq xmm2, [eax + ecx*2]
- movq xmm7, [edx + ecx*2]
- punpcklqdq xmm2, xmm7
- movq xmm3, [eax + ebx]
- movq xmm7, [edx + ebx]
- punpcklqdq xmm3, xmm7
-
- ; Same for k = 4..6 after advancing both pointers by four rows.
- lea eax, [eax + ecx * 4]
- lea edx, [edx + ecx * 4]
- movq xmm4, [eax]
- movq xmm7, [edx]
- punpcklqdq xmm4, xmm7
- movq xmm5, [eax + ecx]
- movq xmm7, [edx + ecx]
- punpcklqdq xmm5, xmm7
- movq xmm6, [eax + ecx*2]
- movq xmm7, [edx + ecx*2]
- punpcklqdq xmm6, xmm7
-
- ; All eight registers are in use, so xmm0 is parked at [esp] while it serves
- ; as the temporary for building xmm7 (rows 7 and 15).
- movdqa [esp], xmm0
- movq xmm7, [eax + ebx]
- movq xmm0, [edx + ebx]
- punpcklqdq xmm7, xmm0
- movdqa xmm0, [esp]
-
- SSE2_TransTwo8x8B xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]
- ;pOut: m5, m3, m4, m8, m6, m2, m7, m1
-
- ; Store in the output order listed above (m1 = xmm0 ... m8 = xmm7).
- mov eax, [ebp + 14h]
- movdqa [eax], xmm4
- movdqa [eax + 10h], xmm2
- movdqa [eax + 20h], xmm3
- movdqa [eax + 30h], xmm7
- movdqa [eax + 40h], xmm5
- movdqa [eax + 50h], xmm1
- movdqa [eax + 60h], xmm6
- movdqa [eax + 70h], xmm0
-
- mov esp, ebp
- pop ebx
- pop ebp
- ret
-
-
-
-;*******************************************************************************************
-;
-; void DeblockLumaTransposeV2H_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pSrc);
-;
-; Reads eight 16-byte vectors from pSrc[0 .. 127] (movdqa, so pSrc must be
-; 16-byte aligned), runs them through the SSE2_TransTwo8x8B macro (defined
-; outside this chunk) and writes 8 bytes to each of 16 consecutive rows
-; starting at pPixY: low qwords go to rows 0-7, high qwords to rows 8-15.
-; Presumably the inverse of DeblockLumaTransposeH2V_sse2 -- confirm against
-; the macro definition.
-;
-; Arguments: [ebp+08h] pPixY, [ebp+0Ch] iStride, [ebp+10h] pSrc.
-;
-;*******************************************************************************************
-
-WELS_EXTERN DeblockLumaTransposeV2H_sse2
-
-ALIGN 16
-
-DeblockLumaTransposeV2H_sse2:
- push ebp
- mov ebp, esp
-
- ; Align esp and reserve one 16-byte scratch slot at [esp] for the macro.
- and esp, 0FFFFFFF0h
- sub esp, 10h
-
- ; eax = pSrc, ecx = iStride, edx = pPixY
- mov eax, [ebp + 10h]
- mov ecx, [ebp + 0Ch]
- mov edx, [ebp + 08h]
-
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 10h]
- movdqa xmm2, [eax + 20h]
- movdqa xmm3, [eax + 30h]
- movdqa xmm4, [eax + 40h]
- movdqa xmm5, [eax + 50h]
- movdqa xmm6, [eax + 60h]
- movdqa xmm7, [eax + 70h]
-
- SSE2_TransTwo8x8B xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]
- ;pOut: m5, m3, m4, m8, m6, m2, m7, m1
-
- ; eax = 3*iStride
- lea eax, [ecx * 3]
-
- ; Rows 0-3: low qwords, in the output order listed above.
- movq [edx], xmm4
- movq [edx + ecx], xmm2
- movq [edx + ecx*2], xmm3
- movq [edx + eax], xmm7
-
- ; Rows 4-7
- lea edx, [edx + ecx*4]
- movq [edx], xmm5
- movq [edx + ecx], xmm1
- movq [edx + ecx*2], xmm6
- movq [edx + eax], xmm0
-
- ; Bring the high qword of every vector down to the low position.
- psrldq xmm4, 8
- psrldq xmm2, 8
- psrldq xmm3, 8
- psrldq xmm7, 8
- psrldq xmm5, 8
- psrldq xmm1, 8
- psrldq xmm6, 8
- psrldq xmm0, 8
-
- ; Rows 8-11
- lea edx, [edx + ecx*4]
- movq [edx], xmm4
- movq [edx + ecx], xmm2
- movq [edx + ecx*2], xmm3
- movq [edx + eax], xmm7
-
- ; Rows 12-15
- lea edx, [edx + ecx*4]
- movq [edx], xmm5
- movq [edx + ecx], xmm1
- movq [edx + ecx*2], xmm6
- movq [edx + eax], xmm0
-
-
- mov esp, ebp
- pop ebp
- ret
\ No newline at end of file
--- a/codec/decoder/core/asm/expand_picture.asm
+++ /dev/null
@@ -1,655 +1,0 @@
-;*!
-;* \copy
-;* Copyright (c) 2009-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* expand_picture.asm
-;*
-;* Abstract
-;* mmxext/sse for expand_frame
-;*
-;* History
-;* 09/25/2009 Created
-;*
-;*
-;*************************************************************************/
-
-%include "asm_inc.asm"
-
-BITS 32
-
-;***********************************************************************
-; Macros and other preprocessor constants
-;***********************************************************************
-
-;***********************************************************************
-; Local Data (Read Only)
-;***********************************************************************
-
-;SECTION .rodata pData align=16
-
-;***********************************************************************
-; Various memory constants (trigonometric values or rounding values)
-;***********************************************************************
-;%define PADDING_SIZE_ASM 32 ; PADDING_LENGTH
-
-;***********************************************************************
-; Code
-;***********************************************************************
-
-
-
-SECTION .text
-
-;WELS_EXTERN expand_picture_luma_mmx
-;WELS_EXTERN expand_picture_chroma_mmx
-WELS_EXTERN ExpandPictureLuma_sse2
-WELS_EXTERN ExpandPictureChromaAlign_sse2 ; for chroma alignment
-WELS_EXTERN ExpandPictureChromaUnalign_sse2 ; for chroma unalignment
-
-;;;;;;;expanding result;;;;;;;
-
-;aaaa|attttttttttttttttb|bbbb
-;aaaa|attttttttttttttttb|bbbb
-;aaaa|attttttttttttttttb|bbbb
-;aaaa|attttttttttttttttb|bbbb
-;----------------------------
-;aaaa|attttttttttttttttb|bbbb
-;llll|l r|rrrr
-;llll|l r|rrrr
-;llll|l r|rrrr
-;llll|l r|rrrr
-;llll|l r|rrrr
-;cccc|ceeeeeeeeeeeeeeeed|dddd
-;----------------------------
-;cccc|ceeeeeeeeeeeeeeeed|dddd
-;cccc|ceeeeeeeeeeeeeeeed|dddd
-;cccc|ceeeeeeeeeeeeeeeed|dddd
-;cccc|ceeeeeeeeeeeeeeeed|dddd
-
-%macro mov_line_8x4_mmx 3 ; dst, stride, mm?
- movq [%1], %3
- movq [%1+%2], %3
- lea %1, [%1+2*%2]
- movq [%1], %3
- movq [%1+%2], %3
- lea %1, [%1+2*%2]
-%endmacro
-
-%macro mov_line_end8x4_mmx 3 ; dst, stride, mm?
- movq [%1], %3
- movq [%1+%2], %3
- lea %1, [%1+2*%2]
- movq [%1], %3
- movq [%1+%2], %3
- lea %1, [%1+%2]
-%endmacro
-
-%macro mov_line_16x4_sse2 4 ; dst, stride, xmm?, u/a
- movdq%4 [%1], %3 ; top(bottom)_0
- movdq%4 [%1+%2], %3 ; top(bottom)_1
- lea %1, [%1+2*%2]
- movdq%4 [%1], %3 ; top(bottom)_2
- movdq%4 [%1+%2], %3 ; top(bottom)_3
- lea %1, [%1+2*%2]
-%endmacro
-
-%macro mov_line_end16x4_sse2 4 ; dst, stride, xmm?, u/a
- movdq%4 [%1], %3 ; top(bottom)_0
- movdq%4 [%1+%2], %3 ; top(bottom)_1
- lea %1, [%1+2*%2]
- movdq%4 [%1], %3 ; top(bottom)_2
- movdq%4 [%1+%2], %3 ; top(bottom)_3
- lea %1, [%1+%2]
-%endmacro
-
-%macro mov_line_32x4_sse2 3 ; dst, stride, xmm?
- movdqa [%1], %3 ; top(bottom)_0
- movdqa [%1+16], %3 ; top(bottom)_0
- movdqa [%1+%2], %3 ; top(bottom)_1
- movdqa [%1+%2+16], %3 ; top(bottom)_1
- lea %1, [%1+2*%2]
- movdqa [%1], %3 ; top(bottom)_2
- movdqa [%1+16], %3 ; top(bottom)_2
- movdqa [%1+%2], %3 ; top(bottom)_3
- movdqa [%1+%2+16], %3 ; top(bottom)_3
- lea %1, [%1+2*%2]
-%endmacro
-
-%macro mov_line_end32x4_sse2 3 ; dst, stride, xmm?
- movdqa [%1], %3 ; top(bottom)_0
- movdqa [%1+16], %3 ; top(bottom)_0
- movdqa [%1+%2], %3 ; top(bottom)_1
- movdqa [%1+%2+16], %3 ; top(bottom)_1
- lea %1, [%1+2*%2]
- movdqa [%1], %3 ; top(bottom)_2
- movdqa [%1+16], %3 ; top(bottom)_2
- movdqa [%1+%2], %3 ; top(bottom)_3
- movdqa [%1+%2+16], %3 ; top(bottom)_3
- lea %1, [%1+%2]
-%endmacro
-
-%macro exp_top_bottom_sse2 1 ; iPaddingSize [luma(32)/chroma(16)]
- ; ebx [width/16(8)]
- ; esi [pSrc+0], edi [pSrc-1], ecx [-stride], 32(16) ; top
- ; eax [pSrc+(h-1)*stride], ebp [pSrc+(h+31)*stride], 32(16) ; bottom
-
-%if %1 == 32 ; for luma
- sar ebx, 04h ; width / 16(8) pixels
-.top_bottom_loops:
- ; top
- movdqa xmm0, [esi] ; first line of picture pData
- mov_line_16x4_sse2 edi, ecx, xmm0, a ; dst, stride, xmm?
- mov_line_16x4_sse2 edi, ecx, xmm0, a
- mov_line_16x4_sse2 edi, ecx, xmm0, a
- mov_line_16x4_sse2 edi, ecx, xmm0, a
- mov_line_16x4_sse2 edi, ecx, xmm0, a ; dst, stride, xmm?
- mov_line_16x4_sse2 edi, ecx, xmm0, a
- mov_line_16x4_sse2 edi, ecx, xmm0, a
- mov_line_end16x4_sse2 edi, ecx, xmm0, a
-
- ; bottom
- movdqa xmm1, [eax] ; last line of picture pData
- mov_line_16x4_sse2 ebp, ecx, xmm1, a ; dst, stride, xmm?
- mov_line_16x4_sse2 ebp, ecx, xmm1, a
- mov_line_16x4_sse2 ebp, ecx, xmm1, a
- mov_line_16x4_sse2 ebp, ecx, xmm1, a
- mov_line_16x4_sse2 ebp, ecx, xmm1, a ; dst, stride, xmm?
- mov_line_16x4_sse2 ebp, ecx, xmm1, a
- mov_line_16x4_sse2 ebp, ecx, xmm1, a
- mov_line_end16x4_sse2 ebp, ecx, xmm1, a
-
- lea esi, [esi+16] ; top pSrc
- lea edi, [edi+16] ; top dst
- lea eax, [eax+16] ; bottom pSrc
- lea ebp, [ebp+16] ; bottom dst
- neg ecx ; positive/negative stride need for next loop?
-
- dec ebx
- jnz near .top_bottom_loops
-%elif %1 == 16 ; for chroma ??
- mov edx, ebx
- sar ebx, 04h ; (width / 16) pixels
-.top_bottom_loops:
- ; top
- movdqa xmm0, [esi] ; first line of picture pData
- mov_line_16x4_sse2 edi, ecx, xmm0, a ; dst, stride, xmm?
- mov_line_16x4_sse2 edi, ecx, xmm0, a
- mov_line_16x4_sse2 edi, ecx, xmm0, a
- mov_line_end16x4_sse2 edi, ecx, xmm0, a
-
- ; bottom
- movdqa xmm1, [eax] ; last line of picture pData
- mov_line_16x4_sse2 ebp, ecx, xmm1, a ; dst, stride, xmm?
- mov_line_16x4_sse2 ebp, ecx, xmm1, a
- mov_line_16x4_sse2 ebp, ecx, xmm1, a
- mov_line_end16x4_sse2 ebp, ecx, xmm1, a
-
- lea esi, [esi+16] ; top pSrc
- lea edi, [edi+16] ; top dst
- lea eax, [eax+16] ; bottom pSrc
- lea ebp, [ebp+16] ; bottom dst
- neg ecx ; positive/negative stride need for next loop?
-
- dec ebx
- jnz near .top_bottom_loops
-
- ; for remaining 8 bytes
- and edx, 0fh ; any 8 bytes left?
- test edx, edx
- jz near .to_be_continued ; no left to exit here
-
- ; top
- movq mm0, [esi] ; remained 8 byte
- mov_line_8x4_mmx edi, ecx, mm0 ; dst, stride, mm?
- mov_line_8x4_mmx edi, ecx, mm0 ; dst, stride, mm?
- mov_line_8x4_mmx edi, ecx, mm0 ; dst, stride, mm?
- mov_line_end8x4_mmx edi, ecx, mm0 ; dst, stride, mm?
- ; bottom
- movq mm1, [eax]
- mov_line_8x4_mmx ebp, ecx, mm1 ; dst, stride, mm?
- mov_line_8x4_mmx ebp, ecx, mm1 ; dst, stride, mm?
- mov_line_8x4_mmx ebp, ecx, mm1 ; dst, stride, mm?
- mov_line_end8x4_mmx ebp, ecx, mm1 ; dst, stride, mm?
- WELSEMMS
-
-.to_be_continued:
-%endif
-%endmacro
-
-%macro exp_left_right_sse2 2 ; iPaddingSize [luma(32)/chroma(16)], u/a
- ; ecx [height]
- ; esi [pSrc+0], edi [pSrc-32], edx [stride], 32(16) ; left
- ; ebx [pSrc+(w-1)], ebp [pSrc+w], 32(16) ; right
-; xor eax, eax ; for pixel pData (uint8_t) ; make sure eax=0 at least high 24 bits of eax = 0
-
-%if %1 == 32 ; for luma
-.left_right_loops:
- ; left
- mov al, byte [esi] ; pixel pData for left border
- butterfly_1to16_sse xmm0, xmm1, a ; dst, tmp, pSrc [generic register name: a/b/c/d]
- movdqa [edi], xmm0
- movdqa [edi+16], xmm0
-
- ; right
- mov al, byte [ebx]
- butterfly_1to16_sse xmm1, xmm2, a ; dst, tmp, pSrc [generic register name: a/b/c/d]
- movdqa [ebp], xmm1
- movdqa [ebp+16], xmm1
-
- lea esi, [esi+edx] ; left pSrc
- lea edi, [edi+edx] ; left dst
- lea ebx, [ebx+edx] ; right pSrc
- lea ebp, [ebp+edx] ; right dst
-
- dec ecx
- jnz near .left_right_loops
-%elif %1 == 16 ; for chroma ??
-.left_right_loops:
- ; left
- mov al, byte [esi] ; pixel pData for left border
- butterfly_1to16_sse xmm0, xmm1, a ; dst, tmp, pSrc [generic register name: a/b/c/d]
- movdqa [edi], xmm0
-
- ; right
- mov al, byte [ebx]
- butterfly_1to16_sse xmm1, xmm2, a ; dst, tmp, pSrc [generic register name: a/b/c/d]
- movdq%2 [ebp], xmm1 ; might not be aligned 16 bytes in case chroma planes
-
- lea esi, [esi+edx] ; left pSrc
- lea edi, [edi+edx] ; left dst
- lea ebx, [ebx+edx] ; right pSrc
- lea ebp, [ebp+edx] ; right dst
-
- dec ecx
- jnz near .left_right_loops
-%endif
-%endmacro
-
-%macro exp_cross_sse2 2 ; iPaddingSize [luma(32)/chroma(16)], u/a
- ; top-left: (x)mm3, top-right: (x)mm4, bottom-left: (x)mm5, bottom-right: (x)mm6
- ; edi: TL, ebp: TR, eax: BL, ebx: BR, ecx, -stride
-%if %1 == 32 ; luma
- ; TL
- mov_line_32x4_sse2 edi, ecx, xmm3 ; dst, stride, xmm?
- mov_line_32x4_sse2 edi, ecx, xmm3 ; dst, stride, xmm?
- mov_line_32x4_sse2 edi, ecx, xmm3 ; dst, stride, xmm?
- mov_line_32x4_sse2 edi, ecx, xmm3 ; dst, stride, xmm?
- mov_line_32x4_sse2 edi, ecx, xmm3 ; dst, stride, xmm?
- mov_line_32x4_sse2 edi, ecx, xmm3 ; dst, stride, xmm?
- mov_line_32x4_sse2 edi, ecx, xmm3 ; dst, stride, xmm?
- mov_line_end32x4_sse2 edi, ecx, xmm3 ; dst, stride, xmm?
-
- ; TR
- mov_line_32x4_sse2 ebp, ecx, xmm4 ; dst, stride, xmm?
- mov_line_32x4_sse2 ebp, ecx, xmm4 ; dst, stride, xmm?
- mov_line_32x4_sse2 ebp, ecx, xmm4 ; dst, stride, xmm?
- mov_line_32x4_sse2 ebp, ecx, xmm4 ; dst, stride, xmm?
- mov_line_32x4_sse2 ebp, ecx, xmm4 ; dst, stride, xmm?
- mov_line_32x4_sse2 ebp, ecx, xmm4 ; dst, stride, xmm?
- mov_line_32x4_sse2 ebp, ecx, xmm4 ; dst, stride, xmm?
- mov_line_end32x4_sse2 ebp, ecx, xmm4 ; dst, stride, xmm?
-
- ; BL
- mov_line_32x4_sse2 eax, ecx, xmm5 ; dst, stride, xmm?
- mov_line_32x4_sse2 eax, ecx, xmm5 ; dst, stride, xmm?
- mov_line_32x4_sse2 eax, ecx, xmm5 ; dst, stride, xmm?
- mov_line_32x4_sse2 eax, ecx, xmm5 ; dst, stride, xmm?
- mov_line_32x4_sse2 eax, ecx, xmm5 ; dst, stride, xmm?
- mov_line_32x4_sse2 eax, ecx, xmm5 ; dst, stride, xmm?
- mov_line_32x4_sse2 eax, ecx, xmm5 ; dst, stride, xmm?
- mov_line_end32x4_sse2 eax, ecx, xmm5 ; dst, stride, xmm?
-
- ; BR
- mov_line_32x4_sse2 ebx, ecx, xmm6 ; dst, stride, xmm?
- mov_line_32x4_sse2 ebx, ecx, xmm6 ; dst, stride, xmm?
- mov_line_32x4_sse2 ebx, ecx, xmm6 ; dst, stride, xmm?
- mov_line_32x4_sse2 ebx, ecx, xmm6 ; dst, stride, xmm?
- mov_line_32x4_sse2 ebx, ecx, xmm6 ; dst, stride, xmm?
- mov_line_32x4_sse2 ebx, ecx, xmm6 ; dst, stride, xmm?
- mov_line_32x4_sse2 ebx, ecx, xmm6 ; dst, stride, xmm?
- mov_line_end32x4_sse2 ebx, ecx, xmm6 ; dst, stride, xmm?
-%elif %1 == 16 ; chroma
- ; TL
- mov_line_16x4_sse2 edi, ecx, xmm3, a ; dst, stride, xmm?
- mov_line_16x4_sse2 edi, ecx, xmm3, a ; dst, stride, xmm?
- mov_line_16x4_sse2 edi, ecx, xmm3, a ; dst, stride, xmm?
- mov_line_end16x4_sse2 edi, ecx, xmm3, a ; dst, stride, xmm?
-
- ; TR
- mov_line_16x4_sse2 ebp, ecx, xmm4, %2 ; dst, stride, xmm?
- mov_line_16x4_sse2 ebp, ecx, xmm4, %2 ; dst, stride, xmm?
- mov_line_16x4_sse2 ebp, ecx, xmm4, %2 ; dst, stride, xmm?
- mov_line_end16x4_sse2 ebp, ecx, xmm4, %2 ; dst, stride, xmm?
-
- ; BL
- mov_line_16x4_sse2 eax, ecx, xmm5, a ; dst, stride, xmm?
- mov_line_16x4_sse2 eax, ecx, xmm5, a ; dst, stride, xmm?
- mov_line_16x4_sse2 eax, ecx, xmm5, a ; dst, stride, xmm?
- mov_line_end16x4_sse2 eax, ecx, xmm5, a ; dst, stride, xmm?
-
- ; BR
- mov_line_16x4_sse2 ebx, ecx, xmm6, %2 ; dst, stride, xmm?
- mov_line_16x4_sse2 ebx, ecx, xmm6, %2 ; dst, stride, xmm?
- mov_line_16x4_sse2 ebx, ecx, xmm6, %2 ; dst, stride, xmm?
- mov_line_end16x4_sse2 ebx, ecx, xmm6, %2 ; dst, stride, xmm?
-%endif
-%endmacro
-
-ALIGN 16
-;***********************************************************************----------------
-; void ExpandPictureLuma_sse2( uint8_t *pDst,
-; const int32_t kiStride,
-; const int32_t kiWidth,
-; const int32_t kiHeight );
-;***********************************************************************----------------
-ExpandPictureLuma_sse2:
- push ebx
- push edx
- push esi
- push edi
- push ebp
-
- ; for both top and bottom border
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- mov esi, [esp+24] ; pDst
- mov edx, [esp+28] ; kiStride
- mov ebx, [esp+32] ; kiWidth
- mov eax, [esp+36] ; kiHeight
- ; also prepare for cross border pData top-left: xmm3
-; xor ecx, ecx
- mov cl, byte [esi]
- butterfly_1to16_sse xmm3, xmm4, c ; pDst, tmp, pSrc [generic register name: a/b/c/d]
- ; load top border
- mov ecx, edx ; kiStride
- neg ecx ; -kiStride
- lea edi, [esi+ecx] ; last line of top border
- ; load bottom border
- dec eax ; h-1
- imul eax, edx ; (h-1)*kiStride
- lea eax, [esi+eax] ; last line of picture pData
- sal edx, 05h ; 32*kiStride
- lea ebp, [eax+edx] ; last line of bottom border, (h-1)*stride + 32 * stride
- ; also prepare for cross border pData: bottom-left with xmm5, bottom-right xmm6
- dec ebx ; kiWidth-1
- lea ebx, [eax+ebx] ; dst[w-1][h-1]
-; xor edx, edx
- mov dl, byte [eax] ; bottom-left
- butterfly_1to16_sse xmm5, xmm6, d ; dst, tmp, pSrc [generic register name: a/b/c/d]
- mov dl, byte [ebx] ; bottom-right
- butterfly_1to16_sse xmm6, xmm4, d ; dst, tmp, pSrc [generic register name: a/b/c/d]
- ; for top & bottom expanding
- mov ebx, [esp+32] ; kiWidth
- exp_top_bottom_sse2 32
-
- ; for both left and right border
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- mov esi, [esp+24] ; p_dst: left border pSrc
- mov edx, [esp+28] ; kiStride
- mov ebx, [esp+32] ; kiWidth
- mov ecx, [esp+36] ; kiHeight
- ; load left border
- mov eax, -32 ; luma=-32, chroma=-16
- lea edi, [esi+eax] ; left border dst
- dec ebx
- lea ebx, [esi+ebx] ; right border pSrc, (p_dst + width - 1)
- lea ebp, [ebx+1] ; right border dst
- ; prepare for cross border pData: top-right with xmm4
-; xor eax, eax
- mov al, byte [ebx] ; top-right
- butterfly_1to16_sse xmm4, xmm0, a ; pDst, tmp, pSrc [generic register name: a/b/c/d]
- ; for left & right border expanding
- exp_left_right_sse2 32, a
-
- ; for cross border [top-left, top-right, bottom-left, bottom-right]
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- mov esi, [esp+24] ; pDst
- mov ecx, [esp+28] ; kiStride
- mov ebx, [esp+32] ; kiWidth
- mov edx, [esp+36] ; kiHeight
- ; have done xmm3,..,xmm6 cross pData initialization above, perform pading as below, To be continued..
- mov eax, -32 ; luma=-32, chroma=-16
- neg ecx ; -stride
- lea edi, [esi+eax]
- lea edi, [edi+ecx] ; last line of top-left border
- lea ebp, [esi+ebx]
- lea ebp, [ebp+ecx] ; last line of top-right border
- add edx, 32 ; height+32(16), luma=32, chroma=16
- mov ecx, [esp+28] ; kiStride
- imul edx, ecx ; (height+32(16)) * stride
- lea eax, [edi+edx] ; last line of bottom-left border
- lea ebx, [ebp+edx] ; last line of bottom-right border
- neg ecx ; -kiStride
- ; for left & right border expanding
- exp_cross_sse2 32, a
-
-; sfence ; commit cache write back memory
-
- pop ebp
- pop edi
- pop esi
- pop edx
- pop ebx
-
- ret
-
-ALIGN 16
-;***********************************************************************----------------
-; void ExpandPictureChromaAlign_sse2( uint8_t *pDst,
-; const int32_t kiStride,
-; const int32_t kiWidth,
-; const int32_t kiHeight );
-;***********************************************************************----------------
-ExpandPictureChromaAlign_sse2:
- push ebx
- push edx
- push esi
- push edi
- push ebp
-
- ; for both top and bottom border
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- mov esi, [esp+24] ; pDst
- mov edx, [esp+28] ; kiStride
- mov ebx, [esp+32] ; kiWidth
- mov eax, [esp+36] ; kiHeight
- ; also prepare for cross border pData top-left: xmm3
-; xor ecx, ecx
- mov cl, byte [esi]
- butterfly_1to16_sse xmm3, xmm4, c ; pDst, tmp, pSrc [generic register name: a/b/c/d]
- ; load top border
- mov ecx, edx ; kiStride
- neg ecx ; -kiStride
- lea edi, [esi+ecx] ; last line of top border
- ; load bottom border
- dec eax ; h-1
- imul eax, edx ; (h-1)*kiStride
- lea eax, [esi+eax] ; last line of picture pData
- sal edx, 04h ; 16*kiStride
- lea ebp, [eax+edx] ; last line of bottom border, (h-1)*kiStride + 16 * kiStride
- ; also prepare for cross border pData: bottom-left with xmm5, bottom-right xmm6
- dec ebx ; kiWidth-1
- lea ebx, [eax+ebx] ; pDst[w-1][h-1]
-; xor edx, edx
- mov dl, byte [eax] ; bottom-left
- butterfly_1to16_sse xmm5, xmm6, d ; dst, tmp, pSrc [generic register name: a/b/c/d]
- mov dl, byte [ebx] ; bottom-right
- butterfly_1to16_sse xmm6, xmm4, d ; dst, tmp, pSrc [generic register name: a/b/c/d]
- ; for top & bottom expanding
- mov ebx, [esp+32] ; kiWidth
- exp_top_bottom_sse2 16
-
- ; for both left and right border
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- mov esi, [esp+24] ; pDst: left border pSrc
- mov edx, [esp+28] ; kiStride
- mov ebx, [esp+32] ; kiWidth
- mov ecx, [esp+36] ; kiHeight
- ; load left border
- mov eax, -16 ; luma=-32, chroma=-16
- lea edi, [esi+eax] ; left border dst
- dec ebx
- lea ebx, [esi+ebx] ; right border pSrc, (p_dst + width - 1)
- lea ebp, [ebx+1] ; right border dst
- ; prepare for cross border pData: top-right with xmm4
-; xor eax, eax
- mov al, byte [ebx] ; top-right
- butterfly_1to16_sse xmm4, xmm0, a ; pDst, tmp, pSrc [generic register name: a/b/c/d]
- ; for left & right border expanding
- exp_left_right_sse2 16, a
-
- ; for cross border [top-left, top-right, bottom-left, bottom-right]
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- mov esi, [esp+24] ; pDst
- mov ecx, [esp+28] ; kiStride
- mov ebx, [esp+32] ; kiWidth
- mov edx, [esp+36] ; kiHeight
- ; have done xmm3,..,xmm6 cross pData initialization above, perform pading as below, To be continued..
- mov eax, -16 ; chroma=-16
- neg ecx ; -stride
- lea edi, [esi+eax]
- lea edi, [edi+ecx] ; last line of top-left border
- lea ebp, [esi+ebx]
- lea ebp, [ebp+ecx] ; last line of top-right border
- mov ecx, [esp+28] ; kiStride
- add edx, 16 ; height+16, luma=32, chroma=16
- imul edx, ecx ; (kiHeight+16) * kiStride
- lea eax, [edi+edx] ; last line of bottom-left border
- lea ebx, [ebp+edx] ; last line of bottom-right border
- neg ecx ; -kiStride
- ; for left & right border expanding
- exp_cross_sse2 16, a
-
-; sfence ; commit cache write back memory
-
- pop ebp
- pop edi
- pop esi
- pop edx
- pop ebx
-
- ret
-
-ALIGN 16
-;***********************************************************************----------------
-; void ExpandPictureChromaUnalign_sse2( uint8_t *pDst,
-; const int32_t kiStride,
-; const int32_t kiWidth,
-; const int32_t kiHeight );
-;***********************************************************************----------------
-ExpandPictureChromaUnalign_sse2:
- push ebx
- push edx
- push esi
- push edi
- push ebp
-
- ; for both top and bottom border
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- mov esi, [esp+24] ; pDst
- mov edx, [esp+28] ; kiStride
- mov ebx, [esp+32] ; kiWidth
- mov eax, [esp+36] ; kiHeight
- ; also prepare for cross border pData top-left: xmm3
-; xor ecx, ecx
- mov cl, byte [esi]
- butterfly_1to16_sse xmm3, xmm4, c ; pDst, tmp, pSrc [generic register name: a/b/c/d]
- ; load top border
- mov ecx, edx ; kiStride
- neg ecx ; -kiStride
- lea edi, [esi+ecx] ; last line of top border
- ; load bottom border
- dec eax ; h-1
- imul eax, edx ; (h-1)*kiStride
- lea eax, [esi+eax] ; last line of picture pData
- sal edx, 04h ; 16*kiStride
- lea ebp, [eax+edx] ; last line of bottom border, (h-1)*kiStride + 16 * kiStride
- ; also prepare for cross border pData: bottom-left with xmm5, bottom-right xmm6
- dec ebx ; kiWidth-1
- lea ebx, [eax+ebx] ; dst[w-1][h-1]
-; xor edx, edx
- mov dl, byte [eax] ; bottom-left
- butterfly_1to16_sse xmm5, xmm6, d ; dst, tmp, pSrc [generic register name: a/b/c/d]
- mov dl, byte [ebx] ; bottom-right
- butterfly_1to16_sse xmm6, xmm4, d ; dst, tmp, pSrc [generic register name: a/b/c/d]
- ; for top & bottom expanding
- mov ebx, [esp+32] ; kiWidth
- exp_top_bottom_sse2 16
-
- ; for both left and right border
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- mov esi, [esp+24] ; p_dst: left border pSrc
- mov edx, [esp+28] ; kiStride
- mov ebx, [esp+32] ; kiWidth
- mov ecx, [esp+36] ; kiHeight
- ; load left border
- mov eax, -16 ; luma=-32, chroma=-16
- lea edi, [esi+eax] ; left border dst
- dec ebx
- lea ebx, [esi+ebx] ; right border pSrc, (p_dst + width - 1)
- lea ebp, [ebx+1] ; right border dst
- ; prepare for cross border pData: top-right with xmm4
-; xor eax, eax
- mov al, byte [ebx] ; top-right
- butterfly_1to16_sse xmm4, xmm0, a ; dst, tmp, pSrc [generic register name: a/b/c/d]
- ; for left & right border expanding
- exp_left_right_sse2 16, u
-
- ; for cross border [top-left, top-right, bottom-left, bottom-right]
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- mov esi, [esp+24] ; p_dst
- mov ecx, [esp+28] ; kiStride
- mov ebx, [esp+32] ; kiWidth
- mov edx, [esp+36] ; kiHeight
- ; have done xmm3,..,xmm6 cross pData initialization above, perform pading as below, To be continued..
- neg ecx ; -kiStride
- mov eax, -16 ; chroma=-16
- lea edi, [esi+eax]
- lea edi, [edi+ecx] ; last line of top-left border
- lea ebp, [esi+ebx]
- lea ebp, [ebp+ecx] ; last line of top-right border
- mov ecx, [esp+28] ; kiStride
- add edx, 16 ; kiHeight+16, luma=32, chroma=16
- imul edx, ecx ; (kiHeight+16) * kiStride
- lea eax, [edi+edx] ; last line of bottom-left border
- lea ebx, [ebp+edx] ; last line of bottom-right border
- neg ecx ; -kiStride
- ; for left & right border expanding
- exp_cross_sse2 16, u
-
-; sfence ; commit cache write back memory
-
- pop ebp
- pop edi
- pop esi
- pop edx
- pop ebx
-
- ret
-
--- a/codec/decoder/core/asm/intra_pred.asm
+++ b/codec/decoder/core/asm/intra_pred.asm
@@ -45,7 +45,6 @@
;*************************************************************************/
%include "asm_inc.asm"
-BITS 32
;*******************************************************************************
; Local Data (Read Only)
;*******************************************************************************
@@ -166,11 +165,11 @@
%endmacro
%macro LOAD_2_LEFT_AND_ADD 0
- lea eax, [eax+2*ecx]
- movzx edx, byte [eax-0x01]
- add ebx, edx
- movzx edx, byte [eax+ecx-0x01]
- add ebx, edx
+ lea r0, [r0+2*r1] ; advance the row pointer r0 by two strides (r1)
+ movzx r3, byte [r0-0x01] ; r3 (scratch) = byte immediately left of the row at r0
+ add r2, r3 ; fold it into the running sum kept in r2
+ movzx r3, byte [r0+r1-0x01] ; same column, one row further down
+ add r2, r3 ; r2 now includes both left-neighbour bytes
%endmacro
;*******************************************************************************
@@ -190,32 +189,37 @@
; pPred must align to 16
;*******************************************************************************
WelsI4x4LumaPredH_sse2:
- mov eax, [esp+4] ;pPred
- mov ecx, [esp+8] ;kiStride
+ %assign push_num 0
+ LOAD_2_PARA ; macro defined outside this hunk; presumably loads r0 = pPred, r1 = kiStride using push_num (0: nothing pushed) -- confirm
+ %ifndef X86_32
+ movsx r1, r1d ; sign-extend the 32-bit stride in r1d to the full-width r1 on non-X86_32 builds
+ %endif
+ ;mov eax, [esp+4] ;pPred (pre-port stack load, kept for reference; r0 takes the place of eax)
+ ;mov ecx, [esp+8] ;kiStride (pre-port stack load, kept for reference; r1 takes the place of ecx)
- movzx edx, byte [eax-1]
- movd xmm0, edx
+ movzx r2, byte [r0-1] ; byte to the left of row 0
+ movd xmm0, r2d ; xmm0 low dword = that byte, multiplied by [mmx_01bytes] on the next line
pmuludq xmm0, [mmx_01bytes]
- movzx edx, byte [eax+ecx-1]
- movd xmm1, edx
+ movzx r2, byte [r0+r1-1] ; byte to the left of row 1
+ movd xmm1, r2d
pmuludq xmm1, [mmx_01bytes]
- lea eax, [eax+ecx]
- movzx edx, byte [eax+ecx-1]
- movd xmm2, edx
+ lea r0, [r0+r1] ; r0 -> row 1
+ movzx r2, byte [r0+r1-1] ; byte to the left of row 2
+ movd xmm2, r2d
pmuludq xmm2, [mmx_01bytes]
- movzx edx, byte [eax+2*ecx-1]
- movd xmm3, edx
+ movzx r2, byte [r0+2*r1-1] ; byte to the left of row 3
+ movd xmm3, r2d
pmuludq xmm3, [mmx_01bytes]
- sub eax, ecx
- movd [eax], xmm0
- movd [eax+ecx], xmm1
- lea eax, [eax+2*ecx]
- movd [eax], xmm2
- movd [eax+ecx], xmm3
+ sub r0, r1 ; r0 back to row 0
+ movd [r0], xmm0 ; write 4 bytes of row 0
+ movd [r0+r1], xmm1 ; write 4 bytes of row 1
+ lea r0, [r0+2*r1] ; r0 -> row 2
+ movd [r0], xmm2 ; write 4 bytes of row 2
+ movd [r0+r1], xmm3 ; write 4 bytes of row 3
ret
@@ -223,20 +227,28 @@
; void_t WelsI16x16LumaPredPlane_sse2(uint8_t *pPred, const int32_t kiStride);
;*******************************************************************************
WelsI16x16LumaPredPlane_sse2:
-%define pushsize 4
- push esi
- mov esi, [esp + pushsize + 4]
- mov ecx, [esp + pushsize + 8]
- sub esi, 1
- sub esi, ecx
+ ;%define pushsize 4
+ push r3
+ push r4
+ %assign push_num 2
+ LOAD_2_PARA
+ %ifndef X86_32
+ movsx r1, r1d
+ %endif
+ mov r4, r0 ; save r0 in r4
+ ;push esi
+ ;mov esi, [esp + pushsize + 4]
+ ;mov ecx, [esp + pushsize + 8]
+ sub r0, 1
+ sub r0, r1
;for H
pxor xmm7, xmm7
- movq xmm0, [esi]
+ movq xmm0, [r0]
movdqa xmm5, [sse2_plane_dec]
punpcklbw xmm0, xmm7
pmullw xmm0, xmm5
- movq xmm1, [esi + 9]
+ movq xmm1, [r0 + 9]
movdqa xmm6, [sse2_plane_inc]
punpcklbw xmm1, xmm7
pmullw xmm1, xmm6
@@ -243,25 +255,25 @@
psubw xmm1, xmm0
SUMW_HORIZON xmm1,xmm0,xmm2
- movd eax, xmm1 ; H += (i + 1) * (top[8 + i] - top[6 - i]);
- movsx eax, ax
- imul eax, 5
- add eax, 32
- sar eax, 6 ; b = (5 * H + 32) >> 6;
- SSE2_Copy8Times xmm1, eax ; xmm1 = b,b,b,b,b,b,b,b
+ movd r2d, xmm1 ; H += (i + 1) * (top[8 + i] - top[6 - i]);
+ movsx r2, r2w
+ imul r2, 5
+ add r2, 32
+ sar r2, 6 ; b = (5 * H + 32) >> 6;
+ SSE2_Copy8Times xmm1, r2d ; xmm1 = b,b,b,b,b,b,b,b
- movzx edx, BYTE [esi+16]
- sub esi, 3
- LOAD_COLUMN xmm0, xmm2, xmm3, xmm4, esi, ecx
+ movzx r3, BYTE [r0+16]
+ sub r0, 3
+ LOAD_COLUMN xmm0, xmm2, xmm3, xmm4, r0, r1
- add esi, 3
- movzx eax, BYTE [esi+8*ecx]
- add edx, eax
- shl edx, 4 ; a = (left[15*kiStride] + top[15]) << 4;
+ add r0, 3
+ movzx r2, BYTE [r0+8*r1]
+ add r3, r2
+ shl r3, 4 ; a = (left[15*kiStride] + top[15]) << 4;
- sub esi, 3
- add esi, ecx
- LOAD_COLUMN xmm7, xmm2, xmm3, xmm4, esi, ecx
+ sub r0, 3
+ add r0, r1
+ LOAD_COLUMN xmm7, xmm2, xmm3, xmm4, r0, r1
pxor xmm4, xmm4
punpckhbw xmm0, xmm4
pmullw xmm0, xmm5
@@ -270,21 +282,22 @@
psubw xmm7, xmm0
SUMW_HORIZON xmm7,xmm0,xmm2
- movd eax, xmm7 ; V
- movsx eax, ax
+ movd r2d, xmm7 ; V
+ movsx r2, r2w
- imul eax, 5
- add eax, 32
- sar eax, 6 ; c = (5 * V + 32) >> 6;
- SSE2_Copy8Times xmm4, eax ; xmm4 = c,c,c,c,c,c,c,c
+ imul r2, 5
+ add r2, 32
+ sar r2, 6 ; c = (5 * V + 32) >> 6;
+ SSE2_Copy8Times xmm4, r2d ; xmm4 = c,c,c,c,c,c,c,c
- mov esi, [esp + pushsize + 4]
- add edx, 16
- imul eax, -7
- add edx, eax ; s = a + 16 + (-7)*c
- SSE2_Copy8Times xmm0, edx ; xmm0 = s,s,s,s,s,s,s,s
+ ;mov esi, [esp + pushsize + 4]
+ mov r0, r4
+ add r3, 16
+ imul r2, -7
+ add r3, r2 ; s = a + 16 + (-7)*c
+ SSE2_Copy8Times xmm0, r3d ; xmm0 = s,s,s,s,s,s,s,s
- xor eax, eax
+ xor r2, r2
movdqa xmm5, [sse2_plane_inc_minus]
get_i16x16_luma_pred_plane_sse2_1:
@@ -297,14 +310,16 @@
paddw xmm3, xmm0
psraw xmm3, 5
packuswb xmm2, xmm3
- movdqa [esi], xmm2
+ movdqa [r0], xmm2
paddw xmm0, xmm4
- add esi, ecx
- inc eax
- cmp eax, 16
+ add r0, r1
+ inc r2
+ cmp r2, 16
jnz get_i16x16_luma_pred_plane_sse2_1
- pop esi
+ ;pop esi
+ pop r4
+ pop r3
ret
@@ -313,32 +328,37 @@
; void_t WelsI16x16LumaPredH_sse2(uint8_t *pPred, const int32_t kiStride);
;*******************************************************************************
-%macro SSE2_PRED_H_16X16_TWO_LINE_DEC 0
- lea eax, [eax+ecx*2]
+%macro SSE2_PRED_H_16X16_TWO_LINE_DEC 2 ; %1 = row pointer register, %2 = stride register (previously hard-coded eax/ecx)
+ lea %1, [%1+%2*2] ; advance the row pointer by two strides
- COPY_16_TIMES eax, xmm0
- movdqa [eax], xmm0
- COPY_16_TIMESS eax, xmm0, ecx
- movdqa [eax+ecx], xmm0
+ COPY_16_TIMES %1, xmm0 ; macro defined outside this hunk; presumably fills xmm0 with the left-neighbour byte of the row at %1 -- confirm
+ movdqa [%1], xmm0 ; aligned 16-byte store to the row at %1
+ COPY_16_TIMESS %1, xmm0, %2 ; macro defined outside this hunk; presumably the same for the row at %1+%2 -- confirm
+ movdqa [%1+%2], xmm0 ; aligned 16-byte store to the row at %1+%2
%endmacro
WELS_EXTERN WelsI16x16LumaPredH_sse2
WelsI16x16LumaPredH_sse2:
- mov eax, [esp+4] ; pPred
- mov ecx, [esp+8] ; kiStride
+ %assign push_num 0
+ LOAD_2_PARA ; macro defined outside this hunk; presumably loads r0 = pPred, r1 = kiStride using push_num (0: nothing pushed) -- confirm
+ %ifndef X86_32
+ movsx r1, r1d ; sign-extend the 32-bit stride in r1d to the full-width r1 on non-X86_32 builds
+ %endif
+ ;mov eax, [esp+4] ; pPred (pre-port stack load, kept for reference; r0 takes the place of eax)
+ ;mov ecx, [esp+8] ; kiStride (pre-port stack load, kept for reference; r1 takes the place of ecx)
- COPY_16_TIMES eax, xmm0
- movdqa [eax], xmm0
- COPY_16_TIMESS eax, xmm0, ecx
- movdqa [eax+ecx], xmm0
+ COPY_16_TIMES r0, xmm0 ; macro defined outside this hunk; presumably fills xmm0 from the left neighbour of row 0 -- confirm
+ movdqa [r0], xmm0 ; aligned 16-byte store, row 0
+ COPY_16_TIMESS r0, xmm0, r1 ; macro defined outside this hunk; presumably the same for row 1 -- confirm
+ movdqa [r0+r1], xmm0 ; aligned 16-byte store, row 1
- SSE2_PRED_H_16X16_TWO_LINE_DEC
- SSE2_PRED_H_16X16_TWO_LINE_DEC
- SSE2_PRED_H_16X16_TWO_LINE_DEC
- SSE2_PRED_H_16X16_TWO_LINE_DEC
- SSE2_PRED_H_16X16_TWO_LINE_DEC
- SSE2_PRED_H_16X16_TWO_LINE_DEC
- SSE2_PRED_H_16X16_TWO_LINE_DEC
+ SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1 ; rows 2-3 (each call steps r0 down two rows, then stores two rows)
+ SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1 ; rows 4-5
+ SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1 ; rows 6-7
+ SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1 ; rows 8-9
+ SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1 ; rows 10-11
+ SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1 ; rows 12-13
+ SSE2_PRED_H_16X16_TWO_LINE_DEC r0, r1 ; rows 14-15
ret
@@ -347,36 +367,41 @@
;*******************************************************************************
WELS_EXTERN WelsI16x16LumaPredV_sse2
WelsI16x16LumaPredV_sse2:
- mov edx, [esp+4] ; pPred
- mov ecx, [esp+8] ; kiStride
+ %assign push_num 0
+ LOAD_2_PARA ; macro defined outside this hunk; presumably loads r0 = pPred, r1 = kiStride using push_num (0: nothing pushed) -- confirm
+ %ifndef X86_32
+ movsx r1, r1d ; sign-extend the 32-bit stride in r1d to the full-width r1 on non-X86_32 builds
+ %endif
+ ;mov edx, [esp+4] ; pPred (pre-port stack load, kept for reference; r0 takes the place of edx)
+ ;mov ecx, [esp+8] ; kiStride (pre-port stack load, kept for reference; r1 takes the place of ecx)
- sub edx, ecx
- movdqa xmm0, [edx]
+ sub r0, r1 ; r0 -> the row directly above the block
+ movdqa xmm0, [r0] ; xmm0 = the 16 bytes of that row (aligned load); copied to every row below
- movdqa [edx+ecx], xmm0
- lea edx, [edx+2*ecx]
- movdqa [edx], xmm0
- movdqa [edx+ecx], xmm0
- lea edx, [edx+2*ecx]
- movdqa [edx], xmm0
- movdqa [edx+ecx], xmm0
- lea edx, [edx+2*ecx]
- movdqa [edx], xmm0
- movdqa [edx+ecx], xmm0
- lea edx, [edx+2*ecx]
- movdqa [edx], xmm0
- movdqa [edx+ecx], xmm0
- lea edx, [edx+2*ecx]
- movdqa [edx], xmm0
- movdqa [edx+ecx], xmm0
- lea edx, [edx+2*ecx]
- movdqa [edx], xmm0
- movdqa [edx+ecx], xmm0
- lea edx, [edx+2*ecx]
- movdqa [edx], xmm0
- movdqa [edx+ecx], xmm0
- lea edx, [edx+2*ecx]
- movdqa [edx], xmm0
+ movdqa [r0+r1], xmm0 ; row 0
+ lea r0, [r0+2*r1] ; r0 -> row 1
+ movdqa [r0], xmm0 ; row 1
+ movdqa [r0+r1], xmm0 ; row 2
+ lea r0, [r0+2*r1] ; r0 -> row 3
+ movdqa [r0], xmm0 ; row 3
+ movdqa [r0+r1], xmm0 ; row 4
+ lea r0, [r0+2*r1] ; r0 -> row 5
+ movdqa [r0], xmm0 ; row 5
+ movdqa [r0+r1], xmm0 ; row 6
+ lea r0, [r0+2*r1] ; r0 -> row 7
+ movdqa [r0], xmm0 ; row 7
+ movdqa [r0+r1], xmm0 ; row 8
+ lea r0, [r0+2*r1] ; r0 -> row 9
+ movdqa [r0], xmm0 ; row 9
+ movdqa [r0+r1], xmm0 ; row 10
+ lea r0, [r0+2*r1] ; r0 -> row 11
+ movdqa [r0], xmm0 ; row 11
+ movdqa [r0+r1], xmm0 ; row 12
+ lea r0, [r0+2*r1] ; r0 -> row 13
+ movdqa [r0], xmm0 ; row 13
+ movdqa [r0+r1], xmm0 ; row 14
+ lea r0, [r0+2*r1] ; r0 -> row 15
+ movdqa [r0], xmm0 ; row 15 (last of the 16 rows)
ret
@@ -385,19 +410,27 @@
;*******************************************************************************
WELS_EXTERN WelsIChromaPredPlane_sse2
WelsIChromaPredPlane_sse2:
-%define pushsize 4
- push esi
- mov esi, [esp + pushsize + 4] ;pPred
- mov ecx, [esp + pushsize + 8] ;kiStride
- sub esi, 1
- sub esi, ecx
+ ;%define pushsize 4
+ push r3
+ push r4
+ %assign push_num 2
+ LOAD_2_PARA
+ %ifndef X86_32
+ movsx r1, r1d
+ %endif
+ mov r4, r0
+ ;push esi
+ ;mov esi, [esp + pushsize + 4] ;pPred
+ ;mov ecx, [esp + pushsize + 8] ;kiStride
+ sub r0, 1
+ sub r0, r1
pxor mm7, mm7
- movq mm0, [esi]
+ movq mm0, [r0]
movq mm5, [sse2_plane_dec_c]
punpcklbw mm0, mm7
pmullw mm0, mm5
- movq mm1, [esi + 5]
+ movq mm1, [r0 + 5]
movq mm6, [sse2_plane_inc_c]
punpcklbw mm1, mm7
pmullw mm1, mm6
@@ -406,25 +439,25 @@
movq2dq xmm1, mm1
pxor xmm2, xmm2
SUMW_HORIZON xmm1,xmm0,xmm2
- movd eax, xmm1
- movsx eax, ax
- imul eax, 17
- add eax, 16
- sar eax, 5 ; b = (17 * H + 16) >> 5;
- SSE2_Copy8Times xmm1, eax ; mm1 = b,b,b,b,b,b,b,b
+ movd r2d, xmm1
+ movsx r2, r2w
+ imul r2, 17
+ add r2, 16
+ sar r2, 5 ; b = (17 * H + 16) >> 5;
+ SSE2_Copy8Times xmm1, r2d ; xmm1 = b,b,b,b,b,b,b,b
- movzx edx, BYTE [esi+8]
- sub esi, 3
- LOAD_COLUMN_C mm0, mm2, mm3, mm4, esi, ecx
+ movzx r3, BYTE [r0+8]
+ sub r0, 3
+ LOAD_COLUMN_C mm0, mm2, mm3, mm4, r0, r1
- add esi, 3
- movzx eax, BYTE [esi+4*ecx]
- add edx, eax
- shl edx, 4 ; a = (left[7*kiStride] + top[7]) << 4;
+ add r0, 3
+ movzx r2, BYTE [r0+4*r1]
+ add r3, r2
+ shl r3, 4 ; a = (left[7*kiStride] + top[7]) << 4;
- sub esi, 3
- add esi, ecx
- LOAD_COLUMN_C mm7, mm2, mm3, mm4, esi, ecx
+ sub r0, 3
+ add r0, r1
+ LOAD_COLUMN_C mm7, mm2, mm3, mm4, r0, r1
pxor mm4, mm4
punpckhbw mm0, mm4
pmullw mm0, mm5
@@ -435,21 +468,22 @@
movq2dq xmm7, mm7
pxor xmm2, xmm2
SUMW_HORIZON xmm7,xmm0,xmm2
- movd eax, xmm7 ; V
- movsx eax, ax
+ movd r2d, xmm7 ; V
+ movsx r2, r2w
- imul eax, 17
- add eax, 16
- sar eax, 5 ; c = (17 * V + 16) >> 5;
- SSE2_Copy8Times xmm4, eax ; mm4 = c,c,c,c,c,c,c,c
+ imul r2, 17
+ add r2, 16
+ sar r2, 5 ; c = (17 * V + 16) >> 5;
+ SSE2_Copy8Times xmm4, r2d ; xmm4 = c,c,c,c,c,c,c,c
- mov esi, [esp + pushsize + 4]
- add edx, 16
- imul eax, -3
- add edx, eax ; s = a + 16 + (-3)*c
- SSE2_Copy8Times xmm0, edx ; xmm0 = s,s,s,s,s,s,s,s
+ ;mov esi, [esp + pushsize + 4]
+ mov r0, r4
+ add r3, 16
+ imul r2, -3
+ add r3, r2 ; s = a + 16 + (-3)*c
+ SSE2_Copy8Times xmm0, r3d ; xmm0 = s,s,s,s,s,s,s,s
- xor eax, eax
+ xor r2, r2
movdqa xmm5, [sse2_plane_mul_b_c]
get_i_chroma_pred_plane_sse2_1:
@@ -458,14 +492,16 @@
paddw xmm2, xmm0
psraw xmm2, 5
packuswb xmm2, xmm2
- movq [esi], xmm2
+ movq [r0], xmm2
paddw xmm0, xmm4
- add esi, ecx
- inc eax
- cmp eax, 8
+ add r0, r1
+ inc r2
+ cmp r2, 8
jnz get_i_chroma_pred_plane_sse2_1
- pop esi
+ ;pop esi
+ pop r4
+ pop r3
WELSEMMS
ret
@@ -483,27 +519,33 @@
;
;*******************************************************************************
WelsI4x4LumaPredDDR_mmx:
- mov edx,[esp+4] ;pPred
- mov eax,edx
- mov ecx,[esp+8] ;kiStride
+ %assign push_num 0
+ LOAD_2_PARA
+ %ifndef X86_32
+ movsx r1, r1d
+ %endif
+ mov r2, r0
+ ;mov edx,[esp+4] ;pPred
+ ;mov eax,edx
+ ;mov ecx,[esp+8] ;kiStride
- movq mm1,[eax+ecx-8] ;get value of 11,decreasing 8 is trying to improve the performance of movq mm1[8] = 11
- movq mm2,[eax-8] ;get value of 6 mm2[8] = 6
- sub eax, ecx ;mov eax to above line of current block(postion of 1)
- punpckhbw mm2,[eax-8] ;mm2[8](high 8th byte of mm2) = [0](value of 0), mm2[7]= [6]
- movd mm3,[eax] ;get value 1, mm3[1] = [1],mm3[2]=[2],mm3[3]=[3]
+ movq mm1,[r2+r1-8] ;get value of 11,decreasing 8 is trying to improve the performance of movq mm1[8] = 11
+ movq mm2,[r2-8] ;get value of 6 mm2[8] = 6
+ sub r2, r1 ;mov eax to above line of current block(postion of 1)
+ punpckhbw mm2,[r2-8] ;mm2[8](high 8th byte of mm2) = [0](value of 0), mm2[7]= [6]
+ movd mm3,[r2] ;get value 1, mm3[1] = [1],mm3[2]=[2],mm3[3]=[3]
punpckhwd mm1,mm2 ;mm1[8]=[0],mm1[7]=[6],mm1[6]=[11]
psllq mm3,18h ;mm3[5]=[1]
psrlq mm1,28h ;mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
por mm3,mm1 ;mm3[6]=[3],mm3[5]=[2],mm3[4]=[1],mm3[3]=[0],mm3[2]=[6],mm3[1]=[11]
movq mm1,mm3 ;mm1[6]=[3],mm1[5]=[2],mm1[4]=[1],mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
- lea eax,[eax+ecx*2-8h] ;set eax point to 12
- movq mm4,[eax+ecx] ;get value of 16, mm4[8]=[16]
+ lea r2,[r2+r1*2-8h] ;set eax point to 12
+ movq mm4,[r2+r1] ;get value of 16, mm4[8]=[16]
psllq mm3,8 ;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=0
psrlq mm4,38h ;mm4[1]=[16]
por mm3,mm4 ;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=[16]
movq mm2,mm3 ;mm2[7]=[3],mm2[6]=[2],mm2[5]=[1],mm2[4]=[0],mm2[3]=[6],mm2[2]=[11],mm2[1]=[16]
- movq mm4,[eax+ecx*2] ;mm4[8]=[21]
+ movq mm4,[r2+r1*2] ;mm4[8]=[21]
psllq mm3,8 ;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=0
psrlq mm4,38h ;mm4[1]=[21]
por mm3,mm4 ;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=[21]
@@ -514,15 +556,15 @@
psubusb mm3,mm1 ;decrease 1 from odd bytes
pavgb mm2,mm3 ;mm2=(([11]+[21]+1)/2+1+[16])/2
- lea edx,[edx+ecx]
- movd [edx+2*ecx],mm2
- sub edx,ecx
+ lea r0,[r0+r1]
+ movd [r0+2*r1],mm2
+ sub r0,r1
psrlq mm2,8
- movd [edx+2*ecx],mm2
+ movd [r0+2*r1],mm2
psrlq mm2,8
- movd [edx+ecx],mm2
+ movd [r0+r1],mm2
psrlq mm2,8
- movd [edx],mm2
+ movd [r0],mm2
WELSEMMS
ret
@@ -540,41 +582,52 @@
;
;*******************************************************************************
WelsI4x4LumaPredDc_sse2:
- mov eax,[esp+4] ;pPred
- mov ecx,[esp+8] ;kiStride
- push ebx
+ push r3 ; r3 accumulates the sum of the eight neighbours
+ push r4 ; r4 keeps the original pPred for the store phase
+ %assign push_num 2
+ LOAD_2_PARA
+ %ifndef X86_32
+ movsxd r1, r1d
+ %endif
+ mov r4, r0
+ ; LOAD_2_PARA is defined outside this view; it is expected to leave pPred
+ ; in r0 and kiStride in r1 (push_num = number of pushes done above).
+ ; 64-bit builds sign-extend the 32-bit kiStride before it is used in addresses.
- movzx edx, byte [eax-1h]
+ movzx r2, byte [r0-1h] ; left neighbour of row 0
- sub eax, ecx
- movd xmm0, [eax]
+ sub r0, r1 ; r0 -> row above the block
+ movd xmm0, [r0] ; four top neighbours
pxor xmm1, xmm1
psadbw xmm0, xmm1
- movd ebx, xmm0
- add ebx, edx
+ movd r3d, xmm0 ; r3 = sum of the four top pixels
+ add r3, r2
- movzx edx, byte [eax+ecx*2-1h]
- add ebx, edx
+ movzx r2, byte [r0+r1*2-1h] ; left neighbour of row 1
+ add r3, r2
- lea eax, [eax+ecx*2-1]
- movzx edx, byte [eax+ecx]
- add ebx, edx
+ lea r0, [r0+r1*2-1] ; r0 -> left neighbour of row 1
+ movzx r2, byte [r0+r1] ; left neighbour of row 2
+ add r3, r2
- movzx edx, byte [eax+ecx*2]
- add ebx, edx
- add ebx, 4
- sar ebx, 3
- imul ebx, 0x01010101
+ movzx r2, byte [r0+r1*2] ; left neighbour of row 3
+ add r3, r2
+ add r3, 4 ; rounding term
+ sar r3, 3 ; dc = (sum of 8 neighbours + 4) >> 3
+ imul r3, 0x01010101 ; replicate dc into all four bytes of r3d
- mov edx, [esp+8] ;pPred
- mov [edx], ebx
- mov [edx+ecx], ebx
- mov [edx+2*ecx], ebx
- lea edx, [edx+2*ecx]
- mov [edx+ecx], ebx
+ ; store phase: write the replicated dc to each of the four 4-byte rows
+ mov r0, r4
+ mov [r0], r3d
+ mov [r0+r1], r3d
+ mov [r0+2*r1], r3d
+ lea r0, [r0+2*r1]
+ mov [r0+r1], r3d
- pop ebx
+ ; restore the scratch registers saved on entry (reverse push order)
+ pop r4
+ pop r3
ret
ALIGN 16
@@ -592,7 +645,7 @@
%endmacro
%macro MMX_PRED_H_8X8_ONE_LINEE 4
- movq %1, [%3+ecx-8]
+ movq %1, [%3+r1-8]
psrlq %1, 38h
pmullw %1, [mmx_01bytes]
@@ -602,60 +655,47 @@
WELS_EXTERN WelsIChromaPredH_mmx
WelsIChromaPredH_mmx:
- mov edx, [esp+4] ;pPred
- mov eax, edx
- mov ecx, [esp+8] ;kiStride
+ %assign push_num 0
+ LOAD_2_PARA
+ %ifndef X86_32
+ movsxd r1, r1d
+ %endif
+ mov r2, r0 ; r2 = load cursor (left-neighbour column), r0 = store cursor
+ ; LOAD_2_PARA is defined outside this view; it is expected to leave pPred
+ ; in r0 and kiStride in r1. Row 0 is handled inline below; rows 1-7 go
+ ; through the MMX_PRED_H_8X8_ONE_LINE / _ONE_LINEE macros.
- movq mm0, [eax-8]
+ movq mm0, [r2-8] ; top byte of this qword is the pixel left of row 0
psrlq mm0, 38h
pmullw mm0, [mmx_01bytes]
pshufw mm0, mm0, 0
- movq [edx], mm0
+ movq [r0], mm0
- MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, eax, edx+ecx
+ MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1
- lea eax, [eax+ecx*2]
- MMX_PRED_H_8X8_ONE_LINE mm0, mm1, eax, edx+2*ecx
+ lea r2, [r2+r1*2]
+ MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r2, r0+2*r1
- lea edx, [edx+2*ecx]
- MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, eax, edx+ecx
+ lea r0, [r0+2*r1]
+ MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1
- lea eax, [eax+ecx*2]
- MMX_PRED_H_8X8_ONE_LINE mm0, mm1, eax, edx+2*ecx
+ lea r2, [r2+r1*2]
+ MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r2, r0+2*r1
- lea edx, [edx+2*ecx]
- MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, eax, edx+ecx
+ lea r0, [r0+2*r1]
+ MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1
- lea eax, [eax+ecx*2]
- MMX_PRED_H_8X8_ONE_LINE mm0, mm1, eax, edx+2*ecx
+ lea r2, [r2+r1*2]
+ MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r2, r0+2*r1
- lea edx, [edx+2*ecx]
- MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, eax, edx+ecx
+ lea r0, [r0+2*r1]
+ MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r2, r0+r1
WELSEMMS
ret
-ALIGN 16
-;*******************************************************************************
-; void_t __cdecl get_i4x4_luma_pred_v_asm(uint8_t *pPred, const int32_t kiStride)
-; copy pixels from top 4 pixels
-;*******************************************************************************
-WELS_EXTERN get_i4x4_luma_pred_v_asm
-get_i4x4_luma_pred_v_asm:
- mov eax, [esp+4] ;pPred
- mov ecx, [esp+8] ;kiStride
- sub eax, ecx
- mov edx, [eax]
- mov [eax+ecx], edx
- mov [eax+2*ecx], edx
- lea eax, [eax+2*ecx]
- mov [eax+ecx], edx
- mov [eax+2*ecx], edx
-
- ret
-
ALIGN 16
;*******************************************************************************
; void_t __cdecl WelsIChromaPredV_mmx(uint8_t *pPred, const int32_t kiStride)
@@ -663,23 +703,28 @@
;*******************************************************************************
WELS_EXTERN WelsIChromaPredV_mmx
WelsIChromaPredV_mmx:
- mov eax, [esp+4] ;pPred
- mov ecx, [esp+8] ;kiStride
+ %assign push_num 0
+ LOAD_2_PARA
+ %ifndef X86_32
+ movsxd r1, r1d
+ %endif
+ ; LOAD_2_PARA is defined outside this view; it is expected to leave pPred
+ ; in r0 and kiStride in r1. The 8 bytes above the block are copied to 8 rows.
- sub eax, ecx
- movq mm0, [eax]
+ sub r0, r1 ; r0 -> row above the block
+ movq mm0, [r0] ; mm0 = the eight top neighbours
- movq [eax+ecx], mm0
- movq [eax+2*ecx], mm0
- lea eax, [eax+2*ecx]
- movq [eax+ecx], mm0
- movq [eax+2*ecx], mm0
- lea eax, [eax+2*ecx]
- movq [eax+ecx], mm0
- movq [eax+2*ecx], mm0
- lea eax, [eax+2*ecx]
- movq [eax+ecx], mm0
- movq [eax+2*ecx], mm0
+ movq [r0+r1], mm0 ; row 0
+ movq [r0+2*r1], mm0 ; row 1
+ lea r0, [r0+2*r1]
+ movq [r0+r1], mm0 ; row 2
+ movq [r0+2*r1], mm0 ; row 3
+ lea r0, [r0+2*r1]
+ movq [r0+r1], mm0 ; row 4
+ movq [r0+2*r1], mm0 ; row 5
+ lea r0, [r0+2*r1]
+ movq [r0+r1], mm0 ; row 6
+ movq [r0+2*r1], mm0 ; row 7
WELSEMMS
ret
@@ -717,18 +762,24 @@
;*******************************************************************************
WELS_EXTERN WelsI4x4LumaPredHD_mmx
WelsI4x4LumaPredHD_mmx:
- mov edx, [esp+4] ; pPred
- mov eax, edx
- mov ecx, [esp+8] ; kiStride
- sub eax, ecx
- movd mm0, [eax-1] ; mm0 = [xx xx xx xx t2 t1 t0 lt]
+ %assign push_num 0
+ LOAD_2_PARA
+ %ifndef X86_32
+ movsx r1, r1d
+ %endif
+ mov r2, r0
+ ;mov edx, [esp+4] ; pPred
+ ;mov eax, edx
+ ;mov ecx, [esp+8] ; kiStride
+ sub r2, r1
+ movd mm0, [r2-1] ; mm0 = [xx xx xx xx t2 t1 t0 lt]
psllq mm0, 20h ; mm0 = [t2 t1 t0 lt xx xx xx xx]
- movd mm1, [eax+2*ecx-4]
- punpcklbw mm1, [eax+ecx-4] ; mm1[7] = l0, mm1[6] = l1
- lea eax, [eax+2*ecx]
- movd mm2, [eax+2*ecx-4]
- punpcklbw mm2, [eax+ecx-4] ; mm2[7] = l2, mm2[6] = l3
+ movd mm1, [r2+2*r1-4]
+ punpcklbw mm1, [r2+r1-4] ; mm1[7] = l0, mm1[6] = l1
+ lea r2, [r2+2*r1]
+ movd mm2, [r2+2*r1-4]
+ punpcklbw mm2, [r2+r1-4] ; mm2[7] = l2, mm2[6] = l3
punpckhwd mm2, mm1 ; mm2 = [l0 l1 l2 l3 xx xx xx xx]
psrlq mm2, 20h
pxor mm0, mm2 ; mm0 = [t2 t1 t0 lt l0 l1 l2 l3]
@@ -758,14 +809,14 @@
pxor mm2, mm4 ; mm2 = [d c b a xx xx xx xx]
psrlq mm2, 20h ; mm2 = [xx xx xx xx d c b a]
- movd [edx], mm2
- lea edx, [edx+ecx]
- movd [edx+2*ecx], mm3
- sub edx, ecx
+ movd [r0], mm2
+ lea r0, [r0+r1]
+ movd [r0+2*r1], mm3
+ sub r0, r1
psrlq mm3, 10h
- movd [edx+2*ecx], mm3
+ movd [r0+2*r1], mm3
psrlq mm3, 10h
- movd [edx+ecx], mm3
+ movd [r0+r1], mm3
WELSEMMS
ret
@@ -800,15 +851,21 @@
;*******************************************************************************
WELS_EXTERN WelsI4x4LumaPredHU_mmx
WelsI4x4LumaPredHU_mmx:
- mov edx, [esp+4] ; pPred
- mov eax, edx
- mov ecx, [esp+8] ; kiStride
+ %assign push_num 0
+ LOAD_2_PARA
+ %ifndef X86_32
+ movsx r1, r1d
+ %endif
+ mov r2, r0
+ ;mov edx, [esp+4] ; pPred
+ ;mov eax, edx
+ ;mov ecx, [esp+8] ; kiStride
- movd mm0, [eax-4] ; mm0[3] = l0
- punpcklbw mm0, [eax+ecx-4] ; mm0[7] = l1, mm0[6] = l0
- lea eax, [eax+2*ecx]
- movd mm2, [eax-4] ; mm2[3] = l2
- movd mm4, [eax+ecx-4] ; mm4[3] = l3
+ movd mm0, [r2-4] ; mm0[3] = l0
+ punpcklbw mm0, [r2+r1-4] ; mm0[7] = l1, mm0[6] = l0
+ lea r2, [r2+2*r1]
+ movd mm2, [r2-4] ; mm2[3] = l2
+ movd mm4, [r2+r1-4] ; mm4[3] = l3
punpcklbw mm2, mm4
punpckhwd mm0, mm2 ; mm0 = [l3 l2 l1 l0 xx xx xx xx]
@@ -841,15 +898,15 @@
punpckhbw mm4, mm4 ; mm4 = [g g g g xx xx xx xx]
psrlq mm4, 20h
- lea edx, [edx+ecx]
- movd [edx+2*ecx], mm4
+ lea r0, [r0+r1]
+ movd [r0+2*r1], mm4
- sub edx, ecx
- movd [edx], mm1
+ sub r0, r1
+ movd [r0], mm1
psrlq mm1, 10h
- movd [edx+ecx], mm1
+ movd [r0+r1], mm1
psrlq mm1, 10h
- movd [edx+2*ecx], mm1
+ movd [r0+2*r1], mm1
WELSEMMS
ret
@@ -886,17 +943,23 @@
;*******************************************************************************
WELS_EXTERN WelsI4x4LumaPredVR_mmx
WelsI4x4LumaPredVR_mmx:
- mov edx, [esp+4] ; pPred
- mov eax, edx
- mov ecx, [esp+8] ; kiStride
- sub eax, ecx
- movq mm0, [eax-1] ; mm0 = [xx xx xx t3 t2 t1 t0 lt]
+ %assign push_num 0
+ LOAD_2_PARA
+ %ifndef X86_32
+ movsx r1, r1d
+ %endif
+ mov r2, r0
+ ;mov edx, [esp+4] ; pPred
+ ;mov eax, edx
+ ;mov ecx, [esp+8] ; kiStride
+ sub r2, r1
+ movq mm0, [r2-1] ; mm0 = [xx xx xx t3 t2 t1 t0 lt]
psllq mm0, 18h ; mm0 = [t3 t2 t1 t0 lt xx xx xx]
- movd mm1, [eax+2*ecx-4]
- punpcklbw mm1, [eax+ecx-4] ; mm1[7] = l0, mm1[6] = l1
- lea eax, [eax+2*ecx]
- movq mm2, [eax+ecx-8] ; mm2[7] = l2
+ movd mm1, [r2+2*r1-4]
+ punpcklbw mm1, [r2+r1-4] ; mm1[7] = l0, mm1[6] = l1
+ lea r2, [r2+2*r1]
+ movq mm2, [r2+r1-8] ; mm2[7] = l2
punpckhwd mm2, mm1 ; mm2 = [l0 l1 l2 xx xx xx xx xx]
psrlq mm2, 28h
pxor mm0, mm2 ; mm0 = [t3 t2 t1 t0 lt l0 l1 l2]
@@ -920,10 +983,10 @@
movq mm2, mm3
psrlq mm1, 20h ; mm1 = [xx xx xx xx d c b a]
- movd [edx], mm1
+ movd [r0], mm1
psrlq mm2, 20h ; mm2 = [xx xx xx xx h g f e]
- movd [edx+ecx], mm2
+ movd [r0+r1], mm2
movq mm4, mm3
psllq mm4, 20h
@@ -935,12 +998,12 @@
psllq mm1, 8h
pxor mm4, mm1 ; mm4 = [xx xx xx xx c b a i]
- movd [edx+2*ecx], mm4
+ movd [r0+2*r1], mm4
psllq mm2, 8h
pxor mm5, mm2 ; mm5 = [xx xx xx xx g f e j]
- lea edx, [edx+2*ecx]
- movd [edx+ecx], mm5
+ lea r0, [r0+2*r1]
+ movd [r0+r1], mm5
WELSEMMS
ret
@@ -973,11 +1036,17 @@
;*******************************************************************************
WELS_EXTERN WelsI4x4LumaPredDDL_mmx
WelsI4x4LumaPredDDL_mmx:
- mov edx, [esp+4] ; pPred
- mov eax, edx
- mov ecx, [esp+8] ; kiStride
- sub eax, ecx
- movq mm0, [eax] ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
+ %assign push_num 0
+ LOAD_2_PARA
+ %ifndef X86_32
+ movsx r1, r1d
+ %endif
+ mov r2, r0
+ ;mov edx, [esp+4] ; pPred
+ ;mov eax, edx
+ ;mov ecx, [esp+8] ; kiStride
+ sub r2, r1
+ movq mm0, [r2] ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
movq mm1, mm0
movq mm2, mm0
@@ -998,14 +1067,14 @@
pavgb mm0, mm1 ; mm0 = [g f e d c b a xx]
psrlq mm0, 8h
- movd [edx], mm0
+ movd [r0], mm0
psrlq mm0, 8h
- movd [edx+ecx], mm0
+ movd [r0+r1], mm0
psrlq mm0, 8h
- movd [edx+2*ecx], mm0
+ movd [r0+2*r1], mm0
psrlq mm0, 8h
- lea edx, [edx+2*ecx]
- movd [edx+ecx], mm0
+ lea r0, [r0+2*r1]
+ movd [r0+r1], mm0
WELSEMMS
ret
@@ -1042,12 +1111,18 @@
;*******************************************************************************
WELS_EXTERN WelsI4x4LumaPredVL_mmx
WelsI4x4LumaPredVL_mmx:
- mov edx, [esp+4] ; pPred
- mov eax, edx
- mov ecx, [esp+8] ; kiStride
+ %assign push_num 0
+ LOAD_2_PARA
+ %ifndef X86_32
+ movsx r1, r1d
+ %endif
+ mov r2, r0
+ ;mov edx, [esp+4] ; pPred
+ ;mov eax, edx
+ ;mov ecx, [esp+8] ; kiStride
- sub eax, ecx
- movq mm0, [eax] ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
+ sub r2, r1
+ movq mm0, [r2] ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
movq mm1, mm0
movq mm2, mm0
@@ -1065,14 +1140,14 @@
pavgb mm2, mm1 ; mm2 = [xx xx xx j h g f e]
- movd [edx], mm3
+ movd [r0], mm3
psrlq mm3, 8h
- movd [edx+2*ecx], mm3
+ movd [r0+2*r1], mm3
- movd [edx+ecx], mm2
+ movd [r0+r1], mm2
psrlq mm2, 8h
- lea edx, [edx+2*ecx]
- movd [edx+ecx], mm2
+ lea r0, [r0+2*r1]
+ movd [r0+r1], mm2
WELSEMMS
ret
@@ -1083,34 +1158,42 @@
;*******************************************************************************
WELS_EXTERN WelsIChromaPredDc_sse2
WelsIChromaPredDc_sse2:
- push ebx
- mov eax, [esp+8] ; pPred
- mov ecx, [esp+12] ; kiStride
+ push r3
+ push r4
+ %assign push_num 2
+ LOAD_2_PARA
+ %ifndef X86_32
+ movsx r1, r1d
+ %endif
+ mov r4, r0
+ ;push ebx
+ ;mov eax, [esp+8] ; pPred
+ ;mov ecx, [esp+12] ; kiStride
- sub eax, ecx
- movq mm0, [eax]
+ sub r0, r1
+ movq mm0, [r0]
- movzx ebx, byte [eax+ecx-0x01] ; l1
- lea eax, [eax+2*ecx]
- movzx edx, byte [eax-0x01] ; l2
- add ebx, edx
- movzx edx, byte [eax+ecx-0x01] ; l3
- add ebx, edx
- lea eax, [eax+2*ecx]
- movzx edx, byte [eax-0x01] ; l4
- add ebx, edx
- movd mm1, ebx ; mm1 = l1+l2+l3+l4
+ movzx r2, byte [r0+r1-0x01] ; l1
+ lea r0, [r0+2*r1]
+ movzx r3, byte [r0-0x01] ; l2
+ add r2, r3
+ movzx r3, byte [r0+r1-0x01] ; l3
+ add r2, r3
+ lea r0, [r0+2*r1]
+ movzx r3, byte [r0-0x01] ; l4
+ add r2, r3
+ movd mm1, r2d ; mm1 = l1+l2+l3+l4
- movzx ebx, byte [eax+ecx-0x01] ; l5
- lea eax, [eax+2*ecx]
- movzx edx, byte [eax-0x01] ; l6
- add ebx, edx
- movzx edx, byte [eax+ecx-0x01] ; l7
- add ebx, edx
- lea eax, [eax+2*ecx]
- movzx edx, byte [eax-0x01] ; l8
- add ebx, edx
- movd mm2, ebx ; mm2 = l5+l6+l7+l8
+ movzx r2, byte [r0+r1-0x01] ; l5
+ lea r0, [r0+2*r1]
+ movzx r3, byte [r0-0x01] ; l6
+ add r2, r3
+ movzx r3, byte [r0+r1-0x01] ; l7
+ add r2, r3
+ lea r0, [r0+2*r1]
+ movzx r3, byte [r0-0x01] ; l8
+ add r2, r3
+ movd mm2, r2d ; mm2 = l5+l6+l7+l8
movq mm3, mm0
psrlq mm0, 0x20
@@ -1150,22 +1233,24 @@
psllq mm1, 0x20
pxor mm1, mm2 ; mm2 = m_down
- mov edx, [esp+8] ; pPred
+ ;mov edx, [esp+8] ; pPred
- movq [edx], mm0
- movq [edx+ecx], mm0
- movq [edx+2*ecx], mm0
- lea edx, [edx+2*ecx]
- movq [edx+ecx], mm0
+ movq [r4], mm0
+ movq [r4+r1], mm0
+ movq [r4+2*r1], mm0
+ lea r4, [r4+2*r1]
+ movq [r4+r1], mm0
- movq [edx+2*ecx], mm1
- lea edx, [edx+2*ecx]
- movq [edx+ecx], mm1
- movq [edx+2*ecx], mm1
- lea edx, [edx+2*ecx]
- movq [edx+ecx], mm1
+ movq [r4+2*r1], mm1
+ lea r4, [r4+2*r1]
+ movq [r4+r1], mm1
+ movq [r4+2*r1], mm1
+ lea r4, [r4+2*r1]
+ movq [r4+r1], mm1
- pop ebx
+ ;pop ebx
+ pop r4
+ pop r3
WELSEMMS
ret
@@ -1178,12 +1263,19 @@
;*******************************************************************************
WELS_EXTERN WelsI16x16LumaPredDc_sse2
WelsI16x16LumaPredDc_sse2:
- push ebx
- mov eax, [esp+8] ; pPred
- mov ecx, [esp+12] ; kiStride
-
- sub eax, ecx
- movdqa xmm0, [eax] ; read one row
+ ;push ebx
+ ;mov eax, [esp+8] ; pPred
+ ;mov ecx, [esp+12] ; kiStride
+ push r3
+ push r4
+ %assign push_num 2
+ LOAD_2_PARA
+ %ifndef X86_32
+ movsx r1, r1d
+ %endif
+ mov r4, r0
+ sub r0, r1
+ movdqa xmm0, [r0] ; read one row
pxor xmm1, xmm1
psadbw xmm0, xmm1
movdqa xmm1, xmm0
@@ -1192,10 +1284,10 @@
psrldq xmm0, 0x08
paddw xmm0, xmm1
- movzx ebx, byte [eax+ecx-0x01]
- movzx edx, byte [eax+2*ecx-0x01]
- add ebx, edx
- lea eax, [eax+ecx]
+ movzx r2, byte [r0+r1-0x01]
+ movzx r3, byte [r0+2*r1-0x01]
+ add r2, r3
+ lea r0, [r0+r1]
LOAD_2_LEFT_AND_ADD
LOAD_2_LEFT_AND_ADD
LOAD_2_LEFT_AND_ADD
@@ -1203,47 +1295,49 @@
LOAD_2_LEFT_AND_ADD
LOAD_2_LEFT_AND_ADD
LOAD_2_LEFT_AND_ADD
- add ebx, 0x10
- movd xmm1, ebx
+ add r2, 0x10
+ movd xmm1, r2d
paddw xmm0, xmm1
psrld xmm0, 0x05
pmuludq xmm0, [mmx_01bytes]
pshufd xmm0, xmm0, 0
- mov edx, [esp+8] ; pPred
+ ;mov edx, [esp+8] ; pPred
- movdqa [edx], xmm0
- movdqa [edx+ecx], xmm0
- movdqa [edx+2*ecx], xmm0
- lea edx, [edx+2*ecx]
+ movdqa [r4], xmm0
+ movdqa [r4+r1], xmm0
+ movdqa [r4+2*r1], xmm0
+ lea r4, [r4+2*r1]
- movdqa [edx+ecx], xmm0
- movdqa [edx+2*ecx], xmm0
- lea edx, [edx+2*ecx]
+ movdqa [r4+r1], xmm0
+ movdqa [r4+2*r1], xmm0
+ lea r4, [r4+2*r1]
- movdqa [edx+ecx], xmm0
- movdqa [edx+2*ecx], xmm0
- lea edx, [edx+2*ecx]
+ movdqa [r4+r1], xmm0
+ movdqa [r4+2*r1], xmm0
+ lea r4, [r4+2*r1]
- movdqa [edx+ecx], xmm0
- movdqa [edx+2*ecx], xmm0
- lea edx, [edx+2*ecx]
+ movdqa [r4+r1], xmm0
+ movdqa [r4+2*r1], xmm0
+ lea r4, [r4+2*r1]
- movdqa [edx+ecx], xmm0
- movdqa [edx+2*ecx], xmm0
- lea edx, [edx+2*ecx]
+ movdqa [r4+r1], xmm0
+ movdqa [r4+2*r1], xmm0
+ lea r4, [r4+2*r1]
- movdqa [edx+ecx], xmm0
- movdqa [edx+2*ecx], xmm0
- lea edx, [edx+2*ecx]
+ movdqa [r4+r1], xmm0
+ movdqa [r4+2*r1], xmm0
+ lea r4, [r4+2*r1]
- movdqa [edx+ecx], xmm0
- movdqa [edx+2*ecx], xmm0
- lea edx, [edx+2*ecx]
+ movdqa [r4+r1], xmm0
+ movdqa [r4+2*r1], xmm0
+ lea r4, [r4+2*r1]
- movdqa [edx+ecx], xmm0
+ movdqa [r4+r1], xmm0
- pop ebx
+ ;pop ebx
+ pop r4
+ pop r3
ret
@@ -1257,68 +1351,78 @@
;*******************************************************************************
WELS_EXTERN WelsI16x16LumaPredDcTop_sse2
WelsI16x16LumaPredDcTop_sse2:
- push ebx
-
- %define PUSH_SIZE 4
-
- mov eax, [esp+PUSH_SIZE+4] ; pPred
- mov ebx, [esp+PUSH_SIZE+8] ; kiStride
-
- mov ecx, ebx
- neg ecx
- movdqa xmm0, [eax+ecx] ; pPred-kiStride, top line
+ ; Fills 16 rows of 16 bytes at pPred with a dc value derived from the 16
+ ; pixels of the row above: dc = (sum + 8) >> 4.
+ ; LOAD_2_PARA is defined outside this view; it is expected to leave pPred
+ ; in r0 and kiStride in r1. Nothing is pushed, so push_num is 0.
+ %assign push_num 0
+ LOAD_2_PARA
+ %ifndef X86_32
+ movsxd r1, r1d
+ %endif
+ mov r2, r0
+ sub r2, r1
+ movdqa xmm0, [r2] ; pPred-kiStride, top line
pxor xmm7, xmm7
+ psadbw xmm0, xmm7 ; two partial sums: bytes 0-7 (low qword) and bytes 8-15 (high qword)
movdqa xmm1, xmm0
- punpcklbw xmm0, xmm7
- punpckhbw xmm1, xmm7
+ psrldq xmm1, 8
+ paddw xmm0, xmm1 ; low word = sum of all 16 top pixels
+ xor r2, r2
+ movd r2d, xmm0
+ ;movdqa xmm1, xmm0
+ ;punpcklbw xmm0, xmm7
+ ;punpckhbw xmm1, xmm7
- paddw xmm0, xmm1 ; (ub.max(ff) << 4) will not excceed of uw, so can perform it in unit of unsigned word scope
- pshufd xmm1, xmm0, 04eh ; 01001110, w3w2w1w0,w7w6w5w4
- paddw xmm0, xmm1 ; w3+7 w2+6 w1+5 w0+4 w3+7 w2+6 w1+5 w0+4
- pshufd xmm1, xmm0, 0b1h ; 10110001, w1+5 w0+4 w3+7 w2+6 w1+5 w0+4 w3+7 w2+6
- paddw xmm0, xmm1 ; w_o w_e w_o w_e w_o w_e w_o w_e (w_o=1+3+5+7, w_e=0+2+4+6)
- pshuflw xmm1, xmm0, 0b1h ; 10110001
- paddw xmm0, xmm1 ; sum in word unit (x8)
- movd edx, xmm0
- and edx, 0ffffh
+ ;paddw xmm0, xmm1 ; (ub.max(ff) << 4) will not exceed uw, so it can be performed in unsigned word scope
+ ;pshufd xmm1, xmm0, 04eh ; 01001110, w3w2w1w0,w7w6w5w4
+ ;paddw xmm0, xmm1 ; w3+7 w2+6 w1+5 w0+4 w3+7 w2+6 w1+5 w0+4
+ ;pshufd xmm1, xmm0, 0b1h ; 10110001, w1+5 w0+4 w3+7 w2+6 w1+5 w0+4 w3+7 w2+6
+ ;paddw xmm0, xmm1 ; w_o w_e w_o w_e w_o w_e w_o w_e (w_o=1+3+5+7, w_e=0+2+4+6)
+ ;pshuflw xmm1, xmm0, 0b1h ; 10110001
+ ;paddw xmm0, xmm1 ; sum in word unit (x8)
+ ;xor r3, r3
+ ;movd r3d, xmm0
+ ;and edx, 0ffffh
- add edx, 08h
- sar edx, 04h
- mov dh, dl
- mov ecx, edx
- shl ecx, 010h
- or edx, ecx
- movd xmm1, edx
- pshufd xmm0, xmm1, 00h
- movdqa xmm1, xmm0
+ add r2, 8 ; rounding term
+ sar r2, 4 ; dc = (sum + 8) >> 4
+ SSE2_Copy16Times xmm1, r2d ; macro not visible here; presumably broadcasts the low byte to all 16 lanes
+ ;mov dh, dl
+ ;mov r2, edx
+ ;shl r2, 010h
+ ;or edx, r2
+ ;movd xmm1, edx
+ ;pshufd xmm0, xmm1, 00h
+ ;movdqa xmm1, xmm0
+ movdqa xmm0, xmm1
+ lea r2, [2*r1+r1] ; 3*kiStride
- lea ecx, [2*ebx+ebx] ; 3*kiStride
+ movdqa [r0], xmm0
+ movdqa [r0+r1], xmm1
+ movdqa [r0+2*r1], xmm0
+ movdqa [r0+r2], xmm1
- movdqa [eax], xmm0
- movdqa [eax+ebx], xmm1
- movdqa [eax+2*ebx], xmm0
- movdqa [eax+ecx], xmm1
+ lea r0, [r0+4*r1]
+ movdqa [r0], xmm0
+ movdqa [r0+r1], xmm1
+ movdqa [r0+2*r1], xmm0
+ movdqa [r0+r2], xmm1
- lea eax, [eax+4*ebx]
- movdqa [eax], xmm0
- movdqa [eax+ebx], xmm1
- movdqa [eax+2*ebx], xmm0
- movdqa [eax+ecx], xmm1
+ lea r0, [r0+4*r1]
+ movdqa [r0], xmm0
+ movdqa [r0+r1], xmm1
+ movdqa [r0+2*r1], xmm0
+ movdqa [r0+r2], xmm1
- lea eax, [eax+4*ebx]
- movdqa [eax], xmm0
- movdqa [eax+ebx], xmm1
- movdqa [eax+2*ebx], xmm0
- movdqa [eax+ecx], xmm1
+ lea r0, [r0+4*r1]
+ movdqa [r0], xmm0
+ movdqa [r0+r1], xmm1
+ movdqa [r0+2*r1], xmm0
+ movdqa [r0+r2], xmm1
- lea eax, [eax+4*ebx]
- movdqa [eax], xmm0
- movdqa [eax+ebx], xmm1
- movdqa [eax+2*ebx], xmm0
- movdqa [eax+ecx], xmm1
-
- %undef PUSH_SIZE
- pop ebx
+ ;%undef PUSH_SIZE
+ ;pop ebx
ret
ALIGN 16
@@ -1327,40 +1431,44 @@
;*******************************************************************************
WELS_EXTERN WelsI16x16LumaPredDcNA_sse2
WelsI16x16LumaPredDcNA_sse2:
- push ebx
+ ; Fills 16 rows of 16 bytes at pPred with the constant loaded from sse2_dc_0x80.
- %define PUSH_SIZE 4
+ ; LOAD_2_PARA is defined outside this view; expected: r0 = pPred, r1 = kiStride.
- mov eax, [esp+PUSH_SIZE+4] ; pPred
- mov ebx, [esp+PUSH_SIZE+8] ; kiStride
+ ; No neighbouring pixels are read, and nothing is pushed, so push_num is 0.
+ ; 64-bit builds sign-extend the 32-bit kiStride before it is used in addresses.
+ %assign push_num 0
+ LOAD_2_PARA
+ %ifndef X86_32
+ movsxd r1, r1d
+ %endif
+ lea r2, [2*r1+r1] ; 3*kiStride
- lea ecx, [2*ebx+ebx] ; 3*kiStride
-
movdqa xmm0, [sse2_dc_0x80]
movdqa xmm1, xmm0
- movdqa [eax], xmm0
- movdqa [eax+ebx], xmm1
- movdqa [eax+2*ebx], xmm0
- movdqa [eax+ecx], xmm1
- lea eax, [eax+4*ebx]
- movdqa [eax], xmm0
- movdqa [eax+ebx], xmm1
- movdqa [eax+2*ebx], xmm0
- movdqa [eax+ecx], xmm1
- lea eax, [eax+4*ebx]
- movdqa [eax], xmm0
- movdqa [eax+ebx], xmm1
- movdqa [eax+2*ebx], xmm0
- movdqa [eax+ecx], xmm1
- lea eax, [eax+4*ebx]
- movdqa [eax], xmm0
- movdqa [eax+ebx], xmm1
- movdqa [eax+2*ebx], xmm0
- movdqa [eax+ecx], xmm1
+ movdqa [r0], xmm0 ; rows 0-3
+ movdqa [r0+r1], xmm1
+ movdqa [r0+2*r1], xmm0
+ movdqa [r0+r2], xmm1
+ lea r0, [r0+4*r1]
+ movdqa [r0], xmm0 ; rows 4-7
+ movdqa [r0+r1], xmm1
+ movdqa [r0+2*r1], xmm0
+ movdqa [r0+r2], xmm1
+ lea r0, [r0+4*r1]
+ movdqa [r0], xmm0 ; rows 8-11
+ movdqa [r0+r1], xmm1
+ movdqa [r0+2*r1], xmm0
+ movdqa [r0+r2], xmm1
+ lea r0, [r0+4*r1]
+ movdqa [r0], xmm0 ; rows 12-15
+ movdqa [r0+r1], xmm1
+ movdqa [r0+2*r1], xmm0
+ movdqa [r0+r2], xmm1
- %undef PUSH_SIZE
+ ;%undef PUSH_SIZE
- pop ebx
+ ;pop ebx
ret
ALIGN 16
@@ -1369,58 +1477,80 @@
;*******************************************************************************
WELS_EXTERN WelsIChromaPredDcLeft_mmx
WelsIChromaPredDcLeft_mmx:
- push ebx
- push esi
- %define PUSH_SIZE 8
- mov esi, [esp+PUSH_SIZE+4] ; pPred
- mov ecx, [esp+PUSH_SIZE+8] ; kiStride
- mov eax, esi
+ ; Fills the 8x8 block at pPred from its left-neighbour column only:
+ ; rows 0-3 get (l0+l1+l2+l3+2)>>2, rows 4-7 get (l4+l5+l6+l7+2)>>2.
+ ; LOAD_2_PARA is defined outside this view; it is expected to leave pPred
+ ; in r0 and kiStride in r1 (push_num = number of pushes done before it).
+ ; r0 walks the left column, r2 holds the running sum, r3 is scratch,
+ ; r4 keeps pPred for the store phase.
+ push r3
+ push r4
+ %assign push_num 2
+ LOAD_2_PARA
+ %ifndef X86_32
+ movsxd r1, r1d
+ %endif
+ mov r4, r0
; for left
- dec eax
- xor ebx, ebx
- xor edx, edx
- mov bl, [eax]
- mov dl, [eax+ecx]
- add ebx, edx
- lea eax, [eax+2*ecx]
- mov dl, [eax]
- add ebx, edx
- mov dl, [eax+ecx]
- add ebx, edx
- add ebx, 02h
- sar ebx, 02h
- mov bh, bl
- movd mm1, ebx
- pshufw mm0, mm1, 00h ; up64
+ dec r0 ; r0 -> pixel left of row 0
+ xor r2, r2
+ xor r3, r3
+ movzx r2, byte [r0] ; l0
+ movzx r3, byte [r0+r1] ; l1
+ add r2, r3
+ lea r0, [r0+2*r1]
+ movzx r3, byte [r0] ; l2
+ add r2, r3
+ movzx r3, byte [r0+r1] ; l3
+ add r2, r3
+ add r2, 02h ; rounding term
+ sar r2, 02h ; dc_up = (l0+l1+l2+l3+2) >> 2
+ ;SSE2_Copy16Times mm0, r2d
+ mov r3, r2
+ sal r3, 8
+ or r2, r3 ; low word of r2 = dc:dc (replaces the old "mov bh, bl")
+ movd mm1, r2d
+ pshufw mm0, mm1, 00h ; up64: broadcast the word to all four words
+ ;mov bh, bl
+ ;movd mm1, ebx
+ ;pshufw mm0, mm1, 00h ; up64
movq mm1, mm0
- xor ebx, ebx
- lea eax, [eax+2*ecx]
- mov bl, [eax]
- mov dl, [eax+ecx]
- add ebx, edx
- lea eax, [eax+2*ecx]
- mov dl, [eax]
- add ebx, edx
- mov dl, [eax+ecx]
- add ebx, edx
- add ebx, 02h
- sar ebx, 02h
- mov bh, bl
- movd mm3, ebx
- pshufw mm2, mm3, 00h ; down64
+ xor r2, r2
+ lea r0, [r0+2*r1]
+ movzx r2, byte [r0] ; l4
+ movzx r3, byte [r0+r1] ; l5
+ add r2, r3
+ lea r0, [r0+2*r1]
+ movzx r3, byte [r0] ; l6
+ add r2, r3
+ movzx r3, byte [r0+r1] ; l7
+ add r2, r3
+ add r2, 02h ; rounding term
+ sar r2, 02h ; dc_down = (l4+l5+l6+l7+2) >> 2
+ mov r3, r2
+ sal r3, 8
+ or r2, r3 ; low word of r2 = dc:dc
+ movd mm3, r2d
+ pshufw mm2, mm3, 00h ; down64: broadcast the word to all four words
+ ;mov bh, bl
+ ;movd mm3, ebx
+ ;pshufw mm2, mm3, 00h ; down64
+ ;SSE2_Copy16Times mm2, r2d
movq mm3, mm2
- lea ebx, [2*ecx+ecx]
- movq [esi], mm0
- movq [esi+ecx], mm1
- movq [esi+2*ecx], mm0
- movq [esi+ebx], mm1
- lea esi, [esi+4*ecx]
- movq [esi], mm2
- movq [esi+ecx], mm3
- movq [esi+2*ecx], mm2
- movq [esi+ebx], mm3
- pop esi
- pop ebx
+ lea r2, [2*r1+r1] ; 3*kiStride
+ movq [r4], mm0 ; rows 0-3 <- dc_up
+ movq [r4+r1], mm1
+ movq [r4+2*r1], mm0
+ movq [r4+r2], mm1
+ lea r4, [r4+4*r1]
+ movq [r4], mm2 ; rows 4-7 <- dc_down
+ movq [r4+r1], mm3
+ movq [r4+2*r1], mm2
+ movq [r4+r2], mm3
+ ;pop esi
+ ;pop ebx
+ pop r4
+ pop r3
emms
ret
@@ -1430,13 +1560,20 @@
;*******************************************************************************
WELS_EXTERN WelsIChromaPredDcTop_sse2
WelsIChromaPredDcTop_sse2:
- push ebx
- %define PUSH_SIZE 4
- mov eax, [esp+PUSH_SIZE+4] ; pPred
- mov ecx, [esp+PUSH_SIZE+8] ; kiStride
- mov ebx, ecx
- neg ebx
- movq xmm0, [eax+ebx] ; top: 8x1 pixels
+ ;push ebx
+ ;%define PUSH_SIZE 4
+ ;mov eax, [esp+PUSH_SIZE+4] ; pPred
+ ;mov ecx, [esp+PUSH_SIZE+8] ; kiStride
+ ;mov ebx, ecx
+ ;neg ebx
+ %assign push_num 0
+ LOAD_2_PARA
+ %ifndef X86_32
+ movsx r1, r1d
+ %endif
+ mov r2, r0
+ sub r2, r1
+ movq xmm0, [r2] ; top: 8x1 pixels
pxor xmm7, xmm7
punpcklbw xmm0, xmm7 ; ext 8x2 words
pshufd xmm1, xmm0, 0B1h ; 10110001 B, w5 w4 w7 w6 w1 w0 w3 w2
@@ -1452,21 +1589,20 @@
paddw xmm0, xmm6
psraw xmm0, 02h
packuswb xmm0, xmm7
- lea ebx, [2*ecx+ecx]
- movq [eax], xmm0
- movq [eax+ecx], xmm0
- movq [eax+2*ecx], xmm0
- movq [eax+ebx], xmm0
- lea eax, [eax+4*ecx]
- movq [eax], xmm0
- movq [eax+ecx], xmm0
- movq [eax+2*ecx], xmm0
- movq [eax+ebx], xmm0
- %undef PUSH_SIZE
- pop ebx
+ lea r2, [2*r1+r1]
+ movq [r0], xmm0
+ movq [r0+r1], xmm0
+ movq [r0+2*r1], xmm0
+ movq [r0+r2], xmm0
+ lea r0, [r0+4*r1]
+ movq [r0], xmm0
+ movq [r0+r1], xmm0
+ movq [r0+2*r1], xmm0
+ movq [r0+r2], xmm0
+ ;%undef PUSH_SIZE
+ ;pop ebx
ret
-
ALIGN 16
;*******************************************************************************
; void_t WelsIChromaPredDcNA_mmx(uint8_t *pPred, const int32_t kiStride)
@@ -1473,26 +1609,29 @@
;*******************************************************************************
WELS_EXTERN WelsIChromaPredDcNA_mmx
WelsIChromaPredDcNA_mmx:
- push ebx
- %define PUSH_SIZE 4
- mov eax, [esp+PUSH_SIZE+4] ; pPred
- mov ebx, [esp+PUSH_SIZE+8] ; kiStride
- lea ecx, [2*ebx+ebx]
+ ; Fills 8 rows of 8 bytes at pPred with the constant loaded from sse2_dc_0x80.
+ ; LOAD_2_PARA is defined outside this view; expected: r0 = pPred, r1 = kiStride.
+ ; No neighbouring pixels are read, and nothing is pushed, so push_num is 0.
+ ; 64-bit builds sign-extend the 32-bit kiStride before it is used in addresses.
+ %assign push_num 0
+ LOAD_2_PARA
+ %ifndef X86_32
+ movsxd r1, r1d
+ %endif
+ lea r2, [2*r1+r1] ; 3*kiStride
movq mm0, [sse2_dc_0x80]
movq mm1, mm0
- movq [eax], mm0
- movq [eax+ebx], mm1
- movq [eax+2*ebx], mm0
- movq [eax+ecx], mm1
- lea eax, [eax+4*ebx]
- movq [eax], mm0
- movq [eax+ebx], mm1
- movq [eax+2*ebx], mm0
- movq [eax+ecx], mm1
- %undef PUSH_SIZE
- pop ebx
+ movq [r0], mm0 ; rows 0-3
+ movq [r0+r1], mm1
+ movq [r0+2*r1], mm0
+ movq [r0+r2], mm1
+ lea r0, [r0+4*r1]
+ movq [r0], mm0 ; rows 4-7
+ movq [r0+r1], mm1
+ movq [r0+2*r1], mm0
+ movq [r0+r2], mm1
+ ;%undef PUSH_SIZE
+ ;pop ebx
emms
ret
-
-
--- a/codec/decoder/core/asm/mb_copy.asm
+++ /dev/null
@@ -1,330 +1,0 @@
-;*!
-;* \copy
-;* Copyright (c) 2009-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* mb_copy.asm
-;*
-;* Abstract
-;* mb_copy and mb_copy1
-;*
-;* History
-;* 15/09/2009 Created
-;* 12/28/2009 Modified with larger throughput
-;* 12/29/2011 Tuned WelsCopy16x16NotAligned_sse2, added UpdateMbMv_sse2 WelsCopy16x8NotAligned_sse2,
-;* WelsCopy16x8_mmx, WelsCopy8x16_mmx etc;
-;*
-;*
-;*********************************************************************************************/
-%include "asm_inc.asm"
-BITS 32
-
-;*******************************************************************************
-; Macros and other preprocessor constants
-;*******************************************************************************
-
-;*******************************************************************************
-; Local Data (Read Only)
-;*******************************************************************************
-
-;SECTION .rodata data align=16
-
-;*******************************************************************************
-; Various memory constants (trigonometric values or rounding values)
-;*******************************************************************************
-
-ALIGN 16
-
-;*******************************************************************************
-; Code
-;*******************************************************************************
-
-SECTION .text
-
-WELS_EXTERN PixelAvgWidthEq4_mmx
-WELS_EXTERN PixelAvgWidthEq8_mmx
-WELS_EXTERN PixelAvgWidthEq16_sse2
-
-WELS_EXTERN McCopyWidthEq4_mmx
-WELS_EXTERN McCopyWidthEq8_mmx
-WELS_EXTERN McCopyWidthEq16_sse2
-
-
-ALIGN 16
-;*******************************************************************************
-; void_t PixelAvgWidthEq4_mmx( uint8_t *pDst, int iDstStride,
-; uint8_t *pSrcA, int iSrcAStride,
-; uint8_t *pSrcB, int iSrcBStride,
-; int iHeight );
-;*******************************************************************************
-PixelAvgWidthEq4_mmx:
-
- push esi
- push edi
- push ebp
- push ebx
-
- mov edi, [esp+20] ; pDst
- mov eax, [esp+24] ; iDstStride
- mov esi, [esp+28] ; pSrcA
- mov ecx, [esp+32] ; iSrcAStride
- mov ebp, [esp+36] ; pSrcB
- mov edx, [esp+40] ; iSrcBStride
- mov ebx, [esp+44] ; iHeight
-ALIGN 4
-.height_loop:
- movd mm0, [ebp]
- pavgb mm0, [esi]
- movd [edi], mm0
-
- dec ebx
- lea edi, [edi+eax]
- lea esi, [esi+ecx]
- lea ebp, [ebp+edx]
- jne .height_loop
-
- WELSEMMS
- pop ebx
- pop ebp
- pop edi
- pop esi
- ret
-
-ALIGN 16
-;*******************************************************************************
-; void_t PixelAvgWidthEq8_mmx( uint8_t *pDst, int iDstStride,
-; uint8_t *pSrcA, int iSrcAStride,
-; uint8_t *pSrcB, int iSrcBStride,
-; int iHeight );
-;*******************************************************************************
-PixelAvgWidthEq8_mmx:
-
- push esi
- push edi
- push ebp
- push ebx
-
- mov edi, [esp+20] ; pDst
- mov eax, [esp+24] ; iDstStride
- mov esi, [esp+28] ; pSrcA
- mov ecx, [esp+32] ; iSrcAStride
- mov ebp, [esp+36] ; pSrcB
- mov edx, [esp+40] ; iSrcBStride
- mov ebx, [esp+44] ; iHeight
-ALIGN 4
-.height_loop:
- movq mm0, [esi]
- pavgb mm0, [ebp]
- movq [edi], mm0
- movq mm0, [esi+ecx]
- pavgb mm0, [ebp+edx]
- movq [edi+eax], mm0
-
- lea esi, [esi+2*ecx]
- lea ebp, [ebp+2*edx]
- lea edi, [edi+2*eax]
-
- sub ebx, 2
- jnz .height_loop
-
- WELSEMMS
- pop ebx
- pop ebp
- pop edi
- pop esi
- ret
-
-
-
-ALIGN 16
-;*******************************************************************************
-; void_t PixelAvgWidthEq16_sse2( uint8_t *pDst, int iDstStride,
-; uint8_t *pSrcA, int iSrcAStride,
-; uint8_t *pSrcB, int iSrcBStride,
-; int iHeight );
-;*******************************************************************************
-PixelAvgWidthEq16_sse2:
- push esi
- push edi
- push ebp
- push ebx
-
-
- mov edi, [esp+20] ; pDst
- mov eax, [esp+24] ; iDstStride
- mov esi, [esp+28] ; pSrcA
- mov ecx, [esp+32] ; iSrcAStride
- mov ebp, [esp+36] ; pSrcB
- mov edx, [esp+40] ; iSrcBStride
- mov ebx, [esp+44] ; iHeight
-ALIGN 4
-.height_loop:
- movdqu xmm0, [esi]
- pavgb xmm0, [ebp]
- movdqu [edi], xmm0
-
- movdqu xmm0, [esi+ecx]
- pavgb xmm0, [ebp+edx]
- movdqu [edi+eax], xmm0
-
- movdqu xmm0, [esi+2*ecx]
- pavgb xmm0, [ebp+2*edx]
- movdqu [edi+2*eax], xmm0
-
- lea esi, [esi+2*ecx]
- lea ebp, [ebp+2*edx]
- lea edi, [edi+2*eax]
-
- movdqu xmm0, [esi+ecx]
- pavgb xmm0, [ebp+edx]
- movdqu [edi+eax], xmm0
-
- lea esi, [esi+2*ecx]
- lea ebp, [ebp+2*edx]
- lea edi, [edi+2*eax]
-
-
- sub ebx, 4
- jne .height_loop
-
- WELSEMMS
- pop ebx
- pop ebp
- pop edi
- pop esi
-
- ret
-
-
-ALIGN 16
-;*******************************************************************************
-; void_t McCopyWidthEq4_mmx( uint8_t *pSrc, int iSrcStride,
-; uint8_t *pDst, int iDstStride, int iHeight )
-;*******************************************************************************
-McCopyWidthEq4_mmx:
- push esi
- push edi
- push ebx
-
-
- mov esi, [esp+16]
- mov eax, [esp+20]
- mov edi, [esp+24]
- mov ecx, [esp+28]
- mov edx, [esp+32]
-ALIGN 4
-.height_loop:
- mov ebx, [esi]
- mov [edi], ebx
-
- add esi, eax
- add edi, ecx
- dec edx
- jnz .height_loop
- WELSEMMS
- pop ebx
- pop edi
- pop esi
- ret
-
-ALIGN 16
-;*******************************************************************************
-; void_t McCopyWidthEq8_mmx( uint8_t *pSrc, int iSrcStride,
-; uint8_t *pDst, int iDstStride, int iHeight )
-;*******************************************************************************
-McCopyWidthEq8_mmx:
- push esi
- push edi
- mov esi, [esp+12]
- mov eax, [esp+16]
- mov edi, [esp+20]
- mov ecx, [esp+24]
- mov edx, [esp+28]
-
-ALIGN 4
-.height_loop:
- movq mm0, [esi]
- movq [edi], mm0
- add esi, eax
- add edi, ecx
- dec edx
- jnz .height_loop
-
- WELSEMMS
- pop edi
- pop esi
- ret
-
-
-
-
-
-
-
-
-ALIGN 16
-;*******************************************************************************
-; void_t McCopyWidthEq16_sse2( uint8_t *pSrc, int iSrcStride, uint8_t *pDst, int iDstStride, int iHeight )
-;*******************************************************************************
-;read unaligned memory
-%macro SSE_READ_UNA 2
- movq %1, [%2]
- movhps %1, [%2+8]
-%endmacro
-
-;write unaligned memory
-%macro SSE_WRITE_UNA 2
- movq [%1], %2
- movhps [%1+8], %2
-%endmacro
-McCopyWidthEq16_sse2:
- push esi
- push edi
-
- mov esi, [esp+12] ; pSrc
- mov eax, [esp+16] ; iSrcStride
- mov edi, [esp+20] ; pDst
- mov edx, [esp+24] ; iDstStride
- mov ecx, [esp+28] ; iHeight
-
-ALIGN 4
-.height_loop:
- SSE_READ_UNA xmm0, esi
- SSE_READ_UNA xmm1, esi+eax
- SSE_WRITE_UNA edi, xmm0
- SSE_WRITE_UNA edi+edx, xmm1
-
- sub ecx, 2
- lea esi, [esi+eax*2]
- lea edi, [edi+edx*2]
- jnz .height_loop
-
- pop edi
- pop esi
- ret
--- a/codec/decoder/core/asm/mc_chroma.asm
+++ /dev/null
@@ -1,317 +1,0 @@
-;*!
-;* \copy
-;* Copyright (c) 2004-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* mc_chroma.asm
-;*
-;* Abstract
-;* mmx motion compensation for chroma
-;*
-;* History
-;* 10/13/2004 Created
-;*
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-
-BITS 32
-
-;***********************************************************************
-; Local Data (Read Only)
-;***********************************************************************
-
-SECTION .rodata align=16
-
-;***********************************************************************
-; Various memory constants (trigonometric values or rounding values)
-;***********************************************************************
-
-ALIGN 16
-h264_d0x20_sse2:
- dw 32,32,32,32,32,32,32,32
-ALIGN 16
-h264_d0x20_mmx:
- dw 32,32,32,32
-
-
-;=============================================================================
-; Code
-;=============================================================================
-
-SECTION .text
-
-ALIGN 16
-;*******************************************************************************
-; void McChromaWidthEq4_mmx( uint8_t *src,
-; int32_t iSrcStride,
-; uint8_t *pDst,
-; int32_t iDstStride,
-; uint8_t *pABCD,
-; int32_t iHeigh );
-;*******************************************************************************
-WELS_EXTERN McChromaWidthEq4_mmx
-McChromaWidthEq4_mmx:
- push esi
- push edi
- push ebx
-
- mov eax, [esp +12 + 20]
- movd mm3, [eax]
- WELS_Zero mm7
- punpcklbw mm3, mm3
- movq mm4, mm3
- punpcklwd mm3, mm3
- punpckhwd mm4, mm4
-
- movq mm5, mm3
- punpcklbw mm3, mm7
- punpckhbw mm5, mm7
-
- movq mm6, mm4
- punpcklbw mm4, mm7
- punpckhbw mm6, mm7
-
- mov esi, [esp +12+ 4]
- mov eax, [esp + 12 + 8]
- mov edi, [esp + 12 + 12]
- mov edx, [esp + 12 + 16]
- mov ecx, [esp + 12 + 24]
-
- lea ebx, [esi + eax]
- movd mm0, [esi]
- movd mm1, [esi+1]
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
-.xloop:
-
- pmullw mm0, mm3
- pmullw mm1, mm5
- paddw mm0, mm1
-
- movd mm1, [ebx]
- punpcklbw mm1, mm7
- movq mm2, mm1
- pmullw mm1, mm4
- paddw mm0, mm1
-
- movd mm1, [ebx+1]
- punpcklbw mm1, mm7
- movq mm7, mm1
- pmullw mm1,mm6
- paddw mm0, mm1
- movq mm1,mm7
-
- paddw mm0, [h264_d0x20_mmx]
- psrlw mm0, 6
-
- WELS_Zero mm7
- packuswb mm0, mm7
- movd [edi], mm0
-
- movq mm0, mm2
-
- lea edi, [edi +edx ]
- lea ebx, [ebx + eax]
-
- dec ecx
- jnz near .xloop
- WELSEMMS
- pop ebx
- pop edi
- pop esi
- ret
-
-
-ALIGN 16
-;*******************************************************************************
-; void McChromaWidthEq8_sse2( uint8_t *pSrc,
-; int32_t iSrcStride,
-; uint8_t *pDst,
-; int32_t iDstStride,
-; uint8_t *pABCD,
-; int32_t iheigh );
-;*******************************************************************************
-WELS_EXTERN McChromaWidthEq8_sse2
-McChromaWidthEq8_sse2:
- push esi
- push edi
- push ebx
-
- mov eax, [esp +12 + 20]
- movd xmm3, [eax]
- WELS_Zero xmm7
- punpcklbw xmm3, xmm3
- punpcklwd xmm3, xmm3
-
- movdqa xmm4, xmm3
- punpckldq xmm3, xmm3
- punpckhdq xmm4, xmm4
- movdqa xmm5, xmm3
- movdqa xmm6, xmm4
-
- punpcklbw xmm3, xmm7
- punpckhbw xmm5, xmm7
- punpcklbw xmm4, xmm7
- punpckhbw xmm6, xmm7
-
- mov esi, [esp +12+ 4]
- mov eax, [esp + 12 + 8]
- mov edi, [esp + 12 + 12]
- mov edx, [esp + 12 + 16]
- mov ecx, [esp + 12 + 24]
-
- lea ebx, [esi + eax]
- movq xmm0, [esi]
- movq xmm1, [esi+1]
- punpcklbw xmm0, xmm7
- punpcklbw xmm1, xmm7
-.xloop:
-
- pmullw xmm0, xmm3
- pmullw xmm1, xmm5
- paddw xmm0, xmm1
-
- movq xmm1, [ebx]
- punpcklbw xmm1, xmm7
- movdqa xmm2, xmm1
- pmullw xmm1, xmm4
- paddw xmm0, xmm1
-
- movq xmm1, [ebx+1]
- punpcklbw xmm1, xmm7
- movdqa xmm7, xmm1
- pmullw xmm1, xmm6
- paddw xmm0, xmm1
- movdqa xmm1,xmm7
-
- paddw xmm0, [h264_d0x20_sse2]
- psrlw xmm0, 6
-
- WELS_Zero xmm7
- packuswb xmm0, xmm7
- movq [edi], xmm0
-
- movdqa xmm0, xmm2
-
- lea edi, [edi +edx ]
- lea ebx, [ebx + eax]
-
- dec ecx
- jnz near .xloop
-
- pop ebx
- pop edi
- pop esi
- ret
-
-
-
-
-ALIGN 16
-;***********************************************************************
-; void McChromaWidthEq8_ssse3( uint8_t *pSrc,
-; int32_t iSrcStride,
-; uint8_t *pDst,
-; int32_t iDstStride,
-; uint8_t *pABCD,
-; int32_t iHeigh);
-;***********************************************************************
-WELS_EXTERN McChromaWidthEq8_ssse3
-McChromaWidthEq8_ssse3:
- push ebx
- push esi
- push edi
-
- mov eax, [esp + 12 + 20]
-
- pxor xmm7, xmm7
- movd xmm5, [eax]
- punpcklwd xmm5, xmm5
- punpckldq xmm5, xmm5
- movdqa xmm6, xmm5
- punpcklqdq xmm5, xmm5
- punpckhqdq xmm6, xmm6
-
- mov eax, [esp + 12 + 4]
- mov edx, [esp + 12 + 8]
- mov esi, [esp + 12 + 12]
- mov edi, [esp + 12 + 16]
- mov ecx, [esp + 12 + 24]
-
- sub esi, edi
- sub esi, edi
- movdqa xmm7, [h264_d0x20_sse2]
-
- movdqu xmm0, [eax]
- movdqa xmm1, xmm0
- psrldq xmm1, 1
- punpcklbw xmm0, xmm1
-
-.hloop_chroma:
- lea esi, [esi+2*edi]
-
- movdqu xmm2, [eax+edx]
- movdqa xmm3, xmm2
- psrldq xmm3, 1
- punpcklbw xmm2, xmm3
- movdqa xmm4, xmm2
-
- pmaddubsw xmm0, xmm5
- pmaddubsw xmm2, xmm6
- paddw xmm0, xmm2
- paddw xmm0, xmm7
- psrlw xmm0, 6
- packuswb xmm0, xmm0
- movq [esi],xmm0
-
- lea eax, [eax+2*edx]
- movdqu xmm2, [eax]
- movdqa xmm3, xmm2
- psrldq xmm3, 1
- punpcklbw xmm2, xmm3
- movdqa xmm0, xmm2
-
- pmaddubsw xmm4, xmm5
- pmaddubsw xmm2, xmm6
- paddw xmm4, xmm2
- paddw xmm4, xmm7
- psrlw xmm4, 6
- packuswb xmm4, xmm4
- movq [esi+edi],xmm4
-
- sub ecx, 2
- jnz .hloop_chroma
- pop edi
- pop esi
- pop ebx
-
- ret
-
-
--- a/codec/decoder/core/asm/mc_luma.asm
+++ /dev/null
@@ -1,615 +1,0 @@
-;*!
-;* \copy
-;* Copyright (c) 2009-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* mc_luma.asm
-;*
-;* Abstract
-;* sse2 motion compensation
-;*
-;* History
-;* 17/08/2009 Created
-;*
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-
-BITS 32
-
-;*******************************************************************************
-; Local Data (Read Only)
-;*******************************************************************************
-
-SECTION .rodata align=16
-
-;*******************************************************************************
-; Various memory constants (trigonometric values or rounding values)
-;*******************************************************************************
-
-ALIGN 16
-h264_w0x10:
- dw 16, 16, 16, 16
-
-
-;*******************************************************************************
-; Code
-;*******************************************************************************
-
-SECTION .text
-
-WELS_EXTERN McHorVer20WidthEq4_mmx
-
-
-ALIGN 16
-;*******************************************************************************
-; void_t McHorVer20WidthEq4_mmx( uint8_t *pSrc,
-; int iSrcStride,
-; uint8_t *pDst,
-; int iDstStride,
-; int iHeight)
-;*******************************************************************************
-McHorVer20WidthEq4_mmx:
- push esi
- push edi
-
- mov esi, [esp+12]
- mov eax, [esp+16]
- mov edi, [esp+20]
- mov ecx, [esp+24]
- mov edx, [esp+28]
- sub esi, 2
- WELS_Zero mm7
- movq mm6, [h264_w0x10]
-.height_loop:
- movd mm0, [esi]
- punpcklbw mm0, mm7
- movd mm1, [esi+5]
- punpcklbw mm1, mm7
- movd mm2, [esi+1]
- punpcklbw mm2, mm7
- movd mm3, [esi+4]
- punpcklbw mm3, mm7
- movd mm4, [esi+2]
- punpcklbw mm4, mm7
- movd mm5, [esi+3]
- punpcklbw mm5, mm7
-
- paddw mm2, mm3
- paddw mm4, mm5
- psllw mm4, 2
- psubw mm4, mm2
- paddw mm0, mm1
- paddw mm0, mm4
- psllw mm4, 2
- paddw mm0, mm4
- paddw mm0, mm6
- psraw mm0, 5
- packuswb mm0, mm7
- movd [edi], mm0
-
- add esi, eax
- add edi, ecx
- dec edx
- jnz .height_loop
-
- WELSEMMS
- pop edi
- pop esi
- ret
-
-;*******************************************************************************
-; Macros and other preprocessor constants
-;*******************************************************************************
-
-
-%macro SSE_LOAD_8P 3
- movq %1, %3
- punpcklbw %1, %2
-%endmacro
-
-%macro FILTER_HV_W8 9
- paddw %1, %6
- movdqa %8, %3
- movdqa %7, %2
- paddw %1, [h264_w0x10_1]
- paddw %8, %4
- paddw %7, %5
- psllw %8, 2
- psubw %8, %7
- paddw %1, %8
- psllw %8, 2
- paddw %1, %8
- psraw %1, 5
- WELS_Zero %8
- packuswb %1, %8
- movq %9, %1
-%endmacro
-
-;*******************************************************************************
-; Local Data (Read Only)
-;*******************************************************************************
-
-SECTION .rodata align=16
-
-;*******************************************************************************
-; Various memory constants (trigonometric values or rounding values)
-;*******************************************************************************
-
-ALIGN 16
-h264_w0x10_1:
- dw 16, 16, 16, 16, 16, 16, 16, 16
-ALIGN 16
-h264_mc_hc_32:
-dw 32, 32, 32, 32, 32, 32, 32, 32
-;*******************************************************************************
-; Code
-;*******************************************************************************
-
-SECTION .text
-WELS_EXTERN McHorVer22Width8HorFirst_sse2
-WELS_EXTERN McHorVer22VerLast_sse2
-WELS_EXTERN McHorVer02WidthEq8_sse2
-WELS_EXTERN McHorVer20WidthEq8_sse2
-WELS_EXTERN McHorVer20WidthEq16_sse2
-
-ALIGN 16
-;***********************************************************************
-; void_t McHorVer22Width8HorFirst_sse2(int16_t *pSrc,
-; int16_t iSrcStride,
-; uint8_t *pDst,
-; int32_t iDstStride
-; int32_t iHeight
-; )
-;***********************************************************************
-McHorVer22Width8HorFirst_sse2:
- push esi
- push edi
- push ebx
- mov esi, [esp+16] ;pSrc
- mov eax, [esp+20] ;iSrcStride
- mov edi, [esp+24] ;pDst
- mov edx, [esp+28] ;iDstStride
- mov ebx, [esp+32] ;iHeight
- pxor xmm7, xmm7
-
- sub esi, eax ;;;;;;;;need more 5 lines.
- sub esi, eax
-
-.yloop_width_8:
- movq xmm0, [esi]
- punpcklbw xmm0, xmm7
- movq xmm1, [esi+5]
- punpcklbw xmm1, xmm7
- movq xmm2, [esi+1]
- punpcklbw xmm2, xmm7
- movq xmm3, [esi+4]
- punpcklbw xmm3, xmm7
- movq xmm4, [esi+2]
- punpcklbw xmm4, xmm7
- movq xmm5, [esi+3]
- punpcklbw xmm5, xmm7
-
- paddw xmm2, xmm3
- paddw xmm4, xmm5
- psllw xmm4, 2
- psubw xmm4, xmm2
- paddw xmm0, xmm1
- paddw xmm0, xmm4
- psllw xmm4, 2
- paddw xmm0, xmm4
- movdqa [edi], xmm0
-
- add esi, eax
- add edi, edx
- dec ebx
- jnz .yloop_width_8
- pop ebx
- pop edi
- pop esi
- ret
-
-ALIGN 16
-;***********************************************************************
-;void_t McHorVer22VerLast_sse2(
-; uint8_t *pSrc,
-; int32_t pSrcStride,
-; uint8_t * pDst,
-; int32_t iDstStride,
-; int32_t iWidth,
-; int32_t iHeight);
-;***********************************************************************
-
-%macro FILTER_VER 9
- paddw %1, %6
- movdqa %7, %2
- movdqa %8, %3
-
-
- paddw %7, %5
- paddw %8, %4
-
- psubw %1, %7
- psraw %1, 2
- paddw %1, %8
- psubw %1, %7
- psraw %1, 2
- paddw %8, %1
- paddw %8, [h264_mc_hc_32]
- psraw %8, 6
- packuswb %8, %8
- movq %9, %8
-%endmacro
-
-McHorVer22VerLast_sse2:
- push esi
- push edi
- push ebx
- push ebp
-
- mov esi, [esp+20]
- mov eax, [esp+24]
- mov edi, [esp+28]
- mov edx, [esp+32]
- mov ebx, [esp+36]
- mov ecx, [esp+40]
- shr ebx, 3
-
-.width_loop:
- movdqa xmm0, [esi]
- movdqa xmm1, [esi+eax]
- lea esi, [esi+2*eax]
- movdqa xmm2, [esi]
- movdqa xmm3, [esi+eax]
- lea esi, [esi+2*eax]
- movdqa xmm4, [esi]
- movdqa xmm5, [esi+eax]
-
- FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
- dec ecx
- lea esi, [esi+2*eax]
- movdqa xmm6, [esi]
-
- movdqa xmm0, xmm1
- movdqa xmm1, xmm2
- movdqa xmm2, xmm3
- movdqa xmm3, xmm4
- movdqa xmm4, xmm5
- movdqa xmm5, xmm6
-
- add edi, edx
- sub esi, eax
-
-.start:
- FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
- dec ecx
- jz near .x_loop_dec
-
- lea esi, [esi+2*eax]
- movdqa xmm6, [esi]
- FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[edi+edx]
- dec ecx
- jz near .x_loop_dec
-
- lea edi, [edi+2*edx]
- movdqa xmm7, [esi+eax]
- FILTER_VER xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [edi]
- dec ecx
- jz near .x_loop_dec
-
- lea esi, [esi+2*eax]
- movdqa xmm0, [esi]
- FILTER_VER xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[edi+edx]
- dec ecx
- jz near .x_loop_dec
-
- lea edi, [edi+2*edx]
- movdqa xmm1, [esi+eax]
- FILTER_VER xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[edi]
- dec ecx
- jz near .x_loop_dec
-
- lea esi, [esi+2*eax]
- movdqa xmm2, [esi]
- FILTER_VER xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[edi+edx]
- dec ecx
- jz near .x_loop_dec
-
- lea edi, [edi+2*edx]
- movdqa xmm3, [esi+eax]
- FILTER_VER xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[edi]
- dec ecx
- jz near .x_loop_dec
-
- lea esi, [esi+2*eax]
- movdqa xmm4, [esi]
- FILTER_VER xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [edi+edx]
- dec ecx
- jz near .x_loop_dec
-
- lea edi, [edi+2*edx]
- movdqa xmm5, [esi+eax]
- jmp near .start
-
-.x_loop_dec:
- dec ebx
- jz near .exit
- mov esi, [esp+20]
- mov edi, [esp+28]
- mov ecx, [esp+40]
- add esi, 16
- add edi, 8
- jmp .width_loop
-
-
-
-.exit:
- pop ebp
- pop ebx
- pop edi
- pop esi
- ret
-
-
-ALIGN 16
-;*******************************************************************************
-; void_t McHorVer20WidthEq8_sse2( uint8_t *pSrc,
-; int iSrcStride,
-; uint8_t *pDst,
-; int iDstStride,
-; int iHeight,
-; );
-;*******************************************************************************
-McHorVer20WidthEq8_sse2:
- push esi
- push edi
-
- mov esi, [esp + 12] ;pSrc
- mov eax, [esp + 16] ;iSrcStride
- mov edi, [esp + 20] ;pDst
- mov ecx, [esp + 28] ;iHeight
- mov edx, [esp + 24] ;iDstStride
-
- lea esi, [esi-2] ;pSrc -= 2;
-
- pxor xmm7, xmm7
- movdqa xmm6, [h264_w0x10_1]
-.y_loop:
- movq xmm0, [esi]
- punpcklbw xmm0, xmm7
- movq xmm1, [esi+5]
- punpcklbw xmm1, xmm7
- movq xmm2, [esi+1]
- punpcklbw xmm2, xmm7
- movq xmm3, [esi+4]
- punpcklbw xmm3, xmm7
- movq xmm4, [esi+2]
- punpcklbw xmm4, xmm7
- movq xmm5, [esi+3]
- punpcklbw xmm5, xmm7
-
- paddw xmm2, xmm3
- paddw xmm4, xmm5
- psllw xmm4, 2
- psubw xmm4, xmm2
- paddw xmm0, xmm1
- paddw xmm0, xmm4
- psllw xmm4, 2
- paddw xmm0, xmm4
- paddw xmm0, xmm6
- psraw xmm0, 5
-
- packuswb xmm0, xmm7
- movq [edi], xmm0
-
- lea edi, [edi+edx]
- lea esi, [esi+eax]
- dec ecx
- jnz near .y_loop
-
- pop edi
- pop esi
- ret
-
-ALIGN 16
-;*******************************************************************************
-; void_t McHorVer20WidthEq16_sse2( uint8_t *pSrc,
-; int iSrcStride,
-; uint8_t *pDst,
-; int iDstStride,
-; int iHeight,
-; );
-;*******************************************************************************
-McHorVer20WidthEq16_sse2:
- push esi
- push edi
-
-
- mov esi, [esp + 12] ;pSrc
- mov eax, [esp + 16] ;iSrcStride
- mov edi, [esp + 20] ;pDst
- mov ecx, [esp + 28] ;iHeight
- mov edx, [esp + 24] ;iDstStride
-
- lea esi, [esi-2] ;pSrc -= 2;
-
- pxor xmm7, xmm7
- movdqa xmm6, [h264_w0x10_1]
-.y_loop:
-
- movq xmm0, [esi]
- punpcklbw xmm0, xmm7
- movq xmm1, [esi+5]
- punpcklbw xmm1, xmm7
- movq xmm2, [esi+1]
- punpcklbw xmm2, xmm7
- movq xmm3, [esi+4]
- punpcklbw xmm3, xmm7
- movq xmm4, [esi+2]
- punpcklbw xmm4, xmm7
- movq xmm5, [esi+3]
- punpcklbw xmm5, xmm7
-
- paddw xmm2, xmm3
- paddw xmm4, xmm5
- psllw xmm4, 2
- psubw xmm4, xmm2
- paddw xmm0, xmm1
- paddw xmm0, xmm4
- psllw xmm4, 2
- paddw xmm0, xmm4
- paddw xmm0, xmm6
- psraw xmm0, 5
- packuswb xmm0, xmm7
- movq [edi], xmm0
-
- movq xmm0, [esi+8]
- punpcklbw xmm0, xmm7
- movq xmm1, [esi+5+8]
- punpcklbw xmm1, xmm7
- movq xmm2, [esi+1+8]
- punpcklbw xmm2, xmm7
- movq xmm3, [esi+4+8]
- punpcklbw xmm3, xmm7
- movq xmm4, [esi+2+8]
- punpcklbw xmm4, xmm7
- movq xmm5, [esi+3+8]
- punpcklbw xmm5, xmm7
-
- paddw xmm2, xmm3
- paddw xmm4, xmm5
- psllw xmm4, 2
- psubw xmm4, xmm2
- paddw xmm0, xmm1
- paddw xmm0, xmm4
- psllw xmm4, 2
- paddw xmm0, xmm4
- paddw xmm0, xmm6
- psraw xmm0, 5
- packuswb xmm0, xmm7
- movq [edi+8], xmm0
-
- lea edi, [edi+edx]
- lea esi, [esi+eax]
- dec ecx
- jnz near .y_loop
- pop edi
- pop esi
- ret
-
-
-;*******************************************************************************
-; void_t McHorVer02WidthEq8_sse2( uint8_t *pSrc,
-; int iSrcStride,
-; uint8_t *pDst,
-; int iDstStride,
-; int iHeight )
-;*******************************************************************************
-ALIGN 16
-McHorVer02WidthEq8_sse2:
- push esi
- push edi
-
- mov esi, [esp + 12] ;pSrc
- mov edx, [esp + 16] ;iSrcStride
- mov edi, [esp + 20] ;pDst
- mov eax, [esp + 24] ;iDstStride
- mov ecx, [esp + 28] ;iHeight
-
- sub esi, edx
- sub esi, edx
-
- WELS_Zero xmm7
-
- SSE_LOAD_8P xmm0, xmm7, [esi]
- SSE_LOAD_8P xmm1, xmm7, [esi+edx]
- lea esi, [esi+2*edx]
- SSE_LOAD_8P xmm2, xmm7, [esi]
- SSE_LOAD_8P xmm3, xmm7, [esi+edx]
- lea esi, [esi+2*edx]
- SSE_LOAD_8P xmm4, xmm7, [esi]
- SSE_LOAD_8P xmm5, xmm7, [esi+edx]
-
-.start:
- FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
- dec ecx
- jz near .xx_exit
-
- lea esi, [esi+2*edx]
- SSE_LOAD_8P xmm6, xmm7, [esi]
- FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [edi+eax]
- dec ecx
- jz near .xx_exit
-
- lea edi, [edi+2*eax]
- SSE_LOAD_8P xmm7, xmm0, [esi+edx]
- FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [edi]
- dec ecx
- jz near .xx_exit
-
- lea esi, [esi+2*edx]
- SSE_LOAD_8P xmm0, xmm1, [esi]
- FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [edi+eax]
- dec ecx
- jz near .xx_exit
-
- lea edi, [edi+2*eax]
- SSE_LOAD_8P xmm1, xmm2, [esi+edx]
- FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [edi]
- dec ecx
- jz near .xx_exit
-
- lea esi, [esi+2*edx]
- SSE_LOAD_8P xmm2, xmm3, [esi]
- FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [edi+eax]
- dec ecx
- jz near .xx_exit
-
- lea edi, [edi+2*eax]
- SSE_LOAD_8P xmm3, xmm4, [esi+edx]
- FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [edi]
- dec ecx
- jz near .xx_exit
-
- lea esi, [esi+2*edx]
- SSE_LOAD_8P xmm4, xmm5, [esi]
- FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [edi+eax]
- dec ecx
- jz near .xx_exit
-
- lea edi, [edi+2*eax]
- SSE_LOAD_8P xmm5, xmm6, [esi+edx]
- jmp near .start
-
-.xx_exit:
- pop edi
- pop esi
- ret
-
-
--- a/codec/decoder/core/asm/memzero.asm
+++ /dev/null
@@ -1,135 +1,0 @@
-;*!
-;* \copy
-;* Copyright (c) 2009-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* memzero.asm
-;*
-;* Abstract
-;*
-;*
-;* History
-;* 9/16/2009 Created
-;*
-;*
-;*************************************************************************/
-
-BITS 32
-
-%include "asm_inc.asm"
-;***********************************************************************
-; Code
-;***********************************************************************
-
-SECTION .text
-
-ALIGN 16
-;***********************************************************************
-;_inline void __cdecl WelsPrefetchZero_mmx(int8_t const*_A);
-;***********************************************************************
-WELS_EXTERN WelsPrefetchZero_mmx
-WelsPrefetchZero_mmx:
- mov eax,[esp+4]
- prefetchnta [eax]
- ret
-
-
-ALIGN 16
-;***********************************************************************
-; void WelsSetMemZeroAligned64_sse2(void *dst, int32_t size)
-;***********************************************************************
-WELS_EXTERN WelsSetMemZeroAligned64_sse2
-WelsSetMemZeroAligned64_sse2:
- mov eax, [esp + 4] ; dst
- mov ecx, [esp + 8]
- neg ecx
-
- pxor xmm0, xmm0
-.memzeroa64_sse2_loops:
- movdqa [eax], xmm0
- movdqa [eax+16], xmm0
- movdqa [eax+32], xmm0
- movdqa [eax+48], xmm0
- add eax, 0x40
-
- add ecx, 0x40
- jnz near .memzeroa64_sse2_loops
-
- ret
-
-ALIGN 16
-;***********************************************************************
-; void WelsSetMemZeroSize64_mmx(void *dst, int32_t size)
-;***********************************************************************
-WELS_EXTERN WelsSetMemZeroSize64_mmx
-WelsSetMemZeroSize64_mmx:
- mov eax, [esp + 4] ; dst
- mov ecx, [esp + 8]
- neg ecx
-
- pxor mm0, mm0
-.memzero64_mmx_loops:
- movq [eax], mm0
- movq [eax+8], mm0
- movq [eax+16], mm0
- movq [eax+24], mm0
- movq [eax+32], mm0
- movq [eax+40], mm0
- movq [eax+48], mm0
- movq [eax+56], mm0
- add eax, 0x40
-
- add ecx, 0x40
- jnz near .memzero64_mmx_loops
-
- WELSEMMS
- ret
-
-ALIGN 16
-;***********************************************************************
-; void WelsSetMemZeroSize8_mmx(void *dst, int32_t size)
-;***********************************************************************
-WELS_EXTERN WelsSetMemZeroSize8_mmx
-WelsSetMemZeroSize8_mmx:
- mov eax, [esp + 4] ; dst
- mov ecx, [esp + 8] ; size
- neg ecx
- pxor mm0, mm0
-
-.memzero8_mmx_loops:
- movq [eax], mm0
- add eax, 0x08
-
- add ecx, 0x08
- jnz near .memzero8_mmx_loops
-
- WELSEMMS
- ret
-
-
--- a/codec/decoder/core/inc/mc.h
+++ b/codec/decoder/core/inc/mc.h
@@ -78,7 +78,7 @@
int32_t iHeight);
extern void_t McHorVer22Width8HorFirst_sse2 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iHeight);
-extern void_t McHorVer22VerLast_sse2 (uint8_t* pTap, int32_t iTapStride, uint8_t* pDst, int32_t iDstStride,
+extern void_t McHorVer22Width8VerLastAlign_sse2 (uint8_t* pTap, int32_t iTapStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth, int32_t iHeight);
extern void_t PixelAvgWidthEq16_sse2 (uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, int32_t iSrcAStride,
uint8_t* pSrcB, int32_t iSrcBStride, int32_t iHeight);
--- a/codec/decoder/core/src/mc.cpp
+++ b/codec/decoder/core/src/mc.cpp
@@ -362,7 +362,7 @@
int32_t iHeight) {
ENFORCE_STACK_ALIGN_2D (int16_t, iTap, 21, 8, 16)
McHorVer22Width8HorFirst_sse2 (pSrc - 2, iSrcStride, (uint8_t*)iTap, 16, iHeight + 5);
- McHorVer22VerLast_sse2 ((uint8_t*)iTap, 16, pDst, iDstStride, 8, iHeight);
+ McHorVer22Width8VerLastAlign_sse2 ((uint8_t*)iTap, 16, pDst, iDstStride, 8, iHeight);
}
static inline void_t McHorVer02WidthEq16_sse2 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
--- a/codec/decoder/targets.mk
+++ b/codec/decoder/targets.mk
@@ -28,17 +28,9 @@
DECODER_OBJS += $(DECODER_CPP_SRCS:.cpp=.o)
ifeq ($(USE_ASM), Yes)
DECODER_ASM_SRCS=\
- $(DECODER_SRCDIR)/./core/asm/asm_inc.asm\
$(DECODER_SRCDIR)/./core/asm/block_add.asm\
- $(DECODER_SRCDIR)/./core/asm/cpuid.asm\
$(DECODER_SRCDIR)/./core/asm/dct.asm\
- $(DECODER_SRCDIR)/./core/asm/deblock.asm\
- $(DECODER_SRCDIR)/./core/asm/expand_picture.asm\
$(DECODER_SRCDIR)/./core/asm/intra_pred.asm\
- $(DECODER_SRCDIR)/./core/asm/mb_copy.asm\
- $(DECODER_SRCDIR)/./core/asm/mc_chroma.asm\
- $(DECODER_SRCDIR)/./core/asm/mc_luma.asm\
- $(DECODER_SRCDIR)/./core/asm/memzero.asm\
DECODER_OBJS += $(DECODER_ASM_SRCS:.asm=.o)
endif
@@ -113,38 +105,14 @@
$(DECODER_SRCDIR)/./plus/src/welsDecoderExt.o: $(DECODER_SRCDIR)/./plus/src/welsDecoderExt.cpp
$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(DECODER_CFLAGS) $(DECODER_INCLUDES) -c -o $(DECODER_SRCDIR)/./plus/src/welsDecoderExt.o $(DECODER_SRCDIR)/./plus/src/welsDecoderExt.cpp
-$(DECODER_SRCDIR)/./core/asm/asm_inc.o: $(DECODER_SRCDIR)/./core/asm/asm_inc.asm
- $(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(DECODER_ASMFLAGS) $(DECODER_ASM_INCLUDES) -o $(DECODER_SRCDIR)/./core/asm/asm_inc.o $(DECODER_SRCDIR)/./core/asm/asm_inc.asm
-
$(DECODER_SRCDIR)/./core/asm/block_add.o: $(DECODER_SRCDIR)/./core/asm/block_add.asm
$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(DECODER_ASMFLAGS) $(DECODER_ASM_INCLUDES) -o $(DECODER_SRCDIR)/./core/asm/block_add.o $(DECODER_SRCDIR)/./core/asm/block_add.asm
-$(DECODER_SRCDIR)/./core/asm/cpuid.o: $(DECODER_SRCDIR)/./core/asm/cpuid.asm
- $(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(DECODER_ASMFLAGS) $(DECODER_ASM_INCLUDES) -o $(DECODER_SRCDIR)/./core/asm/cpuid.o $(DECODER_SRCDIR)/./core/asm/cpuid.asm
-
$(DECODER_SRCDIR)/./core/asm/dct.o: $(DECODER_SRCDIR)/./core/asm/dct.asm
$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(DECODER_ASMFLAGS) $(DECODER_ASM_INCLUDES) -o $(DECODER_SRCDIR)/./core/asm/dct.o $(DECODER_SRCDIR)/./core/asm/dct.asm
-$(DECODER_SRCDIR)/./core/asm/deblock.o: $(DECODER_SRCDIR)/./core/asm/deblock.asm
- $(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(DECODER_ASMFLAGS) $(DECODER_ASM_INCLUDES) -o $(DECODER_SRCDIR)/./core/asm/deblock.o $(DECODER_SRCDIR)/./core/asm/deblock.asm
-
-$(DECODER_SRCDIR)/./core/asm/expand_picture.o: $(DECODER_SRCDIR)/./core/asm/expand_picture.asm
- $(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(DECODER_ASMFLAGS) $(DECODER_ASM_INCLUDES) -o $(DECODER_SRCDIR)/./core/asm/expand_picture.o $(DECODER_SRCDIR)/./core/asm/expand_picture.asm
-
$(DECODER_SRCDIR)/./core/asm/intra_pred.o: $(DECODER_SRCDIR)/./core/asm/intra_pred.asm
$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(DECODER_ASMFLAGS) $(DECODER_ASM_INCLUDES) -o $(DECODER_SRCDIR)/./core/asm/intra_pred.o $(DECODER_SRCDIR)/./core/asm/intra_pred.asm
-
-$(DECODER_SRCDIR)/./core/asm/mb_copy.o: $(DECODER_SRCDIR)/./core/asm/mb_copy.asm
- $(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(DECODER_ASMFLAGS) $(DECODER_ASM_INCLUDES) -o $(DECODER_SRCDIR)/./core/asm/mb_copy.o $(DECODER_SRCDIR)/./core/asm/mb_copy.asm
-
-$(DECODER_SRCDIR)/./core/asm/mc_chroma.o: $(DECODER_SRCDIR)/./core/asm/mc_chroma.asm
- $(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(DECODER_ASMFLAGS) $(DECODER_ASM_INCLUDES) -o $(DECODER_SRCDIR)/./core/asm/mc_chroma.o $(DECODER_SRCDIR)/./core/asm/mc_chroma.asm
-
-$(DECODER_SRCDIR)/./core/asm/mc_luma.o: $(DECODER_SRCDIR)/./core/asm/mc_luma.asm
- $(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(DECODER_ASMFLAGS) $(DECODER_ASM_INCLUDES) -o $(DECODER_SRCDIR)/./core/asm/mc_luma.o $(DECODER_SRCDIR)/./core/asm/mc_luma.asm
-
-$(DECODER_SRCDIR)/./core/asm/memzero.o: $(DECODER_SRCDIR)/./core/asm/memzero.asm
- $(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(DECODER_ASMFLAGS) $(DECODER_ASM_INCLUDES) -o $(DECODER_SRCDIR)/./core/asm/memzero.o $(DECODER_SRCDIR)/./core/asm/memzero.asm
$(LIBPREFIX)decoder.$(LIBSUFFIX): $(DECODER_OBJS)
rm -f $(LIBPREFIX)decoder.$(LIBSUFFIX)
--- a/codec/encoder/core/asm/asm_inc.asm
+++ /dev/null
@@ -1,235 +1,0 @@
-;*!
-;* \copy
-;* Copyright (c) 2009-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* sse2inc.asm
-;*
-;* Abstract
-;* macro and constant
-;*
-;* History
-;* 8/5/2009 Created
-;*
-;*
-;*************************************************************************/
-;***********************************************************************
-; Options, for DEBUG
-;***********************************************************************
-
-%if 1
- %define MOVDQ movdqa
-%else
- %define MOVDQ movdqu
-%endif
-
-%if 1
- %define WELSEMMS emms
-%else
- %define WELSEMMS
-%endif
-
-BITS 32
-
-;***********************************************************************
-; Macros
-;***********************************************************************
-
-%macro WELS_EXTERN 1
- %ifdef PREFIX
- global _%1
- %define %1 _%1
- %else
- global %1
- %endif
-%endmacro
-
-%macro WELS_AbsW 2
- pxor %2, %2
- psubw %2, %1
- pmaxsw %1, %2
-%endmacro
-
-%macro MMX_XSwap 4
- movq %4, %2
- punpckh%1 %4, %3
- punpckl%1 %2, %3
-%endmacro
-
-; pOut mm1, mm4, mm5, mm3
-%macro MMX_Trans4x4W 5
- MMX_XSwap wd, %1, %2, %5
- MMX_XSwap wd, %3, %4, %2
- MMX_XSwap dq, %1, %3, %4
- MMX_XSwap dq, %5, %2, %3
-%endmacro
-
-;for TRANSPOSE
-%macro SSE2_XSawp 4
- movdqa %4, %2
- punpckl%1 %2, %3
- punpckh%1 %4, %3
-%endmacro
-
-; in: xmm1, xmm2, xmm3, xmm4 pOut: xmm1, xmm4, xmm5, mm3
-%macro SSE2_Trans4x4D 5
- SSE2_XSawp dq, %1, %2, %5
- SSE2_XSawp dq, %3, %4, %2
- SSE2_XSawp qdq, %1, %3, %4
- SSE2_XSawp qdq, %5, %2, %3
-%endmacro
-
-;in: xmm0, xmm1, xmm2, xmm3 pOut: xmm0, xmm1, xmm3, xmm4
-%macro SSE2_TransTwo4x4W 5
- SSE2_XSawp wd, %1, %2, %5
- SSE2_XSawp wd, %3, %4, %2
- SSE2_XSawp dq, %1, %3, %4
- SSE2_XSawp dq, %5, %2, %3
- SSE2_XSawp qdq, %1, %5, %2
- SSE2_XSawp qdq, %4, %3, %5
-%endmacro
-
-;in: m1, m2, m3, m4, m5, m6, m7, m8
-;pOut: m5, m3, m4, m8, m6, m2, m7, m1
-%macro SSE2_TransTwo8x8B 9
- movdqa %9, %8
- SSE2_XSawp bw, %1, %2, %8
- SSE2_XSawp bw, %3, %4, %2
- SSE2_XSawp bw, %5, %6, %4
- movdqa %6, %9
- movdqa %9, %4
- SSE2_XSawp bw, %7, %6, %4
-
- SSE2_XSawp wd, %1, %3, %6
- SSE2_XSawp wd, %8, %2, %3
- SSE2_XSawp wd, %5, %7, %2
- movdqa %7, %9
- movdqa %9, %3
- SSE2_XSawp wd, %7, %4, %3
-
- SSE2_XSawp dq, %1, %5, %4
- SSE2_XSawp dq, %6, %2, %5
- SSE2_XSawp dq, %8, %7, %2
- movdqa %7, %9
- movdqa %9, %5
- SSE2_XSawp dq, %7, %3, %5
-
- SSE2_XSawp qdq, %1, %8, %3
- SSE2_XSawp qdq, %4, %2, %8
- SSE2_XSawp qdq, %6, %7, %2
- movdqa %7, %9
- movdqa %9, %1
- SSE2_XSawp qdq, %7, %5, %1
- movdqa %5, %9
-%endmacro
-
-;xmm0, xmm6, xmm7, [eax], [ecx]
-;xmm7 = 0, eax = pix1, ecx = pix2, xmm0 save the result
-%macro SSE2_LoadDiff8P 5
- movq %1, %4
- punpcklbw %1, %3
- movq %2, %5
- punpcklbw %2, %3
- psubw %1, %2
-%endmacro
-
-; m2 = m1 + m2, m1 = m1 - m2
-%macro SSE2_SumSub 3
- movdqa %3, %2
- paddw %2, %1
- psubw %1, %3
-%endmacro
-
-
-%macro butterfly_1to16_sse 3 ; xmm? for dst, xmm? for tmp, one byte for pSrc [generic register name: a/b/c/d]
- mov %3h, %3l
- movd %1, e%3x ; i.e, 1% = eax (=b0)
- pshuflw %2, %1, 00h ; ..., b0 b0 b0 b0 b0 b0 b0 b0
- pshufd %1, %2, 00h ; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0
-%endmacro
-
-;copy a dw into a xmm for 8 times
-%macro SSE2_Copy8Times 2
- movd %1, %2
- punpcklwd %1, %1
- pshufd %1, %1, 0
-%endmacro
-
-;copy a db into a xmm for 16 times
-%macro SSE2_Copy16Times 2
- movd %1, %2
- pshuflw %1, %1, 0
- punpcklqdq %1, %1
- packuswb %1, %1
-%endmacro
-
-
-
-;***********************************************************************
-;preprocessor constants
-;***********************************************************************
-;dw 32,32,32,32,32,32,32,32 for xmm
-;dw 32,32,32,32 for mm
-%macro WELS_DW32 1
- pcmpeqw %1,%1
- psrlw %1,15
- psllw %1,5
-%endmacro
-
-;dw 1, 1, 1, 1, 1, 1, 1, 1 for xmm
-;dw 1, 1, 1, 1 for mm
-%macro WELS_DW1 1
- pcmpeqw %1,%1
- psrlw %1,15
-%endmacro
-
-;all 0 for xmm and mm
-%macro WELS_Zero 1
- pxor %1, %1
-%endmacro
-
-;dd 1, 1, 1, 1 for xmm
-;dd 1, 1 for mm
-%macro WELS_DD1 1
- pcmpeqw %1,%1
- psrld %1,31
-%endmacro
-
-;dB 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
-%macro WELS_DB1 1
- pcmpeqw %1,%1
- psrlw %1,15
- packuswb %1,%1
-%endmacro
-
-
-
-
-
-
--- a/codec/encoder/core/asm/coeff.asm
+++ b/codec/encoder/core/asm/coeff.asm
@@ -44,7 +44,7 @@
-
+%ifdef X86_32
SECTION .rodata align=16
align 16
@@ -456,4 +456,5 @@
pop esi
pop edi
pop ebx
- ret
\ No newline at end of file
+ ret
+%endif
--- a/codec/encoder/core/asm/cpuid.asm
+++ /dev/null
@@ -1,169 +1,0 @@
-;*!
-;* \copy
-;* Copyright (c) 2009-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* cpu_mmx.asm
-;*
-;* Abstract
-;* verify cpuid feature support and cpuid detection
-;*
-;* History
-;* 04/29/2009 Created
-;*
-;*************************************************************************/
-
-bits 32
-
-;******************************************************************************************
-; Macros
-;******************************************************************************************
-
-%macro WELS_EXTERN 1
- %ifdef PREFIX
- global _%1
- %define %1 _%1
- %else
- global %1
- %endif
-%endmacro
-
-;******************************************************************************************
-; Code
-;******************************************************************************************
-
-SECTION .text
-
-; refer to "The IA-32 Intel(R) Architecture Software Developers Manual, Volume 2A A-M"
-; section CPUID - CPU Identification
-
-WELS_EXTERN WelsCPUIdVerify
-ALIGN 16
-;******************************************************************************************
-; int32_t WelsCPUIdVerify()
-;******************************************************************************************
-WelsCPUIdVerify:
- pushfd ; decrease the SP by 4 and load EFLAGS register onto stack, pushfd 32 bit and pushf for 16 bit
- pushfd ; need push 2 EFLAGS, one for processing and the another one for storing purpose
- pop ecx ; get EFLAGS to bit manipulation
- mov eax, ecx ; store into ecx followed
- xor eax, 00200000h ; get ID flag (bit 21) of EFLAGS to directly indicate cpuid support or not
- xor eax, ecx ; get the ID flag bitwise, eax - 0: not support; otherwise: support
- popfd ; store back EFLAGS and keep unchanged for system
- ret
-
-WELS_EXTERN WelsCPUId
-ALIGN 16
-;****************************************************************************************************
-; void WelsCPUId( int32_t uiIndex, int32_t *pFeatureA, int32_t *pFeatureB, int32_t *pFeatureC, int32_t *pFeatureD )
-;****************************************************************************************************
-WelsCPUId:
- push ebx
- push edi
-
- mov eax, [esp+12] ; operating index
- cpuid ; cpuid
-
- ; processing various information return
- mov edi, [esp+16]
- mov [edi], eax
- mov edi, [esp+20]
- mov [edi], ebx
- mov edi, [esp+24]
- mov [edi], ecx
- mov edi, [esp+28]
- mov [edi], edx
-
- pop edi
- pop ebx
- ret
-
-WELS_EXTERN WelsCPUSupportAVX
-; need call after cpuid=1 and eax, ecx flag got then
-ALIGN 16
-;****************************************************************************************************
-; int32_t WelsCPUSupportAVX( uint32_t eax, uint32_t ecx )
-;****************************************************************************************************
-WelsCPUSupportAVX:
- mov eax, [esp+4]
- mov ecx, [esp+8]
-
- ; refer to detection of AVX addressed in INTEL AVX manual document
- and ecx, 018000000H
- cmp ecx, 018000000H ; check both OSXSAVE and AVX feature flags
- jne avx_not_supported
- ; processor supports AVX instructions and XGETBV is enabled by OS
- mov ecx, 0 ; specify 0 for XFEATURE_ENABLED_MASK register
- XGETBV ; result in EDX:EAX
- and eax, 06H
- cmp eax, 06H ; check OS has enabled both XMM and YMM state support
- jne avx_not_supported
- mov eax, 1
- ret
-avx_not_supported:
- mov eax, 0
- ret
-
-WELS_EXTERN WelsCPUSupportFMA
-; need call after cpuid=1 and eax, ecx flag got then
-ALIGN 16
-;****************************************************************************************************
-; int32_t WelsCPUSupportFMA( uint32_t eax, uint32_t ecx )
-;****************************************************************************************************
-WelsCPUSupportFMA:
- mov eax, [esp+4]
- mov ecx, [esp+8]
-
- ; refer to detection of FMA addressed in INTEL AVX manual document
- and ecx, 018001000H
- cmp ecx, 018001000H ; check OSXSAVE, AVX, FMA feature flags
- jne fma_not_supported
- ; processor supports AVX,FMA instructions and XGETBV is enabled by OS
- mov ecx, 0 ; specify 0 for XFEATURE_ENABLED_MASK register
- XGETBV ; result in EDX:EAX
- and eax, 06H
- cmp eax, 06H ; check OS has enabled both XMM and YMM state support
- jne fma_not_supported
- mov eax, 1
- ret
-fma_not_supported:
- mov eax, 0
- ret
-
-WELS_EXTERN WelsEmms
-ALIGN 16
-;******************************************************************************************
-; void WelsEmms()
-;******************************************************************************************
-WelsEmms:
- emms ; empty mmx technology states
- ret
-
-
-
--- a/codec/encoder/core/asm/dct.asm
+++ b/codec/encoder/core/asm/dct.asm
@@ -42,8 +42,6 @@
%include "asm_inc.asm"
-BITS 32
-
SECTION .rodata align=16
;***********************************************************************
@@ -131,7 +129,7 @@
packuswb %1, %2
movd %5, %1
%endmacro
-
+SECTION .text
ALIGN 16
;***********************************************************************
; void __cdecl WelsDctT4_mmx( int16_t *pDct[4], uint8_t *pix1, int32_t i_pix1, uint8_t *pix2, int32_t i_pix2 )
@@ -138,15 +136,20 @@
;***********************************************************************
WELS_EXTERN WelsDctT4_mmx
WelsDctT4_mmx:
- push ebx
- mov eax, [esp+12] ; pix1
- mov ebx, [esp+16] ; i_pix1
- mov ecx, [esp+20] ; pix2
- mov edx, [esp+24] ; i_pix2
-
+ ;push ebx
+ ;mov eax, [esp+12] ; pix1
+ ;mov ebx, [esp+16] ; i_pix1
+ ;mov ecx, [esp+20] ; pix2
+ ;mov edx, [esp+24] ; i_pix2
+ %assign push_num 0
+ LOAD_5_PARA
+ %ifndef X86_32
+ movsx r2, r2d
+ movsx r4, r4d
+ %endif
WELS_Zero mm7
- MMX_LoadDiff4x4P mm1, mm2, mm3, mm4, eax, ebx, ecx, edx, mm0, mm7
+ MMX_LoadDiff4x4P mm1, mm2, mm3, mm4, r1, r2, r3, r4, mm0, mm7
MMX_DCT mm1, mm2, mm3 ,mm4, mm5, mm6
MMX_Trans4x4W mm3, mm1, mm4, mm5, mm2
@@ -154,14 +157,14 @@
MMX_DCT mm3, mm5, mm2 ,mm4, mm1, mm6
MMX_Trans4x4W mm2, mm3, mm4, mm1, mm5
- mov eax, [esp+ 8] ; pDct
- movq [eax+ 0], mm2
- movq [eax+ 8], mm1
- movq [eax+16], mm5
- movq [eax+24], mm4
-
- WELSEMMS
- pop ebx
+ ;mov eax, [esp+ 8] ; pDct
+ movq [r0+ 0], mm2
+ movq [r0+ 8], mm1
+ movq [r0+16], mm5
+ movq [r0+24], mm4
+ WELSEMMS
+ LOAD_5_PARA_POP
+ ;pop ebx
ret
@@ -170,24 +173,29 @@
;***********************************************************************
WELS_EXTERN WelsIDctT4Rec_mmx
WelsIDctT4Rec_mmx:
- push ebx
-%define pushsize 4
-%define p_dst esp+pushsize+4
-%define i_dst esp+pushsize+8
-%define p_pred esp+pushsize+12
-%define i_pred esp+pushsize+16
-%define pDct esp+pushsize+20
+ ;push ebx
+;%define pushsize 4
+;%define p_dst esp+pushsize+4
+;%define i_dst esp+pushsize+8
+;%define p_pred esp+pushsize+12
+;%define i_pred esp+pushsize+16
+;%define pDct esp+pushsize+20
+ %assign push_num 0
+ LOAD_5_PARA
+ %ifndef X86_32
+ movsx r1, r1d
+ movsx r3, r3d
+ %endif
+; mov eax, [pDct ]
+ movq mm0, [r4+ 0]
+ movq mm1, [r4+ 8]
+ movq mm2, [r4+16]
+ movq mm3, [r4+24]
+ ;mov edx, [p_dst ] ; r0
+ ;mov ecx, [i_dst ] ; r1
+ ;mov eax, [p_pred] ; r2
+ ;mov ebx, [i_pred] ; r3
- mov eax, [pDct ]
- movq mm0, [eax+ 0]
- movq mm1, [eax+ 8]
- movq mm2, [eax+16]
- movq mm3, [eax+24]
- mov edx, [p_dst ]
- mov ecx, [i_dst ]
- mov eax, [p_pred]
- mov ebx, [i_pred]
-
MMX_Trans4x4W mm0, mm1, mm2, mm3, mm4
MMX_IDCT mm1, mm2, mm3, mm4, mm0, mm6
MMX_Trans4x4W mm1, mm3, mm0, mm4, mm2
@@ -196,21 +204,22 @@
WELS_Zero mm7
WELS_DW32 mm6
- MMX_StoreDiff4P mm3, mm0, mm6, mm7, [edx], [eax]
- MMX_StoreDiff4P mm4, mm0, mm6, mm7, [edx+ecx], [eax+ebx]
- lea edx, [edx+2*ecx]
- lea eax, [eax+2*ebx]
- MMX_StoreDiff4P mm1, mm0, mm6, mm7, [edx], [eax]
- MMX_StoreDiff4P mm2, mm0, mm6, mm7, [edx+ecx], [eax+ebx]
+ MMX_StoreDiff4P mm3, mm0, mm6, mm7, [r0], [r2]
+ MMX_StoreDiff4P mm4, mm0, mm6, mm7, [r0+r1], [r2+r3]
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ MMX_StoreDiff4P mm1, mm0, mm6, mm7, [r0], [r2]
+ MMX_StoreDiff4P mm2, mm0, mm6, mm7, [r0+r1], [r2+r3]
WELSEMMS
-%undef pushsize
-%undef p_dst
-%undef i_dst
-%undef p_pred
-%undef i_pred
-%undef pDct
- pop ebx
+ LOAD_5_PARA_POP
+;%undef pushsize
+;%undef p_dst
+;%undef i_dst
+;%undef p_pred
+;%undef i_pred
+;%undef pDct
+; pop ebx
ret
@@ -314,23 +323,27 @@
WELS_EXTERN WelsDctFourT4_sse2
ALIGN 16
WelsDctFourT4_sse2:
- push ebx
- push esi
- mov esi, [esp+12]
- mov eax, [esp+16] ; pix1
- mov ebx, [esp+20] ; i_pix1
- mov ecx, [esp+24] ; pix2
- mov edx, [esp+28] ; i_pix2
-
+ ;push ebx
+ ;push esi
+ ;mov esi, [esp+12]
+ ;mov eax, [esp+16] ; pix1
+ ;mov ebx, [esp+20] ; i_pix1
+ ;mov ecx, [esp+24] ; pix2
+ ;mov edx, [esp+28] ; i_pix2
+ %assign push_num 0
+ LOAD_5_PARA
+ %ifndef X86_32
+ movsx r2, r2d
+ movsx r4, r4d
+ %endif
pxor xmm7, xmm7
-
;Load 4x8
- SSE2_LoadDiff8P xmm0, xmm6, xmm7, [eax ], [ecx]
- SSE2_LoadDiff8P xmm1, xmm6, xmm7, [eax+ebx ], [ecx+edx]
- lea eax, [eax + 2 * ebx]
- lea ecx, [ecx + 2 * edx]
- SSE2_LoadDiff8P xmm2, xmm6, xmm7, [eax], [ecx]
- SSE2_LoadDiff8P xmm3, xmm6, xmm7, [eax+ebx], [ecx+edx]
+ SSE2_LoadDiff8P xmm0, xmm6, xmm7, [r1], [r3]
+ SSE2_LoadDiff8P xmm1, xmm6, xmm7, [r1+r2], [r3+r4]
+ lea r1, [r1 + 2 * r2]
+ lea r3, [r3 + 2 * r4]
+ SSE2_LoadDiff8P xmm2, xmm6, xmm7, [r1], [r3]
+ SSE2_LoadDiff8P xmm3, xmm6, xmm7, [r1+r2], [r3+r4]
SSE2_DCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
SSE2_TransTwo4x4W xmm2, xmm0, xmm3, xmm4, xmm1
@@ -337,18 +350,18 @@
SSE2_DCT xmm0, xmm4, xmm1, xmm3, xmm5, xmm2
SSE2_TransTwo4x4W xmm4, xmm2, xmm1, xmm3, xmm0
- SSE2_Store4x8p esi, xmm4, xmm2, xmm3, xmm0, xmm5
+ SSE2_Store4x8p r0, xmm4, xmm2, xmm3, xmm0, xmm5
- lea eax, [eax + 2 * ebx]
- lea ecx, [ecx + 2 * edx]
+ lea r1, [r1 + 2 * r2]
+ lea r3, [r3 + 2 * r4]
;Load 4x8
- SSE2_LoadDiff8P xmm0, xmm6, xmm7, [eax ], [ecx ]
- SSE2_LoadDiff8P xmm1, xmm6, xmm7, [eax+ebx ], [ecx+edx]
- lea eax, [eax + 2 * ebx]
- lea ecx, [ecx + 2 * edx]
- SSE2_LoadDiff8P xmm2, xmm6, xmm7, [eax], [ecx]
- SSE2_LoadDiff8P xmm3, xmm6, xmm7, [eax+ebx], [ecx+edx]
+ SSE2_LoadDiff8P xmm0, xmm6, xmm7, [r1 ], [r3 ]
+ SSE2_LoadDiff8P xmm1, xmm6, xmm7, [r1+r2 ], [r3+r4]
+ lea r1, [r1 + 2 * r2]
+ lea r3, [r3 + 2 * r4]
+ SSE2_LoadDiff8P xmm2, xmm6, xmm7, [r1], [r3]
+ SSE2_LoadDiff8P xmm3, xmm6, xmm7, [r1+r2], [r3+r4]
SSE2_DCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
SSE2_TransTwo4x4W xmm2, xmm0, xmm3, xmm4, xmm1
@@ -355,19 +368,20 @@
SSE2_DCT xmm0, xmm4, xmm1, xmm3, xmm5, xmm2
SSE2_TransTwo4x4W xmm4, xmm2, xmm1, xmm3, xmm0
- lea esi, [esi+64]
- SSE2_Store4x8p esi, xmm4, xmm2, xmm3, xmm0, xmm5
+ lea r0, [r0+64]
+ SSE2_Store4x8p r0, xmm4, xmm2, xmm3, xmm0, xmm5
- pop esi
- pop ebx
+ ;pop esi
+ ;pop ebx
+ LOAD_5_PARA_POP
ret
-%define rec esp + pushsize + 4
-%define stride esp + pushsize + 8
-%define pred esp + pushsize + 12
-%define pred_stride esp + pushsize + 16
-%define rs esp + pushsize + 20
+;%define rec esp + pushsize + 4
+;%define stride esp + pushsize + 8
+;%define pred esp + pushsize + 12
+;%define pred_stride esp + pushsize + 16
+;%define rs esp + pushsize + 20
;***********************************************************************
; void WelsIDctFourT4Rec_sse2(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *rs);
;***********************************************************************
@@ -374,18 +388,23 @@
WELS_EXTERN WelsIDctFourT4Rec_sse2
ALIGN 16
WelsIDctFourT4Rec_sse2:
-%define pushsize 8
- push ebx
- push esi
+;%define pushsize 8
+; push ebx
+; push esi
- mov eax, [rec]
- mov ebx, [stride]
- mov ecx, [pred]
- mov edx, [pred_stride]
- mov esi, [rs]
-
+; mov eax, [rec]
+; mov ebx, [stride]
+; mov ecx, [pred]
+; mov edx, [pred_stride]
+; mov esi, [rs]
+ %assign push_num 0
+ LOAD_5_PARA
+ %ifndef X86_32
+ movsx r1, r1d
+ movsx r3, r3d
+ %endif
;Load 4x8
- SSE2_Load4x8p esi, xmm0, xmm1, xmm4, xmm2, xmm5
+ SSE2_Load4x8p r4, xmm0, xmm1, xmm4, xmm2, xmm5
SSE2_TransTwo4x4W xmm0, xmm1, xmm4, xmm2, xmm3
SSE2_IDCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0
@@ -395,17 +414,17 @@
WELS_Zero xmm7
WELS_DW32 xmm6
- SSE2_StoreDiff8p xmm4, xmm5, xmm6, xmm7, [eax ], [ecx]
- SSE2_StoreDiff8p xmm0, xmm5, xmm6, xmm7, [eax + ebx ], [ecx + edx]
- lea eax, [eax + 2 * ebx]
- lea ecx, [ecx + 2 * edx]
- SSE2_StoreDiff8p xmm1, xmm5, xmm6, xmm7, [eax], [ecx]
- SSE2_StoreDiff8p xmm2, xmm5, xmm6, xmm7, [eax + ebx ], [ecx + edx]
+ SSE2_StoreDiff8p xmm4, xmm5, xmm6, xmm7, [r0 ], [r2]
+ SSE2_StoreDiff8p xmm0, xmm5, xmm6, xmm7, [r0 + r1 ], [r2 + r3]
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+ SSE2_StoreDiff8p xmm1, xmm5, xmm6, xmm7, [r0], [r2]
+ SSE2_StoreDiff8p xmm2, xmm5, xmm6, xmm7, [r0 + r1 ], [r2 + r3]
- add esi, 64
- lea eax, [eax + 2 * ebx]
- lea ecx, [ecx + 2 * edx]
- SSE2_Load4x8p esi, xmm0, xmm1, xmm4, xmm2, xmm5
+ add r4, 64
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+ SSE2_Load4x8p r4, xmm0, xmm1, xmm4, xmm2, xmm5
SSE2_TransTwo4x4W xmm0, xmm1, xmm4, xmm2, xmm3
SSE2_IDCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0
@@ -415,15 +434,15 @@
WELS_Zero xmm7
WELS_DW32 xmm6
- SSE2_StoreDiff8p xmm4, xmm5, xmm6, xmm7, [eax ], [ecx]
- SSE2_StoreDiff8p xmm0, xmm5, xmm6, xmm7, [eax + ebx ], [ecx + edx]
- lea eax, [eax + 2 * ebx]
- lea ecx, [ecx + 2 * edx]
- SSE2_StoreDiff8p xmm1, xmm5, xmm6, xmm7, [eax], [ecx]
- SSE2_StoreDiff8p xmm2, xmm5, xmm6, xmm7, [eax + ebx], [ecx + edx]
-
- pop esi
- pop ebx
+ SSE2_StoreDiff8p xmm4, xmm5, xmm6, xmm7, [r0 ], [r2]
+ SSE2_StoreDiff8p xmm0, xmm5, xmm6, xmm7, [r0 + r1 ], [r2 + r3]
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+ SSE2_StoreDiff8p xmm1, xmm5, xmm6, xmm7, [r0], [r2]
+ SSE2_StoreDiff8p xmm2, xmm5, xmm6, xmm7, [r0 + r1], [r2 + r3]
+ LOAD_5_PARA_POP
+ ; pop esi
+ ; pop ebx
ret
%macro SSE2_StoreDiff4x8p 8
@@ -438,54 +457,60 @@
;***********************************************************************
WELS_EXTERN WelsIDctRecI16x16Dc_sse2
ALIGN 16
-%define pushsize 8
-%define luma_dc esp + pushsize + 20
+;%define pushsize 8
+;%define luma_dc esp + pushsize + 20
WelsIDctRecI16x16Dc_sse2:
- push esi
- push edi
+ %assign push_num 0
+ LOAD_5_PARA
+ %ifndef X86_32
+ movsx r1, r1d
+ movsx r3, r3d
+ %endif
+ ; push esi
+ ; push edi
- mov ecx, [luma_dc]
- mov eax, [rec]
- mov edx, [stride]
- mov esi, [pred]
- mov edi, [pred_stride]
+ ;mov ecx, [luma_dc] ; r4
+ ;mov eax, [rec] ; r0
+ ;mov edx, [stride] ; r1
+ ;mov esi, [pred]; r2
+ ;mov edi, [pred_stride]; r3
pxor xmm7, xmm7
WELS_DW32 xmm6
- SSE2_Load8DC xmm0, xmm1, xmm2, xmm3, xmm6, [ecx]
- SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, eax, esi, edx, edi
+ SSE2_Load8DC xmm0, xmm1, xmm2, xmm3, xmm6, [r4]
+ SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
- lea eax, [eax + 2 * edx]
- lea esi, [esi + 2 * edi]
- SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, eax, esi, edx, edi
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+ SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
- lea eax, [eax + 2 * edx]
- lea esi, [esi + 2 * edi]
- SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, eax, esi, edx, edi
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+ SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
- lea eax, [eax + 2 * edx]
- lea esi, [esi + 2 * edi]
- SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, eax, esi, edx, edi
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+ SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
- SSE2_Load8DC xmm0, xmm1, xmm2, xmm3, xmm6, [ecx + 16]
- lea eax, [eax + 2 * edx]
- lea esi, [esi + 2 * edi]
- SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, eax, esi, edx, edi
+ SSE2_Load8DC xmm0, xmm1, xmm2, xmm3, xmm6, [r4 + 16]
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+ SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
- lea eax, [eax + 2 * edx]
- lea esi, [esi + 2 * edi]
- SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, eax, esi, edx, edi
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+ SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3
- lea eax, [eax + 2 * edx]
- lea esi, [esi + 2 * edi]
- SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, eax, esi, edx, edi
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+ SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
- lea eax, [eax + 2 * edx]
- lea esi, [esi + 2 * edi]
- SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, eax, esi, edx, edi
-
- pop edi
- pop esi
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+ SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
+ LOAD_5_PARA_POP
+ ;pop edi
+ ;pop esi
ret
@@ -503,17 +528,16 @@
movdqa %4, %1
psubd %4, %2
%endmacro
-
%macro SSE2_Load4Col 5
- movsx edx, WORD[%5]
- movd %1, edx
- movsx edx, WORD[%5 + 0x20]
- movd %2, edx
+ movsx r2, WORD[%5]
+ movd %1, r2d
+ movsx r2, WORD[%5 + 0x20]
+ movd %2, r2d
punpckldq %1, %2
- movsx edx, WORD[%5 + 0x80]
- movd %3, edx
- movsx edx, WORD[%5 + 0xa0]
- movd %4, edx
+ movsx r2, WORD[%5 + 0x80]
+ movd %3, r2d
+ movsx r2, WORD[%5 + 0xa0]
+ movd %4, r2d
punpckldq %3, %4
punpcklqdq %1, %3
%endmacro
@@ -523,14 +547,15 @@
;***********************************************************************
WELS_EXTERN WelsHadamardT4Dc_sse2
WelsHadamardT4Dc_sse2:
- mov eax, [esp + 4] ; luma_dc
- mov ecx, [esp + 8] ; pDct
+ ;mov eax, [esp + 4] ; luma_dc
+ ;mov ecx, [esp + 8] ; pDct
+ %assign push_num 0
+ LOAD_2_PARA
+ SSE2_Load4Col xmm1, xmm5, xmm6, xmm0, r1
+ SSE2_Load4Col xmm2, xmm5, xmm6, xmm0, r1 + 0x40
+ SSE2_Load4Col xmm3, xmm5, xmm6, xmm0, r1 + 0x100
+ SSE2_Load4Col xmm4, xmm5, xmm6, xmm0, r1 + 0x140
- SSE2_Load4Col xmm1, xmm5, xmm6, xmm0, ecx
- SSE2_Load4Col xmm2, xmm5, xmm6, xmm0, ecx + 0x40
- SSE2_Load4Col xmm3, xmm5, xmm6, xmm0, ecx + 0x100
- SSE2_Load4Col xmm4, xmm5, xmm6, xmm0, ecx + 0x140
-
SSE2_SumSubD xmm1, xmm2, xmm7
SSE2_SumSubD xmm3, xmm4, xmm7
SSE2_SumSubD xmm2, xmm4, xmm7
@@ -548,9 +573,7 @@
packssdw xmm3, xmm4
packssdw xmm2, xmm1
- movdqa [eax+ 0], xmm3
- movdqa [eax+16], xmm2
+ movdqa [r0+ 0], xmm3
+ movdqa [r0+16], xmm2
ret
-
-
--- a/codec/encoder/core/asm/deblock.asm
+++ /dev/null
@@ -1,2113 +1,0 @@
-;*!
-;* \copy
-;* Copyright (c) 2009-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* deblock.asm
-;*
-;* Abstract
-;* edge loop
-;*
-;* History
-;* 08/07/2009 Created
-;*
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-BITS 32
-
-;*******************************************************************************
-; Macros and other preprocessor constants
-;*******************************************************************************
-
-%ifdef FORMAT_COFF
-SECTION .rodata pData
-%else
-SECTION .rodata align=16
-%endif
-
-SECTION .text
-
-;********************************************************************************
-; void DeblockChromaEq4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
-; int32_t iAlpha, int32_t iBeta)
-;********************************************************************************
-WELS_EXTERN DeblockChromaEq4V_sse2
-
-ALIGN 16
-DeblockChromaEq4V_sse2:
- push ebp
- mov ebp,esp
- and esp,0FFFFFFF0h
- sub esp,68h
- mov edx,[ebp+10h] ; iStride
- mov eax,[ebp+8] ; pPixCb
- mov ecx,[ebp+0Ch] ; pPixCr
- movq xmm4,[ecx]
- movq xmm5,[edx+ecx]
- push esi
- push edi
- lea esi,[edx+edx]
- mov edi,eax
- sub edi,esi
- movq xmm1,[edi]
- mov edi,ecx
- sub edi,esi
- movq xmm2,[edi]
- punpcklqdq xmm1,xmm2
- mov esi,eax
- sub esi,edx
- movq xmm2,[esi]
- mov edi,ecx
- sub edi,edx
- movq xmm3,[edi]
- punpcklqdq xmm2,xmm3
- movq xmm3,[eax]
- punpcklqdq xmm3,xmm4
- movq xmm4,[edx+eax]
- mov edx, [ebp + 14h]
- punpcklqdq xmm4,xmm5
- movd xmm5,edx
- mov edx, [ebp + 18h]
- pxor xmm0,xmm0
- movdqa xmm6,xmm5
- punpcklwd xmm6,xmm5
- pshufd xmm5,xmm6,0
- movd xmm6,edx
- movdqa xmm7,xmm6
- punpcklwd xmm7,xmm6
- pshufd xmm6,xmm7,0
- movdqa xmm7,xmm1
- punpckhbw xmm1,xmm0
- punpcklbw xmm7,xmm0
- movdqa [esp+40h],xmm1
- movdqa [esp+60h],xmm7
- movdqa xmm7,xmm2
- punpcklbw xmm7,xmm0
- movdqa [esp+10h],xmm7
- movdqa xmm7,xmm3
- punpcklbw xmm7,xmm0
- punpckhbw xmm3,xmm0
- movdqa [esp+50h],xmm7
- movdqa xmm7,xmm4
- punpckhbw xmm4,xmm0
- punpckhbw xmm2,xmm0
- punpcklbw xmm7,xmm0
- movdqa [esp+30h],xmm3
- movdqa xmm3,[esp+10h]
- movdqa xmm1,xmm3
- psubw xmm1,[esp+50h]
- pabsw xmm1,xmm1
- movdqa [esp+20h],xmm4
- movdqa xmm0,xmm5
- pcmpgtw xmm0,xmm1
- movdqa xmm1,[esp+60h]
- psubw xmm1,xmm3
- pabsw xmm1,xmm1
- movdqa xmm4,xmm6
- pcmpgtw xmm4,xmm1
- pand xmm0,xmm4
- movdqa xmm1,xmm7
- psubw xmm1,[esp+50h]
- pabsw xmm1,xmm1
- movdqa xmm4,xmm6
- pcmpgtw xmm4,xmm1
- movdqa xmm1,xmm2
- psubw xmm1,[esp+30h]
- pabsw xmm1,xmm1
- pcmpgtw xmm5,xmm1
- movdqa xmm1,[esp+40h]
- pand xmm0,xmm4
- psubw xmm1,xmm2
- pabsw xmm1,xmm1
- movdqa xmm4,xmm6
- pcmpgtw xmm4,xmm1
- movdqa xmm1,[esp+20h]
- psubw xmm1,[esp+30h]
- pand xmm5,xmm4
- pabsw xmm1,xmm1
- pcmpgtw xmm6,xmm1
- pand xmm5,xmm6
- mov edx,2
- movsx edx,dx
- movd xmm1,edx
- movdqa xmm4,xmm1
- punpcklwd xmm4,xmm1
- pshufd xmm1,xmm4,0
- movdqa xmm4,[esp+60h]
- movdqa xmm6,xmm4
- paddw xmm6,xmm4
- paddw xmm6,xmm3
- paddw xmm6,xmm7
- movdqa [esp+10h],xmm1
- paddw xmm6,[esp+10h]
- psraw xmm6,2
- movdqa xmm4,xmm0
- pandn xmm4,xmm3
- movdqa xmm3,[esp+40h]
- movdqa xmm1,xmm0
- pand xmm1,xmm6
- por xmm1,xmm4
- movdqa xmm6,xmm3
- paddw xmm6,xmm3
- movdqa xmm3,[esp+10h]
- paddw xmm6,xmm2
- paddw xmm6,[esp+20h]
- paddw xmm6,xmm3
- psraw xmm6,2
- movdqa xmm4,xmm5
- pand xmm4,xmm6
- movdqa xmm6,xmm5
- pandn xmm6,xmm2
- por xmm4,xmm6
- packuswb xmm1,xmm4
- movdqa xmm4,[esp+50h]
- movdqa xmm6,xmm7
- paddw xmm6,xmm7
- paddw xmm6,xmm4
- paddw xmm6,[esp+60h]
- paddw xmm6,xmm3
- psraw xmm6,2
- movdqa xmm2,xmm0
- pand xmm2,xmm6
- pandn xmm0,xmm4
- por xmm2,xmm0
- movdqa xmm0,[esp+20h]
- movdqa xmm6,xmm0
- paddw xmm6,xmm0
- movdqa xmm0,[esp+30h]
- paddw xmm6,xmm0
- paddw xmm6,[esp+40h]
- movdqa xmm4,xmm5
- paddw xmm6,xmm3
- movq [esi],xmm1
- psraw xmm6,2
- pand xmm4,xmm6
- pandn xmm5,xmm0
- por xmm4,xmm5
- packuswb xmm2,xmm4
- movq [eax],xmm2
- psrldq xmm1,8
- movq [edi],xmm1
- pop edi
- psrldq xmm2,8
- movq [ecx],xmm2
- pop esi
- mov esp,ebp
- pop ebp
- ret
-
-;******************************************************************************
-; void DeblockChromaLt4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
-; int32_t iAlpha, int32_t iBeta, int8_t * pTC);
-;*******************************************************************************
-
-WELS_EXTERN DeblockChromaLt4V_sse2
-
-DeblockChromaLt4V_sse2:
- push ebp
- mov ebp,esp
- and esp,0FFFFFFF0h
- sub esp,0E4h
- push ebx
- push esi
- mov esi, [ebp+1Ch] ; pTC
- movsx ebx, byte [esi+2]
- push edi
- movsx di,byte [esi+3]
- mov word [esp+0Ch],bx
- movsx bx,byte [esi+1]
- movsx esi,byte [esi]
- mov word [esp+0Eh],si
- movzx esi,di
- movd xmm1,esi
- movzx esi,di
- movd xmm2,esi
- mov si,word [esp+0Ch]
- mov edx, [ebp + 10h]
- mov eax, [ebp + 08h]
- movzx edi,si
- movzx esi,si
- mov ecx, [ebp + 0Ch]
- movd xmm4,esi
- movzx esi,bx
- movd xmm5,esi
- movd xmm3,edi
- movzx esi,bx
- movd xmm6,esi
- mov si,word [esp+0Eh]
- movzx edi,si
- movzx esi,si
- punpcklwd xmm6,xmm2
- pxor xmm0,xmm0
- movdqa [esp+40h],xmm0
- movd xmm7,edi
- movd xmm0,esi
- lea esi,[edx+edx]
- mov edi,eax
- sub edi,esi
- punpcklwd xmm5,xmm1
- movdqa xmm1,[esp+40h]
- punpcklwd xmm0,xmm4
- movq xmm4,[edx+ecx]
- punpcklwd xmm7,xmm3
- movq xmm3,[eax]
- punpcklwd xmm0,xmm6
- movq xmm6,[edi]
- punpcklwd xmm7,xmm5
- punpcklwd xmm0,xmm7
- mov edi,ecx
- sub edi,esi
- movdqa xmm2,xmm1
- psubw xmm2,xmm0
- movdqa [esp+60h],xmm2
- movq xmm2, [edi]
- punpcklqdq xmm6,xmm2
- mov esi,eax
- sub esi,edx
- movq xmm7,[esi]
- mov edi,ecx
- sub edi,edx
- movq xmm2,[edi]
- punpcklqdq xmm7,xmm2
- movq xmm2,[ecx]
- punpcklqdq xmm3,xmm2
- movq xmm2,[edx+eax]
- movsx edx,word [ebp + 14h]
- punpcklqdq xmm2,xmm4
- movdqa [esp+0E0h],xmm2
- movd xmm2,edx
- movsx edx,word [ebp + 18h]
- movdqa xmm4,xmm2
- punpcklwd xmm4,xmm2
- movd xmm2,edx
- movdqa xmm5,xmm2
- punpcklwd xmm5,xmm2
- pshufd xmm2,xmm5,0
- movdqa [esp+50h],xmm2
- movdqa xmm2,xmm6
- punpcklbw xmm2,xmm1
- movdqa [esp+0D0h],xmm3
- pshufd xmm4,xmm4,0
- movdqa [esp+30h],xmm2
- punpckhbw xmm6,xmm1
- movdqa [esp+80h],xmm6
- movdqa xmm6,[esp+0D0h]
- punpckhbw xmm6,xmm1
- movdqa [esp+70h],xmm6
- movdqa xmm6, [esp+0E0h]
- punpckhbw xmm6,xmm1
- movdqa [esp+90h],xmm6
- movdqa xmm5, [esp+0E0h]
- movdqa xmm2,xmm7
- punpckhbw xmm7,xmm1
- punpcklbw xmm5,xmm1
- movdqa [esp+0A0h],xmm7
- punpcklbw xmm3,xmm1
- mov edx,4
- punpcklbw xmm2,xmm1
- movsx edx,dx
- movd xmm6,edx
- movdqa xmm7,xmm6
- punpcklwd xmm7,xmm6
- pshufd xmm6,xmm7,0
- movdqa xmm7,[esp+30h]
- movdqa [esp+20h],xmm6
- psubw xmm7,xmm5
- movdqa xmm6,xmm0
- pcmpgtw xmm6,xmm1
- movdqa xmm1,[esp+60h]
- movdqa [esp+40h],xmm6
- movdqa xmm6,xmm3
- psubw xmm6,xmm2
- psllw xmm6,2
- paddw xmm6,xmm7
- paddw xmm6, [esp+20h]
- movdqa xmm7, [esp+50h]
- psraw xmm6,3
- pmaxsw xmm1,xmm6
- movdqa [esp+10h],xmm0
- movdqa xmm6, [esp+10h]
- pminsw xmm6,xmm1
- movdqa [esp+10h],xmm6
- movdqa xmm1,xmm2
- psubw xmm1,xmm3
- pabsw xmm1,xmm1
- movdqa xmm6,xmm4
- pcmpgtw xmm6,xmm1
- movdqa xmm1, [esp+30h]
- psubw xmm1,xmm2
- pabsw xmm1,xmm1
- pcmpgtw xmm7,xmm1
- movdqa xmm1,[esp+50h]
- pand xmm6,xmm7
- movdqa xmm7,[esp+50h]
- psubw xmm5,xmm3
- pabsw xmm5,xmm5
- pcmpgtw xmm1,xmm5
- movdqa xmm5,[esp+80h]
- psubw xmm5,[esp+90h]
- pand xmm6,xmm1
- pand xmm6,[esp+40h]
- movdqa xmm1,[esp+10h]
- pand xmm1,xmm6
- movdqa xmm6,[esp+70h]
- movdqa [esp+30h],xmm1
- movdqa xmm1,[esp+0A0h]
- psubw xmm6,xmm1
- psllw xmm6,2
- paddw xmm6,xmm5
- paddw xmm6,[esp+20h]
- movdqa xmm5,[esp+60h]
- psraw xmm6,3
- pmaxsw xmm5,xmm6
- pminsw xmm0,xmm5
- movdqa xmm5,[esp+70h]
- movdqa xmm6,xmm1
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- pcmpgtw xmm4,xmm6
- movdqa xmm6,[esp+80h]
- psubw xmm6,xmm1
- pabsw xmm6,xmm6
- pcmpgtw xmm7,xmm6
- movdqa xmm6,[esp+90h]
- pand xmm4,xmm7
- movdqa xmm7,[esp+50h]
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- pcmpgtw xmm7,xmm6
- pand xmm4,xmm7
- pand xmm4,[esp+40h]
- pand xmm0,xmm4
- movdqa xmm4,[esp+30h]
- paddw xmm2,xmm4
- paddw xmm1,xmm0
- packuswb xmm2,xmm1
- movq [esi],xmm2
- psubw xmm3,xmm4
- psubw xmm5,xmm0
- packuswb xmm3,xmm5
- movq [eax],xmm3
- psrldq xmm2,8
- movq [edi],xmm2
- pop edi
- pop esi
- psrldq xmm3,8
- movq [ecx],xmm3
- pop ebx
- mov esp,ebp
- pop ebp
- ret
-
-;***************************************************************************
-; void DeblockChromaEq4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
-; int32_t iAlpha, int32_t iBeta)
-;***************************************************************************
-
-WELS_EXTERN DeblockChromaEq4H_sse2
-
-ALIGN 16
-
-DeblockChromaEq4H_sse2:
- push ebp
- mov ebp,esp
- and esp,0FFFFFFF0h
- sub esp,0C8h
- mov ecx,dword [ebp+8]
- mov edx,dword [ebp+0Ch]
- mov eax,dword [ebp+10h]
- sub ecx,2
- sub edx,2
- push esi
- lea esi,[eax+eax*2]
- mov dword [esp+18h],ecx
- mov dword [esp+4],edx
- lea ecx,[ecx+eax*4]
- lea edx,[edx+eax*4]
- lea eax,[esp+7Ch]
- push edi
- mov dword [esp+14h],esi
- mov dword [esp+18h],ecx
- mov dword [esp+0Ch],edx
- mov dword [esp+10h],eax
- mov esi,dword [esp+1Ch]
- mov ecx,dword [ebp+10h]
- mov edx,dword [esp+14h]
- movd xmm0,dword [esi]
- movd xmm1,dword [esi+ecx]
- movd xmm2,dword [esi+ecx*2]
- movd xmm3,dword [esi+edx]
- mov esi,dword [esp+8]
- movd xmm4,dword [esi]
- movd xmm5,dword [esi+ecx]
- movd xmm6,dword [esi+ecx*2]
- movd xmm7,dword [esi+edx]
- punpckldq xmm0,xmm4
- punpckldq xmm1,xmm5
- punpckldq xmm2,xmm6
- punpckldq xmm3,xmm7
- mov esi,dword [esp+18h]
- mov edi,dword [esp+0Ch]
- movd xmm4,dword [esi]
- movd xmm5,dword [edi]
- punpckldq xmm4,xmm5
- punpcklqdq xmm0,xmm4
- movd xmm4,dword [esi+ecx]
- movd xmm5,dword [edi+ecx]
- punpckldq xmm4,xmm5
- punpcklqdq xmm1,xmm4
- movd xmm4,dword [esi+ecx*2]
- movd xmm5,dword [edi+ecx*2]
- punpckldq xmm4,xmm5
- punpcklqdq xmm2,xmm4
- movd xmm4,dword [esi+edx]
- movd xmm5,dword [edi+edx]
- punpckldq xmm4,xmm5
- punpcklqdq xmm3,xmm4
- movdqa xmm6,xmm0
- punpcklbw xmm0,xmm1
- punpckhbw xmm6,xmm1
- movdqa xmm7,xmm2
- punpcklbw xmm2,xmm3
- punpckhbw xmm7,xmm3
- movdqa xmm4,xmm0
- movdqa xmm5,xmm6
- punpcklwd xmm0,xmm2
- punpckhwd xmm4,xmm2
- punpcklwd xmm6,xmm7
- punpckhwd xmm5,xmm7
- movdqa xmm1,xmm0
- movdqa xmm2,xmm4
- punpckldq xmm0,xmm6
- punpckhdq xmm1,xmm6
- punpckldq xmm4,xmm5
- punpckhdq xmm2,xmm5
- movdqa xmm5,xmm0
- movdqa xmm6,xmm1
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm5,xmm4
- punpcklqdq xmm1,xmm2
- punpckhqdq xmm6,xmm2
- mov edi,dword [esp+10h]
- movdqa [edi],xmm0
- movdqa [edi+10h],xmm5
- movdqa [edi+20h],xmm1
- movdqa [edi+30h],xmm6
- movsx ecx,word [ebp+14h]
- movsx edx,word [ebp+18h]
- movdqa xmm6,[esp+80h]
- movdqa xmm4,[esp+90h]
- movdqa xmm5,[esp+0A0h]
- movdqa xmm7,[esp+0B0h]
- pxor xmm0,xmm0
- movd xmm1,ecx
- movdqa xmm2,xmm1
- punpcklwd xmm2,xmm1
- pshufd xmm1,xmm2,0
- movd xmm2,edx
- movdqa xmm3,xmm2
- punpcklwd xmm3,xmm2
- pshufd xmm2,xmm3,0
- movdqa xmm3,xmm6
- punpckhbw xmm6,xmm0
- movdqa [esp+60h],xmm6
- movdqa xmm6,[esp+90h]
- punpckhbw xmm6,xmm0
- movdqa [esp+30h],xmm6
- movdqa xmm6,[esp+0A0h]
- punpckhbw xmm6,xmm0
- movdqa [esp+40h],xmm6
- movdqa xmm6,[esp+0B0h]
- punpckhbw xmm6,xmm0
- movdqa [esp+70h],xmm6
- punpcklbw xmm7,xmm0
- punpcklbw xmm4,xmm0
- punpcklbw xmm5,xmm0
- punpcklbw xmm3,xmm0
- movdqa [esp+50h],xmm7
- movdqa xmm6,xmm4
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- movdqa xmm0,xmm1
- pcmpgtw xmm0,xmm6
- movdqa xmm6,xmm3
- psubw xmm6,xmm4
- pabsw xmm6,xmm6
- movdqa xmm7,xmm2
- pcmpgtw xmm7,xmm6
- movdqa xmm6,[esp+50h]
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- pand xmm0,xmm7
- movdqa xmm7,xmm2
- pcmpgtw xmm7,xmm6
- movdqa xmm6,[esp+30h]
- psubw xmm6,[esp+40h]
- pabsw xmm6,xmm6
- pcmpgtw xmm1,xmm6
- movdqa xmm6,[esp+60h]
- psubw xmm6,[esp+30h]
- pabsw xmm6,xmm6
- pand xmm0,xmm7
- movdqa xmm7,xmm2
- pcmpgtw xmm7,xmm6
- movdqa xmm6,[esp+70h]
- psubw xmm6,[esp+40h]
- pabsw xmm6,xmm6
- pand xmm1,xmm7
- pcmpgtw xmm2,xmm6
- pand xmm1,xmm2
- mov eax,2
- movsx ecx,ax
- movd xmm2,ecx
- movdqa xmm6,xmm2
- punpcklwd xmm6,xmm2
- pshufd xmm2,xmm6,0
- movdqa [esp+20h],xmm2
- movdqa xmm2,xmm3
- paddw xmm2,xmm3
- paddw xmm2,xmm4
- paddw xmm2,[esp+50h]
- paddw xmm2,[esp+20h]
- psraw xmm2,2
- movdqa xmm6,xmm0
- pand xmm6,xmm2
- movdqa xmm2,xmm0
- pandn xmm2,xmm4
- por xmm6,xmm2
- movdqa xmm2,[esp+60h]
- movdqa xmm7,xmm2
- paddw xmm7,xmm2
- paddw xmm7,[esp+30h]
- paddw xmm7,[esp+70h]
- paddw xmm7,[esp+20h]
- movdqa xmm4,xmm1
- movdqa xmm2,xmm1
- pandn xmm2,[esp+30h]
- psraw xmm7,2
- pand xmm4,xmm7
- por xmm4,xmm2
- movdqa xmm2,[esp+50h]
- packuswb xmm6,xmm4
- movdqa [esp+90h],xmm6
- movdqa xmm6,xmm2
- paddw xmm6,xmm2
- movdqa xmm2,[esp+20h]
- paddw xmm6,xmm5
- paddw xmm6,xmm3
- movdqa xmm4,xmm0
- pandn xmm0,xmm5
- paddw xmm6,xmm2
- psraw xmm6,2
- pand xmm4,xmm6
- por xmm4,xmm0
- movdqa xmm0,[esp+70h]
- movdqa xmm5,xmm0
- paddw xmm5,xmm0
- movdqa xmm0,[esp+40h]
- paddw xmm5,xmm0
- paddw xmm5,[esp+60h]
- movdqa xmm3,xmm1
- paddw xmm5,xmm2
- psraw xmm5,2
- pand xmm3,xmm5
- pandn xmm1,xmm0
- por xmm3,xmm1
- packuswb xmm4,xmm3
- movdqa [esp+0A0h],xmm4
- mov esi,dword [esp+10h]
- movdqa xmm0,[esi]
- movdqa xmm1,[esi+10h]
- movdqa xmm2,[esi+20h]
- movdqa xmm3,[esi+30h]
- movdqa xmm6,xmm0
- punpcklbw xmm0,xmm1
- punpckhbw xmm6,xmm1
- movdqa xmm7,xmm2
- punpcklbw xmm2,xmm3
- punpckhbw xmm7,xmm3
- movdqa xmm4,xmm0
- movdqa xmm5,xmm6
- punpcklwd xmm0,xmm2
- punpckhwd xmm4,xmm2
- punpcklwd xmm6,xmm7
- punpckhwd xmm5,xmm7
- movdqa xmm1,xmm0
- movdqa xmm2,xmm4
- punpckldq xmm0,xmm6
- punpckhdq xmm1,xmm6
- punpckldq xmm4,xmm5
- punpckhdq xmm2,xmm5
- movdqa xmm5,xmm0
- movdqa xmm6,xmm1
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm5,xmm4
- punpcklqdq xmm1,xmm2
- punpckhqdq xmm6,xmm2
- mov esi,dword [esp+1Ch]
- mov ecx,dword [ebp+10h]
- mov edx,dword [esp+14h]
- mov edi,dword [esp+8]
- movd dword [esi],xmm0
- movd dword [esi+ecx],xmm5
- movd dword [esi+ecx*2],xmm1
- movd dword [esi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- mov esi,dword [esp+18h]
- movd dword [edi],xmm0
- movd dword [edi+ecx],xmm5
- movd dword [edi+ecx*2],xmm1
- movd dword [edi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- movd dword [esi],xmm0
- movd dword [esi+ecx],xmm5
- movd dword [esi+ecx*2],xmm1
- movd dword [esi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- mov edi,dword [esp+0Ch]
- movd dword [edi],xmm0
- movd dword [edi+ecx],xmm5
- movd dword [edi+ecx*2],xmm1
- movd dword [edi+edx],xmm6
- pop edi
- pop esi
- mov esp,ebp
- pop ebp
- ret
-
-;*******************************************************************************
-; void DeblockChromaLt4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
-; int32_t iAlpha, int32_t iBeta, int8_t * pTC);
-;*******************************************************************************
-
-WELS_EXTERN DeblockChromaLt4H_sse2
-
-ALIGN 16
-
-DeblockChromaLt4H_sse2:
- push ebp
- mov ebp,esp
- and esp,0FFFFFFF0h
- sub esp,108h
- mov ecx,dword [ebp+8]
- mov edx,dword [ebp+0Ch]
- mov eax,dword [ebp+10h]
- sub ecx,2
- sub edx,2
- push esi
- lea esi,[eax+eax*2]
- mov dword [esp+10h],ecx
- mov dword [esp+4],edx
- lea ecx,[ecx+eax*4]
- lea edx,[edx+eax*4]
- lea eax,[esp+6Ch]
- push edi
- mov dword [esp+0Ch],esi
- mov dword [esp+18h],ecx
- mov dword [esp+10h],edx
- mov dword [esp+1Ch],eax
- mov esi,dword [esp+14h]
- mov ecx,dword [ebp+10h]
- mov edx,dword [esp+0Ch]
- movd xmm0,dword [esi]
- movd xmm1,dword [esi+ecx]
- movd xmm2,dword [esi+ecx*2]
- movd xmm3,dword [esi+edx]
- mov esi,dword [esp+8]
- movd xmm4,dword [esi]
- movd xmm5,dword [esi+ecx]
- movd xmm6,dword [esi+ecx*2]
- movd xmm7,dword [esi+edx]
- punpckldq xmm0,xmm4
- punpckldq xmm1,xmm5
- punpckldq xmm2,xmm6
- punpckldq xmm3,xmm7
- mov esi,dword [esp+18h]
- mov edi,dword [esp+10h]
- movd xmm4,dword [esi]
- movd xmm5,dword [edi]
- punpckldq xmm4,xmm5
- punpcklqdq xmm0,xmm4
- movd xmm4,dword [esi+ecx]
- movd xmm5,dword [edi+ecx]
- punpckldq xmm4,xmm5
- punpcklqdq xmm1,xmm4
- movd xmm4,dword [esi+ecx*2]
- movd xmm5,dword [edi+ecx*2]
- punpckldq xmm4,xmm5
- punpcklqdq xmm2,xmm4
- movd xmm4,dword [esi+edx]
- movd xmm5,dword [edi+edx]
- punpckldq xmm4,xmm5
- punpcklqdq xmm3,xmm4
- movdqa xmm6,xmm0
- punpcklbw xmm0,xmm1
- punpckhbw xmm6,xmm1
- movdqa xmm7,xmm2
- punpcklbw xmm2,xmm3
- punpckhbw xmm7,xmm3
- movdqa xmm4,xmm0
- movdqa xmm5,xmm6
- punpcklwd xmm0,xmm2
- punpckhwd xmm4,xmm2
- punpcklwd xmm6,xmm7
- punpckhwd xmm5,xmm7
- movdqa xmm1,xmm0
- movdqa xmm2,xmm4
- punpckldq xmm0,xmm6
- punpckhdq xmm1,xmm6
- punpckldq xmm4,xmm5
- punpckhdq xmm2,xmm5
- movdqa xmm5,xmm0
- movdqa xmm6,xmm1
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm5,xmm4
- punpcklqdq xmm1,xmm2
- punpckhqdq xmm6,xmm2
- mov edi,dword [esp+1Ch]
- movdqa [edi],xmm0
- movdqa [edi+10h],xmm5
- movdqa [edi+20h],xmm1
- movdqa [edi+30h],xmm6
- mov eax,dword [ebp+1Ch]
- movsx cx,byte [eax+3]
- movsx dx,byte [eax+2]
- movsx si,byte [eax+1]
- movsx ax,byte [eax]
- movzx edi,cx
- movzx ecx,cx
- movd xmm2,ecx
- movzx ecx,dx
- movzx edx,dx
- movd xmm3,ecx
- movd xmm4,edx
- movzx ecx,si
- movzx edx,si
- movd xmm5,ecx
- pxor xmm0,xmm0
- movd xmm6,edx
- movzx ecx,ax
- movdqa [esp+60h],xmm0
- movzx edx,ax
- movsx eax,word [ebp+14h]
- punpcklwd xmm6,xmm2
- movd xmm1,edi
- movd xmm7,ecx
- movsx ecx,word [ebp+18h]
- movd xmm0,edx
- punpcklwd xmm7,xmm3
- punpcklwd xmm5,xmm1
- movdqa xmm1,[esp+60h]
- punpcklwd xmm7,xmm5
- movdqa xmm5,[esp+0A0h]
- punpcklwd xmm0,xmm4
- punpcklwd xmm0,xmm6
- movdqa xmm6, [esp+70h]
- punpcklwd xmm0,xmm7
- movdqa xmm7,[esp+80h]
- movdqa xmm2,xmm1
- psubw xmm2,xmm0
- movdqa [esp+0D0h],xmm2
- movd xmm2,eax
- movdqa xmm3,xmm2
- punpcklwd xmm3,xmm2
- pshufd xmm4,xmm3,0
- movd xmm2,ecx
- movdqa xmm3,xmm2
- punpcklwd xmm3,xmm2
- pshufd xmm2,xmm3,0
- movdqa xmm3, [esp+90h]
- movdqa [esp+50h],xmm2
- movdqa xmm2,xmm6
- punpcklbw xmm2,xmm1
- punpckhbw xmm6,xmm1
- movdqa [esp+40h],xmm2
- movdqa [esp+0B0h],xmm6
- movdqa xmm6,[esp+90h]
- movdqa xmm2,xmm7
- punpckhbw xmm7,xmm1
- punpckhbw xmm6,xmm1
- punpcklbw xmm2,xmm1
- punpcklbw xmm3,xmm1
- punpcklbw xmm5,xmm1
- movdqa [esp+0F0h],xmm7
- movdqa [esp+0C0h],xmm6
- movdqa xmm6, [esp+0A0h]
- punpckhbw xmm6,xmm1
- movdqa [esp+0E0h],xmm6
- mov edx,4
- movsx eax,dx
- movd xmm6,eax
- movdqa xmm7,xmm6
- punpcklwd xmm7,xmm6
- pshufd xmm6,xmm7,0
- movdqa [esp+30h],xmm6
- movdqa xmm7, [esp+40h]
- psubw xmm7,xmm5
- movdqa xmm6,xmm0
- pcmpgtw xmm6,xmm1
- movdqa [esp+60h],xmm6
- movdqa xmm1, [esp+0D0h]
- movdqa xmm6,xmm3
- psubw xmm6,xmm2
- psllw xmm6,2
- paddw xmm6,xmm7
- paddw xmm6,[esp+30h]
- psraw xmm6,3
- pmaxsw xmm1,xmm6
- movdqa xmm7,[esp+50h]
- movdqa [esp+20h],xmm0
- movdqa xmm6, [esp+20h]
- pminsw xmm6,xmm1
- movdqa [esp+20h],xmm6
- movdqa xmm6,xmm4
- movdqa xmm1,xmm2
- psubw xmm1,xmm3
- pabsw xmm1,xmm1
- pcmpgtw xmm6,xmm1
- movdqa xmm1, [esp+40h]
- psubw xmm1,xmm2
- pabsw xmm1,xmm1
- pcmpgtw xmm7,xmm1
- movdqa xmm1, [esp+50h]
- pand xmm6,xmm7
- movdqa xmm7, [esp+50h]
- psubw xmm5,xmm3
- pabsw xmm5,xmm5
- pcmpgtw xmm1,xmm5
- movdqa xmm5, [esp+0B0h]
- psubw xmm5,[esp+0E0h]
- pand xmm6,xmm1
- pand xmm6, [esp+60h]
- movdqa xmm1, [esp+20h]
- pand xmm1,xmm6
- movdqa xmm6, [esp+0C0h]
- movdqa [esp+40h],xmm1
- movdqa xmm1, [esp+0F0h]
- psubw xmm6,xmm1
- psllw xmm6,2
- paddw xmm6,xmm5
- paddw xmm6, [esp+30h]
- movdqa xmm5, [esp+0D0h]
- psraw xmm6,3
- pmaxsw xmm5,xmm6
- pminsw xmm0,xmm5
- movdqa xmm5,[esp+0C0h]
- movdqa xmm6,xmm1
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- pcmpgtw xmm4,xmm6
- movdqa xmm6,[esp+0B0h]
- psubw xmm6,xmm1
- pabsw xmm6,xmm6
- pcmpgtw xmm7,xmm6
- movdqa xmm6, [esp+0E0h]
- pand xmm4,xmm7
- movdqa xmm7, [esp+50h]
- psubw xmm6,xmm5
- pabsw xmm6,xmm6
- pcmpgtw xmm7,xmm6
- pand xmm4,xmm7
- pand xmm4,[esp+60h]
- pand xmm0,xmm4
- movdqa xmm4, [esp+40h]
- paddw xmm2,xmm4
- paddw xmm1,xmm0
- psubw xmm3,xmm4
- psubw xmm5,xmm0
- packuswb xmm2,xmm1
- packuswb xmm3,xmm5
- movdqa [esp+80h],xmm2
- movdqa [esp+90h],xmm3
- mov esi,dword [esp+1Ch]
- movdqa xmm0, [esi]
- movdqa xmm1, [esi+10h]
- movdqa xmm2, [esi+20h]
- movdqa xmm3, [esi+30h]
- movdqa xmm6,xmm0
- punpcklbw xmm0,xmm1
- punpckhbw xmm6,xmm1
- movdqa xmm7,xmm2
- punpcklbw xmm2,xmm3
- punpckhbw xmm7,xmm3
- movdqa xmm4,xmm0
- movdqa xmm5,xmm6
- punpcklwd xmm0,xmm2
- punpckhwd xmm4,xmm2
- punpcklwd xmm6,xmm7
- punpckhwd xmm5,xmm7
- movdqa xmm1,xmm0
- movdqa xmm2,xmm4
- punpckldq xmm0,xmm6
- punpckhdq xmm1,xmm6
- punpckldq xmm4,xmm5
- punpckhdq xmm2,xmm5
- movdqa xmm5,xmm0
- movdqa xmm6,xmm1
- punpcklqdq xmm0,xmm4
- punpckhqdq xmm5,xmm4
- punpcklqdq xmm1,xmm2
- punpckhqdq xmm6,xmm2
- mov esi,dword [esp+14h]
- mov ecx,dword [ebp+10h]
- mov edx,dword [esp+0Ch]
- mov edi,dword [esp+8]
- movd dword [esi],xmm0
- movd dword [esi+ecx],xmm5
- movd dword [esi+ecx*2],xmm1
- movd dword [esi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- mov esi,dword [esp+18h]
- movd dword [edi],xmm0
- movd dword [edi+ecx],xmm5
- movd dword [edi+ecx*2],xmm1
- movd dword [edi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- movd dword [esi],xmm0
- movd dword [esi+ecx],xmm5
- movd dword [esi+ecx*2],xmm1
- movd dword [esi+edx],xmm6
- psrldq xmm0,4
- psrldq xmm5,4
- psrldq xmm1,4
- psrldq xmm6,4
- mov edi,dword [esp+10h]
- movd dword [edi],xmm0
- movd dword [edi+ecx],xmm5
- movd dword [edi+ecx*2],xmm1
- movd dword [edi+edx],xmm6
- pop edi
- pop esi
- mov esp,ebp
- pop ebp
- ret
-
-
-
-;*******************************************************************************
-; void DeblockLumaLt4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
-; int32_t iBeta, int8_t * pTC)
-;*******************************************************************************
-
-
-WELS_EXTERN DeblockLumaLt4V_sse2
-
-ALIGN 16
-
-DeblockLumaLt4V_sse2:
- push ebp
- mov ebp, esp
- and esp, -16 ; fffffff0H
- sub esp, 420 ; 000001a4H
- mov eax, dword [ebp+8]
- mov ecx, dword [ebp+12]
-
- pxor xmm0, xmm0
- push ebx
- mov edx, dword [ebp+24]
- movdqa [esp+424-384], xmm0
- push esi
-
- lea esi, [ecx+ecx*2]
- push edi
- mov edi, eax
- sub edi, esi
- movdqa xmm0, [edi]
-
- lea esi, [ecx+ecx]
- movdqa [esp+432-208], xmm0
- mov edi, eax
- sub edi, esi
- movdqa xmm0, [edi]
- movdqa [esp+448-208], xmm0
-
- mov ebx, eax
- sub ebx, ecx
- movdqa xmm0, [ebx]
- movdqa [esp+464-208], xmm0
-
- movdqa xmm0, [eax]
-
- add ecx, eax
- movdqa [esp+480-208], xmm0
- movdqa xmm0, [ecx]
- mov dword [esp+432-404], ecx
-
- movsx ecx, word [ebp+16]
- movdqa [esp+496-208], xmm0
- movdqa xmm0, [esi+eax]
-
- movsx si, byte [edx]
- movdqa [esp+512-208], xmm0
- movd xmm0, ecx
- movsx ecx, word [ebp+20]
- movdqa xmm1, xmm0
- punpcklwd xmm1, xmm0
- pshufd xmm0, xmm1, 0
- movdqa [esp+432-112], xmm0
- movd xmm0, ecx
- movsx cx, byte [edx+1]
- movdqa xmm1, xmm0
- punpcklwd xmm1, xmm0
- mov dword [esp+432-408], ebx
- movzx ebx, cx
- pshufd xmm0, xmm1, 0
- movd xmm1, ebx
- movzx ebx, cx
- movd xmm2, ebx
- movzx ebx, cx
- movzx ecx, cx
- movd xmm4, ecx
- movzx ecx, si
- movd xmm5, ecx
- movzx ecx, si
- movd xmm6, ecx
- movzx ecx, si
- movd xmm7, ecx
- movzx ecx, si
- movdqa [esp+432-336], xmm0
- movd xmm0, ecx
-
- movsx cx, byte [edx+3]
- movsx dx, byte [edx+2]
- movd xmm3, ebx
- punpcklwd xmm0, xmm4
- movzx esi, cx
- punpcklwd xmm6, xmm2
- punpcklwd xmm5, xmm1
- punpcklwd xmm0, xmm6
- punpcklwd xmm7, xmm3
- punpcklwd xmm7, xmm5
- punpcklwd xmm0, xmm7
- movdqa [esp+432-400], xmm0
- movd xmm0, esi
- movzx esi, cx
- movd xmm2, esi
- movzx esi, cx
- movzx ecx, cx
- movd xmm4, ecx
- movzx ecx, dx
- movd xmm3, esi
- movd xmm5, ecx
- punpcklwd xmm5, xmm0
-
- movdqa xmm0, [esp+432-384]
- movzx ecx, dx
- movd xmm6, ecx
- movzx ecx, dx
- movzx edx, dx
- punpcklwd xmm6, xmm2
- movd xmm7, ecx
- movd xmm1, edx
-
- movdqa xmm2, [esp+448-208]
- punpcklbw xmm2, xmm0
-
- mov ecx, 4
- movsx edx, cx
- punpcklwd xmm7, xmm3
- punpcklwd xmm7, xmm5
- movdqa xmm5, [esp+496-208]
- movdqa xmm3, [esp+464-208]
- punpcklbw xmm5, xmm0
- movdqa [esp+432-240], xmm5
- movdqa xmm5, [esp+512-208]
- punpcklbw xmm5, xmm0
- movdqa [esp+432-352], xmm5
- punpcklwd xmm1, xmm4
- movdqa xmm4, [esp+432-208]
- punpcklwd xmm1, xmm6
- movdqa xmm6, [esp+480-208]
- punpcklwd xmm1, xmm7
- punpcklbw xmm6, xmm0
- punpcklbw xmm3, xmm0
- punpcklbw xmm4, xmm0
- movdqa xmm7, xmm3
- psubw xmm7, xmm4
- pabsw xmm7, xmm7
- movdqa [esp+432-272], xmm4
- movdqa xmm4, [esp+432-336]
- movdqa xmm5, xmm4
- pcmpgtw xmm5, xmm7
- movdqa [esp+432-288], xmm5
- movdqa xmm7, xmm6
- psubw xmm7, [esp+432-352]
- pabsw xmm7, xmm7
- movdqa xmm5, xmm4
- pcmpgtw xmm5, xmm7
- movdqa [esp+432-256], xmm5
- movdqa xmm5, xmm3
- pavgw xmm5, xmm6
- movdqa [esp+432-304], xmm5
- movdqa xmm5, [esp+432-400]
- psubw xmm5, [esp+432-288]
- psubw xmm5, [esp+432-256]
- movdqa [esp+432-224], xmm5
- movdqa xmm5, xmm6
- psubw xmm5, xmm3
- movdqa [esp+432-32], xmm6
- psubw xmm6, [esp+432-240]
- movdqa xmm7, xmm5
- movdqa [esp+432-384], xmm5
- movdqa xmm5, [esp+432-112]
- pabsw xmm7, xmm7
- pcmpgtw xmm5, xmm7
- pabsw xmm6, xmm6
- movdqa xmm7, xmm4
- pcmpgtw xmm7, xmm6
-
- pand xmm5, xmm7
- movdqa xmm6, xmm3
- psubw xmm6, xmm2
- pabsw xmm6, xmm6
- movdqa xmm7, xmm4
- pcmpgtw xmm7, xmm6
- movdqa xmm6, [esp+432-400]
- pand xmm5, xmm7
- movdqa xmm7, xmm6
- pcmpeqw xmm6, xmm0
- pcmpgtw xmm7, xmm0
- por xmm7, xmm6
- pand xmm5, xmm7
- movdqa [esp+432-320], xmm5
- movd xmm5, edx
- movdqa xmm6, xmm5
- punpcklwd xmm6, xmm5
- pshufd xmm5, xmm6, 0
- movdqa [esp+432-336], xmm5
- movdqa xmm5, [esp+432-224]
- movdqa [esp+432-368], xmm5
- movdqa xmm6, xmm0
- psubw xmm6, xmm5
- movdqa xmm5, [esp+432-384]
- psllw xmm5, 2
- movdqa xmm7, xmm2
- psubw xmm7, [esp+432-240]
- paddw xmm7, xmm5
- paddw xmm7, [esp+432-336]
- movdqa xmm5, [esp+432-368]
- psraw xmm7, 3
- pmaxsw xmm6, xmm7
- pminsw xmm5, xmm6
-
- pand xmm5, [esp+432-320]
- movdqa xmm6, [esp+432-400]
- movdqa [esp+432-64], xmm5
- movdqa [esp+432-384], xmm6
- movdqa xmm5, xmm0
- psubw xmm5, xmm6
- movdqa [esp+432-368], xmm5
- movdqa xmm6, xmm5
- movdqa xmm5, [esp+432-272]
- paddw xmm5, [esp+432-304]
- movdqa xmm7, xmm2
- paddw xmm7, xmm2
- psubw xmm5, xmm7
- psraw xmm5, 1
- pmaxsw xmm6, xmm5
- movdqa xmm5, [esp+432-384]
- pminsw xmm5, xmm6
-
- pand xmm5, [esp+432-320]
- pand xmm5, [esp+432-288]
- movdqa xmm6, [esp+432-240]
- movdqa [esp+432-96], xmm5
- movdqa xmm5, [esp+432-352]
- paddw xmm5, [esp+432-304]
- movdqa xmm7, xmm6
- paddw xmm7, xmm6
- movdqa xmm6, [esp+432-368]
- psubw xmm5, xmm7
-
- movdqa xmm7, [esp+496-208]
- psraw xmm5, 1
- pmaxsw xmm6, xmm5
- movdqa xmm5, [esp+432-400]
- pminsw xmm5, xmm6
- pand xmm5, [esp+432-320]
- pand xmm5, [esp+432-256]
- movdqa xmm6, [esp+448-208]
- punpckhbw xmm7, xmm0
- movdqa [esp+432-352], xmm7
-
- movdqa xmm7, [esp+512-208]
- punpckhbw xmm6, xmm0
- movdqa [esp+432-48], xmm5
- movdqa xmm5, [esp+432-208]
- movdqa [esp+432-368], xmm6
- movdqa xmm6, [esp+464-208]
- punpckhbw xmm7, xmm0
- punpckhbw xmm5, xmm0
- movdqa [esp+432-384], xmm7
- punpckhbw xmm6, xmm0
- movdqa [esp+432-400], xmm6
-
- movdqa xmm7, [esp+432-400]
- movdqa xmm6, [esp+480-208]
- psubw xmm7, xmm5
- movdqa [esp+432-16], xmm5
- pabsw xmm7, xmm7
- punpckhbw xmm6, xmm0
- movdqa xmm5, xmm4
- pcmpgtw xmm5, xmm7
- movdqa [esp+432-288], xmm5
-
- movdqa xmm7, xmm6
- psubw xmm7, [esp+432-384]
- pabsw xmm7, xmm7
- movdqa xmm5, xmm4
- pcmpgtw xmm5, xmm7
- movdqa [esp+432-256], xmm5
-
- movdqa xmm5, [esp+432-400]
- movdqa [esp+432-80], xmm6
- pavgw xmm5, xmm6
- movdqa [esp+432-304], xmm5
-
- movdqa xmm5, xmm1
- psubw xmm5, [esp+432-288]
- psubw xmm5, [esp+432-256]
- movdqa [esp+432-224], xmm5
- movdqa xmm5, xmm6
- psubw xmm5, [esp+432-400]
- psubw xmm6, [esp+432-352]
- movdqa [esp+432-272], xmm5
- movdqa xmm7, xmm5
- movdqa xmm5, [esp+432-112]
- pabsw xmm7, xmm7
- pcmpgtw xmm5, xmm7
- movdqa xmm7, xmm4
- pabsw xmm6, xmm6
- pcmpgtw xmm7, xmm6
- movdqa xmm6, [esp+432-368]
-
- pand xmm5, xmm7
- movdqa xmm7, [esp+432-400]
- psubw xmm7, xmm6
- psubw xmm6, [esp+432-352]
- pabsw xmm7, xmm7
- pcmpgtw xmm4, xmm7
- pand xmm5, xmm4
-
- paddw xmm2, [esp+432-96]
- movdqa xmm4, xmm1
- pcmpgtw xmm4, xmm0
- movdqa xmm7, xmm1
- pcmpeqw xmm7, xmm0
- por xmm4, xmm7
- pand xmm5, xmm4
- movdqa xmm4, [esp+432-224]
- movdqa [esp+432-320], xmm5
- movdqa xmm5, [esp+432-272]
- movdqa xmm7, xmm0
- psubw xmm7, xmm4
- psubw xmm0, xmm1
- psllw xmm5, 2
- paddw xmm6, xmm5
- paddw xmm6, [esp+432-336]
- movdqa xmm5, [esp+432-368]
- movdqa [esp+432-336], xmm0
- psraw xmm6, 3
- pmaxsw xmm7, xmm6
- pminsw xmm4, xmm7
- pand xmm4, [esp+432-320]
- movdqa xmm6, xmm0
- movdqa xmm0, [esp+432-16]
- paddw xmm0, [esp+432-304]
- movdqa [esp+432-272], xmm4
- movdqa xmm4, [esp+432-368]
- paddw xmm4, xmm4
- psubw xmm0, xmm4
-
- movdqa xmm4, [esp+432-64]
- psraw xmm0, 1
- pmaxsw xmm6, xmm0
- movdqa xmm0, [esp+432-400]
- movdqa xmm7, xmm1
- pminsw xmm7, xmm6
- movdqa xmm6, [esp+432-320]
- pand xmm7, xmm6
- pand xmm7, [esp+432-288]
- paddw xmm5, xmm7
- packuswb xmm2, xmm5
- movdqa xmm5, [esp+432-272]
- paddw xmm0, xmm5
- paddw xmm3, xmm4
- packuswb xmm3, xmm0
-
- movdqa xmm0, [esp+432-32]
- psubw xmm0, xmm4
- movdqa xmm4, [esp+432-80]
- psubw xmm4, xmm5
-
- movdqa xmm5, [esp+432-240]
- paddw xmm5, [esp+432-48]
- packuswb xmm0, xmm4
- movdqa xmm4, [esp+432-384]
- paddw xmm4, [esp+432-304]
- movdqa [esp+480-208], xmm0
- movdqa xmm0, [esp+432-352]
- movdqa xmm7, xmm0
- paddw xmm0, xmm0
-
- mov ecx, dword [esp+432-408]
-
- mov edx, dword [esp+432-404]
- psubw xmm4, xmm0
- movdqa xmm0, [esp+432-336]
- movdqa [edi], xmm2
- psraw xmm4, 1
- pmaxsw xmm0, xmm4
- pminsw xmm1, xmm0
- movdqa xmm0, [esp+480-208]
-
- pop edi
- pand xmm1, xmm6
- pand xmm1, [esp+428-256]
- movdqa [ecx], xmm3
- paddw xmm7, xmm1
- pop esi
- packuswb xmm5, xmm7
- movdqa [eax], xmm0
- movdqa [edx], xmm5
- pop ebx
- mov esp, ebp
- pop ebp
- ret
-
-
-;*******************************************************************************
-; void DeblockLumaEq4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
-; int32_t iBeta)
-;*******************************************************************************
-; Reads the eight 16-byte rows pPix-4*iStride .. pPix+3*iStride and rewrites the six rows pPix-3*iStride .. pPix+2*iStride; every row access is movdqa, so each row address must be 16-byte aligned.
-WELS_EXTERN DeblockLumaEq4V_sse2
-
-ALIGN 16
-
-DeblockLumaEq4V_sse2:
-
- push ebp
- mov ebp, esp ; ebp frame: stack arguments start at [ebp+8]
- and esp, -16 ; fffffff0H: align esp down to 16 bytes so the movdqa spill slots are aligned
- sub esp, 628 ; 00000274H: 628 + 3 pushes (ebx, esi, edi = 12 bytes) = 640, so [esp+640-k] is (aligned base - k) once edi is pushed
- mov eax, dword [ebp+8] ; eax = pPix
- mov ecx, dword [ebp+12] ; ecx = iStride
- push ebx
- push esi
-
- lea edx, [ecx*4] ; edx = 4*iStride
- pxor xmm0, xmm0
- movdqa xmm2, xmm0 ; xmm2 = 0: zero operand for the punpck?bw byte->word widening below
-
- movdqa xmm0, [ecx+eax] ; row pPix + iStride
- mov esi, eax
- sub esi, edx ; esi = pPix - 4*iStride
- movdqa xmm3, [esi] ; row pPix - 4*iStride
- movdqa xmm5, [eax] ; row pPix
- push edi
- lea edi, [ecx+ecx] ; edi = 2*iStride
- lea ebx, [ecx+ecx*2] ; ebx = 3*iStride
- mov dword [esp+640-600], edi ; spill 2*iStride
- mov esi, eax
- sub esi, edi ; esi = pPix - 2*iStride (kept as a store address for the end)
- movdqa xmm1, [esi] ; row pPix - 2*iStride
- movdqa [esp+720-272], xmm0
- mov edi, eax
- sub edi, ecx ; edi = pPix - iStride (kept as a store address for the end)
- movdqa xmm4, [edi] ; row pPix - iStride
- add ecx, eax ; ecx = pPix + iStride
- mov dword [esp+640-596], ecx ; spill the address pPix + iStride
-
- mov ecx, dword [esp+640-600] ; ecx = 2*iStride
- movdqa xmm0, [ecx+eax] ; row pPix + 2*iStride
- movdqa [esp+736-272], xmm0
-
- movdqa xmm0, [eax+ebx] ; row pPix + 3*iStride
- mov edx, eax
- sub edx, ebx ; edx = pPix - 3*iStride (kept as a store address for the end)
-
- movsx ebx, word [ebp+16] ; low 16 bits of iAlpha, sign-extended
- movdqa xmm6, [edx] ; row pPix - 3*iStride
- add ecx, eax ; ecx = pPix + 2*iStride (kept as a store address for the end)
- movdqa [esp+752-272], xmm0
- movd xmm0, ebx
-
- movsx ebx, word [ebp+20] ; low 16 bits of iBeta, sign-extended
- movdqa xmm7, xmm0
- punpcklwd xmm7, xmm0
- pshufd xmm0, xmm7, 0 ; alpha replicated into all 8 words
- movdqa [esp+640-320], xmm0 ; slot 320 = alpha words
- movd xmm0, ebx
- movdqa xmm7, xmm0
- punpcklwd xmm7, xmm0
- pshufd xmm0, xmm7, 0 ; beta replicated into all 8 words
-
- movdqa xmm7, [esp+736-272]
- punpcklbw xmm7, xmm2
- movdqa [esp+640-416], xmm7
- movdqa [esp+640-512], xmm0 ; slot 512 = beta words
- movdqa xmm0, xmm1
- movdqa [esp+672-272], xmm1
- movdqa xmm1, xmm4
- movdqa [esp+704-272], xmm5
- punpcklbw xmm5, xmm2
- punpcklbw xmm1, xmm2
-
- movdqa xmm7, xmm5
- psubw xmm7, xmm1
- pabsw xmm7, xmm7 ; NOTE(review): pabsw is an SSSE3 instruction, so despite the _sse2 suffix this routine needs SSSE3 at run time
- movdqa [esp+640-560], xmm7
- punpcklbw xmm0, xmm2
- movdqa [esp+688-272], xmm4
- movdqa xmm4, [esp+720-272]
- movdqa [esp+640-480], xmm0
-
- movdqa xmm7, xmm1
- psubw xmm7, xmm0
-
- movdqa xmm0, [esp+640-512]
- pabsw xmm7, xmm7
- punpcklbw xmm4, xmm2
- pcmpgtw xmm0, xmm7
- movdqa [esp+640-384], xmm4
- movdqa xmm7, xmm5
- psubw xmm7, xmm4
- movdqa xmm4, [esp+640-512]
- movdqa [esp+656-272], xmm6
- punpcklbw xmm6, xmm2
- pabsw xmm7, xmm7
- movdqa [esp+640-48], xmm2 ; keep a zero vector in memory; xmm2 is reused as a work register later
- movdqa [esp+640-368], xmm6
- movdqa [esp+640-144], xmm1
- movdqa [esp+640-400], xmm5
- pcmpgtw xmm4, xmm7
- pand xmm0, xmm4
- movdqa xmm4, [esp+640-320]
- pcmpgtw xmm4, [esp+640-560]
- pand xmm0, xmm4
-
- mov ebx, 2
- movsx ebx, bx
- movd xmm4, ebx
- movdqa xmm7, xmm4
- punpcklwd xmm7, xmm4
- movdqa xmm4, [esp+640-320]
- psraw xmm4, 2 ; alpha >> 2
- pshufd xmm7, xmm7, 0 ; constant 2 in all 8 words
- paddw xmm4, xmm7 ; (alpha >> 2) + 2
- movdqa [esp+640-576], xmm4
- pcmpgtw xmm4, [esp+640-560]
- movdqa [esp+640-560], xmm4
-
- movdqa xmm4, [esp+640-512]
- movdqa [esp+640-624], xmm7 ; slot 624 = constant 2 (added before the >>2 shifts)
- movdqa xmm7, xmm1
- psubw xmm7, xmm6
- pabsw xmm7, xmm7
- pcmpgtw xmm4, xmm7
-
- pand xmm4, [esp+640-560]
- movdqa [esp+640-544], xmm4
- movdqa xmm4, [esp+640-512]
- movdqa xmm7, xmm5
- psubw xmm7, [esp+640-416]
- pabsw xmm7, xmm7
- pcmpgtw xmm4, xmm7
-
- pand xmm4, [esp+640-560]
- movdqa [esp+640-560], xmm4
-
- movdqa xmm4, [esp+640-544]
- pandn xmm4, xmm6
- movdqa [esp+640-16], xmm4
- mov ebx, 4
- movsx ebx, bx
- movd xmm4, ebx
- movdqa xmm7, xmm4
- punpcklwd xmm7, xmm4
- movdqa xmm4, xmm3
- punpcklbw xmm4, xmm2
- psllw xmm4, 1
- paddw xmm4, xmm6
- paddw xmm4, xmm6
- paddw xmm4, xmm6
- paddw xmm4, [esp+640-480]
-
- movdqa xmm6, [esp+640-560]
- pshufd xmm7, xmm7, 0 ; constant 4 in all 8 words
- paddw xmm4, xmm1
- movdqa [esp+640-592], xmm7 ; slot 592 = constant 4 (added before the >>3 shifts)
- paddw xmm4, xmm5
- paddw xmm4, xmm7
- movdqa xmm7, [esp+640-416]
- pandn xmm6, xmm7
- movdqa [esp+640-80], xmm6
- movdqa xmm6, [esp+752-272]
- punpcklbw xmm6, xmm2
- psllw xmm6, 1
- paddw xmm6, xmm7
- paddw xmm6, xmm7
- paddw xmm6, xmm7
- paddw xmm6, [esp+640-384]
-
- movdqa xmm7, [esp+640-480]
- paddw xmm6, xmm5
- paddw xmm6, xmm1
- paddw xmm6, [esp+640-592]
- psraw xmm6, 3
- pand xmm6, [esp+640-560]
- movdqa [esp+640-112], xmm6
- movdqa xmm6, [esp+640-544]
- pandn xmm6, xmm7
- movdqa [esp+640-336], xmm6
- movdqa xmm6, [esp+640-544]
- movdqa [esp+640-528], xmm6
- movdqa xmm6, [esp+640-368]
- paddw xmm6, xmm7
- movdqa xmm7, xmm1
- psraw xmm4, 3
- pand xmm4, [esp+640-544]
- paddw xmm7, xmm5
- paddw xmm6, xmm7
- paddw xmm6, [esp+640-624]
- movdqa xmm7, [esp+640-528]
-
- paddw xmm5, xmm1
- psraw xmm6, 2
- pand xmm7, xmm6
-
- movdqa xmm6, [esp+640-384]
- movdqa [esp+640-64], xmm7
- movdqa xmm7, [esp+640-560]
- pandn xmm7, xmm6
- movdqa [esp+640-304], xmm7
- movdqa xmm7, [esp+640-560]
- movdqa [esp+640-528], xmm7
- movdqa xmm7, [esp+640-416]
- paddw xmm7, xmm6
- paddw xmm7, xmm5
- paddw xmm7, [esp+640-624]
- movdqa xmm5, [esp+640-528]
- psraw xmm7, 2
- pand xmm5, xmm7
- movdqa [esp+640-32], xmm5
-
- movdqa xmm5, [esp+640-544]
- movdqa [esp+640-528], xmm5
- movdqa xmm5, [esp+640-480]
- movdqa xmm7, xmm5
- paddw xmm7, xmm5
- movdqa xmm5, xmm1
- paddw xmm5, xmm6
- paddw xmm6, [esp+640-592]
- paddw xmm7, xmm5
- paddw xmm7, [esp+640-624]
- movdqa xmm5, [esp+640-528]
- psraw xmm7, 2
- pandn xmm5, xmm7
- movdqa xmm7, [esp+640-480]
- paddw xmm7, xmm1
- paddw xmm7, [esp+640-400]
- movdqa xmm1, [esp+640-544]
- movdqa [esp+640-352], xmm5
- movdqa xmm5, [esp+640-368]
- psllw xmm7, 1
- paddw xmm7, xmm6
- paddw xmm5, xmm7
-
- movdqa xmm7, [esp+640-400]
- psraw xmm5, 3
- pand xmm1, xmm5
- movdqa xmm5, [esp+640-480]
- movdqa [esp+640-96], xmm1
- movdqa xmm1, [esp+640-560]
- movdqa [esp+640-528], xmm1
- movdqa xmm1, [esp+640-384]
- movdqa xmm6, xmm1
- paddw xmm6, xmm1
- paddw xmm1, [esp+640-400]
- paddw xmm1, [esp+640-144]
- paddw xmm7, xmm5
- paddw xmm5, [esp+640-592]
- paddw xmm6, xmm7
- paddw xmm6, [esp+640-624]
- movdqa xmm7, [esp+640-528]
- psraw xmm6, 2
- psllw xmm1, 1
- paddw xmm1, xmm5
-
- movdqa xmm5, [esp+656-272]
- pandn xmm7, xmm6
- movdqa xmm6, [esp+640-416]
- paddw xmm6, xmm1
- movdqa xmm1, [esp+640-560]
- psraw xmm6, 3
- pand xmm1, xmm6
-
- movdqa xmm6, [esp+704-272]
- movdqa [esp+640-128], xmm1
- movdqa xmm1, [esp+672-272]
- punpckhbw xmm1, xmm2 ; from here on the high 8 bytes of each saved row are widened and processed
- movdqa [esp+640-448], xmm1
- movdqa xmm1, [esp+688-272]
- punpckhbw xmm1, xmm2
- punpckhbw xmm6, xmm2
- movdqa [esp+640-288], xmm7
- punpckhbw xmm5, xmm2
- movdqa [esp+640-496], xmm1
- movdqa [esp+640-432], xmm6
-
- movdqa xmm7, [esp+720-272]
- punpckhbw xmm7, xmm2
- movdqa [esp+640-464], xmm7
-
- movdqa xmm7, [esp+736-272]
- punpckhbw xmm7, xmm2
- movdqa [esp+640-528], xmm7
-
- movdqa xmm7, xmm6
-
- psubw xmm6, [esp+640-464]
- psubw xmm7, xmm1
- pabsw xmm7, xmm7
- movdqa [esp+640-560], xmm7
- por xmm4, [esp+640-16]
- pabsw xmm6, xmm6
- movdqa xmm7, xmm1
- psubw xmm7, [esp+640-448]
-
- movdqa xmm1, [esp+640-512]
- pabsw xmm7, xmm7
- pcmpgtw xmm1, xmm7
- movdqa xmm7, [esp+640-512]
- pcmpgtw xmm7, xmm6
- movdqa xmm6, [esp+640-320]
- pand xmm1, xmm7
- movdqa xmm7, [esp+640-560]
- pcmpgtw xmm6, xmm7
- pand xmm1, xmm6
-
- movdqa xmm6, [esp+640-576]
- pcmpgtw xmm6, xmm7
-
- movdqa xmm7, [esp+640-496]
- punpckhbw xmm3, xmm2
- movdqa [esp+640-560], xmm6
- movdqa xmm6, [esp+640-512]
- psubw xmm7, xmm5
- pabsw xmm7, xmm7
- pcmpgtw xmm6, xmm7
-
- pand xmm6, [esp+640-560]
- movdqa xmm7, [esp+640-432]
- psubw xmm7, [esp+640-528]
-
- psllw xmm3, 1
- movdqa [esp+640-544], xmm6
- movdqa xmm6, [esp+640-512]
-
- movdqa xmm2, [esp+640-544] ; xmm2 stops being the zero register here (the zero vector lives in slot 48)
- paddw xmm3, xmm5
- paddw xmm3, xmm5
- paddw xmm3, xmm5
- paddw xmm3, [esp+640-448]
- paddw xmm3, [esp+640-496]
- pabsw xmm7, xmm7
- pcmpgtw xmm6, xmm7
- pand xmm6, [esp+640-560]
- movdqa [esp+640-560], xmm6
-
- movdqa xmm6, xmm0
- pand xmm6, xmm4
- movdqa xmm4, xmm0
- pandn xmm4, [esp+640-368]
- por xmm6, xmm4
- movdqa xmm4, [esp+640-432]
- paddw xmm3, xmm4
- paddw xmm3, [esp+640-592]
- psraw xmm3, 3
- pand xmm3, xmm2
- pandn xmm2, xmm5
- por xmm3, xmm2
- movdqa xmm7, xmm1
- pand xmm7, xmm3
- movdqa xmm3, [esp+640-64]
- por xmm3, [esp+640-336]
- movdqa xmm2, xmm1
- pandn xmm2, xmm5
- por xmm7, xmm2
-
- movdqa xmm2, xmm0
- pand xmm2, xmm3
- movdqa xmm3, xmm0
- pandn xmm3, [esp+640-480]
- por xmm2, xmm3
- packuswb xmm6, xmm7 ; narrow low-half and high-half word results back into one 16-byte row
- movdqa [esp+640-336], xmm2
- movdqa [esp+656-272], xmm6
- movdqa xmm6, [esp+640-544]
- movdqa xmm2, xmm5
- paddw xmm2, [esp+640-448]
- movdqa xmm3, xmm1
- movdqa xmm7, [esp+640-496]
- paddw xmm7, xmm4
- paddw xmm2, xmm7
- paddw xmm2, [esp+640-624]
- movdqa xmm7, [esp+640-544]
- psraw xmm2, 2
- pand xmm6, xmm2
- movdqa xmm2, [esp+640-448]
- pandn xmm7, xmm2
- por xmm6, xmm7
- pand xmm3, xmm6
- movdqa xmm6, xmm1
- pandn xmm6, xmm2
- paddw xmm2, [esp+640-496]
- paddw xmm2, xmm4
- por xmm3, xmm6
- movdqa xmm6, [esp+640-336]
- packuswb xmm6, xmm3
- psllw xmm2, 1
- movdqa [esp+672-272], xmm6
- movdqa xmm6, [esp+640-96]
- por xmm6, [esp+640-352]
-
- movdqa xmm3, xmm0
- pand xmm3, xmm6
- movdqa xmm6, xmm0
- pandn xmm6, [esp+640-144]
- por xmm3, xmm6
- movdqa xmm6, [esp+640-544]
- movdqa [esp+640-352], xmm3
- movdqa xmm3, [esp+640-464]
- paddw xmm3, [esp+640-592]
- paddw xmm2, xmm3
- movdqa xmm3, [esp+640-448]
- paddw xmm5, xmm2
- movdqa xmm2, [esp+640-496]
- psraw xmm5, 3
- pand xmm6, xmm5
- movdqa xmm5, [esp+640-464]
- paddw xmm2, xmm5
- paddw xmm5, [esp+640-432]
- movdqa xmm4, xmm3
- paddw xmm4, xmm3
- paddw xmm4, xmm2
- paddw xmm4, [esp+640-624]
- movdqa xmm2, [esp+640-544]
- paddw xmm3, [esp+640-592]
- psraw xmm4, 2
- pandn xmm2, xmm4
- por xmm6, xmm2
- movdqa xmm7, xmm1
- pand xmm7, xmm6
- movdqa xmm6, [esp+640-496]
- movdqa xmm2, xmm1
- pandn xmm2, xmm6
- por xmm7, xmm2
- movdqa xmm2, [esp+640-352]
- packuswb xmm2, xmm7
- movdqa [esp+688-272], xmm2
- movdqa xmm2, [esp+640-128]
- por xmm2, [esp+640-288]
-
- movdqa xmm4, xmm0
- pand xmm4, xmm2
- paddw xmm5, xmm6
- movdqa xmm2, xmm0
- pandn xmm2, [esp+640-400]
- por xmm4, xmm2
- movdqa xmm2, [esp+640-528]
- psllw xmm5, 1
- paddw xmm5, xmm3
- movdqa xmm3, [esp+640-560]
- paddw xmm2, xmm5
- psraw xmm2, 3
- movdqa [esp+640-288], xmm4
- movdqa xmm4, [esp+640-560]
- pand xmm4, xmm2
- movdqa xmm2, [esp+640-464]
- movdqa xmm5, xmm2
- paddw xmm5, xmm2
- movdqa xmm2, [esp+640-432]
- paddw xmm2, [esp+640-448]
- movdqa xmm7, xmm1
- paddw xmm5, xmm2
- paddw xmm5, [esp+640-624]
- movdqa xmm6, [esp+640-560]
- psraw xmm5, 2
- pandn xmm3, xmm5
- por xmm4, xmm3
- movdqa xmm3, [esp+640-32]
- por xmm3, [esp+640-304]
- pand xmm7, xmm4
- movdqa xmm4, [esp+640-432]
- movdqa xmm5, [esp+640-464]
- movdqa xmm2, xmm1
- pandn xmm2, xmm4
- paddw xmm4, [esp+640-496]
- por xmm7, xmm2
- movdqa xmm2, [esp+640-288]
- packuswb xmm2, xmm7
- movdqa [esp+704-272], xmm2
-
- movdqa xmm2, xmm0
- pand xmm2, xmm3
- movdqa xmm3, xmm0
- pandn xmm3, [esp+640-384]
- por xmm2, xmm3
- movdqa [esp+640-304], xmm2
- movdqa xmm2, [esp+640-528]
- movdqa xmm3, xmm2
- paddw xmm3, [esp+640-464]
- paddw xmm3, xmm4
- paddw xmm3, [esp+640-624]
- psraw xmm3, 2
- pand xmm6, xmm3
- movdqa xmm3, [esp+640-560]
- movdqa xmm4, xmm3
- pandn xmm4, xmm5
- por xmm6, xmm4
- movdqa xmm7, xmm1
- pand xmm7, xmm6
- movdqa xmm6, [esp+640-304]
- movdqa xmm4, xmm1
- pandn xmm4, xmm5
- por xmm7, xmm4
-
- movdqa xmm4, xmm0
- pandn xmm0, [esp+640-416]
- packuswb xmm6, xmm7
- movdqa xmm7, [esp+640-112]
- por xmm7, [esp+640-80]
- pand xmm4, xmm7
- por xmm4, xmm0
- movdqa xmm0, [esp+752-272]
- punpckhbw xmm0, [esp+640-48] ; widen against the zero vector saved in slot 48
- psllw xmm0, 1
- paddw xmm0, xmm2
- paddw xmm0, xmm2
- paddw xmm0, xmm2
- paddw xmm0, xmm5
- paddw xmm0, [esp+640-432]
- paddw xmm0, [esp+640-496]
- paddw xmm0, [esp+640-592]
- psraw xmm0, 3
- pand xmm0, xmm3
- movdqa xmm7, xmm1
- pandn xmm3, xmm2
- por xmm0, xmm3
- pand xmm7, xmm0
-
- movdqa xmm0, [esp+656-272]
- movdqa [edx], xmm0 ; store row pPix - 3*iStride
-
- movdqa xmm0, [esp+672-272]
-
- mov edx, dword [esp+640-596] ; edx = pPix + iStride; read before the pops move esp
- movdqa [esi], xmm0 ; store row pPix - 2*iStride
- movdqa xmm0, [esp+688-272]
- movdqa [edi], xmm0 ; store row pPix - iStride
- movdqa xmm0, [esp+704-272] ; last esp-relative access: the pops below change esp
-
- pop edi
- pandn xmm1, xmm2
- movdqa [eax], xmm0 ; store row pPix
- por xmm7, xmm1
- pop esi
- packuswb xmm4, xmm7
- movdqa [edx], xmm6 ; store row pPix + iStride
- movdqa [ecx], xmm4 ; store row pPix + 2*iStride
- pop ebx
- mov esp, ebp ; drop the aligned scratch frame
- pop ebp
- ret
-
-
-;********************************************************************************
-;
-; void DeblockLumaTransposeH2V_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pDst);
-; Loads 8 bytes from each of the 16 rows pPixY + k*iStride (k = 0..15), pairing row k with row k+8 in one xmm register, runs SSE2_TransTwo8x8B and stores eight 16-byte vectors to pDst (movdqa: pDst must be 16-byte aligned).
-;********************************************************************************
-
-WELS_EXTERN DeblockLumaTransposeH2V_sse2
-
-ALIGN 16
-
-DeblockLumaTransposeH2V_sse2:
- push ebp
- push ebx
- mov ebp, esp ; frame is set after two pushes, so the arguments start at [ebp+0Ch]
- and esp,0FFFFFFF0h ; align esp down to 16 bytes
- sub esp, 10h ; one aligned 16-byte scratch slot at [esp]
-
- mov eax, [ebp + 0Ch] ; eax = pPixY
- mov ecx, [ebp + 10h] ; ecx = iStride
- lea edx, [eax + ecx * 8] ; edx = address of row 8
- lea ebx, [ecx*3] ; ebx = 3*iStride
-
- movq xmm0, [eax] ; row 0 (8 bytes) -> low qword
- movq xmm7, [edx] ; row 8
- punpcklqdq xmm0, xmm7 ; xmm0 = row 0 | row 8
- movq xmm1, [eax + ecx]
- movq xmm7, [edx + ecx]
- punpcklqdq xmm1, xmm7 ; xmm1 = row 1 | row 9
- movq xmm2, [eax + ecx*2]
- movq xmm7, [edx + ecx*2]
- punpcklqdq xmm2, xmm7 ; xmm2 = row 2 | row 10
- movq xmm3, [eax + ebx]
- movq xmm7, [edx + ebx]
- punpcklqdq xmm3, xmm7 ; xmm3 = row 3 | row 11
-
- lea eax, [eax + ecx * 4] ; advance both row pointers by 4 rows
- lea edx, [edx + ecx * 4]
- movq xmm4, [eax]
- movq xmm7, [edx]
- punpcklqdq xmm4, xmm7 ; xmm4 = row 4 | row 12
- movq xmm5, [eax + ecx]
- movq xmm7, [edx + ecx]
- punpcklqdq xmm5, xmm7 ; xmm5 = row 5 | row 13
- movq xmm6, [eax + ecx*2]
- movq xmm7, [edx + ecx*2]
- punpcklqdq xmm6, xmm7 ; xmm6 = row 6 | row 14
-
- movdqa [esp], xmm0 ; spill xmm0: all 8 xmm registers are live, so xmm0 is borrowed as the temporary for row 15
- movq xmm7, [eax + ebx]
- movq xmm0, [edx + ebx]
- punpcklqdq xmm7, xmm0 ; xmm7 = row 7 | row 15
- movdqa xmm0, [esp] ; restore row 0 | row 8
-
- SSE2_TransTwo8x8B xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp] ; macro defined outside this file; [esp] is passed as its scratch slot
- ;pOut: m5, m3, m4, m8, m6, m2, m7, m1
-
- mov eax, [ebp + 14h] ; eax = pDst
- movdqa [eax], xmm4 ; stores follow the pOut register order listed above
- movdqa [eax + 10h], xmm2
- movdqa [eax + 20h], xmm3
- movdqa [eax + 30h], xmm7
- movdqa [eax + 40h], xmm5
- movdqa [eax + 50h], xmm1
- movdqa [eax + 60h], xmm6
- movdqa [eax + 70h], xmm0
-
- mov esp, ebp ; drop the aligned scratch slot
- pop ebx
- pop ebp
- ret
-
-
-
-;*******************************************************************************************
-;
-; void DeblockLumaTransposeV2H_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pSrc);
-; Loads eight 16-byte vectors from pSrc (movdqa: pSrc must be 16-byte aligned), runs SSE2_TransTwo8x8B and writes 8 bytes to each of the 16 rows pPixY + k*iStride (k = 0..15).
-;*******************************************************************************************
-
-WELS_EXTERN DeblockLumaTransposeV2H_sse2
-
-ALIGN 16
-
-DeblockLumaTransposeV2H_sse2:
- push ebp
- mov ebp, esp
-
- and esp, 0FFFFFFF0h ; align esp down to 16 bytes
- sub esp, 10h ; one aligned 16-byte scratch slot at [esp] for the transpose macro
-
- mov eax, [ebp + 10h] ; eax = pSrc
- mov ecx, [ebp + 0Ch] ; ecx = iStride
- mov edx, [ebp + 08h] ; edx = pPixY
-
- movdqa xmm0, [eax]
- movdqa xmm1, [eax + 10h]
- movdqa xmm2, [eax + 20h]
- movdqa xmm3, [eax + 30h]
- movdqa xmm4, [eax + 40h]
- movdqa xmm5, [eax + 50h]
- movdqa xmm6, [eax + 60h]
- movdqa xmm7, [eax + 70h]
-
- SSE2_TransTwo8x8B xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp] ; macro defined outside this file; [esp] is passed as its scratch slot
- ;pOut: m5, m3, m4, m8, m6, m2, m7, m1
-
- lea eax, [ecx * 3] ; eax = 3*iStride
-
- movq [edx], xmm4 ; rows 0..3: low 8 bytes of each result register
- movq [edx + ecx], xmm2
- movq [edx + ecx*2], xmm3
- movq [edx + eax], xmm7
-
- lea edx, [edx + ecx*4] ; rows 4..7
- movq [edx], xmm5
- movq [edx + ecx], xmm1
- movq [edx + ecx*2], xmm6
- movq [edx + eax], xmm0
-
- psrldq xmm4, 8 ; shift the high 8 bytes of every register down for rows 8..15
- psrldq xmm2, 8
- psrldq xmm3, 8
- psrldq xmm7, 8
- psrldq xmm5, 8
- psrldq xmm1, 8
- psrldq xmm6, 8
- psrldq xmm0, 8
-
- lea edx, [edx + ecx*4] ; rows 8..11
- movq [edx], xmm4
- movq [edx + ecx], xmm2
- movq [edx + ecx*2], xmm3
- movq [edx + eax], xmm7
-
- lea edx, [edx + ecx*4] ; rows 12..15
- movq [edx], xmm5
- movq [edx + ecx], xmm1
- movq [edx + ecx*2], xmm6
- movq [edx + eax], xmm0
-
-
- mov esp, ebp ; drop the aligned scratch slot
- pop ebp
- ret
\ No newline at end of file
--- a/codec/encoder/core/asm/expand_picture.asm
+++ /dev/null
@@ -1,653 +1,0 @@
-;*!
-;* \copy
-;* Copyright (c) 2009-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* expand_picture.asm
-;*
-;* Abstract
-;* mmxext/sse for expand_frame
-;*
-;* History
-;* 09/25/2009 Created
-;*
-;*
-;*************************************************************************/
-
-%include "asm_inc.asm"
-
-BITS 32
-
-;***********************************************************************
-; Macros and other preprocessor constants
-;***********************************************************************
-
-;***********************************************************************
-; Local Data (Read Only)
-;***********************************************************************
-
-;SECTION .rodata pData align=16
-
-;***********************************************************************
-; Various memory constants (trigonometric values or rounding values)
-;***********************************************************************
-;%define PADDING_SIZE_ASM 32 ; PADDING_LENGTH
-
-;***********************************************************************
-; Code
-;***********************************************************************
-
-
-
-SECTION .text
-
-WELS_EXTERN ExpandPictureLuma_sse2
-WELS_EXTERN ExpandPictureChromaAlign_sse2 ; for chroma alignment
-WELS_EXTERN ExpandPictureChromaUnalign_sse2 ; for chroma unalignment
-
-;;;;;;;expanding result;;;;;;;
-
-;aaaa|attttttttttttttttb|bbbb
-;aaaa|attttttttttttttttb|bbbb
-;aaaa|attttttttttttttttb|bbbb
-;aaaa|attttttttttttttttb|bbbb
-;----------------------------
-;aaaa|attttttttttttttttb|bbbb
-;llll|l r|rrrr
-;llll|l r|rrrr
-;llll|l r|rrrr
-;llll|l r|rrrr
-;llll|l r|rrrr
-;cccc|ceeeeeeeeeeeeeeeed|dddd
-;----------------------------
-;cccc|ceeeeeeeeeeeeeeeed|dddd
-;cccc|ceeeeeeeeeeeeeeeed|dddd
-;cccc|ceeeeeeeeeeeeeeeed|dddd
-;cccc|ceeeeeeeeeeeeeeeed|dddd
-
-%macro mov_line_8x4_mmx 3 ; args: dst (pointer reg), stride (reg), mm? (8-byte source); stores it to 4 consecutive rows and leaves dst 4 rows further on
- movq [%1], %3 ; row 0
- movq [%1+%2], %3 ; row 1
- lea %1, [%1+2*%2]
- movq [%1], %3 ; row 2
- movq [%1+%2], %3 ; row 3
- lea %1, [%1+2*%2] ; dst now points at row 4, ready for the next group
-%endmacro
-
-%macro mov_line_end8x4_mmx 3 ; args: dst (pointer reg), stride (reg), mm? (8-byte source); final group: same 4 stores, but dst stops on the last row written
- movq [%1], %3 ; row 0
- movq [%1+%2], %3 ; row 1
- lea %1, [%1+2*%2]
- movq [%1], %3 ; row 2
- movq [%1+%2], %3 ; row 3
- lea %1, [%1+%2] ; advance by one stride only: dst = row 3
-%endmacro
-
-%macro mov_line_16x4_sse2 4 ; args: dst (pointer reg), stride (reg), xmm? (16-byte source), u/a (pasted onto "movdq" to pick movdqu or movdqa)
- movdq%4 [%1], %3 ; top(bottom)_0
- movdq%4 [%1+%2], %3 ; top(bottom)_1
- lea %1, [%1+2*%2]
- movdq%4 [%1], %3 ; top(bottom)_2
- movdq%4 [%1+%2], %3 ; top(bottom)_3
- lea %1, [%1+2*%2] ; dst now points 4 rows on, ready for the next group
-%endmacro
-
-%macro mov_line_end16x4_sse2 4 ; args: dst (pointer reg), stride (reg), xmm? (16-byte source), u/a; final group: dst stops on the last row written
- movdq%4 [%1], %3 ; top(bottom)_0
- movdq%4 [%1+%2], %3 ; top(bottom)_1
- lea %1, [%1+2*%2]
- movdq%4 [%1], %3 ; top(bottom)_2
- movdq%4 [%1+%2], %3 ; top(bottom)_3
- lea %1, [%1+%2] ; advance by one stride only: dst = row 3
-%endmacro
-
-%macro mov_line_32x4_sse2 3 ; args: dst (pointer reg), stride (reg), xmm? (source); writes 32 bytes (two aligned 16-byte stores) to each of 4 rows
- movdqa [%1], %3 ; top(bottom)_0
- movdqa [%1+16], %3 ; top(bottom)_0
- movdqa [%1+%2], %3 ; top(bottom)_1
- movdqa [%1+%2+16], %3 ; top(bottom)_1
- lea %1, [%1+2*%2]
- movdqa [%1], %3 ; top(bottom)_2
- movdqa [%1+16], %3 ; top(bottom)_2
- movdqa [%1+%2], %3 ; top(bottom)_3
- movdqa [%1+%2+16], %3 ; top(bottom)_3
- lea %1, [%1+2*%2] ; dst now points 4 rows on, ready for the next group
-%endmacro
-
-%macro mov_line_end32x4_sse2 3 ; args: dst (pointer reg), stride (reg), xmm? (source); final group: dst stops on the last row written
- movdqa [%1], %3 ; top(bottom)_0
- movdqa [%1+16], %3 ; top(bottom)_0
- movdqa [%1+%2], %3 ; top(bottom)_1
- movdqa [%1+%2+16], %3 ; top(bottom)_1
- lea %1, [%1+2*%2]
- movdqa [%1], %3 ; top(bottom)_2
- movdqa [%1+16], %3 ; top(bottom)_2
- movdqa [%1+%2], %3 ; top(bottom)_3
- movdqa [%1+%2+16], %3 ; top(bottom)_3
- lea %1, [%1+%2] ; advance by one stride only: dst = row 3
-%endmacro
-
-%macro exp_top_bottom_sse2 1 ; iPaddingSize [luma(32)/chroma(16)]
- ; in: ebx = width in pixels (turned into a count of 16-byte columns below)
- ; in: esi = pSrc (first picture row), edi = pSrc - stride, ecx = -stride ; top
- ; in: eax = pSrc+(h-1)*stride (last picture row), ebp = pSrc+(h-1+iPaddingSize)*stride (outermost bottom row) ; bottom
-; Replicates the first picture row upwards and the last picture row downwards, iPaddingSize rows each, one 16-byte column per loop pass (aligned stores).
-%if %1 == 32 ; for luma
- sar ebx, 04h ; ebx = width / 16 = number of 16-byte columns; NOTE(review): a result of 0 would make the dec/jnz loop wrap around
-.top_bottom_loops:
- ; top
- movdqa xmm0, [esi] ; first line of picture pData
- mov_line_16x4_sse2 edi, ecx, xmm0, a ; dst, stride, xmm?
- mov_line_16x4_sse2 edi, ecx, xmm0, a
- mov_line_16x4_sse2 edi, ecx, xmm0, a
- mov_line_16x4_sse2 edi, ecx, xmm0, a
- mov_line_16x4_sse2 edi, ecx, xmm0, a ; dst, stride, xmm?
- mov_line_16x4_sse2 edi, ecx, xmm0, a
- mov_line_16x4_sse2 edi, ecx, xmm0, a
- mov_line_end16x4_sse2 edi, ecx, xmm0, a ; 8 groups x 4 rows = 32 rows; edi is left on the last row written
-
- ; bottom
- movdqa xmm1, [eax] ; last line of picture pData
- mov_line_16x4_sse2 ebp, ecx, xmm1, a ; dst, stride, xmm?
- mov_line_16x4_sse2 ebp, ecx, xmm1, a
- mov_line_16x4_sse2 ebp, ecx, xmm1, a
- mov_line_16x4_sse2 ebp, ecx, xmm1, a
- mov_line_16x4_sse2 ebp, ecx, xmm1, a ; dst, stride, xmm?
- mov_line_16x4_sse2 ebp, ecx, xmm1, a
- mov_line_16x4_sse2 ebp, ecx, xmm1, a
- mov_line_end16x4_sse2 ebp, ecx, xmm1, a ; ebp is left on the last row written
-
- lea esi, [esi+16] ; top pSrc
- lea edi, [edi+16] ; top dst
- lea eax, [eax+16] ; bottom pSrc
- lea ebp, [ebp+16] ; bottom dst
- neg ecx ; edi/ebp now sit at the opposite end of their borders, so the next column is filled in the reverse row direction
-
- dec ebx
- jnz near .top_bottom_loops
-%elif %1 == 16 ; for chroma ??
- mov edx, ebx ; keep the full width for the remainder test after the loop
- sar ebx, 04h ; (width / 16) pixels; NOTE(review): a result of 0 would make the dec/jnz loop wrap around
-.top_bottom_loops:
- ; top
- movdqa xmm0, [esi] ; first line of picture pData
- mov_line_16x4_sse2 edi, ecx, xmm0, a ; dst, stride, xmm?
- mov_line_16x4_sse2 edi, ecx, xmm0, a
- mov_line_16x4_sse2 edi, ecx, xmm0, a
- mov_line_end16x4_sse2 edi, ecx, xmm0, a ; 4 groups x 4 rows = 16 rows; edi is left on the last row written
-
- ; bottom
- movdqa xmm1, [eax] ; last line of picture pData
- mov_line_16x4_sse2 ebp, ecx, xmm1, a ; dst, stride, xmm?
- mov_line_16x4_sse2 ebp, ecx, xmm1, a
- mov_line_16x4_sse2 ebp, ecx, xmm1, a
- mov_line_end16x4_sse2 ebp, ecx, xmm1, a ; ebp is left on the last row written
-
- lea esi, [esi+16] ; top pSrc
- lea edi, [edi+16] ; top dst
- lea eax, [eax+16] ; bottom pSrc
- lea ebp, [ebp+16] ; bottom dst
- neg ecx ; edi/ebp now sit at the opposite end of their borders, so the next column is filled in the reverse row direction
-
- dec ebx
- jnz near .top_bottom_loops
-
- ; for remaining 8 bytes
- and edx, 0fh ; width mod 16: non-zero means a partial column is left (handled as 8 bytes)
- test edx, edx ; redundant: the AND above already set ZF
- jz near .to_be_continued ; no left to exit here
-
- ; top
- movq mm0, [esi] ; remained 8 byte
- mov_line_8x4_mmx edi, ecx, mm0 ; dst, stride, mm?
- mov_line_8x4_mmx edi, ecx, mm0 ; dst, stride, mm?
- mov_line_8x4_mmx edi, ecx, mm0 ; dst, stride, mm?
- mov_line_end8x4_mmx edi, ecx, mm0 ; dst, stride, mm?
- ; bottom
- movq mm1, [eax]
- mov_line_8x4_mmx ebp, ecx, mm1 ; dst, stride, mm?
- mov_line_8x4_mmx ebp, ecx, mm1 ; dst, stride, mm?
- mov_line_8x4_mmx ebp, ecx, mm1 ; dst, stride, mm?
- mov_line_end8x4_mmx ebp, ecx, mm1 ; dst, stride, mm?
- WELSEMMS ; macro defined outside this file; presumably issues emms after the MMX stores - confirm in asm_inc.asm
-
-.to_be_continued:
-%endif
-%endmacro
-
-%macro exp_left_right_sse2 2 ; iPaddingSize [luma(32)/chroma(16)], u/a (store kind for the right border in the chroma case)
- ; in: ecx = height (loop count, one picture row per pass)
- ; in: esi = pSrc (left edge pixel), edi = pSrc - iPaddingSize (left border dst), edx = stride ; left
- ; in: ebx = pSrc+(w-1) (right edge pixel), ebp = pSrc+w (right border dst) ; right
-; xor eax, eax ; for pixel pData (uint8_t) ; make sure eax=0 at least high 24 bits of eax = 0
-; butterfly_1to16_sse is defined outside this file; judging by its use here it fills the dst xmm with the byte held in the named register - confirm in asm_inc.asm
-%if %1 == 32 ; for luma
-.left_right_loops:
- ; left
- mov al, byte [esi] ; pixel pData for left border
- butterfly_1to16_sse xmm0, xmm1, a ; dst, tmp, pSrc [generic register name: a/b/c/d]
- movdqa [edi], xmm0 ; 32 border bytes = two aligned 16-byte stores
- movdqa [edi+16], xmm0
-
- ; right
- mov al, byte [ebx]
- butterfly_1to16_sse xmm1, xmm2, a ; dst, tmp, pSrc [generic register name: a/b/c/d]
- movdqa [ebp], xmm1
- movdqa [ebp+16], xmm1
-
- lea esi, [esi+edx] ; left pSrc
- lea edi, [edi+edx] ; left dst
- lea ebx, [ebx+edx] ; right pSrc
- lea ebp, [ebp+edx] ; right dst
-
- dec ecx
- jnz near .left_right_loops
-%elif %1 == 16 ; for chroma ??
-.left_right_loops:
- ; left
- mov al, byte [esi] ; pixel pData for left border
- butterfly_1to16_sse xmm0, xmm1, a ; dst, tmp, pSrc [generic register name: a/b/c/d]
- movdqa [edi], xmm0 ; left border is always written with an aligned store
-
- ; right
- mov al, byte [ebx]
- butterfly_1to16_sse xmm1, xmm2, a ; dst, tmp, pSrc [generic register name: a/b/c/d]
- movdq%2 [ebp], xmm1 ; might not be aligned 16 bytes in case chroma planes
-
- lea esi, [esi+edx] ; left pSrc
- lea edi, [edi+edx] ; left dst
- lea ebx, [ebx+edx] ; right pSrc
- lea ebp, [ebp+edx] ; right dst
-
- dec ecx
- jnz near .left_right_loops
-%endif
-%endmacro
-
-%macro exp_cross_sse2 2 ; iPaddingSize [luma(32)/chroma(16)], u/a (store kind for the two right-hand corners in the chroma case)
- ; top-left: (x)mm3, top-right: (x)mm4, bottom-left: (x)mm5, bottom-right: (x)mm6
- ; in (dst pointers): edi: TL, ebp: TR, eax: BL, ebx: BR; ecx: -stride, so every corner is filled from its given row towards lower addresses
-%if %1 == 32 ; luma
- ; TL
- mov_line_32x4_sse2 edi, ecx, xmm3 ; dst, stride, xmm?
- mov_line_32x4_sse2 edi, ecx, xmm3 ; dst, stride, xmm?
- mov_line_32x4_sse2 edi, ecx, xmm3 ; dst, stride, xmm?
- mov_line_32x4_sse2 edi, ecx, xmm3 ; dst, stride, xmm?
- mov_line_32x4_sse2 edi, ecx, xmm3 ; dst, stride, xmm?
- mov_line_32x4_sse2 edi, ecx, xmm3 ; dst, stride, xmm?
- mov_line_32x4_sse2 edi, ecx, xmm3 ; dst, stride, xmm?
- mov_line_end32x4_sse2 edi, ecx, xmm3 ; 8 groups x 4 rows = 32 rows of 32 bytes
-
- ; TR
- mov_line_32x4_sse2 ebp, ecx, xmm4 ; dst, stride, xmm?
- mov_line_32x4_sse2 ebp, ecx, xmm4 ; dst, stride, xmm?
- mov_line_32x4_sse2 ebp, ecx, xmm4 ; dst, stride, xmm?
- mov_line_32x4_sse2 ebp, ecx, xmm4 ; dst, stride, xmm?
- mov_line_32x4_sse2 ebp, ecx, xmm4 ; dst, stride, xmm?
- mov_line_32x4_sse2 ebp, ecx, xmm4 ; dst, stride, xmm?
- mov_line_32x4_sse2 ebp, ecx, xmm4 ; dst, stride, xmm?
- mov_line_end32x4_sse2 ebp, ecx, xmm4 ; dst, stride, xmm?
-
- ; BL
- mov_line_32x4_sse2 eax, ecx, xmm5 ; dst, stride, xmm?
- mov_line_32x4_sse2 eax, ecx, xmm5 ; dst, stride, xmm?
- mov_line_32x4_sse2 eax, ecx, xmm5 ; dst, stride, xmm?
- mov_line_32x4_sse2 eax, ecx, xmm5 ; dst, stride, xmm?
- mov_line_32x4_sse2 eax, ecx, xmm5 ; dst, stride, xmm?
- mov_line_32x4_sse2 eax, ecx, xmm5 ; dst, stride, xmm?
- mov_line_32x4_sse2 eax, ecx, xmm5 ; dst, stride, xmm?
- mov_line_end32x4_sse2 eax, ecx, xmm5 ; dst, stride, xmm?
-
- ; BR
- mov_line_32x4_sse2 ebx, ecx, xmm6 ; dst, stride, xmm?
- mov_line_32x4_sse2 ebx, ecx, xmm6 ; dst, stride, xmm?
- mov_line_32x4_sse2 ebx, ecx, xmm6 ; dst, stride, xmm?
- mov_line_32x4_sse2 ebx, ecx, xmm6 ; dst, stride, xmm?
- mov_line_32x4_sse2 ebx, ecx, xmm6 ; dst, stride, xmm?
- mov_line_32x4_sse2 ebx, ecx, xmm6 ; dst, stride, xmm?
- mov_line_32x4_sse2 ebx, ecx, xmm6 ; dst, stride, xmm?
- mov_line_end32x4_sse2 ebx, ecx, xmm6 ; dst, stride, xmm?
-%elif %1 == 16 ; chroma
- ; TL
- mov_line_16x4_sse2 edi, ecx, xmm3, a ; dst, stride, xmm?
- mov_line_16x4_sse2 edi, ecx, xmm3, a ; dst, stride, xmm?
- mov_line_16x4_sse2 edi, ecx, xmm3, a ; dst, stride, xmm?
- mov_line_end16x4_sse2 edi, ecx, xmm3, a ; 4 groups x 4 rows = 16 rows of 16 bytes; left corners always use aligned stores
-
- ; TR
- mov_line_16x4_sse2 ebp, ecx, xmm4, %2 ; dst, stride, xmm?
- mov_line_16x4_sse2 ebp, ecx, xmm4, %2 ; dst, stride, xmm?
- mov_line_16x4_sse2 ebp, ecx, xmm4, %2 ; dst, stride, xmm?
- mov_line_end16x4_sse2 ebp, ecx, xmm4, %2 ; right corners use the caller-selected u/a store
-
- ; BL
- mov_line_16x4_sse2 eax, ecx, xmm5, a ; dst, stride, xmm?
- mov_line_16x4_sse2 eax, ecx, xmm5, a ; dst, stride, xmm?
- mov_line_16x4_sse2 eax, ecx, xmm5, a ; dst, stride, xmm?
- mov_line_end16x4_sse2 eax, ecx, xmm5, a ; dst, stride, xmm?
-
- ; BR
- mov_line_16x4_sse2 ebx, ecx, xmm6, %2 ; dst, stride, xmm?
- mov_line_16x4_sse2 ebx, ecx, xmm6, %2 ; dst, stride, xmm?
- mov_line_16x4_sse2 ebx, ecx, xmm6, %2 ; dst, stride, xmm?
- mov_line_end16x4_sse2 ebx, ecx, xmm6, %2 ; dst, stride, xmm?
-%endif
-%endmacro
-
-ALIGN 16
-;***********************************************************************----------------
-; void ExpandPictureLuma_sse2( uint8_t *pDst,
-; const int32_t iStride,
-; const int32_t iWidth,
-; const int32_t iHeight );
-;***********************************************************************----------------
-ExpandPictureLuma_sse2:
- push ebx
- push edx
- push esi
- push edi
- push ebp ; 5 pushes (20 bytes) + return address: the first argument is at [esp+24]
-
- ; for both top and bottom border
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- mov esi, [esp+24] ; p_dst
- mov edx, [esp+28] ; stride
- mov ebx, [esp+32] ; width
- mov eax, [esp+36] ; height
- ; also prepare for cross border pData top-left: xmm3
-; xor ecx, ecx
- mov cl, byte [esi] ; top-left picture pixel
- butterfly_1to16_sse xmm3, xmm4, c ; dst, tmp, pSrc [generic register name: a/b/c/d]
- ; load top border
- mov ecx, edx ; stride
- neg ecx ; -stride
- lea edi, [esi+ecx] ; edi = p_dst - stride: top-border row adjacent to the picture
- ; load bottom border
- dec eax ; h-1
- imul eax, edx ; (h-1)*stride
- lea eax, [esi+eax] ; last line of picture pData
- sal edx, 05h ; 32*stride
- lea ebp, [eax+edx] ; last line of bottom border, (h-1)*stride + 32 * stride
- ; also prepare for cross border pData: bottom-left with xmm5, bottom-right xmm6
- dec ebx ; width-1
- lea ebx, [eax+ebx] ; dst[w-1][h-1]
-; xor edx, edx
- mov dl, byte [eax] ; bottom-left
- butterfly_1to16_sse xmm5, xmm6, d ; dst, tmp, pSrc [generic register name: a/b/c/d]
- mov dl, byte [ebx] ; bottom-right
- butterfly_1to16_sse xmm6, xmm4, d ; dst, tmp, pSrc [generic register name: a/b/c/d]
- ; for top & bottom expanding
- mov ebx, [esp+32] ; width
- exp_top_bottom_sse2 32
-
- ; for both left and right border
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- mov esi, [esp+24] ; p_dst: left border pSrc
- mov edx, [esp+28] ; stride
- mov ebx, [esp+32] ; width
- mov ecx, [esp+36] ; height
- ; load left border
- mov eax, -32 ; luma=-32, chroma=-16
- lea edi, [esi+eax] ; left border dst
- dec ebx
- lea ebx, [esi+ebx] ; right border pSrc, (p_dst + width - 1)
- lea ebp, [ebx+1] ; right border dst
- ; prepare for cross border pData: top-right with xmm4
-; xor eax, eax
- mov al, byte [ebx] ; top-right
- butterfly_1to16_sse xmm4, xmm0, a ; dst, tmp, pSrc [generic register name: a/b/c/d]
- ; for left & right border expanding
- exp_left_right_sse2 32, a
-
- ; for cross border [top-left, top-right, bottom-left, bottom-right]
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- mov esi, [esp+24] ; p_dst
- mov ecx, [esp+28] ; stride
- mov ebx, [esp+32] ; width
- mov edx, [esp+36] ; height
- ; xmm3..xmm6 (the four corner pixels) were initialised above; compute the four corner destinations and do the padding below
- mov eax, -32 ; luma=-32, chroma=-16
- neg ecx ; -stride
- lea edi, [esi+eax]
- lea edi, [edi+ecx] ; last line of top-left border
- lea ebp, [esi+ebx]
- lea ebp, [ebp+ecx] ; last line of top-right border
- add edx, 32 ; height+32(16), luma=32, chroma=16
- mov ecx, [esp+28] ; stride
- imul edx, ecx ; (height+32(16)) * stride
- lea eax, [edi+edx] ; last line of bottom-left border
- lea ebx, [ebp+edx] ; last line of bottom-right border
- neg ecx ; -stride
- ; fill the four corner blocks
- exp_cross_sse2 32, a
-
-; sfence ; commit cache write back memory
-
- pop ebp
- pop edi
- pop esi
- pop edx
- pop ebx
-
- ret
-
-ALIGN 16
-;***********************************************************************----------------
-; void ExpandPictureChromaAlign_sse2( uint8_t *pDst,
-; const int32_t iStride,
-; const int32_t iWidth,
-; const int32_t iHeight );
-;***********************************************************************----------------
-ExpandPictureChromaAlign_sse2:
- push ebx
- push edx
- push esi
- push edi
- push ebp ; 5 pushes (20 bytes) + return address: the first argument is at [esp+24]
-
- ; for both top and bottom border
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- mov esi, [esp+24] ; p_dst
- mov edx, [esp+28] ; stride
- mov ebx, [esp+32] ; width
- mov eax, [esp+36] ; height
- ; also prepare for cross border pData top-left: xmm3
-; xor ecx, ecx
- mov cl, byte [esi] ; top-left picture pixel
- butterfly_1to16_sse xmm3, xmm4, c ; dst, tmp, pSrc [generic register name: a/b/c/d]
- ; load top border
- mov ecx, edx ; stride
- neg ecx ; -stride
- lea edi, [esi+ecx] ; edi = p_dst - stride: top-border row adjacent to the picture
- ; load bottom border
- dec eax ; h-1
- imul eax, edx ; (h-1)*stride
- lea eax, [esi+eax] ; last line of picture pData
- sal edx, 04h ; 16*stride
- lea ebp, [eax+edx] ; last line of bottom border, (h-1)*stride + 16 * stride
- ; also prepare for cross border pData: bottom-left with xmm5, bottom-right xmm6
- dec ebx ; width-1
- lea ebx, [eax+ebx] ; dst[w-1][h-1]
-; xor edx, edx
- mov dl, byte [eax] ; bottom-left
- butterfly_1to16_sse xmm5, xmm6, d ; dst, tmp, pSrc [generic register name: a/b/c/d]
- mov dl, byte [ebx] ; bottom-right
- butterfly_1to16_sse xmm6, xmm4, d ; dst, tmp, pSrc [generic register name: a/b/c/d]
- ; for top & bottom expanding
- mov ebx, [esp+32] ; width
- exp_top_bottom_sse2 16
-
- ; for both left and right border
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- mov esi, [esp+24] ; p_dst: left border pSrc
- mov edx, [esp+28] ; stride
- mov ebx, [esp+32] ; width
- mov ecx, [esp+36] ; height
- ; load left border
- mov eax, -16 ; luma=-32, chroma=-16
- lea edi, [esi+eax] ; left border dst
- dec ebx
- lea ebx, [esi+ebx] ; right border pSrc, (p_dst + width - 1)
- lea ebp, [ebx+1] ; right border dst
- ; prepare for cross border pData: top-right with xmm4
-; xor eax, eax
- mov al, byte [ebx] ; top-right
- butterfly_1to16_sse xmm4, xmm0, a ; dst, tmp, pSrc [generic register name: a/b/c/d]
- ; for left & right border expanding
- exp_left_right_sse2 16, a ; "a": the right border is written with aligned stores (movdqa)
-
- ; for cross border [top-left, top-right, bottom-left, bottom-right]
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- mov esi, [esp+24] ; p_dst
- mov ecx, [esp+28] ; stride
- mov ebx, [esp+32] ; width
- mov edx, [esp+36] ; height
- ; xmm3..xmm6 (the four corner pixels) were initialised above; compute the four corner destinations and do the padding below
- mov eax, -16 ; chroma=-16
- neg ecx ; -stride
- lea edi, [esi+eax]
- lea edi, [edi+ecx] ; last line of top-left border
- lea ebp, [esi+ebx]
- lea ebp, [ebp+ecx] ; last line of top-right border
- mov ecx, [esp+28] ; stride
- add edx, 16 ; height+16, luma=32, chroma=16
- imul edx, ecx ; (height+16) * stride
- lea eax, [edi+edx] ; last line of bottom-left border
- lea ebx, [ebp+edx] ; last line of bottom-right border
- neg ecx ; -stride
- ; fill the four corner blocks
- exp_cross_sse2 16, a
-
-; sfence ; commit cache write back memory
-
- pop ebp
- pop edi
- pop esi
- pop edx
- pop ebx
-
- ret
-
-ALIGN 16
-;***********************************************************************----------------
-; void ExpandPictureChromaUnalign_sse2( uint8_t *pDst,
-; const int32_t iStride,
-; const int32_t iWidth,
-; const int32_t iHeight );
-;***********************************************************************----------------
-ExpandPictureChromaUnalign_sse2:
- push ebx
- push edx
- push esi
- push edi
- push ebp ; 5 pushes (20 bytes) + return address: the first argument is at [esp+24]
-
- ; for both top and bottom border
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- mov esi, [esp+24] ; p_dst
- mov edx, [esp+28] ; stride
- mov ebx, [esp+32] ; width
- mov eax, [esp+36] ; height
- ; also prepare for cross border pData top-left: xmm3
-; xor ecx, ecx
- mov cl, byte [esi] ; top-left picture pixel
- butterfly_1to16_sse xmm3, xmm4, c ; dst, tmp, pSrc [generic register name: a/b/c/d]
- ; load top border
- mov ecx, edx ; stride
- neg ecx ; -stride
- lea edi, [esi+ecx] ; edi = p_dst - stride: top-border row adjacent to the picture
- ; load bottom border
- dec eax ; h-1
- imul eax, edx ; (h-1)*stride
- lea eax, [esi+eax] ; last line of picture pData
- sal edx, 04h ; 16*stride
- lea ebp, [eax+edx] ; last line of bottom border, (h-1)*stride + 16 * stride
- ; also prepare for cross border pData: bottom-left with xmm5, bottom-right xmm6
- dec ebx ; width-1
- lea ebx, [eax+ebx] ; dst[w-1][h-1]
-; xor edx, edx
- mov dl, byte [eax] ; bottom-left
- butterfly_1to16_sse xmm5, xmm6, d ; dst, tmp, pSrc [generic register name: a/b/c/d]
- mov dl, byte [ebx] ; bottom-right
- butterfly_1to16_sse xmm6, xmm4, d ; dst, tmp, pSrc [generic register name: a/b/c/d]
- ; for top & bottom expanding
- mov ebx, [esp+32] ; width
- exp_top_bottom_sse2 16 ; note: the top/bottom rows are still written with aligned stores in this variant
-
- ; for both left and right border
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- mov esi, [esp+24] ; p_dst: left border pSrc
- mov edx, [esp+28] ; stride
- mov ebx, [esp+32] ; width
- mov ecx, [esp+36] ; height
- ; load left border
- mov eax, -16 ; luma=-32, chroma=-16
- lea edi, [esi+eax] ; left border dst
- dec ebx
- lea ebx, [esi+ebx] ; right border pSrc, (p_dst + width - 1)
- lea ebp, [ebx+1] ; right border dst
- ; prepare for cross border pData: top-right with xmm4
-; xor eax, eax
- mov al, byte [ebx] ; top-right
- butterfly_1to16_sse xmm4, xmm0, a ; dst, tmp, pSrc [generic register name: a/b/c/d]
- ; for left & right border expanding
- exp_left_right_sse2 16, u ; "u": the right border is written with unaligned stores (movdqu)
-
- ; for cross border [top-left, top-right, bottom-left, bottom-right]
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- mov esi, [esp+24] ; p_dst
- mov ecx, [esp+28] ; stride
- mov ebx, [esp+32] ; width
- mov edx, [esp+36] ; height
- ; xmm3..xmm6 (the four corner pixels) were initialised above; compute the four corner destinations and do the padding below
- neg ecx ; -stride
- mov eax, -16 ; chroma=-16
- lea edi, [esi+eax]
- lea edi, [edi+ecx] ; last line of top-left border
- lea ebp, [esi+ebx]
- lea ebp, [ebp+ecx] ; last line of top-right border
- mov ecx, [esp+28] ; stride
- add edx, 16 ; height+16, luma=32, chroma=16
- imul edx, ecx ; (height+16) * stride
- lea eax, [edi+edx] ; last line of bottom-left border
- lea ebx, [ebp+edx] ; last line of bottom-right border
- neg ecx ; -stride
- ; fill the four corner blocks
- exp_cross_sse2 16, u ; "u": the two right-hand corners use unaligned stores
-
-; sfence ; commit cache write back memory
-
- pop ebp
- pop edi
- pop esi
- pop edx
- pop ebx
-
- ret
-
--- a/codec/encoder/core/asm/intra_pred.asm
+++ b/codec/encoder/core/asm/intra_pred.asm
@@ -41,7 +41,6 @@
;*************************************************************************/
%include "asm_inc.asm"
-BITS 32
;***********************************************************************
; Local Data (Read Only)
;***********************************************************************
@@ -177,11 +176,11 @@
%endmacro
%macro LOAD_2_LEFT_AND_ADD 0
- lea eax, [eax+2*ecx]
- movzx edx, byte [eax-0x01]
- add ebx, edx
- movzx edx, byte [eax+ecx-0x01]
- add ebx, edx
+ lea r1, [r1+2*r2] ; advance r1 by two rows (r2 takes over the role of the old ecx = stride)
+ movzx r4, byte [r1-0x01] ; r4 = byte just before r1 on this row; r4 is clobbered by this macro
+ add r3, r4 ; r3 is the running sum kept by the caller
+ movzx r4, byte [r1+r2-0x01] ; same column, one row further down
+ add r3, r4 ; two bytes accumulated per expansion
%endmacro
;***********************************************************************
@@ -201,34 +200,36 @@
; pred must align to 16
;***********************************************************************
WelsI4x4LumaPredH_sse2:
- mov eax, [esp+8] ;pRef
- mov ecx, [esp+12] ;stride
-
- movzx edx, byte [eax-1]
- movd xmm0, edx
+ push r3 ; r3 is the scratch register below; restored by the pop before ret
+ %assign push_num 1
+ LOAD_3_PARA ; replaces the explicit stack loads removed above: r0/r1/r2 take the roles of pred/pRef/stride. NOTE(review): macro is defined outside this file -- confirm
+ %ifndef X86_32
+ movsx r2, r2d ; sign-extend the 32-bit stride to the full register (non-X86_32 builds only)
+ %endif
+ movzx r3, byte [r1-1] ; left neighbour of row 0
+ movd xmm0, r3d
pmuludq xmm0, [mmx_01bytes]
- movzx edx, byte [eax+ecx-1]
- movd xmm1, edx
+ movzx r3, byte [r1+r2-1] ; left neighbour of row 1
+ movd xmm1, r3d
pmuludq xmm1, [mmx_01bytes]
unpcklps xmm0, xmm1
- lea eax, [eax+ecx*2]
- movzx edx, byte [eax-1]
- movd xmm2, edx
+ lea r1, [r1+r2*2] ; move r1 down two rows
+ movzx r3, byte [r1-1] ; left neighbour of row 2
+ movd xmm2, r3d
pmuludq xmm2, [mmx_01bytes]
- movzx edx, byte [eax+ecx-1]
- movd xmm3, edx
+ movzx r3, byte [r1+r2-1] ; left neighbour of row 3
+ movd xmm3, r3d
pmuludq xmm3, [mmx_01bytes]
unpcklps xmm2, xmm3
unpcklpd xmm0, xmm2
- mov edx, [esp+4] ;pred
- movdqa [edx], xmm0
-
+ movdqa [r0], xmm0 ; aligned 16-byte store of all four rows to pred (r0 replaces the old edx)
+ pop r3 ; matches the push at entry
ret
;***********************************************************************
@@ -235,20 +236,27 @@
; void WelsI16x16LumaPredPlane_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
;***********************************************************************
WelsI16x16LumaPredPlane_sse2:
-%define pushsize 4
- push esi
- mov esi, [esp + pushsize + 8]
- mov ecx, [esp + pushsize + 12]
- sub esi, 1
- sub esi, ecx
+ ;%define pushsize 4
+ ;push esi
+ ;mov esi, [esp + pushsize + 8]
+ ;mov ecx, [esp + pushsize + 12]
+ push r3
+ push r4
+ %assign push_num 2
+ LOAD_3_PARA
+ %ifndef X86_32
+ movsx r2, r2d
+ %endif
+ sub r1, 1
+ sub r1, r2
;for H
pxor xmm7, xmm7
- movq xmm0, [esi]
+ movq xmm0, [r1]
movdqa xmm5, [sse2_plane_dec]
punpcklbw xmm0, xmm7
pmullw xmm0, xmm5
- movq xmm1, [esi + 9]
+ movq xmm1, [r1 + 9]
movdqa xmm6, [sse2_plane_inc]
punpcklbw xmm1, xmm7
pmullw xmm1, xmm6
@@ -255,25 +263,25 @@
psubw xmm1, xmm0
SUMW_HORIZON xmm1,xmm0,xmm2
- movd eax, xmm1 ; H += (i + 1) * (top[8 + i] - top[6 - i]);
- movsx eax, ax
- imul eax, 5
- add eax, 32
- sar eax, 6 ; b = (5 * H + 32) >> 6;
- SSE2_Copy8Times xmm1, eax ; xmm1 = b,b,b,b,b,b,b,b
+ movd r3d, xmm1 ; H += (i + 1) * (top[8 + i] - top[6 - i]);
+ movsx r3, r3w
+ imul r3, 5
+ add r3, 32
+ sar r3, 6 ; b = (5 * H + 32) >> 6;
+ SSE2_Copy8Times xmm1, r3d ; xmm1 = b,b,b,b,b,b,b,b
- movzx edx, BYTE [esi+16]
- sub esi, 3
- LOAD_COLUMN xmm0, xmm2, xmm3, xmm4, esi, ecx
+ movzx r4, BYTE [r1+16]
+ sub r1, 3
+ LOAD_COLUMN xmm0, xmm2, xmm3, xmm4, r1, r2
- add esi, 3
- movzx eax, BYTE [esi+8*ecx]
- add edx, eax
- shl edx, 4 ; a = (left[15*stride] + top[15]) << 4;
+ add r1, 3
+ movzx r3, BYTE [r1+8*r2]
+ add r4, r3
+ shl r4, 4 ; a = (left[15*stride] + top[15]) << 4;
- sub esi, 3
- add esi, ecx
- LOAD_COLUMN xmm7, xmm2, xmm3, xmm4, esi, ecx
+ sub r1, 3
+ add r1, r2
+ LOAD_COLUMN xmm7, xmm2, xmm3, xmm4, r1, r2
pxor xmm4, xmm4
punpckhbw xmm0, xmm4
pmullw xmm0, xmm5
@@ -282,21 +290,20 @@
psubw xmm7, xmm0
SUMW_HORIZON xmm7,xmm0,xmm2
- movd eax, xmm7 ; V
- movsx eax, ax
+ movd r3d, xmm7 ; V
+ movsx r3, r3w
+ imul r3, 5
+ add r3, 32
+ sar r3, 6 ; c = (5 * V + 32) >> 6;
+ SSE2_Copy8Times xmm4, r3d ; xmm4 = c,c,c,c,c,c,c,c
- imul eax, 5
- add eax, 32
- sar eax, 6 ; c = (5 * V + 32) >> 6;
- SSE2_Copy8Times xmm4, eax ; xmm4 = c,c,c,c,c,c,c,c
+ ;mov esi, [esp + pushsize + 4]
+ add r4, 16
+ imul r3, -7
+ add r3, r4 ; s = a + 16 + (-7)*c
+ SSE2_Copy8Times xmm0, r3d ; xmm0 = s,s,s,s,s,s,s,s
- mov esi, [esp + pushsize + 4]
- add edx, 16
- imul eax, -7
- add edx, eax ; s = a + 16 + (-7)*c
- SSE2_Copy8Times xmm0, edx ; xmm0 = s,s,s,s,s,s,s,s
-
- xor eax, eax
+ xor r3, r3
movdqa xmm5, [sse2_plane_inc_minus]
get_i16x16_luma_pred_plane_sse2_1:
@@ -309,51 +316,56 @@
paddw xmm3, xmm0
psraw xmm3, 5
packuswb xmm2, xmm3
- movdqa [esi], xmm2
+ movdqa [r0], xmm2
paddw xmm0, xmm4
- add esi, 16
- inc eax
- cmp eax, 16
+ add r0, 16
+ inc r3
+ cmp r3, 16
jnz get_i16x16_luma_pred_plane_sse2_1
-
- pop esi
+ pop r4
+ pop r3
ret
-
-
;***********************************************************************
; void WelsI16x16LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
;***********************************************************************
-%macro SSE2_PRED_H_16X16_TWO_LINE 1
- lea eax, [eax+ecx*2]
-
- COPY_16_TIMES eax, xmm0
- movdqa [edx+%1], xmm0
- COPY_16_TIMESS eax, xmm0, ecx
- ; COPY_16_TIMES eax + ecx, xmm0
- movdqa [edx+%1+0x10], xmm0
+%macro SSE2_PRED_H_16X16_ONE_LINE 0
+ add r0, 16 ; next 16-byte output row
+ add r1, r2 ; next source row (r2 takes the role of the old ecx = stride)
+ movzx r3, byte [r1] ; the caller has already decremented r1, so this is the byte left of the row
+ SSE2_Copy16Times xmm0, r3d ; NOTE(review): macro defined outside this file; presumably fills xmm0 with 16 copies of the byte -- confirm
+ movdqa [r0], xmm0 ; aligned 16-byte store of the row
%endmacro
WELS_EXTERN WelsI16x16LumaPredH_sse2
WelsI16x16LumaPredH_sse2:
- mov edx, [esp+4] ; pred
- mov eax, [esp+8] ; pRef
- mov ecx, [esp+12] ; stride
-
- COPY_16_TIMES eax, xmm0
- movdqa [edx], xmm0
- COPY_16_TIMESS eax, xmm0, ecx
- movdqa [edx+0x10], xmm0
-
- SSE2_PRED_H_16X16_TWO_LINE 0x20
- SSE2_PRED_H_16X16_TWO_LINE 0x40
- SSE2_PRED_H_16X16_TWO_LINE 0x60
- SSE2_PRED_H_16X16_TWO_LINE 0x80
- SSE2_PRED_H_16X16_TWO_LINE 0xa0
- SSE2_PRED_H_16X16_TWO_LINE 0xc0
- SSE2_PRED_H_16X16_TWO_LINE 0xe0
-
+ push r3 ; r3 is the scratch register below; restored by the pop before ret
+ %assign push_num 1
+ LOAD_3_PARA ; replaces the explicit stack loads removed above: r0/r1/r2 take the roles of pred/pRef/stride. NOTE(review): macro is defined outside this file -- confirm
+ %ifndef X86_32
+ movsx r2, r2d ; sign-extend the 32-bit stride to the full register (non-X86_32 builds only)
+ %endif
+ dec r1 ; r1 -> pRef[-1], the byte left of row 0
+ movzx r3, byte [r1]
+ SSE2_Copy16Times xmm0, r3d
+ movdqa [r0], xmm0 ; row 0
+ SSE2_PRED_H_16X16_ONE_LINE ; 15 expansions follow, one per remaining row (rows 1..15)
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ SSE2_PRED_H_16X16_ONE_LINE
+ pop r3 ; matches the push at entry
ret
;***********************************************************************
@@ -361,30 +373,34 @@
;***********************************************************************
WELS_EXTERN WelsI16x16LumaPredV_sse2
WelsI16x16LumaPredV_sse2:
- mov edx, [esp+4] ; pred
- mov eax, [esp+8] ; pRef
- mov ecx, [esp+12] ; stride
+ ;mov edx, [esp+4] ; pred
+ ;mov eax, [esp+8] ; pRef
+ ;mov ecx, [esp+12] ; stride
+ %assign push_num 0
+ LOAD_3_PARA ; replaces the explicit stack loads commented out above: r0/r1/r2 take the roles of pred/pRef/stride. NOTE(review): macro is defined outside this file -- confirm
+ %ifndef X86_32
+ movsx r2, r2d ; sign-extend the 32-bit stride to the full register (non-X86_32 builds only)
+ %endif
+ sub r1, r2 ; r1 -> the row directly above pRef
+ movdqa xmm0, [r1] ; aligned 16-byte load of that row
- sub eax, ecx
- movdqa xmm0, [eax]
+ movdqa [r0], xmm0 ; the same 16 bytes are written to each of the 16 output rows, 16 bytes apart
+ movdqa [r0+10h], xmm0
+ movdqa [r0+20h], xmm0
+ movdqa [r0+30h], xmm0
+ movdqa [r0+40h], xmm0
+ movdqa [r0+50h], xmm0
+ movdqa [r0+60h], xmm0
+ movdqa [r0+70h], xmm0
+ movdqa [r0+80h], xmm0
+ movdqa [r0+90h], xmm0
+ movdqa [r0+160], xmm0 ; offsets switch from hex to decimal here: 160 = 0a0h, still stepping by 16
+ movdqa [r0+176], xmm0
+ movdqa [r0+192], xmm0
+ movdqa [r0+208], xmm0
+ movdqa [r0+224], xmm0
+ movdqa [r0+240], xmm0 ; last row (offset 0f0h)
- movdqa [edx], xmm0
- movdqa [edx+10h], xmm0
- movdqa [edx+20h], xmm0
- movdqa [edx+30h], xmm0
- movdqa [edx+40h], xmm0
- movdqa [edx+50h], xmm0
- movdqa [edx+60h], xmm0
- movdqa [edx+70h], xmm0
- movdqa [edx+80h], xmm0
- movdqa [edx+90h], xmm0
- movdqa [edx+160], xmm0
- movdqa [edx+176], xmm0
- movdqa [edx+192], xmm0
- movdqa [edx+208], xmm0
- movdqa [edx+224], xmm0
- movdqa [edx+240], xmm0
-
ret
;***********************************************************************
@@ -392,19 +408,26 @@
;***********************************************************************
WELS_EXTERN WelsIChromaPredPlane_sse2
WelsIChromaPredPlane_sse2:
-%define pushsize 4
- push esi
- mov esi, [esp + pushsize + 8] ;pRef
- mov ecx, [esp + pushsize + 12] ;stride
- sub esi, 1
- sub esi, ecx
+ ;%define pushsize 4
+ ;push esi
+ ;mov esi, [esp + pushsize + 8] ;pRef
+ ;mov ecx, [esp + pushsize + 12] ;stride
+ push r3
+ push r4
+ %assign push_num 2
+ LOAD_3_PARA
+ %ifndef X86_32
+ movsx r2, r2d
+ %endif
+ sub r1, 1
+ sub r1, r2
pxor mm7, mm7
- movq mm0, [esi]
+ movq mm0, [r1]
movq mm5, [sse2_plane_dec_c]
punpcklbw mm0, mm7
pmullw mm0, mm5
- movq mm1, [esi + 5]
+ movq mm1, [r1 + 5]
movq mm6, [sse2_plane_inc_c]
punpcklbw mm1, mm7
pmullw mm1, mm6
@@ -413,25 +436,25 @@
movq2dq xmm1, mm1
pxor xmm2, xmm2
SUMW_HORIZON xmm1,xmm0,xmm2
- movd eax, xmm1
- movsx eax, ax
- imul eax, 17
- add eax, 16
- sar eax, 5 ; b = (17 * H + 16) >> 5;
- SSE2_Copy8Times xmm1, eax ; mm1 = b,b,b,b,b,b,b,b
+ movd r3d, xmm1
+ movsx r3, r3w
+ imul r3, 17
+ add r3, 16
+ sar r3, 5 ; b = (17 * H + 16) >> 5;
+ SSE2_Copy8Times xmm1, r3d ; xmm1 = b,b,b,b,b,b,b,b
- movzx edx, BYTE [esi+8]
- sub esi, 3
- LOAD_COLUMN_C mm0, mm2, mm3, mm4, esi, ecx
+ movzx r3, BYTE [r1+8]
+ sub r1, 3
+ LOAD_COLUMN_C mm0, mm2, mm3, mm4, r1, r2
- add esi, 3
- movzx eax, BYTE [esi+4*ecx]
- add edx, eax
- shl edx, 4 ; a = (left[7*stride] + top[7]) << 4;
+ add r1, 3
+ movzx r4, BYTE [r1+4*r2]
+ add r4, r3
+ shl r4, 4 ; a = (left[7*stride] + top[7]) << 4;
- sub esi, 3
- add esi, ecx
- LOAD_COLUMN_C mm7, mm2, mm3, mm4, esi, ecx
+ sub r1, 3
+ add r1, r2
+ LOAD_COLUMN_C mm7, mm2, mm3, mm4, r1, r2
pxor mm4, mm4
punpckhbw mm0, mm4
pmullw mm0, mm5
@@ -442,21 +465,20 @@
movq2dq xmm7, mm7
pxor xmm2, xmm2
SUMW_HORIZON xmm7,xmm0,xmm2
- movd eax, xmm7 ; V
- movsx eax, ax
+ movd r3d, xmm7 ; V
+ movsx r3, r3w
+ imul r3, 17
+ add r3, 16
+ sar r3, 5 ; c = (17 * V + 16) >> 5;
+ SSE2_Copy8Times xmm4, r3d ; xmm4 = c,c,c,c,c,c,c,c
- imul eax, 17
- add eax, 16
- sar eax, 5 ; c = (17 * V + 16) >> 5;
- SSE2_Copy8Times xmm4, eax ; mm4 = c,c,c,c,c,c,c,c
+ ;mov esi, [esp + pushsize + 4]
+ add r4, 16
+ imul r3, -3
+ add r3, r4 ; s = a + 16 + (-3)*c
+ SSE2_Copy8Times xmm0, r3d ; xmm0 = s,s,s,s,s,s,s,s
- mov esi, [esp + pushsize + 4]
- add edx, 16
- imul eax, -3
- add edx, eax ; s = a + 16 + (-3)*c
- SSE2_Copy8Times xmm0, edx ; xmm0 = s,s,s,s,s,s,s,s
-
- xor eax, eax
+ xor r3, r3
movdqa xmm5, [sse2_plane_mul_b_c]
get_i_chroma_pred_plane_sse2_1:
@@ -465,14 +487,14 @@
paddw xmm2, xmm0
psraw xmm2, 5
packuswb xmm2, xmm2
- movq [esi], xmm2
+ movq [r0], xmm2
paddw xmm0, xmm4
- add esi, 8
- inc eax
- cmp eax, 8
+ add r0, 8
+ inc r3
+ cmp r3, 8
jnz get_i_chroma_pred_plane_sse2_1
-
- pop esi
+ pop r4
+ pop r3
WELSEMMS
ret
@@ -490,27 +512,31 @@
;
;***********************************************************************
WelsI4x4LumaPredDDR_mmx:
- mov edx,[esp+4] ;pred
- mov eax,[esp+8] ;pRef
- mov ecx,[esp+12] ;stride
-
- movq mm1,[eax+ecx-8] ;get value of 11,decreasing 8 is trying to improve the performance of movq mm1[8] = 11
- movq mm2,[eax-8] ;get value of 6 mm2[8] = 6
- sub eax, ecx ;mov eax to above line of current block(postion of 1)
- punpckhbw mm2,[eax-8] ;mm2[8](high 8th byte of mm2) = [0](value of 0), mm2[7]= [6]
- movd mm3,[eax] ;get value 1, mm3[1] = [1],mm3[2]=[2],mm3[3]=[3]
+ ;mov edx,[esp+4] ;pred
+ ;mov eax,[esp+8] ;pRef
+ ;mov ecx,[esp+12] ;stride
+ %assign push_num 0
+ LOAD_3_PARA
+ %ifndef X86_32
+ movsx r2, r2d
+ %endif
+ movq mm1,[r1+r2-8] ;get value of 11,decreasing 8 is trying to improve the performance of movq mm1[8] = 11
+ movq mm2,[r1-8] ;get value of 6 mm2[8] = 6
+ sub r1, r2 ;move r1 to the line above the current block (position of 1)
+ punpckhbw mm2,[r1-8] ;mm2[8](high 8th byte of mm2) = [0](value of 0), mm2[7]= [6]
+ movd mm3,[r1] ;get value 1, mm3[1] = [1],mm3[2]=[2],mm3[3]=[3]
punpckhwd mm1,mm2 ;mm1[8]=[0],mm1[7]=[6],mm1[6]=[11]
psllq mm3,18h ;mm3[5]=[1]
psrlq mm1,28h ;mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
por mm3,mm1 ;mm3[6]=[3],mm3[5]=[2],mm3[4]=[1],mm3[3]=[0],mm3[2]=[6],mm3[1]=[11]
movq mm1,mm3 ;mm1[6]=[3],mm1[5]=[2],mm1[4]=[1],mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
- lea eax,[eax+ecx*2-8h] ;set eax point to 12
- movq mm4,[eax+ecx] ;get value of 16, mm4[8]=[16]
+ lea r1,[r1+r2*2-8h] ;set r1 to point to 12
+ movq mm4,[r1+r2] ;get value of 16, mm4[8]=[16]
psllq mm3,8 ;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=0
psrlq mm4,38h ;mm4[1]=[16]
por mm3,mm4 ;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=[16]
movq mm2,mm3 ;mm2[7]=[3],mm2[6]=[2],mm2[5]=[1],mm2[4]=[0],mm2[3]=[6],mm2[2]=[11],mm2[1]=[16]
- movq mm4,[eax+ecx*2] ;mm4[8]=[21]
+ movq mm4,[r1+r2*2] ;mm4[8]=[21]
psllq mm3,8 ;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=0
psrlq mm4,38h ;mm4[1]=[21]
por mm3,mm4 ;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=[21]
@@ -521,13 +547,13 @@
psubusb mm3,mm1 ;decrease 1 from odd bytes
pavgb mm2,mm3 ;mm2=(([11]+[21]+1)/2+1+[16])/2
- movd [edx+12],mm2
+ movd [r0+12],mm2
psrlq mm2,8
- movd [edx+8],mm2
+ movd [r0+8],mm2
psrlq mm2,8
- movd [edx+4],mm2
+ movd [r0+4],mm2
psrlq mm2,8
- movd [edx],mm2
+ movd [r0],mm2
WELSEMMS
ret
@@ -545,39 +571,39 @@
;
;***********************************************************************
WelsI4x4LumaPredDc_sse2:
- mov eax,[esp+8] ;pRef
- mov ecx,[esp+12] ;stride
- push ebx
-
- movzx edx, byte [eax-1h]
-
- sub eax, ecx
- movd xmm0, [eax]
+ push r3 ; r3 (running sum) and r4 (scratch) are restored by the pops before ret
+ push r4
+ %assign push_num 2
+ LOAD_3_PARA ; replaces the explicit stack loads removed above: r0/r1/r2 take the roles of pred/pRef/stride. NOTE(review): macro is defined outside this file -- confirm
+ %ifndef X86_32
+ movsx r2, r2d ; sign-extend the 32-bit stride to the full register (non-X86_32 builds only)
+ %endif
+ movzx r4, byte [r1-1h] ; left neighbour of row 0
+ sub r1, r2 ; r1 -> the row above the block
+ movd xmm0, [r1] ; the four top neighbours
pxor xmm1, xmm1
psadbw xmm0, xmm1
+ xor r3, r3 ; clear the full register before the 32-bit movd below
+ movd r3d, xmm0 ; r3 = sum of the four top bytes (psadbw against zero)
+ add r3, r4 ; + left neighbour of row 0
+ movzx r4, byte [r1+r2*2-1h] ; left neighbour of row 1 (r1 is still one row above the block)
+ add r3, r4
- movd ebx, xmm0
- add ebx, edx
+ lea r1, [r1+r2*2-1] ; r1 -> left neighbour of row 1
+ movzx r4, byte [r1+r2] ; left neighbour of row 2
+ add r3, r4
- movzx edx, byte [eax+ecx*2-1h]
- add ebx, edx
+ movzx r4, byte [r1+r2*2] ; left neighbour of row 3
+ add r3, r4
+ add r3, 4 ; rounding term for the divide by 8
+ sar r3, 3 ; r3 = (sum of 8 neighbours + 4) >> 3
+ imul r3, 0x01010101 ; replicate the result byte into all four bytes of the low dword
- lea eax, [eax+ecx*2-1]
- movzx edx, byte [eax+ecx]
- add ebx, edx
-
- movzx edx, byte [eax+ecx*2]
- add ebx, edx
- add ebx, 4
- sar ebx, 3
- imul ebx, 0x01010101
-
- mov edx, [esp+8] ;pred
- movd xmm0, ebx
+ movd xmm0, r3d
pshufd xmm0, xmm0, 0
- movdqa [edx], xmm0
-
- pop ebx
+ movdqa [r0], xmm0 ; aligned 16-byte store of the filled 4x4 block to pred
+ pop r4 ; pops mirror the pushes at entry, in reverse order
+ pop r3
ret
ALIGN 16
@@ -596,7 +622,7 @@
%endmacro
%macro MMX_PRED_H_8X8_ONE_LINEE 4
- movq %1, [%3+ecx-8]
+ movq %1, [%3+r2-8]
psrlq %1, 38h
;pmuludq %1, [mmx_01bytes] ;extend to 4 bytes
@@ -607,34 +633,38 @@
WELS_EXTERN WelsIChromaPredH_mmx
WelsIChromaPredH_mmx:
- mov edx, [esp+4] ;pred
- mov eax, [esp+8] ;pRef
- mov ecx, [esp+12] ;stride
-
- movq mm0, [eax-8]
+ ;mov edx, [esp+4] ;pred
+ ;mov eax, [esp+8] ;pRef
+ ;mov ecx, [esp+12] ;stride
+ %assign push_num 0
+ LOAD_3_PARA ; replaces the explicit stack loads commented out above: r0/r1/r2 take the roles of pred/pRef/stride. NOTE(review): macro is defined outside this file -- confirm
+ %ifndef X86_32
+ movsx r2, r2d ; sign-extend the 32-bit stride to the full register (non-X86_32 builds only)
+ %endif
+ movq mm0, [r1-8] ; load the 8 bytes ending at pRef[-1]; the left neighbour is the top byte
psrlq mm0, 38h
;pmuludq mm0, [mmx_01bytes] ;extend to 4 bytes
pmullw mm0, [mmx_01bytes]
pshufw mm0, mm0, 0
- movq [edx], mm0
+ movq [r0], mm0 ; row 0 of the 8-byte-wide output
- MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, eax,edx+8
+ MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r1,r0+8 ; the ...LINEE variant loads from [r1+r2-8], i.e. the row below r1; presumably stores to the 4th argument -- confirm against the macro body
- lea eax,[eax+ecx*2]
- MMX_PRED_H_8X8_ONE_LINE mm0, mm1, eax,edx+16
+ lea r1,[r1+r2*2] ; move r1 down two rows
+ MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r1,r0+16
- MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, eax,edx+24
+ MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r1,r0+24
- lea eax,[eax+ecx*2]
- MMX_PRED_H_8X8_ONE_LINE mm0, mm1, eax,edx+32
+ lea r1,[r1+r2*2] ; move r1 down two rows
+ MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r1,r0+32
- MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, eax,edx+40
+ MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r1,r0+40
- lea eax,[eax+ecx*2]
- MMX_PRED_H_8X8_ONE_LINE mm0, mm1, eax,edx+48
+ lea r1,[r1+r2*2] ; move r1 down two rows
+ MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r1,r0+48
- MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, eax,edx+56
+ MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r1,r0+56 ; last of the eight 8-byte rows
WELSEMMS
ret
@@ -645,14 +675,15 @@
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredV_sse2
WelsI4x4LumaPredV_sse2:
- mov edx, [esp+4] ;pred
- mov eax, [esp+8] ;pRef
- mov ecx, [esp+12] ;stride
-
- sub eax, ecx
- movd xmm0, [eax]
+ %assign push_num 0
+ LOAD_3_PARA ; replaces the explicit stack loads removed above: r0/r1/r2 take the roles of pred/pRef/stride. NOTE(review): macro is defined outside this file -- confirm
+ %ifndef X86_32
+ movsx r2, r2d ; sign-extend the 32-bit stride to the full register (non-X86_32 builds only)
+ %endif
+ sub r1, r2 ; r1 -> the row directly above pRef
+ movd xmm0, [r1] ; load the four top neighbours
pshufd xmm0, xmm0, 0
- movdqa [edx], xmm0
+ movdqa [r0], xmm0 ; aligned 16-byte store: the 4-byte top row repeated four times
ret
ALIGN 16
@@ -662,22 +693,21 @@
;***********************************************************************
WELS_EXTERN WelsIChromaPredV_sse2
WelsIChromaPredV_sse2:
- mov edx, [esp+4] ;pred
- mov eax, [esp+8] ;pRef
- mov ecx, [esp+12] ;stride
-
- sub eax, ecx
- movq xmm0, [eax]
+ %assign push_num 0
+ LOAD_3_PARA ; replaces the explicit stack loads removed above: r0/r1/r2 take the roles of pred/pRef/stride. NOTE(review): macro is defined outside this file -- confirm
+ %ifndef X86_32
+ movsx r2, r2d ; sign-extend the 32-bit stride to the full register (non-X86_32 builds only)
+ %endif
+ sub r1, r2 ; r1 -> the row directly above pRef
+ movq xmm0, [r1] ; load the eight top neighbours into the low half of xmm0
movdqa xmm1, xmm0
punpcklqdq xmm0, xmm1
-
- movdqa [edx], xmm0
- movdqa [edx+16], xmm0
- movdqa [edx+32], xmm0
- movdqa [edx+48], xmm0
+ movdqa [r0], xmm0 ; each 16-byte store writes two 8-byte rows; four stores cover all 8 rows
+ movdqa [r0+16], xmm0
+ movdqa [r0+32], xmm0
+ movdqa [r0+48], xmm0
ret
-
ALIGN 16
;***********************************************************************
; lt|t0|t1|t2|t3|
@@ -710,18 +740,20 @@
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredHD_mmx
WelsI4x4LumaPredHD_mmx:
- mov edx, [esp+4] ; pred
- mov eax, [esp+8] ; pRef
- mov ecx, [esp+12] ; stride
- sub eax, ecx
- movd mm0, [eax-1] ; mm0 = [xx xx xx xx t2 t1 t0 lt]
+ %assign push_num 0
+ LOAD_3_PARA
+ %ifndef X86_32
+ movsx r2, r2d
+ %endif
+ sub r1, r2
+ movd mm0, [r1-1] ; mm0 = [xx xx xx xx t2 t1 t0 lt]
psllq mm0, 20h ; mm0 = [t2 t1 t0 lt xx xx xx xx]
- movd mm1, [eax+2*ecx-4]
- punpcklbw mm1, [eax+ecx-4] ; mm1[7] = l0, mm1[6] = l1
- lea eax, [eax+2*ecx]
- movd mm2, [eax+2*ecx-4]
- punpcklbw mm2, [eax+ecx-4] ; mm2[7] = l2, mm2[6] = l3
+ movd mm1, [r1+2*r2-4]
+ punpcklbw mm1, [r1+r2-4] ; mm1[7] = l0, mm1[6] = l1
+ lea r1, [r1+2*r2]
+ movd mm2, [r1+2*r2-4]
+ punpcklbw mm2, [r1+r2-4] ; mm2[7] = l2, mm2[6] = l3
punpckhwd mm2, mm1 ; mm2 = [l0 l1 l2 l3 xx xx xx xx]
psrlq mm2, 20h
pxor mm0, mm2 ; mm0 = [t2 t1 t0 lt l0 l1 l2 l3]
@@ -751,17 +783,15 @@
pxor mm2, mm4 ; mm2 = [d c b a xx xx xx xx]
psrlq mm2, 20h ; mm2 = [xx xx xx xx d c b a]
- movd [edx], mm2
- movd [edx+12], mm3
+ movd [r0], mm2
+ movd [r0+12], mm3
psrlq mm3, 10h
- movd [edx+8], mm3
+ movd [r0+8], mm3
psrlq mm3, 10h
- movd [edx+4], mm3
+ movd [r0+4], mm3
WELSEMMS
ret
-
-
ALIGN 16
;***********************************************************************
; lt|t0|t1|t2|t3|
@@ -791,15 +821,16 @@
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredHU_mmx
WelsI4x4LumaPredHU_mmx:
- mov edx, [esp+4] ; pred
- mov eax, [esp+8] ; pRef
- mov ecx, [esp+12] ; stride
-
- movd mm0, [eax-4] ; mm0[3] = l0
- punpcklbw mm0, [eax+ecx-4] ; mm0[7] = l1, mm0[6] = l0
- lea eax, [eax+2*ecx]
- movd mm2, [eax-4] ; mm2[3] = l2
- movd mm4, [eax+ecx-4] ; mm4[3] = l3
+ %assign push_num 0
+ LOAD_3_PARA
+ %ifndef X86_32
+ movsx r2, r2d
+ %endif
+ movd mm0, [r1-4] ; mm0[3] = l0
+ punpcklbw mm0, [r1+r2-4] ; mm0[7] = l1, mm0[6] = l0
+ lea r1, [r1+2*r2]
+ movd mm2, [r1-4] ; mm2[3] = l2
+ movd mm4, [r1+r2-4] ; mm4[3] = l3
punpcklbw mm2, mm4
punpckhwd mm0, mm2 ; mm0 = [l3 l2 l1 l0 xx xx xx xx]
@@ -832,13 +863,13 @@
punpckhbw mm4, mm4 ; mm4 = [g g g g xx xx xx xx]
psrlq mm4, 20h
- movd [edx+12], mm4
+ movd [r0+12], mm4
- movd [edx], mm1
+ movd [r0], mm1
psrlq mm1, 10h
- movd [edx+4], mm1
+ movd [r0+4], mm1
psrlq mm1, 10h
- movd [edx+8], mm1
+ movd [r0+8], mm1
WELSEMMS
ret
@@ -875,17 +906,19 @@
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredVR_mmx
WelsI4x4LumaPredVR_mmx:
- mov edx, [esp+4] ; pred
- mov eax, [esp+8] ; pRef
- mov ecx, [esp+12] ; stride
- sub eax, ecx
- movq mm0, [eax-1] ; mm0 = [xx xx xx t3 t2 t1 t0 lt]
+ %assign push_num 0
+ LOAD_3_PARA
+ %ifndef X86_32
+ movsx r2, r2d
+ %endif
+ sub r1, r2
+ movq mm0, [r1-1] ; mm0 = [xx xx xx t3 t2 t1 t0 lt]
psllq mm0, 18h ; mm0 = [t3 t2 t1 t0 lt xx xx xx]
- movd mm1, [eax+2*ecx-4]
- punpcklbw mm1, [eax+ecx-4] ; mm1[7] = l0, mm1[6] = l1
- lea eax, [eax+2*ecx]
- movq mm2, [eax+ecx-8] ; mm2[7] = l2
+ movd mm1, [r1+2*r2-4]
+ punpcklbw mm1, [r1+r2-4] ; mm1[7] = l0, mm1[6] = l1
+ lea r1, [r1+2*r2]
+ movq mm2, [r1+r2-8] ; mm2[7] = l2
punpckhwd mm2, mm1 ; mm2 = [l0 l1 l2 xx xx xx xx xx]
psrlq mm2, 28h
pxor mm0, mm2 ; mm0 = [t3 t2 t1 t0 lt l0 l1 l2]
@@ -909,10 +942,10 @@
movq mm2, mm3
psrlq mm1, 20h ; mm1 = [xx xx xx xx d c b a]
- movd [edx], mm1
+ movd [r0], mm1
psrlq mm2, 20h ; mm2 = [xx xx xx xx h g f e]
- movd [edx+4], mm2
+ movd [r0+4], mm2
movq mm4, mm3
psllq mm4, 20h
@@ -924,11 +957,11 @@
psllq mm1, 8h
pxor mm4, mm1 ; mm4 = [xx xx xx xx c b a i]
- movd [edx+8], mm4
+ movd [r0+8], mm4
psllq mm2, 8h
pxor mm5, mm2 ; mm5 = [xx xx xx xx g f e j]
- movd [edx+12], mm5
+ movd [r0+12], mm5
WELSEMMS
ret
@@ -961,11 +994,13 @@
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredDDL_mmx
WelsI4x4LumaPredDDL_mmx:
- mov edx, [esp+4] ; pred
- mov eax, [esp+8] ; pRef
- mov ecx, [esp+12] ; stride
- sub eax, ecx
- movq mm0, [eax] ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
+ %assign push_num 0
+ LOAD_3_PARA
+ %ifndef X86_32
+ movsx r2, r2d
+ %endif
+ sub r1, r2
+ movq mm0, [r1] ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
movq mm1, mm0
movq mm2, mm0
@@ -986,13 +1021,13 @@
pavgb mm0, mm1 ; mm0 = [g f e d c b a xx]
psrlq mm0, 8h
- movd [edx], mm0
+ movd [r0], mm0
psrlq mm0, 8h
- movd [edx+4], mm0
+ movd [r0+4], mm0
psrlq mm0, 8h
- movd [edx+8], mm0
+ movd [r0+8], mm0
psrlq mm0, 8h
- movd [edx+12], mm0
+ movd [r0+12], mm0
WELSEMMS
ret
@@ -1029,12 +1064,13 @@
;***********************************************************************
WELS_EXTERN WelsI4x4LumaPredVL_mmx
WelsI4x4LumaPredVL_mmx:
- mov edx, [esp+4] ; pred
- mov eax, [esp+8] ; pRef
- mov ecx, [esp+12] ; stride
-
- sub eax, ecx
- movq mm0, [eax] ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
+ %assign push_num 0
+ LOAD_3_PARA
+ %ifndef X86_32
+ movsx r2, r2d
+ %endif
+ sub r1, r2
+ movq mm0, [r1] ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
movq mm1, mm0
movq mm2, mm0
@@ -1052,13 +1088,13 @@
pavgb mm2, mm1 ; mm2 = [xx xx xx j h g f e]
- movd [edx], mm3
+ movd [r0], mm3
psrlq mm3, 8h
- movd [edx+8], mm3
+ movd [r0+8], mm3
- movd [edx+4], mm2
+ movd [r0+4], mm2
psrlq mm2, 8h
- movd [edx+12], mm2
+ movd [r0+12], mm2
WELSEMMS
ret
@@ -1069,41 +1105,38 @@
;***********************************************************************
WELS_EXTERN WelsIChromaPredDc_sse2
WelsIChromaPredDc_sse2:
- push ebx
- mov eax, [esp+12] ; pRef
- mov ecx, [esp+16] ; stride
+ push r3
+ push r4
+ %assign push_num 2
+ LOAD_3_PARA
+ %ifndef X86_32
+ movsx r2, r2d
+ %endif
+ sub r1, r2
+ movq mm0, [r1]
- sub eax, ecx
- movq mm0, [eax]
+ movzx r3, byte [r1+r2-0x01] ; l1
+ lea r1, [r1+2*r2]
+ movzx r4, byte [r1-0x01] ; l2
+ add r3, r4
+ movzx r4, byte [r1+r2-0x01] ; l3
+ add r3, r4
+ lea r1, [r1+2*r2]
+ movzx r4, byte [r1-0x01] ; l4
+ add r3, r4
+ movd mm1, r3d ; mm1 = l1+l2+l3+l4
- ;xor ebx, ebx
- ;movzx edx, byte [eax+ecx-0x01] ; l1
- movzx ebx, byte [eax+ecx-0x01] ; l1
- ;mov ebx, edx
- lea eax, [eax+2*ecx]
- movzx edx, byte [eax-0x01] ; l2
- add ebx, edx
- movzx edx, byte [eax+ecx-0x01] ; l3
- add ebx, edx
- lea eax, [eax+2*ecx]
- movzx edx, byte [eax-0x01] ; l4
- add ebx, edx
- movd mm1, ebx ; mm1 = l1+l2+l3+l4
+ movzx r3, byte [r1+r2-0x01] ; l5
+ lea r1, [r1+2*r2]
+ movzx r4, byte [r1-0x01] ; l6
+ add r3, r4
+ movzx r4, byte [r1+r2-0x01] ; l7
+ add r3, r4
+ lea r1, [r1+2*r2]
+ movzx r4, byte [r1-0x01] ; l8
+ add r3, r4
+ movd mm2, r3d ; mm2 = l5+l6+l7+l8
- ;xor ebx, ebx
- ;movzx edx, byte [eax+ecx-0x01] ; l5
- movzx ebx, byte [eax+ecx-0x01] ; l5
- ;mov ebx, edx
- lea eax, [eax+2*ecx]
- movzx edx, byte [eax-0x01] ; l6
- add ebx, edx
- movzx edx, byte [eax+ecx-0x01] ; l7
- add ebx, edx
- lea eax, [eax+2*ecx]
- movzx edx, byte [eax-0x01] ; l8
- add ebx, edx
- movd mm2, ebx ; mm2 = l5+l6+l7+l8
-
movq mm3, mm0
psrlq mm0, 0x20
psllq mm3, 0x20
@@ -1142,19 +1175,18 @@
psllq mm1, 0x20
pxor mm1, mm2 ; mm2 = m_down
- mov edx, [esp+8] ; pRef
+ movq [r0], mm0
+ movq [r0+0x08], mm0
+ movq [r0+0x10], mm0
+ movq [r0+0x18], mm0
- movq [edx], mm0
- movq [edx+0x08], mm0
- movq [edx+0x10], mm0
- movq [edx+0x18], mm0
+ movq [r0+0x20], mm1
+ movq [r0+0x28], mm1
+ movq [r0+0x30], mm1
+ movq [r0+0x38], mm1
- movq [edx+0x20], mm1
- movq [edx+0x28], mm1
- movq [edx+0x30], mm1
- movq [edx+0x38], mm1
-
- pop ebx
+ pop r4
+ pop r3
WELSEMMS
ret
@@ -1167,12 +1199,15 @@
;***********************************************************************
WELS_EXTERN WelsI16x16LumaPredDc_sse2
WelsI16x16LumaPredDc_sse2:
- push ebx
- mov eax, [esp+12] ; pRef
- mov ecx, [esp+16] ; stride
-
- sub eax, ecx
- movdqa xmm0, [eax] ; read one row
+ push r3
+ push r4
+ %assign push_num 2
+ LOAD_3_PARA
+ %ifndef X86_32
+ movsx r2, r2d
+ %endif
+ sub r1, r2
+ movdqa xmm0, [r1] ; read one row
pxor xmm1, xmm1
psadbw xmm0, xmm1
movdqa xmm1, xmm0
@@ -1181,13 +1216,10 @@
psrldq xmm0, 0x08
paddw xmm0, xmm1
- ;xor ebx, ebx
- ;movzx edx, byte [eax+ecx-0x01]
- movzx ebx, byte [eax+ecx-0x01]
- ;mov ebx, edx
- movzx edx, byte [eax+2*ecx-0x01]
- add ebx, edx
- lea eax, [eax+ecx]
+ movzx r3, byte [r1+r2-0x01]
+ movzx r4, byte [r1+2*r2-0x01]
+ add r3, r4
+ lea r1, [r1+r2]
LOAD_2_LEFT_AND_ADD
LOAD_2_LEFT_AND_ADD
LOAD_2_LEFT_AND_ADD
@@ -1195,33 +1227,32 @@
LOAD_2_LEFT_AND_ADD
LOAD_2_LEFT_AND_ADD
LOAD_2_LEFT_AND_ADD
- add ebx, 0x10
- movd xmm1, ebx
+ add r3, 0x10
+ movd xmm1, r3d
paddw xmm0, xmm1
psrld xmm0, 0x05
pmuludq xmm0, [mmx_01bytes]
pshufd xmm0, xmm0, 0
- mov edx, [esp+8] ; pred
- movdqa [edx], xmm0
- movdqa [edx+0x10], xmm0
- movdqa [edx+0x20], xmm0
- movdqa [edx+0x30], xmm0
- movdqa [edx+0x40], xmm0
- movdqa [edx+0x50], xmm0
- movdqa [edx+0x60], xmm0
- movdqa [edx+0x70], xmm0
- movdqa [edx+0x80], xmm0
- movdqa [edx+0x90], xmm0
- movdqa [edx+0xa0], xmm0
- movdqa [edx+0xb0], xmm0
- movdqa [edx+0xc0], xmm0
- movdqa [edx+0xd0], xmm0
- movdqa [edx+0xe0], xmm0
- movdqa [edx+0xf0], xmm0
+ movdqa [r0], xmm0
+ movdqa [r0+0x10], xmm0
+ movdqa [r0+0x20], xmm0
+ movdqa [r0+0x30], xmm0
+ movdqa [r0+0x40], xmm0
+ movdqa [r0+0x50], xmm0
+ movdqa [r0+0x60], xmm0
+ movdqa [r0+0x70], xmm0
+ movdqa [r0+0x80], xmm0
+ movdqa [r0+0x90], xmm0
+ movdqa [r0+0xa0], xmm0
+ movdqa [r0+0xb0], xmm0
+ movdqa [r0+0xc0], xmm0
+ movdqa [r0+0xd0], xmm0
+ movdqa [r0+0xe0], xmm0
+ movdqa [r0+0xf0], xmm0
- pop ebx
-
+ pop r4
+ pop r3
ret
;***********************************************************************
@@ -1230,6 +1261,7 @@
; uint8_t* pRed, int32_t* pBestMode, int32_t, int32_t, int32_t);
;
;***********************************************************************
+%ifdef X86_32
WELS_EXTERN WelsSmpleSatdThree4x4_sse2
align 16
WelsSmpleSatdThree4x4_sse2:
@@ -1469,5 +1501,5 @@
pop esi
pop ebx
ret
-
+%endif
--- a/codec/encoder/core/asm/intra_pred_util.asm
+++ /dev/null
@@ -1,156 +1,0 @@
-;*!
-;* \copy
-;* Copyright (c) 2009-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* intra_pred_util.asm
-;*
-;* Abstract
-;* mmxext/sse for WelsFillingPred8to16, WelsFillingPred8x2to16 and
-;* WelsFillingPred1to16 etc.
-;*
-;* History
-;* 09/29/2009 Created
-;*
-;*
-;*************************************************************************/
-
-%include "asm_inc.asm"
-
-BITS 32
-
-;***********************************************************************
-; Macros and other preprocessor constants
-;***********************************************************************
-
-;***********************************************************************
-; Local Data (Read Only)
-;***********************************************************************
-
-;SECTION .rodata pData align=16
-
-;***********************************************************************
-; Various memory constants (trigonometric values or rounding values)
-;***********************************************************************
-
-;***********************************************************************
-; Code
-;***********************************************************************
-
-
-SECTION .text
-
-WELS_EXTERN WelsFillingPred8to16_mmx
-WELS_EXTERN WelsFillingPred8x2to16_mmx
-WELS_EXTERN WelsFillingPred1to16_mmx
-WELS_EXTERN WelsFillingPred8x2to16_sse2
-WELS_EXTERN WelsFillingPred1to16_sse2
-
-
-ALIGN 16
-;***********************************************************************----------------
-; void WelsFillingPred8to16_mmx( uint8_t *pred, uint8_t *v );
-;***********************************************************************----------------
-WelsFillingPred8to16_mmx:
- mov eax, [esp+4] ; pred
- mov ecx, [esp+8] ; v
-
- movq mm0, [ecx]
- movq [eax ], mm0
- movq [eax+8], mm0
-
- WELSEMMS
- ret
-
-ALIGN 16
-;***********************************************************************----------------
-; void WelsFillingPred8x2to16_mmx( uint8_t *pred, uint8_t *v );
-;***********************************************************************----------------
-WelsFillingPred8x2to16_mmx:
- mov eax, [esp+4] ; pred
- mov ecx, [esp+8] ; v
-
- movq mm0, [ecx ]
- movq mm1, [ecx+8]
- movq [eax ], mm0
- movq [eax+8], mm1
-
- WELSEMMS
-
- ret
-
-%macro butterfly_1to8_mmx 3 ; mm? for dst, mm? for tmp, one byte for pSrc [generic register name: a/b/c/d]
- mov %3h, %3l
- movd %2, e%3x ; i.e, 1% = eax (=b0)
- pshufw %1, %2, 00h ; b0 b0 b0 b0, b0 b0 b0 b0
-%endmacro
-
-ALIGN 16
-;***********************************************************************----------------
-; void WelsFillingPred1to16_mmx( uint8_t *pred, const uint8_t v );
-;***********************************************************************----------------
-WelsFillingPred1to16_mmx:
- mov eax, [esp+4] ; pred
-
- mov cl, byte [esp+8] ; v
- butterfly_1to8_mmx mm0, mm1, c ; mm? for dst, mm? for tmp, one byte for pSrc [generic register name: a/b/c/d]
-
- movq [eax ], mm0
- movq [eax+8], mm0
-
- WELSEMMS
-
- ret
-
-ALIGN 16
-;***********************************************************************----------------
-; void WelsFillingPred8x2to16_sse2( uint8_t *pred, uint8_t *v );
-;***********************************************************************----------------
-WelsFillingPred8x2to16_sse2:
- mov eax, [esp+4] ; pred
- mov ecx, [esp+8] ; v
-
- movdqa xmm0, [ecx]
- movdqa [eax], xmm0
-
- ret
-
-ALIGN 16
-;***********************************************************************----------------
-; void WelsFillingPred1to16_sse2( uint8_t *pred, const uint8_t v );
-;***********************************************************************----------------
-WelsFillingPred1to16_sse2:
- mov eax, [esp+4] ; pred
-
- mov cl, byte [esp+8] ; v
- butterfly_1to16_sse xmm0, xmm1, c ; dst, tmp, pSrc [generic register name: a/b/c/d]
-
- movdqa [eax], xmm0
-
- ret
--- a/codec/encoder/core/asm/mb_copy.asm
+++ /dev/null
@@ -1,687 +1,0 @@
-;*!
-;* \copy
-;* Copyright (c) 2009-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* mb_copy.asm
-;*
-;* Abstract
-;* mb_copy
-;*
-;*
-;*********************************************************************************************/
-%include "asm_inc.asm"
-BITS 32
-
-;***********************************************************************
-; Macros and other preprocessor constants
-;***********************************************************************
-
-;***********************************************************************
-; Code
-;***********************************************************************
-
-SECTION .text
-
-WELS_EXTERN WelsCopy16x16_sse2
-WELS_EXTERN WelsCopy16x16NotAligned_sse2
-WELS_EXTERN WelsCopy8x8_mmx
-WELS_EXTERN WelsCopy16x8NotAligned_sse2 ;
-WELS_EXTERN WelsCopy8x16_mmx ;
-WELS_EXTERN UpdateMbMv_sse2 ;
-
-;***********************************************************************
-; void WelsCopy16x16_sse2( uint8_t* Dst,
-; int32_t iStrideD,
-; uint8_t* Src,
-; int32_t iStrideS )
-;***********************************************************************
-ALIGN 16
-WelsCopy16x16_sse2:
- push esi
- push edi
- push ebx
-
- mov edi, [esp+16] ; Dst
- mov eax, [esp+20] ; iStrideD
- mov esi, [esp+24] ; Src
- mov ecx, [esp+28] ; iStrideS
-
- lea ebx, [eax+2*eax] ; x3
- lea edx, [ecx+2*ecx] ; x3
-
- movdqa xmm0, [esi]
- movdqa xmm1, [esi+ecx]
- movdqa xmm2, [esi+2*ecx]
- movdqa xmm3, [esi+edx]
- lea esi, [esi+4*ecx]
- movdqa xmm4, [esi]
- movdqa xmm5, [esi+ecx]
- movdqa xmm6, [esi+2*ecx]
- movdqa xmm7, [esi+edx]
- lea esi, [esi+4*ecx]
-
- movdqa [edi], xmm0
- movdqa [edi+eax], xmm1
- movdqa [edi+2*eax], xmm2
- movdqa [edi+ebx], xmm3
- lea edi, [edi+4*eax]
- movdqa [edi], xmm4
- movdqa [edi+eax], xmm5
- movdqa [edi+2*eax], xmm6
- movdqa [edi+ebx], xmm7
- lea edi, [edi+4*eax]
-
- movdqa xmm0, [esi]
- movdqa xmm1, [esi+ecx]
- movdqa xmm2, [esi+2*ecx]
- movdqa xmm3, [esi+edx]
- lea esi, [esi+4*ecx]
- movdqa xmm4, [esi]
- movdqa xmm5, [esi+ecx]
- movdqa xmm6, [esi+2*ecx]
- movdqa xmm7, [esi+edx]
-
- movdqa [edi], xmm0
- movdqa [edi+eax], xmm1
- movdqa [edi+2*eax], xmm2
- movdqa [edi+ebx], xmm3
- lea edi, [edi+4*eax]
- movdqa [edi], xmm4
- movdqa [edi+eax], xmm5
- movdqa [edi+2*eax], xmm6
- movdqa [edi+ebx], xmm7
-
- pop ebx
- pop edi
- pop esi
- ret
-
-;***********************************************************************
-; void WelsCopy16x16NotAligned_sse2( uint8_t* Dst,
-; int32_t iStrideD,
-; uint8_t* Src,
-; int32_t iStrideS )
-;***********************************************************************
-ALIGN 16
-; dst can be align with 16 bytes, but not sure about pSrc, 12/29/2011
-WelsCopy16x16NotAligned_sse2:
- push esi
- push edi
- push ebx
-
- mov edi, [esp+16] ; Dst
- mov eax, [esp+20] ; iStrideD
- mov esi, [esp+24] ; Src
- mov ecx, [esp+28] ; iStrideS
-
- lea ebx, [eax+2*eax] ; x3
- lea edx, [ecx+2*ecx] ; x3
-
- movdqu xmm0, [esi]
- movdqu xmm1, [esi+ecx]
- movdqu xmm2, [esi+2*ecx]
- movdqu xmm3, [esi+edx]
- lea esi, [esi+4*ecx]
- movdqu xmm4, [esi]
- movdqu xmm5, [esi+ecx]
- movdqu xmm6, [esi+2*ecx]
- movdqu xmm7, [esi+edx]
- lea esi, [esi+4*ecx]
-
- movdqa [edi], xmm0
- movdqa [edi+eax], xmm1
- movdqa [edi+2*eax], xmm2
- movdqa [edi+ebx], xmm3
- lea edi, [edi+4*eax]
- movdqa [edi], xmm4
- movdqa [edi+eax], xmm5
- movdqa [edi+2*eax], xmm6
- movdqa [edi+ebx], xmm7
- lea edi, [edi+4*eax]
-
- movdqu xmm0, [esi]
- movdqu xmm1, [esi+ecx]
- movdqu xmm2, [esi+2*ecx]
- movdqu xmm3, [esi+edx]
- lea esi, [esi+4*ecx]
- movdqu xmm4, [esi]
- movdqu xmm5, [esi+ecx]
- movdqu xmm6, [esi+2*ecx]
- movdqu xmm7, [esi+edx]
-
- movdqa [edi], xmm0
- movdqa [edi+eax], xmm1
- movdqa [edi+2*eax], xmm2
- movdqa [edi+ebx], xmm3
- lea edi, [edi+4*eax]
- movdqa [edi], xmm4
- movdqa [edi+eax], xmm5
- movdqa [edi+2*eax], xmm6
- movdqa [edi+ebx], xmm7
-
- pop ebx
- pop edi
- pop esi
- ret
-
-; , 12/29/2011
-;***********************************************************************
-; void WelsCopy16x8NotAligned_sse2(uint8_t* Dst,
-; int32_t iStrideD,
-; uint8_t* Src,
-; int32_t iStrideS )
-;***********************************************************************
-ALIGN 16
-WelsCopy16x8NotAligned_sse2:
- push esi
- push edi
- push ebx
-
- mov edi, [esp+16] ; Dst
- mov eax, [esp+20] ; iStrideD
- mov esi, [esp+24] ; Src
- mov ecx, [esp+28] ; iStrideS
-
- lea ebx, [eax+2*eax] ; x3
- lea edx, [ecx+2*ecx] ; x3
-
- movdqu xmm0, [esi]
- movdqu xmm1, [esi+ecx]
- movdqu xmm2, [esi+2*ecx]
- movdqu xmm3, [esi+edx]
- lea esi, [esi+4*ecx]
- movdqu xmm4, [esi]
- movdqu xmm5, [esi+ecx]
- movdqu xmm6, [esi+2*ecx]
- movdqu xmm7, [esi+edx]
-
- movdqa [edi], xmm0
- movdqa [edi+eax], xmm1
- movdqa [edi+2*eax], xmm2
- movdqa [edi+ebx], xmm3
- lea edi, [edi+4*eax]
- movdqa [edi], xmm4
- movdqa [edi+eax], xmm5
- movdqa [edi+2*eax], xmm6
- movdqa [edi+ebx], xmm7
-
- pop ebx
- pop edi
- pop esi
- ret
-
-
-;***********************************************************************
-; void WelsCopy8x16_mmx(uint8_t* Dst,
-; int32_t iStrideD,
-; uint8_t* Src,
-; int32_t iStrideS )
-;***********************************************************************
-ALIGN 16
-WelsCopy8x16_mmx:
- push ebx
-
- mov eax, [esp + 8 ] ;Dst
- mov ecx, [esp + 12] ;iStrideD
- mov ebx, [esp + 16] ;Src
- mov edx, [esp + 20] ;iStrideS
-
- movq mm0, [ebx]
- movq mm1, [ebx+edx]
- lea ebx, [ebx+2*edx]
- movq mm2, [ebx]
- movq mm3, [ebx+edx]
- lea ebx, [ebx+2*edx]
- movq mm4, [ebx]
- movq mm5, [ebx+edx]
- lea ebx, [ebx+2*edx]
- movq mm6, [ebx]
- movq mm7, [ebx+edx]
- lea ebx, [ebx+2*edx]
-
- movq [eax], mm0
- movq [eax+ecx], mm1
- lea eax, [eax+2*ecx]
- movq [eax], mm2
- movq [eax+ecx], mm3
- lea eax, [eax+2*ecx]
- movq [eax], mm4
- movq [eax+ecx], mm5
- lea eax, [eax+2*ecx]
- movq [eax], mm6
- movq [eax+ecx], mm7
- lea eax, [eax+2*ecx]
-
- movq mm0, [ebx]
- movq mm1, [ebx+edx]
- lea ebx, [ebx+2*edx]
- movq mm2, [ebx]
- movq mm3, [ebx+edx]
- lea ebx, [ebx+2*edx]
- movq mm4, [ebx]
- movq mm5, [ebx+edx]
- lea ebx, [ebx+2*edx]
- movq mm6, [ebx]
- movq mm7, [ebx+edx]
-
- movq [eax], mm0
- movq [eax+ecx], mm1
- lea eax, [eax+2*ecx]
- movq [eax], mm2
- movq [eax+ecx], mm3
- lea eax, [eax+2*ecx]
- movq [eax], mm4
- movq [eax+ecx], mm5
- lea eax, [eax+2*ecx]
- movq [eax], mm6
- movq [eax+ecx], mm7
-
- WELSEMMS
- pop ebx
- ret
-
-;***********************************************************************
-; void WelsCopy8x8_mmx( uint8_t* Dst,
-; int32_t iStrideD,
-; uint8_t* Src,
-; int32_t iStrideS )
-;***********************************************************************
-ALIGN 16
-WelsCopy8x8_mmx:
- push ebx
- push esi
- mov eax, [esp + 12] ;Dst
- mov ecx, [esp + 16] ;iStrideD
- mov esi, [esp + 20] ;Src
- mov ebx, [esp + 24] ;iStrideS
- lea edx, [ebx+2*ebx]
-
- ; to prefetch next loop
- prefetchnta [esi+2*ebx]
- prefetchnta [esi+edx]
- movq mm0, [esi]
- movq mm1, [esi+ebx]
- lea esi, [esi+2*ebx]
- ; to prefetch next loop
- prefetchnta [esi+2*ebx]
- prefetchnta [esi+edx]
- movq mm2, [esi]
- movq mm3, [esi+ebx]
- lea esi, [esi+2*ebx]
- ; to prefetch next loop
- prefetchnta [esi+2*ebx]
- prefetchnta [esi+edx]
- movq mm4, [esi]
- movq mm5, [esi+ebx]
- lea esi, [esi+2*ebx]
- movq mm6, [esi]
- movq mm7, [esi+ebx]
-
- movq [eax], mm0
- movq [eax+ecx], mm1
- lea eax, [eax+2*ecx]
- movq [eax], mm2
- movq [eax+ecx], mm3
- lea eax, [eax+2*ecx]
- movq [eax], mm4
- movq [eax+ecx], mm5
- lea eax, [eax+2*ecx]
- movq [eax], mm6
- movq [eax+ecx], mm7
-
- WELSEMMS
- pop esi
- pop ebx
- ret
-
-; (dunhuang@cisco), 12/21/2011
-;***********************************************************************
-; void UpdateMbMv_sse2( SMVUnitXY *pMvBuffer, const SMVUnitXY sMv )
-;***********************************************************************
-ALIGN 16
-UpdateMbMv_sse2:
- mov eax, [esp+4] ; mv_buffer
- movd xmm0, [esp+8] ; _mv
- pshufd xmm1, xmm0, $0
- movdqa [eax ], xmm1
- movdqa [eax+0x10], xmm1
- movdqa [eax+0x20], xmm1
- movdqa [eax+0x30], xmm1
- ret
-
-
-
-;***********************************************************************
-; Macros and other preprocessor constants
-;***********************************************************************
-
-;***********************************************************************
-; Local Data (Read Only)
-;***********************************************************************
-
-;SECTION .rodata pData align=16
-
-;***********************************************************************
-; Various memory constants (trigonometric values or rounding values)
-;***********************************************************************
-;read unaligned memory
-%macro SSE2_READ_UNA 2
- movq %1, [%2]
- movhps %1, [%2+8]
-%endmacro
-
-;write unaligned memory
-%macro SSE2_WRITE_UNA 2
- movq [%1], %2
- movhps [%1+8], %2
-%endmacro
-
-ALIGN 16
-
-;***********************************************************************
-; Code
-;***********************************************************************
-
-SECTION .text
-
-WELS_EXTERN PixelAvgWidthEq8_mmx
-WELS_EXTERN PixelAvgWidthEq16_sse2
-
-WELS_EXTERN McCopyWidthEq4_mmx
-WELS_EXTERN McCopyWidthEq8_mmx
-WELS_EXTERN McCopyWidthEq16_sse2
-
-
-ALIGN 16
-;***********************************************************************
-; void PixelAvgWidthEq8_mmx( uint8_t *dst, int32_t iDstStride,
-; uint8_t *pSrc1, int32_t iSrc1Stride,
-; uint8_t *pSrc2, int32_t iSrc2Stride,
-; int32_t iHeight );
-;***********************************************************************
-PixelAvgWidthEq8_mmx:
- push ebp
- push ebx
- push esi
- push edi
-
- mov edi, [esp+20]
- mov esi, [esp+28]
- mov edx, [esp+36]
- mov ebp, [esp+24]
- mov eax, [esp+32]
- mov ebx, [esp+40]
- mov ecx, [esp+44]
- sar ecx, 2
-.height_loop:
- movq mm0, [esi]
- pavgb mm0, [edx]
- movq [edi], mm0
- movq mm1, [esi+eax]
- pavgb mm1, [edx+ebx]
- movq [edi+ebp], mm1
- lea edi, [edi+2*ebp]
- lea esi, [esi+2*eax]
- lea edx, [edx+2*ebx]
-
- movq mm2, [esi]
- pavgb mm2, [edx]
- movq [edi], mm2
- movq mm3, [esi+eax]
- pavgb mm3, [edx+ebx]
- movq [edi+ebp], mm3
- lea edi, [edi+2*ebp]
- lea esi, [esi+2*eax]
- lea edx, [edx+2*ebx]
-
- dec ecx
- jne .height_loop
-
- WELSEMMS
- pop edi
- pop esi
- pop ebx
- pop ebp
- ret
-
-
-ALIGN 16
-;***********************************************************************
-; void PixelAvgWidthEq16_sse2( uint8_t *dst, int32_t iDstStride,
-; uint8_t *pSrc1, int32_t iSrc1Stride,
-; uint8_t *pSrc2, int32_t iSrc2Stride,
-; int32_t iHeight );
-;***********************************************************************
-PixelAvgWidthEq16_sse2:
- push ebp
- push ebx
- push esi
- push edi
-
- mov edi, [esp+20]
- mov esi, [esp+28]
- mov edx, [esp+36]
- mov ebp, [esp+24]
- mov eax, [esp+32]
- mov ebx, [esp+40]
- mov ecx, [esp+44]
- sar ecx, 2
-.height_loop:
- movdqu xmm0, [esi]
- movdqu xmm1, [edx]
- movdqu xmm2, [esi+eax]
- movdqu xmm3, [edx+ebx]
- pavgb xmm0, xmm1
- pavgb xmm2, xmm3
- movdqu [edi], xmm0
- movdqu [edi+ebp], xmm2
- lea edi, [edi+2*ebp]
- lea esi, [esi+2*eax]
- lea edx, [edx+2*ebx]
-
- movdqu xmm4, [esi]
- movdqu xmm5, [edx]
- movdqu xmm6, [esi+eax]
- movdqu xmm7, [edx+ebx]
- pavgb xmm4, xmm5
- pavgb xmm6, xmm7
- movdqu [edi], xmm4
- movdqu [edi+ebp], xmm6
- lea edi, [edi+2*ebp]
- lea esi, [esi+2*eax]
- lea edx, [edx+2*ebx]
-
- dec ecx
- jne .height_loop
-
- pop edi
- pop esi
- pop ebx
- pop ebp
- ret
-
-
-ALIGN 64
-avg_w16_align_0_ssse3:
- movdqa xmm1, [ebx]
- movdqu xmm2, [ecx]
- pavgb xmm1, xmm2
- movdqa [edi], xmm1
- add ebx, eax
- add ecx, ebp
- add edi, esi
- dec dword [esp+4]
- jg avg_w16_align_0_ssse3
- ret
-
- ALIGN 64
-avg_w16_align_1_ssse3:
- movdqa xmm1, [ebx+16]
- movdqu xmm2, [ecx]
- palignr xmm1, [ebx], 1
- pavgb xmm1, xmm2
- movdqa [edi], xmm1
- add ebx, eax
- add ecx, ebp
- add edi, esi
- dec dword [esp+4]
- jg avg_w16_align_1_ssse3
- ret
-
-
-ALIGN 16
-;***********************************************************************
-; void PixelAvgWidthEq16_ssse3(uint8_t *pDst, int32_t iDstStride,
-; uint8_t *pSrc1, int32_t iSrc1Stride,
-; uint8_t *pSrc2, int32_t iSrc2Stride,
-; int32_t iHeight );
-;***********************************************************************
-WELS_EXTERN PixelAvgWidthEq16_ssse3
-PixelAvgWidthEq16_ssse3:
- push ebp
- push ebx
- push esi
- push edi
-
- mov edi, [esp+20] ; dst
- mov ebx, [esp+28] ; src1
- mov ecx, [esp+36] ; src2
- mov esi, [esp+24] ; i_dst_stride
-
- %define avg_w16_offset (avg_w16_align_1_ssse3-avg_w16_align_0_ssse3)
- mov edx, ebx
- and edx, 0x01
- lea eax, [avg_w16_align_0_ssse3]
- lea ebp, [avg_w16_offset]
- imul ebp, edx
- lea edx, [ebp+eax]
-
- mov eax, [esp+32]
- mov ebp, [esp+44]
- push ebp
- mov ebp, [esp+44]
- and ebx, 0xfffffff0
- call edx
- pop ebp
- pop edi
- pop esi
- pop ebx
- pop ebp
- ret
-
-
-ALIGN 16
-;*******************************************************************************
-; void McCopyWidthEq4_mmx( uint8_t *pSrc, int32_t iSrcStride,
-; uint8_t *pDst, int32_t iDstStride, int32_t iHeight )
-;*******************************************************************************
-McCopyWidthEq4_mmx:
- push esi
- push edi
- push ebx
-
-
- mov esi, [esp+16]
- mov eax, [esp+20]
- mov edi, [esp+24]
- mov ecx, [esp+28]
- mov edx, [esp+32]
-ALIGN 4
-.height_loop:
- mov ebx, [esi]
- mov [edi], ebx
-
- add esi, eax
- add edi, ecx
- dec edx
- jnz .height_loop
- WELSEMMS
- pop ebx
- pop edi
- pop esi
- ret
-
-ALIGN 16
-;*******************************************************************************
-; void McCopyWidthEq8_mmx( uint8_t *pSrc, int32_t iSrcStride,
-; uint8_t *pDst, int32_t iDstStride, int32_t iHeight )
-;*******************************************************************************
-McCopyWidthEq8_mmx:
- push esi
- push edi
- mov esi, [esp+12]
- mov eax, [esp+16]
- mov edi, [esp+20]
- mov ecx, [esp+24]
- mov edx, [esp+28]
-
-ALIGN 4
-.height_loop:
- movq mm0, [esi]
- movq [edi], mm0
- add esi, eax
- add edi, ecx
- dec edx
- jnz .height_loop
-
- WELSEMMS
- pop edi
- pop esi
- ret
-
-ALIGN 16
-;***********************************************************************
-; void McCopyWidthEq16_sse2( uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride, int32_t iHeight )
-;***********************************************************************
-McCopyWidthEq16_sse2:
- push esi
- push edi
-
- mov esi, [esp+12]
- mov eax, [esp+16]
- mov edi, [esp+20]
- mov edx, [esp+24]
- mov ecx, [esp+28]
-
-ALIGN 4
-.height_loop:
- SSE2_READ_UNA xmm0, esi
- SSE2_READ_UNA xmm1, esi+eax
- SSE2_WRITE_UNA edi, xmm0
- SSE2_WRITE_UNA edi+edx, xmm1
-
- sub ecx, 2
- lea esi, [esi+eax*2]
- lea edi, [edi+edx*2]
- jnz .height_loop
-
- pop edi
- pop esi
- ret
--- a/codec/encoder/core/asm/mc_chroma.asm
+++ /dev/null
@@ -1,317 +1,0 @@
-;*!
-;* \copy
-;* Copyright (c) 2004-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* mc_chroma.asm
-;*
-;* Abstract
-;* mmx motion compensation for chroma
-;*
-;* History
-;* 10/13/2004 Created
-;*
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-
-BITS 32
-
-;***********************************************************************
-; Local Data (Read Only)
-;***********************************************************************
-
-SECTION .rodata align=16
-
-;***********************************************************************
-; Various memory constants (trigonometric values or rounding values)
-;***********************************************************************
-
-ALIGN 16
-h264_d0x20_sse2:
- dw 32,32,32,32,32,32,32,32
-ALIGN 16
-h264_d0x20_mmx:
- dw 32,32,32,32
-
-
-;=============================================================================
-; Code
-;=============================================================================
-
-SECTION .text
-
-ALIGN 16
-;*******************************************************************************
-; void McChromaWidthEq4_mmx( uint8_t *src,
-; int32_t iSrcStride,
-; uint8_t *pDst,
-; int32_t iDstStride,
-; uint8_t *pABCD,
-; int32_t iHeigh );
-;*******************************************************************************
-WELS_EXTERN McChromaWidthEq4_mmx
-McChromaWidthEq4_mmx:
- push esi
- push edi
- push ebx
-
- mov eax, [esp +12 + 20]
- movd mm3, [eax]
- WELS_Zero mm7
- punpcklbw mm3, mm3
- movq mm4, mm3
- punpcklwd mm3, mm3
- punpckhwd mm4, mm4
-
- movq mm5, mm3
- punpcklbw mm3, mm7
- punpckhbw mm5, mm7
-
- movq mm6, mm4
- punpcklbw mm4, mm7
- punpckhbw mm6, mm7
-
- mov esi, [esp +12+ 4]
- mov eax, [esp + 12 + 8]
- mov edi, [esp + 12 + 12]
- mov edx, [esp + 12 + 16]
- mov ecx, [esp + 12 + 24]
-
- lea ebx, [esi + eax]
- movd mm0, [esi]
- movd mm1, [esi+1]
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
-.xloop:
-
- pmullw mm0, mm3
- pmullw mm1, mm5
- paddw mm0, mm1
-
- movd mm1, [ebx]
- punpcklbw mm1, mm7
- movq mm2, mm1
- pmullw mm1, mm4
- paddw mm0, mm1
-
- movd mm1, [ebx+1]
- punpcklbw mm1, mm7
- movq mm7, mm1
- pmullw mm1,mm6
- paddw mm0, mm1
- movq mm1,mm7
-
- paddw mm0, [h264_d0x20_mmx]
- psrlw mm0, 6
-
- WELS_Zero mm7
- packuswb mm0, mm7
- movd [edi], mm0
-
- movq mm0, mm2
-
- lea edi, [edi +edx ]
- lea ebx, [ebx + eax]
-
- dec ecx
- jnz near .xloop
- WELSEMMS
- pop ebx
- pop edi
- pop esi
- ret
-
-
-ALIGN 16
-;*******************************************************************************
-; void McChromaWidthEq8_sse2( uint8_t *pSrc,
-; int32_t iSrcStride,
-; uint8_t *pDst,
-; int32_t iDstStride,
-; uint8_t *pABCD,
-; int32_t iheigh );
-;*******************************************************************************
-WELS_EXTERN McChromaWidthEq8_sse2
-McChromaWidthEq8_sse2:
- push esi
- push edi
- push ebx
-
- mov eax, [esp +12 + 20]
- movd xmm3, [eax]
- WELS_Zero xmm7
- punpcklbw xmm3, xmm3
- punpcklwd xmm3, xmm3
-
- movdqa xmm4, xmm3
- punpckldq xmm3, xmm3
- punpckhdq xmm4, xmm4
- movdqa xmm5, xmm3
- movdqa xmm6, xmm4
-
- punpcklbw xmm3, xmm7
- punpckhbw xmm5, xmm7
- punpcklbw xmm4, xmm7
- punpckhbw xmm6, xmm7
-
- mov esi, [esp +12+ 4]
- mov eax, [esp + 12 + 8]
- mov edi, [esp + 12 + 12]
- mov edx, [esp + 12 + 16]
- mov ecx, [esp + 12 + 24]
-
- lea ebx, [esi + eax]
- movq xmm0, [esi]
- movq xmm1, [esi+1]
- punpcklbw xmm0, xmm7
- punpcklbw xmm1, xmm7
-.xloop:
-
- pmullw xmm0, xmm3
- pmullw xmm1, xmm5
- paddw xmm0, xmm1
-
- movq xmm1, [ebx]
- punpcklbw xmm1, xmm7
- movdqa xmm2, xmm1
- pmullw xmm1, xmm4
- paddw xmm0, xmm1
-
- movq xmm1, [ebx+1]
- punpcklbw xmm1, xmm7
- movdqa xmm7, xmm1
- pmullw xmm1, xmm6
- paddw xmm0, xmm1
- movdqa xmm1,xmm7
-
- paddw xmm0, [h264_d0x20_sse2]
- psrlw xmm0, 6
-
- WELS_Zero xmm7
- packuswb xmm0, xmm7
- movq [edi], xmm0
-
- movdqa xmm0, xmm2
-
- lea edi, [edi +edx ]
- lea ebx, [ebx + eax]
-
- dec ecx
- jnz near .xloop
-
- pop ebx
- pop edi
- pop esi
- ret
-
-
-
-
-ALIGN 16
-;***********************************************************************
-; void McChromaWidthEq8_ssse3( uint8_t *pSrc,
-; int32_t iSrcStride,
-; uint8_t *pDst,
-; int32_t iDstStride,
-; uint8_t *pABCD,
-; int32_t iHeigh);
-;***********************************************************************
-WELS_EXTERN McChromaWidthEq8_ssse3
-McChromaWidthEq8_ssse3:
- push ebx
- push esi
- push edi
-
- mov eax, [esp + 12 + 20]
-
- pxor xmm7, xmm7
- movd xmm5, [eax]
- punpcklwd xmm5, xmm5
- punpckldq xmm5, xmm5
- movdqa xmm6, xmm5
- punpcklqdq xmm5, xmm5
- punpckhqdq xmm6, xmm6
-
- mov eax, [esp + 12 + 4]
- mov edx, [esp + 12 + 8]
- mov esi, [esp + 12 + 12]
- mov edi, [esp + 12 + 16]
- mov ecx, [esp + 12 + 24]
-
- sub esi, edi
- sub esi, edi
- movdqa xmm7, [h264_d0x20_sse2]
-
- movdqu xmm0, [eax]
- movdqa xmm1, xmm0
- psrldq xmm1, 1
- punpcklbw xmm0, xmm1
-
-.hloop_chroma:
- lea esi, [esi+2*edi]
-
- movdqu xmm2, [eax+edx]
- movdqa xmm3, xmm2
- psrldq xmm3, 1
- punpcklbw xmm2, xmm3
- movdqa xmm4, xmm2
-
- pmaddubsw xmm0, xmm5
- pmaddubsw xmm2, xmm6
- paddw xmm0, xmm2
- paddw xmm0, xmm7
- psrlw xmm0, 6
- packuswb xmm0, xmm0
- movq [esi],xmm0
-
- lea eax, [eax+2*edx]
- movdqu xmm2, [eax]
- movdqa xmm3, xmm2
- psrldq xmm3, 1
- punpcklbw xmm2, xmm3
- movdqa xmm0, xmm2
-
- pmaddubsw xmm4, xmm5
- pmaddubsw xmm2, xmm6
- paddw xmm4, xmm2
- paddw xmm4, xmm7
- psrlw xmm4, 6
- packuswb xmm4, xmm4
- movq [esi+edi],xmm4
-
- sub ecx, 2
- jnz .hloop_chroma
- pop edi
- pop esi
- pop ebx
-
- ret
-
-
--- a/codec/encoder/core/asm/mc_luma.asm
+++ /dev/null
@@ -1,1052 +1,0 @@
-;*!
-;* \copy
-;* Copyright (c) 2009-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* mc_luma.asm
-;*
-;* Abstract
-;* sse2 motion compensation
-;*
-;* History
-;* 17/08/2009 Created
-;*
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-
-BITS 32
-
-;***********************************************************************
-; Macros and other preprocessor constants
-;***********************************************************************
-%macro SSE_LOAD_8P 3
- movq %1, %3
- punpcklbw %1, %2
-%endmacro
-
-%macro FILTER_HV_W8 9
- paddw %1, %6
- movdqa %8, %3
- movdqa %7, %2
- paddw %1, [h264_w0x10_1]
- paddw %8, %4
- paddw %7, %5
- psllw %8, 2
- psubw %8, %7
- paddw %1, %8
- psllw %8, 2
- paddw %1, %8
- psraw %1, 5
- WELS_Zero %8
- packuswb %1, %8
- movq %9, %1
-%endmacro
-;***********************************************************************
-; Local Data (Read Only)
-;***********************************************************************
-
-SECTION .rodata align=16
-
-;***********************************************************************
-; Various memory constants (trigonometric values or rounding values)
-;***********************************************************************
-
-
-;***********************************************************************
-; Code
-;***********************************************************************
-
-SECTION .text
-
-WELS_EXTERN McHorVer22Width8HorFirst_sse2
-WELS_EXTERN McHorVer02WidthEq8_sse2
-WELS_EXTERN McHorVer20WidthEq16_sse2
-
-ALIGN 16
-;***********************************************************************
-; void McHorVer20WidthEq16_sse2( uint8_t *pSrc,
-; int32_t iSrcStride,
-; uint8_t *pDst,
-; int32_t iDstStride,
-; int32_t iHeight,
-; );
-;***********************************************************************
-McHorVer20WidthEq16_sse2:
- push esi
- push edi
-
-
- mov esi, [esp + 12]
- mov eax, [esp + 16]
- mov edi, [esp + 20]
- mov ecx, [esp + 28]
- mov edx, [esp + 24]
- sub esi, 2
-
- WELS_Zero xmm7
- movdqa xmm6, [h264_w0x10_1]
-.y_loop:
-
- movq xmm0, [esi]
- punpcklbw xmm0, xmm7
- movq xmm1, [esi+5]
- punpcklbw xmm1, xmm7
- movq xmm2, [esi+1]
- punpcklbw xmm2, xmm7
- movq xmm3, [esi+4]
- punpcklbw xmm3, xmm7
- movq xmm4, [esi+2]
- punpcklbw xmm4, xmm7
- movq xmm5, [esi+3]
- punpcklbw xmm5, xmm7
-
- paddw xmm2, xmm3
- paddw xmm4, xmm5
- psllw xmm4, 2
- psubw xmm4, xmm2
- paddw xmm0, xmm1
- paddw xmm0, xmm4
- psllw xmm4, 2
- paddw xmm0, xmm4
- paddw xmm0, xmm6
- psraw xmm0, 5
- packuswb xmm0, xmm7
- movq [edi], xmm0
-
- movq xmm0, [esi+8]
- punpcklbw xmm0, xmm7
- movq xmm1, [esi+5+8]
- punpcklbw xmm1, xmm7
- movq xmm2, [esi+1+8]
- punpcklbw xmm2, xmm7
- movq xmm3, [esi+4+8]
- punpcklbw xmm3, xmm7
- movq xmm4, [esi+2+8]
- punpcklbw xmm4, xmm7
- movq xmm5, [esi+3+8]
- punpcklbw xmm5, xmm7
-
- paddw xmm2, xmm3
- paddw xmm4, xmm5
- psllw xmm4, 2
- psubw xmm4, xmm2
- paddw xmm0, xmm1
- paddw xmm0, xmm4
- psllw xmm4, 2
- paddw xmm0, xmm4
- paddw xmm0, xmm6
- psraw xmm0, 5
- packuswb xmm0, xmm7
- movq [edi+8], xmm0
-
-
- add esi, eax
- add edi, edx
- dec ecx
- jnz .y_loop
- pop edi
- pop esi
- ret
-
-
-ALIGN 16
-;***********************************************************************
-; void McHorVer22Width8HorFirst_sse2( uint8_t*pSrc,
-; int32_t iSrcStride,
-; uint8_t* pTap,
-; int32_t iTapStride,
-; int32_t iHeight);
-;***********************************************************************
-McHorVer22Width8HorFirst_sse2:
- push esi
- push edi
- push ebx
- mov esi, [esp+16] ;pSrc
- mov eax, [esp+20] ;src_stride
- mov edi, [esp+24] ;tap
- mov edx, [esp+28] ;tap_stride
- mov ebx, [esp+32] ;i_height
- pxor xmm7, xmm7
-
- sub esi, eax ;;;;;;;;need more 5 lines.
- sub esi, eax
-
-.yloop_width_8:
- movq xmm0, [esi]
- punpcklbw xmm0, xmm7
- movq xmm1, [esi+5]
- punpcklbw xmm1, xmm7
- movq xmm2, [esi+1]
- punpcklbw xmm2, xmm7
- movq xmm3, [esi+4]
- punpcklbw xmm3, xmm7
- movq xmm4, [esi+2]
- punpcklbw xmm4, xmm7
- movq xmm5, [esi+3]
- punpcklbw xmm5, xmm7
-
- paddw xmm2, xmm3
- paddw xmm4, xmm5
- psllw xmm4, 2
- psubw xmm4, xmm2
- paddw xmm0, xmm1
- paddw xmm0, xmm4
- psllw xmm4, 2
- paddw xmm0, xmm4
- movdqa [edi], xmm0
-
- add esi, eax
- add edi, edx
- dec ebx
- jnz .yloop_width_8
- pop ebx
- pop edi
- pop esi
- ret
-
-;***********************************************************************
-; void McHorVer02WidthEq8_sse2( uint8_t *pSrc,
-; int32_t iSrcStride,
-; uint8_t *pDst,
-; int32_t iDstStride,
-; int32_t iHeight )
-;***********************************************************************
-ALIGN 16
-McHorVer02WidthEq8_sse2:
- push esi
- push edi
-
- mov esi, [esp + 12]
- mov edx, [esp + 16]
- mov edi, [esp + 20]
- mov eax, [esp + 24]
- mov ecx, [esp + 28]
-
- sub esi, edx
- sub esi, edx
-
- WELS_Zero xmm7
-
- SSE_LOAD_8P xmm0, xmm7, [esi]
- SSE_LOAD_8P xmm1, xmm7, [esi+edx]
- lea esi, [esi+2*edx]
- SSE_LOAD_8P xmm2, xmm7, [esi]
- SSE_LOAD_8P xmm3, xmm7, [esi+edx]
- lea esi, [esi+2*edx]
- SSE_LOAD_8P xmm4, xmm7, [esi]
- SSE_LOAD_8P xmm5, xmm7, [esi+edx]
-
-.start:
- FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
- dec ecx
- jz near .xx_exit
-
- lea esi, [esi+2*edx]
- SSE_LOAD_8P xmm6, xmm7, [esi]
- FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [edi+eax]
- dec ecx
- jz near .xx_exit
-
- lea edi, [edi+2*eax]
- SSE_LOAD_8P xmm7, xmm0, [esi+edx]
- FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [edi]
- dec ecx
- jz near .xx_exit
-
- lea esi, [esi+2*edx]
- SSE_LOAD_8P xmm0, xmm1, [esi]
- FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [edi+eax]
- dec ecx
- jz near .xx_exit
-
- lea edi, [edi+2*eax]
- SSE_LOAD_8P xmm1, xmm2, [esi+edx]
- FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [edi]
- dec ecx
- jz near .xx_exit
-
- lea esi, [esi+2*edx]
- SSE_LOAD_8P xmm2, xmm3, [esi]
- FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [edi+eax]
- dec ecx
- jz near .xx_exit
-
- lea edi, [edi+2*eax]
- SSE_LOAD_8P xmm3, xmm4, [esi+edx]
- FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [edi]
- dec ecx
- jz near .xx_exit
-
- lea esi, [esi+2*edx]
- SSE_LOAD_8P xmm4, xmm5, [esi]
- FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [edi+eax]
- dec ecx
- jz near .xx_exit
-
- lea edi, [edi+2*eax]
- SSE_LOAD_8P xmm5, xmm6, [esi+edx]
- jmp near .start
-
-.xx_exit:
- pop edi
- pop esi
- ret
-
-
-
-
-;***********************************************************************
-; Local Data (Read Only)
-;***********************************************************************
-
-%ifdef FORMAT_COFF
-SECTION .rodata pData
-%else
-SECTION .rodata align=16
-%endif
-
-;***********************************************************************
-; Various memory constants (trigonometric values or rounding values)
-;***********************************************************************
-ALIGN 16
-h264_w0x10_1:
- dw 16, 16, 16, 16, 16, 16, 16, 16
-ALIGN 16
-h264_mc_hc_32:
- dw 32, 32, 32, 32, 32, 32, 32, 32
-
-;***********************************************************************
-; Code
-;***********************************************************************
-
-SECTION .text
-
-WELS_EXTERN McHorVer20_sse2
-WELS_EXTERN McHorVer02_sse2
-WELS_EXTERN McHorVer22VerLastAlign_sse2
-WELS_EXTERN McHorVer22VerLastUnAlign_sse2
-WELS_EXTERN McHorVer22HorFirst_sse2
-
-
-;***********************************************************************
-; void McHorVer02_sse2( uint8_t *pSrc,
-; int32_t iSrcStride,
-; uint8_t *pDst,
-; int32_t iDstStride,
-; int32_t iWidth,
-; int32_t iHeight )
-;***********************************************************************
-ALIGN 16
-McHorVer02_sse2:
- push esi
- push edi
- push ebx
-
- mov esi, [esp + 16]
- mov edx, [esp + 20]
- mov edi, [esp + 24]
- mov eax, [esp + 28]
- mov ecx, [esp + 36]
- mov ebx, [esp + 32]
- shr ebx, 3
- sub esi, edx
- sub esi, edx
-
-.xloop:
- WELS_Zero xmm7
- SSE_LOAD_8P xmm0, xmm7, [esi]
- SSE_LOAD_8P xmm1, xmm7, [esi+edx]
- lea esi, [esi+2*edx]
- SSE_LOAD_8P xmm2, xmm7, [esi]
- SSE_LOAD_8P xmm3, xmm7, [esi+edx]
- lea esi, [esi+2*edx]
- SSE_LOAD_8P xmm4, xmm7, [esi]
- SSE_LOAD_8P xmm5, xmm7, [esi+edx]
-
- FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
- dec ecx
- lea esi, [esi+2*edx]
- SSE_LOAD_8P xmm6, xmm7, [esi]
- movdqa xmm0,xmm1
- movdqa xmm1,xmm2
- movdqa xmm2,xmm3
- movdqa xmm3,xmm4
- movdqa xmm4,xmm5
- movdqa xmm5,xmm6
- add edi, eax
- sub esi, edx
-
-.start:
- FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
- dec ecx
- jz near .x_loop_dec
-
- lea esi, [esi+2*edx]
- SSE_LOAD_8P xmm6, xmm7, [esi]
- FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [edi+eax]
- dec ecx
- jz near .x_loop_dec
-
- lea edi, [edi+2*eax]
- SSE_LOAD_8P xmm7, xmm0, [esi+edx]
- FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [edi]
- dec ecx
- jz near .x_loop_dec
-
- lea esi, [esi+2*edx]
- SSE_LOAD_8P xmm0, xmm1, [esi]
- FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [edi+eax]
- dec ecx
- jz near .x_loop_dec
-
- lea edi, [edi+2*eax]
- SSE_LOAD_8P xmm1, xmm2, [esi+edx]
- FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [edi]
- dec ecx
- jz near .x_loop_dec
-
- lea esi, [esi+2*edx]
- SSE_LOAD_8P xmm2, xmm3, [esi]
- FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [edi+eax]
- dec ecx
- jz near .x_loop_dec
-
- lea edi, [edi+2*eax]
- SSE_LOAD_8P xmm3, xmm4, [esi+edx]
- FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [edi]
- dec ecx
- jz near .x_loop_dec
-
- lea esi, [esi+2*edx]
- SSE_LOAD_8P xmm4, xmm5, [esi]
- FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [edi+eax]
- dec ecx
- jz near .x_loop_dec
-
- lea edi, [edi+2*eax]
- SSE_LOAD_8P xmm5, xmm6, [esi+edx]
- jmp near .start
-
-.x_loop_dec:
- dec ebx
- jz near .xx_exit
- mov esi, [esp + 16]
- mov edi, [esp + 24]
- sub esi, edx
- sub esi, edx
- add esi, 8
- add edi, 8
- mov ecx, [esp + 36]
- jmp near .xloop
-
-.xx_exit:
- pop ebx
- pop edi
- pop esi
- ret
-
-
-ALIGN 16
-;***********************************************************************
-; void McHorVer20_sse2( uint8_t *pSrc,
-; int32_t iSrcStride,
-; uint8_t *pDst,
-; int32_t iDstStride,
-; int32_t iWidth,
-; int32_t iHeight
-; );
-;***********************************************************************
-McHorVer20_sse2:
- push esi
- push edi
- push ebx
- mov esi, [esp+16]
- mov eax, [esp+20]
- mov edi, [esp+24]
- mov edx, [esp+28]
- mov ecx, [esp+32]
- mov ebx, [esp+36]
- sub esi, 2
- pxor xmm7, xmm7
-
- cmp ecx, 9
- jne near .width_17
-
-.yloop_width_9:
- movq xmm0, [esi]
- punpcklbw xmm0, xmm7
- movq xmm1, [esi+5]
- punpcklbw xmm1, xmm7
- movq xmm2, [esi+1]
- punpcklbw xmm2, xmm7
- movq xmm3, [esi+4]
- punpcklbw xmm3, xmm7
- movq xmm4, [esi+2]
- punpcklbw xmm4, xmm7
- movq xmm5, [esi+3]
- punpcklbw xmm5, xmm7
-
- movdqa xmm7, xmm2
- paddw xmm7, xmm3
- movdqa xmm6, xmm4
- paddw xmm6, xmm5
- psllw xmm6, 2
- psubw xmm6, xmm7
- paddw xmm0, xmm1
- paddw xmm0, xmm6
- psllw xmm6, 2
- paddw xmm0, xmm6
- paddw xmm0, [h264_w0x10_1]
- psraw xmm0, 5
- packuswb xmm0, xmm0
- movd [edi], xmm0
-
- pxor xmm7, xmm7
- movq xmm0, [esi+6]
- punpcklbw xmm0, xmm7
-
- paddw xmm4, xmm1
- paddw xmm5, xmm3
- psllw xmm5, 2
- psubw xmm5, xmm4
- paddw xmm2, xmm0
- paddw xmm2, xmm5
- psllw xmm5, 2
- paddw xmm2, xmm5
- paddw xmm2, [h264_w0x10_1]
- psraw xmm2, 5
- packuswb xmm2, xmm2
- movq [edi+1], xmm2
-
- add esi, eax
- add edi, edx
- dec ebx
- jnz .yloop_width_9
- pop ebx
- pop edi
- pop esi
- ret
-
-
-.width_17:
-.yloop_width_17:
- movq xmm0, [esi]
- punpcklbw xmm0, xmm7
- movq xmm1, [esi+5]
- punpcklbw xmm1, xmm7
- movq xmm2, [esi+1]
- punpcklbw xmm2, xmm7
- movq xmm3, [esi+4]
- punpcklbw xmm3, xmm7
- movq xmm4, [esi+2]
- punpcklbw xmm4, xmm7
- movq xmm5, [esi+3]
- punpcklbw xmm5, xmm7
-
- paddw xmm2, xmm3
- paddw xmm4, xmm5
- psllw xmm4, 2
- psubw xmm4, xmm2
- paddw xmm0, xmm1
- paddw xmm0, xmm4
- psllw xmm4, 2
- paddw xmm0, xmm4
- paddw xmm0, [h264_w0x10_1]
- psraw xmm0, 5
- packuswb xmm0, xmm0
- movq [edi], xmm0
-
- movq xmm0, [esi+8]
- punpcklbw xmm0, xmm7
- movq xmm1, [esi+5+8]
- punpcklbw xmm1, xmm7
- movq xmm2, [esi+1+8]
- punpcklbw xmm2, xmm7
- movq xmm3, [esi+4+8]
- punpcklbw xmm3, xmm7
- movq xmm4, [esi+2+8]
- punpcklbw xmm4, xmm7
- movq xmm5, [esi+3+8]
- punpcklbw xmm5, xmm7
-
- movdqa xmm7, xmm2
- paddw xmm7, xmm3
- movdqa xmm6, xmm4
- paddw xmm6, xmm5
- psllw xmm6, 2
- psubw xmm6, xmm7
- paddw xmm0, xmm1
- paddw xmm0, xmm6
- psllw xmm6, 2
- paddw xmm0, xmm6
- paddw xmm0, [h264_w0x10_1]
- psraw xmm0, 5
- packuswb xmm0, xmm0
- movd [edi+8], xmm0
-
-
- pxor xmm7, xmm7
- movq xmm0, [esi+6+8]
- punpcklbw xmm0, xmm7
-
- paddw xmm4, xmm1
- paddw xmm5, xmm3
- psllw xmm5, 2
- psubw xmm5, xmm4
- paddw xmm2, xmm0
- paddw xmm2, xmm5
- psllw xmm5, 2
- paddw xmm2, xmm5
- paddw xmm2, [h264_w0x10_1]
- psraw xmm2, 5
- packuswb xmm2, xmm2
- movq [edi+9], xmm2
- add esi, eax
- add edi, edx
- dec ebx
- jnz .yloop_width_17
- pop ebx
- pop edi
- pop esi
- ret
-
-
-
-ALIGN 16
-;***********************************************************************
-;void McHorVer22HorFirst_sse2
-; (uint8_t *pSrc,
-; int32_t iSrcStride,
-; uint8_t * pTap,
-; int32_t iTapStride,
-; int32_t iWidth,int32_t iHeight);
-;***********************************************************************
-McHorVer22HorFirst_sse2:
- push esi
- push edi
- push ebx
- mov esi, [esp+16]
- mov eax, [esp+20]
- mov edi, [esp+24]
- mov edx, [esp+28]
- mov ecx, [esp+32]
- mov ebx, [esp+36]
- pxor xmm7, xmm7
-
- sub esi, eax ;;;;;;;;need more 5 lines.
- sub esi, eax
-
- cmp ecx, 9
- jne near .width_17
-
-.yloop_width_9:
- movq xmm0, [esi]
- punpcklbw xmm0, xmm7
- movq xmm1, [esi+5]
- punpcklbw xmm1, xmm7
- movq xmm2, [esi+1]
- punpcklbw xmm2, xmm7
- movq xmm3, [esi+4]
- punpcklbw xmm3, xmm7
- movq xmm4, [esi+2]
- punpcklbw xmm4, xmm7
- movq xmm5, [esi+3]
- punpcklbw xmm5, xmm7
-
- movdqa xmm7, xmm2
- paddw xmm7, xmm3
- movdqa xmm6, xmm4
- paddw xmm6, xmm5
- psllw xmm6, 2
- psubw xmm6, xmm7
- paddw xmm0, xmm1
- paddw xmm0, xmm6
- psllw xmm6, 2
- paddw xmm0, xmm6
- movd [edi], xmm0
-
- pxor xmm7, xmm7
- movq xmm0, [esi+6]
- punpcklbw xmm0, xmm7
-
- paddw xmm4, xmm1
- paddw xmm5, xmm3
- psllw xmm5, 2
- psubw xmm5, xmm4
- paddw xmm2, xmm0
- paddw xmm2, xmm5
- psllw xmm5, 2
- paddw xmm2, xmm5
- movq [edi+2], xmm2
- movhps [edi+2+8], xmm2
-
- add esi, eax
- add edi, edx
- dec ebx
- jnz .yloop_width_9
- pop ebx
- pop edi
- pop esi
- ret
-
-
-.width_17:
-.yloop_width_17:
- movq xmm0, [esi]
- punpcklbw xmm0, xmm7
- movq xmm1, [esi+5]
- punpcklbw xmm1, xmm7
- movq xmm2, [esi+1]
- punpcklbw xmm2, xmm7
- movq xmm3, [esi+4]
- punpcklbw xmm3, xmm7
- movq xmm4, [esi+2]
- punpcklbw xmm4, xmm7
- movq xmm5, [esi+3]
- punpcklbw xmm5, xmm7
-
- paddw xmm2, xmm3
- paddw xmm4, xmm5
- psllw xmm4, 2
- psubw xmm4, xmm2
- paddw xmm0, xmm1
- paddw xmm0, xmm4
- psllw xmm4, 2
- paddw xmm0, xmm4
- movdqa [edi], xmm0
-
- movq xmm0, [esi+8]
- punpcklbw xmm0, xmm7
- movq xmm1, [esi+5+8]
- punpcklbw xmm1, xmm7
- movq xmm2, [esi+1+8]
- punpcklbw xmm2, xmm7
- movq xmm3, [esi+4+8]
- punpcklbw xmm3, xmm7
- movq xmm4, [esi+2+8]
- punpcklbw xmm4, xmm7
- movq xmm5, [esi+3+8]
- punpcklbw xmm5, xmm7
-
- movdqa xmm7, xmm2
- paddw xmm7, xmm3
- movdqa xmm6, xmm4
- paddw xmm6, xmm5
- psllw xmm6, 2
- psubw xmm6, xmm7
- paddw xmm0, xmm1
- paddw xmm0, xmm6
- psllw xmm6, 2
- paddw xmm0, xmm6
- movd [edi+16], xmm0
-
-
- pxor xmm7, xmm7
- movq xmm0, [esi+6+8]
- punpcklbw xmm0, xmm7
-
- paddw xmm4, xmm1
- paddw xmm5, xmm3
- psllw xmm5, 2
- psubw xmm5, xmm4
- paddw xmm2, xmm0
- paddw xmm2, xmm5
- psllw xmm5, 2
- paddw xmm2, xmm5
- movq [edi+18], xmm2
- movhps [edi+18+8], xmm2
-
- add esi, eax
- add edi, edx
- dec ebx
- jnz .yloop_width_17
- pop ebx
- pop edi
- pop esi
- ret
-
-
-%macro FILTER_VER 9
- paddw %1, %6
- movdqa %7, %2
- movdqa %8, %3
-
-
- paddw %7, %5
- paddw %8, %4
-
- psubw %1, %7
- psraw %1, 2
- paddw %1, %8
- psubw %1, %7
- psraw %1, 2
- paddw %8, %1
- paddw %8, [h264_mc_hc_32]
- psraw %8, 6
- packuswb %8, %8
- movq %9, %8
-%endmacro
-;***********************************************************************
-;void McHorVer22VerLastAlign_sse2(
-; uint8_t *pTap,
-; int32_t iTapStride,
-; uint8_t * pDst,
-; int32_t iDstStride,
-; int32_t iWidth,
-; int32_t iHeight);
-;***********************************************************************
-
- McHorVer22VerLastAlign_sse2:
- push esi
- push edi
- push ebx
- push ebp
-
- mov esi, [esp+20]
- mov eax, [esp+24]
- mov edi, [esp+28]
- mov edx, [esp+32]
- mov ebx, [esp+36]
- mov ecx, [esp+40]
- shr ebx, 3
-
-.width_loop:
- movdqa xmm0, [esi]
- movdqa xmm1, [esi+eax]
- lea esi, [esi+2*eax]
- movdqa xmm2, [esi]
- movdqa xmm3, [esi+eax]
- lea esi, [esi+2*eax]
- movdqa xmm4, [esi]
- movdqa xmm5, [esi+eax]
-
- FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
- dec ecx
- lea esi, [esi+2*eax]
- movdqa xmm6, [esi]
-
- movdqa xmm0, xmm1
- movdqa xmm1, xmm2
- movdqa xmm2, xmm3
- movdqa xmm3, xmm4
- movdqa xmm4, xmm5
- movdqa xmm5, xmm6
-
- add edi, edx
- sub esi, eax
-
-.start:
- FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
- dec ecx
- jz near .x_loop_dec
-
- lea esi, [esi+2*eax]
- movdqa xmm6, [esi]
- FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[edi+edx]
- dec ecx
- jz near .x_loop_dec
-
- lea edi, [edi+2*edx]
- movdqa xmm7, [esi+eax]
- FILTER_VER xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [edi]
- dec ecx
- jz near .x_loop_dec
-
- lea esi, [esi+2*eax]
- movdqa xmm0, [esi]
- FILTER_VER xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[edi+edx]
- dec ecx
- jz near .x_loop_dec
-
- lea edi, [edi+2*edx]
- movdqa xmm1, [esi+eax]
- FILTER_VER xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[edi]
- dec ecx
- jz near .x_loop_dec
-
- lea esi, [esi+2*eax]
- movdqa xmm2, [esi]
- FILTER_VER xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[edi+edx]
- dec ecx
- jz near .x_loop_dec
-
- lea edi, [edi+2*edx]
- movdqa xmm3, [esi+eax]
- FILTER_VER xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[edi]
- dec ecx
- jz near .x_loop_dec
-
- lea esi, [esi+2*eax]
- movdqa xmm4, [esi]
- FILTER_VER xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [edi+edx]
- dec ecx
- jz near .x_loop_dec
-
- lea edi, [edi+2*edx]
- movdqa xmm5, [esi+eax]
- jmp near .start
-
-.x_loop_dec:
- dec ebx
- jz near .exit
- mov esi, [esp+20]
- mov edi, [esp+28]
- mov ecx, [esp+40]
- add esi, 16
- add edi, 8
- jmp .width_loop
-
-
-
-.exit:
- pop ebp
- pop ebx
- pop edi
- pop esi
- ret
-
-;***********************************************************************
-;void McHorVer22VerLastUnAlign_sse2(
-; uint8_t *pTap,
-; int32_t iTapStride,
-; uint8_t * pDst,
-; int32_t iDstStride,
-; int32_t iWidth,
-; int32_t iHeight);
-;***********************************************************************
-
- McHorVer22VerLastUnAlign_sse2:
- push esi
- push edi
- push ebx
- push ebp
-
- mov esi, [esp+20]
- mov eax, [esp+24]
- mov edi, [esp+28]
- mov edx, [esp+32]
- mov ebx, [esp+36]
- mov ecx, [esp+40]
- shr ebx, 3
-
-.width_loop:
- movdqu xmm0, [esi]
- movdqu xmm1, [esi+eax]
- lea esi, [esi+2*eax]
- movdqu xmm2, [esi]
- movdqu xmm3, [esi+eax]
- lea esi, [esi+2*eax]
- movdqu xmm4, [esi]
- movdqu xmm5, [esi+eax]
-
- FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
- dec ecx
- lea esi, [esi+2*eax]
- movdqu xmm6, [esi]
-
- movdqa xmm0, xmm1
- movdqa xmm1, xmm2
- movdqa xmm2, xmm3
- movdqa xmm3, xmm4
- movdqa xmm4, xmm5
- movdqa xmm5, xmm6
-
- add edi, edx
- sub esi, eax
-
-.start:
- FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
- dec ecx
- jz near .x_loop_dec
-
- lea esi, [esi+2*eax]
- movdqu xmm6, [esi]
- FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[edi+edx]
- dec ecx
- jz near .x_loop_dec
-
- lea edi, [edi+2*edx]
- movdqu xmm7, [esi+eax]
- FILTER_VER xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [edi]
- dec ecx
- jz near .x_loop_dec
-
- lea esi, [esi+2*eax]
- movdqu xmm0, [esi]
- FILTER_VER xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[edi+edx]
- dec ecx
- jz near .x_loop_dec
-
- lea edi, [edi+2*edx]
- movdqu xmm1, [esi+eax]
- FILTER_VER xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[edi]
- dec ecx
- jz near .x_loop_dec
-
- lea esi, [esi+2*eax]
- movdqu xmm2, [esi]
- FILTER_VER xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[edi+edx]
- dec ecx
- jz near .x_loop_dec
-
- lea edi, [edi+2*edx]
- movdqu xmm3, [esi+eax]
- FILTER_VER xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[edi]
- dec ecx
- jz near .x_loop_dec
-
- lea esi, [esi+2*eax]
- movdqu xmm4, [esi]
- FILTER_VER xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [edi+edx]
- dec ecx
- jz near .x_loop_dec
-
- lea edi, [edi+2*edx]
- movdqu xmm5, [esi+eax]
- jmp near .start
-
-.x_loop_dec:
- dec ebx
- jz near .exit
- mov esi, [esp+20]
- mov edi, [esp+28]
- mov ecx, [esp+40]
- add esi, 16
- add edi, 8
- jmp .width_loop
-
-
-
-.exit:
- pop ebp
- pop ebx
- pop edi
- pop esi
- ret
--- a/codec/encoder/core/asm/memzero.asm
+++ b/codec/encoder/core/asm/memzero.asm
@@ -32,8 +32,8 @@
;* memzero.asm
;*
;* Abstract
+;*
;*
-;*
;* History
;* 9/16/2009 Created
;*
@@ -40,15 +40,13 @@
;*
;*************************************************************************/
-BITS 32
-
%include "asm_inc.asm"
;***********************************************************************
; Code
;***********************************************************************
-SECTION .text
-
+SECTION .text
+
ALIGN 16
;***********************************************************************
;_inline void __cdecl WelsPrefetchZero_mmx(int8_t const*_A);
@@ -55,9 +53,11 @@
;***********************************************************************
WELS_EXTERN WelsPrefetchZero_mmx
WelsPrefetchZero_mmx:
- mov eax,[esp+4]
- prefetchnta [eax]
- ret
+ %assign push_num 0
+ LOAD_1_PARA
+ ;mov eax,[esp+4]
+ prefetchnta [r0]
+ ret
ALIGN 16
@@ -66,23 +66,25 @@
;***********************************************************************
WELS_EXTERN WelsSetMemZeroAligned64_sse2
WelsSetMemZeroAligned64_sse2:
- mov eax, [esp + 4] ; dst
- mov ecx, [esp + 8]
- neg ecx
+ %assign push_num 0
+ LOAD_2_PARA
+ SIGN_EXTENTION r1, r1d
+ neg r1
+
pxor xmm0, xmm0
.memzeroa64_sse2_loops:
- movdqa [eax], xmm0
- movdqa [eax+16], xmm0
- movdqa [eax+32], xmm0
- movdqa [eax+48], xmm0
- add eax, 0x40
-
- add ecx, 0x40
+ movdqa [r0], xmm0
+ movdqa [r0+16], xmm0
+ movdqa [r0+32], xmm0
+ movdqa [r0+48], xmm0
+ add r0, 0x40
+
+ add r1, 0x40
jnz near .memzeroa64_sse2_loops
+
+ ret
- ret
-
ALIGN 16
;***********************************************************************
; void WelsSetMemZeroSize64_mmx(void *dst, int32_t size)
@@ -89,47 +91,51 @@
;***********************************************************************
WELS_EXTERN WelsSetMemZeroSize64_mmx
WelsSetMemZeroSize64_mmx:
- mov eax, [esp + 4] ; dst
- mov ecx, [esp + 8]
- neg ecx
+ %assign push_num 0
+ LOAD_2_PARA
+ SIGN_EXTENTION r1, r1d
+ neg r1
+
pxor mm0, mm0
.memzero64_mmx_loops:
- movq [eax], mm0
- movq [eax+8], mm0
- movq [eax+16], mm0
- movq [eax+24], mm0
- movq [eax+32], mm0
- movq [eax+40], mm0
- movq [eax+48], mm0
- movq [eax+56], mm0
- add eax, 0x40
-
- add ecx, 0x40
+ movq [r0], mm0
+ movq [r0+8], mm0
+ movq [r0+16], mm0
+ movq [r0+24], mm0
+ movq [r0+32], mm0
+ movq [r0+40], mm0
+ movq [r0+48], mm0
+ movq [r0+56], mm0
+ add r0, 0x40
+
+ add r1, 0x40
jnz near .memzero64_mmx_loops
-
- WELSEMMS
- ret
-
-ALIGN 16
+
+ WELSEMMS
+ ret
+
+ALIGN 16
;***********************************************************************
; void WelsSetMemZeroSize8_mmx(void *dst, int32_t size)
;***********************************************************************
WELS_EXTERN WelsSetMemZeroSize8_mmx
WelsSetMemZeroSize8_mmx:
- mov eax, [esp + 4] ; dst
- mov ecx, [esp + 8] ; size
- neg ecx
- pxor mm0, mm0
+ %assign push_num 0
+ LOAD_2_PARA
+ SIGN_EXTENTION r1, r1d
+ neg r1
+ pxor mm0, mm0
+
.memzero8_mmx_loops:
- movq [eax], mm0
- add eax, 0x08
-
- add ecx, 0x08
+ movq [r0], mm0
+ add r0, 0x08
+
+ add r1, 0x08
jnz near .memzero8_mmx_loops
+
+ WELSEMMS
+ ret
- WELSEMMS
- ret
-
-
+
--- a/codec/encoder/core/asm/quant.asm
+++ b/codec/encoder/core/asm/quant.asm
@@ -42,7 +42,6 @@
%include "asm_inc.asm"
-BITS 32
SECTION .text
;************************************************
@@ -86,14 +85,16 @@
WELS_EXTERN WelsQuant4x4_sse2
align 16
WelsQuant4x4_sse2:
- mov eax, [ff]
- mov ecx, [mf]
- MOVDQ xmm2, [eax]
- MOVDQ xmm3, [ecx]
+ %assign push_num 0
+ LOAD_3_PARA
+ ;mov eax, [ff]
+ ;mov ecx, [mf]
+ movdqa xmm2, [r1]
+ movdqa xmm3, [r2]
- mov edx, [pDct]
- SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [edx]
- SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [edx + 0x10]
+ ;mov edx, [pDct]
+ SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0]
+ SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10]
ret
@@ -103,15 +104,21 @@
WELS_EXTERN WelsQuant4x4Dc_sse2
align 16
WelsQuant4x4Dc_sse2:
- mov ax, [mf]
- SSE2_Copy8Times xmm3, eax
+ %assign push_num 0
+ LOAD_3_PARA
+ %ifndef X86_32
+ movsx r1, r1w
+ movsx r2, r2w
+ %endif
+ ;mov ax, [mf]
+ SSE2_Copy8Times xmm3, r2d
- mov cx, [ff]
- SSE2_Copy8Times xmm2, ecx
+ ;mov cx, [ff]
+ SSE2_Copy8Times xmm2, r1d
- mov edx, [pDct]
- SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [edx]
- SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [edx + 0x10]
+ ;mov edx, [pDct]
+ SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0]
+ SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10]
ret
@@ -121,20 +128,22 @@
WELS_EXTERN WelsQuantFour4x4_sse2
align 16
WelsQuantFour4x4_sse2:
- mov eax, [ff]
- mov ecx, [mf]
- MOVDQ xmm2, [eax]
- MOVDQ xmm3, [ecx]
+ %assign push_num 0
+ LOAD_3_PARA
+ ;mov eax, [ff]
+ ;mov ecx, [mf]
+ MOVDQ xmm2, [r1]
+ MOVDQ xmm3, [r2]
- mov edx, [pDct]
- SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [edx]
- SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [edx + 0x10]
- SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [edx + 0x20]
- SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [edx + 0x30]
- SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [edx + 0x40]
- SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [edx + 0x50]
- SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [edx + 0x60]
- SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [edx + 0x70]
+ ;mov edx, [pDct]
+ SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0]
+ SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10]
+ SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x20]
+ SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x30]
+ SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x40]
+ SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x50]
+ SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x60]
+ SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x70]
ret
@@ -144,24 +153,26 @@
WELS_EXTERN WelsQuantFour4x4Max_sse2
align 16
WelsQuantFour4x4Max_sse2:
- mov eax, [ff]
- mov ecx, [mf]
- MOVDQ xmm2, [eax]
- MOVDQ xmm3, [ecx]
+ %assign push_num 0
+ LOAD_4_PARA
+ ;mov eax, [ff]
+ ;mov ecx, [mf]
+ MOVDQ xmm2, [r1]
+ MOVDQ xmm3, [r2]
- mov edx, [pDct]
+ ;mov edx, [pDct]
pxor xmm4, xmm4
pxor xmm5, xmm5
pxor xmm6, xmm6
pxor xmm7, xmm7
- SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [edx ], xmm4
- SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [edx + 0x10], xmm4
- SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [edx + 0x20], xmm5
- SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [edx + 0x30], xmm5
- SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [edx + 0x40], xmm6
- SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [edx + 0x50], xmm6
- SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [edx + 0x60], xmm7
- SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [edx + 0x70], xmm7
+ SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 ], xmm4
+ SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10], xmm4
+ SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x20], xmm5
+ SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x30], xmm5
+ SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x40], xmm6
+ SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x50], xmm6
+ SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x60], xmm7
+ SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x70], xmm7
SSE2_TransTwo4x4W xmm4, xmm5, xmm6, xmm7, xmm0
pmaxsw xmm0, xmm4
@@ -171,9 +182,9 @@
punpckhqdq xmm0, xmm1
pmaxsw xmm0, xmm1
- mov edx, [max]
- movq [edx], xmm0
-
+ ;mov r0, [r3]
+ movq [r3], xmm0
+ LOAD_4_PARA_POP
ret
%macro MMX_Copy4Times 2
@@ -203,21 +214,20 @@
WELS_EXTERN WelsHadamardQuant2x2_mmx
align 16
WelsHadamardQuant2x2_mmx:
-
- mov eax, [pDct]
- movd mm0, [eax]
- movd mm1, [eax + 0x20]
+ %assign push_num 0
+ LOAD_5_PARA
+ %ifndef X86_32
+ movsx r1, r1w
+ movsx r2, r2w
+ %endif
+ ;mov eax, [pDct]
+ movd mm0, [r0]
+ movd mm1, [r0 + 0x20]
punpcklwd mm0, mm1
- movd mm3, [eax + 0x40]
- movd mm1, [eax + 0x60]
+ movd mm3, [r0 + 0x40]
+ movd mm1, [r0 + 0x60]
punpcklwd mm3, mm1
- mov cx, 0
- mov [eax], cx
- mov [eax + 0x20], cx
- mov [eax + 0x40], cx
- mov [eax + 0x60], cx
-
;hdm_2x2, mm0 = dct0 dct1, mm3 = dct2 dct3
movq mm5, mm3
paddw mm3, mm0
@@ -231,17 +241,17 @@
punpcklwd mm1, mm3
;quant_2x2_dc
- mov ax, [mf]
- MMX_Copy4Times mm3, eax
- mov cx, [ff]
- MMX_Copy4Times mm2, ecx
+ ;mov ax, [mf]
+ MMX_Copy4Times mm3, r2d
+ ;mov cx, [ff]
+ MMX_Copy4Times mm2, r1d
MMX_Quant4 mm1, mm0, mm2, mm3
; store dct_2x2
- mov edx, [dct2x2]
- movq [edx], mm1
- mov ecx, [iChromaDc]
- movq [ecx], mm1
+ ;mov edx, [dct2x2]
+ movq [r3], mm1
+ ;mov ecx, [iChromaDc]
+ movq [r4], mm1
; pNonZeroCount of dct_2x2
pcmpeqb mm2, mm2 ; mm2 = FF
@@ -250,9 +260,17 @@
pcmpeqb mm1, mm3 ; set FF if equal, 0 if not equal
psubsb mm1, mm2 ; set 0 if equal, 1 if not equal
psadbw mm1, mm3 ;
- movd eax, mm1
+ mov r1w, 0
+ mov [r0], r1w
+ mov [r0 + 0x20], r1w
+ mov [r0 + 0x40], r1w
+ mov [r0 + 0x60], r1w
+
+ movd retrd, mm1
+
WELSEMMS
+ LOAD_5_PARA_POP
ret
;***********************************************************************
@@ -261,13 +279,18 @@
WELS_EXTERN WelsHadamardQuant2x2Skip_mmx
align 16
WelsHadamardQuant2x2Skip_mmx:
-
- mov eax, [pDct]
- movd mm0, [eax]
- movd mm1, [eax + 0x20]
+ %assign push_num 0
+ LOAD_3_PARA
+ %ifndef X86_32
+ movsx r1, r1w
+ movsx r2, r2w
+ %endif
+ ;mov eax, [pDct]
+ movd mm0, [r0]
+ movd mm1, [r0 + 0x20]
punpcklwd mm0, mm1
- movd mm3, [eax + 0x40]
- movd mm1, [eax + 0x60]
+ movd mm3, [r0 + 0x40]
+ movd mm1, [r0 + 0x60]
punpcklwd mm3, mm1
;hdm_2x2, mm0 = dct0 dct1, mm3 = dct2 dct3
@@ -283,10 +306,10 @@
punpcklwd mm1, mm3
;quant_2x2_dc
- mov ax, [mf]
- MMX_Copy4Times mm3, eax
- mov cx, [ff]
- MMX_Copy4Times mm2, ecx
+ ;mov ax, [mf]
+ MMX_Copy4Times mm3, r2d
+ ;mov cx, [ff]
+ MMX_Copy4Times mm2, r1d
MMX_Quant4 mm1, mm0, mm2, mm3
; pNonZeroCount of dct_2x2
@@ -296,7 +319,7 @@
pcmpeqb mm1, mm3 ; set FF if equal, 0 if not equal
psubsb mm1, mm2 ; set 0 if equal, 1 if not equal
psadbw mm1, mm3 ;
- movd eax, mm1
+ movd retrd, mm1
WELSEMMS
ret
@@ -317,12 +340,14 @@
WELS_EXTERN WelsDequant4x4_sse2
WelsDequant4x4_sse2:
;ecx = dequant_mf[qp], edx = pDct
- mov ecx, [esp + 8]
- mov edx, [esp + 4]
+ %assign push_num 0
+ LOAD_2_PARA
+ ;mov ecx, [esp + 8]
+ ;mov edx, [esp + 4]
- movdqa xmm1, [ecx]
- SSE2_DeQuant8 [edx ], xmm0, xmm1
- SSE2_DeQuant8 [edx+0x10 ], xmm0, xmm1
+ movdqa xmm1, [r1]
+ SSE2_DeQuant8 [r0 ], xmm0, xmm1
+ SSE2_DeQuant8 [r0 + 0x10], xmm0, xmm1
ret
@@ -335,18 +360,20 @@
WELS_EXTERN WelsDequantFour4x4_sse2
WelsDequantFour4x4_sse2:
;ecx = dequant_mf[qp], edx = pDct
- mov ecx, [esp + 8]
- mov edx, [esp + 4]
+ %assign push_num 0
+ LOAD_2_PARA
+ ;mov ecx, [esp + 8]
+ ;mov edx, [esp + 4]
- movdqa xmm1, [ecx]
- SSE2_DeQuant8 [edx ], xmm0, xmm1
- SSE2_DeQuant8 [edx+0x10 ], xmm0, xmm1
- SSE2_DeQuant8 [edx+0x20 ], xmm0, xmm1
- SSE2_DeQuant8 [edx+0x30 ], xmm0, xmm1
- SSE2_DeQuant8 [edx+0x40 ], xmm0, xmm1
- SSE2_DeQuant8 [edx+0x50 ], xmm0, xmm1
- SSE2_DeQuant8 [edx+0x60 ], xmm0, xmm1
- SSE2_DeQuant8 [edx+0x70 ], xmm0, xmm1
+ movdqa xmm1, [r1]
+ SSE2_DeQuant8 [r0 ], xmm0, xmm1
+ SSE2_DeQuant8 [r0+0x10 ], xmm0, xmm1
+ SSE2_DeQuant8 [r0+0x20 ], xmm0, xmm1
+ SSE2_DeQuant8 [r0+0x30 ], xmm0, xmm1
+ SSE2_DeQuant8 [r0+0x40 ], xmm0, xmm1
+ SSE2_DeQuant8 [r0+0x50 ], xmm0, xmm1
+ SSE2_DeQuant8 [r0+0x60 ], xmm0, xmm1
+ SSE2_DeQuant8 [r0+0x70 ], xmm0, xmm1
ret
@@ -356,14 +383,19 @@
WELS_EXTERN WelsDequantIHadamard4x4_sse2
align 16
WelsDequantIHadamard4x4_sse2:
- mov eax, [esp + 4]
- mov cx, [esp + 8]
+ %assign push_num 0
+ LOAD_2_PARA
+ %ifndef X86_32
+ movzx r1, r1w
+ %endif
+ ;mov eax, [esp + 4]
+ ;mov cx, [esp + 8]
; WelsDequantLumaDc4x4
- SSE2_Copy8Times xmm1, ecx
+ SSE2_Copy8Times xmm1, r1d
;psrlw xmm1, 2 ; for the (>>2) in ihdm
- MOVDQ xmm0, [eax]
- MOVDQ xmm2, [eax+0x10]
+ MOVDQ xmm0, [r0]
+ MOVDQ xmm2, [r0+0x10]
pmullw xmm0, xmm1
pmullw xmm2, xmm1
@@ -386,8 +418,8 @@
SSE2_TransTwo4x4W xmm0, xmm1, xmm4, xmm2, xmm3
punpcklqdq xmm0, xmm1
- MOVDQ [eax], xmm0
+ MOVDQ [r0], xmm0
punpcklqdq xmm2, xmm3
- MOVDQ [eax+16], xmm2
+ MOVDQ [r0+16], xmm2
ret
--- a/codec/encoder/core/asm/satd_sad.asm
+++ b/codec/encoder/core/asm/satd_sad.asm
@@ -1,2189 +1,2344 @@
-;*!
-;* \copy
-;* Copyright (c) 2009-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* satd_sad.asm
-;*
-;* Abstract
-;* WelsSampleSatd4x4_sse2
-;* WelsSampleSatd8x8_sse2
-;* WelsSampleSatd16x8_sse2
-;* WelsSampleSatd8x16_sse2
-;* WelsSampleSatd16x16_sse2
-;*
-;* WelsSampleSad16x8_sse2
-;* WelsSampleSad16x16_sse2
-;*
-;* History
-;* 8/5/2009 Created
-;* 24/9/2009 modified
-;*
-;*
-;*************************************************************************/
-
-%include "asm_inc.asm"
-
-BITS 32
-
-
-;***********************************************************************
-; Data
-;***********************************************************************
-SECTION .rodata align=16
-
-align 16
-HSumSubDB1: db 1,1,1,1,1,1,1,1,1,-1,1,-1,1,-1,1,-1
-align 16
-HSumSubDW1: dw 1,-1,1,-1,1,-1,1,-1
-align 16
-PDW1: dw 1,1,1,1,1,1,1,1
-align 16
-PDQ2: dw 2,0,0,0,2,0,0,0
-align 16
-HSwapSumSubDB1: times 2 db 1, 1, 1, 1, 1, -1, 1, -1
-
-;***********************************************************************
-; Code
-;***********************************************************************
-SECTION .text
-
-;***********************************************************************
-;
-;Pixel_satd_wxh_sse2 BEGIN
-;
-;***********************************************************************
-%macro MMX_DW_1_2REG 2
- pxor %1, %1
- pcmpeqw %2, %2
- psubw %1, %2
-%endmacro
-
-%macro SSE2_SumWHorizon1 2
- movdqa %2, %1
- psrldq %2, 8
- paddusw %1, %2
- movdqa %2, %1
- psrldq %2, 4
- paddusw %1, %2
- movdqa %2, %1
- psrldq %2, 2
- paddusw %1, %2
-%endmacro
-
-%macro SSE2_HDMTwo4x4 5 ;in: xmm1,xmm2,xmm3,xmm4 pOut: xmm4,xmm2,xmm1,xmm3
- SSE2_SumSub %1, %2, %5
- SSE2_SumSub %3, %4, %5
- SSE2_SumSub %2, %4, %5
- SSE2_SumSub %1, %3, %5
-%endmacro
-
-%macro SSE2_SumAbs4 7
- WELS_AbsW %1, %3
- WELS_AbsW %2, %3
- WELS_AbsW %4, %6
- WELS_AbsW %5, %6
- paddusw %1, %2
- paddusw %4, %5
- paddusw %7, %1
- paddusw %7, %4
-%endmacro
-
-%macro SSE2_SumWHorizon 3
- movhlps %2, %1 ; x2 = xx xx xx xx d7 d6 d5 d4
- paddw %1, %2 ; x1 = xx xx xx xx d37 d26 d15 d04
- punpcklwd %1, %3 ; x1 = d37 d26 d15 d04
- movhlps %2, %1 ; x2 = xxxx xxxx d37 d26
- paddd %1, %2 ; x1 = xxxx xxxx d1357 d0246
- pshuflw %2, %1, 0x4e ; x2 = xxxx xxxx d0246 d1357
- paddd %1, %2 ; x1 = xxxx xxxx xxxx d01234567
-%endmacro
-
-%macro SSE2_GetSatd8x8 0
- SSE2_LoadDiff8P xmm0,xmm4,xmm7,[eax],[ecx]
- SSE2_LoadDiff8P xmm1,xmm5,xmm7,[eax+ebx],[ecx+edx]
- lea eax, [eax+2*ebx]
- lea ecx, [ecx+2*edx]
- SSE2_LoadDiff8P xmm2,xmm4,xmm7,[eax],[ecx]
- SSE2_LoadDiff8P xmm3,xmm5,xmm7,[eax+ebx],[ecx+edx]
-
- SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm4
- SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm4
- SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm4,xmm5
- SSE2_SumAbs4 xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
-
- lea eax, [eax+2*ebx]
- lea ecx, [ecx+2*edx]
- SSE2_LoadDiff8P xmm0,xmm4,xmm7,[eax],[ecx]
- SSE2_LoadDiff8P xmm1,xmm5,xmm7,[eax+ebx],[ecx+edx]
- lea eax, [eax+2*ebx]
- lea ecx, [ecx+2*edx]
- SSE2_LoadDiff8P xmm2,xmm4,xmm7,[eax],[ecx]
- SSE2_LoadDiff8P xmm3,xmm5,xmm7,[eax+ebx],[ecx+edx]
-
- SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm4
- SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm4
- SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm4,xmm5
- SSE2_SumAbs4 xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
-%endmacro
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatd4x4_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSatd4x4_sse2
-align 16
-WelsSampleSatd4x4_sse2:
- push ebx
- mov eax, [esp+8]
- mov ebx, [esp+12]
- mov ecx, [esp+16]
- mov edx, [esp+20]
-
- movd xmm0, [eax]
- movd xmm1, [eax+ebx]
- lea eax , [eax+2*ebx]
- movd xmm2, [eax]
- movd xmm3, [eax+ebx]
- punpckldq xmm0, xmm2
- punpckldq xmm1, xmm3
-
- movd xmm4, [ecx]
- movd xmm5, [ecx+edx]
- lea ecx , [ecx+2*edx]
- movd xmm6, [ecx]
- movd xmm7, [ecx+edx]
- punpckldq xmm4, xmm6
- punpckldq xmm5, xmm7
-
- pxor xmm6, xmm6
- punpcklbw xmm0, xmm6
- punpcklbw xmm1, xmm6
- punpcklbw xmm4, xmm6
- punpcklbw xmm5, xmm6
-
- psubw xmm0, xmm4
- psubw xmm1, xmm5
-
- movdqa xmm2, xmm0
- paddw xmm0, xmm1
- psubw xmm2, xmm1
- SSE2_XSawp qdq, xmm0, xmm2, xmm3
-
- movdqa xmm4, xmm0
- paddw xmm0, xmm3
- psubw xmm4, xmm3
-
- movdqa xmm2, xmm0
- punpcklwd xmm0, xmm4
- punpckhwd xmm4, xmm2
-
- SSE2_XSawp dq, xmm0, xmm4, xmm3
- SSE2_XSawp qdq, xmm0, xmm3, xmm5
-
- movdqa xmm7, xmm0
- paddw xmm0, xmm5
- psubw xmm7, xmm5
-
- SSE2_XSawp qdq, xmm0, xmm7, xmm1
-
- movdqa xmm2, xmm0
- paddw xmm0, xmm1
- psubw xmm2, xmm1
-
- WELS_AbsW xmm0, xmm3
- paddusw xmm6, xmm0
- WELS_AbsW xmm2, xmm4
- paddusw xmm6, xmm2
- SSE2_SumWHorizon1 xmm6, xmm4
- movd eax, xmm6
- and eax, 0xffff
- shr eax, 1
- pop ebx
- ret
-
- ;***********************************************************************
- ;
- ;int32_t WelsSampleSatd8x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
- ;
- ;***********************************************************************
- WELS_EXTERN WelsSampleSatd8x8_sse2
-align 16
- WelsSampleSatd8x8_sse2:
- push ebx
- mov eax, [esp+8]
- mov ebx, [esp+12]
- mov ecx, [esp+16]
- mov edx, [esp+20]
- pxor xmm6, xmm6
- pxor xmm7, xmm7
- SSE2_GetSatd8x8
- psrlw xmm6, 1
- SSE2_SumWHorizon xmm6,xmm4,xmm7
- movd eax, xmm6
- pop ebx
- ret
-
- ;***********************************************************************
- ;
- ;int32_t WelsSampleSatd8x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
- ;
- ;***********************************************************************
- WELS_EXTERN WelsSampleSatd8x16_sse2
-align 16
- WelsSampleSatd8x16_sse2:
- push ebx
- mov eax, [esp+8]
- mov ebx, [esp+12]
- mov ecx, [esp+16]
- mov edx, [esp+20]
- pxor xmm6, xmm6
- pxor xmm7, xmm7
-
- SSE2_GetSatd8x8
- lea eax, [eax+2*ebx]
- lea ecx, [ecx+2*edx]
- SSE2_GetSatd8x8
-
- psrlw xmm6, 1
- SSE2_SumWHorizon xmm6,xmm4,xmm7
- movd eax, xmm6
- pop ebx
- ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatd16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSatd16x8_sse2
-align 16
-WelsSampleSatd16x8_sse2:
- push ebx
- mov eax, [esp+8]
- mov ebx, [esp+12]
- mov ecx, [esp+16]
- mov edx, [esp+20]
- pxor xmm6, xmm6
- pxor xmm7, xmm7
-
- SSE2_GetSatd8x8
- mov eax, [esp+8]
- mov ecx, [esp+16]
- add eax, 8
- add ecx, 8
- SSE2_GetSatd8x8
-
- psrlw xmm6, 1
- SSE2_SumWHorizon xmm6,xmm4,xmm7
- movd eax, xmm6
- pop ebx
- ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatd16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSatd16x16_sse2
-align 16
-WelsSampleSatd16x16_sse2:
- push ebx
- mov eax, [esp+8]
- mov ebx, [esp+12]
- mov ecx, [esp+16]
- mov edx, [esp+20]
- pxor xmm6, xmm6
- pxor xmm7, xmm7
-
- SSE2_GetSatd8x8
- lea eax, [eax+2*ebx]
- lea ecx, [ecx+2*edx]
- SSE2_GetSatd8x8
-
- mov eax, [esp+8]
- mov ecx, [esp+16]
- add eax, 8
- add ecx, 8
-
- SSE2_GetSatd8x8
- lea eax, [eax+2*ebx]
- lea ecx, [ecx+2*edx]
- SSE2_GetSatd8x8
-
- ; each column sum of SATD is necessarily even, so we don't lose any precision by shifting first.
- psrlw xmm6, 1
- SSE2_SumWHorizon xmm6,xmm4,xmm7
- movd eax, xmm6
- pop ebx
- ret
-
-;***********************************************************************
-;
-;Pixel_satd_wxh_sse2 END
-;
-;***********************************************************************
-
-;***********************************************************************
-;
-;Pixel_satd_intra_sse2 BEGIN
-;
-;***********************************************************************
-
-%macro SSE41_I16x16Get8WSumSub 3 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3
- pmaddubsw %1, xmm5
- movdqa %2, %1
- pmaddwd %1, xmm7
- pmaddwd %2, xmm6
- movdqa %3, %1
- punpckldq %1, %2
- punpckhdq %2, %3
- movdqa %3, %1
- punpcklqdq %1, %2
- punpckhqdq %3, %2
- paddd xmm4, %1 ;for dc
- paddd xmm4, %3 ;for dc
- packssdw %1, %3
- psllw %1, 2
-%endmacro
-%macro SSE41_ChromaGet8WSumSub 4 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3 : %4 tempsse2
- pmaddubsw %1, xmm5
- movdqa %2, %1
- pmaddwd %1, xmm7
- pmaddwd %2, xmm6
- movdqa %3, %1
- punpckldq %1, %2
- punpckhdq %2, %3
- movdqa %3, %1
- punpcklqdq %1, %2
- punpckhqdq %3, %2
-; paddd xmm4, %1 ;for dc
-; paddd xmm4, %3 ;for dc
- movdqa %4, %1
- punpcklqdq %4, %3
- packssdw %1, %3
- psllw %1, 2
-%endmacro
-
-%macro SSE41_GetX38x4SatdDec 0
- pxor xmm7, xmm7
- movq xmm0, [eax]
- movq xmm1, [eax+ebx]
- lea eax, [eax+2*ebx]
- movq xmm2, [eax]
- movq xmm3, [eax+ebx]
- lea eax, [eax+2*ebx]
- punpcklbw xmm0, xmm7
- punpcklbw xmm1, xmm7
- punpcklbw xmm2, xmm7
- punpcklbw xmm3, xmm7
- SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm7
- SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm7
- SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm7,xmm0 ;pOut xmm7,xmm1,xmm3,xmm2
- ;doesn't need another transpose
-%endmacro
-%macro SSE41_GetX38x4SatdV 2
- pxor xmm0, xmm0
- pinsrw xmm0, word[esi+%2], 0
- pinsrw xmm0, word[esi+%2+8], 4
- psubsw xmm0, xmm7
- pabsw xmm0, xmm0
- paddw xmm4, xmm0
- pxor xmm0, xmm0
- pinsrw xmm0, word[esi+%2+2], 0
- pinsrw xmm0, word[esi+%2+10], 4
- psubsw xmm0, xmm1
- pabsw xmm0, xmm0
- paddw xmm4, xmm0
- pxor xmm0, xmm0
- pinsrw xmm0, word[esi+%2+4], 0
- pinsrw xmm0, word[esi+%2+12], 4
- psubsw xmm0, xmm3
- pabsw xmm0, xmm0
- paddw xmm4, xmm0
- pxor xmm0, xmm0
- pinsrw xmm0, word[esi+%2+6], 0
- pinsrw xmm0, word[esi+%2+14], 4
- psubsw xmm0, xmm2
- pabsw xmm0, xmm0
- paddw xmm4, xmm0
-%endmacro
-%macro SSE41_GetX38x4SatdH 3
- movq xmm0, [esi+%3+8*%1]
- punpcklqdq xmm0, xmm0
- psubsw xmm0, xmm7
- pabsw xmm0, xmm0
- paddw xmm5, xmm0
- pabsw xmm1, xmm1
- pabsw xmm2, xmm2
- pabsw xmm3, xmm3
- paddw xmm2, xmm1;for DC
- paddw xmm2, xmm3;for DC
- paddw xmm5, xmm2
-%endmacro
-%macro SSE41_I16X16GetX38x4SatdDC 0
- pxor xmm0, xmm0
- movq2dq xmm0, mm4
- punpcklqdq xmm0, xmm0
- psubsw xmm0, xmm7
- pabsw xmm0, xmm0
- paddw xmm6, xmm0
- paddw xmm6, xmm2
-%endmacro
-%macro SSE41_ChromaGetX38x4SatdDC 1
- shl %1, 4
- movdqa xmm0, [esi+32+%1]
- psubsw xmm0, xmm7
- pabsw xmm0, xmm0
- paddw xmm6, xmm0
- paddw xmm6, xmm2
-%endmacro
-%macro SSE41_I16x16GetX38x4Satd 2
- SSE41_GetX38x4SatdDec
- SSE41_GetX38x4SatdV %1, %2
- SSE41_GetX38x4SatdH %1, %2, 32
- SSE41_I16X16GetX38x4SatdDC
-%endmacro
-%macro SSE41_ChromaGetX38x4Satd 2
- SSE41_GetX38x4SatdDec
- SSE41_GetX38x4SatdV %1, %2
- SSE41_GetX38x4SatdH %1, %2, 16
- SSE41_ChromaGetX38x4SatdDC %1
-%endmacro
-%macro SSE41_HSum8W 3
- pmaddwd %1, %2
- movhlps %3, %1
- paddd %1, %3
- pshuflw %3, %1,0Eh
- paddd %1, %3
-%endmacro
-
-WELS_EXTERN WelsIntra16x16Combined3Satd_sse41
-WelsIntra16x16Combined3Satd_sse41:
- push ebx
- push esi
- push edi
- mov ecx, [esp+16]
- mov edx, [esp+20]
- mov eax, [esp+24]
- mov ebx, [esp+28]
- mov esi, [esp+40] ;temp_satd
- pxor xmm4, xmm4
- movdqa xmm5, [HSumSubDB1]
- movdqa xmm6, [HSumSubDW1]
- movdqa xmm7, [PDW1]
- sub ecx, edx
- movdqu xmm0, [ecx]
- movhlps xmm1, xmm0
- punpcklqdq xmm0, xmm0
- punpcklqdq xmm1, xmm1
- SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
- SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
- movdqa [esi], xmm0 ;V
- movdqa [esi+16], xmm1
- add ecx, edx
- pinsrb xmm0, byte[ecx-1], 0
- pinsrb xmm0, byte[ecx+edx-1], 1
- lea ecx, [ecx+2*edx]
- pinsrb xmm0, byte[ecx-1], 2
- pinsrb xmm0, byte[ecx+edx-1], 3
- lea ecx, [ecx+2*edx]
- pinsrb xmm0, byte[ecx-1], 4
- pinsrb xmm0, byte[ecx+edx-1], 5
- lea ecx, [ecx+2*edx]
- pinsrb xmm0, byte[ecx-1], 6
- pinsrb xmm0, byte[ecx+edx-1], 7
- lea ecx, [ecx+2*edx]
- pinsrb xmm0, byte[ecx-1], 8
- pinsrb xmm0, byte[ecx+edx-1], 9
- lea ecx, [ecx+2*edx]
- pinsrb xmm0, byte[ecx-1], 10
- pinsrb xmm0, byte[ecx+edx-1], 11
- lea ecx, [ecx+2*edx]
- pinsrb xmm0, byte[ecx-1], 12
- pinsrb xmm0, byte[ecx+edx-1], 13
- lea ecx, [ecx+2*edx]
- pinsrb xmm0, byte[ecx-1], 14
- pinsrb xmm0, byte[ecx+edx-1], 15
- movhlps xmm1, xmm0
- punpcklqdq xmm0, xmm0
- punpcklqdq xmm1, xmm1
- SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
- SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
- movdqa [esi+32], xmm0 ;H
- movdqa [esi+48], xmm1
- movd ecx, xmm4 ;dc
- add ecx, 16 ;(sum+16)
- shr ecx, 5 ;((sum+16)>>5)
- shl ecx, 4 ;
- movd mm4, ecx ; mm4 copy DC
- pxor xmm4, xmm4 ;V
- pxor xmm5, xmm5 ;H
- pxor xmm6, xmm6 ;DC
- mov ecx, 0
- mov edi, 0
-.loop16x16_get_satd:
-.loopStart1:
- SSE41_I16x16GetX38x4Satd ecx, edi
- inc ecx
- cmp ecx, 4
- jl .loopStart1
- cmp edi, 16
- je .loop16x16_get_satd_end
- mov eax, [esp+24]
- add eax, 8
- mov ecx, 0
- add edi, 16
- jmp .loop16x16_get_satd
- .loop16x16_get_satd_end:
- MMX_DW_1_2REG xmm0, xmm1
- psrlw xmm4, 1 ;/2
- psrlw xmm5, 1 ;/2
- psrlw xmm6, 1 ;/2
- SSE41_HSum8W xmm4, xmm0, xmm1
- SSE41_HSum8W xmm5, xmm0, xmm1
- SSE41_HSum8W xmm6, xmm0, xmm1
-
- ; comparing order: DC H V
- movd ebx, xmm6 ;DC
- movd edi, xmm5 ;H
- movd ecx, xmm4 ;V
- mov edx, [esp+36]
- shl edx, 1
- add edi, edx
- add ebx, edx
- mov edx, [esp+32]
- cmp ebx, edi
- jge near not_dc_16x16
- cmp ebx, ecx
- jge near not_dc_h_16x16
-
- ; for DC mode
- mov dword[edx], 2;I16_PRED_DC
- mov eax, ebx
- jmp near return_satd_intra_16x16_x3
-not_dc_16x16:
- ; for H mode
- cmp edi, ecx
- jge near not_dc_h_16x16
- mov dword[edx], 1;I16_PRED_H
- mov eax, edi
- jmp near return_satd_intra_16x16_x3
-not_dc_h_16x16:
- ; for V mode
- mov dword[edx], 0;I16_PRED_V
- mov eax, ecx
-return_satd_intra_16x16_x3:
- WELSEMMS
- pop edi
- pop esi
- pop ebx
-ret
-
-%macro SSE41_ChromaGetX38x8Satd 0
- movdqa xmm5, [HSumSubDB1]
- movdqa xmm6, [HSumSubDW1]
- movdqa xmm7, [PDW1]
- sub ecx, edx
- movq xmm0, [ecx]
- punpcklqdq xmm0, xmm0
- SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm4
- movdqa [esi], xmm0 ;V
- add ecx, edx
- pinsrb xmm0, byte[ecx-1], 0
- pinsrb xmm0, byte[ecx+edx-1], 1
- lea ecx, [ecx+2*edx]
- pinsrb xmm0, byte[ecx-1], 2
- pinsrb xmm0, byte[ecx+edx-1], 3
- lea ecx, [ecx+2*edx]
- pinsrb xmm0, byte[ecx-1], 4
- pinsrb xmm0, byte[ecx+edx-1], 5
- lea ecx, [ecx+2*edx]
- pinsrb xmm0, byte[ecx-1], 6
- pinsrb xmm0, byte[ecx+edx-1], 7
- punpcklqdq xmm0, xmm0
- SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm1
- movdqa [esi+16], xmm0 ;H
-;(sum+2)>>2
- movdqa xmm6, [PDQ2]
- movdqa xmm5, xmm4
- punpckhqdq xmm5, xmm1
- paddd xmm5, xmm6
- psrld xmm5, 2
-;(sum1+sum2+4)>>3
- paddd xmm6, xmm6
- paddd xmm4, xmm1
- paddd xmm4, xmm6
- psrld xmm4, 3
-;satd *16
- pslld xmm5, 4
- pslld xmm4, 4
-;temp satd
- movdqa xmm6, xmm4
- punpcklqdq xmm4, xmm5
- psllq xmm4, 32
- psrlq xmm4, 32
- movdqa [esi+32], xmm4
- punpckhqdq xmm5, xmm6
- psllq xmm5, 32
- psrlq xmm5, 32
- movdqa [esi+48], xmm5
-
- pxor xmm4, xmm4 ;V
- pxor xmm5, xmm5 ;H
- pxor xmm6, xmm6 ;DC
- mov ecx, 0
-loop_chroma_satdx3_cb_cr:
- SSE41_ChromaGetX38x4Satd ecx, 0
- inc ecx
- cmp ecx, 2
- jl loop_chroma_satdx3_cb_cr
-%endmacro
-
-%macro SSEReg2MMX 3
- movdq2q %2, %1
- movhlps %1, %1
- movdq2q %3, %1
-%endmacro
-%macro MMXReg2SSE 4
- movq2dq %1, %3
- movq2dq %2, %4
- punpcklqdq %1, %2
-%endmacro
-;for reduce the code size of WelsIntraChroma8x8Combined3Satd_sse41
-
-WELS_EXTERN WelsIntraChroma8x8Combined3Satd_sse41
-WelsIntraChroma8x8Combined3Satd_sse41:
- push ebx
- push esi
- push edi
- mov ecx, [esp+16]
- mov edx, [esp+20]
- mov eax, [esp+24]
- mov ebx, [esp+28]
- mov esi, [esp+40] ;temp_satd
- xor edi, edi
-loop_chroma_satdx3:
- SSE41_ChromaGetX38x8Satd
- cmp edi, 1
- je loop_chroma_satdx3end
- inc edi
- SSEReg2MMX xmm4, mm0,mm1
- SSEReg2MMX xmm5, mm2,mm3
- SSEReg2MMX xmm6, mm5,mm6
- mov ecx, [esp+44]
- mov eax, [esp+48]
- jmp loop_chroma_satdx3
-loop_chroma_satdx3end:
- MMXReg2SSE xmm0, xmm3, mm0, mm1
- MMXReg2SSE xmm1, xmm3, mm2, mm3
- MMXReg2SSE xmm2, xmm3, mm5, mm6
-
- paddw xmm4, xmm0
- paddw xmm5, xmm1
- paddw xmm6, xmm2
-
- MMX_DW_1_2REG xmm0, xmm1
- psrlw xmm4, 1 ;/2
- psrlw xmm5, 1 ;/2
- psrlw xmm6, 1 ;/2
- SSE41_HSum8W xmm4, xmm0, xmm1
- SSE41_HSum8W xmm5, xmm0, xmm1
- SSE41_HSum8W xmm6, xmm0, xmm1
- ; comparing order: DC H V
- movd ebx, xmm6 ;DC
- movd edi, xmm5 ;H
- movd ecx, xmm4 ;V
- mov edx, [esp+36]
- shl edx, 1
- add edi, edx
- add ecx, edx
- mov edx, [esp+32]
- cmp ebx, edi
- jge near not_dc_8x8
- cmp ebx, ecx
- jge near not_dc_h_8x8
-
- ; for DC mode
- mov dword[edx], 0;I8_PRED_DC
- mov eax, ebx
- jmp near return_satd_intra_8x8_x3
-not_dc_8x8:
- ; for H mode
- cmp edi, ecx
- jge near not_dc_h_8x8
- mov dword[edx], 1;I8_PRED_H
- mov eax, edi
- jmp near return_satd_intra_8x8_x3
-not_dc_h_8x8:
- ; for V mode
- mov dword[edx], 2;I8_PRED_V
- mov eax, ecx
-return_satd_intra_8x8_x3:
- WELSEMMS
- pop edi
- pop esi
- pop ebx
-ret
-
-
-;***********************************************************************
-;
-;Pixel_satd_intra_sse2 END
-;
-;***********************************************************************
-%macro SSSE3_Get16BSadHVDC 2
- movd xmm6,%1
- pshufb xmm6,xmm1
- movdqa %1, xmm6
- movdqa xmm0,%2
- psadbw xmm0,xmm7
- paddw xmm4,xmm0
- movdqa xmm0,%2
- psadbw xmm0,xmm5
- paddw xmm2,xmm0
- psadbw xmm6,%2
- paddw xmm3,xmm6
-%endmacro
-%macro WelsAddDCValue 4
- movzx %2, byte %1
- mov %3, %2
- add %4, %2
-%endmacro
-
-;***********************************************************************
-;
-;Pixel_sad_intra_ssse3 BEGIN
-;
-;***********************************************************************
-WELS_EXTERN WelsIntra16x16Combined3Sad_ssse3
-WelsIntra16x16Combined3Sad_ssse3:
- push ebx
- push esi
- push edi
- mov ecx, [esp+16]
- mov edx, [esp+20]
- mov edi, [esp+40] ;temp_sad
- sub ecx, edx
- movdqa xmm5,[ecx]
- pxor xmm0,xmm0
- psadbw xmm0,xmm5
- movhlps xmm1,xmm0
- paddw xmm0,xmm1
- movd eax,xmm0
-
- add ecx,edx
- lea ebx, [edx+2*edx]
- WelsAddDCValue [ecx-1 ], esi, [edi ], eax
- WelsAddDCValue [ecx-1+edx ], esi, [edi+16], eax
- WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
- WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax
- lea ecx, [ecx+4*edx]
- add edi, 64
- WelsAddDCValue [ecx-1 ], esi, [edi ], eax
- WelsAddDCValue [ecx-1+edx ], esi, [edi+16], eax
- WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
- WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax
- lea ecx, [ecx+4*edx]
- add edi, 64
- WelsAddDCValue [ecx-1 ], esi, [edi ], eax
- WelsAddDCValue [ecx-1+edx ], esi, [edi+16], eax
- WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
- WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax
- lea ecx, [ecx+4*edx]
- add edi, 64
- WelsAddDCValue [ecx-1 ], esi, [edi ], eax
- WelsAddDCValue [ecx-1+edx ], esi, [edi+16], eax
- WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
- WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax
- sub edi, 192
- add eax,10h
- shr eax,5
- movd xmm7,eax
- pxor xmm1,xmm1
- pshufb xmm7,xmm1
- pxor xmm4,xmm4
- pxor xmm3,xmm3
- pxor xmm2,xmm2
-;sad begin
- mov eax, [esp+24]
- mov ebx, [esp+28]
- lea esi, [ebx+2*ebx]
- SSSE3_Get16BSadHVDC [edi], [eax]
- SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
- SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
- SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
- add edi, 64
- lea eax, [eax+4*ebx]
- SSSE3_Get16BSadHVDC [edi], [eax]
- SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
- SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
- SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
- add edi, 64
- lea eax, [eax+4*ebx]
- SSSE3_Get16BSadHVDC [edi], [eax]
- SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
- SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
- SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
- add edi, 64
- lea eax, [eax+4*ebx]
- SSSE3_Get16BSadHVDC [edi], [eax]
- SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
- SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
- SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
-
- pslldq xmm3,4
- por xmm3,xmm2
- movhlps xmm1,xmm3
- paddw xmm3,xmm1
- movhlps xmm0,xmm4
- paddw xmm4,xmm0
-; comparing order: DC H V
- movd ebx, xmm4 ;DC
- movd ecx, xmm3 ;V
- psrldq xmm3, 4
- movd esi, xmm3 ;H
- mov eax, [esp+36] ;lamda
- shl eax, 1
- add esi, eax
- add ebx, eax
- mov edx, [esp+32]
- cmp ebx, esi
- jge near not_dc_16x16_sad
- cmp ebx, ecx
- jge near not_dc_h_16x16_sad
- ; for DC mode
- mov dword[edx], 2;I16_PRED_DC
- mov eax, ebx
- sub edi, 192
-%assign x 0
-%rep 16
- movdqa [edi+16*x], xmm7
-%assign x x+1
-%endrep
- jmp near return_sad_intra_16x16_x3
-not_dc_16x16_sad:
- ; for H mode
- cmp esi, ecx
- jge near not_dc_h_16x16_sad
- mov dword[edx], 1;I16_PRED_H
- mov eax, esi
- jmp near return_sad_intra_16x16_x3
-not_dc_h_16x16_sad:
- ; for V mode
- mov dword[edx], 0;I16_PRED_V
- mov eax, ecx
- sub edi, 192
-%assign x 0
-%rep 16
- movdqa [edi+16*x], xmm5
-%assign x x+1
-%endrep
-return_sad_intra_16x16_x3:
- pop edi
- pop esi
- pop ebx
- ret
-
-;***********************************************************************
-;
-;Pixel_sad_intra_ssse3 END
-;
-;***********************************************************************
-;***********************************************************************
-;
-;Pixel_satd_wxh_sse41 BEGIN
-;
-;***********************************************************************
-
-;SSE4.1
-%macro SSE41_GetSatd8x4 0
- movq xmm0, [eax]
- punpcklqdq xmm0, xmm0
- pmaddubsw xmm0, xmm7
- movq xmm1, [eax+ebx]
- punpcklqdq xmm1, xmm1
- pmaddubsw xmm1, xmm7
- movq xmm2, [ecx]
- punpcklqdq xmm2, xmm2
- pmaddubsw xmm2, xmm7
- movq xmm3, [ecx+edx]
- punpcklqdq xmm3, xmm3
- pmaddubsw xmm3, xmm7
- psubsw xmm0, xmm2
- psubsw xmm1, xmm3
- movq xmm2, [eax+2*ebx]
- punpcklqdq xmm2, xmm2
- pmaddubsw xmm2, xmm7
- movq xmm3, [eax+esi]
- punpcklqdq xmm3, xmm3
- pmaddubsw xmm3, xmm7
- movq xmm4, [ecx+2*edx]
- punpcklqdq xmm4, xmm4
- pmaddubsw xmm4, xmm7
- movq xmm5, [ecx+edi]
- punpcklqdq xmm5, xmm5
- pmaddubsw xmm5, xmm7
- psubsw xmm2, xmm4
- psubsw xmm3, xmm5
- SSE2_HDMTwo4x4 xmm0, xmm1, xmm2, xmm3, xmm4
- pabsw xmm0, xmm0
- pabsw xmm2, xmm2
- pabsw xmm1, xmm1
- pabsw xmm3, xmm3
- movdqa xmm4, xmm3
- pblendw xmm3, xmm1, 0xAA
- pslld xmm1, 16
- psrld xmm4, 16
- por xmm1, xmm4
- pmaxuw xmm1, xmm3
- paddw xmm6, xmm1
- movdqa xmm4, xmm0
- pblendw xmm0, xmm2, 0xAA
- pslld xmm2, 16
- psrld xmm4, 16
- por xmm2, xmm4
- pmaxuw xmm0, xmm2
- paddw xmm6, xmm0
-%endmacro
-
-%macro SSSE3_SumWHorizon 4 ;eax, srcSSE, tempSSE, tempSSE
- MMX_DW_1_2REG %3, %4
- pmaddwd %2, %3
- movhlps %4, %2
- paddd %2, %4
- pshuflw %4, %2,0Eh
- paddd %2, %4
- movd %1, %2
-%endmacro
-;***********************************************************************
-;
-;int32_t WelsSampleSatd4x4_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSatd4x4_sse41
-WelsSampleSatd4x4_sse41:
- push ebx
- mov eax,[esp+8]
- mov ebx,[esp+12]
- mov ecx,[esp+16]
- mov edx,[esp+20]
- movdqa xmm4,[HSwapSumSubDB1]
- movd xmm2,[ecx]
- movd xmm5,[ecx+edx]
- shufps xmm2,xmm5,0
- movd xmm3,[ecx+edx*2]
- lea ecx, [edx*2+ecx]
- movd xmm5,[ecx+edx]
- shufps xmm3,xmm5,0
- movd xmm0,[eax]
- movd xmm5,[eax+ebx]
- shufps xmm0,xmm5,0
- movd xmm1,[eax+ebx*2]
- lea eax, [ebx*2+eax]
- movd xmm5,[eax+ebx]
- shufps xmm1,xmm5,0
- pmaddubsw xmm0,xmm4
- pmaddubsw xmm1,xmm4
- pmaddubsw xmm2,xmm4
- pmaddubsw xmm3,xmm4
- psubw xmm0,xmm2
- psubw xmm1,xmm3
- movdqa xmm2,xmm0
- paddw xmm0,xmm1
- psubw xmm1,xmm2
- movdqa xmm2,xmm0
- punpcklqdq xmm0,xmm1
- punpckhqdq xmm2,xmm1
- movdqa xmm1,xmm0
- paddw xmm0,xmm2
- psubw xmm2,xmm1
- movdqa xmm1,xmm0
- pblendw xmm0,xmm2,0AAh
- pslld xmm2,16
- psrld xmm1,16
- por xmm2,xmm1
- pabsw xmm0,xmm0
- pabsw xmm2,xmm2
- pmaxsw xmm0,xmm2
- SSSE3_SumWHorizon eax, xmm0, xmm5, xmm7
- pop ebx
- ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatd8x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSatd8x8_sse41
-align 16
-WelsSampleSatd8x8_sse41:
- push ebx
- push esi
- push edi
- mov eax, [esp+16]
- mov ebx, [esp+20]
- mov ecx, [esp+24]
- mov edx, [esp+28]
- movdqa xmm7, [HSumSubDB1]
- lea esi, [ebx+ebx*2]
- lea edi, [edx+edx*2]
- pxor xmm6, xmm6
- SSE41_GetSatd8x4
- lea eax, [eax+4*ebx]
- lea ecx, [ecx+4*edx]
- SSE41_GetSatd8x4
- SSSE3_SumWHorizon eax, xmm6, xmm5, xmm7
- pop edi
- pop esi
- pop ebx
- ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatd8x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSatd8x16_sse41
-align 16
-WelsSampleSatd8x16_sse41:
- push ebx
- push esi
- push edi
- push ebp
-%define pushsize 16
- mov eax, [esp+pushsize+4]
- mov ebx, [esp+pushsize+8]
- mov ecx, [esp+pushsize+12]
- mov edx, [esp+pushsize+16]
- movdqa xmm7, [HSumSubDB1]
- lea esi, [ebx+ebx*2]
- lea edi, [edx+edx*2]
- pxor xmm6, xmm6
- mov ebp, 0
-loop_get_satd_8x16:
- SSE41_GetSatd8x4
- lea eax, [eax+4*ebx]
- lea ecx, [ecx+4*edx]
- inc ebp
- cmp ebp, 4
- jl loop_get_satd_8x16
- SSSE3_SumWHorizon eax, xmm6, xmm5, xmm7
- pop ebp
- pop edi
- pop esi
- pop ebx
- ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatd16x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSatd16x8_sse41
-align 16
-WelsSampleSatd16x8_sse41:
- push ebx
- push esi
- push edi
- mov eax, [esp+16]
- mov ebx, [esp+20]
- mov ecx, [esp+24]
- mov edx, [esp+28]
- movdqa xmm7, [HSumSubDB1]
- lea esi, [ebx+ebx*2]
- lea edi, [edx+edx*2]
- pxor xmm6, xmm6
- SSE41_GetSatd8x4
- lea eax, [eax+4*ebx]
- lea ecx, [ecx+4*edx]
- SSE41_GetSatd8x4
- mov eax, [esp+16]
- mov ecx, [esp+24]
- add eax, 8
- add ecx, 8
- SSE41_GetSatd8x4
- lea eax, [eax+4*ebx]
- lea ecx, [ecx+4*edx]
- SSE41_GetSatd8x4
- SSSE3_SumWHorizon eax, xmm6, xmm5, xmm7
- pop edi
- pop esi
- pop ebx
- ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSatd16x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
-;
-;***********************************************************************
-
-WELS_EXTERN WelsSampleSatd16x16_sse41
-align 16
-WelsSampleSatd16x16_sse41:
- push ebx
- push esi
- push edi
- push ebp
- %define pushsize 16
- mov eax, [esp+pushsize+4]
- mov ebx, [esp+pushsize+8]
- mov ecx, [esp+pushsize+12]
- mov edx, [esp+pushsize+16]
- movdqa xmm7, [HSumSubDB1]
- lea esi, [ebx+ebx*2]
- lea edi, [edx+edx*2]
- pxor xmm6, xmm6
- mov ebp, 0
-loop_get_satd_16x16_left:
- SSE41_GetSatd8x4
- lea eax, [eax+4*ebx]
- lea ecx, [ecx+4*edx]
- inc ebp
- cmp ebp, 4
- jl loop_get_satd_16x16_left
- mov eax, [esp+pushsize+4]
- mov ecx, [esp+pushsize+12]
- add eax, 8
- add ecx, 8
- mov ebp, 0
-loop_get_satd_16x16_right:
- SSE41_GetSatd8x4
- lea eax, [eax+4*ebx]
- lea ecx, [ecx+4*edx]
- inc ebp
- cmp ebp, 4
- jl loop_get_satd_16x16_right
- SSSE3_SumWHorizon eax, xmm6, xmm5, xmm7
- %undef pushsize
- pop ebp
- pop edi
- pop esi
- pop ebx
- ret
-
-;***********************************************************************
-;
-;Pixel_satd_wxh_sse41 END
-;
-;***********************************************************************
-
-;***********************************************************************
-;
-;Pixel_sad_wxh_sse2 BEGIN
-;
-;***********************************************************************
-
-%macro SSE2_GetSad2x16 0
- lea eax, [eax+2*ebx]
- lea ecx, [ecx+2*edx]
- movdqu xmm1, [ecx]
- MOVDQ xmm2, [eax];[eax] must aligned 16
- psadbw xmm1, xmm2
- paddw xmm0, xmm1
- movdqu xmm1, [ecx+edx]
- MOVDQ xmm2, [eax+ebx]
- psadbw xmm1, xmm2
- paddw xmm0, xmm1
-%endmacro
-
-
-%macro SSE2_GetSad4x16 0
- movdqu xmm0, [ecx]
- MOVDQ xmm2, [eax]
- psadbw xmm0, xmm2
- paddw xmm7, xmm0
- movdqu xmm1, [ecx+edx]
- MOVDQ xmm2, [eax+ebx]
- psadbw xmm1, xmm2
- paddw xmm7, xmm1
- movdqu xmm1, [ecx+2*edx]
- MOVDQ xmm2, [eax+2*ebx];[eax] must aligned 16
- psadbw xmm1, xmm2
- paddw xmm7, xmm1
- movdqu xmm1, [ecx+edi]
- MOVDQ xmm2, [eax+esi]
- psadbw xmm1, xmm2
- paddw xmm7, xmm1
-%endmacro
-
-
-%macro SSE2_GetSad8x4 0
- movq xmm0, [eax]
- movq xmm1, [eax+ebx]
- lea eax, [eax+2*ebx]
- movhps xmm0, [eax]
- movhps xmm1, [eax+ebx]
-
- movq xmm2, [ecx]
- movq xmm3, [ecx+edx]
- lea ecx, [ecx+2*edx]
- movhps xmm2, [ecx]
- movhps xmm3, [ecx+edx]
- psadbw xmm0, xmm2
- psadbw xmm1, xmm3
- paddw xmm6, xmm0
- paddw xmm6, xmm1
-%endmacro
-
-;***********************************************************************
-;
-;int32_t WelsSampleSad16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t, )
-;First parameter can align to 16 bytes,
-;In wels, the third parameter can't align to 16 bytes.
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSad16x16_sse2
-align 16
-WelsSampleSad16x16_sse2:
- push ebx
- push edi
- push esi
-
- %define _STACK_SIZE 12
-
- mov eax, [esp+_STACK_SIZE+4 ]
- mov ebx, [esp+_STACK_SIZE+8 ]
- lea esi, [3*ebx]
- mov ecx, [esp+_STACK_SIZE+12]
- mov edx, [esp+_STACK_SIZE+16]
- lea edi, [3*edx]
-
- pxor xmm7, xmm7
- SSE2_GetSad4x16
- lea eax, [eax+4*ebx]
- lea ecx, [ecx+4*edx]
- SSE2_GetSad4x16
- lea eax, [eax+4*ebx]
- lea ecx, [ecx+4*edx]
- SSE2_GetSad4x16
- lea eax, [eax+4*ebx]
- lea ecx, [ecx+4*edx]
- SSE2_GetSad4x16
- movhlps xmm0, xmm7
- paddw xmm0, xmm7
- movd eax, xmm0
-
- %undef _STACK_SIZE
-
- pop esi
- pop edi
- pop ebx
- ret
-
-;***********************************************************************
-;
-;int32_t WelsSampleSad16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, )
-;First parameter can align to 16 bytes,
-;In wels, the third parameter can't align to 16 bytes.
-;
-;***********************************************************************
-WELS_EXTERN WelsSampleSad16x8_sse2
-align 16
-WelsSampleSad16x8_sse2:
- push ebx
- mov eax, [esp+8]
- mov ebx, [esp+12]
- mov ecx, [esp+16]
- mov edx, [esp+20]
- movdqu xmm0, [ecx]
- MOVDQ xmm2, [eax]
- psadbw xmm0, xmm2
- movdqu xmm1, [ecx+edx]
- MOVDQ xmm2, [eax+ebx]
- psadbw xmm1, xmm2
- paddw xmm0, xmm1
-
- SSE2_GetSad2x16
- SSE2_GetSad2x16
- SSE2_GetSad2x16
-
- movhlps xmm1, xmm0
- paddw xmm0, xmm1
- movd eax, xmm0
- pop ebx
- ret
-
-
-
-WELS_EXTERN WelsSampleSad8x16_sse2
-WelsSampleSad8x16_sse2:
- push ebx
- mov eax, [esp+8]
- mov ebx, [esp+12]
- mov ecx, [esp+16]
- mov edx, [esp+20]
- pxor xmm6, xmm6
-
- SSE2_GetSad8x4
- lea eax, [eax+2*ebx]
- lea ecx, [ecx+2*edx]
- SSE2_GetSad8x4
- lea eax, [eax+2*ebx]
- lea ecx, [ecx+2*edx]
- SSE2_GetSad8x4
- lea eax, [eax+2*ebx]
- lea ecx, [ecx+2*edx]
- SSE2_GetSad8x4
-
- movhlps xmm0, xmm6
- paddw xmm0, xmm6
- movd eax, xmm0
- pop ebx
- ret
-
-
-%macro CACHE_SPLIT_CHECK 3 ; address, width, cacheline
-and %1, 0x1f|(%3>>1)
-cmp %1, (32-%2)|(%3>>1)
-%endmacro
-
-WELS_EXTERN WelsSampleSad8x8_sse21
-WelsSampleSad8x8_sse21:
- mov ecx, [esp+12]
- mov edx, ecx
- CACHE_SPLIT_CHECK edx, 8, 64
- jle near .pixel_sad_8x8_nsplit
- push ebx
- push edi
- mov eax, [esp+12]
- mov ebx, [esp+16]
-
- pxor xmm7, xmm7
-
- mov edi, ecx
- and edi, 0x07
- sub ecx, edi
- mov edx, 8
- sub edx, edi
-
- shl edi, 3
- shl edx, 3
- movd xmm5, edi
- movd xmm6, edx
- mov edi, 8
- add edi, ecx
- mov edx, [esp+24]
-
- movq xmm0, [eax]
- movhps xmm0, [eax+ebx]
-
- movq xmm1, [ecx]
- movq xmm2, [edi]
- movhps xmm1, [ecx+edx]
- movhps xmm2, [edi+edx]
- psrlq xmm1, xmm5
- psllq xmm2, xmm6
- por xmm1, xmm2
-
- psadbw xmm0, xmm1
- paddw xmm7, xmm0
-
- lea eax, [eax+2*ebx]
- lea ecx, [ecx+2*edx]
- lea edi, [edi+2*edx]
-
- movq xmm0, [eax]
- movhps xmm0, [eax+ebx]
-
- movq xmm1, [ecx]
- movq xmm2, [edi]
- movhps xmm1, [ecx+edx]
- movhps xmm2, [edi+edx]
- psrlq xmm1, xmm5
- psllq xmm2, xmm6
- por xmm1, xmm2
-
- psadbw xmm0, xmm1
- paddw xmm7, xmm0
-
- lea eax, [eax+2*ebx]
- lea ecx, [ecx+2*edx]
- lea edi, [edi+2*edx]
-
- movq xmm0, [eax]
- movhps xmm0, [eax+ebx]
-
- movq xmm1, [ecx]
- movq xmm2, [edi]
- movhps xmm1, [ecx+edx]
- movhps xmm2, [edi+edx]
- psrlq xmm1, xmm5
- psllq xmm2, xmm6
- por xmm1, xmm2
-
- psadbw xmm0, xmm1
- paddw xmm7, xmm0
-
- lea eax, [eax+2*ebx]
- lea ecx, [ecx+2*edx]
- lea edi, [edi+2*edx]
-
- movq xmm0, [eax]
- movhps xmm0, [eax+ebx]
-
- movq xmm1, [ecx]
- movq xmm2, [edi]
- movhps xmm1, [ecx+edx]
- movhps xmm2, [edi+edx]
- psrlq xmm1, xmm5
- psllq xmm2, xmm6
- por xmm1, xmm2
-
- psadbw xmm0, xmm1
- paddw xmm7, xmm0
-
- movhlps xmm0, xmm7
- paddw xmm0, xmm7
- movd eax, xmm0
- pop edi
- jmp .return
-.pixel_sad_8x8_nsplit:
- push ebx
- mov eax, [esp+8]
- mov ebx, [esp+12]
- mov edx, [esp+20]
- pxor xmm6, xmm6
- SSE2_GetSad8x4
- lea eax, [eax+2*ebx]
- lea ecx, [ecx+2*edx]
- SSE2_GetSad8x4
- movhlps xmm0, xmm6
- paddw xmm0, xmm6
- movd eax, xmm0
-.return:
- pop ebx
- ret
-
-
-;***********************************************************************
-;
-;Pixel_sad_wxh_sse2 END
-;
-;***********************************************************************
-
-
-;***********************************************************************
-;
-;Pixel_sad_4_wxh_sse2 BEGIN
-;
-;***********************************************************************
-
-
-%macro SSE2_Get4LW16Sad 5 ;s-1l, s, s+1l, d, address
- psadbw %1, %4
- paddw xmm5, %1
- psadbw %4, %3
- paddw xmm4, %4
- movdqu %4, [%5-1]
- psadbw %4, %2
- paddw xmm6, %4
- movdqu %4, [%5+1]
- psadbw %4, %2
- paddw xmm7, %4
-%endmacro
-WELS_EXTERN WelsSampleSadFour16x16_sse2
-WelsSampleSadFour16x16_sse2:
- push ebx
- mov eax, [esp+8]
- mov ebx, [esp+12]
- mov ecx, [esp+16]
- mov edx, [esp+20]
- pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
- pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref
- pxor xmm6, xmm6 ;sad pRefMb-1
- pxor xmm7, xmm7 ;sad pRefMb+1
- movdqa xmm0, [eax]
- sub ecx, edx
- movdqu xmm3, [ecx]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
-
- movdqa xmm1, [eax+ebx]
- movdqu xmm3, [ecx+edx]
- psadbw xmm3, xmm1
- paddw xmm4, xmm3
-
- movdqu xmm2, [ecx+edx-1]
- psadbw xmm2, xmm0
- paddw xmm6, xmm2
-
- movdqu xmm3, [ecx+edx+1]
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
-
- lea eax, [eax+2*ebx]
- lea ecx, [ecx+2*edx]
- movdqa xmm2, [eax]
- movdqu xmm3, [ecx]
- SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, ecx
- movdqa xmm0, [eax+ebx]
- movdqu xmm3, [ecx+edx]
- SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, ecx+edx
- lea eax, [eax+2*ebx]
- lea ecx, [ecx+2*edx]
- movdqa xmm1, [eax]
- movdqu xmm3, [ecx]
- SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, ecx
- movdqa xmm2, [eax+ebx]
- movdqu xmm3, [ecx+edx]
- SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, ecx+edx
- lea eax, [eax+2*ebx]
- lea ecx, [ecx+2*edx]
- movdqa xmm0, [eax]
- movdqu xmm3, [ecx]
- SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, ecx
- movdqa xmm1, [eax+ebx]
- movdqu xmm3, [ecx+edx]
- SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, ecx+edx
- lea eax, [eax+2*ebx]
- lea ecx, [ecx+2*edx]
- movdqa xmm2, [eax]
- movdqu xmm3, [ecx]
- SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, ecx
- movdqa xmm0, [eax+ebx]
- movdqu xmm3, [ecx+edx]
- SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, ecx+edx
- lea eax, [eax+2*ebx]
- lea ecx, [ecx+2*edx]
- movdqa xmm1, [eax]
- movdqu xmm3, [ecx]
- SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, ecx
- movdqa xmm2, [eax+ebx]
- movdqu xmm3, [ecx+edx]
- SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, ecx+edx
- lea eax, [eax+2*ebx]
- lea ecx, [ecx+2*edx]
- movdqa xmm0, [eax]
- movdqu xmm3, [ecx]
- SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, ecx
- movdqa xmm1, [eax+ebx]
- movdqu xmm3, [ecx+edx]
- SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, ecx+edx
- lea eax, [eax+2*ebx]
- lea ecx, [ecx+2*edx]
- movdqa xmm2, [eax]
- movdqu xmm3, [ecx]
- SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, ecx
- movdqa xmm0, [eax+ebx]
- movdqu xmm3, [ecx+edx]
- SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, ecx+edx
- lea ecx, [ecx+2*edx]
- movdqu xmm3, [ecx]
- psadbw xmm2, xmm3
- paddw xmm5, xmm2
-
- movdqu xmm2, [ecx-1]
- psadbw xmm2, xmm0
- paddw xmm6, xmm2
-
- movdqu xmm3, [ecx+1]
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
-
- movdqu xmm3, [ecx+edx]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
-
- mov ecx, [esp+24]
- movhlps xmm0, xmm4
- paddw xmm4, xmm0
- movhlps xmm0, xmm5
- paddw xmm5, xmm0
- movhlps xmm0, xmm6
- paddw xmm6, xmm0
- movhlps xmm0, xmm7
- paddw xmm7, xmm0
- punpckldq xmm4, xmm5
- punpckldq xmm6, xmm7
- punpcklqdq xmm4, xmm6
- movdqa [ecx],xmm4
- pop ebx
- ret
-
-
-WELS_EXTERN WelsSampleSadFour16x8_sse2
-WelsSampleSadFour16x8_sse2:
- push ebx
- push edi
- mov eax, [esp+12]
- mov ebx, [esp+16]
- mov edi, [esp+20]
- mov edx, [esp+24]
- pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
- pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref
- pxor xmm6, xmm6 ;sad pRefMb-1
- pxor xmm7, xmm7 ;sad pRefMb+1
- movdqa xmm0, [eax]
- sub edi, edx
- movdqu xmm3, [edi]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
-
- movdqa xmm1, [eax+ebx]
- movdqu xmm3, [edi+edx]
- psadbw xmm3, xmm1
- paddw xmm4, xmm3
-
- movdqu xmm2, [edi+edx-1]
- psadbw xmm2, xmm0
- paddw xmm6, xmm2
-
- movdqu xmm3, [edi+edx+1]
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
-
- lea eax, [eax+2*ebx]
- lea edi, [edi+2*edx]
- movdqa xmm2, [eax]
- movdqu xmm3, [edi]
- SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, edi
- movdqa xmm0, [eax+ebx]
- movdqu xmm3, [edi+edx]
- SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, edi+edx
- lea eax, [eax+2*ebx]
- lea edi, [edi+2*edx]
- movdqa xmm1, [eax]
- movdqu xmm3, [edi]
- SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, edi
- movdqa xmm2, [eax+ebx]
- movdqu xmm3, [edi+edx]
- SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, edi+edx
- lea eax, [eax+2*ebx]
- lea edi, [edi+2*edx]
- movdqa xmm0, [eax]
- movdqu xmm3, [edi]
- SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, edi
- movdqa xmm1, [eax+ebx]
- movdqu xmm3, [edi+edx]
- SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, edi+edx
- lea edi, [edi+2*edx]
- movdqu xmm3, [edi]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
-
- movdqu xmm0, [edi-1]
- psadbw xmm0, xmm1
- paddw xmm6, xmm0
-
- movdqu xmm3, [edi+1]
- psadbw xmm3, xmm1
- paddw xmm7, xmm3
-
- movdqu xmm3, [edi+edx]
- psadbw xmm1, xmm3
- paddw xmm5, xmm1
-
- mov edi, [esp+28]
- movhlps xmm0, xmm4
- paddw xmm4, xmm0
- movhlps xmm0, xmm5
- paddw xmm5, xmm0
- movhlps xmm0, xmm6
- paddw xmm6, xmm0
- movhlps xmm0, xmm7
- paddw xmm7, xmm0
- punpckldq xmm4, xmm5
- punpckldq xmm6, xmm7
- punpcklqdq xmm4, xmm6
- movdqa [edi],xmm4
- pop edi
- pop ebx
- ret
-
-WELS_EXTERN WelsSampleSadFour8x16_sse2
-WelsSampleSadFour8x16_sse2:
- push ebx
- push edi
- mov eax, [esp+12]
- mov ebx, [esp+16]
- mov edi, [esp+20]
- mov edx, [esp+24]
- pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
- pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref
- pxor xmm6, xmm6 ;sad pRefMb-1
- pxor xmm7, xmm7 ;sad pRefMb+1
- movq xmm0, [eax]
- movhps xmm0, [eax+ebx]
- sub edi, edx
- movq xmm3, [edi]
- movhps xmm3, [edi+edx]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
-
- movq xmm1, [edi+edx-1]
- movq xmm3, [edi+edx+1]
-
- lea eax, [eax+2*ebx]
- lea edi, [edi+2*edx]
- movhps xmm1, [edi-1]
- movhps xmm3, [edi+1]
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
-
- movq xmm3, [edi]
- movhps xmm3, [edi+edx]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
-
- movq xmm0, [eax]
- movhps xmm0, [eax+ebx]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
-
- movq xmm1, [edi+edx-1]
- movq xmm3, [edi+edx+1]
-
- lea eax, [eax+2*ebx]
- lea edi, [edi+2*edx]
- movhps xmm1, [edi-1]
- movhps xmm3, [edi+1]
-
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
-
- movq xmm3, [edi]
- movhps xmm3, [edi+edx]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
-
- movq xmm0, [eax]
- movhps xmm0, [eax+ebx]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
-
- movq xmm1, [edi+edx-1]
- movq xmm3, [edi+edx+1]
-
- lea eax, [eax+2*ebx]
- lea edi, [edi+2*edx]
- movhps xmm1, [edi-1]
- movhps xmm3, [edi+1]
-
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
-
- movq xmm3, [edi]
- movhps xmm3, [edi+edx]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
-
- movq xmm0, [eax]
- movhps xmm0, [eax+ebx]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
-
- movq xmm1, [edi+edx-1]
- movq xmm3, [edi+edx+1]
-
- lea eax, [eax+2*ebx]
- lea edi, [edi+2*edx]
- movhps xmm1, [edi-1]
- movhps xmm3, [edi+1]
-
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
-
- movq xmm3, [edi]
- movhps xmm3, [edi+edx]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
-
- movq xmm0, [eax]
- movhps xmm0, [eax+ebx]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
-
- movq xmm1, [edi+edx-1]
- movq xmm3, [edi+edx+1]
-
- lea eax, [eax+2*ebx]
- lea edi, [edi+2*edx]
- movhps xmm1, [edi-1]
- movhps xmm3, [edi+1]
-
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
-
- movq xmm3, [edi]
- movhps xmm3, [edi+edx]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
-
- movq xmm0, [eax]
- movhps xmm0, [eax+ebx]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
-
- movq xmm1, [edi+edx-1]
- movq xmm3, [edi+edx+1]
-
- lea eax, [eax+2*ebx]
- lea edi, [edi+2*edx]
- movhps xmm1, [edi-1]
- movhps xmm3, [edi+1]
-
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
-
- movq xmm3, [edi]
- movhps xmm3, [edi+edx]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
-
- movq xmm0, [eax]
- movhps xmm0, [eax+ebx]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
-
- movq xmm1, [edi+edx-1]
- movq xmm3, [edi+edx+1]
-
- lea eax, [eax+2*ebx]
- lea edi, [edi+2*edx]
- movhps xmm1, [edi-1]
- movhps xmm3, [edi+1]
-
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
-
- movq xmm3, [edi]
- movhps xmm3, [edi+edx]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
-
- movq xmm0, [eax]
- movhps xmm0, [eax+ebx]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
-
- movq xmm1, [edi+edx-1]
- movq xmm3, [edi+edx+1]
-
- lea eax, [eax+2*ebx]
- lea edi, [edi+2*edx]
- movhps xmm1, [edi-1]
- movhps xmm3, [edi+1]
-
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
-
- movq xmm3, [edi]
- movhps xmm3, [edi+edx]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
-
- mov edi, [esp+28]
- movhlps xmm0, xmm4
- paddw xmm4, xmm0
- movhlps xmm0, xmm5
- paddw xmm5, xmm0
- movhlps xmm0, xmm6
- paddw xmm6, xmm0
- movhlps xmm0, xmm7
- paddw xmm7, xmm0
- punpckldq xmm4, xmm5
- punpckldq xmm6, xmm7
- punpcklqdq xmm4, xmm6
- movdqa [edi],xmm4
- pop edi
- pop ebx
- ret
-
-
-WELS_EXTERN WelsSampleSadFour8x8_sse2
-WelsSampleSadFour8x8_sse2:
- push ebx
- push edi
- mov eax, [esp+12]
- mov ebx, [esp+16]
- mov edi, [esp+20]
- mov edx, [esp+24]
- pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
- pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref
- pxor xmm6, xmm6 ;sad pRefMb-1
- pxor xmm7, xmm7 ;sad pRefMb+1
- movq xmm0, [eax]
- movhps xmm0, [eax+ebx]
- sub edi, edx
- movq xmm3, [edi]
- movhps xmm3, [edi+edx]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
-
- movq xmm1, [edi+edx-1]
- movq xmm3, [edi+edx+1]
-
- lea eax, [eax+2*ebx]
- lea edi, [edi+2*edx]
- movhps xmm1, [edi-1]
- movhps xmm3, [edi+1]
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
-
- movq xmm3, [edi]
- movhps xmm3, [edi+edx]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
-
- movq xmm0, [eax]
- movhps xmm0, [eax+ebx]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
-
- movq xmm1, [edi+edx-1]
- movq xmm3, [edi+edx+1]
-
- lea eax, [eax+2*ebx]
- lea edi, [edi+2*edx]
- movhps xmm1, [edi-1]
- movhps xmm3, [edi+1]
-
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
-
- movq xmm3, [edi]
- movhps xmm3, [edi+edx]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
-
- movq xmm0, [eax]
- movhps xmm0, [eax+ebx]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
-
- movq xmm1, [edi+edx-1]
- movq xmm3, [edi+edx+1]
-
- lea eax, [eax+2*ebx]
- lea edi, [edi+2*edx]
- movhps xmm1, [edi-1]
- movhps xmm3, [edi+1]
-
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
-
- movq xmm3, [edi]
- movhps xmm3, [edi+edx]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
-
- movq xmm0, [eax]
- movhps xmm0, [eax+ebx]
- psadbw xmm3, xmm0
- paddw xmm4, xmm3
-
-
- movq xmm1, [edi+edx-1]
- movq xmm3, [edi+edx+1]
-
- lea eax, [eax+2*ebx]
- lea edi, [edi+2*edx]
- movhps xmm1, [edi-1]
- movhps xmm3, [edi+1]
-
- psadbw xmm1, xmm0
- paddw xmm6, xmm1
- psadbw xmm3, xmm0
- paddw xmm7, xmm3
-
- movq xmm3, [edi]
- movhps xmm3, [edi+edx]
- psadbw xmm0, xmm3
- paddw xmm5, xmm0
-
- mov edi, [esp+28]
- movhlps xmm0, xmm4
- paddw xmm4, xmm0
- movhlps xmm0, xmm5
- paddw xmm5, xmm0
- movhlps xmm0, xmm6
- paddw xmm6, xmm0
- movhlps xmm0, xmm7
- paddw xmm7, xmm0
- punpckldq xmm4, xmm5
- punpckldq xmm6, xmm7
- punpcklqdq xmm4, xmm6
- movdqa [edi],xmm4
- pop edi
- pop ebx
- ret
-
-WELS_EXTERN WelsSampleSadFour4x4_sse2
-WelsSampleSadFour4x4_sse2:
- push ebx
- push edi
- mov eax, [esp+12]
- mov ebx, [esp+16]
- mov edi, [esp+20]
- mov edx, [esp+24]
- movd xmm0, [eax]
- movd xmm1, [eax+ebx]
- lea eax, [eax+2*ebx]
- movd xmm2, [eax]
- movd xmm3, [eax+ebx]
- punpckldq xmm0, xmm1
- punpckldq xmm2, xmm3
- punpcklqdq xmm0, xmm2
- sub edi, edx
- movd xmm1, [edi]
- movd xmm2, [edi+edx]
- punpckldq xmm1, xmm2
- movd xmm2, [edi+edx-1]
- movd xmm3, [edi+edx+1]
-
- lea edi, [edi+2*edx]
-
- movd xmm4, [edi]
- movd xmm5, [edi-1]
- punpckldq xmm2, xmm5
- movd xmm5, [edi+1]
- punpckldq xmm3, xmm5
-
- movd xmm5, [edi+edx]
- punpckldq xmm4, xmm5
-
- punpcklqdq xmm1, xmm4 ;-L
-
- movd xmm5, [edi+edx-1]
- movd xmm6, [edi+edx+1]
-
- lea edi, [edi+2*edx]
- movd xmm7, [edi-1]
- punpckldq xmm5, xmm7
- punpcklqdq xmm2, xmm5 ;-1
- movd xmm7, [edi+1]
- punpckldq xmm6, xmm7
- punpcklqdq xmm3, xmm6 ;+1
- movd xmm6, [edi]
- movd xmm7, [edi+edx]
- punpckldq xmm6, xmm7
- punpcklqdq xmm4, xmm6 ;+L
- psadbw xmm1, xmm0
- psadbw xmm2, xmm0
- psadbw xmm3, xmm0
- psadbw xmm4, xmm0
-
- movhlps xmm0, xmm1
- paddw xmm1, xmm0
- movhlps xmm0, xmm2
- paddw xmm2, xmm0
- movhlps xmm0, xmm3
- paddw xmm3, xmm0
- movhlps xmm0, xmm4
- paddw xmm4, xmm0
- mov edi, [esp+28]
- punpckldq xmm1, xmm4
- punpckldq xmm2, xmm3
- punpcklqdq xmm1, xmm2
- movdqa [edi],xmm1
- pop edi
- pop ebx
- ret
-
-;***********************************************************************
-;
-;Pixel_sad_4_wxh_sse2 END
-;
-;***********************************************************************
-
-WELS_EXTERN WelsSampleSad4x4_mmx
-
-align 16
-;***********************************************************************
-; int32_t __cdecl WelsSampleSad4x4_mmx (uint8_t *, int32_t, uint8_t *, int32_t )
-;***********************************************************************
-WelsSampleSad4x4_mmx:
- push ebx
-%define pushsize 4
-%define pix1address esp+pushsize+4
-%define pix1stride esp+pushsize+8
-%define pix2address esp+pushsize+12
-%define pix2stride esp+pushsize+16
-
- mov eax, [pix1address]
- mov ebx, [pix1stride ]
- mov ecx, [pix2address]
- mov edx, [pix2stride ]
-
- movd mm0, [eax]
- movd mm1, [eax+ebx]
- punpckldq mm0, mm1
-
- movd mm3, [ecx]
- movd mm4, [ecx+edx]
- punpckldq mm3, mm4
- psadbw mm0, mm3
-
- lea eax, [eax+2*ebx]
- lea ecx, [ecx+2*edx]
-
- movd mm1, [eax]
- movd mm2, [eax+ebx]
- punpckldq mm1, mm2
-
- movd mm3, [ecx]
- movd mm4, [ecx+edx]
- punpckldq mm3, mm4
- psadbw mm1, mm3
- paddw mm0, mm1
-
- movd eax, mm0
-
- WELSEMMS
- pop ebx
-%undef pushsize
-%undef pix1address
-%undef pix1stride
-%undef pix2address
-%undef pix2stride
- ret
\ No newline at end of file
+;*!
+;* \copy
+;* Copyright (c) 2009-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;* satd_sad.asm
+;*
+;* Abstract
+;* WelsSampleSatd4x4_sse2
+;* WelsSampleSatd8x8_sse2
+;* WelsSampleSatd16x8_sse2
+;* WelsSampleSatd8x16_sse2
+;* WelsSampleSatd16x16_sse2
+;*
+;* WelsSampleSad16x8_sse2
+;* WelsSampleSad16x16_sse2
+;*
+;* History
+;* 8/5/2009 Created
+;* 24/9/2009 modified
+;*
+;*
+;*************************************************************************/
+
+%include "asm_inc.asm"
+
+;***********************************************************************
+; Data
+;***********************************************************************
+SECTION .rodata align=16
+
+align 16
+HSumSubDB1: db 1,1,1,1,1,1,1,1,1,-1,1,-1,1,-1,1,-1 ; pmaddubsw weights: bytes 0-7 give pair sums, bytes 8-15 give pair differences
+align 16
+HSumSubDW1: dw 1,-1,1,-1,1,-1,1,-1 ; pmaddwd weights: difference of each adjacent word pair
+align 16
+PDW1: dw 1,1,1,1,1,1,1,1 ; pmaddwd weights: sum of each adjacent word pair
+align 16
+PDQ2: dw 2,0,0,0,2,0,0,0 ; the value 2 in each qword; loaded as the rounding term of (sum+2)>>2
+align 16
+HSwapSumSubDB1: times 2 db 1, 1, 1, 1, 1, -1, 1, -1 ; pmaddubsw weights, per 8 bytes: two pair sums then two pair differences
+
+;***********************************************************************
+; Code
+;***********************************************************************
+SECTION .text
+
+;***********************************************************************
+;
+;Pixel_satd_wxh_sse2 BEGIN
+;
+;***********************************************************************
+%macro MMX_DW_1_2REG 2 ; sets every 16-bit word of %1 to 1; %2 is scratch and is left all-ones
+ pxor %1, %1 ; destination = 0
+ pcmpeqw %2, %2 ; scratch = 0xFFFF in every word, i.e. -1
+ psubw %1, %2 ; 0 - (-1) = 1 in every word
+%endmacro
+
+%macro SSE2_SumWHorizon1 2 ; saturating horizontal sum of the eight words of %1 into its low word; %2 is scratch; the other words of %1 are left with partial sums
+ movdqa %2, %1
+ psrldq %2, 8 ; bring the upper four words down
+ paddusw %1, %2 ; low four words = w[i] + w[i+4]
+ movdqa %2, %1
+ psrldq %2, 4 ; shift down by two words
+ paddusw %1, %2 ; low two words each hold the sum of four inputs
+ movdqa %2, %1
+ psrldq %2, 2 ; shift down by one word
+ paddusw %1, %2 ; word 0 = saturating sum of all eight inputs
+%endmacro
+
+%macro SSE2_HDMTwo4x4 5 ;in: %1,%2,%3,%4 (original note: xmm1,xmm2,xmm3,xmm4); out order: %4,%2,%1,%3 (original note: xmm4,xmm2,xmm1,xmm3); %5 is a temporary
+ SSE2_SumSub %1, %2, %5 ; first stage, first input pair (SSE2_SumSub is not defined in this chunk; presumably a sum/difference butterfly - confirm)
+ SSE2_SumSub %3, %4, %5 ; first stage, second input pair
+ SSE2_SumSub %2, %4, %5 ; second stage, combining results of the two first-stage steps
+ SSE2_SumSub %1, %3, %5
+%endmacro
+
+%macro SSE2_SumAbs4 7 ; %7 += abs(%1)+abs(%2)+abs(%4)+abs(%5), word-wise saturating (assuming WELS_AbsW works in place); %3, %6 are temporaries; %1, %4 end up holding partial sums
+ WELS_AbsW %1, %3 ; WELS_AbsW is not defined in this chunk; presumably leaves abs() of its first operand in place - confirm
+ WELS_AbsW %2, %3
+ WELS_AbsW %4, %6
+ WELS_AbsW %5, %6
+ paddusw %1, %2 ; first pair
+ paddusw %4, %5 ; second pair
+ paddusw %7, %1 ; fold both pairs into the accumulator
+ paddusw %7, %4
+%endmacro
+
+%macro SSE2_SumWHorizon 3 ; sums the eight words of %1 into its low dword; %2 is scratch; %3 is interleaved as the high words (callers in this file pass xmm7, which they clear with pxor first)
+ movhlps %2, %1 ; x2 = xx xx xx xx d7 d6 d5 d4
+ paddw %1, %2 ; x1 = xx xx xx xx d37 d26 d15 d04
+ punpcklwd %1, %3 ; x1 = d37 d26 d15 d04
+ movhlps %2, %1 ; x2 = xxxx xxxx d37 d26
+ paddd %1, %2 ; x1 = xxxx xxxx d1357 d0246
+ pshuflw %2, %1, 0x4e ; x2 = xxxx xxxx d0246 d1357
+ paddd %1, %2 ; x1 = xxxx xxxx xxxx d01234567
+%endmacro
+
+%macro SSE2_GetSatd8x8 0 ; accumulates into xmm6 for eight rows of r0 (stride r1) versus r2 (stride r3); on exit r0 and r2 have advanced by 6 rows; uses xmm0-xmm5 and xmm7
+ SSE2_LoadDiff8P xmm0,xmm4,xmm7,[r0],[r2] ; rows 0-3 (SSE2_LoadDiff8P is not defined in this chunk; presumably yields word differences of 8 pixels - confirm)
+ SSE2_LoadDiff8P xmm1,xmm5,xmm7,[r0+r1],[r2+r3]
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ SSE2_LoadDiff8P xmm2,xmm4,xmm7,[r0],[r2]
+ SSE2_LoadDiff8P xmm3,xmm5,xmm7,[r0+r1],[r2+r3]
+
+ SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm4
+ SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm4
+ SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm4,xmm5
+ SSE2_SumAbs4 xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6 ; xmm6 is the running accumulator
+
+ lea r0, [r0+2*r1] ; move on to rows 4-7
+ lea r2, [r2+2*r3]
+ SSE2_LoadDiff8P xmm0,xmm4,xmm7,[r0],[r2]
+ SSE2_LoadDiff8P xmm1,xmm5,xmm7,[r0+r1],[r2+r3]
+ lea r0, [r0+2*r1] ; third and last advance: pointers end up at row 6
+ lea r2, [r2+2*r3]
+ SSE2_LoadDiff8P xmm2,xmm4,xmm7,[r0],[r2]
+ SSE2_LoadDiff8P xmm3,xmm5,xmm7,[r0+r1],[r2+r3]
+
+ SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm4
+ SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm4
+ SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm4,xmm5
+ SSE2_SumAbs4 xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
+%endmacro
+
+;***********************************************************************
+;
+;int32_t WelsSampleSatd4x4_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSatd4x4_sse2
+align 16
+WelsSampleSatd4x4_sse2:
+ ; Compares two 4x4 byte blocks; the result is returned in retrd.
+ ; r0/r1 = first block pointer/stride, r2/r3 = second block pointer/stride,
+ ; set up by LOAD_4_PARA (not defined in this chunk; presumably asm_inc.asm).
+ ; The pre-port 32-bit code did push ebx and then loaded eax/ebx/ecx/edx
+ ; from [esp+8]/[esp+12]/[esp+16]/[esp+20].
+
+ %assign push_num 0
+ LOAD_4_PARA
+ SIGN_EXTENTION r1, r1d
+ SIGN_EXTENTION r3, r3d
+ movd xmm0, [r0] ; first block, row 0
+ movd xmm1, [r0+r1] ; row 1
+ lea r0 , [r0+2*r1]
+ movd xmm2, [r0] ; row 2
+ movd xmm3, [r0+r1] ; row 3
+ punpckldq xmm0, xmm2 ; xmm0 = rows 0 and 2
+ punpckldq xmm1, xmm3 ; xmm1 = rows 1 and 3
+
+ movd xmm4, [r2] ; second block, same layout
+ movd xmm5, [r2+r3]
+ lea r2 , [r2+2*r3]
+ movd xmm6, [r2]
+ movd xmm7, [r2+r3]
+ punpckldq xmm4, xmm6
+ punpckldq xmm5, xmm7
+
+ pxor xmm6, xmm6 ; zero: used to widen bytes, later reused as the accumulator
+ punpcklbw xmm0, xmm6
+ punpcklbw xmm1, xmm6
+ punpcklbw xmm4, xmm6
+ punpcklbw xmm5, xmm6
+
+ psubw xmm0, xmm4 ; word differences, rows 0 and 2
+ psubw xmm1, xmm5 ; word differences, rows 1 and 3
+
+ movdqa xmm2, xmm0
+ paddw xmm0, xmm1 ; sum half of the butterfly
+ psubw xmm2, xmm1 ; difference half of the butterfly
+ SSE2_XSawp qdq, xmm0, xmm2, xmm3
+
+ movdqa xmm4, xmm0
+ paddw xmm0, xmm3
+ psubw xmm4, xmm3
+
+ movdqa xmm2, xmm0
+ punpcklwd xmm0, xmm4
+ punpckhwd xmm4, xmm2
+
+ SSE2_XSawp dq, xmm0, xmm4, xmm3
+ SSE2_XSawp qdq, xmm0, xmm3, xmm5
+
+ movdqa xmm7, xmm0
+ paddw xmm0, xmm5
+ psubw xmm7, xmm5
+
+ SSE2_XSawp qdq, xmm0, xmm7, xmm1
+
+ movdqa xmm2, xmm0
+ paddw xmm0, xmm1
+ psubw xmm2, xmm1
+
+ WELS_AbsW xmm0, xmm3
+ paddusw xmm6, xmm0 ; xmm6 is still zero here, so this starts the accumulation
+ WELS_AbsW xmm2, xmm4
+ paddusw xmm6, xmm2
+ SSE2_SumWHorizon1 xmm6, xmm4 ; total lands in the low word of xmm6
+ movd retrd, xmm6
+ and retrd, 0xffff ; keep only that low word
+ shr retrd, 1 ; halve the sum
+ LOAD_4_PARA_POP
+ ret
+
+ ;***********************************************************************
+ ;
+ ;int32_t WelsSampleSatd8x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
+ ;
+ ;***********************************************************************
+ WELS_EXTERN WelsSampleSatd8x8_sse2
+align 16
+ WelsSampleSatd8x8_sse2:
+ ; Compares two 8x8 byte blocks; the result is returned in retrd.
+ ; r0/r1 = first block pointer/stride, r2/r3 = second block pointer/stride,
+ ; set up by LOAD_4_PARA (not defined in this chunk; presumably asm_inc.asm).
+ ; NOTE(review): xmm6/xmm7 are overwritten without being saved; they are
+ ; callee-saved under the Microsoft x64 ABI - confirm if WIN64 is a target.
+
+ %assign push_num 0
+ LOAD_4_PARA
+ SIGN_EXTENTION r1, r1d
+ SIGN_EXTENTION r3, r3d
+ pxor xmm6, xmm6 ; word accumulator
+ pxor xmm7, xmm7 ; zero, handed to the helper macros
+ SSE2_GetSatd8x8
+ psrlw xmm6, 1 ; halve each word before the horizontal sum
+ SSE2_SumWHorizon xmm6,xmm4,xmm7
+ movd retrd, xmm6
+ LOAD_4_PARA_POP
+ ret
+
+ ;***********************************************************************
+ ;
+ ;int32_t WelsSampleSatd8x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
+ ;
+ ;***********************************************************************
+ WELS_EXTERN WelsSampleSatd8x16_sse2
+align 16
+ WelsSampleSatd8x16_sse2:
+ ; Compares two blocks of 8 columns by 16 rows; result returned in retrd.
+ ; r0/r1 = first block pointer/stride, r2/r3 = second block pointer/stride,
+ ; set up by LOAD_4_PARA (not defined in this chunk; presumably asm_inc.asm).
+ ; The block is handled as two stacked 8x8 halves that share the xmm6
+ ; accumulator.
+
+ %assign push_num 0
+ LOAD_4_PARA
+ SIGN_EXTENTION r1, r1d
+ SIGN_EXTENTION r3, r3d
+ pxor xmm6, xmm6 ; word accumulator
+ pxor xmm7, xmm7 ; zero, handed to the helper macros
+
+ SSE2_GetSatd8x8 ; rows 0-7; leaves r0/r2 at row 6
+ lea r0, [r0+2*r1] ; step from row 6 to row 8
+ lea r2, [r2+2*r3]
+ SSE2_GetSatd8x8 ; rows 8-15
+
+ psrlw xmm6, 1 ; halve each word before the horizontal sum
+ SSE2_SumWHorizon xmm6,xmm4,xmm7
+ movd retrd, xmm6
+ LOAD_4_PARA_POP
+ ret
+
+;***********************************************************************
+;
+;int32_t WelsSampleSatd16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSatd16x8_sse2
+align 16
+WelsSampleSatd16x8_sse2:
+ ; Compares two blocks of 16 columns by 8 rows; result returned in retrd.
+ ; r0/r1 = first block pointer/stride, r2/r3 = second block pointer/stride,
+ ; set up by LOAD_4_PARA (not defined in this chunk; presumably asm_inc.asm).
+ ; The block is handled as two side-by-side 8x8 halves that share the
+ ; xmm6 accumulator.
+
+ %assign push_num 0
+ LOAD_4_PARA
+ SIGN_EXTENTION r1, r1d
+ SIGN_EXTENTION r3, r3d
+ push r0 ; keep the block start pointers; the macro below advances r0/r2
+ push r2
+ pxor xmm6, xmm6 ; word accumulator
+ pxor xmm7, xmm7 ; zero, handed to the helper macros
+
+ SSE2_GetSatd8x8 ; left 8 columns
+
+ pop r2
+ pop r0
+ ; r0/r2 are back at the start of the block (the 32-bit code used to
+ ; reload them from [esp+8] and [esp+16] instead).
+ add r0, 8 ; right 8 columns
+ add r2, 8
+ SSE2_GetSatd8x8
+
+ psrlw xmm6, 1 ; halve each word before the horizontal sum
+ SSE2_SumWHorizon xmm6,xmm4,xmm7
+ movd retrd, xmm6
+ LOAD_4_PARA_POP
+ ret
+
+;***********************************************************************
+;
+;int32_t WelsSampleSatd16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t );
+;
+;***********************************************************************
+WELS_EXTERN WelsSampleSatd16x16_sse2
+align 16
+WelsSampleSatd16x16_sse2:
+ ; Compares two 16x16 byte blocks; the result is returned in retrd.
+ ; r0/r1 = first block pointer/stride, r2/r3 = second block pointer/stride,
+ ; set up by LOAD_4_PARA (not defined in this chunk; presumably asm_inc.asm).
+ ; The block is handled as four 8x8 quadrants (left column top/bottom,
+ ; then right column top/bottom) that share the xmm6 accumulator.
+
+ %assign push_num 0
+ LOAD_4_PARA
+ SIGN_EXTENTION r1, r1d
+ SIGN_EXTENTION r3, r3d
+ push r0 ; keep the block start pointers; the macro below advances r0/r2
+ push r2
+ pxor xmm6, xmm6 ; word accumulator
+ pxor xmm7, xmm7 ; zero, handed to the helper macros
+
+ SSE2_GetSatd8x8 ; top-left quadrant; leaves r0/r2 at row 6
+ lea r0, [r0+2*r1] ; step from row 6 to row 8
+ lea r2, [r2+2*r3]
+ SSE2_GetSatd8x8 ; bottom-left quadrant
+
+ pop r2
+ pop r0
+ ; r0/r2 are back at the start of the block (the 32-bit code used to
+ ; reload them from [esp+8] and [esp+16] instead).
+ add r0, 8 ; right 8 columns
+ add r2, 8
+
+ SSE2_GetSatd8x8 ; top-right quadrant
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ SSE2_GetSatd8x8 ; bottom-right quadrant
+
+ ; each column sum of SATD is necessarily even, so we don't lose any precision by shifting first.
+ psrlw xmm6, 1
+ SSE2_SumWHorizon xmm6,xmm4,xmm7
+ movd retrd, xmm6
+ LOAD_4_PARA_POP
+ ret
+
+;***********************************************************************
+;
+;Pixel_satd_wxh_sse2 END
+;
+;***********************************************************************
+
+;***********************************************************************
+;
+;Pixel_satd_intra_sse2 BEGIN
+;
+;***********************************************************************
+
+%macro SSE41_I16x16Get8WSumSub 3 ;expects xmm5=[HSumSubDB1], xmm6=[HSumSubDW1], xmm7=[PDW1]; in: %1 (bytes); out: %1 (words, shifted left by 2), %3; %2 is scratch; adds both dword halves into xmm4
+ pmaddubsw %1, xmm5 ; byte pairs -> words: sums in the low half, differences in the high half
+ movdqa %2, %1
+ pmaddwd %1, xmm7 ; adjacent word pairs summed into dwords
+ pmaddwd %2, xmm6 ; adjacent word pairs differenced into dwords
+ movdqa %3, %1
+ punpckldq %1, %2
+ punpckhdq %2, %3
+ movdqa %3, %1
+ punpcklqdq %1, %2
+ punpckhqdq %3, %2
+ paddd xmm4, %1 ;for dc
+ paddd xmm4, %3 ;for dc
+ packssdw %1, %3 ; narrow the eight dwords to words with signed saturation
+ psllw %1, 2 ; scale every word by 4
+%endmacro
+%macro SSE41_ChromaGet8WSumSub 4 ;expects xmm5=[HSumSubDB1], xmm6=[HSumSubDW1], xmm7=[PDW1]; in: %1 (bytes); out: %1 (words, shifted left by 2), %3; %2 is scratch; %4 receives the low qwords of both dword halves
+ pmaddubsw %1, xmm5 ; byte pairs -> words: sums in the low half, differences in the high half
+ movdqa %2, %1
+ pmaddwd %1, xmm7 ; adjacent word pairs summed into dwords
+ pmaddwd %2, xmm6 ; adjacent word pairs differenced into dwords
+ movdqa %3, %1
+ punpckldq %1, %2
+ punpckhdq %2, %3
+ movdqa %3, %1
+ punpcklqdq %1, %2
+ punpckhqdq %3, %2
+; paddd xmm4, %1 ;for dc (disabled here: this variant does not touch xmm4)
+; paddd xmm4, %3 ;for dc
+ movdqa %4, %1
+ punpcklqdq %4, %3 ; low qword of the first half paired with low qword of the second half
+ packssdw %1, %3 ; narrow the eight dwords to words with signed saturation
+ psllw %1, 2 ; scale every word by 4
+%endmacro
+
+%macro SSE41_GetX38x4SatdDec 0 ; transforms four 8-pixel rows read from eax (stride ebx); advances eax by 4 rows; results in xmm7,xmm1,xmm3,xmm2; clobbers xmm0
+ pxor xmm7, xmm7 ; zero for widening bytes to words
+ movq xmm0, [eax]
+ movq xmm1, [eax+ebx]
+ lea eax, [eax+2*ebx]
+ movq xmm2, [eax]
+ movq xmm3, [eax+ebx]
+ lea eax, [eax+2*ebx]
+ punpcklbw xmm0, xmm7
+ punpcklbw xmm1, xmm7
+ punpcklbw xmm2, xmm7
+ punpcklbw xmm3, xmm7
+ SSE2_HDMTwo4x4 xmm0,xmm1,xmm2,xmm3,xmm7
+ SSE2_TransTwo4x4W xmm3,xmm1,xmm0,xmm2,xmm7
+ SSE2_HDMTwo4x4 xmm3,xmm1,xmm2,xmm7,xmm0 ;pOut xmm7,xmm1,xmm3,xmm2
+ ;doesn't need another transpose
+%endmacro
+%macro SSE41_GetX38x4SatdV 2 ; xmm4 += abs(V reference - transformed rows); %2 = byte offset into the buffer at esi; %1 is not used in the body
+ pxor xmm0, xmm0
+ pinsrw xmm0, word[esi+%2], 0 ; reference words go into lanes 0 and 4, every other lane stays 0
+ pinsrw xmm0, word[esi+%2+8], 4
+ psubsw xmm0, xmm7 ; compare against the first transformed register
+ pabsw xmm0, xmm0
+ paddw xmm4, xmm0
+ pxor xmm0, xmm0
+ pinsrw xmm0, word[esi+%2+2], 0
+ pinsrw xmm0, word[esi+%2+10], 4
+ psubsw xmm0, xmm1 ; second transformed register
+ pabsw xmm0, xmm0
+ paddw xmm4, xmm0
+ pxor xmm0, xmm0
+ pinsrw xmm0, word[esi+%2+4], 0
+ pinsrw xmm0, word[esi+%2+12], 4
+ psubsw xmm0, xmm3 ; third transformed register
+ pabsw xmm0, xmm0
+ paddw xmm4, xmm0
+ pxor xmm0, xmm0
+ pinsrw xmm0, word[esi+%2+6], 0
+ pinsrw xmm0, word[esi+%2+14], 4
+ psubsw xmm0, xmm2 ; fourth transformed register
+ pabsw xmm0, xmm0
+ paddw xmm4, xmm0
+%endmacro
+%macro SSE41_GetX38x4SatdH 3 ; xmm5 += H cost; %1 = row-group index register, %3 = byte offset of the H data in the buffer at esi; %2 is not used in the body; leaves abs sums in xmm2 for the DC step
+ movq xmm0, [esi+%3+8*%1] ; four reference words for this row group
+ punpcklqdq xmm0, xmm0 ; duplicated into both halves
+ psubsw xmm0, xmm7
+ pabsw xmm0, xmm0
+ paddw xmm5, xmm0
+ pabsw xmm1, xmm1 ; the three remaining registers only contribute their absolute values
+ pabsw xmm2, xmm2
+ pabsw xmm3, xmm3
+ paddw xmm2, xmm1;for DC
+ paddw xmm2, xmm3;for DC
+ paddw xmm5, xmm2
+%endmacro
+%macro SSE41_I16X16GetX38x4SatdDC 0 ; xmm6 += DC cost; expects the DC value in mm4 and the abs sums left in xmm2 by SSE41_GetX38x4SatdH
+ pxor xmm0, xmm0
+ movq2dq xmm0, mm4 ; DC value from the MMX register
+ punpcklqdq xmm0, xmm0 ; duplicated into both halves
+ psubsw xmm0, xmm7
+ pabsw xmm0, xmm0
+ paddw xmm6, xmm0
+ paddw xmm6, xmm2 ; plus the abs() sums of the other three registers
+%endmacro
+%macro SSE41_ChromaGetX38x4SatdDC 1 ; xmm6 += DC cost for row group %1; NOTE: %1 is destroyed (left multiplied by 16)
+ shl %1, 4 ; index -> byte offset of a 16-byte entry; this overwrites the caller's register
+ movdqa xmm0, [esi+32+%1] ; DC reference for this row group (aligned load)
+ psubsw xmm0, xmm7
+ pabsw xmm0, xmm0
+ paddw xmm6, xmm0
+ paddw xmm6, xmm2 ; plus the abs() sums left in xmm2 by SSE41_GetX38x4SatdH
+%endmacro
+%macro SSE41_I16x16GetX38x4Satd 2 ; one 8x4 step of the 16x16 search: %1 = row-group index register, %2 = byte offset register for the V data
+ SSE41_GetX38x4SatdDec ; transform four source rows
+ SSE41_GetX38x4SatdV %1, %2 ; V cost -> xmm4
+ SSE41_GetX38x4SatdH %1, %2, 32 ; H cost -> xmm5 (H data starts at esi+32)
+ SSE41_I16X16GetX38x4SatdDC ; DC cost -> xmm6
+%endmacro
+%macro SSE41_ChromaGetX38x4Satd 2 ; one 8x4 step of the chroma search: %1 = row-group index register (destroyed by the DC step), %2 = byte offset for the V data
+ SSE41_GetX38x4SatdDec ; transform four source rows
+ SSE41_GetX38x4SatdV %1, %2 ; V cost -> xmm4
+ SSE41_GetX38x4SatdH %1, %2, 16 ; H cost -> xmm5 (H data starts at esi+16)
+ SSE41_ChromaGetX38x4SatdDC %1 ; DC cost -> xmm6
+%endmacro
+%macro SSE41_HSum8W 3 ; low dword of %1 = sum over its eight words, each multiplied by the matching word of %2 (callers pass all-ones words); %3 is scratch
+ pmaddwd %1, %2 ; multiply and add adjacent word pairs into four dwords
+ movhlps %3, %1
+ paddd %1, %3 ; fold the upper two dwords onto the lower two
+ pshuflw %3, %1,0Eh ; bring dword 1 down to dword 0
+ paddd %1, %3 ; dword 0 = total
+%endmacro
+
+
+%ifdef X86_32
+WELS_EXTERN WelsIntra16x16Combined3Satd_sse41
+WelsIntra16x16Combined3Satd_sse41: ; evaluates the V, H and DC candidates for a 16x16 block, stores the chosen mode through arg5 and returns its cost in eax (32-bit cdecl stack arguments)
+ push ebx
+ push esi
+ push edi
+ mov ecx, [esp+16] ; arg1: block pointer; row [ecx-edx] and column [ecx-1] feed the candidates
+ mov edx, [esp+20] ; arg2: stride for arg1
+ mov eax, [esp+24] ; arg3: pointer to the pixels that get transformed
+ mov ebx, [esp+28] ; arg4: stride for arg3
+ mov esi, [esp+40] ;temp_satd (arg7): 64 bytes are written with movdqa, so it must be 16-byte aligned
+ pxor xmm4, xmm4
+ movdqa xmm5, [HSumSubDB1]
+ movdqa xmm6, [HSumSubDW1]
+ movdqa xmm7, [PDW1]
+ sub ecx, edx ; step up to the row above the block
+ movdqu xmm0, [ecx]
+ movhlps xmm1, xmm0
+ punpcklqdq xmm0, xmm0
+ punpcklqdq xmm1, xmm1
+ SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
+ SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
+ movdqa [esi], xmm0 ;V
+ movdqa [esi+16], xmm1
+ add ecx, edx ; back to the first row of the block
+ pinsrb xmm0, byte[ecx-1], 0 ; gather the 16 bytes of the column left of the block
+ pinsrb xmm0, byte[ecx+edx-1], 1
+ lea ecx, [ecx+2*edx]
+ pinsrb xmm0, byte[ecx-1], 2
+ pinsrb xmm0, byte[ecx+edx-1], 3
+ lea ecx, [ecx+2*edx]
+ pinsrb xmm0, byte[ecx-1], 4
+ pinsrb xmm0, byte[ecx+edx-1], 5
+ lea ecx, [ecx+2*edx]
+ pinsrb xmm0, byte[ecx-1], 6
+ pinsrb xmm0, byte[ecx+edx-1], 7
+ lea ecx, [ecx+2*edx]
+ pinsrb xmm0, byte[ecx-1], 8
+ pinsrb xmm0, byte[ecx+edx-1], 9
+ lea ecx, [ecx+2*edx]
+ pinsrb xmm0, byte[ecx-1], 10
+ pinsrb xmm0, byte[ecx+edx-1], 11
+ lea ecx, [ecx+2*edx]
+ pinsrb xmm0, byte[ecx-1], 12
+ pinsrb xmm0, byte[ecx+edx-1], 13
+ lea ecx, [ecx+2*edx]
+ pinsrb xmm0, byte[ecx-1], 14
+ pinsrb xmm0, byte[ecx+edx-1], 15
+ movhlps xmm1, xmm0
+ punpcklqdq xmm0, xmm0
+ punpcklqdq xmm1, xmm1
+ SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
+ SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
+ movdqa [esi+32], xmm0 ;H
+ movdqa [esi+48], xmm1
+ movd ecx, xmm4 ;dc
+ add ecx, 16 ;(sum+16)
+ shr ecx, 5 ;((sum+16)>>5)
+ shl ecx, 4 ;
+ movd mm4, ecx ; mm4 copy DC
+ pxor xmm4, xmm4 ;V
+ pxor xmm5, xmm5 ;H
+ pxor xmm6, xmm6 ;DC
+ mov ecx, 0 ; ecx = row-group counter (0..3)
+ mov edi, 0 ; edi = byte offset into the V data: 0 for the left half, 16 for the right half
+.loop16x16_get_satd:
+.loopStart1:
+ SSE41_I16x16GetX38x4Satd ecx, edi
+ inc ecx
+ cmp ecx, 4
+ jl .loopStart1
+ cmp edi, 16 ; both 8-column halves done?
+ je .loop16x16_get_satd_end
+ mov eax, [esp+24] ; restart from arg3 ...
+ add eax, 8 ; ... at the right 8 columns
+ mov ecx, 0
+ add edi, 16
+ jmp .loop16x16_get_satd
+ .loop16x16_get_satd_end:
+ MMX_DW_1_2REG xmm0, xmm1 ; xmm0 = all-ones words for the horizontal sums
+ psrlw xmm4, 1 ;/2
+ psrlw xmm5, 1 ;/2
+ psrlw xmm6, 1 ;/2
+ SSE41_HSum8W xmm4, xmm0, xmm1
+ SSE41_HSum8W xmm5, xmm0, xmm1
+ SSE41_HSum8W xmm6, xmm0, xmm1
+
+ ; comparing order: DC H V
+ movd ebx, xmm6 ;DC
+ movd edi, xmm5 ;H
+ movd ecx, xmm4 ;V
+ mov edx, [esp+36] ; arg6: bias value
+ shl edx, 1 ; doubled ...
+ add edi, edx ; ... and added to the H cost
+ add ebx, edx ; ... and to the DC cost (V gets none)
+ mov edx, [esp+32] ; arg5: pointer that receives the chosen mode
+ cmp ebx, edi
+ jge near not_dc_16x16 ; DC is taken only when strictly cheaper than both H and V
+ cmp ebx, ecx
+ jge near not_dc_h_16x16
+
+ ; for DC mode
+ mov dword[edx], 2;I16_PRED_DC
+ mov eax, ebx
+ jmp near return_satd_intra_16x16_x3
+not_dc_16x16:
+ ; for H mode
+ cmp edi, ecx
+ jge near not_dc_h_16x16 ; H is taken only when strictly cheaper than V
+ mov dword[edx], 1;I16_PRED_H
+ mov eax, edi
+ jmp near return_satd_intra_16x16_x3
+not_dc_h_16x16:
+ ; for V mode
+ mov dword[edx], 0;I16_PRED_V
+ mov eax, ecx
+return_satd_intra_16x16_x3:
+ WELSEMMS ; mm4 was used above (WELSEMMS is not defined in this chunk; presumably clears the MMX state)
+ pop edi
+ pop esi
+ pop ebx
+ret
+
+%macro SSE41_ChromaGetX38x8Satd 0 ; one 8x8 plane: ecx/edx = block pointer/stride for the neighbours, eax/ebx = pixels to transform, esi = 64-byte aligned scratch; leaves V/H/DC partial costs in xmm4/xmm5/xmm6
+ movdqa xmm5, [HSumSubDB1]
+ movdqa xmm6, [HSumSubDW1]
+ movdqa xmm7, [PDW1]
+ sub ecx, edx ; step up to the row above the block
+ movq xmm0, [ecx]
+ punpcklqdq xmm0, xmm0
+ SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm4
+ movdqa [esi], xmm0 ;V
+ add ecx, edx ; back to the first row of the block
+ pinsrb xmm0, byte[ecx-1], 0 ; gather the 8 bytes of the column left of the block
+ pinsrb xmm0, byte[ecx+edx-1], 1
+ lea ecx, [ecx+2*edx]
+ pinsrb xmm0, byte[ecx-1], 2
+ pinsrb xmm0, byte[ecx+edx-1], 3
+ lea ecx, [ecx+2*edx]
+ pinsrb xmm0, byte[ecx-1], 4
+ pinsrb xmm0, byte[ecx+edx-1], 5
+ lea ecx, [ecx+2*edx]
+ pinsrb xmm0, byte[ecx-1], 6
+ pinsrb xmm0, byte[ecx+edx-1], 7
+ punpcklqdq xmm0, xmm0
+ SSE41_ChromaGet8WSumSub xmm0, xmm2, xmm3, xmm1
+ movdqa [esi+16], xmm0 ;H
+;(sum+2)>>2
+ movdqa xmm6, [PDQ2]
+ movdqa xmm5, xmm4
+ punpckhqdq xmm5, xmm1
+ paddd xmm5, xmm6
+ psrld xmm5, 2
+;(sum1+sum2+4)>>3
+ paddd xmm6, xmm6 ; rounding term becomes 4
+ paddd xmm4, xmm1
+ paddd xmm4, xmm6
+ psrld xmm4, 3
+;satd *16
+ pslld xmm5, 4
+ pslld xmm4, 4
+;temp satd
+ movdqa xmm6, xmm4
+ punpcklqdq xmm4, xmm5
+ psllq xmm4, 32 ; shift left then right: keeps only the low dword of each qword
+ psrlq xmm4, 32
+ movdqa [esi+32], xmm4
+ punpckhqdq xmm5, xmm6
+ psllq xmm5, 32
+ psrlq xmm5, 32
+ movdqa [esi+48], xmm5
+
+ pxor xmm4, xmm4 ;V
+ pxor xmm5, xmm5 ;H
+ pxor xmm6, xmm6 ;DC
+ mov ecx, 0
+loop_chroma_satdx3_cb_cr: ; NOTE: a plain (non-%%) label, so this macro can be expanded only once per file
+ SSE41_ChromaGetX38x4Satd ecx, 0 ; the DC step shifts ecx left by 4, so ecx is no longer a plain counter afterwards
+ inc ecx
+ cmp ecx, 2 ; still yields exactly two passes: ecx goes 0 -> 1, then 16 -> 17
+ jl loop_chroma_satdx3_cb_cr
+%endmacro
+
+%macro SSEReg2MMX 3 ; splits xmm register %1 into two MMX registers: %2 = low qword, %3 = high qword; %1 is modified
+ movdq2q %2, %1 ; low qword
+ movhlps %1, %1 ; copy the high qword over the low one
+ movdq2q %3, %1 ; former high qword
+%endmacro
+%macro MMXReg2SSE 4 ; rebuilds xmm register %1 from two MMX registers: low qword = %3, high qword = %4; %2 is an xmm scratch register
+ movq2dq %1, %3 ; low qword
+ movq2dq %2, %4
+ punpcklqdq %1, %2 ; append the second value as the high qword
+%endmacro
+;for reducing the code size of WelsIntraChroma8x8Combined3Satd_sse41
+
+WELS_EXTERN WelsIntraChroma8x8Combined3Satd_sse41
+WelsIntraChroma8x8Combined3Satd_sse41: ; evaluates the DC, H and V candidates over two 8x8 planes, stores the chosen mode through arg5 and returns its cost in eax (32-bit cdecl stack arguments)
+ push ebx
+ push esi
+ push edi
+ mov ecx, [esp+16] ; arg1: first-plane block pointer (its neighbours feed the candidates)
+ mov edx, [esp+20] ; arg2: stride for arg1 (also used for arg8)
+ mov eax, [esp+24] ; arg3: first-plane pixels that get transformed
+ mov ebx, [esp+28] ; arg4: stride for arg3 (also used for arg9)
+ mov esi, [esp+40] ;temp_satd
+ xor edi, edi ; edi = plane counter
+loop_chroma_satdx3:
+ SSE41_ChromaGetX38x8Satd
+ cmp edi, 1
+ je loop_chroma_satdx3end
+ inc edi
+ SSEReg2MMX xmm4, mm0,mm1 ; park the first plane's V/H/DC sums in MMX registers
+ SSEReg2MMX xmm5, mm2,mm3
+ SSEReg2MMX xmm6, mm5,mm6
+ mov ecx, [esp+44] ; arg8: second-plane block pointer
+ mov eax, [esp+48] ; arg9: second-plane pixels that get transformed
+ jmp loop_chroma_satdx3
+loop_chroma_satdx3end:
+ MMXReg2SSE xmm0, xmm3, mm0, mm1 ; bring the first plane's sums back
+ MMXReg2SSE xmm1, xmm3, mm2, mm3
+ MMXReg2SSE xmm2, xmm3, mm5, mm6
+
+ paddw xmm4, xmm0 ; V: both planes
+ paddw xmm5, xmm1 ; H: both planes
+ paddw xmm6, xmm2 ; DC: both planes
+
+ MMX_DW_1_2REG xmm0, xmm1 ; xmm0 = all-ones words for the horizontal sums
+ psrlw xmm4, 1 ;/2
+ psrlw xmm5, 1 ;/2
+ psrlw xmm6, 1 ;/2
+ SSE41_HSum8W xmm4, xmm0, xmm1
+ SSE41_HSum8W xmm5, xmm0, xmm1
+ SSE41_HSum8W xmm6, xmm0, xmm1
+ ; comparing order: DC H V
+ movd ebx, xmm6 ;DC
+ movd edi, xmm5 ;H
+ movd ecx, xmm4 ;V
+ mov edx, [esp+36] ; arg6: bias value
+ shl edx, 1 ; doubled ...
+ add edi, edx ; ... and added to the H cost
+ add ecx, edx ; ... and to the V cost (DC gets none)
+ mov edx, [esp+32] ; arg5: pointer that receives the chosen mode
+ cmp ebx, edi
+ jge near not_dc_8x8 ; DC is taken only when strictly cheaper than both H and V
+ cmp ebx, ecx
+ jge near not_dc_h_8x8
+
+ ; for DC mode
+ mov dword[edx], 0;I8_PRED_DC
+ mov eax, ebx
+ jmp near return_satd_intra_8x8_x3
+not_dc_8x8:
+ ; for H mode
+ cmp edi, ecx
+ jge near not_dc_h_8x8 ; H is taken only when strictly cheaper than V
+ mov dword[edx], 1;I8_PRED_H
+ mov eax, edi
+ jmp near return_satd_intra_8x8_x3
+not_dc_h_8x8:
+ ; for V mode
+ mov dword[edx], 2;I8_PRED_V
+ mov eax, ecx
+return_satd_intra_8x8_x3:
+ WELSEMMS ; MMX registers were used above (WELSEMMS is not defined in this chunk; presumably clears the MMX state)
+ pop edi
+ pop esi
+ pop ebx
+ret
+
+
+;***********************************************************************
+;
+;Pixel_satd_intra_sse2 END
+;
+;***********************************************************************
+%macro SSSE3_Get16BSadHVDC 2 ; one 16-byte row: %1 = scratch row (memory), %2 = source row (memory, 16-byte aligned); expects xmm1 = 0, xmm5 = V row, xmm7 = DC row; adds SADs to xmm4 (DC), xmm2 (V), xmm3 (H)
+ movd xmm6,%1 ; dword whose low byte is this row's H value
+ pshufb xmm6,xmm1 ; all-zero shuffle mask: replicate byte 0 into all 16 bytes
+ movdqa %1, xmm6 ; write the replicated row back to the scratch buffer
+ movdqa xmm0,%2
+ psadbw xmm0,xmm7
+ paddw xmm4,xmm0 ; DC accumulator
+ movdqa xmm0,%2
+ psadbw xmm0,xmm5
+ paddw xmm2,xmm0 ; V accumulator
+ psadbw xmm6,%2
+ paddw xmm3,xmm6 ; H accumulator
+%endmacro
+%macro WelsAddDCValue 4 ; %1 = byte address, %2 = scratch register, %3 = destination (receives the zero-extended byte), %4 = running sum
+ movzx %2, byte %1 ; zero-extend the byte
+ mov %3, %2 ; store it
+ add %4, %2 ; and add it to the running sum
+%endmacro
+
+;***********************************************************************
+;
+;Pixel_sad_intra_ssse3 BEGIN
+;
+;***********************************************************************
+WELS_EXTERN WelsIntra16x16Combined3Sad_ssse3
+WelsIntra16x16Combined3Sad_ssse3: ; SAD-based choice between the V, H and DC candidates for a 16x16 block; stores the mode through arg5, returns the cost in eax and leaves the winning rows in temp_sad
+ push ebx
+ push esi
+ push edi
+ mov ecx, [esp+16] ; arg1: block pointer (its neighbours feed the candidates)
+ mov edx, [esp+20] ; arg2: stride for arg1
+ mov edi, [esp+40] ;temp_sad
+ sub ecx, edx ; step up to the row above the block
+ movdqa xmm5,[ecx] ; that row (aligned load); kept in xmm5 as the V candidate
+ pxor xmm0,xmm0
+ psadbw xmm0,xmm5 ; SAD against zero = sum of the bytes, one partial sum per qword
+ movhlps xmm1,xmm0
+ paddw xmm0,xmm1
+ movd eax,xmm0 ; eax = sum of the 16 bytes of that row
+
+ add ecx,edx
+ lea ebx, [edx+2*edx] ; ebx = 3 * stride
+ WelsAddDCValue [ecx-1 ], esi, [edi ], eax ; left-column byte of each row: stored at the start of its 16-byte scratch row and added to eax
+ WelsAddDCValue [ecx-1+edx ], esi, [edi+16], eax
+ WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
+ WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax
+ lea ecx, [ecx+4*edx]
+ add edi, 64
+ WelsAddDCValue [ecx-1 ], esi, [edi ], eax
+ WelsAddDCValue [ecx-1+edx ], esi, [edi+16], eax
+ WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
+ WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax
+ lea ecx, [ecx+4*edx]
+ add edi, 64
+ WelsAddDCValue [ecx-1 ], esi, [edi ], eax
+ WelsAddDCValue [ecx-1+edx ], esi, [edi+16], eax
+ WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
+ WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax
+ lea ecx, [ecx+4*edx]
+ add edi, 64
+ WelsAddDCValue [ecx-1 ], esi, [edi ], eax
+ WelsAddDCValue [ecx-1+edx ], esi, [edi+16], eax
+ WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
+ WelsAddDCValue [ecx-1+ebx ], esi, [edi+48], eax
+ sub edi, 192 ; back to the start of temp_sad
+ add eax,10h
+ shr eax,5 ; DC value = (sum of 32 neighbours + 16) >> 5
+ movd xmm7,eax
+ pxor xmm1,xmm1
+ pshufb xmm7,xmm1 ; replicate the DC byte into all 16 bytes of xmm7
+ pxor xmm4,xmm4
+ pxor xmm3,xmm3
+ pxor xmm2,xmm2
+;sad begin
+ mov eax, [esp+24] ; arg3: pointer to the pixels being compared
+ mov ebx, [esp+28] ; arg4: stride for arg3
+ lea esi, [ebx+2*ebx] ; esi = 3 * stride
+ SSSE3_Get16BSadHVDC [edi], [eax]
+ SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
+ SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
+ SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
+ add edi, 64
+ lea eax, [eax+4*ebx]
+ SSSE3_Get16BSadHVDC [edi], [eax]
+ SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
+ SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
+ SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
+ add edi, 64
+ lea eax, [eax+4*ebx]
+ SSSE3_Get16BSadHVDC [edi], [eax]
+ SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
+ SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
+ SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
+ add edi, 64
+ lea eax, [eax+4*ebx]
+ SSSE3_Get16BSadHVDC [edi], [eax]
+ SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
+ SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
+ SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
+
+ pslldq xmm3,4 ; move the H sums up one dword ...
+ por xmm3,xmm2 ; ... so each qword holds V in its low dword and H in its high dword
+ movhlps xmm1,xmm3
+ paddw xmm3,xmm1 ; add the two qword halves
+ movhlps xmm0,xmm4
+ paddw xmm4,xmm0
+; comparing order: DC H V
+ movd ebx, xmm4 ;DC
+ movd ecx, xmm3 ;V
+ psrldq xmm3, 4
+ movd esi, xmm3 ;H
+ mov eax, [esp+36] ;lambda
+ shl eax, 1 ; doubled ...
+ add esi, eax ; ... and added to the H cost
+ add ebx, eax ; ... and to the DC cost (V gets none)
+ mov edx, [esp+32] ; arg5: pointer that receives the chosen mode
+ cmp ebx, esi
+ jge near not_dc_16x16_sad ; DC is taken only when strictly cheaper than both H and V
+ cmp ebx, ecx
+ jge near not_dc_h_16x16_sad
+ ; for DC mode
+ mov dword[edx], 2;I16_PRED_DC
+ mov eax, ebx
+ sub edi, 192 ; back to the start of temp_sad
+%assign x 0
+%rep 16
+ movdqa [edi+16*x], xmm7
+%assign x x+1
+%endrep
+ jmp near return_sad_intra_16x16_x3
+not_dc_16x16_sad:
+ ; for H mode
+ cmp esi, ecx
+ jge near not_dc_h_16x16_sad ; H is taken only when strictly cheaper than V
+ mov dword[edx], 1;I16_PRED_H
+ mov eax, esi ; temp_sad already holds the replicated H rows, so nothing is rewritten
+ jmp near return_sad_intra_16x16_x3
+not_dc_h_16x16_sad:
+ ; for V mode
+ mov dword[edx], 0;I16_PRED_V
+ mov eax, ecx
+ sub edi, 192 ; back to the start of temp_sad
+%assign x 0
+%rep 16
+ movdqa [edi+16*x], xmm5
+%assign x x+1
+%endrep
+return_sad_intra_16x16_x3:
+ pop edi
+ pop esi
+ pop ebx
+ ret
+%endif
+;***********************************************************************
+;
+;Pixel_sad_intra_ssse3 END
+;
+;***********************************************************************
+;***********************************************************************
+;
+;Pixel_satd_wxh_sse41 BEGIN
+;
+;***********************************************************************
+
+;SSE4.1
+%macro SSE41_GetSatd8x4 0 ; adds the terms for four 8-pixel rows of r0 (stride r1) versus r2 (stride r3) to xmm6; expects xmm7 = pmaddubsw weights and r4/r5 = offsets of the fourth row (presumably 3*r1 and 3*r3 - confirm against the callers); pointers are not advanced
+ movq xmm0, [r0] ; first block, row 0
+ punpcklqdq xmm0, xmm0 ; duplicate the 8 bytes into both halves
+ pmaddubsw xmm0, xmm7
+ movq xmm1, [r0+r1] ; first block, row 1
+ punpcklqdq xmm1, xmm1
+ pmaddubsw xmm1, xmm7
+ movq xmm2, [r2] ; second block, row 0
+ punpcklqdq xmm2, xmm2
+ pmaddubsw xmm2, xmm7
+ movq xmm3, [r2+r3] ; second block, row 1
+ punpcklqdq xmm3, xmm3
+ pmaddubsw xmm3, xmm7
+ psubsw xmm0, xmm2 ; row 0 difference
+ psubsw xmm1, xmm3 ; row 1 difference
+ movq xmm2, [r0+2*r1] ; first block, row 2
+ punpcklqdq xmm2, xmm2
+ pmaddubsw xmm2, xmm7
+ movq xmm3, [r0+r4] ; first block, fourth row
+ punpcklqdq xmm3, xmm3
+ pmaddubsw xmm3, xmm7
+ movq xmm4, [r2+2*r3] ; second block, row 2
+ punpcklqdq xmm4, xmm4
+ pmaddubsw xmm4, xmm7
+ movq xmm5, [r2+r5] ; second block, fourth row
+ punpcklqdq xmm5, xmm5
+ pmaddubsw xmm5, xmm7
+ psubsw xmm2, xmm4 ; row 2 difference
+ psubsw xmm3, xmm5 ; fourth-row difference
+ SSE2_HDMTwo4x4 xmm0, xmm1, xmm2, xmm3, xmm4
+ pabsw xmm0, xmm0
+ pabsw xmm2, xmm2
+ pabsw xmm1, xmm1
+ pabsw xmm3, xmm3
+ movdqa xmm4, xmm3
+ pblendw xmm3, xmm1, 0xAA ; odd words from xmm1, even words stay from xmm3
+ pslld xmm1, 16 ; the complementary selection: even words of xmm1 ...
+ psrld xmm4, 16 ; ... and odd words of the saved xmm3
+ por xmm1, xmm4
+ pmaxuw xmm1, xmm3 ; keep the larger word of each pair
+ paddw xmm6, xmm1
+ movdqa xmm4, xmm0
+ pblendw xmm0, xmm2, 0xAA ; same pairing for xmm0/xmm2
+ pslld xmm2, 16
+ psrld xmm4, 16
+ por xmm2, xmm4
+ pmaxuw xmm0, xmm2
+ paddw xmm6, xmm0
+%endmacro
+
+%macro SSSE3_SumWHorizon 4 ;%1 = 32-bit result register, %2 = xmm holding eight 16-bit words to total (clobbered), %3/%4 = scratch xmm
+ MMX_DW_1_2REG %3, %4
+ pmaddwd %2, %3 ; multiply words by %3 and add adjacent pairs into four dwords (MMX_DW_1_2REG is defined outside this chunk; presumably it fills %3 with words of 1 -- TODO confirm)
+ movhlps %4, %2 ; low half of %4 = upper two dwords of %2
+ paddd %2, %4 ; dwords 0 and 1 now hold dword0+dword2 and dword1+dword3
+ pshuflw %4, %2,0Eh ; bring dword 1 down to dword 0
+ paddd %2, %4 ; dword 0 = total of all four dwords
+ movd %1, %2 ; hand the low dword back in %1
+%endmacro
+;***********************************************************************
+; 4x4 SATD between two blocks (SSE4.1). As used below: r0 = block 1, r1 = its row stride, r2 = block 2, r3 = its row stride.
+;int32_t WelsSampleSatd4x4_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
+; The 32-bit total is returned in retrd.
+;***********************************************************************
+WELS_EXTERN WelsSampleSatd4x4_sse41
+WelsSampleSatd4x4_sse41:
+ ; LOAD_4_PARA, SIGN_EXTENTION, HSwapSumSubDB1 and retrd are defined outside
+ ; this chunk (presumably asm_inc.asm / the data section) -- TODO confirm.
+ ; Each "shufps x,y,0" below packs two 4-byte rows into one register as
+ ; rowA,rowA,rowB,rowB, so xmm0/xmm1 hold block 1 and xmm2/xmm3 hold block 2.
+ ; xmm5 and xmm7 are used as scratch registers.
+
+ %assign push_num 0
+ LOAD_4_PARA
+ SIGN_EXTENTION r1, r1d
+ SIGN_EXTENTION r3, r3d
+ movdqa xmm4,[HSwapSumSubDB1] ; byte coefficients for the pmaddubsw steps
+ movd xmm2,[r2] ; block 2, row 0
+ movd xmm5,[r2+r3] ; block 2, row 1
+ shufps xmm2,xmm5,0 ; xmm2 = row0,row0,row1,row1
+ movd xmm3,[r2+r3*2] ; block 2, row 2
+ lea r2, [r3*2+r2] ; advance two rows
+ movd xmm5,[r2+r3] ; block 2, row 3
+ shufps xmm3,xmm5,0 ; xmm3 = row2,row2,row3,row3
+ movd xmm0,[r0] ; block 1, row 0
+ movd xmm5,[r0+r1] ; block 1, row 1
+ shufps xmm0,xmm5,0 ; xmm0 = row0,row0,row1,row1
+ movd xmm1,[r0+r1*2] ; block 1, row 2
+ lea r0, [r1*2+r0] ; advance two rows
+ movd xmm5,[r0+r1] ; block 1, row 3
+ shufps xmm1,xmm5,0 ; xmm1 = row2,row2,row3,row3
+ pmaddubsw xmm0,xmm4 ; unsigned pixels times signed coefficients, adjacent pairs added
+ pmaddubsw xmm1,xmm4
+ pmaddubsw xmm2,xmm4
+ pmaddubsw xmm3,xmm4
+ psubw xmm0,xmm2 ; block 1 minus block 2, rows 0-1
+ psubw xmm1,xmm3 ; block 1 minus block 2, rows 2-3
+ movdqa xmm2,xmm0 ; butterfly: xmm0 = sum, xmm1 = difference
+ paddw xmm0,xmm1
+ psubw xmm1,xmm2
+ movdqa xmm2,xmm0 ; regroup the low / high qwords of the sum and difference
+ punpcklqdq xmm0,xmm1
+ punpckhqdq xmm2,xmm1
+ movdqa xmm1,xmm0 ; second butterfly: xmm0 = sum, xmm2 = difference
+ paddw xmm0,xmm2
+ psubw xmm2,xmm1
+ movdqa xmm1,xmm0 ; keep the sum before blending
+ pblendw xmm0,xmm2,0AAh ; xmm0 = odd words of xmm2, even words of xmm0
+ pslld xmm2,16 ; even words of xmm2 moved to the odd positions
+ psrld xmm1,16 ; odd words of the old xmm0 moved to the even positions
+ por xmm2,xmm1 ; xmm2 = the complementary interleaving
+ pabsw xmm0,xmm0
+ pabsw xmm2,xmm2
+ pmaxsw xmm0,xmm2 ; per-word maximum of the two absolute values
+ SSSE3_SumWHorizon retrd, xmm0, xmm5, xmm7
+ LOAD_4_PARA_POP
+ ret
+
+;***********************************************************************
+; 8x8 SATD between two blocks (SSE4.1): two SSE41_GetSatd8x4 passes, totalled into retrd.
+;int32_t WelsSampleSatd8x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
+; As used below: r0 = block 1, r1 = its row stride, r2 = block 2, r3 = its row stride.
+;***********************************************************************
+WELS_EXTERN WelsSampleSatd8x8_sse41
+align 16
+WelsSampleSatd8x8_sse41:
+ ; r4 and r5 are saved only on X86_32; push_num records those two pushes
+ ; before LOAD_4_PARA runs (macro defined outside this chunk; presumably it
+ ; uses push_num to locate the stack arguments -- TODO confirm).
+ ; xmm6 accumulates the per-word partial sums, xmm7 holds HSumSubDB1 and is
+ ; reused as scratch by SSSE3_SumWHorizon.
+ ; NOTE(review): xmm6/xmm7 are overwritten without being saved; they are
+ ; callee-saved in the Microsoft x64 ABI -- confirm if WIN64 uses this code.
+%ifdef X86_32
+ push r4
+ push r5
+%endif
+ %assign push_num 2
+ LOAD_4_PARA
+ SIGN_EXTENTION r1, r1d
+ SIGN_EXTENTION r3, r3d
+ movdqa xmm7, [HSumSubDB1] ; coefficients consumed by SSE41_GetSatd8x4
+ lea r4, [r1+r1*2] ; r4 = 3 * stride of block 1
+ lea r5, [r3+r3*2] ; r5 = 3 * stride of block 2
+ pxor xmm6, xmm6 ; clear the accumulator
+ SSE41_GetSatd8x4
+ lea r0, [r0+4*r1] ; move both blocks down four rows
+ lea r2, [r2+4*r3]
+ SSE41_GetSatd8x4
+ SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
+ LOAD_4_PARA_POP
+%ifdef X86_32
+ pop r5
+ pop r4
+%endif
+ ret
+
+;***********************************************************************
+; 8x16 SATD between two blocks (SSE4.1): four SSE41_GetSatd8x4 passes in a loop, totalled into retrd.
+;int32_t WelsSampleSatd8x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
+; As used below: r0 = block 1, r1 = its row stride, r2 = block 2, r3 = its row stride.
+;***********************************************************************
+WELS_EXTERN WelsSampleSatd8x16_sse41
+align 16
+WelsSampleSatd8x16_sse41:
+ ; r4, r5 and r6 are saved only on X86_32; push_num records those three
+ ; pushes before LOAD_4_PARA runs (macro defined outside this chunk;
+ ; presumably it uses push_num to locate the stack arguments -- TODO confirm).
+ ; r4 / r5 = 3 * stride of block 1 / block 2, r6 = loop counter.
+ ; xmm6 accumulates the per-word partial sums, xmm7 holds HSumSubDB1 and is
+ ; reused as scratch by SSSE3_SumWHorizon.
+ ; NOTE(review): xmm6/xmm7 are overwritten without being saved; they are
+ ; callee-saved in the Microsoft x64 ABI -- confirm if WIN64 uses this code.
+ ;
+%ifdef X86_32
+ push r4
+ push r5
+ push r6
+%endif
+ %assign push_num 3
+ LOAD_4_PARA
+ SIGN_EXTENTION r1, r1d
+ SIGN_EXTENTION r3, r3d
+ movdqa xmm7, [HSumSubDB1] ; coefficients consumed by SSE41_GetSatd8x4
+ lea r4, [r1+r1*2] ; r4 = 3 * stride of block 1
+ lea r5, [r3+r3*2] ; r5 = 3 * stride of block 2
+ pxor xmm6, xmm6 ; clear the accumulator
+ mov r6, 0 ; four-row groups processed so far
+loop_get_satd_8x16:
+ SSE41_GetSatd8x4
+ lea r0, [r0+4*r1] ; move both blocks down four rows
+ lea r2, [r2+4*r3]
+ inc r6
+ cmp r6, 4 ; 4 groups x 4 rows = 16 rows
+ jl loop_get_satd_8x16
+ SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
+ LOAD_4_PARA_POP
+%ifdef X86_32
+ pop r6
+ pop r5
+ pop r4
+%endif
+ ret
+
+;***********************************************************************
+; 16x8 SATD between two blocks (SSE4.1): left 8 columns, then right 8 columns, two SSE41_GetSatd8x4 passes each.
+;int32_t WelsSampleSatd16x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
+; As used below: r0 = block 1, r1 = its row stride, r2 = block 2, r3 = its row stride; total returned in retrd.
+;***********************************************************************
+WELS_EXTERN WelsSampleSatd16x8_sse41
+align 16
+WelsSampleSatd16x8_sse41:
+ ; r4 and r5 are saved only on X86_32; push_num records those two pushes
+ ; before LOAD_4_PARA runs (macro defined outside this chunk; presumably it
+ ; uses push_num to locate the stack arguments -- TODO confirm).
+ ; xmm6 accumulates the per-word partial sums, xmm7 holds HSumSubDB1 and is
+ ; reused as scratch by SSSE3_SumWHorizon.
+ ; NOTE(review): xmm6/xmm7 are overwritten without being saved; they are
+ ; callee-saved in the Microsoft x64 ABI -- confirm if WIN64 uses this code.
+%ifdef X86_32
+ push r4
+ push r5
+%endif
+ %assign push_num 2
+ LOAD_4_PARA
+ SIGN_EXTENTION r1, r1d
+ SIGN_EXTENTION r3, r3d
+ push r0 ; remember the top-left corners for the right half
+ push r2
+
+ movdqa xmm7, [HSumSubDB1] ; coefficients consumed by SSE41_GetSatd8x4
+ lea r4, [r1+r1*2] ; r4 = 3 * stride of block 1
+ lea r5, [r3+r3*2] ; r5 = 3 * stride of block 2
+ pxor xmm6, xmm6 ; clear the accumulator
+ SSE41_GetSatd8x4
+ lea r0, [r0+4*r1] ; left half, rows 4-7
+ lea r2, [r2+4*r3]
+ SSE41_GetSatd8x4
+
+ pop r2 ; back to row 0 of both blocks
+ pop r0
+ ; the saved pointers replace re-reading the arguments from the stack;
+ ; now step 8 bytes right to cover columns 8-15
+ add r0, 8
+ add r2, 8
+ SSE41_GetSatd8x4
+ lea r0, [r0+4*r1] ; right half, rows 4-7
+ lea r2, [r2+4*r3]
+ SSE41_GetSatd8x4
+ SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
+ LOAD_4_PARA_POP
+%ifdef X86_32
+ pop r5
+ pop r4
+%endif
+ ret
+
+;***********************************************************************
+; 16x16 SATD between two blocks (SSE4.1): left 8 columns, then right 8 columns, four SSE41_GetSatd8x4 passes each.
+;int32_t WelsSampleSatd16x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t );
+; As used below: r0 = block 1, r1 = its row stride, r2 = block 2, r3 = its row stride; total returned in retrd.
+;***********************************************************************
+
+WELS_EXTERN WelsSampleSatd16x16_sse41
+align 16
+WelsSampleSatd16x16_sse41:
+ ; r4, r5 and r6 are saved only on X86_32; push_num records those three
+ ; pushes before LOAD_4_PARA runs (macro defined outside this chunk;
+ ; presumably it uses push_num to locate the stack arguments -- TODO confirm).
+ ; r4 / r5 = 3 * stride of block 1 / block 2, r6 = loop counter.
+ ; xmm6 accumulates the per-word partial sums, xmm7 holds HSumSubDB1 and is
+ ; reused as scratch by SSSE3_SumWHorizon.
+ ; NOTE(review): xmm6/xmm7 are overwritten without being saved; they are
+ ; callee-saved in the Microsoft x64 ABI -- confirm if WIN64 uses this code.
+ ;
+%ifdef X86_32
+ push r4
+ push r5
+ push r6
+%endif
+ %assign push_num 3
+ LOAD_4_PARA
+ SIGN_EXTENTION r1, r1d
+ SIGN_EXTENTION r3, r3d
+
+ push r0 ; remember the top-left corners for the right half
+ push r2
+
+ movdqa xmm7, [HSumSubDB1] ; coefficients consumed by SSE41_GetSatd8x4
+ lea r4, [r1+r1*2] ; r4 = 3 * stride of block 1
+ lea r5, [r3+r3*2] ; r5 = 3 * stride of block 2
+ pxor xmm6, xmm6 ; clear the accumulator
+ mov r6, 0 ; four-row groups processed so far (left half)
+loop_get_satd_16x16_left:
+ SSE41_GetSatd8x4
+ lea r0, [r0+4*r1] ; move both blocks down four rows
+ lea r2, [r2+4*r3]
+ inc r6
+ cmp r6, 4 ; 4 groups x 4 rows = 16 rows
+ jl loop_get_satd_16x16_left
+
+ pop r2 ; back to row 0 of both blocks
+ pop r0
+ ; the saved pointers replace re-reading the arguments from the stack;
+ ; now step 8 bytes right to cover columns 8-15
+ add r0, 8
+ add r2, 8
+ mov r6, 0 ; restart the counter for the right half
+loop_get_satd_16x16_right:
+ SSE41_GetSatd8x4
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ inc r6
+ cmp r6, 4
+ jl loop_get_satd_16x16_right
+ SSSE3_SumWHorizon retrd, xmm6, xmm5, xmm7
+ ; epilogue: undo LOAD_4_PARA, then the X86_32-only pushes in reverse order
+ LOAD_4_PARA_POP
+%ifdef X86_32
+ pop r6
+ pop r5
+ pop r4
+%endif
+ ret
+
+;***********************************************************************
+;
+;Pixel_satd_wxh_sse41 END
+;
+;***********************************************************************
+
+;***********************************************************************
+;
+;Pixel_sad_wxh_sse2 BEGIN
+;
+;***********************************************************************
+
+%macro SSE2_GetSad2x16 0 ; advance r0/r2 by two rows, then add the SAD of the next two 16-byte rows into xmm0 (overwrites xmm1, xmm2)
+ lea r0, [r0+2*r1] ; step both pointers down two rows first
+ lea r2, [r2+2*r3]
+ movdqu xmm1, [r2] ; r2 row, unaligned load
+ MOVDQ xmm2, [r0];[r0] is expected to be 16-byte aligned (original note; depends on how MOVDQ is defined outside this chunk -- TODO confirm)
+ psadbw xmm1, xmm2 ; two partial SADs, one per 8-byte half
+ paddw xmm0, xmm1 ; accumulate
+ movdqu xmm1, [r2+r3] ; following row
+ MOVDQ xmm2, [r0+r1]
+ psadbw xmm1, xmm2
+ paddw xmm0, xmm1 ; accumulate
+%endmacro
+
+
+%macro SSE2_GetSad4x16 0 ; add the SAD of four 16-byte rows at r0/r2 into xmm7; expects r4 = 3*r1 and r5 = 3*r3, overwrites xmm0-xmm2, leaves r0/r2 unchanged
+ movdqu xmm0, [r2] ; r2 row 0, unaligned load
+ MOVDQ xmm2, [r0]
+ psadbw xmm0, xmm2 ; two partial SADs, one per 8-byte half
+ paddw xmm7, xmm0 ; accumulate
+ movdqu xmm1, [r2+r3] ; row 1
+ MOVDQ xmm2, [r0+r1]
+ psadbw xmm1, xmm2
+ paddw xmm7, xmm1
+ movdqu xmm1, [r2+2*r3] ; row 2
+ MOVDQ xmm2, [r0+2*r1];[r0] is expected to be 16-byte aligned (original note; depends on how MOVDQ is defined outside this chunk -- TODO confirm)
+ psadbw xmm1, xmm2
+ paddw xmm7, xmm1
+ movdqu xmm1, [r2+r5] ; row 3 (r5 = 3*r3, set by the caller)
+ MOVDQ xmm2, [r0+r4]
+ psadbw xmm1, xmm2
+ paddw xmm7, xmm1
+%endmacro
+
+
+%macro SSE2_GetSad8x4 0 ; add the SAD of four 8-byte rows at r0/r2 into xmm6; overwrites xmm0-xmm3 and leaves r0/r2 advanced by two rows
+ movq xmm0, [r0] ; r0 row 0 -> low half of xmm0
+ movq xmm1, [r0+r1] ; r0 row 1 -> low half of xmm1
+ lea r0, [r0+2*r1] ; step down two rows
+ movhps xmm0, [r0] ; r0 row 2 -> high half of xmm0
+ movhps xmm1, [r0+r1] ; r0 row 3 -> high half of xmm1
+
+ movq xmm2, [r2] ; same layout for the r2 block
+ movq xmm3, [r2+r3]
+ lea r2, [r2+2*r3]
+ movhps xmm2, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm2 ; SAD of rows 0 and 2
+ psadbw xmm1, xmm3 ; SAD of rows 1 and 3
+ paddw xmm6, xmm0 ; accumulate both
+ paddw xmm6, xmm1
+%endmacro
+
+;***********************************************************************
+; 16x16 SAD between two blocks (SSE2): four SSE2_GetSad4x16 passes, total returned in retrd.
+;int32_t WelsSampleSad16x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t )
+;Original note: the first parameter can be aligned to 16 bytes (it is read with MOVDQ);
+;in wels the third parameter cannot be assumed 16-byte aligned (it is read with movdqu).
+; As used below: r0 = block 1, r1 = its row stride, r2 = block 2, r3 = its row stride.
+;***********************************************************************
+WELS_EXTERN WelsSampleSad16x16_sse2
+align 16
+WelsSampleSad16x16_sse2:
+ ; r4 and r5 are saved only on X86_32; push_num records those two pushes
+ ; before LOAD_4_PARA runs (macro defined outside this chunk; presumably it
+ ; uses push_num to locate the stack arguments -- TODO confirm).
+ ; r4 / r5 = 3 * stride of block 1 / block 2.
+ ; xmm7 accumulates two partial sums, one per 8-byte half, folded at the end.
+ ; NOTE(review): xmm7 is overwritten without being saved; it is callee-saved
+ ; in the Microsoft x64 ABI -- confirm if WIN64 uses this code.
+ ;
+%ifdef X86_32
+ push r4
+ push r5
+%endif
+
+ %assign push_num 2
+ LOAD_4_PARA
+ SIGN_EXTENTION r1, r1d
+ SIGN_EXTENTION r3, r3d
+ lea r4, [3*r1] ; r4 = 3 * stride of block 1
+ lea r5, [3*r3] ; r5 = 3 * stride of block 2
+
+ pxor xmm7, xmm7 ; clear the accumulator
+ SSE2_GetSad4x16
+ lea r0, [r0+4*r1] ; rows 4-7
+ lea r2, [r2+4*r3]
+ SSE2_GetSad4x16
+ lea r0, [r0+4*r1] ; rows 8-11
+ lea r2, [r2+4*r3]
+ SSE2_GetSad4x16
+ lea r0, [r0+4*r1] ; rows 12-15
+ lea r2, [r2+4*r3]
+ SSE2_GetSad4x16
+ movhlps xmm0, xmm7 ; upper partial sum down to the low qword
+ paddw xmm0, xmm7 ; low word = sum of both halves
+ movd retrd, xmm0 ; return the total
+ LOAD_4_PARA_POP
+%ifdef X86_32
+ pop r5
+ pop r4
+%endif
+ ret
+
+;***********************************************************************
+; 16x8 SAD between two blocks (SSE2): rows 0-1 inline, then three SSE2_GetSad2x16 passes; total returned in retrd.
+;int32_t WelsSampleSad16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t )
+;Original note: the first parameter can be aligned to 16 bytes (it is read with MOVDQ);
+;in wels the third parameter cannot be assumed 16-byte aligned (it is read with movdqu).
+; As used below: r0 = block 1, r1 = its row stride, r2 = block 2, r3 = its row stride.
+;***********************************************************************
+WELS_EXTERN WelsSampleSad16x8_sse2
+align 16
+WelsSampleSad16x8_sse2:
+ ; Only r0-r3 and xmm0-xmm2 are used, so nothing is pushed before
+ ; LOAD_4_PARA (push_num = 0). LOAD_4_PARA, SIGN_EXTENTION, MOVDQ and retrd
+ ; are defined outside this chunk (presumably asm_inc.asm) -- TODO confirm.
+ ; xmm0 accumulates two partial sums, one per 8-byte half, folded at the
+ ; end with movhlps / paddw.
+
+ %assign push_num 0
+ LOAD_4_PARA
+ SIGN_EXTENTION r1, r1d
+ SIGN_EXTENTION r3, r3d
+ movdqu xmm0, [r2] ; block 2 row 0, unaligned load
+ MOVDQ xmm2, [r0]
+ psadbw xmm0, xmm2 ; row 0 SAD starts the accumulator
+ movdqu xmm1, [r2+r3] ; row 1
+ MOVDQ xmm2, [r0+r1]
+ psadbw xmm1, xmm2
+ paddw xmm0, xmm1
+
+ SSE2_GetSad2x16
+ SSE2_GetSad2x16
+ SSE2_GetSad2x16
+
+ movhlps xmm1, xmm0 ; upper partial sum down to the low qword
+ paddw xmm0, xmm1 ; low word = sum of both halves
+ movd retrd, xmm0 ; return the total
+ LOAD_4_PARA_POP
+ ret
+
+
+
+WELS_EXTERN WelsSampleSad8x16_sse2
+WelsSampleSad8x16_sse2:
+ ; 8x16 SAD between two blocks (SSE2): four SSE2_GetSad8x4 passes, total
+ ; returned in retrd. As used below: r0 = block 1, r1 = its row stride,
+ ; r2 = block 2, r3 = its row stride.
+ ; NOTE(review): xmm6 is overwritten without being saved; it is callee-saved
+ ; in the Microsoft x64 ABI -- confirm if WIN64 uses this code.
+
+ %assign push_num 0
+ LOAD_4_PARA
+ SIGN_EXTENTION r1, r1d
+ SIGN_EXTENTION r3, r3d
+ pxor xmm6, xmm6 ; clear the accumulator
+
+ SSE2_GetSad8x4
+ lea r0, [r0+2*r1] ; the macro already advanced two rows; add two more
+ lea r2, [r2+2*r3]
+ SSE2_GetSad8x4
+ lea r0, [r0+2*r1] ; rows 8-11 next
+ lea r2, [r2+2*r3]
+ SSE2_GetSad8x4
+ lea r0, [r0+2*r1] ; rows 12-15 next
+ lea r2, [r2+2*r3]
+ SSE2_GetSad8x4
+
+ movhlps xmm0, xmm6 ; upper partial sum down to the low qword
+ paddw xmm0, xmm6 ; low word = sum of both halves
+ movd retrd, xmm0 ; return the total
+ LOAD_4_PARA_POP
+ ret
+
+
+%macro CACHE_SPLIT_CHECK 3 ; %1 = address register (destroyed), %2 = access width in bytes, %3 = cache line size; leaves flags for the caller's branch
+and %1, 0x1f|(%3>>1) ; keep only the low address bits; for a 64-byte line the mask is 0x3f
+cmp %1, (32-%2)|(%3>>1) ; for width 8 and a 64-byte line this compares the offset against 56
+%endmacro
+
+WELS_EXTERN WelsSampleSad8x8_sse21
+WelsSampleSad8x8_sse21:
+ ; 8x8 SAD between two blocks, total returned in retrd.
+ ; The third argument (block 2 pointer) is tested with CACHE_SPLIT_CHECK:
+ ; when (pointer & 0x3f) <= 56 the plain path at .pixel_sad_8x8_nsplit runs.
+ ; Otherwise every 8-byte row of block 2 is rebuilt from two loads at
+ ; 8-byte-aligned addresses (r2 rounded down, and r2 + 8) combined with
+ ; psrlq / psllq / por, so no single load straddles the 64-byte boundary.
+ ; NOTE(review): xmm6/xmm7 are overwritten without being saved; they are
+ ; callee-saved in the Microsoft x64 ABI -- confirm if WIN64 uses this code.
+
+ %assign push_num 0
+ mov r2, arg3 ; block 2 pointer
+ push r2 ; CACHE_SPLIT_CHECK destroys its register argument
+ CACHE_SPLIT_CHECK r2, 8, 64
+ jle near .pixel_sad_8x8_nsplit ; the saved r2 is popped on that path too
+ pop r2 ; restore the full pointer
+%ifdef X86_32
+ push r3
+ push r4
+ push r5
+%endif
+ %assign push_num 3
+ mov r0, arg1 ; block 1 pointer
+ mov r1, arg2 ; block 1 row stride
+ SIGN_EXTENTION r1, r1d
+ pxor xmm7, xmm7 ; clear the accumulator
+
+ ; r5 = misalignment of r2 within 8 bytes, r4 = 8 - r5, r2 rounded down to 8
+
+ mov r5, r2
+ and r5, 0x07 ; bytes past the previous 8-byte boundary
+ sub r2, r5 ; r2 = aligned address at or below the block
+ mov r4, 8
+ sub r4, r5 ; bytes taken from the first aligned load
+
+ shl r5, 3 ; convert both byte counts to bit counts
+ shl r4, 3
+ movd xmm5, r5d ; right-shift count for the lower load
+ movd xmm6, r4d ; left-shift count for the upper load
+ mov r5, 8
+ add r5, r2 ; r5 = next aligned 8-byte address
+ mov r3, arg4 ; block 2 row stride
+ SIGN_EXTENTION r3, r3d
+ movq xmm0, [r0] ; block 1 rows 0-1
+ movhps xmm0, [r0+r1]
+
+ movq xmm1, [r2] ; lower aligned qwords of block 2 rows 0-1
+ movq xmm2, [r5] ; upper aligned qwords of block 2 rows 0-1
+ movhps xmm1, [r2+r3]
+ movhps xmm2, [r5+r3]
+ psrlq xmm1, xmm5 ; drop the bytes before the block start
+ psllq xmm2, xmm6 ; move the remaining bytes into the upper positions
+ por xmm1, xmm2 ; xmm1 = the two unaligned 8-byte rows
+
+ psadbw xmm0, xmm1
+ paddw xmm7, xmm0 ; accumulate
+
+ lea r0, [r0+2*r1] ; rows 2-3
+ lea r2, [r2+2*r3]
+ lea r5, [r5+2*r3]
+
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+
+ movq xmm1, [r2]
+ movq xmm2, [r5]
+ movhps xmm1, [r2+r3]
+ movhps xmm2, [r5+r3]
+ psrlq xmm1, xmm5
+ psllq xmm2, xmm6
+ por xmm1, xmm2
+
+ psadbw xmm0, xmm1
+ paddw xmm7, xmm0
+
+ lea r0, [r0+2*r1] ; rows 4-5
+ lea r2, [r2+2*r3]
+ lea r5, [r5+2*r3]
+
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+
+ movq xmm1, [r2]
+ movq xmm2, [r5]
+ movhps xmm1, [r2+r3]
+ movhps xmm2, [r5+r3]
+ psrlq xmm1, xmm5
+ psllq xmm2, xmm6
+ por xmm1, xmm2
+
+ psadbw xmm0, xmm1
+ paddw xmm7, xmm0
+
+ lea r0, [r0+2*r1] ; rows 6-7
+ lea r2, [r2+2*r3]
+ lea r5, [r5+2*r3]
+
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+
+ movq xmm1, [r2]
+ movq xmm2, [r5]
+ movhps xmm1, [r2+r3]
+ movhps xmm2, [r5+r3]
+ psrlq xmm1, xmm5
+ psllq xmm2, xmm6
+ por xmm1, xmm2
+
+ psadbw xmm0, xmm1
+ paddw xmm7, xmm0
+
+ movhlps xmm0, xmm7 ; upper partial sum down to the low qword
+ paddw xmm0, xmm7 ; low word = sum of both halves
+ movd retrd, xmm0 ; return the total
+%ifdef X86_32
+ pop r5
+ pop r4
+ pop r3
+%endif
+ jmp .return
+
+.pixel_sad_8x8_nsplit:
+ ; Plain path: discard the copy of r2 pushed before CACHE_SPLIT_CHECK,
+ ; reload all four arguments and run two SSE2_GetSad8x4 passes.
+ ; LOAD_4_PARA / LOAD_4_PARA_POP are defined outside this chunk
+ ; (presumably asm_inc.asm) -- TODO confirm.
+
+ pop r2
+ %assign push_num 0
+ LOAD_4_PARA
+ SIGN_EXTENTION r1, r1d
+ SIGN_EXTENTION r3, r3d
+ pxor xmm6, xmm6 ; clear the accumulator
+ SSE2_GetSad8x4
+ lea r0, [r0+2*r1] ; the macro already advanced two rows; add two more
+ lea r2, [r2+2*r3]
+ SSE2_GetSad8x4
+ movhlps xmm0, xmm6 ; upper partial sum down to the low qword
+ paddw xmm0, xmm6 ; low word = sum of both halves
+ movd retrd, xmm0 ; return the total
+ LOAD_4_PARA_POP
+.return:
+ ret
+
+
+;***********************************************************************
+;
+;Pixel_sad_wxh_sse2 END
+;
+;***********************************************************************
+
+
+;***********************************************************************
+;
+;Pixel_sad_4_wxh_sse2 BEGIN
+;
+;***********************************************************************
+
+
+%macro SSE2_Get4LW16Sad 5 ;%1 = source row above (s-1l), %2 = source row (s), %3 = source row below (s+1l), %4 = reference row d (overwritten), %5 = address of d; %1 is overwritten too
+ psadbw %1, %4 ; row above vs. d ...
+ paddw xmm5, %1 ; ... goes to the +stride accumulator
+ psadbw %4, %3 ; row below vs. d ...
+ paddw xmm4, %4 ; ... goes to the -stride accumulator
+ movdqu %4, [%5-1] ; d shifted one byte left
+ psadbw %4, %2
+ paddw xmm6, %4 ; -1 accumulator
+ movdqu %4, [%5+1] ; d shifted one byte right
+ psadbw %4, %2
+ paddw xmm7, %4 ; +1 accumulator
+%endmacro
+WELS_EXTERN WelsSampleSadFour16x16_sse2
+WelsSampleSadFour16x16_sse2:
+ ; Four 16x16 SADs of the source block (r0, stride r1) against the reference
+ ; block (r2, stride r3) displaced by -stride, +stride, -1 and +1.
+ ; The four 32-bit results are stored in that order at [r4] with movdqa, so
+ ; r4 must be 16-byte aligned; the source rows are also read with movdqa.
+ ; NOTE(review): xmm6/xmm7 are overwritten without being saved (Win64 ABI).
+
+ %assign push_num 0
+ LOAD_5_PARA
+ SIGN_EXTENTION r1, r1d
+ SIGN_EXTENTION r3, r3d
+ pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
+ pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref
+ pxor xmm6, xmm6 ;sad pRefMb-1
+ pxor xmm7, xmm7 ;sad pRefMb+1
+ movdqa xmm0, [r0] ; source row 0
+ sub r2, r3 ; r2 = reference row -1
+ movdqu xmm3, [r2]
+ psadbw xmm3, xmm0 ; source row 0 vs. reference row -1
+ paddw xmm4, xmm3
+
+ movdqa xmm1, [r0+r1] ; source row 1
+ movdqu xmm3, [r2+r3] ; reference row 0
+ psadbw xmm3, xmm1 ; source row 1 vs. reference row 0
+ paddw xmm4, xmm3
+
+ movdqu xmm2, [r2+r3-1] ; reference row 0 shifted left
+ psadbw xmm2, xmm0
+ paddw xmm6, xmm2
+
+ movdqu xmm3, [r2+r3+1] ; reference row 0 shifted right
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
+
+ lea r0, [r0+2*r1] ; source row 2
+ lea r2, [r2+2*r3] ; reference row 1
+ movdqa xmm2, [r0]
+ movdqu xmm3, [r2]
+ SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
+ movdqa xmm0, [r0+r1] ; xmm0/xmm1/xmm2 rotate as the three-row source window
+ movdqu xmm3, [r2+r3]
+ SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
+ lea r0, [r0+2*r1] ; source row 4
+ lea r2, [r2+2*r3]
+ movdqa xmm1, [r0]
+ movdqu xmm3, [r2]
+ SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
+ movdqa xmm2, [r0+r1]
+ movdqu xmm3, [r2+r3]
+ SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
+ lea r0, [r0+2*r1] ; source row 6
+ lea r2, [r2+2*r3]
+ movdqa xmm0, [r0]
+ movdqu xmm3, [r2]
+ SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
+ movdqa xmm1, [r0+r1]
+ movdqu xmm3, [r2+r3]
+ SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
+ lea r0, [r0+2*r1] ; source row 8
+ lea r2, [r2+2*r3]
+ movdqa xmm2, [r0]
+ movdqu xmm3, [r2]
+ SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
+ movdqa xmm0, [r0+r1]
+ movdqu xmm3, [r2+r3]
+ SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
+ lea r0, [r0+2*r1] ; source row 10
+ lea r2, [r2+2*r3]
+ movdqa xmm1, [r0]
+ movdqu xmm3, [r2]
+ SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
+ movdqa xmm2, [r0+r1]
+ movdqu xmm3, [r2+r3]
+ SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
+ lea r0, [r0+2*r1] ; source row 12
+ lea r2, [r2+2*r3]
+ movdqa xmm0, [r0]
+ movdqu xmm3, [r2]
+ SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
+ movdqa xmm1, [r0+r1]
+ movdqu xmm3, [r2+r3]
+ SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
+ lea r0, [r0+2*r1] ; source row 14
+ lea r2, [r2+2*r3]
+ movdqa xmm2, [r0]
+ movdqu xmm3, [r2]
+ SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
+ movdqa xmm0, [r0+r1] ; source row 15 (last)
+ movdqu xmm3, [r2+r3]
+ SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
+ lea r2, [r2+2*r3] ; reference row 15
+ movdqu xmm3, [r2]
+ psadbw xmm2, xmm3 ; source row 14 vs. reference row 15
+ paddw xmm5, xmm2
+
+ movdqu xmm2, [r2-1] ; reference row 15 shifted left vs. source row 15
+ psadbw xmm2, xmm0
+ paddw xmm6, xmm2
+
+ movdqu xmm3, [r2+1] ; reference row 15 shifted right vs. source row 15
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
+
+ movdqu xmm3, [r2+r3] ; reference row 16 vs. source row 15
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
+
+ ; fold the two partial sums of each accumulator, then pack the four totals
+ movhlps xmm0, xmm4
+ paddw xmm4, xmm0
+ movhlps xmm0, xmm5
+ paddw xmm5, xmm0
+ movhlps xmm0, xmm6
+ paddw xmm6, xmm0
+ movhlps xmm0, xmm7
+ paddw xmm7, xmm0
+ punpckldq xmm4, xmm5 ; dwords 0,1 = -stride, +stride
+ punpckldq xmm6, xmm7 ; dwords 0,1 = -1, +1
+ punpcklqdq xmm4, xmm6 ; -stride, +stride, -1, +1
+ movdqa [r4],xmm4 ; store the four results through the fifth argument
+ LOAD_5_PARA_POP
+ ret
+
+
+WELS_EXTERN WelsSampleSadFour16x8_sse2
+WelsSampleSadFour16x8_sse2:
+ ; Four 16x8 SADs of the source block (r0, stride r1) against the reference
+ ; block (r2, stride r3) displaced by -stride, +stride, -1 and +1.
+ ; The four 32-bit results are stored in that order at [r4] with movdqa, so
+ ; r4 must be 16-byte aligned; the source rows are also read with movdqa.
+ ; NOTE(review): xmm6/xmm7 are overwritten without being saved; they are
+ ; callee-saved in the Microsoft x64 ABI -- confirm if WIN64 uses this code.
+
+ %assign push_num 0
+ LOAD_5_PARA
+ SIGN_EXTENTION r1, r1d
+ SIGN_EXTENTION r3, r3d
+ pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
+ pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref
+ pxor xmm6, xmm6 ;sad pRefMb-1
+ pxor xmm7, xmm7 ;sad pRefMb+1
+ movdqa xmm0, [r0] ; source row 0
+ sub r2, r3 ; r2 = reference row -1
+ movdqu xmm3, [r2]
+ psadbw xmm3, xmm0 ; source row 0 vs. reference row -1
+ paddw xmm4, xmm3
+
+ movdqa xmm1, [r0+r1] ; source row 1
+ movdqu xmm3, [r2+r3] ; reference row 0
+ psadbw xmm3, xmm1 ; source row 1 vs. reference row 0
+ paddw xmm4, xmm3
+
+ movdqu xmm2, [r2+r3-1] ; reference row 0 shifted left
+ psadbw xmm2, xmm0
+ paddw xmm6, xmm2
+
+ movdqu xmm3, [r2+r3+1] ; reference row 0 shifted right
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
+
+ lea r0, [r0+2*r1] ; source row 2
+ lea r2, [r2+2*r3] ; reference row 1
+ movdqa xmm2, [r0]
+ movdqu xmm3, [r2]
+ SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2
+ movdqa xmm0, [r0+r1] ; xmm0/xmm1/xmm2 rotate as the three-row source window
+ movdqu xmm3, [r2+r3]
+ SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2+r3
+ lea r0, [r0+2*r1] ; source row 4
+ lea r2, [r2+2*r3]
+ movdqa xmm1, [r0]
+ movdqu xmm3, [r2]
+ SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2
+ movdqa xmm2, [r0+r1]
+ movdqu xmm3, [r2+r3]
+ SSE2_Get4LW16Sad xmm0, xmm1, xmm2, xmm3, r2+r3
+ lea r0, [r0+2*r1] ; source row 6
+ lea r2, [r2+2*r3]
+ movdqa xmm0, [r0]
+ movdqu xmm3, [r2]
+ SSE2_Get4LW16Sad xmm1, xmm2, xmm0, xmm3, r2
+ movdqa xmm1, [r0+r1] ; source row 7 (last)
+ movdqu xmm3, [r2+r3]
+ SSE2_Get4LW16Sad xmm2, xmm0, xmm1, xmm3, r2+r3
+ lea r2, [r2+2*r3] ; reference row 7
+ movdqu xmm3, [r2]
+ psadbw xmm0, xmm3 ; source row 6 vs. reference row 7
+ paddw xmm5, xmm0
+
+ movdqu xmm0, [r2-1] ; reference row 7 shifted left vs. source row 7
+ psadbw xmm0, xmm1
+ paddw xmm6, xmm0
+
+ movdqu xmm3, [r2+1] ; reference row 7 shifted right vs. source row 7
+ psadbw xmm3, xmm1
+ paddw xmm7, xmm3
+
+ movdqu xmm3, [r2+r3] ; reference row 8 vs. source row 7
+ psadbw xmm1, xmm3
+ paddw xmm5, xmm1
+
+ ; fold the two partial sums of each accumulator, then pack the four totals
+ movhlps xmm0, xmm4
+ paddw xmm4, xmm0
+ movhlps xmm0, xmm5
+ paddw xmm5, xmm0
+ movhlps xmm0, xmm6
+ paddw xmm6, xmm0
+ movhlps xmm0, xmm7
+ paddw xmm7, xmm0
+ punpckldq xmm4, xmm5 ; dwords 0,1 = -stride, +stride
+ punpckldq xmm6, xmm7 ; dwords 0,1 = -1, +1
+ punpcklqdq xmm4, xmm6 ; -stride, +stride, -1, +1
+ movdqa [r4],xmm4 ; store the four results through the fifth argument
+ LOAD_5_PARA_POP
+ ret
+
+WELS_EXTERN WelsSampleSadFour8x16_sse2
+WelsSampleSadFour8x16_sse2:
+ ; Four 8x16 SADs of the source block (r0, stride r1) against the reference
+ ; block (r2, stride r3) displaced by -stride, +stride, -1 and +1.
+ ; Rows are handled in pairs: the low / high qword of a register hold two
+ ; consecutive 8-byte rows. Results go to [r4] (movdqa, 16-byte aligned) in
+ ; the order -stride, +stride, -1, +1.
+ ; NOTE(review): xmm6/xmm7 are overwritten without being saved (Win64 ABI).
+
+ %assign push_num 0
+ LOAD_5_PARA
+ SIGN_EXTENTION r1, r1d
+ SIGN_EXTENTION r3, r3d
+ pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
+ pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref
+ pxor xmm6, xmm6 ;sad pRefMb-1
+ pxor xmm7, xmm7 ;sad pRefMb+1
+ movq xmm0, [r0] ; source rows 0-1
+ movhps xmm0, [r0+r1]
+ sub r2, r3 ; r2 = reference row -1
+ movq xmm3, [r2] ; reference rows -1 and 0
+ movhps xmm3, [r2+r3]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3 ; -stride accumulator
+
+ movq xmm1, [r2+r3-1] ; reference row 0 shifted left
+ movq xmm3, [r2+r3+1] ; reference row 0 shifted right
+
+ lea r0, [r0+2*r1] ; source row 2
+ lea r2, [r2+2*r3] ; reference row 1
+ movhps xmm1, [r2-1] ; reference row 1 shifted left
+ movhps xmm3, [r2+1] ; reference row 1 shifted right
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1 ; -1 accumulator
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3 ; +1 accumulator
+
+ movq xmm3, [r2] ; reference rows 1 and 2
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3 ; source rows 0-1 vs. the rows below them
+ paddw xmm5, xmm0 ; +stride accumulator
+
+ movq xmm0, [r0] ; source rows 2-3
+ movhps xmm0, [r0+r1]
+ psadbw xmm3, xmm0 ; reuse reference rows 1-2 as the rows above
+ paddw xmm4, xmm3
+
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
+
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
+
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
+
+ movq xmm0, [r0] ; source rows 4-5
+ movhps xmm0, [r0+r1]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
+
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
+
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
+
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
+
+ movq xmm0, [r0] ; source rows 6-7
+ movhps xmm0, [r0+r1]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
+
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
+
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
+
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
+
+ movq xmm0, [r0] ; source rows 8-9
+ movhps xmm0, [r0+r1]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
+
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
+
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
+
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
+
+ movq xmm0, [r0] ; source rows 10-11
+ movhps xmm0, [r0+r1]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
+
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
+
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
+
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
+
+ movq xmm0, [r0] ; source rows 12-13
+ movhps xmm0, [r0+r1]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
+
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
+
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
+
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
+
+ movq xmm0, [r0] ; source rows 14-15 (last pair)
+ movhps xmm0, [r0+r1]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
+
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
+
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
+
+ movq xmm3, [r2] ; reference rows 15 and 16
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
+
+ ; fold the two partial sums of each accumulator, then pack the four totals
+ movhlps xmm0, xmm4
+ paddw xmm4, xmm0
+ movhlps xmm0, xmm5
+ paddw xmm5, xmm0
+ movhlps xmm0, xmm6
+ paddw xmm6, xmm0
+ movhlps xmm0, xmm7
+ paddw xmm7, xmm0
+ punpckldq xmm4, xmm5 ; dwords 0,1 = -stride, +stride
+ punpckldq xmm6, xmm7 ; dwords 0,1 = -1, +1
+ punpcklqdq xmm4, xmm6 ; -stride, +stride, -1, +1
+ movdqa [r4],xmm4 ; store the four results through the fifth argument
+ LOAD_5_PARA_POP
+ ret
+
+
+WELS_EXTERN WelsSampleSadFour8x8_sse2
+WelsSampleSadFour8x8_sse2:
+ ; Four 8x8 SADs of the source block (r0, stride r1) against the reference
+ ; block (r2, stride r3) displaced by -stride, +stride, -1 and +1.
+ ; Rows are handled in pairs: the low / high qword of a register hold two
+ ; consecutive 8-byte rows. Results go to [r4] (movdqa, 16-byte aligned) in
+ ; the order -stride, +stride, -1, +1.
+ ; NOTE(review): xmm6/xmm7 are overwritten without being saved (Win64 ABI).
+
+ %assign push_num 0
+ LOAD_5_PARA
+ SIGN_EXTENTION r1, r1d
+ SIGN_EXTENTION r3, r3d
+ pxor xmm4, xmm4 ;sad pRefMb-i_stride_ref
+ pxor xmm5, xmm5 ;sad pRefMb+i_stride_ref
+ pxor xmm6, xmm6 ;sad pRefMb-1
+ pxor xmm7, xmm7 ;sad pRefMb+1
+ movq xmm0, [r0] ; source rows 0-1
+ movhps xmm0, [r0+r1]
+ sub r2, r3 ; r2 = reference row -1
+ movq xmm3, [r2] ; reference rows -1 and 0
+ movhps xmm3, [r2+r3]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3 ; -stride accumulator
+
+ movq xmm1, [r2+r3-1] ; reference row 0 shifted left
+ movq xmm3, [r2+r3+1] ; reference row 0 shifted right
+
+ lea r0, [r0+2*r1] ; source row 2
+ lea r2, [r2+2*r3] ; reference row 1
+ movhps xmm1, [r2-1] ; reference row 1 shifted left
+ movhps xmm3, [r2+1] ; reference row 1 shifted right
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1 ; -1 accumulator
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3 ; +1 accumulator
+
+ movq xmm3, [r2] ; reference rows 1 and 2
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3 ; source rows 0-1 vs. the rows below them
+ paddw xmm5, xmm0 ; +stride accumulator
+
+ movq xmm0, [r0] ; source rows 2-3
+ movhps xmm0, [r0+r1]
+ psadbw xmm3, xmm0 ; reuse reference rows 1-2 as the rows above
+ paddw xmm4, xmm3
+
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
+
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
+
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
+
+ movq xmm0, [r0] ; source rows 4-5
+ movhps xmm0, [r0+r1]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
+
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
+
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
+
+ movq xmm3, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
+
+ movq xmm0, [r0] ; source rows 6-7 (last pair)
+ movhps xmm0, [r0+r1]
+ psadbw xmm3, xmm0
+ paddw xmm4, xmm3
+
+
+ movq xmm1, [r2+r3-1]
+ movq xmm3, [r2+r3+1]
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ movhps xmm1, [r2-1]
+ movhps xmm3, [r2+1]
+
+ psadbw xmm1, xmm0
+ paddw xmm6, xmm1
+ psadbw xmm3, xmm0
+ paddw xmm7, xmm3
+
+ movq xmm3, [r2] ; reference rows 7 and 8
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm3
+ paddw xmm5, xmm0
+
+ ; fold the two partial sums of each accumulator, then pack the four totals
+ movhlps xmm0, xmm4
+ paddw xmm4, xmm0
+ movhlps xmm0, xmm5
+ paddw xmm5, xmm0
+ movhlps xmm0, xmm6
+ paddw xmm6, xmm0
+ movhlps xmm0, xmm7
+ paddw xmm7, xmm0
+ punpckldq xmm4, xmm5 ; dwords 0,1 = -stride, +stride
+ punpckldq xmm6, xmm7 ; dwords 0,1 = -1, +1
+ punpcklqdq xmm4, xmm6 ; -stride, +stride, -1, +1
+ movdqa [r4],xmm4 ; store the four results through the fifth argument
+ LOAD_5_PARA_POP
+ ret
+
+WELS_EXTERN WelsSampleSadFour4x4_sse2
+WelsSampleSadFour4x4_sse2:
+ ; Four 4x4 SADs of the source block (r0, stride r1) against the reference
+ ; block (r2, stride r3) displaced by -stride, +stride, -1 and +1.
+ ; Each 4x4 block is packed into one xmm register (four 4-byte rows), so a
+ ; single psadbw per displacement is enough. Results go to [r4] (movdqa,
+ ; 16-byte aligned) in the order -stride, +stride, -1, +1.
+ ; NOTE(review): xmm6/xmm7 are overwritten without being saved (Win64 ABI).
+
+ %assign push_num 0
+ LOAD_5_PARA
+ SIGN_EXTENTION r1, r1d
+ SIGN_EXTENTION r3, r3d
+ movd xmm0, [r0] ; source rows 0-3, gathered into xmm0
+ movd xmm1, [r0+r1]
+ lea r0, [r0+2*r1]
+ movd xmm2, [r0]
+ movd xmm3, [r0+r1]
+ punpckldq xmm0, xmm1
+ punpckldq xmm2, xmm3
+ punpcklqdq xmm0, xmm2 ; xmm0 = source rows 0,1,2,3
+ sub r2, r3 ; r2 = reference row -1
+ movd xmm1, [r2] ; reference rows -1 and 0
+ movd xmm2, [r2+r3]
+ punpckldq xmm1, xmm2
+ movd xmm2, [r2+r3-1] ; reference row 0 shifted left
+ movd xmm3, [r2+r3+1] ; reference row 0 shifted right
+
+ lea r2, [r2+2*r3] ; r2 = reference row 1
+
+ movd xmm4, [r2] ; reference row 1
+ movd xmm5, [r2-1] ; reference row 1 shifted left
+ punpckldq xmm2, xmm5
+ movd xmm5, [r2+1] ; reference row 1 shifted right
+ punpckldq xmm3, xmm5
+
+ movd xmm5, [r2+r3] ; reference row 2
+ punpckldq xmm4, xmm5 ; xmm4 = reference rows 1,2
+
+ punpcklqdq xmm1, xmm4 ;-L
+
+ movd xmm5, [r2+r3-1] ; reference row 2 shifted left
+ movd xmm6, [r2+r3+1] ; reference row 2 shifted right
+
+ lea r2, [r2+2*r3] ; r2 = reference row 3
+ movd xmm7, [r2-1] ; reference row 3 shifted left
+ punpckldq xmm5, xmm7
+ punpcklqdq xmm2, xmm5 ;-1
+ movd xmm7, [r2+1] ; reference row 3 shifted right
+ punpckldq xmm6, xmm7
+ punpcklqdq xmm3, xmm6 ;+1
+ movd xmm6, [r2] ; reference rows 3 and 4
+ movd xmm7, [r2+r3]
+ punpckldq xmm6, xmm7
+ punpcklqdq xmm4, xmm6 ;+L
+ psadbw xmm1, xmm0 ; one SAD per displacement, two partial sums each
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+ psadbw xmm4, xmm0
+
+ movhlps xmm0, xmm1 ; fold the two partial sums of each result
+ paddw xmm1, xmm0
+ movhlps xmm0, xmm2
+ paddw xmm2, xmm0
+ movhlps xmm0, xmm3
+ paddw xmm3, xmm0
+ movhlps xmm0, xmm4
+ paddw xmm4, xmm0
+ ; pack the four totals and store them through the fifth argument (r4)
+ punpckldq xmm1, xmm4 ; dwords 0,1 = -stride, +stride
+ punpckldq xmm2, xmm3 ; dwords 0,1 = -1, +1
+ punpcklqdq xmm1, xmm2 ; -stride, +stride, -1, +1
+ movdqa [r4],xmm1
+ LOAD_5_PARA_POP
+ ret
+
+;***********************************************************************
+;
+;Pixel_sad_4_wxh_sse2 END
+;
+;***********************************************************************
+
+WELS_EXTERN WelsSampleSad4x4_mmx
+
+align 16
+;***********************************************************************
+; int32_t __cdecl WelsSampleSad4x4_mmx (uint8_t *, int32_t, uint8_t *, int32_t )
+;***********************************************************************
+WelsSampleSad4x4_mmx:
+ ; 4x4 SAD between two blocks using MMX registers; total returned in retrd.
+ ; As used below: r0 = block 1, r1 = its row stride,
+ ;                r2 = block 2, r3 = its row stride.
+ ; Two 4-byte rows are packed into each 64-bit MMX register, so the block
+ ; needs two psadbw operations (rows 0-1 and rows 2-3).
+ ; LOAD_4_PARA, SIGN_EXTENTION, WELSEMMS and retrd are defined outside this
+ ; chunk (presumably asm_inc.asm) -- TODO confirm their expansion.
+ ; WELSEMMS presumably issues emms so the x87 state is usable again after
+ ; the MMX instructions -- TODO confirm.
+ ;
+
+ %assign push_num 0
+ LOAD_4_PARA
+ SIGN_EXTENTION r1, r1d
+ SIGN_EXTENTION r3, r3d
+ movd mm0, [r0] ; block 1 rows 0-1
+ movd mm1, [r0+r1]
+ punpckldq mm0, mm1
+
+ movd mm3, [r2] ; block 2 rows 0-1
+ movd mm4, [r2+r3]
+ punpckldq mm3, mm4
+ psadbw mm0, mm3 ; SAD of rows 0-1
+
+ lea r0, [r0+2*r1] ; step both blocks down two rows
+ lea r2, [r2+2*r3]
+
+ movd mm1, [r0] ; block 1 rows 2-3
+ movd mm2, [r0+r1]
+ punpckldq mm1, mm2
+
+ movd mm3, [r2] ; block 2 rows 2-3
+ movd mm4, [r2+r3]
+ punpckldq mm3, mm4
+ psadbw mm1, mm3 ; SAD of rows 2-3
+ paddw mm0, mm1 ; total for the 4x4 block
+
+ movd retrd, mm0 ; return the total
+
+ WELSEMMS
+ LOAD_4_PARA_POP
+ ret
--- a/codec/encoder/core/asm/score.asm
+++ b/codec/encoder/core/asm/score.asm
@@ -42,8 +42,6 @@
%include "asm_inc.asm"
-bits 32
-
;***********************************************************************
; Macros
;***********************************************************************
@@ -171,25 +169,34 @@
ALIGN 16
WELS_EXTERN WelsScan4x4DcAc_sse2
WelsScan4x4DcAc_sse2:
-
- mov eax, [esp+8]
- movdqa xmm0, [eax] ; 7 6 5 4 3 2 1 0
- movdqa xmm1, [eax+16] ; f e d c b a 9 8
- pextrw ecx, xmm0, 7 ; ecx = 7
- pextrw edx, xmm1, 2 ; edx = a
- pextrw eax, xmm0, 5 ; eax = 5
- pinsrw xmm1, ecx, 2 ; f e d c b 7 9 8
- pinsrw xmm0, eax, 7 ; 5 6 5 4 3 2 1 0
- pextrw ecx, xmm1, 0 ; ecx = 8
- pinsrw xmm0, ecx, 5 ; 5 6 8 4 3 2 1 0
- pinsrw xmm1, edx, 0 ; f e d c b 7 9 a
+ %ifdef X86_32
+ push r3
+ %assign push_num 1
+ %else
+ %assign push_num 0
+ %endif
+ LOAD_2_PARA
+ ;mov eax, [esp+8]
+ movdqa xmm0, [r1] ; 7 6 5 4 3 2 1 0
+ movdqa xmm1, [r1+16] ; f e d c b a 9 8
+ pextrw r2d, xmm0, 7 ; ecx = 7
+ pextrw r3d, xmm1, 2 ; edx = a
+ pextrw r1d, xmm0, 5 ; eax = 5
+ pinsrw xmm1, r2d, 2 ; f e d c b 7 9 8
+ pinsrw xmm0, r1d, 7 ; 5 6 5 4 3 2 1 0
+ pextrw r2d, xmm1, 0 ; ecx = 8
+ pinsrw xmm0, r2d, 5 ; 5 6 8 4 3 2 1 0
+ pinsrw xmm1, r3d, 0 ; f e d c b 7 9 a
pshufd xmm2, xmm0, 0xd8 ; 5 6 3 2 8 4 1 0
pshufd xmm3, xmm1, 0xd8 ; f e b 7 d c 9 a
pshufhw xmm0, xmm2, 0x93 ; 6 3 2 5 8 4 1 0
pshuflw xmm1, xmm3, 0x39 ; f e b 7 a d c 9
- mov eax, [esp+4]
- movdqa [eax],xmm0
- movdqa [eax+16], xmm1
+ ;mov eax, [esp+4]
+ movdqa [r0],xmm0
+ movdqa [r0+16], xmm1
+ %ifdef X86_32
+ pop r3
+ %endif
ret
;***********************************************************************
@@ -198,19 +205,21 @@
ALIGN 16
WELS_EXTERN WelsScan4x4DcAc_ssse3
WelsScan4x4DcAc_ssse3:
- mov eax, [esp+8]
- movdqa xmm0, [eax]
- movdqa xmm1, [eax+16]
- pextrw ecx, xmm0, 7 ; ecx = [7]
- pextrw eax, xmm1, 0 ; eax = [8]
- pinsrw xmm0, eax, 7 ; xmm0[7] = [8]
- pinsrw xmm1, ecx, 0 ; xmm1[0] = [7]
+ %assign push_num 0
+ LOAD_2_PARA
+ ;mov eax, [esp+8]
+ movdqa xmm0, [r1]
+ movdqa xmm1, [r1+16]
+ pextrw r2d, xmm0, 7 ; ecx = [7]
+ pextrw r1d, xmm1, 0 ; eax = [8]
+ pinsrw xmm0, r1d, 7 ; xmm0[7] = [8]
+ pinsrw xmm1, r2d, 0 ; xmm1[0] = [7]
pshufb xmm1, [pb_scanacdc_maskb]
pshufb xmm0, [pb_scanacdc_maska]
- mov eax, [esp+4]
- movdqa [eax],xmm0
- movdqa [eax+16], xmm1
+ ;mov eax, [esp+4]
+ movdqa [r0],xmm0
+ movdqa [r0+16], xmm1
ret
;***********************************************************************
;void WelsScan4x4Ac_sse2( int16_t* zig_value, int16_t* pDct )
@@ -218,9 +227,11 @@
ALIGN 16
WELS_EXTERN WelsScan4x4Ac_sse2
WelsScan4x4Ac_sse2:
- mov eax, [esp+8]
- movdqa xmm0, [eax]
- movdqa xmm1, [eax+16]
+ %assign push_num 0
+ LOAD_2_PARA
+ ;mov eax, [esp+8]
+ movdqa xmm0, [r1]
+ movdqa xmm1, [r1+16]
movdqa xmm2, xmm0
punpcklqdq xmm0, xmm1
punpckhqdq xmm2, xmm1
@@ -228,14 +239,14 @@
movdqa xmm3, xmm0
punpckldq xmm0, xmm2
punpckhdq xmm3, xmm2
- pextrw eax , xmm0, 3
- pextrw edx , xmm0, 7
- pinsrw xmm0, eax, 7
- pextrw eax, xmm3, 4
- pinsrw xmm3, edx, 4
- pextrw edx, xmm3, 0
- pinsrw xmm3, eax, 0
- pinsrw xmm0, edx, 3
+ pextrw r1d , xmm0, 3
+ pextrw r2d , xmm0, 7
+ pinsrw xmm0, r1d, 7
+ pextrw r1d, xmm3, 4
+ pinsrw xmm3, r2d, 4
+ pextrw r2d, xmm3, 0
+ pinsrw xmm3, r1d, 0
+ pinsrw xmm0, r2d, 3
pshufhw xmm1, xmm0, 0x93
pshuflw xmm2, xmm3, 0x39
@@ -245,9 +256,9 @@
pslldq xmm3, 14
por xmm1, xmm3
psrldq xmm2, 2
- mov eax, [esp+4]
- movdqa [eax],xmm1
- movdqa [eax+16], xmm2
+ ;mov eax, [esp+4]
+ movdqa [r0],xmm1
+ movdqa [r0+16], xmm2
ret
@@ -257,44 +268,60 @@
ALIGN 16
WELS_EXTERN WelsCalculateSingleCtr4x4_sse2
WelsCalculateSingleCtr4x4_sse2:
- push ebx
- mov eax, [esp+8]
- movdqa xmm0, [eax]
- movdqa xmm1, [eax+16]
+ ;push ebx
+ ;mov eax, [esp+8]
+ %ifdef X86_32
+ push r3
+ %assign push_num 1
+ %else
+ %assign push_num 0
+ %endif
+ LOAD_1_PARA
+ movdqa xmm0, [r0]
+ movdqa xmm1, [r0+16]
packsswb xmm0, xmm1
-
+ ; below is the register map: r0 - eax, r1 - ebx, r2 - ecx, r3 - edx
+ xor r3, r3
pxor xmm3, xmm3
pcmpeqb xmm0, xmm3
- pmovmskb edx, xmm0
+ pmovmskb r3d, xmm0
- xor edx, 0xffff
+ xor r3, 0xffff
- xor eax, eax
- mov ecx, 7
- mov ebx, 8
+ xor r0, r0
+ mov r2, 7
+ mov r1, 8
.loop_low8_find1:
- bt edx, ecx
+ bt r3, r2
jc .loop_high8_find1
- loop .loop_low8_find1
+ dec r2
+ jnz .loop_low8_find1
.loop_high8_find1:
- bt edx, ebx
+ bt r3, r1
jc .find1end
- inc ebx
- cmp ebx,16
+ inc r1
+ cmp r1,16
jb .loop_high8_find1
.find1end:
- sub ebx, ecx
- sub ebx, 1
- add al, [i_ds_table+ebx]
- mov ebx, edx
- and edx, 0xff
- shr ebx, 8
- and ebx, 0xff
- add al, [low_mask_table +edx]
- add al, [high_mask_table+ebx]
-
- pop ebx
+ sub r1, r2
+ sub r1, 1
+ lea r2, [i_ds_table]
+ add r0b, [r2+r1]
+ mov r1, r3
+ and r3, 0xff
+ shr r1, 8
+ and r1, 0xff
+ lea r2 , [low_mask_table]
+ add r0b, [r2 +r3]
+ lea r2, [high_mask_table]
+ add r0b, [r2+r1]
+ %ifdef X86_32
+ pop r3
+ %else
+ mov retrd, r0d
+ %endif
+ ;pop ebx
ret
@@ -304,21 +331,29 @@
ALIGN 16
WELS_EXTERN WelsGetNoneZeroCount_sse2
WelsGetNoneZeroCount_sse2:
- mov eax, [esp+4]
- movdqa xmm0, [eax]
- movdqa xmm1, [eax+16]
+ %assign push_num 0
+ LOAD_1_PARA
+ ;mov eax, [esp+4]
+ movdqa xmm0, [r0]
+ movdqa xmm1, [r0+16]
pxor xmm2, xmm2
pcmpeqw xmm0, xmm2
pcmpeqw xmm1, xmm2
packsswb xmm1, xmm0
- pmovmskb edx, xmm1
- xor edx, 0xffff
- mov ecx, edx
- and edx, 0xff
- shr ecx, 8
+ xor r1, r1
+ pmovmskb r1d, xmm1
+ xor r1d, 0xffff
+ mov r2, r1
+ and r1, 0xff
+ shr r2, 8
; and ecx, 0xff ; we do not need this due to high 16bits equal to 0 yet
- xor eax, eax
- add al, [nozero_count_table+ecx]
- add al, [nozero_count_table+edx]
+; xor retr, retr
+ ;add al, [nozero_count_table+r2]
+ lea r0 , [nozero_count_table]
+ movzx r2, byte [r0+r2]
+ movzx r1, byte [r0+r1]
+ mov retrq, r2
+ add retrq, r1
+ ;add al, [nozero_count_table+r1]
ret
--- a/codec/encoder/core/asm/vaa.asm
+++ /dev/null
@@ -1,403 +1,0 @@
-;*!
-;* \copy
-;* Copyright (c) 2010-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* vaa.asm
-;*
-;* Abstract
-;* sse2 for pVaa routines
-;*
-;* History
-;* 04/14/2010 Created
-;* 06/07/2010 Added AnalysisVaaInfoIntra_sse2(ssse3)
-;* 06/10/2010 Tune rc_sad_frame_sse2 and got about 40% improvement
-;* 08/11/2010 Added abs_difference_mbrow_sse2 & sum_sqrsum_mbrow_sse2
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-BITS 32
-
-;***********************************************************************
-; Macros and other preprocessor constants
-;***********************************************************************
-
-; by comparing it outperforms than phaddw(SSSE3) sets
-%macro SUM_WORD_8x2_SSE2 2 ; dst(pSrc), tmp
- ; @sum_8x2 begin
- pshufd %2, %1, 04Eh ; 01001110 B
- paddw %1, %2
- pshuflw %2, %1, 04Eh ; 01001110 B
- paddw %1, %2
- pshuflw %2, %1, 0B1h ; 10110001 B
- paddw %1, %2
- ; end of @sum_8x2
-%endmacro ; END of SUM_WORD_8x2_SSE2
-
-
-%macro VAA_AVG_BLOCK_SSE2 6 ; dst, t0, t1, t2, t3, t4
- movdqa %1, [esi ] ; line 0
- movdqa %2, [esi+ecx] ; line 1
- movdqa %3, %1
- punpcklbw %1, xmm7
- punpckhbw %3, xmm7
- movdqa %4, %2
- punpcklbw %4, xmm7
- punpckhbw %2, xmm7
- paddw %1, %4
- paddw %2, %3
- movdqa %3, [esi+ebx] ; line 2
- movdqa %4, [esi+edx] ; line 3
- movdqa %5, %3
- punpcklbw %3, xmm7
- punpckhbw %5, xmm7
- movdqa %6, %4
- punpcklbw %6, xmm7
- punpckhbw %4, xmm7
- paddw %3, %6
- paddw %4, %5
- paddw %1, %3 ; block 0, 1
- paddw %2, %4 ; block 2, 3
- pshufd %3, %1, 0B1h
- pshufd %4, %2, 0B1h
- paddw %1, %3
- paddw %2, %4
- movdqa %3, %1
- movdqa %4, %2
- pshuflw %5, %1, 0B1h
- pshufhw %6, %3, 0B1h
- paddw %1, %5
- paddw %3, %6
- pshuflw %5, %2, 0B1h
- pshufhw %6, %4, 0B1h
- paddw %2, %5
- paddw %4, %6
- punpcklwd %1, %2
- punpckhwd %3, %4
- punpcklwd %1, %3
- psraw %1, $4
-%endmacro
-
-%macro VAA_AVG_BLOCK_SSSE3 6 ; dst, t0, t1, t2, t3, t4
- movdqa %1, [esi ] ; line 0
- movdqa %2, [esi+ecx] ; line 1
- movdqa %3, %1
- punpcklbw %1, xmm7
- punpckhbw %3, xmm7
- movdqa %4, %2
- punpcklbw %4, xmm7
- punpckhbw %2, xmm7
- paddw %1, %4
- paddw %2, %3
- movdqa %3, [esi+ebx] ; line 2
- movdqa %4, [esi+edx] ; line 3
- movdqa %5, %3
- punpcklbw %3, xmm7
- punpckhbw %5, xmm7
- movdqa %6, %4
- punpcklbw %6, xmm7
- punpckhbw %4, xmm7
- paddw %3, %6
- paddw %4, %5
- paddw %1, %3 ; block 0, 1
- paddw %2, %4 ; block 2, 3
- phaddw %1, %2 ; block[0]: 0-15, 16-31; block[1]: 32-47, 48-63; ..
- phaddw %1, xmm7 ; block[0]: 0-15; block[1]: 16-31; block[2]: 32-47; block[3]: 48-63; ....
- psraw %1, $4
-%endmacro
-
-
-
-;***********************************************************************
-; Local Data (Read Only)
-;***********************************************************************
-
-;SECTION .rodata align=16
-
-;ALIGN 16
-;pack1_8x2:
-; dw 1, 1, 1, 1, 1, 1, 1, 1
-
-;***********************************************************************
-; Code
-;***********************************************************************
-
-SECTION .text
-
-; , 6/7/2010
-
-WELS_EXTERN AnalysisVaaInfoIntra_sse2
-;***********************************************************************
-; int32_t AnalysisVaaInfoIntra_sse2( uint8_t *pDataY, const int32_t iLineSize );
-;***********************************************************************
-ALIGN 16
-AnalysisVaaInfoIntra_sse2:
- push ebx
- push edx
- push esi
- push edi
- push ebp
-
- mov ebp, esp
- and ebp, 0fh
- sub esp, ebp
- sub esp, 32
- %define PUSH_SIZE 52 ; 20 + 32
-
- mov esi, [esp+ebp+PUSH_SIZE+4] ; data_y
- mov ecx, [esp+ebp+PUSH_SIZE+8] ; iLineSize
-
- mov ebx, ecx
- sal ebx, $1 ; iLineSize x 2 [ebx]
- mov edx, ebx
- add edx, ecx ; iLineSize x 3 [edx]
- mov eax, ebx
- sal eax, $1 ; iLineSize x 4 [eax]
-
- pxor xmm7, xmm7
-
- ; loops
- VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
- movq [esp], xmm0
-
- lea esi, [esi+eax]
- VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
- movq [esp+8], xmm0
-
- lea esi, [esi+eax]
- VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
- movq [esp+16], xmm0
-
- lea esi, [esi+eax]
- VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
- movq [esp+24], xmm0
-
- movdqa xmm0, [esp] ; block 0~7
- movdqa xmm1, [esp+16] ; block 8~15
- movdqa xmm2, xmm0
- paddw xmm0, xmm1
- SUM_WORD_8x2_SSE2 xmm0, xmm3
-
- pmullw xmm1, xmm1
- pmullw xmm2, xmm2
- movdqa xmm3, xmm1
- movdqa xmm4, xmm2
- punpcklwd xmm1, xmm7
- punpckhwd xmm3, xmm7
- punpcklwd xmm2, xmm7
- punpckhwd xmm4, xmm7
- paddd xmm1, xmm2
- paddd xmm3, xmm4
- paddd xmm1, xmm3
- pshufd xmm2, xmm1, 01Bh
- paddd xmm1, xmm2
- pshufd xmm2, xmm1, 0B1h
- paddd xmm1, xmm2
-
- movd ebx, xmm0
- and ebx, 0ffffh ; effective low word truncated
- mov ecx, ebx
- imul ebx, ecx
- sar ebx, $4
- movd eax, xmm1
- sub eax, ebx
-
- %undef PUSH_SIZE
- add esp, 32
- add esp, ebp
- pop ebp
- pop edi
- pop esi
- pop edx
- pop ebx
- ret
-
-WELS_EXTERN AnalysisVaaInfoIntra_ssse3
-;***********************************************************************
-; int32_t AnalysisVaaInfoIntra_ssse3( uint8_t *pDataY, const int32_t iLineSize );
-;***********************************************************************
-ALIGN 16
-AnalysisVaaInfoIntra_ssse3:
- push ebx
- push edx
- push esi
- push edi
- push ebp
-
- mov ebp, esp
- and ebp, 0fh
- sub esp, ebp
- sub esp, 32
- %define PUSH_SIZE 52 ; 20 + 32
-
- mov esi, [esp+ebp+PUSH_SIZE+4] ; data_y
- mov ecx, [esp+ebp+PUSH_SIZE+8] ; iLineSize
-
- mov ebx, ecx
- sal ebx, $1 ; iLineSize x 2 [ebx]
- mov edx, ebx
- add edx, ecx ; iLineSize x 3 [edx]
- mov eax, ebx
- sal eax, $1 ; iLineSize x 4 [eax]
-
- pxor xmm7, xmm7
-
- ; loops
- VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
- movq [esp], xmm0
-
- lea esi, [esi+eax]
- VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
- movq [esp+8], xmm1
-
- lea esi, [esi+eax]
- VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
- movq [esp+16], xmm0
-
- lea esi, [esi+eax]
- VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
- movq [esp+24], xmm1
-
- movdqa xmm0, [esp] ; block 0~7
- movdqa xmm1, [esp+16] ; block 8~15
- movdqa xmm2, xmm0
- paddw xmm0, xmm1
- SUM_WORD_8x2_SSE2 xmm0, xmm3 ; better performance than that of phaddw sets
-
- pmullw xmm1, xmm1
- pmullw xmm2, xmm2
- movdqa xmm3, xmm1
- movdqa xmm4, xmm2
- punpcklwd xmm1, xmm7
- punpckhwd xmm3, xmm7
- punpcklwd xmm2, xmm7
- punpckhwd xmm4, xmm7
- paddd xmm1, xmm2
- paddd xmm3, xmm4
- paddd xmm1, xmm3
- pshufd xmm2, xmm1, 01Bh
- paddd xmm1, xmm2
- pshufd xmm2, xmm1, 0B1h
- paddd xmm1, xmm2
-
- movd ebx, xmm0
- and ebx, 0ffffh ; effective low work truncated
- mov ecx, ebx
- imul ebx, ecx
- sar ebx, $4
- movd eax, xmm1
- sub eax, ebx
-
- %undef PUSH_SIZE
- add esp, 32
- add esp, ebp
- pop ebp
- pop edi
- pop esi
- pop edx
- pop ebx
- ret
-
-WELS_EXTERN MdInterAnalysisVaaInfo_sse41
-;***********************************************************************
-; uint8_t MdInterAnalysisVaaInfo_sse41( int32_t *pSad8x8 )
-;***********************************************************************
-ALIGN 16
-MdInterAnalysisVaaInfo_sse41:
- mov eax, [esp+4]
- movdqa xmm0, [eax] ; load 4 sad_8x8
- pshufd xmm1, xmm0, 01Bh
- paddd xmm1, xmm0
- pshufd xmm2, xmm1, 0B1h
- paddd xmm1, xmm2
- psrad xmm1, 02h ; iAverageSad
- movdqa xmm2, xmm1
- psrad xmm2, 06h
- movdqa xmm3, xmm0 ; iSadBlock
- psrad xmm3, 06h
- psubd xmm3, xmm2
- pmulld xmm3, xmm3 ; [comment]: pmulld from SSE4.1 instruction sets
- pshufd xmm4, xmm3, 01Bh
- paddd xmm4, xmm3
- pshufd xmm3, xmm4, 0B1h
- paddd xmm3, xmm4
- movd eax, xmm3
- cmp eax, 20 ; INTER_VARIANCE_SAD_THRESHOLD
- jb near .threshold_exit
- pshufd xmm0, xmm0, 0B1h
- pcmpgtd xmm0, xmm1 ; iSadBlock > iAverageSad
- movmskps eax, xmm0
- ret
-.threshold_exit:
- mov eax, 15
- ret
-
-WELS_EXTERN MdInterAnalysisVaaInfo_sse2
-;***********************************************************************
-; uint8_t MdInterAnalysisVaaInfo_sse2( int32_t *pSad8x8 )
-;***********************************************************************
-ALIGN 16
-MdInterAnalysisVaaInfo_sse2:
- mov eax, [esp+4]
- movdqa xmm0, [eax] ; load 4 sad_8x8
- pshufd xmm1, xmm0, 01Bh
- paddd xmm1, xmm0
- pshufd xmm2, xmm1, 0B1h
- paddd xmm1, xmm2
- psrad xmm1, 02h ; iAverageSad
- movdqa xmm2, xmm1
- psrad xmm2, 06h
- movdqa xmm3, xmm0 ; iSadBlock
- psrad xmm3, 06h
- psubd xmm3, xmm2
-
- ; to replace pmulld functionality as below
- movdqa xmm2, xmm3
- pmuludq xmm2, xmm3
- pshufd xmm4, xmm3, 0B1h
- pmuludq xmm4, xmm4
- movdqa xmm5, xmm2
- punpckldq xmm5, xmm4
- punpckhdq xmm2, xmm4
- punpcklqdq xmm5, xmm2
-
- pshufd xmm4, xmm5, 01Bh
- paddd xmm4, xmm5
- pshufd xmm5, xmm4, 0B1h
- paddd xmm5, xmm4
- movd eax, xmm5
- cmp eax, 20 ; INTER_VARIANCE_SAD_THRESHOLD
- jb near .threshold_exit
- pshufd xmm0, xmm0, 0B1h
- pcmpgtd xmm0, xmm1 ; iSadBlock > iAverageSad
- movmskps eax, xmm0
- ret
-.threshold_exit:
- mov eax, 15
- ret
--- a/codec/encoder/core/inc/mc.h
+++ b/codec/encoder/core/inc/mc.h
@@ -61,15 +61,15 @@
void McCopyWidthEq8_mmx (uint8_t*, int32_t, uint8_t*, int32_t, int32_t);
void PixelAvgWidthEq8_mmx (uint8_t*, int32_t, uint8_t*, int32_t, uint8_t*, int32_t, int32_t);
-void McHorVer20_sse2 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+void McHorVer20Width9Or17_sse2 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
int32_t iHeight);
-void McHorVer02_sse2 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+void McHorVer02Height9Or17_sse2 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
int32_t iHeight);
void McHorVer22HorFirst_sse2 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pTap, int32_t iTapStride, int32_t iWidth,
int32_t iHeight);
-void McHorVer22VerLastAlign_sse2 (uint8_t* pTap, int32_t iTapStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+void McHorVer22Width8VerLastAlign_sse2 (uint8_t* pTap, int32_t iTapStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
int32_t iHeight);
-void McHorVer22VerLastUnAlign_sse2 (uint8_t* pTap, int32_t iTapStride, uint8_t* pDst, int32_t iDstStride,
+void McHorVer22Width8VerLastUnAlign_sse2 (uint8_t* pTap, int32_t iTapStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth, int32_t iHeight);
void McChromaWidthEq8_sse2 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, const uint8_t* kpABCD,
int32_t iHeigh);
@@ -80,8 +80,6 @@
int32_t iHeight);
void PixelAvgWidthEq16_sse2 (uint8_t*, int32_t, uint8_t*, int32_t, uint8_t*, int32_t, int32_t);
-
-void PixelAvgWidthEq16_ssse3 (uint8_t*, int32_t, uint8_t*, int32_t, uint8_t*, int32_t, int32_t);
void McChromaWidthEq8_ssse3 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
const uint8_t* kpABCD, int32_t iHeigh);
--- a/codec/encoder/core/inc/sample.h
+++ b/codec/encoder/core/inc/sample.h
@@ -107,9 +107,6 @@
int32_t WelsIntra16x16Combined3Sad_ssse3 (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*);
int32_t WelsIntraChroma8x8Combined3Satd_sse41 (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*,
uint8_t*, uint8_t*);
-int32_t WelsIntraChroma8x8Combined3Sad_ssse3 (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*,
- uint8_t*, uint8_t*);
-
#endif//X86_ASM
--- a/codec/encoder/core/src/deblocking.cpp
+++ b/codec/encoder/core/src/deblocking.cpp
@@ -994,15 +994,15 @@
#ifdef X86_ASM
- if (iCpu & WELS_CPU_SSE2) {
+ if (iCpu & WELS_CPU_SSE2) {
pFunc->pfLumaDeblockingLT4Ver = DeblockLumaLt4V_sse2;
pFunc->pfLumaDeblockingEQ4Ver = DeblockLumaEq4V_sse2;
- pFunc->pfLumaDeblockingLT4Hor = DeblockLumaLt4H_sse2;
- pFunc->pfLumaDeblockingEQ4Hor = DeblockLumaEq4H_sse2;
+ pFunc->pfLumaDeblockingLT4Hor = DeblockLumaLt4H_sse2;
+ pFunc->pfLumaDeblockingEQ4Hor = DeblockLumaEq4H_sse2;
pFunc->pfChromaDeblockingLT4Ver = DeblockChromaLt4V_sse2;
pFunc->pfChromaDeblockingEQ4Ver = DeblockChromaEq4V_sse2;
pFunc->pfChromaDeblockingLT4Hor = DeblockChromaLt4H_sse2;
- pFunc->pfChromaDeblockinEQ4Hor = DeblockChromaEq4H_sse2;
+ pFunc->pfChromaDeblockinEQ4Hor = DeblockChromaEq4H_sse2;
}
#endif
}
--- a/codec/encoder/core/src/decode_mb_aux.cpp
+++ b/codec/encoder/core/src/decode_mb_aux.cpp
@@ -261,15 +261,15 @@
#if defined(X86_ASM)
if (uiCpuFlag & WELS_CPU_MMXEXT) {
- pFuncList->pfIDctT4 = WelsIDctT4Rec_mmx;
+ // pFuncList->pfIDctT4 = WelsIDctT4Rec_mmx;
}
if (uiCpuFlag & WELS_CPU_SSE2) {
- pFuncList->pfDequantization4x4 = WelsDequant4x4_sse2;
+ /* pFuncList->pfDequantization4x4 = WelsDequant4x4_sse2;
pFuncList->pfDequantizationFour4x4 = WelsDequantFour4x4_sse2;
pFuncList->pfDequantizationIHadamard4x4 = WelsDequantIHadamard4x4_sse2;
pFuncList->pfIDctFourT4 = WelsIDctFourT4Rec_sse2;
- pFuncList->pfIDctI16x16Dc = WelsIDctRecI16x16Dc_sse2;
+ pFuncList->pfIDctI16x16Dc = WelsIDctRecI16x16Dc_sse2;*/
}
#endif//X86_ASM
}
--- a/codec/encoder/core/src/encode_mb_aux.cpp
+++ b/codec/encoder/core/src/encode_mb_aux.cpp
@@ -569,7 +569,7 @@
}
//#ifndef MACOS
if (uiCpuFlag & WELS_CPU_SSSE3) {
- pFuncList->pfScan4x4 = WelsScan4x4DcAc_ssse3;
+ // pFuncList->pfScan4x4 = WelsScan4x4DcAc_ssse3;
}
//#endif//MACOS
--- a/codec/encoder/core/src/encoder_ext.cpp
+++ b/codec/encoder/core/src/encoder_ext.cpp
@@ -1919,7 +1919,7 @@
uiCpuCores,
iCacheLineSize);
-#ifdef _DEBUG // output at console & _debug
+//#ifdef _DEBUG // output at console & _debug
fprintf (stderr, "WELS CPU features/capacities (0x%x) detected: \n" \
"HTT: %c, " \
"MMX: %c, " \
@@ -1962,7 +1962,7 @@
(uiCpuFeatureFlags & WELS_CPU_AES) ? 'Y' : 'N',
uiCpuCores,
iCacheLineSize);
-#endif//_DEBUG
+//#endif//_DEBUG
}
/*!
--- a/codec/encoder/core/src/expand_pic.cpp
+++ b/codec/encoder/core/src/expand_pic.cpp
@@ -29,14 +29,12 @@
* POSSIBILITY OF SUCH DAMAGE.
*
*/
-
#include <string.h>
#include "expand_pic.h"
#include "cpu_core.h"
#include "wels_func_ptr_def.h"
-namespace WelsSVCEnc {
-
+namespace WelsSVCEnc{
// rewrite it (split into luma & chroma) that is helpful for mmx/sse2 optimization perform, 9/27/2009
static inline void ExpandPictureLuma_c (uint8_t* pDst, const int32_t kiStride, const int32_t kiPicW,
const int32_t kiPicH) {
@@ -144,6 +142,8 @@
const int32_t kiWidthUV = kiWidthY >> 1;
const int32_t kiHeightUV = kiHeightY >> 1;
+
+
pExpLuma (pPicY, pPic->iLineSize[0], kiWidthY, kiHeightY);
if (kiWidthUV >= 16) {
// fix coding picture size as 16x16
@@ -155,6 +155,7 @@
ExpandPictureChroma_c (pPicCb, pPic->iLineSize[1], kiWidthUV, kiHeightUV);
ExpandPictureChroma_c (pPicCr, pPic->iLineSize[2], kiWidthUV, kiHeightUV);
}
+
}
}
--- a/codec/encoder/core/src/get_intra_predictor.cpp
+++ b/codec/encoder/core/src/get_intra_predictor.cpp
@@ -80,13 +80,13 @@
#if defined(X86_ASM)
if (kuiCpuFlag & WELS_CPU_MMXEXT) {
- WelsFillingPred8to16 = WelsFillingPred8to16_mmx;
- WelsFillingPred8x2to16 = WelsFillingPred8x2to16_mmx;
- WelsFillingPred1to16 = WelsFillingPred1to16_mmx;
+ // WelsFillingPred8to16 = WelsFillingPred8to16_mmx;
+ // WelsFillingPred8x2to16 = WelsFillingPred8x2to16_mmx;
+ // WelsFillingPred1to16 = WelsFillingPred1to16_mmx;
}
if (kuiCpuFlag & WELS_CPU_SSE2) {
- WelsFillingPred8x2to16 = WelsFillingPred8x2to16_sse2;
- WelsFillingPred1to16 = WelsFillingPred1to16_sse2;
+ // WelsFillingPred8x2to16 = WelsFillingPred8x2to16_sse2;
+ // WelsFillingPred1to16 = WelsFillingPred1to16_sse2;
}
#endif//X86_ASM
}
--- a/codec/encoder/core/src/mc.cpp
+++ b/codec/encoder/core/src/mc.cpp
@@ -426,7 +426,7 @@
int32_t iHeight) {
ENFORCE_STACK_ALIGN_2D (int16_t, pTap, 21, 8, 16)
McHorVer22Width8HorFirst_sse2 (pSrc - 2, iSrcStride, (uint8_t*)pTap, 16, iHeight + 5);
- McHorVer22VerLastAlign_sse2 ((uint8_t*)pTap, 16, pDst, iDstStride, 8, iHeight);
+ McHorVer22Width8VerLastAlign_sse2 ((uint8_t*)pTap, 16, pDst, iDstStride, 8, iHeight);
}
//2010.2.5
@@ -441,13 +441,13 @@
McHorVer22WidthEq8_sse2 (pSrc, iSrcStride, pDst, iDstStride, iHeight);
McHorVer22WidthEq8_sse2 (&pSrc[8], iSrcStride, &pDst[8], iDstStride, iHeight);
}
-void McHorVer22_sse2 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
+void McHorVer22Width9Or17Height9Or17_sse2 (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth,
int32_t iHeight) {
ENFORCE_STACK_ALIGN_2D (int16_t, pTap, 22, 24, 16)
int32_t tmp1 = 2 * (iWidth - 8);
McHorVer22HorFirst_sse2 (pSrc - 2, iSrcStride, (uint8_t*)pTap, 48, iWidth, iHeight + 5);
- McHorVer22VerLastAlign_sse2 ((uint8_t*)pTap, 48, pDst, iDstStride, iWidth - 1, iHeight);
- McHorVer22VerLastUnAlign_sse2 ((uint8_t*)pTap + tmp1, 48, pDst + iWidth - 8, iDstStride, 8, iHeight);
+ McHorVer22Width8VerLastAlign_sse2 ((uint8_t*)pTap, 48, pDst, iDstStride, iWidth - 1, iHeight);
+ McHorVer22Width8VerLastUnAlign_sse2 ((uint8_t*)pTap + tmp1, 48, pDst + iWidth - 8, iDstStride, 8, iHeight);
}
typedef void (*McChromaWidthEqx) (uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
@@ -523,9 +523,9 @@
pFuncList->sMcFuncs.pfLumaQuarpelMc = pWelsMcFuncWidthEq16;
#if defined (X86_ASM)
if (uiCpuFlag & WELS_CPU_SSE2) {
- pFuncList->sMcFuncs.pfLumaHalfpelHor = McHorVer20_sse2;
- pFuncList->sMcFuncs.pfLumaHalfpelVer = McHorVer02_sse2;
- pFuncList->sMcFuncs.pfLumaHalfpelCen = McHorVer22_sse2;
+ pFuncList->sMcFuncs.pfLumaHalfpelHor = McHorVer20Width9Or17_sse2;
+ pFuncList->sMcFuncs.pfLumaHalfpelVer = McHorVer02Height9Or17_sse2;
+ pFuncList->sMcFuncs.pfLumaHalfpelCen = McHorVer22Width9Or17Height9Or17_sse2;
pFuncList->sMcFuncs.pfSampleAveraging[0] = PixelAvgWidthEq8_mmx;
pFuncList->sMcFuncs.pfSampleAveraging[1] = PixelAvgWidthEq16_sse2;
pFuncList->sMcFuncs.pfChromaMc = McChroma_sse2;
@@ -541,7 +541,6 @@
if (uiCpuFlag & WELS_CPU_SSSE3) {
pFuncList->sMcFuncs.pfChromaMc = McChroma_ssse3;
- pFuncList->sMcFuncs.pfSampleAveraging[1] = PixelAvgWidthEq16_ssse3;
}
#endif //(X86_ASM)
--- a/codec/encoder/core/src/md.cpp
+++ b/codec/encoder/core/src/md.cpp
@@ -439,7 +439,7 @@
return (uiMbSign);
}
-static inline int32_t AnalysisVaaInfoIntra_c (uint8_t* pDataY, const int32_t kiLineSize) {
+int32_t AnalysisVaaInfoIntra_c (uint8_t* pDataY, const int32_t kiLineSize) {
ENFORCE_STACK_ALIGN_1D (uint16_t, uiAvgBlock, 16, 16)
uint16_t* pBlock = &uiAvgBlock[0];
uint8_t* pEncData = pDataY;
--- a/codec/encoder/core/src/sample.cpp
+++ b/codec/encoder/core/src/sample.cpp
@@ -465,11 +465,11 @@
pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x16 ] = WelsSampleSatd8x16_sse2;
pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x8 ] = WelsSampleSatd16x8_sse2;
pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x16] = WelsSampleSatd16x16_sse2;
- pFuncList->sSampleDealingFuncs.pfIntra4x4Combined3Satd = WelsSmpleSatdThree4x4_sse2;
+ //pFuncList->sSampleDealingFuncs.pfIntra4x4Combined3Satd = WelsSmpleSatdThree4x4_sse2;
}
if (uiCpuFlag & WELS_CPU_SSSE3) {
- pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Sad = WelsIntra16x16Combined3Sad_ssse3;
+ //pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Sad = WelsIntra16x16Combined3Sad_ssse3;
}
if (uiCpuFlag & WELS_CPU_SSE41) {
@@ -478,8 +478,8 @@
pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x16] = WelsSampleSatd8x16_sse41;
pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x8] = WelsSampleSatd8x8_sse41;
pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_4x4] = WelsSampleSatd4x4_sse41;
- pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Satd = WelsIntra16x16Combined3Satd_sse41;
- pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3Satd = WelsIntraChroma8x8Combined3Satd_sse41;
+ //pFuncList->sSampleDealingFuncs.pfIntra16x16Combined3Satd = WelsIntra16x16Combined3Satd_sse41;
+ //pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3Satd = WelsIntraChroma8x8Combined3Satd_sse41;
}
#endif //(X86_ASM)
--- a/codec/encoder/core/src/set_mb_syn_cavlc.cpp
+++ b/codec/encoder/core/src/set_mb_syn_cavlc.cpp
@@ -207,7 +207,7 @@
#if defined(X86_ASM)
if (uiCpuFlag & WELS_CPU_SSE2) {
- sCoeffFunc.pfCavlcParamCal = CavlcParamCal_sse2;
+ // sCoeffFunc.pfCavlcParamCal = CavlcParamCal_sse2;
}
#endif
}
--- a/codec/encoder/targets.mk
+++ b/codec/encoder/targets.mk
@@ -38,22 +38,13 @@
ENCODER_OBJS += $(ENCODER_CPP_SRCS:.cpp=.o)
ifeq ($(USE_ASM), Yes)
ENCODER_ASM_SRCS=\
- $(ENCODER_SRCDIR)/./core/asm/asm_inc.asm\
$(ENCODER_SRCDIR)/./core/asm/coeff.asm\
- $(ENCODER_SRCDIR)/./core/asm/cpuid.asm\
$(ENCODER_SRCDIR)/./core/asm/dct.asm\
- $(ENCODER_SRCDIR)/./core/asm/deblock.asm\
- $(ENCODER_SRCDIR)/./core/asm/expand_picture.asm\
$(ENCODER_SRCDIR)/./core/asm/intra_pred.asm\
- $(ENCODER_SRCDIR)/./core/asm/intra_pred_util.asm\
- $(ENCODER_SRCDIR)/./core/asm/mb_copy.asm\
- $(ENCODER_SRCDIR)/./core/asm/mc_chroma.asm\
- $(ENCODER_SRCDIR)/./core/asm/mc_luma.asm\
$(ENCODER_SRCDIR)/./core/asm/memzero.asm\
$(ENCODER_SRCDIR)/./core/asm/quant.asm\
$(ENCODER_SRCDIR)/./core/asm/satd_sad.asm\
$(ENCODER_SRCDIR)/./core/asm/score.asm\
- $(ENCODER_SRCDIR)/./core/asm/vaa.asm\
ENCODER_OBJS += $(ENCODER_ASM_SRCS:.asm=.o)
endif
@@ -158,39 +149,15 @@
$(ENCODER_SRCDIR)/./plus/src/welsEncoderExt.o: $(ENCODER_SRCDIR)/./plus/src/welsEncoderExt.cpp
$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(ENCODER_CFLAGS) $(ENCODER_INCLUDES) -c -o $(ENCODER_SRCDIR)/./plus/src/welsEncoderExt.o $(ENCODER_SRCDIR)/./plus/src/welsEncoderExt.cpp
-$(ENCODER_SRCDIR)/./core/asm/asm_inc.o: $(ENCODER_SRCDIR)/./core/asm/asm_inc.asm
- $(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(ENCODER_ASMFLAGS) $(ENCODER_ASM_INCLUDES) -o $(ENCODER_SRCDIR)/./core/asm/asm_inc.o $(ENCODER_SRCDIR)/./core/asm/asm_inc.asm
-
$(ENCODER_SRCDIR)/./core/asm/coeff.o: $(ENCODER_SRCDIR)/./core/asm/coeff.asm
$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(ENCODER_ASMFLAGS) $(ENCODER_ASM_INCLUDES) -o $(ENCODER_SRCDIR)/./core/asm/coeff.o $(ENCODER_SRCDIR)/./core/asm/coeff.asm
-$(ENCODER_SRCDIR)/./core/asm/cpuid.o: $(ENCODER_SRCDIR)/./core/asm/cpuid.asm
- $(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(ENCODER_ASMFLAGS) $(ENCODER_ASM_INCLUDES) -o $(ENCODER_SRCDIR)/./core/asm/cpuid.o $(ENCODER_SRCDIR)/./core/asm/cpuid.asm
-
$(ENCODER_SRCDIR)/./core/asm/dct.o: $(ENCODER_SRCDIR)/./core/asm/dct.asm
$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(ENCODER_ASMFLAGS) $(ENCODER_ASM_INCLUDES) -o $(ENCODER_SRCDIR)/./core/asm/dct.o $(ENCODER_SRCDIR)/./core/asm/dct.asm
-$(ENCODER_SRCDIR)/./core/asm/deblock.o: $(ENCODER_SRCDIR)/./core/asm/deblock.asm
- $(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(ENCODER_ASMFLAGS) $(ENCODER_ASM_INCLUDES) -o $(ENCODER_SRCDIR)/./core/asm/deblock.o $(ENCODER_SRCDIR)/./core/asm/deblock.asm
-
-$(ENCODER_SRCDIR)/./core/asm/expand_picture.o: $(ENCODER_SRCDIR)/./core/asm/expand_picture.asm
- $(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(ENCODER_ASMFLAGS) $(ENCODER_ASM_INCLUDES) -o $(ENCODER_SRCDIR)/./core/asm/expand_picture.o $(ENCODER_SRCDIR)/./core/asm/expand_picture.asm
-
$(ENCODER_SRCDIR)/./core/asm/intra_pred.o: $(ENCODER_SRCDIR)/./core/asm/intra_pred.asm
$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(ENCODER_ASMFLAGS) $(ENCODER_ASM_INCLUDES) -o $(ENCODER_SRCDIR)/./core/asm/intra_pred.o $(ENCODER_SRCDIR)/./core/asm/intra_pred.asm
-$(ENCODER_SRCDIR)/./core/asm/intra_pred_util.o: $(ENCODER_SRCDIR)/./core/asm/intra_pred_util.asm
- $(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(ENCODER_ASMFLAGS) $(ENCODER_ASM_INCLUDES) -o $(ENCODER_SRCDIR)/./core/asm/intra_pred_util.o $(ENCODER_SRCDIR)/./core/asm/intra_pred_util.asm
-
-$(ENCODER_SRCDIR)/./core/asm/mb_copy.o: $(ENCODER_SRCDIR)/./core/asm/mb_copy.asm
- $(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(ENCODER_ASMFLAGS) $(ENCODER_ASM_INCLUDES) -o $(ENCODER_SRCDIR)/./core/asm/mb_copy.o $(ENCODER_SRCDIR)/./core/asm/mb_copy.asm
-
-$(ENCODER_SRCDIR)/./core/asm/mc_chroma.o: $(ENCODER_SRCDIR)/./core/asm/mc_chroma.asm
- $(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(ENCODER_ASMFLAGS) $(ENCODER_ASM_INCLUDES) -o $(ENCODER_SRCDIR)/./core/asm/mc_chroma.o $(ENCODER_SRCDIR)/./core/asm/mc_chroma.asm
-
-$(ENCODER_SRCDIR)/./core/asm/mc_luma.o: $(ENCODER_SRCDIR)/./core/asm/mc_luma.asm
- $(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(ENCODER_ASMFLAGS) $(ENCODER_ASM_INCLUDES) -o $(ENCODER_SRCDIR)/./core/asm/mc_luma.o $(ENCODER_SRCDIR)/./core/asm/mc_luma.asm
-
$(ENCODER_SRCDIR)/./core/asm/memzero.o: $(ENCODER_SRCDIR)/./core/asm/memzero.asm
$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(ENCODER_ASMFLAGS) $(ENCODER_ASM_INCLUDES) -o $(ENCODER_SRCDIR)/./core/asm/memzero.o $(ENCODER_SRCDIR)/./core/asm/memzero.asm
@@ -202,9 +169,6 @@
$(ENCODER_SRCDIR)/./core/asm/score.o: $(ENCODER_SRCDIR)/./core/asm/score.asm
$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(ENCODER_ASMFLAGS) $(ENCODER_ASM_INCLUDES) -o $(ENCODER_SRCDIR)/./core/asm/score.o $(ENCODER_SRCDIR)/./core/asm/score.asm
-
-$(ENCODER_SRCDIR)/./core/asm/vaa.o: $(ENCODER_SRCDIR)/./core/asm/vaa.asm
- $(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(ENCODER_ASMFLAGS) $(ENCODER_ASM_INCLUDES) -o $(ENCODER_SRCDIR)/./core/asm/vaa.o $(ENCODER_SRCDIR)/./core/asm/vaa.asm
$(LIBPREFIX)encoder.$(LIBSUFFIX): $(ENCODER_OBJS)
rm -f $(LIBPREFIX)encoder.$(LIBSUFFIX)
--- /dev/null
+++ b/codec/processing/build/linux/makefile
@@ -1,0 +1,116 @@
+# Builds the libwelsvp shared library from the sources under ../../src.
+# Every .cpp file (and, when NASM is 1, every .asm file) found in SRCDIRS is
+# compiled next to its source and linked into $(TARGETLIB); the library is
+# then copied to $(OUTDIR) and $(TESTDIR).
+
+NASM = 1
+NAME = libwelsvp
+
+OUTDIR = ../../../bin/linux
+BINDIR = ../../bin
+# NOTE(review): OBJDIR is not referenced anywhere below.
+OBJDIR = ../../obj
+TESTDIR = ../../../testbin
+SRCDIRS = ../../src/asm \
+          ../../src/common \
+          ../../src/adaptivequantization \
+          ../../src/backgounddetection \
+          ../../src/denoise \
+          ../../src/downsample \
+          ../../src/scenechangedetection \
+          ../../src/vaacalc \
+          ../../src/complexityanalysis
+SRCDIRS += ../../src/imagerotate
+
+
+TARGETLIB = $(BINDIR)/$(NAME).so
+
+# Tool paths are looked up once, at parse time, instead of on every expansion.
+CC := $(shell which gcc)
+AS := $(shell which nasm)
+GCC = gcc -m32
+
+CPPFLAGS = -Wall -g -O3
+ifeq ($(NASM), 1)
+CPPFLAGS += -DX86_ASM
+endif
+ASMFLAGS = -f elf -DNOPREFIX -I ../../src/asm/
+# Libraries live in LDLIBS so they can be placed after the objects on the
+# link line; LDFLAGS is left for linker options and is empty by default.
+LDFLAGS =
+LDLIBS = -lstdc++ -ldl
+
+SRCEXTS = .cpp
+ifeq ($(NASM), 1)
+SRCEXTS += .asm
+endif
+HDREXTS = .h
+# The file lists are expanded once (":=") so the wildcard scan is not repeated.
+SOURCES := $(foreach d,$(SRCDIRS),$(wildcard $(addprefix $(d)/*,$(SRCEXTS))))
+HEADERS := $(foreach d,$(SRCDIRS),$(wildcard $(addprefix $(d)/*,$(HDREXTS))))
+SRC_CPP := $(filter %.cpp,$(SOURCES))
+SRC_ASM := $(filter %.asm,$(SOURCES))
+OBJS := $(addsuffix .o, $(basename $(SOURCES)))
+DEPS := $(OBJS:.o=.d)
+
+# Helpers for the %.d rules.  They stay lazily expanded ("=") because nothing
+# in this file includes $(DEPS), so the compiler probe in DEP_OPT only runs
+# when a .d file is requested explicitly.
+# NOTE(review): DEP_OPT is derived from $(CC) but is also passed to $(AS) in
+# DEPEND_asm.d - confirm that nasm accepts these options.
+DEP_OPT = $(shell if `$(CC) --version | grep "GCC" >/dev/null`; then \
+          echo "-MM -MP"; else echo "-M"; fi )
+DEPEND_cpp.d = $(subst -g ,,$(CC) $(DEP_OPT) $(CPPFLAGS))
+DEPEND_asm.d = $(subst -g ,,$(AS) $(DEP_OPT) $(ASMFLAGS))
+COMPILE.cpp = $(GCC) $(CPPFLAGS) -c
+COMPILE.asm = $(AS) $(ASMFLAGS)
+LINK = $(GCC) $(LDFLAGS)
+
+.PHONY: all objs tags ctags clean distclean
+
+.SUFFIXES:
+
+all: $(TARGETLIB)
+
+# Dependency files: the source directory is written first, without a newline,
+# presumably so it prefixes the target name printed by the generator.
+%.d:%.cpp
+	@echo -n $(dir $<) > $@
+	@$(DEPEND_cpp.d) $< >> $@
+
+%.d:%.asm
+	@echo -n $(dir $<) > $@
+	@$(DEPEND_asm.d) $< >> $@
+
+objs:$(OBJS)
+
+%.o:%.cpp
+	$(COMPILE.cpp) $< -o $@
+
+%.o:%.asm
+	$(COMPILE.asm) $< -o $@
+
+tags: $(HEADERS) $(SOURCES)
+	etags $(HEADERS) $(SOURCES)
+
+ctags: $(HEADERS) $(SOURCES)
+	ctags $(HEADERS) $(SOURCES)
+
+# Link the shared library, then publish copies to $(OUTDIR) and $(TESTDIR).
+$(TARGETLIB):$(OBJS)
+	@if test ! -d $(BINDIR) ; then mkdir -p $(BINDIR) ; fi
+	$(LINK) $(OBJS) -shared -Wl,-Bsymbolic -o $@ $(LDLIBS)
+	@echo produce the lib to $(TARGETLIB).
+	@if test ! -d $(OUTDIR) ; then mkdir -p $(OUTDIR) ; fi
+	@cp -f $(TARGETLIB) $(OUTDIR)
+	@if test ! -d $(TESTDIR) ; then mkdir -p $(TESTDIR) ; fi
+	@cp -f $(TARGETLIB) $(TESTDIR)
+	@echo copy the lib to $(OUTDIR).
+
+# clean keeps the generated .d files; distclean removes them and TAGS as well.
+clean:
+	rm -f $(OBJS) $(TARGETLIB)
+
+distclean: clean
+	rm -f $(DEPS) TAGS
+
--- /dev/null
+++ b/codec/processing/build/win32/WelsVP_2008.sln
@@ -1,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 10.00
+# Visual Studio 2008
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "WelsVP", "WelsVP_2008.vcproj", "{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}"
+EndProject
+Global
+ GlobalSection(SolutionConfigurationPlatforms) = preSolution
+ Debug|Win32 = Debug|Win32
+ Release|Win32 = Release|Win32
+ EndGlobalSection
+ GlobalSection(ProjectConfigurationPlatforms) = postSolution
+ {E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Debug|Win32.ActiveCfg = Debug|Win32
+ {E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Debug|Win32.Build.0 = Debug|Win32
+ {E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Release|Win32.ActiveCfg = Release|Win32
+ {E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Release|Win32.Build.0 = Release|Win32
+ EndGlobalSection
+ GlobalSection(SolutionProperties) = preSolution
+ HideSolutionNode = FALSE
+ EndGlobalSection
+EndGlobal
binary files /dev/null b/codec/processing/build/win32/WelsVP_2008.suo differ
--- /dev/null
+++ b/codec/processing/build/win32/WelsVP_2008.vcproj
@@ -1,0 +1,846 @@
+<?xml version="1.0" encoding="gb2312"?>
+<VisualStudioProject
+ ProjectType="Visual C++"
+ Version="9.00"
+ Name="WelsVP"
+ ProjectGUID="{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}"
+ RootNamespace="WelsVP"
+ Keyword="Win32Proj"
+ TargetFrameworkVersion="196613"
+ >
+ <Platforms>
+ <Platform
+ Name="Win32"
+ />
+ <Platform
+ Name="x64"
+ />
+ </Platforms>
+ <ToolFiles>
+ <DefaultToolFile
+ FileName="masm.rules"
+ />
+ </ToolFiles>
+ <Configurations>
+ <Configuration
+ Name="Debug|Win32"
+ OutputDirectory=".\..\..\..\bin\win32\Debug"
+ IntermediateDirectory=".\..\..\..\obj\vp\Debug"
+ ConfigurationType="2"
+ CharacterSet="1"
+ WholeProgramOptimization="0"
+ >
+ <Tool
+ Name="VCPreBuildEventTool"
+ />
+ <Tool
+ Name="VCCustomBuildTool"
+ CommandLine=""
+ />
+ <Tool
+ Name="MASM"
+ />
+ <Tool
+ Name="VCXMLDataGeneratorTool"
+ />
+ <Tool
+ Name="VCWebServiceProxyGeneratorTool"
+ />
+ <Tool
+ Name="VCMIDLTool"
+ />
+ <Tool
+ Name="VCCLCompilerTool"
+ Optimization="0"
+ AdditionalIncludeDirectories=""
+ PreprocessorDefinitions="WIN32;_DEBUG;_WINDOWS;_USRDLL;WELSVP_EXPORTS;X86_ASM"
+ MinimalRebuild="true"
+ BasicRuntimeChecks="3"
+ RuntimeLibrary="1"
+ UsePrecompiledHeader="0"
+ AssemblerListingLocation=""
+ WarningLevel="3"
+ DebugInformationFormat="4"
+ />
+ <Tool
+ Name="VCManagedResourceCompilerTool"
+ />
+ <Tool
+ Name="VCResourceCompilerTool"
+ />
+ <Tool
+ Name="VCPreLinkEventTool"
+ />
+ <Tool
+ Name="VCLinkerTool"
+ LinkLibraryDependencies="true"
+ OutputFile="$(OutDir)\welsvp.dll"
+ LinkIncremental="2"
+ ModuleDefinitionFile="../../src/common/WelsVP.def"
+ GenerateDebugInformation="true"
+ GenerateMapFile="true"
+ MapFileName="$(OutDir)\welsvp.map"
+ SubSystem="2"
+ TargetMachine="1"
+ />
+ <Tool
+ Name="VCALinkTool"
+ />
+ <Tool
+ Name="VCManifestTool"
+ />
+ <Tool
+ Name="VCXDCMakeTool"
+ />
+ <Tool
+ Name="VCBscMakeTool"
+ />
+ <Tool
+ Name="VCFxCopTool"
+ />
+ <Tool
+ Name="VCAppVerifierTool"
+ />
+ <Tool
+ Name="VCPostBuildEventTool"
+ CommandLine=""
+ />
+ </Configuration>
+ <Configuration
+ Name="Debug|x64"
+ OutputDirectory=".\..\..\..\..\bin\win64\Debug"
+ IntermediateDirectory=".\..\..\..\obj\vp\Debug"
+ ConfigurationType="2"
+ CharacterSet="1"
+ WholeProgramOptimization="0"
+ >
+ <Tool
+ Name="VCPreBuildEventTool"
+ />
+ <Tool
+ Name="VCCustomBuildTool"
+ CommandLine=""
+ />
+ <Tool
+ Name="MASM"
+ />
+ <Tool
+ Name="VCXMLDataGeneratorTool"
+ />
+ <Tool
+ Name="VCWebServiceProxyGeneratorTool"
+ />
+ <Tool
+ Name="VCMIDLTool"
+ TargetEnvironment="3"
+ />
+ <Tool
+ Name="VCCLCompilerTool"
+ Optimization="0"
+ AdditionalIncludeDirectories=""
+ PreprocessorDefinitions="WIN64;_DEBUG;_WINDOWS;_USRDLL;WELSVP_EXPORTS;X86_ASM"
+ MinimalRebuild="true"
+ BasicRuntimeChecks="3"
+ RuntimeLibrary="1"
+ UsePrecompiledHeader="0"
+ AssemblerListingLocation=""
+ WarningLevel="3"
+ DebugInformationFormat="3"
+ />
+ <Tool
+ Name="VCManagedResourceCompilerTool"
+ />
+ <Tool
+ Name="VCResourceCompilerTool"
+ />
+ <Tool
+ Name="VCPreLinkEventTool"
+ />
+ <Tool
+ Name="VCLinkerTool"
+ LinkLibraryDependencies="true"
+ OutputFile="$(OutDir)\welsvp.dll"
+ LinkIncremental="2"
+ ModuleDefinitionFile="../../src/common/WelsVP.def"
+ GenerateDebugInformation="true"
+ GenerateMapFile="true"
+ MapFileName="$(OutDir)\welsvp.map"
+ SubSystem="2"
+ TargetMachine="17"
+ />
+ <Tool
+ Name="VCALinkTool"
+ />
+ <Tool
+ Name="VCManifestTool"
+ />
+ <Tool
+ Name="VCXDCMakeTool"
+ />
+ <Tool
+ Name="VCBscMakeTool"
+ />
+ <Tool
+ Name="VCFxCopTool"
+ />
+ <Tool
+ Name="VCAppVerifierTool"
+ />
+ <Tool
+ Name="VCPostBuildEventTool"
+ CommandLine=""
+ />
+ </Configuration>
+ <Configuration
+ Name="Release|Win32"
+ OutputDirectory=".\..\..\..\bin\win32\Release"
+ IntermediateDirectory=".\..\..\..\obj\vp\Release"
+ ConfigurationType="2"
+ CharacterSet="1"
+ WholeProgramOptimization="1"
+ >
+ <Tool
+ Name="VCPreBuildEventTool"
+ CommandLine=""
+ />
+ <Tool
+ Name="VCCustomBuildTool"
+ CommandLine=""
+ />
+ <Tool
+ Name="MASM"
+ />
+ <Tool
+ Name="VCXMLDataGeneratorTool"
+ />
+ <Tool
+ Name="VCWebServiceProxyGeneratorTool"
+ />
+ <Tool
+ Name="VCMIDLTool"
+ />
+ <Tool
+ Name="VCCLCompilerTool"
+ Optimization="3"
+ EnableIntrinsicFunctions="false"
+ FavorSizeOrSpeed="1"
+ PreprocessorDefinitions="WIN32;NDEBUG;_WINDOWS;_USRDLL;WELSVP_EXPORTS;X86_ASM"
+ RuntimeLibrary="0"
+ EnableFunctionLevelLinking="false"
+ UsePrecompiledHeader="0"
+ WarningLevel="3"
+ DebugInformationFormat="0"
+ />
+ <Tool
+ Name="VCManagedResourceCompilerTool"
+ />
+ <Tool
+ Name="VCResourceCompilerTool"
+ />
+ <Tool
+ Name="VCPreLinkEventTool"
+ />
+ <Tool
+ Name="VCLinkerTool"
+ OutputFile="$(OutDir)\welsvp.dll"
+ LinkIncremental="1"
+ GenerateManifest="false"
+ EnableUAC="false"
+ ModuleDefinitionFile="../../src/common/WelsVP.def"
+ GenerateDebugInformation="false"
+ GenerateMapFile="false"
+ MapFileName=""
+ MapExports="false"
+ SubSystem="2"
+ OptimizeReferences="2"
+ EnableCOMDATFolding="2"
+ TargetMachine="1"
+ />
+ <Tool
+ Name="VCALinkTool"
+ />
+ <Tool
+ Name="VCManifestTool"
+ />
+ <Tool
+ Name="VCXDCMakeTool"
+ />
+ <Tool
+ Name="VCBscMakeTool"
+ />
+ <Tool
+ Name="VCFxCopTool"
+ />
+ <Tool
+ Name="VCAppVerifierTool"
+ />
+ <Tool
+ Name="VCPostBuildEventTool"
+ CommandLine=""
+ />
+ </Configuration>
+ <Configuration
+ Name="Release|x64"
+ OutputDirectory=".\..\..\..\..\bin\win64\Release"
+ IntermediateDirectory=".\..\..\..\obj\vp\Release"
+ ConfigurationType="2"
+ CharacterSet="1"
+ WholeProgramOptimization="1"
+ >
+ <Tool
+ Name="VCPreBuildEventTool"
+ CommandLine=""
+ />
+ <Tool
+ Name="VCCustomBuildTool"
+ CommandLine=""
+ />
+ <Tool
+ Name="MASM"
+ />
+ <Tool
+ Name="VCXMLDataGeneratorTool"
+ />
+ <Tool
+ Name="VCWebServiceProxyGeneratorTool"
+ />
+ <Tool
+ Name="VCMIDLTool"
+ TargetEnvironment="3"
+ />
+ <Tool
+ Name="VCCLCompilerTool"
+ Optimization="3"
+ EnableIntrinsicFunctions="false"
+ FavorSizeOrSpeed="1"
+ PreprocessorDefinitions="WIN64;NDEBUG;_WINDOWS;_USRDLL;WELSVP_EXPORTS;X86_ASM"
+ RuntimeLibrary="0"
+ EnableFunctionLevelLinking="false"
+ UsePrecompiledHeader="0"
+ WarningLevel="3"
+ DebugInformationFormat="0"
+ />
+ <Tool
+ Name="VCManagedResourceCompilerTool"
+ />
+ <Tool
+ Name="VCResourceCompilerTool"
+ />
+ <Tool
+ Name="VCPreLinkEventTool"
+ />
+ <Tool
+ Name="VCLinkerTool"
+ OutputFile="$(OutDir)\welsvp.dll"
+ LinkIncremental="1"
+ GenerateManifest="false"
+ EnableUAC="false"
+ ModuleDefinitionFile="../../src/common/WelsVP.def"
+ GenerateDebugInformation="false"
+ GenerateMapFile="false"
+ MapFileName=""
+ MapExports="false"
+ SubSystem="2"
+ OptimizeReferences="2"
+ EnableCOMDATFolding="2"
+ TargetMachine="17"
+ />
+ <Tool
+ Name="VCALinkTool"
+ />
+ <Tool
+ Name="VCManifestTool"
+ />
+ <Tool
+ Name="VCXDCMakeTool"
+ />
+ <Tool
+ Name="VCBscMakeTool"
+ />
+ <Tool
+ Name="VCFxCopTool"
+ />
+ <Tool
+ Name="VCAppVerifierTool"
+ />
+ <Tool
+ Name="VCPostBuildEventTool"
+ CommandLine=""
+ />
+ </Configuration>
+ </Configurations>
+ <References>
+ </References>
+ <Files>
+ <Filter
+ Name="Source Files"
+ Filter="cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx"
+ UniqueIdentifier="{4FC737F1-C7A5-4376-A066-2A32D752A2FF}"
+ >
+ <File
+ RelativePath="..\..\src\common\cpu.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\..\src\common\memory.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\..\src\common\thread.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\..\src\common\util.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\..\src\common\WelsFrameWork.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\..\src\common\WelsFrameWorkEx.cpp"
+ >
+ </File>
+ </Filter>
+ <Filter
+ Name="Interface"
+ Filter="h;hpp;hxx;hm;inl;inc;xsd"
+ UniqueIdentifier="{93995380-89BD-4b04-88EB-625FBE52EBFB}"
+ >
+ <File
+ RelativePath="..\..\interface\IWelsVP.h"
+ >
+ <FileConfiguration
+ Name="Release|Win32"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Release|x64"
+ >
+ <Tool
+ Name="VCCLCompilerTool"
+ />
+ </FileConfiguration>
+ </File>
+ <File
+ RelativePath="..\..\src\common\resource.h"
+ >
+ </File>
+ </Filter>
+ <Filter
+ Name="Resource Files"
+ Filter="rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav"
+ UniqueIdentifier="{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}"
+ >
+ <File
+ RelativePath="..\..\src\common\WelsVP.def"
+ >
+ </File>
+ <File
+ RelativePath="..\..\src\common\WelsVP.rc"
+ >
+ </File>
+ </Filter>
+ <Filter
+ Name="Header Files"
+ >
+ <File
+ RelativePath="..\..\src\common\cpu.h"
+ >
+ </File>
+ <File
+ RelativePath="..\..\src\common\memory.h"
+ >
+ </File>
+ <File
+ RelativePath="..\..\src\common\thread.h"
+ >
+ </File>
+ <File
+ RelativePath="..\..\src\common\typedef.h"
+ >
+ </File>
+ <File
+ RelativePath="..\..\src\common\util.h"
+ >
+ </File>
+ <File
+ RelativePath="..\..\src\common\version.h"
+ >
+ </File>
+ <File
+ RelativePath="..\..\src\common\WelsFrameWork.h"
+ >
+ </File>
+ </Filter>
+ <Filter
+ Name="ASM"
+ >
+ <File
+ RelativePath="..\..\..\common\cpuid.asm"
+ >
+ <FileConfiguration
+ Name="Debug|Win32"
+ >
+ <Tool
+ Name="VCCustomBuildTool"
+ CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ Outputs="$(IntDir)\$(InputName).obj"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Debug|x64"
+ >
+ <Tool
+ Name="VCCustomBuildTool"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
+ Outputs="$(IntDir)\$(InputName).obj"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Release|Win32"
+ >
+ <Tool
+ Name="VCCustomBuildTool"
+ CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ Outputs="$(IntDir)\$(InputName).obj"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Release|x64"
+ >
+ <Tool
+ Name="VCCustomBuildTool"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
+ Outputs="$(IntDir)\$(InputName).obj"
+ />
+ </FileConfiguration>
+ </File>
+ <File
+ RelativePath="..\..\src\asm\denoisefilter.asm"
+ >
+ <FileConfiguration
+ Name="Debug|Win32"
+ >
+ <Tool
+ Name="VCCustomBuildTool"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ Outputs="$(IntDir)\$(InputName).obj"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Debug|x64"
+ >
+ <Tool
+ Name="VCCustomBuildTool"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ Outputs="$(IntDir)\$(InputName).obj"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Release|Win32"
+ >
+ <Tool
+ Name="VCCustomBuildTool"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
+ Outputs="$(IntDir)\$(InputName).obj"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Release|x64"
+ >
+ <Tool
+ Name="VCCustomBuildTool"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ Outputs="$(IntDir)\$(InputName).obj"
+ />
+ </FileConfiguration>
+ </File>
+ <File
+ RelativePath="..\..\src\asm\downsample_bilinear.asm"
+ >
+ <FileConfiguration
+ Name="Debug|Win32"
+ >
+ <Tool
+ Name="VCCustomBuildTool"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ Outputs="$(IntDir)\$(InputName).obj"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Debug|x64"
+ >
+ <Tool
+ Name="VCCustomBuildTool"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
+ Outputs="$(IntDir)\$(InputName).obj"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Release|Win32"
+ >
+ <Tool
+ Name="VCCustomBuildTool"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ Outputs="$(IntDir)\$(InputName).obj"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Release|x64"
+ >
+ <Tool
+ Name="VCCustomBuildTool"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
+ Outputs="$(IntDir)\$(InputName).obj"
+ />
+ </FileConfiguration>
+ </File>
+ <File
+ RelativePath="..\..\src\asm\intra_pred.asm"
+ >
+ <FileConfiguration
+ Name="Debug|Win32"
+ >
+ <Tool
+ Name="VCCustomBuildTool"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
+ Outputs="$(IntDir)\$(InputName).obj"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Debug|x64"
+ >
+ <Tool
+ Name="VCCustomBuildTool"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
+ Outputs="$(IntDir)\$(InputName).obj"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Release|Win32"
+ >
+ <Tool
+ Name="VCCustomBuildTool"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
+ Outputs="$(IntDir)\$(InputName).obj"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Release|x64"
+ >
+ <Tool
+ Name="VCCustomBuildTool"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
+ Outputs="$(IntDir)\$(InputName).obj"
+ />
+ </FileConfiguration>
+ </File>
+ <File
+ RelativePath="..\..\src\asm\sad.asm"
+ >
+ <FileConfiguration
+ Name="Debug|Win32"
+ >
+ <Tool
+ Name="VCCustomBuildTool"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
+ Outputs="$(IntDir)\$(InputName).obj"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Debug|x64"
+ >
+ <Tool
+ Name="VCCustomBuildTool"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
+ Outputs="$(IntDir)\$(InputName).obj"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Release|Win32"
+ >
+ <Tool
+ Name="VCCustomBuildTool"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
+ Outputs="$(IntDir)\$(InputName).obj"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Release|x64"
+ >
+ <Tool
+ Name="VCCustomBuildTool"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
+ Outputs="$(IntDir)\$(InputName).obj"
+ />
+ </FileConfiguration>
+ </File>
+ <File
+ RelativePath="..\..\src\asm\vaa.asm"
+ >
+ <FileConfiguration
+ Name="Debug|Win32"
+ >
+ <Tool
+ Name="VCCustomBuildTool"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ Outputs="$(IntDir)\$(InputName).obj"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Debug|x64"
+ >
+ <Tool
+ Name="VCCustomBuildTool"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
+ Outputs="$(IntDir)\$(InputName).obj"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Release|Win32"
+ >
+ <Tool
+ Name="VCCustomBuildTool"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
+ Outputs="$(IntDir)\$(InputName).obj"
+ />
+ </FileConfiguration>
+ <FileConfiguration
+ Name="Release|x64"
+ >
+ <Tool
+ Name="VCCustomBuildTool"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
+ Outputs="$(IntDir)\$(InputName).obj"
+ />
+ </FileConfiguration>
+ </File>
+ </Filter>
+ <Filter
+ Name="SceneChangeDetection"
+ >
+ <File
+ RelativePath="..\..\src\scenechangedetection\SceneChangeDetection.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\..\src\scenechangedetection\SceneChangeDetection.h"
+ >
+ </File>
+ <File
+ RelativePath="..\..\src\scenechangedetection\SceneChangeDetectionCommon.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\..\src\scenechangedetection\SceneChangeDetectionCommon.h"
+ >
+ </File>
+ </Filter>
+ <Filter
+ Name="Denoise"
+ >
+ <File
+ RelativePath="..\..\src\denoise\denoise.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\..\src\denoise\denoise.h"
+ >
+ </File>
+ <File
+ RelativePath="..\..\src\denoise\denoise_filter.cpp"
+ >
+ </File>
+ </Filter>
+ <Filter
+ Name="VAACalc"
+ >
+ <File
+ RelativePath="..\..\src\vaacalc\vaacalcfuncs.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\..\src\vaacalc\vaacalculation.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\..\src\vaacalc\vaacalculation.h"
+ >
+ </File>
+ </Filter>
+ <Filter
+ Name="BackgroundDetection"
+ >
+ <File
+ RelativePath="..\..\src\backgounddetection\BackgroundDetection.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\..\src\backgounddetection\BackgroundDetection.h"
+ >
+ </File>
+ </Filter>
+ <Filter
+ Name="AdaptiveQuantization"
+ >
+ <File
+ RelativePath="..\..\src\adaptivequantization\AdaptiveQuantization.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\..\src\adaptivequantization\AdaptiveQuantization.h"
+ >
+ </File>
+ </Filter>
+ <Filter
+ Name="Downsample"
+ >
+ <File
+ RelativePath="..\..\src\downsample\downsample.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\..\src\downsample\downsample.h"
+ >
+ </File>
+ <File
+ RelativePath="..\..\src\downsample\downsamplefuncs.cpp"
+ >
+ </File>
+ </Filter>
+ <Filter
+ Name="ComplexityAnalysis"
+ >
+ <File
+ RelativePath="..\..\src\complexityanalysis\ComplexityAnalysis.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\..\src\complexityanalysis\ComplexityAnalysis.h"
+ >
+ </File>
+ </Filter>
+ <Filter
+ Name="ImageRotate"
+ >
+ <File
+ RelativePath="..\..\src\imagerotate\imagerotate.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\..\src\imagerotate\imagerotate.h"
+ >
+ </File>
+ <File
+ RelativePath="..\..\src\imagerotate\imagerotatefuncs.cpp"
+ >
+ </File>
+ </Filter>
+ </Files>
+ <Globals>
+ </Globals>
+</VisualStudioProject>
--- /dev/null
+++ b/codec/processing/build/win32/WelsVP_2010.sln
@@ -1,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 11.00
+# Visual Studio 2010
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "WelsVP_2010", "WelsVP_2010.vcxproj", "{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}"
+EndProject
+Global
+ GlobalSection(SolutionConfigurationPlatforms) = preSolution
+ Debug|Win32 = Debug|Win32
+ Release|Win32 = Release|Win32
+ EndGlobalSection
+ GlobalSection(ProjectConfigurationPlatforms) = postSolution
+ {E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Debug|Win32.ActiveCfg = Debug|Win32
+ {E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Debug|Win32.Build.0 = Debug|Win32
+ {E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Release|Win32.ActiveCfg = Release|Win32
+ {E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Release|Win32.Build.0 = Release|Win32
+ EndGlobalSection
+ GlobalSection(SolutionProperties) = preSolution
+ HideSolutionNode = FALSE
+ EndGlobalSection
+EndGlobal
binary files /dev/null b/codec/processing/build/win32/WelsVP_2010.suo differ
--- /dev/null
+++ b/codec/processing/build/win32/WelsVP_2010.vcxproj
@@ -1,0 +1,386 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup Label="ProjectConfigurations">
+ <ProjectConfiguration Include="Debug|Win32">
+ <Configuration>Debug</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Debug|x64">
+ <Configuration>Debug</Configuration>
+ <Platform>x64</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Release|Win32">
+ <Configuration>Release</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Release|x64">
+ <Configuration>Release</Configuration>
+ <Platform>x64</Platform>
+ </ProjectConfiguration>
+ </ItemGroup>
+ <PropertyGroup Label="Globals">
+ <ProjectGuid>{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}</ProjectGuid>
+ <RootNamespace>WelsVP</RootNamespace>
+ <Keyword>Win32Proj</Keyword>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+ <ConfigurationType>DynamicLibrary</ConfigurationType>
+ <CharacterSet>Unicode</CharacterSet>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+ <ConfigurationType>DynamicLibrary</ConfigurationType>
+ <CharacterSet>Unicode</CharacterSet>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+ <ConfigurationType>DynamicLibrary</ConfigurationType>
+ <CharacterSet>Unicode</CharacterSet>
+ <WholeProgramOptimization>false</WholeProgramOptimization>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+ <ConfigurationType>DynamicLibrary</ConfigurationType>
+ <CharacterSet>Unicode</CharacterSet>
+ <WholeProgramOptimization>false</WholeProgramOptimization>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+ <ImportGroup Label="ExtensionSettings">
+ <Import Project="$(VCTargetsPath)\BuildCustomizations\masm.props" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <PropertyGroup Label="UserMacros" />
+ <PropertyGroup>
+ <_ProjectFileVersion>10.0.40219.1</_ProjectFileVersion>
+ <OutDir Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">..\..\..\..\bin\win32\Debug\</OutDir>
+ <OutDir Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">..\..\..\..\bin\win64\Debug\</OutDir>
+ <IntDir Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">..\..\..\..\obj\vp\Debug\</IntDir>
+ <IntDir Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">..\..\..\..\obj\vp\Debug\</IntDir>
+ <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</LinkIncremental>
+ <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</LinkIncremental>
+ <OutDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">..\..\..\..\bin\win32\Release\</OutDir>
+ <OutDir Condition="'$(Configuration)|$(Platform)'=='Release|x64'">..\..\..\..\bin\win64\Release\</OutDir>
+ <IntDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">..\..\..\..\obj\vp\Release\</IntDir>
+ <IntDir Condition="'$(Configuration)|$(Platform)'=='Release|x64'">..\..\..\..\obj\vp\Release\</IntDir>
+ <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">false</LinkIncremental>
+ <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</LinkIncremental>
+ <GenerateManifest Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">false</GenerateManifest>
+ <GenerateManifest Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</GenerateManifest>
+ <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">AllRules.ruleset</CodeAnalysisRuleSet>
+ <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">AllRules.ruleset</CodeAnalysisRuleSet>
+ <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" />
+ <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" />
+ <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" />
+ <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" />
+ <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">AllRules.ruleset</CodeAnalysisRuleSet>
+ <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Release|x64'">AllRules.ruleset</CodeAnalysisRuleSet>
+ <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" />
+ <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Release|x64'" />
+ <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" />
+ <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Release|x64'" />
+ <TargetName Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">welsvp</TargetName>
+ <TargetName Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">welsvp</TargetName>
+ <TargetName Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">welsvp</TargetName>
+ <TargetName Condition="'$(Configuration)|$(Platform)'=='Release|x64'">welsvp</TargetName>
+ </PropertyGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <CustomBuildStep>
+ <Command>
+ </Command>
+ </CustomBuildStep>
+ <ClCompile>
+ <Optimization>Disabled</Optimization>
+ <AdditionalIncludeDirectories>%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+ <PreprocessorDefinitions>WIN32;_DEBUG;_WINDOWS;_USRDLL;WELSVP_EXPORTS;X86_ASM;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <MinimalRebuild>true</MinimalRebuild>
+ <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
+ <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+ <PrecompiledHeader>
+ </PrecompiledHeader>
+ <AssemblerListingLocation>
+ </AssemblerListingLocation>
+ <WarningLevel>Level3</WarningLevel>
+ <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
+ </ClCompile>
+ <ProjectReference>
+ <LinkLibraryDependencies>true</LinkLibraryDependencies>
+ </ProjectReference>
+ <Link>
+ <OutputFile>$(OutDir)welsvp.dll</OutputFile>
+ <ModuleDefinitionFile>../../src/common/WelsVP.def</ModuleDefinitionFile>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <GenerateMapFile>true</GenerateMapFile>
+ <MapFileName>$(OutDir)\welsvp.map</MapFileName>
+ <SubSystem>Windows</SubSystem>
+ <TargetMachine>MachineX86</TargetMachine>
+ <ProgramDatabaseFile>$(OutDir)\welsvp.pdb</ProgramDatabaseFile>
+ <ImportLibrary>$(OutDir)\welsvp.lib</ImportLibrary>
+ <ProfileGuidedDatabase>$(OutDir)\welsvp.pgd</ProfileGuidedDatabase>
+ </Link>
+ <PostBuildEvent>
+ <Command>
+ </Command>
+ </PostBuildEvent>
+ <Bscmake>
+ <OutputFile>$(OutDir)\welsvp.bsc</OutputFile>
+ </Bscmake>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ <CustomBuildStep>
+ <Command>
+ </Command>
+ </CustomBuildStep>
+ <ClCompile>
+ <Optimization>Disabled</Optimization>
+ <AdditionalIncludeDirectories>%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+ <PreprocessorDefinitions>WIN64;_DEBUG;X86_ASM;_WINDOWS;_USRDLL;WELSVP_EXPORTS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
+ <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+ <PrecompiledHeader>
+ </PrecompiledHeader>
+ <AssemblerListingLocation>
+ </AssemblerListingLocation>
+ <WarningLevel>Level3</WarningLevel>
+ <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
+ </ClCompile>
+ <ProjectReference>
+ <LinkLibraryDependencies>true</LinkLibraryDependencies>
+ </ProjectReference>
+ <Link>
+ <OutputFile>$(OutDir)welsvp.dll</OutputFile>
+ <ModuleDefinitionFile>../../src/common/WelsVP.def</ModuleDefinitionFile>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <GenerateMapFile>true</GenerateMapFile>
+ <MapFileName>$(OutDir)\welsvp.map</MapFileName>
+ <SubSystem>Windows</SubSystem>
+ <ProgramDatabaseFile>$(OutDir)\welsvp.pdb</ProgramDatabaseFile>
+ <ImportLibrary>$(OutDir)\welsvp.lib</ImportLibrary>
+ <ProfileGuidedDatabase>$(OutDir)\welsvp.pgd</ProfileGuidedDatabase>
+ </Link>
+ <PostBuildEvent>
+ <Command>
+ </Command>
+ </PostBuildEvent>
+ <Bscmake>
+ <OutputFile>$(OutDir)\welsvp.bsc</OutputFile>
+ </Bscmake>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <PreBuildEvent>
+ <Command>
+ </Command>
+ </PreBuildEvent>
+ <CustomBuildStep>
+ <Command>
+ </Command>
+ </CustomBuildStep>
+ <ClCompile>
+ <Optimization>Full</Optimization>
+ <IntrinsicFunctions>false</IntrinsicFunctions>
+ <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
+ <PreprocessorDefinitions>WIN32;NDEBUG;_WINDOWS;_USRDLL;WELSVP_EXPORTS;X86_ASM;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+ <FunctionLevelLinking>false</FunctionLevelLinking>
+ <PrecompiledHeader>
+ </PrecompiledHeader>
+ <WarningLevel>Level3</WarningLevel>
+ <DebugInformationFormat>
+ </DebugInformationFormat>
+ </ClCompile>
+ <Link>
+ <OutputFile>$(OutDir)welsvp.dll</OutputFile>
+ <EnableUAC>false</EnableUAC>
+ <ModuleDefinitionFile>../../src/common/WelsVP.def</ModuleDefinitionFile>
+ <GenerateDebugInformation>false</GenerateDebugInformation>
+ <GenerateMapFile>true</GenerateMapFile>
+ <MapFileName>$(OutDir)\welsvp.map</MapFileName>
+ <MapExports>true</MapExports>
+ <SubSystem>Windows</SubSystem>
+ <OptimizeReferences>true</OptimizeReferences>
+ <EnableCOMDATFolding>true</EnableCOMDATFolding>
+ <TargetMachine>MachineX86</TargetMachine>
+ <ImportLibrary>$(OutDir)\welsvp.lib</ImportLibrary>
+ <ProgramDatabaseFile>$(OutDir)\welsvp.pdb</ProgramDatabaseFile>
+ <ProfileGuidedDatabase>$(OutDir)\welsvp.pgd</ProfileGuidedDatabase>
+ </Link>
+ <PostBuildEvent>
+ <Command>
+ </Command>
+ </PostBuildEvent>
+ <Bscmake>
+ <OutputFile>$(OutDir)\welsvp.bsc</OutputFile>
+ </Bscmake>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> <!-- Default compiler, linker and browse-info settings for the Release|x64 configuration. -->
+ <PreBuildEvent>
+ <Command>
+ </Command>
+ </PreBuildEvent>
+ <CustomBuildStep>
+ <Command>
+ </Command>
+ </CustomBuildStep>
+ <ClCompile> <!-- Full optimization favoring speed, static release CRT; X86_ASM is defined for this 64-bit configuration. -->
+ <Optimization>Full</Optimization>
+ <IntrinsicFunctions>false</IntrinsicFunctions>
+ <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
+ <PreprocessorDefinitions>WIN64;NDEBUG;X86_ASM;_WINDOWS;_USRDLL;WELSVP_EXPORTS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+ <FunctionLevelLinking>false</FunctionLevelLinking>
+ <PrecompiledHeader>
+ </PrecompiledHeader>
+ <WarningLevel>Level3</WarningLevel>
+ <DebugInformationFormat>
+ </DebugInformationFormat>
+ </ClCompile>
+ <Link> <!-- Produces welsvp.dll using the module definition file WelsVP.def; a map file with exports is written and no debug information is generated. -->
+ <OutputFile>$(OutDir)welsvp.dll</OutputFile>
+ <EnableUAC>false</EnableUAC>
+ <ModuleDefinitionFile>../../src/common/WelsVP.def</ModuleDefinitionFile>
+ <GenerateDebugInformation>false</GenerateDebugInformation>
+ <GenerateMapFile>true</GenerateMapFile>
+ <MapFileName>$(OutDir)\welsvp.map</MapFileName>
+ <MapExports>true</MapExports>
+ <SubSystem>Windows</SubSystem>
+ <OptimizeReferences>true</OptimizeReferences>
+ <EnableCOMDATFolding>true</EnableCOMDATFolding>
+ <ImportLibrary>$(OutDir)\welsvp.lib</ImportLibrary>
+ <ProgramDatabaseFile>$(OutDir)\welsvp.pdb</ProgramDatabaseFile>
+ <ProfileGuidedDatabase>$(OutDir)\welsvp.pgd</ProfileGuidedDatabase>
+ </Link>
+ <PostBuildEvent>
+ <Command>
+ </Command>
+ </PostBuildEvent>
+ <Bscmake>
+ <OutputFile>$(OutDir)\welsvp.bsc</OutputFile>
+ </Bscmake>
+ </ItemDefinitionGroup>
+ <ItemGroup> <!-- C++ source files compiled into the project (ClCompile items). -->
+ <ClCompile Include="..\..\src\common\cpu.cpp" />
+ <ClCompile Include="..\..\src\common\memory.cpp" />
+ <ClCompile Include="..\..\src\common\thread.cpp" />
+ <ClCompile Include="..\..\src\common\util.cpp" />
+ <ClCompile Include="..\..\src\common\WelsFrameWork.cpp" />
+ <ClCompile Include="..\..\src\common\WelsFrameWorkEx.cpp" />
+ <ClCompile Include="..\..\src\scenechangedetection\SceneChangeDetection.cpp" />
+ <ClCompile Include="..\..\src\scenechangedetection\SceneChangeDetectionCommon.cpp" />
+ <ClCompile Include="..\..\src\denoise\denoise.cpp" />
+ <ClCompile Include="..\..\src\denoise\denoise_filter.cpp" />
+ <ClCompile Include="..\..\src\vaacalc\vaacalcfuncs.cpp" />
+ <ClCompile Include="..\..\src\vaacalc\vaacalculation.cpp" />
+ <ClCompile Include="..\..\src\backgounddetection\BackgroundDetection.cpp" /> <!-- NOTE(review): folder is spelled backgounddetection here; presumably it matches the directory on disk, confirm before renaming. -->
+ <ClCompile Include="..\..\src\adaptivequantization\AdaptiveQuantization.cpp" />
+ <ClCompile Include="..\..\src\downsample\downsample.cpp" />
+ <ClCompile Include="..\..\src\downsample\downsamplefuncs.cpp" />
+ <ClCompile Include="..\..\src\complexityanalysis\ComplexityAnalysis.cpp" />
+ <ClCompile Include="..\..\src\imagerotate\imagerotate.cpp" />
+ <ClCompile Include="..\..\src\imagerotate\imagerotatefuncs.cpp" />
+ </ItemGroup>
+ <ItemGroup> <!-- Header files shown in the IDE; every entry is a ClInclude item so none of them is handed to the compiler as a source file. -->
+ <ClInclude Include="..\..\interface\IWelsVP.h" /> <!-- Public interface header: listed as ClInclude, not ClCompile, because it is not a translation unit. -->
+ <ClInclude Include="..\..\src\common\resource.h" />
+ <ClInclude Include="..\..\src\common\cpu.h" />
+ <ClInclude Include="..\..\src\common\memory.h" />
+ <ClInclude Include="..\..\src\common\thread.h" />
+ <ClInclude Include="..\..\src\common\typedef.h" />
+ <ClInclude Include="..\..\src\common\util.h" />
+ <ClInclude Include="..\..\src\common\version.h" />
+ <ClInclude Include="..\..\src\common\WelsFrameWork.h" />
+ <ClInclude Include="..\..\src\scenechangedetection\SceneChangeDetection.h" />
+ <ClInclude Include="..\..\src\scenechangedetection\SceneChangeDetectionCommon.h" />
+ <ClInclude Include="..\..\src\denoise\denoise.h" />
+ <ClInclude Include="..\..\src\vaacalc\vaacalculation.h" />
+ <ClInclude Include="..\..\src\backgounddetection\BackgroundDetection.h" />
+ <ClInclude Include="..\..\src\adaptivequantization\AdaptiveQuantization.h" />
+ <ClInclude Include="..\..\src\downsample\downsample.h" />
+ <ClInclude Include="..\..\src\complexityanalysis\ComplexityAnalysis.h" />
+ <ClInclude Include="..\..\src\imagerotate\imagerotate.h" />
+ </ItemGroup>
+ <ItemGroup> <!-- Module definition file, listed as a non-build item; the Link settings name the same file through ModuleDefinitionFile. -->
+ <None Include="..\..\src\common\WelsVP.def" />
+ </ItemGroup>
+ <ItemGroup> <!-- Resource script for the DLL (ResourceCompile item). -->
+ <ResourceCompile Include="..\..\src\common\WelsVP.rc" />
+ </ItemGroup>
+ <ItemGroup> <!-- Assembly sources built by per-file custom steps that run nasm. Win32 configurations emit win32 objects with X86_32 and PREFIX defined; x64 configurations emit win64 objects with WIN64 defined. Each step declares $(IntDir)%(Filename).obj as its output. -->
+ <CustomBuild Include="..\..\src\asm\denoisefilter.asm">
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ </CustomBuild>
+ <CustomBuild Include="..\..\src\asm\downsample_bilinear.asm">
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ </CustomBuild>
+ <CustomBuild Include="..\..\src\asm\intra_pred.asm">
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ </CustomBuild>
+ <CustomBuild Include="..\..\src\asm\sad.asm">
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ </CustomBuild>
+ <CustomBuild Include="..\..\src\asm\vaa.asm">
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ </CustomBuild>
+ </ItemGroup>
+ <ItemGroup> <!-- cpuid.asm is taken from the common directory three levels up rather than from this project's src\asm folder; it is assembled with the same nasm options as the other assembly sources. -->
+ <CustomBuild Include="..\..\..\common\cpuid.asm">
+ <FileType>Document</FileType>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win32 -DX86_32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I ..\..\..\common\ -I%(RootDir)%(Directory) -f win64 -DWIN64 -o $(IntDir)%(Filename).obj %(FullPath)</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ </CustomBuild>
+ </ItemGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+ <ImportGroup Label="ExtensionTargets">
+ <Import Project="$(VCTargetsPath)\BuildCustomizations\masm.targets" />
+ </ImportGroup>
+</Project>
\ No newline at end of file
--- /dev/null
+++ b/codec/processing/build/win32/WelsVP_2010.vcxproj.filters
@@ -1,0 +1,162 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup> <!-- Solution Explorer folder assignment for source files; the interface header at the top is a ClInclude item filed under headers. -->
+ <ClInclude Include="..\..\interface\IWelsVP.h"> <!-- Header, so it is a ClInclude item rather than ClCompile. -->
+ <Filter>headers</Filter>
+ </ClInclude>
+ <ClCompile Include="..\..\src\adaptivequantization\AdaptiveQuantization.cpp">
+ <Filter>sources</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\src\common\util.cpp">
+ <Filter>sources</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\src\backgounddetection\BackgroundDetection.cpp">
+ <Filter>sources</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\src\complexityanalysis\ComplexityAnalysis.cpp">
+ <Filter>sources</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\src\common\cpu.cpp">
+ <Filter>sources</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\src\denoise\denoise.cpp">
+ <Filter>sources</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\src\denoise\denoise_filter.cpp">
+ <Filter>sources</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\src\downsample\downsample.cpp">
+ <Filter>sources</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\src\downsample\downsamplefuncs.cpp">
+ <Filter>sources</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\src\imagerotate\imagerotate.cpp">
+ <Filter>sources</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\src\imagerotate\imagerotatefuncs.cpp">
+ <Filter>sources</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\src\common\memory.cpp">
+ <Filter>sources</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\src\scenechangedetection\SceneChangeDetection.cpp">
+ <Filter>sources</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\src\scenechangedetection\SceneChangeDetectionCommon.cpp">
+ <Filter>sources</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\src\common\thread.cpp">
+ <Filter>sources</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\src\common\WelsFrameWorkEx.cpp">
+ <Filter>sources</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\src\vaacalc\vaacalcfuncs.cpp">
+ <Filter>sources</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\src\vaacalc\vaacalculation.cpp">
+ <Filter>sources</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\src\common\WelsFrameWork.cpp">
+ <Filter>sources</Filter>
+ </ClCompile>
+ </ItemGroup>
+ <ItemGroup> <!-- Assigns every ClInclude header of the project to the headers filter. -->
+ <ClInclude Include="..\..\src\adaptivequantization\AdaptiveQuantization.h">
+ <Filter>headers</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\src\backgounddetection\BackgroundDetection.h">
+ <Filter>headers</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\src\complexityanalysis\ComplexityAnalysis.h">
+ <Filter>headers</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\src\common\cpu.h">
+ <Filter>headers</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\src\denoise\denoise.h">
+ <Filter>headers</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\src\downsample\downsample.h">
+ <Filter>headers</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\src\imagerotate\imagerotate.h">
+ <Filter>headers</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\src\common\memory.h">
+ <Filter>headers</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\src\common\resource.h">
+ <Filter>headers</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\src\scenechangedetection\SceneChangeDetection.h">
+ <Filter>headers</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\src\scenechangedetection\SceneChangeDetectionCommon.h">
+ <Filter>headers</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\src\common\thread.h">
+ <Filter>headers</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\src\common\typedef.h">
+ <Filter>headers</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\src\common\util.h">
+ <Filter>headers</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\src\vaacalc\vaacalculation.h">
+ <Filter>headers</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\src\common\version.h">
+ <Filter>headers</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\src\common\WelsFrameWork.h">
+ <Filter>headers</Filter>
+ </ClInclude>
+ </ItemGroup>
+ <ItemGroup> <!-- Assigns the custom-built assembly sources, including cpuid.asm from the common directory, to the ASM filter. -->
+ <CustomBuild Include="..\..\src\asm\denoisefilter.asm">
+ <Filter>ASM</Filter>
+ </CustomBuild>
+ <CustomBuild Include="..\..\src\asm\downsample_bilinear.asm">
+ <Filter>ASM</Filter>
+ </CustomBuild>
+ <CustomBuild Include="..\..\src\asm\intra_pred.asm">
+ <Filter>ASM</Filter>
+ </CustomBuild>
+ <CustomBuild Include="..\..\src\asm\sad.asm">
+ <Filter>ASM</Filter>
+ </CustomBuild>
+ <CustomBuild Include="..\..\src\asm\vaa.asm">
+ <Filter>ASM</Filter>
+ </CustomBuild>
+ <CustomBuild Include="..\..\..\common\cpuid.asm">
+ <Filter>ASM</Filter>
+ </CustomBuild>
+ </ItemGroup>
+ <ItemGroup> <!-- Declares the four filters (ASM, headers, sources, resources) that the other item groups in this file refer to by name; each carries its own unique identifier. -->
+ <Filter Include="ASM">
+ <UniqueIdentifier>{ecef07b7-65e1-45c4-9afc-39f7b07992a2}</UniqueIdentifier>
+ </Filter>
+ <Filter Include="headers">
+ <UniqueIdentifier>{be24742a-75fa-49a4-b77e-a69d626d46c8}</UniqueIdentifier>
+ </Filter>
+ <Filter Include="sources">
+ <UniqueIdentifier>{9f4c2bd3-e8d2-4276-adc6-273c0031971a}</UniqueIdentifier>
+ </Filter>
+ <Filter Include="resources">
+ <UniqueIdentifier>{322f1cbe-435f-402b-8d86-71d023d5d407}</UniqueIdentifier>
+ </Filter>
+ </ItemGroup>
+ <ItemGroup> <!-- The module definition file is filed under the resources filter. -->
+ <None Include="..\..\src\common\WelsVP.def">
+ <Filter>resources</Filter>
+ </None>
+ </ItemGroup>
+ <ItemGroup> <!-- The resource script is filed under the resources filter as well. -->
+ <ResourceCompile Include="..\..\src\common\WelsVP.rc">
+ <Filter>resources</Filter>
+ </ResourceCompile>
+ </ItemGroup>
+</Project>
\ No newline at end of file
--- /dev/null
+++ b/codec/processing/build/win32/WelsVP_2012.sln
@@ -1,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 2012
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "WelsVP_2012", "WelsVP_2012.vcxproj", "{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}"
+EndProject
+Global
+ GlobalSection(SolutionConfigurationPlatforms) = preSolution
+ Debug|Win32 = Debug|Win32
+ Release|Win32 = Release|Win32
+ EndGlobalSection
+ GlobalSection(ProjectConfigurationPlatforms) = postSolution
+ {E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Debug|Win32.ActiveCfg = Debug|Win32
+ {E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Debug|Win32.Build.0 = Debug|Win32
+ {E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Release|Win32.ActiveCfg = Release|Win32
+ {E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Release|Win32.Build.0 = Release|Win32
+ EndGlobalSection
+ GlobalSection(SolutionProperties) = preSolution
+ HideSolutionNode = FALSE
+ EndGlobalSection
+EndGlobal
binary files /dev/null b/codec/processing/build/win32/WelsVP_2012.v11.suo differ
--- /dev/null
+++ b/codec/processing/build/win32/WelsVP_2012.vcxproj
@@ -1,0 +1,427 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup Label="ProjectConfigurations"> <!-- Declares the four configuration and platform pairs of this project: Debug and Release, each for Win32 and x64. -->
+ <ProjectConfiguration Include="Debug|Win32">
+ <Configuration>Debug</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Debug|x64">
+ <Configuration>Debug</Configuration>
+ <Platform>x64</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Release|Win32">
+ <Configuration>Release</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Release|x64">
+ <Configuration>Release</Configuration>
+ <Platform>x64</Platform>
+ </ProjectConfiguration>
+ </ItemGroup>
+ <PropertyGroup Label="Globals"> <!-- Project identity: the GUID here is the one the accompanying WelsVP_2012.sln uses to reference this project. -->
+ <ProjectGuid>{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}</ProjectGuid>
+ <RootNamespace>WelsVP</RootNamespace>
+ <Keyword>Win32Proj</Keyword>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration"> <!-- All four configurations build a DLL with the v110 toolset and the Unicode character set; whole program optimization is on for Release and off for Debug. -->
+ <ConfigurationType>DynamicLibrary</ConfigurationType>
+ <PlatformToolset>v110</PlatformToolset>
+ <CharacterSet>Unicode</CharacterSet>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+ <ConfigurationType>DynamicLibrary</ConfigurationType>
+ <PlatformToolset>v110</PlatformToolset>
+ <CharacterSet>Unicode</CharacterSet>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+ <ConfigurationType>DynamicLibrary</ConfigurationType>
+ <PlatformToolset>v110</PlatformToolset>
+ <CharacterSet>Unicode</CharacterSet>
+ <WholeProgramOptimization>false</WholeProgramOptimization>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+ <ConfigurationType>DynamicLibrary</ConfigurationType>
+ <PlatformToolset>v110</PlatformToolset>
+ <CharacterSet>Unicode</CharacterSet>
+ <WholeProgramOptimization>false</WholeProgramOptimization>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+ <ImportGroup Label="ExtensionSettings">
+ <Import Project="$(VCTargetsPath)\BuildCustomizations\masm.props" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <PropertyGroup Label="UserMacros" />
+ <PropertyGroup>
+ <_ProjectFileVersion>11.0.61030.0</_ProjectFileVersion>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> <!-- Per-configuration output (OutDir) and intermediate (IntDir) directories. Binaries go to bin\win32 or bin\win64; the x64 configurations use a win64 subfolder under obj\vp so 32-bit and 64-bit object files never share a directory. -->
+ <OutDir>.\..\..\..\bin\win32\Debug\</OutDir>
+ <IntDir>.\..\..\..\obj\vp\Debug\</IntDir>
+ <LinkIncremental>true</LinkIncremental>
+ <TargetName>welsvp</TargetName>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+ <LinkIncremental>true</LinkIncremental>
+ <TargetName>welsvp</TargetName>
+ <OutDir>.\..\..\..\bin\win64\Debug\</OutDir>
+ <IntDir>.\..\..\..\obj\vp\win64\Debug\</IntDir> <!-- Separate from the Win32 Debug intermediates to avoid stale objects of the wrong machine type. -->
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <OutDir>.\..\..\..\bin\win32\Release\</OutDir>
+ <IntDir>.\..\..\..\obj\vp\Release\</IntDir>
+ <LinkIncremental>false</LinkIncremental>
+ <GenerateManifest>false</GenerateManifest>
+ <TargetName>welsvp</TargetName>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+ <LinkIncremental>false</LinkIncremental>
+ <GenerateManifest>false</GenerateManifest>
+ <TargetName>welsvp</TargetName>
+ <OutDir>.\..\..\..\bin\win64\Release\</OutDir>
+ <IntDir>.\..\..\..\obj\vp\win64\Release\</IntDir> <!-- Separate from the Win32 Release intermediates to avoid stale objects of the wrong machine type. -->
+ </PropertyGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'"> <!-- Default compiler, linker and browse-info settings for the Debug|Win32 configuration. -->
+ <CustomBuildStep>
+ <Command />
+ </CustomBuildStep>
+ <ClCompile> <!-- Unoptimized build with fast runtime checks, the debug static CRT, Edit and Continue debug info, and X86_ASM defined. -->
+ <Optimization>Disabled</Optimization>
+ <PreprocessorDefinitions>WIN32;_DEBUG;_WINDOWS;_USRDLL;WELSVP_EXPORTS;X86_ASM;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <MinimalRebuild>true</MinimalRebuild>
+ <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
+ <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+ <PrecompiledHeader />
+ <AssemblerListingLocation />
+ <WarningLevel>Level3</WarningLevel>
+ <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
+ </ClCompile>
+ <ProjectReference>
+ <LinkLibraryDependencies>true</LinkLibraryDependencies>
+ </ProjectReference>
+ <Link> <!-- Produces welsvp.dll for x86 using the module definition file WelsVP.def, with debug information and a map file. -->
+ <OutputFile>$(OutDir)\welsvp.dll</OutputFile>
+ <ModuleDefinitionFile>../../src/common/WelsVP.def</ModuleDefinitionFile>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <GenerateMapFile>true</GenerateMapFile>
+ <MapFileName>$(OutDir)\welsvp.map</MapFileName>
+ <SubSystem>Windows</SubSystem>
+ <TargetMachine>MachineX86</TargetMachine>
+ <ProgramDatabaseFile>$(OutDir)\welsvp.pdb</ProgramDatabaseFile>
+ <ImportLibrary>$(OutDir)\welsvp.lib</ImportLibrary>
+ <ProfileGuidedDatabase>$(OutDir)\welsvp.pgd</ProfileGuidedDatabase>
+ </Link>
+ <PostBuildEvent>
+ <Command>
+ </Command>
+ </PostBuildEvent>
+ <Bscmake>
+ <OutputFile>$(OutDir)\welsvp.bsc</OutputFile>
+ </Bscmake>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'"> <!-- Default compiler, linker and browse-info settings for the Debug|x64 configuration. -->
+ <CustomBuildStep>
+ <Command>
+ </Command>
+ </CustomBuildStep>
+ <ClCompile> <!-- Unoptimized build with fast runtime checks and the debug static CRT; unlike the Win32 configurations, X86_ASM is not defined here. -->
+ <Optimization>Disabled</Optimization>
+ <PreprocessorDefinitions>WIN64;_DEBUG;_WINDOWS;_USRDLL;WELSVP_EXPORTS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
+ <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+ <PrecompiledHeader>
+ </PrecompiledHeader>
+ <AssemblerListingLocation>
+ </AssemblerListingLocation>
+ <WarningLevel>Level3</WarningLevel>
+ <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
+ </ClCompile>
+ <ProjectReference>
+ <LinkLibraryDependencies>true</LinkLibraryDependencies>
+ </ProjectReference>
+ <Link> <!-- Produces welsvp.dll using the module definition file WelsVP.def, with debug information and a map file; no TargetMachine is set in this group. -->
+ <OutputFile>$(OutDir)\welsvp.dll</OutputFile>
+ <ModuleDefinitionFile>../../src/common/WelsVP.def</ModuleDefinitionFile>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <GenerateMapFile>true</GenerateMapFile>
+ <MapFileName>$(OutDir)\welsvp.map</MapFileName>
+ <SubSystem>Windows</SubSystem>
+ <ProgramDatabaseFile>$(OutDir)\welsvp.pdb</ProgramDatabaseFile>
+ <ImportLibrary>$(OutDir)\welsvp.lib</ImportLibrary>
+ <ProfileGuidedDatabase>$(OutDir)\welsvp.pgd</ProfileGuidedDatabase>
+ </Link>
+ <PostBuildEvent>
+ <Command>
+ </Command>
+ </PostBuildEvent>
+ <Bscmake>
+ <OutputFile>$(OutDir)\welsvp.bsc</OutputFile>
+ </Bscmake>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'"> <!-- Default compiler, linker and browse-info settings for the Release|Win32 configuration. -->
+ <PreBuildEvent>
+ <Command>
+ </Command>
+ </PreBuildEvent>
+ <CustomBuildStep>
+ <Command />
+ </CustomBuildStep>
+ <ClCompile> <!-- Full optimization favoring speed, static release CRT, X86_ASM defined, no debug information format. -->
+ <Optimization>Full</Optimization>
+ <IntrinsicFunctions>false</IntrinsicFunctions>
+ <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
+ <PreprocessorDefinitions>WIN32;NDEBUG;_WINDOWS;_USRDLL;WELSVP_EXPORTS;X86_ASM;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+ <FunctionLevelLinking>false</FunctionLevelLinking>
+ <PrecompiledHeader />
+ <WarningLevel>Level3</WarningLevel>
+ <DebugInformationFormat />
+ </ClCompile>
+ <Link> <!-- Produces welsvp.dll for x86 using the module definition file WelsVP.def; a map file with exports is written and no debug information is generated. -->
+ <OutputFile>$(OutDir)\welsvp.dll</OutputFile>
+ <EnableUAC>false</EnableUAC>
+ <ModuleDefinitionFile>../../src/common/WelsVP.def</ModuleDefinitionFile>
+ <GenerateDebugInformation>false</GenerateDebugInformation>
+ <GenerateMapFile>true</GenerateMapFile>
+ <MapFileName>$(OutDir)\welsvp.map</MapFileName>
+ <MapExports>true</MapExports>
+ <SubSystem>Windows</SubSystem>
+ <OptimizeReferences>true</OptimizeReferences>
+ <EnableCOMDATFolding>true</EnableCOMDATFolding>
+ <TargetMachine>MachineX86</TargetMachine>
+ <ProgramDatabaseFile>$(OutDir)\welsvp.pdb</ProgramDatabaseFile>
+ <ImportLibrary>$(OutDir)\welsvp.lib</ImportLibrary>
+ <ProfileGuidedDatabase>$(OutDir)\welsvp.pgd</ProfileGuidedDatabase>
+ </Link>
+ <PostBuildEvent>
+ <Command>
+ </Command>
+ </PostBuildEvent>
+ <Bscmake>
+ <OutputFile>$(OutDir)\welsvp.bsc</OutputFile>
+ </Bscmake>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'"> <!-- Default compiler, linker and browse-info settings for the Release|x64 configuration. -->
+ <PreBuildEvent>
+ <Command>
+ </Command>
+ </PreBuildEvent>
+ <CustomBuildStep>
+ <Command>
+ </Command>
+ </CustomBuildStep>
+ <ClCompile> <!-- Full optimization favoring speed with the static release CRT; unlike the Win32 configurations, X86_ASM is not defined here. -->
+ <Optimization>Full</Optimization>
+ <IntrinsicFunctions>false</IntrinsicFunctions>
+ <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
+ <PreprocessorDefinitions>WIN64;NDEBUG;_WINDOWS;_USRDLL;WELSVP_EXPORTS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+ <FunctionLevelLinking>false</FunctionLevelLinking>
+ <PrecompiledHeader>
+ </PrecompiledHeader>
+ <WarningLevel>Level3</WarningLevel>
+ <DebugInformationFormat>
+ </DebugInformationFormat>
+ </ClCompile>
+ <Link> <!-- Produces welsvp.dll using the module definition file WelsVP.def; a map file with exports is written and no debug information is generated. -->
+ <OutputFile>$(OutDir)\welsvp.dll</OutputFile>
+ <EnableUAC>false</EnableUAC>
+ <ModuleDefinitionFile>../../src/common/WelsVP.def</ModuleDefinitionFile>
+ <GenerateDebugInformation>false</GenerateDebugInformation>
+ <GenerateMapFile>true</GenerateMapFile>
+ <MapFileName>$(OutDir)\welsvp.map</MapFileName>
+ <MapExports>true</MapExports>
+ <SubSystem>Windows</SubSystem>
+ <OptimizeReferences>true</OptimizeReferences>
+ <EnableCOMDATFolding>true</EnableCOMDATFolding>
+ <ProgramDatabaseFile>$(OutDir)\welsvp.pdb</ProgramDatabaseFile>
+ <ImportLibrary>$(OutDir)\welsvp.lib</ImportLibrary>
+ <ProfileGuidedDatabase>$(OutDir)\welsvp.pgd</ProfileGuidedDatabase>
+ </Link>
+ <PostBuildEvent>
+ <Command>
+ </Command>
+ </PostBuildEvent>
+ <Bscmake>
+ <OutputFile>$(OutDir)\welsvp.bsc</OutputFile>
+ </Bscmake>
+ </ItemDefinitionGroup>
+ <ItemGroup> <!-- C++ source files compiled into the project (ClCompile items). -->
+ <ClCompile Include="..\..\src\common\cpu.cpp" />
+ <ClCompile Include="..\..\src\common\memory.cpp" />
+ <ClCompile Include="..\..\src\common\thread.cpp" />
+ <ClCompile Include="..\..\src\common\util.cpp" />
+ <ClCompile Include="..\..\src\common\WelsFrameWork.cpp" />
+ <ClCompile Include="..\..\src\common\WelsFrameWorkEx.cpp" />
+ <ClCompile Include="..\..\src\scenechangedetection\SceneChangeDetection.cpp" />
+ <ClCompile Include="..\..\src\scenechangedetection\SceneChangeDetectionCommon.cpp" />
+ <ClCompile Include="..\..\src\denoise\denoise.cpp" />
+ <ClCompile Include="..\..\src\denoise\denoise_filter.cpp" />
+ <ClCompile Include="..\..\src\vaacalc\vaacalcfuncs.cpp" />
+ <ClCompile Include="..\..\src\vaacalc\vaacalculation.cpp" />
+ <ClCompile Include="..\..\src\backgounddetection\BackgroundDetection.cpp" /> <!-- NOTE(review): folder is spelled backgounddetection here; presumably it matches the directory on disk, confirm before renaming. -->
+ <ClCompile Include="..\..\src\adaptivequantization\AdaptiveQuantization.cpp" />
+ <ClCompile Include="..\..\src\downsample\downsample.cpp" />
+ <ClCompile Include="..\..\src\downsample\downsamplefuncs.cpp" />
+ <ClCompile Include="..\..\src\complexityanalysis\ComplexityAnalysis.cpp" />
+ <ClCompile Include="..\..\src\imagerotate\imagerotate.cpp" />
+ <ClCompile Include="..\..\src\imagerotate\imagerotatefuncs.cpp" />
+ </ItemGroup>
+ <ItemGroup> <!-- Header files shown in the IDE; every entry is a ClInclude item so none of them is handed to the compiler as a source file. -->
+ <ClInclude Include="..\..\interface\IWelsVP.h" /> <!-- Public interface header: listed as ClInclude, not ClCompile, because it is not a translation unit. -->
+ <ClInclude Include="..\..\src\common\resource.h" />
+ <ClInclude Include="..\..\src\common\cpu.h" />
+ <ClInclude Include="..\..\src\common\memory.h" />
+ <ClInclude Include="..\..\src\common\thread.h" />
+ <ClInclude Include="..\..\src\common\typedef.h" />
+ <ClInclude Include="..\..\src\common\util.h" />
+ <ClInclude Include="..\..\src\common\version.h" />
+ <ClInclude Include="..\..\src\common\WelsFrameWork.h" />
+ <ClInclude Include="..\..\src\scenechangedetection\SceneChangeDetection.h" />
+ <ClInclude Include="..\..\src\scenechangedetection\SceneChangeDetectionCommon.h" />
+ <ClInclude Include="..\..\src\denoise\denoise.h" />
+ <ClInclude Include="..\..\src\vaacalc\vaacalculation.h" />
+ <ClInclude Include="..\..\src\backgounddetection\BackgroundDetection.h" />
+ <ClInclude Include="..\..\src\adaptivequantization\AdaptiveQuantization.h" />
+ <ClInclude Include="..\..\src\downsample\downsample.h" />
+ <ClInclude Include="..\..\src\complexityanalysis\ComplexityAnalysis.h" />
+ <ClInclude Include="..\..\src\imagerotate\imagerotate.h" />
+ </ItemGroup>
+ <ItemGroup> <!-- Module definition file, listed as a non-build item; the Link settings name the same file through ModuleDefinitionFile. -->
+ <None Include="..\..\src\common\WelsVP.def" />
+ </ItemGroup>
+ <ItemGroup> <!-- Resource script for the DLL (ResourceCompile item). -->
+ <ResourceCompile Include="..\..\src\common\WelsVP.rc" />
+ </ItemGroup>
+ <ItemGroup>
+ <CustomBuild Include="..\..\src\asm\asm_inc.asm"> <!-- Custom nasm step emitting a win32 object with PREFIX defined. The x64 commands also say win32, but the item is excluded from the build for both x64 configurations (see ExcludedFromBuild below). NOTE(review): this path still points at src\asm while the 2010 project adds the common directory as a nasm include path; confirm the file still exists here. -->
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild> <!-- Skipped entirely for x64 Debug. -->
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild> <!-- Skipped entirely for x64 Release. -->
+ </CustomBuild>
+ <CustomBuild Include="..\..\src\asm\cpuid.asm">
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+ </CustomBuild>
+ <CustomBuild Include="..\..\src\asm\denoisefilter.asm">
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+ </CustomBuild>
+ <CustomBuild Include="..\..\src\asm\downsample_bilinear.asm">
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+ </CustomBuild>
+ <CustomBuild Include="..\..\src\asm\intra_pred.asm">
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+ </CustomBuild>
+ <CustomBuild Include="..\..\src\asm\sad.asm">
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+ </CustomBuild>
+ <CustomBuild Include="..\..\src\asm\vaa.asm">
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+ <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
+</Command>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
+ <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
+ </CustomBuild>
+ </ItemGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+ <ImportGroup Label="ExtensionTargets">
+ <Import Project="$(VCTargetsPath)\BuildCustomizations\masm.targets" />
+ </ImportGroup>
+</Project>
\ No newline at end of file
--- /dev/null
+++ b/codec/processing/build/win32/WelsVP_2012.vcxproj.filters
@@ -1,0 +1,165 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup>
+ <ClCompile Include="..\..\interface\IWelsVP.h">
+ <Filter>headers</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\src\adaptivequantization\AdaptiveQuantization.cpp">
+ <Filter>sources</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\src\backgounddetection\BackgroundDetection.cpp">
+ <Filter>sources</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\src\complexityanalysis\ComplexityAnalysis.cpp">
+ <Filter>sources</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\src\common\cpu.cpp">
+ <Filter>sources</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\src\denoise\denoise.cpp">
+ <Filter>sources</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\src\denoise\denoise_filter.cpp">
+ <Filter>sources</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\src\downsample\downsample.cpp">
+ <Filter>sources</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\src\downsample\downsamplefuncs.cpp">
+ <Filter>sources</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\src\imagerotate\imagerotate.cpp">
+ <Filter>sources</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\src\imagerotate\imagerotatefuncs.cpp">
+ <Filter>sources</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\src\common\memory.cpp">
+ <Filter>sources</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\src\scenechangedetection\SceneChangeDetection.cpp">
+ <Filter>sources</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\src\scenechangedetection\SceneChangeDetectionCommon.cpp">
+ <Filter>sources</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\src\common\thread.cpp">
+ <Filter>sources</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\src\common\util.cpp">
+ <Filter>sources</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\src\vaacalc\vaacalcfuncs.cpp">
+ <Filter>sources</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\src\vaacalc\vaacalculation.cpp">
+ <Filter>sources</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\src\common\WelsFrameWork.cpp">
+ <Filter>sources</Filter>
+ </ClCompile>
+ <ClCompile Include="..\..\src\common\WelsFrameWorkEx.cpp">
+ <Filter>sources</Filter>
+ </ClCompile>
+ </ItemGroup>
+ <ItemGroup>
+ <ClInclude Include="..\..\src\adaptivequantization\AdaptiveQuantization.h">
+ <Filter>headers</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\src\backgounddetection\BackgroundDetection.h">
+ <Filter>headers</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\src\complexityanalysis\ComplexityAnalysis.h">
+ <Filter>headers</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\src\common\cpu.h">
+ <Filter>headers</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\src\denoise\denoise.h">
+ <Filter>headers</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\src\downsample\downsample.h">
+ <Filter>headers</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\src\imagerotate\imagerotate.h">
+ <Filter>headers</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\src\common\memory.h">
+ <Filter>headers</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\src\common\resource.h">
+ <Filter>headers</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\src\scenechangedetection\SceneChangeDetection.h">
+ <Filter>headers</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\src\scenechangedetection\SceneChangeDetectionCommon.h">
+ <Filter>headers</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\src\common\thread.h">
+ <Filter>headers</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\src\common\typedef.h">
+ <Filter>headers</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\src\common\util.h">
+ <Filter>headers</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\src\vaacalc\vaacalculation.h">
+ <Filter>headers</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\src\common\version.h">
+ <Filter>headers</Filter>
+ </ClInclude>
+ <ClInclude Include="..\..\src\common\WelsFrameWork.h">
+ <Filter>headers</Filter>
+ </ClInclude>
+ </ItemGroup>
+ <ItemGroup>
+ <CustomBuild Include="..\..\src\asm\asm_inc.asm">
+ <Filter>ASM</Filter>
+ </CustomBuild>
+ <CustomBuild Include="..\..\src\asm\cpuid.asm">
+ <Filter>ASM</Filter>
+ </CustomBuild>
+ <CustomBuild Include="..\..\src\asm\denoisefilter.asm">
+ <Filter>ASM</Filter>
+ </CustomBuild>
+ <CustomBuild Include="..\..\src\asm\downsample_bilinear.asm">
+ <Filter>ASM</Filter>
+ </CustomBuild>
+ <CustomBuild Include="..\..\src\asm\intra_pred.asm">
+ <Filter>ASM</Filter>
+ </CustomBuild>
+ <CustomBuild Include="..\..\src\asm\sad.asm">
+ <Filter>ASM</Filter>
+ </CustomBuild>
+ <CustomBuild Include="..\..\src\asm\vaa.asm">
+ <Filter>ASM</Filter>
+ </CustomBuild>
+ </ItemGroup>
+ <ItemGroup>
+ <Filter Include="ASM">
+ <UniqueIdentifier>{18a2a593-cf54-452e-bf69-5eaf9aac6518}</UniqueIdentifier>
+ </Filter>
+ <Filter Include="headers">
+ <UniqueIdentifier>{5a921557-4f54-4838-80de-8c517b1d099b}</UniqueIdentifier>
+ </Filter>
+ <Filter Include="sources">
+ <UniqueIdentifier>{0b628696-109b-4a2e-b11f-5e9e006b76ae}</UniqueIdentifier>
+ </Filter>
+ <Filter Include="resources">
+ <UniqueIdentifier>{94dba5f3-1b39-4ccd-891b-6a70cb59f210}</UniqueIdentifier>
+ </Filter>
+ </ItemGroup>
+ <ItemGroup>
+ <ResourceCompile Include="..\..\src\common\WelsVP.rc">
+ <Filter>resources</Filter>
+ </ResourceCompile>
+ </ItemGroup>
+ <ItemGroup>
+ <None Include="..\..\src\common\WelsVP.def">
+ <Filter>resources</Filter>
+ </None>
+ </ItemGroup>
+</Project>
\ No newline at end of file
--- /dev/null
+++ b/codec/processing/build/win32/WelsVideoProcessor.sln
@@ -1,0 +1,29 @@
+
+Microsoft Visual Studio Solution File, Format Version 10.00
+# Visual Studio 2008
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "WelsVideoProcessor", "WelsVideoProcessor.vcproj", "{C57D1D0E-A09A-45FD-87F9-CC6911F601FA}"
+ ProjectSection(ProjectDependencies) = postProject
+ {E8DFAFA1-8DAC-4127-8D27-FBD5819EE562} = {E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}
+ EndProjectSection
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "WelsVP", "WelsVP.vcproj", "{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}"
+EndProject
+Global
+ GlobalSection(SolutionConfigurationPlatforms) = preSolution
+ Debug|Win32 = Debug|Win32
+ Release|Win32 = Release|Win32
+ EndGlobalSection
+ GlobalSection(ProjectConfigurationPlatforms) = postSolution
+ {C57D1D0E-A09A-45FD-87F9-CC6911F601FA}.Debug|Win32.ActiveCfg = Debug|Win32
+ {C57D1D0E-A09A-45FD-87F9-CC6911F601FA}.Debug|Win32.Build.0 = Debug|Win32
+ {C57D1D0E-A09A-45FD-87F9-CC6911F601FA}.Release|Win32.ActiveCfg = Release|Win32
+ {C57D1D0E-A09A-45FD-87F9-CC6911F601FA}.Release|Win32.Build.0 = Release|Win32
+ {E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Debug|Win32.ActiveCfg = Debug|Win32
+ {E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Debug|Win32.Build.0 = Debug|Win32
+ {E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Release|Win32.ActiveCfg = Release|Win32
+ {E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Release|Win32.Build.0 = Release|Win32
+ EndGlobalSection
+ GlobalSection(SolutionProperties) = preSolution
+ HideSolutionNode = FALSE
+ EndGlobalSection
+EndGlobal
--- /dev/null
+++ b/codec/processing/build/win32/WelsVideoProcessor.vcproj
@@ -1,0 +1,213 @@
+<?xml version="1.0" encoding="gb2312"?>
+<VisualStudioProject
+ ProjectType="Visual C++"
+ Version="9.00"
+ Name="WelsVideoProcessor"
+ ProjectGUID="{C57D1D0E-A09A-45FD-87F9-CC6911F601FA}"
+ RootNamespace="WelsVideoProcessor"
+ Keyword="Win32Proj"
+ TargetFrameworkVersion="196613"
+ >
+ <Platforms>
+ <Platform
+ Name="Win32"
+ />
+ </Platforms>
+ <ToolFiles>
+ </ToolFiles>
+ <Configurations>
+ <Configuration
+ Name="Debug|Win32"
+ OutputDirectory="$(SolutionDir)..\..\bin\$(ConfigurationName)"
+ IntermediateDirectory="$(SolutionDir)..\..\obj\$(ConfigurationName)\$(ProjectName)"
+ ConfigurationType="1"
+ CharacterSet="1"
+ >
+ <Tool
+ Name="VCPreBuildEventTool"
+ />
+ <Tool
+ Name="VCCustomBuildTool"
+ />
+ <Tool
+ Name="VCXMLDataGeneratorTool"
+ />
+ <Tool
+ Name="VCWebServiceProxyGeneratorTool"
+ />
+ <Tool
+ Name="VCMIDLTool"
+ />
+ <Tool
+ Name="VCCLCompilerTool"
+ Optimization="0"
+ PreprocessorDefinitions="WIN32;_DEBUG;_CONSOLE"
+ MinimalRebuild="true"
+ BasicRuntimeChecks="3"
+ RuntimeLibrary="1"
+ UsePrecompiledHeader="0"
+ WarningLevel="3"
+ DebugInformationFormat="4"
+ />
+ <Tool
+ Name="VCManagedResourceCompilerTool"
+ />
+ <Tool
+ Name="VCResourceCompilerTool"
+ />
+ <Tool
+ Name="VCPreLinkEventTool"
+ />
+ <Tool
+ Name="VCLinkerTool"
+ LinkIncremental="2"
+ GenerateDebugInformation="true"
+ SubSystem="1"
+ TargetMachine="1"
+ />
+ <Tool
+ Name="VCALinkTool"
+ />
+ <Tool
+ Name="VCManifestTool"
+ />
+ <Tool
+ Name="VCXDCMakeTool"
+ />
+ <Tool
+ Name="VCBscMakeTool"
+ />
+ <Tool
+ Name="VCFxCopTool"
+ />
+ <Tool
+ Name="VCAppVerifierTool"
+ />
+ <Tool
+ Name="VCPostBuildEventTool"
+ />
+ </Configuration>
+ <Configuration
+ Name="Release|Win32"
+ OutputDirectory="$(SolutionDir)..\..\bin\$(ConfigurationName)"
+ IntermediateDirectory="$(SolutionDir)..\..\obj\$(ConfigurationName)\$(ProjectName)"
+ ConfigurationType="1"
+ CharacterSet="1"
+ WholeProgramOptimization="1"
+ >
+ <Tool
+ Name="VCPreBuildEventTool"
+ />
+ <Tool
+ Name="VCCustomBuildTool"
+ />
+ <Tool
+ Name="VCXMLDataGeneratorTool"
+ />
+ <Tool
+ Name="VCWebServiceProxyGeneratorTool"
+ />
+ <Tool
+ Name="VCMIDLTool"
+ />
+ <Tool
+ Name="VCCLCompilerTool"
+ Optimization="2"
+ EnableIntrinsicFunctions="true"
+ PreprocessorDefinitions="WIN32;NDEBUG;_CONSOLE"
+ RuntimeLibrary="0"
+ EnableFunctionLevelLinking="true"
+ UsePrecompiledHeader="0"
+ WarningLevel="3"
+ DebugInformationFormat="3"
+ />
+ <Tool
+ Name="VCManagedResourceCompilerTool"
+ />
+ <Tool
+ Name="VCResourceCompilerTool"
+ />
+ <Tool
+ Name="VCPreLinkEventTool"
+ />
+ <Tool
+ Name="VCLinkerTool"
+ LinkIncremental="1"
+ GenerateDebugInformation="true"
+ SubSystem="1"
+ OptimizeReferences="2"
+ EnableCOMDATFolding="2"
+ TargetMachine="1"
+ />
+ <Tool
+ Name="VCALinkTool"
+ />
+ <Tool
+ Name="VCManifestTool"
+ />
+ <Tool
+ Name="VCXDCMakeTool"
+ />
+ <Tool
+ Name="VCBscMakeTool"
+ />
+ <Tool
+ Name="VCFxCopTool"
+ />
+ <Tool
+ Name="VCAppVerifierTool"
+ />
+ <Tool
+ Name="VCPostBuildEventTool"
+ />
+ </Configuration>
+ </Configurations>
+ <References>
+ </References>
+ <Files>
+ <Filter
+ Name="Source Files"
+ Filter="cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx"
+ UniqueIdentifier="{4FC737F1-C7A5-4376-A066-2A32D752A2FF}"
+ >
+ <File
+ RelativePath="..\..\src\testbed\stdafx.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\..\src\testbed\wels_process.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\..\src\testbed\WelsVideoProcessor.cpp"
+ >
+ </File>
+ </Filter>
+ <Filter
+ Name="Header Files"
+ Filter="h;hpp;hxx;hm;inl;inc;xsd"
+ UniqueIdentifier="{93995380-89BD-4b04-88EB-625FBE52EBFB}"
+ >
+ <File
+ RelativePath="..\..\src\testbed\stdafx.h"
+ >
+ </File>
+ <File
+ RelativePath="..\..\src\testbed\targetver.h"
+ >
+ </File>
+ <File
+ RelativePath="..\..\src\testbed\wels_process.h"
+ >
+ </File>
+ </Filter>
+ <Filter
+ Name="Resource Files"
+ Filter="rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav"
+ UniqueIdentifier="{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}"
+ >
+ </Filter>
+ </Files>
+ <Globals>
+ </Globals>
+</VisualStudioProject>
--- /dev/null
+++ b/codec/processing/interface/IWelsVP.h
@@ -1,0 +1,286 @@
+/*!
+ * \copy
+ * Copyright (c) 2004-2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ *
+ * \file : IWelsVP.h
+ *
+ * \brief : Interface of wels video processor class
+ *
+ * \date : 2011/01/04
+ *
+ * \description : 1. must support both C and C++ style interfaces
+ * 2. must leave room for future feature extensions
+ * 3. must be explicit about the usage of "char":
+ * 1) numeric value : signed char / unsigned char
+ * 2) string : char
+ *
+ *************************************************************************************
+ */
+
+#ifndef IWELSVP_H_
+#define IWELSVP_H_
+
+#ifdef _WIN32
+#define WELSAPI __stdcall
+#else
+#define WELSAPI
+#endif
+
+#define WELSVP_MAJOR_VERSION 1
+#define WELSVP_MINOR_VERSION 1
+#define WELSVP_VERSION ((WELSVP_MAJOR_VERSION << 8) + WELSVP_MINOR_VERSION)
+
+typedef enum {
+ RET_SUCCESS = 0,
+ RET_FAILED = -1,
+ RET_INVALIDPARAM = -2,
+ RET_OUTOFMEMORY = -3,
+ RET_NOTSUPPORTED = -4,
+ RET_UNEXPECTED = -5,
+ RET_NEEDREINIT = -6
+} EResult;
+
+typedef enum { /* pixel format identifiers; note that several names below share a numeric value */
+ VIDEO_FORMAT_NULL = 0, /* invalid format */
+ /*rgb color formats*/
+ VIDEO_FORMAT_RGB = 1, /* rgb 24bits */
+ VIDEO_FORMAT_RGBA = 2, /* rgba */
+ VIDEO_FORMAT_RGB555 = 3, /* rgb555 */
+ VIDEO_FORMAT_RGB565 = 4, /* rgb565 */
+ VIDEO_FORMAT_BGR = 5, /* bgr 24bits */
+ VIDEO_FORMAT_BGRA = 6, /* bgr 32bits */
+ VIDEO_FORMAT_ABGR = 7, /* abgr */
+ VIDEO_FORMAT_ARGB = 8, /* argb */
+
+ /*yuv color formats*/
+ VIDEO_FORMAT_YUY2 = 20, /* yuy2 */
+ VIDEO_FORMAT_YVYU = 21, /* yvyu */
+ VIDEO_FORMAT_UYVY = 22, /* uyvy */
+ VIDEO_FORMAT_I420 = 23, /* yuv 4:2:0 planar */
+ VIDEO_FORMAT_YV12 = 24, /* yuv 4:2:0 planar */
+ VIDEO_FORMAT_INTERNAL = 25, /* Only Used for SVC decoder testbed */
+ VIDEO_FORMAT_NV12 = 26, /* y planar + uv packed */
+ VIDEO_FORMAT_I422 = 27, /* yuv 4:2:2 planar */
+ VIDEO_FORMAT_I444 = 28, /* yuv 4:4:4 planar */
+ VIDEO_FORMAT_YUYV = 20, /* yuv 4:2:2 packed; same value as VIDEO_FORMAT_YUY2 */
+
+ VIDEO_FORMAT_RGB24 = 1, /* same value as VIDEO_FORMAT_RGB */
+ VIDEO_FORMAT_RGB32 = 2, /* same value as VIDEO_FORMAT_RGBA */
+ VIDEO_FORMAT_RGB24_INV = 5, /* same value as VIDEO_FORMAT_BGR */
+ VIDEO_FORMAT_RGB32_INV = 6, /* same value as VIDEO_FORMAT_BGRA */
+ VIDEO_FORMAT_RGB555_INV = 7, /* same value as VIDEO_FORMAT_ABGR */
+ VIDEO_FORMAT_RGB565_INV = 8, /* same value as VIDEO_FORMAT_ARGB */
+ VIDEO_FORMAT_YUV2 = 21, /* same value as VIDEO_FORMAT_YVYU */
+ VIDEO_FORMAT_420 = 23, /* same value as VIDEO_FORMAT_I420 */
+
+ VIDEO_FORMAT_VFlip = 0x80000000 /* bit 31 only; NOTE(review): presumably a vertical-flip flag OR-ed onto a format code - confirm */
+} EVideoFormat;
+
+typedef enum {
+ BUFFER_HOSTMEM = 0,
+ BUFFER_SURFACE
+} EPixMapBufferProperty;
+
+typedef struct {
+ int iRectTop;
+ int iRectLeft;
+ int iRectWidth;
+ int iRectHeight;
+} SRect;
+
+typedef struct { /* describes one picture buffer: up to three plane pointers and strides, a rectangle, a format and a buffer property */
+ void* pPixel[3];
+ int iSizeInBits;
+ int iStride[3];
+ SRect sRect;
+ EVideoFormat eFormat;
+ EPixMapBufferProperty eProperty;// TODO(review): possibly unused and a candidate for removal, but removing it would change sizeof(SPixMap)
+} SPixMap;
+
+typedef enum { /* processing method identifiers; enumerators after METHOD_NULL take the implicit values 1, 2, 3, ... */
+ METHOD_NULL = 0,
+ METHOD_COLORSPACE_CONVERT ,// not supported yet
+ METHOD_DENOISE ,
+ METHOD_SCENE_CHANGE_DETECTION ,
+ METHOD_DOWNSAMPLE ,
+ METHOD_VAA_STATISTICS ,
+ METHOD_BACKGROUND_DETECTION ,
+ METHOD_ADAPTIVE_QUANT ,
+ METHOD_COMPLEXITY_ANALYSIS ,
+ METHOD_IMAGE_ROTATE ,
+ METHOD_MASK /* == 10; NOTE(review): presumably a sentinel / upper bound for method ids - confirm */
+} EMethods;
+
+//-----------------------------------------------------------------//
+// Algorithm parameters define
+//-----------------------------------------------------------------//
+
+typedef struct {
+ int bSceneChangeFlag; // 0:false ; 1:true
+} SSceneChangeResult;
+
+typedef enum { /* scene change classification; enumerators take the implicit values 0, 1, 2 */
+ SIMILAR_SCENE, //similar scene
+ MEDIUM_CHANGED_SCENE, //medium changed scene
+ LARGE_CHANGED_SCENE //large changed scene (no trailing comma: it is ill-formed before C99 / C++11)
+} ESceneChangeIdc;
+
+typedef struct {
+ unsigned char* pCurY; // Y data of current frame
+ unsigned char* pRefY; // Y data of pRef frame for diff calc
+ int (*pSad8x8)[4]; // sad of 8x8, every 4 in the same 16x16 get together
+ int* pSsd16x16; // sum of square difference of 16x16
+ int* pSum16x16; // sum of 16x16
+ int* pSumOfSquare16x16; // sum of square of 16x16
+ int (*pSumOfDiff8x8)[4];
+ unsigned char (*pMad8x8)[4];
+ int iFrameSad; // sad of frame
+} SVAACalcResult;
+
+typedef struct {
+ int iCalcVar;
+ int iCalcBgd;
+ int iCalcSsd;
+ int iReserved;
+ SVAACalcResult* pCalcResult;
+} SVAACalcParam;
+
+typedef struct {
+ signed char* pBackgroundMbFlag;
+ SVAACalcResult* pCalcRes;
+} SBGDInterface;
+
+typedef enum { /* adaptive quantization modes; enumerators take the implicit values 0 and 1 */
+ AQ_QUALITY_MODE, //Quality mode
+ AQ_BITRATE_MODE //Bitrate mode (no trailing comma: it is ill-formed before C99 / C++11)
+} EAQModes;
+
+typedef struct {
+ unsigned short uiMotionIndex;
+ unsigned short uiTextureIndex;
+} SMotionTextureUnit;
+
+typedef struct {
+ int iAdaptiveQuantMode; // 0:quality mode, 1:bitrates mode
+ SVAACalcResult* pCalcResult;
+ SMotionTextureUnit* pMotionTextureUnit;
+
+ signed char* pMotionTextureIndexToDeltaQp;
+ double dAverMotionTextureIndexToDeltaQp;
+} SAdaptiveQuantizationParam;
+
+typedef enum {
+ FRAME_SAD = 0,
+ GOM_SAD = -1,
+ GOM_VAR = -2
+} EComplexityAnalysisMode;
+
+typedef struct {
+ int iComplexityAnalysisMode;
+ int iCalcBgd;
+ int iMbNumInGom;
+ int iFrameComplexity;
+ int* pGomComplexity;
+ int* pGomForegroundBlockNum;
+ signed char* pBackgroundMbFlag;
+ unsigned int* uiRefMbType;
+ SVAACalcResult* pCalcResult;
+} SComplexityAnalysisParam;
+
+/////////////////////////////////////////////////////////////////////////////////////////////
+
+typedef struct { /* C-style interface: a context pointer plus a table of function pointers returning EResult */
+ void* pCtx; /* NOTE(review): presumably the value to pass as the pCtx argument of the callbacks below - confirm */
+ EResult (*Init) (void* pCtx, int iType, void* pCfg);
+ EResult (*Uninit) (void* pCtx, int iType);
+ EResult (*Flush) (void* pCtx, int iType);
+ EResult (*Process) (void* pCtx, int iType, SPixMap* pSrc, SPixMap* dst);
+ EResult (*Get) (void* pCtx, int iType, void* pParam);
+ EResult (*Set) (void* pCtx, int iType, void* pParam);
+ EResult (*SpecialFeature) (void* pCtx, int iType, void* pIn, void* pOut);
+} IWelsVPc;
+
+#if defined(__cplusplus) && !defined(CINTERFACE) /* C++ style interface */
+
+class IWelsVP {
+ public:
+ virtual ~IWelsVP() {}
+
+ public:
+ virtual EResult Init (int iType, void* pCfg) = 0;
+ virtual EResult Uninit (int iType) = 0;
+ virtual EResult Flush (int iType) = 0;
+ virtual EResult Process (int iType, SPixMap* pSrc, SPixMap* dst) = 0;
+ virtual EResult Get (int iType, void* pParam) = 0;
+ virtual EResult Set (int iType, void* pParam) = 0;
+ virtual EResult SpecialFeature (int iType, void* pIn, void* pOut) = 0;
+};
+
+/* It is recommended to invoke the interface via these macros for convenience */
+#define IWelsVPFunc_Init(p, a, b) (p)->Init(a, b)
+#define IWelsVPFunc_Uninit(p, a) (p)->Uninit(a)
+#define IWelsVPFunc_Flush(p, a) (p)->Flush(a)
+#define IWelsVPFunc_Process(p, a, b, c) (p)->Process(a, b, c)
+#define IWelsVPFunc_Get(p, a, b) (p)->Get(a, b)
+#define IWelsVPFunc_Set(p, a, b) (p)->Set(a, b)
+#define IWelsVPFunc_SpecialFeature(p, a, b, c) (p)->SpecialFeature(a, b, c)
+
+/* C++ interface version */
+#define WELSVP_INTERFACE_VERION (0x8000 + (WELSVP_VERSION & 0x7fff))
+#define WELSVP_EXTERNC_BEGIN extern "C" {
+#define WELSVP_EXTERNC_END }
+
+#else /* C style interface */
+
+/* It is recommended to invoke the interface via these macros for convenience; p is an IWelsVPc* and its pCtx member is passed as the context argument (p is evaluated twice) */
+#define IWelsVPFunc_Init(p, a, b) (p)->Init((p)->pCtx, a, b)
+#define IWelsVPFunc_Uninit(p, a) (p)->Uninit((p)->pCtx, a)
+#define IWelsVPFunc_Flush(p, a) (p)->Flush((p)->pCtx, a)
+#define IWelsVPFunc_Process(p, a, b, c) (p)->Process((p)->pCtx, a, b, c)
+#define IWelsVPFunc_Get(p, a, b) (p)->Get((p)->pCtx, a, b)
+#define IWelsVPFunc_Set(p, a, b) (p)->Set((p)->pCtx, a, b)
+#define IWelsVPFunc_SpecialFeature(p, a, b, c) (p)->SpecialFeature((p)->pCtx, a, b, c)
+
+/* C interface version */
+#define WELSVP_INTERFACE_VERION (0x0001 + (WELSVP_VERSION & 0x7fff))
+#define WELSVP_EXTERNC_BEGIN
+#define WELSVP_EXTERNC_END
+
+#endif
+
+WELSVP_EXTERNC_BEGIN
+EResult WELSAPI CreateVpInterface (void** ppCtx, int iVersion /*= WELSVP_INTERFACE_VERION*/);
+EResult WELSAPI DestroyVpInterface (void* pCtx , int iVersion /*= WELSVP_INTERFACE_VERION*/);
+WELSVP_EXTERNC_END
+
+//////////////////////////////////////////////////////////////////////////////////////////////
+#endif // IWELSVP_H_
+
+
--- /dev/null
+++ b/codec/processing/src/adaptivequantization/AdaptiveQuantization.cpp
@@ -1,0 +1,256 @@
+/*!
+ * \copy
+ * Copyright (c) 2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#include "AdaptiveQuantization.h"
+#include "../common/cpu.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+
+
+#define AVERAGE_TIME_MOTION (0.3) // scale applied to the frame-mean motion index; 0.3046875 = 1/4 + 1/16 - 1/128 ~ 0.3
+#define AVERAGE_TIME_TEXTURE_QUALITYMODE (1.0) // scale applied to the frame-mean texture index in AQ_QUALITY_MODE (other value noted by the author: 0.5 = 1/2)
+#define AVERAGE_TIME_TEXTURE_BITRATEMODE (0.875) // scale applied to the frame-mean texture index in every other mode (other value noted by the author: 0.5 = 1/2)
+#define MODEL_ALPHA (0.9910) // denominator offset in (a - 1) / (a + MODEL_ALPHA) (other values noted by the author: 1.5, 1.1102)
+#define MODEL_TIME (5.8185) // multiplier that turns that ratio into a delta QP (other values noted by the author: 9.0, 5.9842)
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+CAdaptiveQuantization::CAdaptiveQuantization (int32_t iCpuFlag)
+  : m_pfVar (NULL), m_CPUFlag (iCpuFlag) {
+  // Tag the strategy, start from an all-zero parameter block, then pick the variance routine.
+  m_eMethod = METHOD_ADAPTIVE_QUANT;
+  WelsMemset (&m_sAdaptiveQuantParam, 0, sizeof (m_sAdaptiveQuantParam));
+  WelsInitVarFunc (m_pfVar, m_CPUFlag);
+}
+
+CAdaptiveQuantization::~CAdaptiveQuantization() { // empty: the constructor in this file acquires nothing that would need releasing
+}
+
+EResult CAdaptiveQuantization::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pRefPixMap) {
+  // Derives one delta QP per 16x16 macroblock from two luma statistics: the variance of the
+  // source/reference difference ("motion") and the variance of the source itself ("texture").
+  // The per-MB values go to m_sAdaptiveQuantParam.pMotionTextureIndexToDeltaQp and their mean to
+  // m_sAdaptiveQuantParam.dAverMotionTextureIndexToDeltaQp. iType is not used.
+  // Returns RET_INVALIDPARAM, before writing any result, when a required pointer is NULL or the
+  // picture holds no complete macroblock (the averages below would divide by zero);
+  // RET_SUCCESS otherwise.
+  if (pSrcPixMap == NULL || pRefPixMap == NULL) {
+    return RET_INVALIDPARAM;
+  }
+
+  // Only complete macroblocks are processed; a partial right or bottom edge is ignored.
+  const int32_t iMbWidth  = pSrcPixMap->sRect.iRectWidth >> 4;
+  const int32_t iMbHeight = pSrcPixMap->sRect.iRectHeight >> 4;
+  SMotionTextureUnit* pMotionTexture = m_sAdaptiveQuantParam.pMotionTextureUnit;
+  SVAACalcResult* pVaaCalcResults    = m_sAdaptiveQuantParam.pCalcResult;
+
+  // Validate everything that is dereferenced unconditionally further down. The pixel planes are
+  // checked later because the cached-statistics path never reads them.
+  if (iMbWidth <= 0 || iMbHeight <= 0 || pMotionTexture == NULL || pVaaCalcResults == NULL
+      || m_sAdaptiveQuantParam.pMotionTextureIndexToDeltaQp == NULL) {
+    return RET_INVALIDPARAM;
+  }
+
+  const int32_t iMbTotalNum = iMbWidth * iMbHeight;
+  uint8_t* pRefFrameY = (uint8_t*)pRefPixMap->pPixel[0];
+  uint8_t* pCurFrameY = (uint8_t*)pSrcPixMap->pPixel[0];
+  const int32_t iRefStride = pRefPixMap->iStride[0];
+  const int32_t iCurStride = pSrcPixMap->iStride[0];
+
+  double_t dAverageMotionIndex  = 0.0;
+  double_t dAverageTextureIndex = 0.0;
+  int32_t i = 0, j = 0;
+
+  /////////////////////////////////////// motion //////////////////////////////////
+  // motion MB residual variance
+  if (pVaaCalcResults->pRefY == pRefFrameY && pVaaCalcResults->pCurY == pCurFrameY) {
+    // The cached sums were computed for exactly these two planes: reuse them, no pixel is read.
+    for (int32_t iMbIndex = 0; iMbIndex < iMbTotalNum; iMbIndex++) {
+      // the four pSad8x8 entries of a macroblock are added up into one sum of differences
+      int32_t iSumDiff = pVaaCalcResults->pSad8x8[iMbIndex][0];
+      iSumDiff += pVaaCalcResults->pSad8x8[iMbIndex][1];
+      iSumDiff += pVaaCalcResults->pSad8x8[iMbIndex][2];
+      iSumDiff += pVaaCalcResults->pSad8x8[iMbIndex][3];
+
+      const int32_t iSQDiff = pVaaCalcResults->pSsd16x16[iMbIndex];
+      int32_t uiSum         = pVaaCalcResults->pSum16x16[iMbIndex];
+      const int32_t iSQSum  = pVaaCalcResults->pSumOfSquare16x16[iMbIndex];
+
+      // ">> 8" divides by the 256 pixels of a macroblock: index = mean of squares - (mean)^2
+      iSumDiff = iSumDiff >> 8;
+      pMotionTexture->uiMotionIndex = (iSQDiff >> 8) - (iSumDiff * iSumDiff);
+
+      uiSum = uiSum >> 8;
+      pMotionTexture->uiTextureIndex = (iSQSum >> 8) - (uiSum * uiSum);
+
+      dAverageMotionIndex += pMotionTexture->uiMotionIndex;
+      dAverageTextureIndex += pMotionTexture->uiTextureIndex;
+      pMotionTexture++;
+    }
+  } else {
+    // No matching cache: measure every macroblock straight from the two luma planes.
+    if (pRefFrameY == NULL || pCurFrameY == NULL) {
+      return RET_INVALIDPARAM;
+    }
+
+    for (j = 0; j < iMbHeight; j ++) {
+      uint8_t* pRefFrameTmp = pRefFrameY;
+      uint8_t* pCurFrameTmp = pCurFrameY;
+      for (i = 0; i < iMbWidth; i++) {
+        m_pfVar (pRefFrameTmp, iRefStride, pCurFrameTmp, iCurStride, pMotionTexture);
+        dAverageMotionIndex += pMotionTexture->uiMotionIndex;
+        dAverageTextureIndex += pMotionTexture->uiTextureIndex;
+        pMotionTexture++;
+        pRefFrameTmp += MB_WIDTH_LUMA;
+        pCurFrameTmp += MB_WIDTH_LUMA;
+      }
+      pRefFrameY += iRefStride * 16; // next macroblock row (multiply, not shift: safe for negative strides)
+      pCurFrameY += iCurStride * 16;
+    }
+  }
+
+  // frame means of both indices
+  dAverageMotionIndex = dAverageMotionIndex / iMbTotalNum;
+  dAverageTextureIndex = dAverageTextureIndex / iMbTotalNum;
+  // A (near-)zero mean would end up as a zero divisor in the mapping loop; use 1.0 instead.
+  if ((dAverageMotionIndex <= PESN) && (dAverageMotionIndex >= -PESN)) {
+    dAverageMotionIndex = 1.0;
+  }
+  if ((dAverageTextureIndex <= PESN) && (dAverageTextureIndex >= -PESN)) {
+    dAverageTextureIndex = 1.0;
+  }
+
+  // motion mb residual map to QP
+  // texture mb original map to QP
+  dAverageMotionIndex = AVERAGE_TIME_MOTION * dAverageMotionIndex;
+
+  if (m_sAdaptiveQuantParam.iAdaptiveQuantMode == AQ_QUALITY_MODE) {
+    dAverageTextureIndex = AVERAGE_TIME_TEXTURE_QUALITYMODE * dAverageTextureIndex;
+  } else {
+    dAverageTextureIndex = AVERAGE_TIME_TEXTURE_BITRATEMODE * dAverageTextureIndex;
+  }
+
+  int32_t iAverMotionTextureIndexToDeltaQp = 0; // running sum of the per-MB delta QPs
+  pMotionTexture = m_sAdaptiveQuantParam.pMotionTextureUnit;
+  for (j = 0; j < iMbHeight; j ++) {
+    for (i = 0; i < iMbWidth; i++) {
+      // a = this macroblock's texture index relative to the scaled frame mean
+      double_t a = pMotionTexture->uiTextureIndex / dAverageTextureIndex;
+      double_t dQStep = (a - 1) / (a + MODEL_ALPHA);
+      const double_t dLumaTextureDeltaQp = MODEL_TIME * dQStep;// range +- 6
+
+      int8_t iMotionTextureIndexToDeltaQp = (int8_t)dLumaTextureDeltaQp;
+
+      // same mapping applied to the motion index
+      a = pMotionTexture->uiMotionIndex / dAverageMotionIndex;
+      dQStep = (a - 1) / (a + MODEL_ALPHA);
+      const double_t dLumaMotionDeltaQp = MODEL_TIME * dQStep;// range +- 6
+
+      // Quality mode only lets motion lower the QP; bitrate mode applies the motion term as is.
+      if ((m_sAdaptiveQuantParam.iAdaptiveQuantMode == AQ_QUALITY_MODE && dLumaMotionDeltaQp < -PESN)
+          || (m_sAdaptiveQuantParam.iAdaptiveQuantMode == AQ_BITRATE_MODE)) {
+        iMotionTextureIndexToDeltaQp += (int8_t)dLumaMotionDeltaQp;
+      }
+
+      m_sAdaptiveQuantParam.pMotionTextureIndexToDeltaQp[j * iMbWidth + i] = iMotionTextureIndexToDeltaQp;
+      iAverMotionTextureIndexToDeltaQp += iMotionTextureIndexToDeltaQp;
+      pMotionTexture++;
+    }
+  }
+  // frame mean of the per-MB delta QPs
+  m_sAdaptiveQuantParam.dAverMotionTextureIndexToDeltaQp = (1.0 * iAverMotionTextureIndexToDeltaQp) / iMbTotalNum;
+
+  return RET_SUCCESS;
+}
+
+
+
+EResult CAdaptiveQuantization::Set (int32_t iType, void* pParam) {
+  // Replaces the whole stored parameter block with a copy of *pParam; iType is ignored.
+  const SAdaptiveQuantizationParam* pNewParam = static_cast<const SAdaptiveQuantizationParam*> (pParam);
+  if (NULL == pNewParam) {
+    return RET_INVALIDPARAM;
+  }
+  m_sAdaptiveQuantParam = *pNewParam;
+  return RET_SUCCESS;
+}
+
+EResult CAdaptiveQuantization::Get (int32_t iType, void* pParam) {
+  // Reports only the frame-average delta QP; every other field of *pParam is left untouched.
+  // iType is ignored.
+  SAdaptiveQuantizationParam* pOutParam = static_cast<SAdaptiveQuantizationParam*> (pParam);
+  if (NULL == pOutParam) {
+    return RET_INVALIDPARAM;
+  }
+
+  pOutParam->dAverMotionTextureIndexToDeltaQp = m_sAdaptiveQuantParam.dAverMotionTextureIndexToDeltaQp;
+  return RET_SUCCESS;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////
+
+void CAdaptiveQuantization::WelsInitVarFunc (PVarFunc& pfVar, int32_t iCpuFlag) {
+ pfVar = SampleVariance16x16_c; // the C implementation is always selected first
+
+#ifdef X86_ASM
+ if (iCpuFlag & WELS_CPU_SSE2) {
+ // the SSE2 routine is switched off here, so pfVar keeps the C version: pfVar = SampleVariance16x16_sse2;
+ }
+#endif
+}
+
+void SampleVariance16x16_c (uint8_t* pRefY, int32_t iRefStride, uint8_t* pSrcY, int32_t iSrcStride,
+                            SMotionTextureUnit* pMotionTexture) {
+  // Plain C routine: motion index = variance of |ref - src|, texture index = variance of src,
+  // each computed over one block as (sum of squares >> 8) - (sum >> 8)^2.
+  uint16_t uiDiffAcc = 0, uiSrcAcc = 0;
+  uint32_t uiDiffSqAcc = 0, uiSrcSqAcc = 0;
+
+  for (int32_t iRow = 0; iRow < MB_WIDTH_LUMA; iRow++, pRefY += iRefStride, pSrcY += iSrcStride) {
+    for (int32_t iCol = 0; iCol < MB_WIDTH_LUMA; iCol++) {
+      const uint32_t uiSrcPix  = pSrcY[iCol];
+      const uint32_t uiAbsDiff = WELS_ABS (pRefY[iCol] - pSrcY[iCol]);
+
+      uiDiffAcc   += uiAbsDiff;
+      uiDiffSqAcc += uiAbsDiff * uiAbsDiff;
+      uiSrcAcc    += uiSrcPix;
+      uiSrcSqAcc  += uiSrcPix * uiSrcPix;
+    }
+  }
+
+  const uint32_t uiDiffMean = uiDiffAcc >> 8;
+  const uint32_t uiSrcMean  = uiSrcAcc >> 8;
+  pMotionTexture->uiMotionIndex  = (uiDiffSqAcc >> 8) - (uiDiffMean * uiDiffMean);
+  pMotionTexture->uiTextureIndex = (uiSrcSqAcc >> 8) - (uiSrcMean * uiSrcMean);
+}
+
+WELSVP_NAMESPACE_END
--- /dev/null
+++ b/codec/processing/src/adaptivequantization/AdaptiveQuantization.h
@@ -1,0 +1,85 @@
+/*!
+ * \copy
+ * Copyright (c) 2011-2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file : AdaptiveQuantization.h
+ *
+ * \brief : adaptive quantization class of wels video processor class
+ *
+ * \date : 2011/03/21
+ *
+ * \description : 1. rewrite the package code of scene change detection class
+ *
+ */
+
+#ifndef WELSVP_ADAPTIVEQUANTIZATION_H
+#define WELSVP_ADAPTIVEQUANTIZATION_H
+
+#include "../common/util.h"
+#include "../common/memory.h"
+#include "../common/WelsFrameWork.h"
+#include "../../interface/IWelsVP.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+typedef void (VarFunc) (uint8_t* pRefY, int32_t iRefStrideY, uint8_t* pSrc, int32_t iSrcStrideY,
+ SMotionTextureUnit* pMotionTexture); // function type shared by the block-variance routines declared below
+
+typedef VarFunc* PVarFunc; // pointer to a VarFunc; CAdaptiveQuantization stores one in m_pfVar
+
+VarFunc SampleVariance16x16_c; // declares the C routine with the VarFunc signature
+
+#ifdef X86_ASM
+WELSVP_EXTERN_C_BEGIN
+VarFunc SampleVariance16x16_sse2; // SSE2 routine, declared only when X86_ASM is defined
+WELSVP_EXTERN_C_END
+#endif
+
+
+class CAdaptiveQuantization : public IStrategy { // adaptive quantization strategy of the video processor
+ public:
+ CAdaptiveQuantization (int32_t iCpuFlag); // iCpuFlag: CPU feature flags, forwarded to WelsInitVarFunc
+ ~CAdaptiveQuantization();
+
+ EResult Process (int32_t iType, SPixMap* pSrc, SPixMap* pRef); // pSrc: current picture, pRef: reference picture
+ EResult Set (int32_t iType, void* pParam); // pParam is read as an SAdaptiveQuantizationParam
+ EResult Get (int32_t iType, void* pParam); // pParam is written as an SAdaptiveQuantizationParam
+
+ private:
+ void WelsInitVarFunc (PVarFunc& pfVar, int32_t iCpuFlag); // assigns the variance routine to pfVar
+
+ private:
+ PVarFunc m_pfVar; // variance routine called per macroblock by Process
+ int32_t m_CPUFlag; // CPU feature flags given to the constructor
+ SAdaptiveQuantizationParam m_sAdaptiveQuantParam; // parameter block copied in by Set
+};
+
+WELSVP_NAMESPACE_END
+
+#endif
--- /dev/null
+++ b/codec/processing/src/asm/denoisefilter.asm
@@ -1,0 +1,279 @@
+;*!
+;* \copy
+;* Copyright (c) 2010-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;* denoisefilter.asm (this header previously named the file predenoise.asm)
+;*
+;* Abstract
+;* denoise for SVC2.1
+;* History
+;* 4/13/2010 Created
+;* 7/30/2010 Modified
+;*
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+
+;***********************************************************************
+; Constant
+;***********************************************************************
+SECTION .rodata align=16
+
+sse2_32 times 8 dw 32 ; eight words of 32: loaded into xmm3 by BilateralLumaFilter8_sse2 as the base of the pixel weight
+sse2_20 times 8 dw 20 ; eight words of 20: multiplier of the centre tap in WEIGHT_LINE3_UV
+
+
+
+;***********************************************************************
+; Code
+;***********************************************************************
+SECTION .text
+
+%macro WEIGHT_LINE 9
+ movq %2, %9 ; %9: memory operand with 8 neighbour pixels; %1, %2, %8: scratch registers
+ punpcklbw %2, %7 ; widen to words (%7 is zero at the call sites)
+ movdqa %8, %2
+
+ movdqa %1, %6 ; %6: the 8 centre pixels as words
+ psubusb %1, %8
+ psubusb %8, %6
+ por %8, %1 ; ABS(curPixel - centerPixel);
+
+ movdqa %1, %3 ; %3: words of 32 at the call sites
+ psubusb %1, %8 ; 32 - ABS(...), saturating at 0
+
+ pmullw %1, %1
+ psrlw %1, 5 ; weight = (32 - ABS(...))^2 >> 5
+ pmullw %2, %1 ; neighbour pixel * weight
+ paddusw %4, %1 ; %4 accumulates the weights
+ paddusw %5, %2 ; %5 accumulates the weighted pixels
+%endmacro
+
+%macro WEIGHT_LINE1_UV 4
+ movdqa %2, %1 ; %1: 16 source bytes of one row, %2: scratch, %3: word accumulator, %4: zero at the call sites
+ punpcklbw %2, %4
+ paddw %3, %2 ; byte offset 0, weight 1
+
+ movdqa %2, %1
+ psrldq %2, 1
+ punpcklbw %2, %4
+ paddw %3, %2 ; byte offset 1, weight 1
+
+ movdqa %2, %1
+ psrldq %2, 2
+ punpcklbw %2, %4
+ psllw %2, 1
+ paddw %3, %2 ; byte offset 2, weight 2
+
+ movdqa %2, %1
+ psrldq %2, 3
+ punpcklbw %2, %4
+ paddw %3, %2 ; byte offset 3, weight 1
+
+ movdqa %2, %1
+ psrldq %2, 4
+ punpcklbw %2, %4
+ paddw %3, %2 ; byte offset 4, weight 1
+%endmacro
+
+%macro WEIGHT_LINE2_UV 4
+ movdqa %2, %1 ; %1: 16 source bytes of one row, %2: scratch, %3: word accumulator, %4: zero at the call sites
+ punpcklbw %2, %4
+ paddw %3, %2 ; byte offset 0, weight 1
+
+ movdqa %2, %1
+ psrldq %2, 1
+ punpcklbw %2, %4
+ psllw %2, 1
+ paddw %3, %2 ; byte offset 1, weight 2
+
+ movdqa %2, %1
+ psrldq %2, 2
+ punpcklbw %2, %4
+ psllw %2, 2
+ paddw %3, %2 ; byte offset 2, weight 4
+
+ movdqa %2, %1
+ psrldq %2, 3
+ punpcklbw %2, %4
+ psllw %2, 1
+ paddw %3, %2 ; byte offset 3, weight 2
+
+ movdqa %2, %1
+ psrldq %2, 4
+ punpcklbw %2, %4
+ paddw %3, %2 ; byte offset 4, weight 1
+%endmacro
+
+%macro WEIGHT_LINE3_UV 4
+ movdqa %2, %1 ; %1: 16 source bytes of one row, %2: scratch, %3: word accumulator, %4: zero at the call sites
+ punpcklbw %2, %4
+ psllw %2, 1
+ paddw %3, %2 ; byte offset 0, weight 2
+
+ movdqa %2, %1
+ psrldq %2, 1
+ punpcklbw %2, %4
+ psllw %2, 2
+ paddw %3, %2 ; byte offset 1, weight 4
+
+ movdqa %2, %1
+ psrldq %2, 2
+ punpcklbw %2, %4
+ pmullw %2, [sse2_20]
+ paddw %3, %2 ; byte offset 2 (centre), weight 20
+
+ movdqa %2, %1
+ psrldq %2, 3
+ punpcklbw %2, %4
+ psllw %2, 2
+ paddw %3, %2 ; byte offset 3, weight 4
+
+ movdqa %2, %1
+ psrldq %2, 4
+ punpcklbw %2, %4
+ psllw %2, 1
+ paddw %3, %2 ; byte offset 4, weight 2
+%endmacro
+
+ALIGN 16
+WELS_EXTERN BilateralLumaFilter8_sse2
+;***********************************************************************
+; BilateralLumaFilter8_sse2(uint8_t *pixels, int stride);
+;***********************************************************************
+; 1 2 3
+; 4 0 5
+; 6 7 8
+; 0: the center point; 1..8: the neighbours fed to WEIGHT_LINE (same numbers as the "pixel N" notes below)
+%define pushsize 4
+;%define pixel esp + pushsize + 4
+;%define stride esp + pushsize + 8
+;%define pixel r0
+;%define stride r1
+
+BilateralLumaFilter8_sse2:
+
+ push r3
+ %assign push_num 1
+ LOAD_2_PARA
+
+ pxor xmm7, xmm7 ; zero, used to widen bytes to words
+
+ mov r3, r0 ; keep the centre address for the final store
+
+ movq xmm6, [r0] ; 8 centre pixels
+ punpcklbw xmm6, xmm7 ; centre pixels as words
+ movdqa xmm3, [sse2_32]
+ pxor xmm4, xmm4 ; nTotWeight
+ pxor xmm5, xmm5 ; nSum
+
+ dec r0 ; one byte to the left of the centre
+ WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0] ; pixel 4
+ WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 2] ; pixel 5
+
+ sub r0, r1 ; one row up
+ WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0] ; pixel 1
+ WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 1] ; pixel 2
+ WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 2] ; pixel 3
+
+ lea r0, [r0 + r1 * 2] ; two rows down: the row below the centre
+ WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0] ; pixel 6
+ WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 1] ; pixel 7
+ WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 2] ; pixel 8
+
+ pcmpeqw xmm0, xmm0
+ psrlw xmm0, 15
+ psllw xmm0, 8 ; every word = 256
+ psubusw xmm0, xmm4 ; 256 - nTotWeight: the weight left for the centre pixel
+ pmullw xmm0, xmm6 ; centre pixel * its weight
+ paddusw xmm5, xmm0 ; nSum now covers centre and neighbours
+ psrlw xmm5, 8 ; divide by 256
+ packuswb xmm5, xmm5
+ movq [r3], xmm5 ; overwrite the 8 centre pixels in place
+
+
+ pop r3
+ %assign push_num 0
+
+ ret
+
+WELS_EXTERN WaverageChromaFilter8_sse2
+;***********************************************************************
+; void WaverageChromaFilter8_sse2(uint8_t *pixels, int stride);
+;***********************************************************************
+;5x5 filter (the weights add up to 64, hence the final shift right by 6):
+;1 1 2 1 1
+;1 2 4 2 1
+;2 4 20 4 2
+;1 2 4 2 1
+;1 1 2 1 1
+
+ALIGN 16
+WaverageChromaFilter8_sse2:
+
+ push r3
+
+ %assign push_num 1
+
+ LOAD_2_PARA
+
+ mov r3, r1
+ add r3, r3 ; r3 = 2 * stride
+ sub r0, r3 ; pixels - 2 * stride
+ sub r0, 2 ; and two bytes to the left: top-left corner of the 5x5 window
+
+ pxor xmm0, xmm0 ; zero, used to widen bytes to words
+ pxor xmm3, xmm3 ; weighted sum accumulator
+
+ movdqu xmm1, [r0] ; row -2
+ WEIGHT_LINE1_UV xmm1, xmm2, xmm3, xmm0
+
+ movdqu xmm1, [r0 + r1] ; row -1
+ WEIGHT_LINE2_UV xmm1, xmm2, xmm3, xmm0
+
+ add r0, r3 ; r0 now points at the centre row
+ movdqu xmm1, [r0] ; row 0
+ WEIGHT_LINE3_UV xmm1, xmm2, xmm3, xmm0
+
+ movdqu xmm1, [r0 + r1] ; row +1
+ WEIGHT_LINE2_UV xmm1, xmm2, xmm3, xmm0
+
+ movdqu xmm1, [r0 + r1 * 2] ; row +2
+ WEIGHT_LINE1_UV xmm1, xmm2, xmm3, xmm0
+
+ psrlw xmm3, 6 ; divide by 64
+ packuswb xmm3, xmm3
+ movq [r0 + 2], xmm3 ; r0 + 2 is the original pixels address: 8 pixels overwritten in place
+
+
+ pop r3
+
+ %assign push_num 0
+ ret
--- /dev/null
+++ b/codec/processing/src/asm/downsample_bilinear.asm
@@ -1,0 +1,1225 @@
+;*!
+;* \copy
+;* Copyright (c) 2009-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;* downsample_bilinear.asm (this header previously named the file upsampling.asm)
+;*
+;* Abstract
+;* SIMD for pixel domain down sampling
+;*
+;* History
+;* 10/22/2009 Created
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+%ifdef X86_32
+;***********************************************************************
+; Macros and other preprocessor constants
+;***********************************************************************
+
+
+;***********************************************************************
+; Some constants
+;***********************************************************************
+
+;***********************************************************************
+; Local Data (Read Only)
+;***********************************************************************
+
+SECTION .rodata align=16
+
+;***********************************************************************
+; Various memory constants (trigonometric values or rounding values)
+;***********************************************************************
+
+ALIGN 16
+shufb_mask_low: ; pshufb control: even source bytes into the low byte of each word, 80h clears the high byte
+ db 00h, 80h, 02h, 80h, 04h, 80h, 06h, 80h, 08h, 80h, 0ah, 80h, 0ch, 80h, 0eh, 80h
+shufb_mask_high: ; pshufb control: odd source bytes into the low byte of each word, 80h clears the high byte
+ db 01h, 80h, 03h, 80h, 05h, 80h, 07h, 80h, 09h, 80h, 0bh, 80h, 0dh, 80h, 0fh, 80h
+
+
+ALIGN 16
+
+;***********************************************************************
+; Code
+;***********************************************************************
+
+SECTION .text
+
+WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse
+;***********************************************************************
+; void DyadicBilinearDownsamplerWidthx32_sse( unsigned char* pDst, const int iDstStride,
+; unsigned char* pSrc, const int iSrcStride,
+; const int iSrcWidth, const int iSrcHeight );
+;***********************************************************************
+ALIGN 16
+DyadicBilinearDownsamplerWidthx32_sse:
+ push ebx
+ push edx
+ push esi
+ push edi
+ push ebp ; 5 pushes + return address = 24 bytes, so the first argument is at [esp+24]
+
+ mov edi, [esp+24] ; pDst
+ mov edx, [esp+28] ; iDstStride
+ mov esi, [esp+32] ; pSrc
+ mov ecx, [esp+36] ; iSrcStride
+ mov ebp, [esp+44] ; iSrcHeight
+
+ sar ebp, $1 ; iSrcHeight >> 1
+
+.yloops:
+ mov eax, [esp+40] ; iSrcWidth
+ sar eax, $1 ; iSrcWidth >> 1
+ mov ebx, eax ; iDstWidth kept in ebx
+ sar eax, $4 ; (iSrcWidth >> 1) / 16 ; loop count = num_of_mb
+ neg ebx ; - (iSrcWidth >> 1)
+ ; each loop = source bandwidth: 32 bytes
+.xloops:
+ ; 1st part horizontal loop: x16 bytes
+ ; mem hi<- ->lo
+ ;1st Line Src: mm0: d D c C b B a A mm1: h H g G f F e E
+ ;2nd Line Src: mm2: l L k K j J i I mm3: p P o O n N m M
+ ;=> target:
+ ;: H G F E D C B A, P O N M L K J I
+ ;: h g f e d c b a, p o n m l k j i
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movq mm0, [esi] ; 1st pSrc line
+ movq mm1, [esi+8] ; 1st pSrc line + 8
+ movq mm2, [esi+ecx] ; 2nd pSrc line
+ movq mm3, [esi+ecx+8] ; 2nd pSrc line + 8
+
+ ; to handle mm0, mm1, mm2, mm3
+ pshufw mm4, mm0, 0d8h ; d D b B c C a A ; 11011000 B
+ pshufw mm5, mm4, 04eh ; c C a A d D b B ; 01001110 B
+ punpcklbw mm4, mm5 ; d c D C b a B A
+ pshufw mm4, mm4, 0d8h ; d c b a D C B A ; 11011000 B: mm4
+
+ pshufw mm5, mm1, 0d8h ; h H f F g G e E ; 11011000 B
+ pshufw mm6, mm5, 04eh ; g G e E h H f F ; 01001110 B
+ punpcklbw mm5, mm6 ; h g H G f e F E
+ pshufw mm5, mm5, 0d8h ; h g f e H G F E ; 11011000 B: mm5
+
+ pshufw mm6, mm2, 0d8h ; l L j J k K i I ; 11011000 B
+ pshufw mm7, mm6, 04eh ; k K i I l L j J ; 01001110 B
+ punpcklbw mm6, mm7 ; l k L K j i J I
+ pshufw mm6, mm6, 0d8h ; l k j i L K J I ; 11011000 B: mm6
+
+ pshufw mm7, mm3, 0d8h ; p P n N o O m M ; 11011000 B
+ pshufw mm0, mm7, 04eh ; o O m M p P n N ; 01001110 B
+ punpcklbw mm7, mm0 ; p o P O n m N M
+ pshufw mm7, mm7, 0d8h ; p o n m P O N M ; 11011000 B: mm7
+
+ ; to handle mm4, mm5, mm6, mm7
+ movq mm0, mm4 ;
+ punpckldq mm0, mm5 ; H G F E D C B A
+ punpckhdq mm4, mm5 ; h g f e d c b a
+
+ movq mm1, mm6
+ punpckldq mm1, mm7 ; P O N M L K J I
+ punpckhdq mm6, mm7 ; p o n m l k j i
+
+ ; avg within MB horizon width (16 x 2 lines)
+ pavgb mm0, mm4 ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
+ pavgb mm1, mm6 ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
+ pavgb mm0, mm1 ; (temp_row1+temp_row2+1)>>1, kept in mm0 until the 2nd horizontal part is done, then both are written
+
+ ; 2nd part horizontal loop: x16 bytes
+ ; mem hi<- ->lo
+ ;1st Line Src: mm0: d D c C b B a A mm1: h H g G f F e E
+ ;2nd Line Src: mm2: l L k K j J i I mm3: p P o O n N m M
+ ;=> target:
+ ;: H G F E D C B A, P O N M L K J I
+ ;: h g f e d c b a, p o n m l k j i
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movq mm1, [esi+16] ; 1st pSrc line + 16
+ movq mm2, [esi+24] ; 1st pSrc line + 24
+ movq mm3, [esi+ecx+16] ; 2nd pSrc line + 16
+ movq mm4, [esi+ecx+24] ; 2nd pSrc line + 24
+
+ ; to handle mm1, mm2, mm3, mm4
+ pshufw mm5, mm1, 0d8h ; d D b B c C a A ; 11011000 B
+ pshufw mm6, mm5, 04eh ; c C a A d D b B ; 01001110 B
+ punpcklbw mm5, mm6 ; d c D C b a B A
+ pshufw mm5, mm5, 0d8h ; d c b a D C B A ; 11011000 B: mm5
+
+ pshufw mm6, mm2, 0d8h ; h H f F g G e E ; 11011000 B
+ pshufw mm7, mm6, 04eh ; g G e E h H f F ; 01001110 B
+ punpcklbw mm6, mm7 ; h g H G f e F E
+ pshufw mm6, mm6, 0d8h ; h g f e H G F E ; 11011000 B: mm6
+
+ pshufw mm7, mm3, 0d8h ; l L j J k K i I ; 11011000 B
+ pshufw mm1, mm7, 04eh ; k K i I l L j J ; 01001110 B
+ punpcklbw mm7, mm1 ; l k L K j i J I
+ pshufw mm7, mm7, 0d8h ; l k j i L K J I ; 11011000 B: mm7
+
+ pshufw mm1, mm4, 0d8h ; p P n N o O m M ; 11011000 B
+ pshufw mm2, mm1, 04eh ; o O m M p P n N ; 01001110 B
+ punpcklbw mm1, mm2 ; p o P O n m N M
+ pshufw mm1, mm1, 0d8h ; p o n m P O N M ; 11011000 B: mm1
+
+ ; to handle mm5, mm6, mm7, mm1
+ movq mm2, mm5
+ punpckldq mm2, mm6 ; H G F E D C B A
+ punpckhdq mm5, mm6 ; h g f e d c b a
+
+ movq mm3, mm7
+ punpckldq mm3, mm1 ; P O N M L K J I
+ punpckhdq mm7, mm1 ; p o n m l k j i
+
+ ; avg within MB horizon width (16 x 2 lines)
+ pavgb mm2, mm5 ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
+ pavgb mm3, mm7 ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
+ pavgb mm2, mm3 ; (temp_row1+temp_row2+1)>>1, result of the 2nd horizontal part
+
+ movq [edi ], mm0
+ movq [edi+8], mm2
+
+ ; next SMB
+ lea esi, [esi+32]
+ lea edi, [edi+16]
+
+ dec eax
+ jg near .xloops
+
+ ; next line
+ lea esi, [esi+2*ecx] ; next end of lines
+ lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
+ lea edi, [edi+edx]
+ lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
+
+ dec ebp
+ jg near .yloops
+
+ WELSEMMS
+ pop ebp
+ pop edi
+ pop esi
+ pop edx
+ pop ebx
+ ret
+
+WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse
+;***********************************************************************
+; void DyadicBilinearDownsamplerWidthx16_sse( unsigned char* pDst, const int iDstStride,
+; unsigned char* pSrc, const int iSrcStride,
+; const int iSrcWidth, const int iSrcHeight );
+;***********************************************************************
+ALIGN 16
+DyadicBilinearDownsamplerWidthx16_sse:
+ push ebx
+ push edx
+ push esi
+ push edi
+ push ebp ; 5 pushes + return address = 24 bytes, so the first argument is at [esp+24]
+
+ mov edi, [esp+24] ; pDst
+ mov edx, [esp+28] ; iDstStride
+ mov esi, [esp+32] ; pSrc
+ mov ecx, [esp+36] ; iSrcStride
+ mov ebp, [esp+44] ; iSrcHeight
+
+ sar ebp, $1 ; iSrcHeight >> 1
+
+.yloops:
+ mov eax, [esp+40] ; iSrcWidth
+ sar eax, $1 ; iSrcWidth >> 1
+ mov ebx, eax ; iDstWidth kept in ebx
+ sar eax, $3 ; (iSrcWidth >> 1) / 8 ; loop count = num_of_mb
+ neg ebx ; - (iSrcWidth >> 1)
+ ; each loop = source bandwidth: 16 bytes
+.xloops:
+ ; horizontal loop: x16 bytes
+ ; mem hi<- ->lo
+ ;1st Line Src: mm0: d D c C b B a A mm1: h H g G f F e E
+ ;2nd Line Src: mm2: l L k K j J i I mm3: p P o O n N m M
+ ;=> target:
+ ;: H G F E D C B A, P O N M L K J I
+ ;: h g f e d c b a, p o n m l k j i
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movq mm0, [esi] ; 1st pSrc line
+ movq mm1, [esi+8] ; 1st pSrc line + 8
+ movq mm2, [esi+ecx] ; 2nd pSrc line
+ movq mm3, [esi+ecx+8] ; 2nd pSrc line + 8
+
+ ; to handle mm0, mm1, mm2, mm3
+ pshufw mm4, mm0, 0d8h ; d D b B c C a A ; 11011000 B
+ pshufw mm5, mm4, 04eh ; c C a A d D b B ; 01001110 B
+ punpcklbw mm4, mm5 ; d c D C b a B A
+ pshufw mm4, mm4, 0d8h ; d c b a D C B A ; 11011000 B: mm4
+
+ pshufw mm5, mm1, 0d8h ; h H f F g G e E ; 11011000 B
+ pshufw mm6, mm5, 04eh ; g G e E h H f F ; 01001110 B
+ punpcklbw mm5, mm6 ; h g H G f e F E
+ pshufw mm5, mm5, 0d8h ; h g f e H G F E ; 11011000 B: mm5
+
+ pshufw mm6, mm2, 0d8h ; l L j J k K i I ; 11011000 B
+ pshufw mm7, mm6, 04eh ; k K i I l L j J ; 01001110 B
+ punpcklbw mm6, mm7 ; l k L K j i J I
+ pshufw mm6, mm6, 0d8h ; l k j i L K J I ; 11011000 B: mm6
+
+ pshufw mm7, mm3, 0d8h ; p P n N o O m M ; 11011000 B
+ pshufw mm0, mm7, 04eh ; o O m M p P n N ; 01001110 B
+ punpcklbw mm7, mm0 ; p o P O n m N M
+ pshufw mm7, mm7, 0d8h ; p o n m P O N M ; 11011000 B: mm7
+
+ ; to handle mm4, mm5, mm6, mm7
+ movq mm0, mm4 ;
+ punpckldq mm0, mm5 ; H G F E D C B A
+ punpckhdq mm4, mm5 ; h g f e d c b a
+
+ movq mm1, mm6
+ punpckldq mm1, mm7 ; P O N M L K J I
+ punpckhdq mm6, mm7 ; p o n m l k j i
+
+ ; avg within MB horizon width (16 x 2 lines)
+ pavgb mm0, mm4 ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
+ pavgb mm1, mm6 ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
+ pavgb mm0, mm1 ; (temp_row1+temp_row2+1)>>1, written to pDst by the next instruction
+
+ movq [edi ], mm0
+
+ ; next SMB
+ lea esi, [esi+16]
+ lea edi, [edi+8]
+
+ dec eax
+ jg near .xloops
+
+ ; next line
+ lea esi, [esi+2*ecx] ; next end of lines
+ lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
+ lea edi, [edi+edx]
+ lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
+
+ dec ebp
+ jg near .yloops
+
+ WELSEMMS
+ pop ebp
+ pop edi
+ pop esi
+ pop edx
+ pop ebx
+ ret
+
+WELS_EXTERN DyadicBilinearDownsamplerWidthx8_sse
+;***********************************************************************
+; void DyadicBilinearDownsamplerWidthx8_sse( unsigned char* pDst, const int iDstStride,
+; unsigned char* pSrc, const int iSrcStride,
+; const int iSrcWidth, const int iSrcHeight );
+;***********************************************************************
+ALIGN 16
+DyadicBilinearDownsamplerWidthx8_sse:
+ push ebx
+ push edx
+ push esi
+ push edi
+ push ebp ; 5 pushes + return address = 24 bytes, so the first argument is at [esp+24]
+
+ mov edi, [esp+24] ; pDst
+ mov edx, [esp+28] ; iDstStride
+ mov esi, [esp+32] ; pSrc
+ mov ecx, [esp+36] ; iSrcStride
+ mov ebp, [esp+44] ; iSrcHeight
+
+ sar ebp, $1 ; iSrcHeight >> 1
+
+.yloops:
+ mov eax, [esp+40] ; iSrcWidth
+ sar eax, $1 ; iSrcWidth >> 1
+ mov ebx, eax ; iDstWidth kept in ebx
+ sar eax, $2 ; (iSrcWidth >> 1) / 4 ; loop count = num_of_mb
+ neg ebx ; - (iSrcWidth >> 1)
+ ; each loop = source bandwidth: 8 bytes
+.xloops:
+ ; horizontal loop: x8 bytes
+ ; mem hi<- ->lo
+ ;1st Line Src: mm0: d D c C b B a A
+ ;2nd Line Src: mm1: h H g G f F e E
+ ;=> target:
+ ;: H G F E D C B A
+ ;: h g f e d c b a
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movq mm0, [esi] ; 1st pSrc line
+ movq mm1, [esi+ecx] ; 2nd pSrc line
+
+ ; to handle mm0, mm1
+ pshufw mm2, mm0, 0d8h ; d D b B c C a A ; 11011000 B
+ pshufw mm3, mm2, 04eh ; c C a A d D b B ; 01001110 B
+ punpcklbw mm2, mm3 ; d c D C b a B A
+ pshufw mm2, mm2, 0d8h ; d c b a D C B A ; 11011000 B: mm2
+
+ pshufw mm4, mm1, 0d8h ; h H f F g G e E ; 11011000 B
+ pshufw mm5, mm4, 04eh ; g G e E h H f F ; 01001110 B
+ punpcklbw mm4, mm5 ; h g H G f e F E
+ pshufw mm4, mm4, 0d8h ; h g f e H G F E ; 11011000 B: mm4
+
+ ; to handle mm2, mm4
+ movq mm0, mm2 ;
+ punpckldq mm0, mm4 ; H G F E D C B A
+ punpckhdq mm2, mm4 ; h g f e d c b a
+
+ ; avg within the 8-byte horizontal span (8 x 2 lines)
+ pavgb mm0, mm2 ; (H+h+1)>>1, .., (A+a+1)>>1, temp_row1, 2
+ pshufw mm1, mm0, 04eh ; 01001110 B: swap the two dwords so the two rows line up
+ pavgb mm0, mm1 ; (temp_row1+temp_row2+1)>>1, low 4 bytes are written to pDst by the next instruction
+
+ movd [edi], mm0
+
+ ; next unit
+ lea esi, [esi+8]
+ lea edi, [edi+4]
+
+ dec eax
+ jg near .xloops
+
+ ; next line
+ lea esi, [esi+2*ecx] ; next end of lines
+ lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
+ lea edi, [edi+edx]
+ lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
+
+ dec ebp
+ jg near .yloops
+
+ WELSEMMS
+ pop ebp
+ pop edi
+ pop esi
+ pop edx
+ pop ebx
+ ret
+
+
+
+; got about 50% improvement over DyadicBilinearDownsamplerWidthx32_sse
+WELS_EXTERN DyadicBilinearDownsamplerWidthx32_ssse3
+;***********************************************************************
+; void DyadicBilinearDownsamplerWidthx32_ssse3( unsigned char* pDst, const int iDstStride,
+; unsigned char* pSrc, const int iSrcStride,
+; const int iSrcWidth, const int iSrcHeight );
+;
+; 2:1 downsample in both directions (32-bit, stack arguments).
+; For every 2x2 source cell {t0 t1 / b0 b1} one byte is produced:
+;     out = avg( avg(t0, t1), avg(b0, b1) ),  avg(x, y) = (x + y + 1) >> 1 (pavgb)
+; Each inner iteration consumes 32 source bytes from two lines and
+; stores 16 destination bytes.
+;
+; Requirements visible in the code:
+;  - all loads/stores are movdqa, so every address touched ([esi],
+;    [esi+16], [esi+ecx], [esi+ecx+16], [edi]) must be 16-byte aligned;
+;  - both loops are bottom-tested (dec / jg), so one 32-byte chunk and one
+;    line pair are always processed, even when the computed count is 0.
+; shufb_mask_low / shufb_mask_high are defined outside this chunk; per the
+; lane diagrams below they presumably spread the even / odd bytes into
+; zero-extended words -- TODO confirm against their definition.
+; Register use: esi = pSrc cursor, edi = pDst cursor, ecx = iSrcStride,
+; edx = iDstStride, ebp = remaining line pairs, eax = chunks left in the
+; current line, ebx = -(iSrcWidth >> 1).
+;***********************************************************************
+ALIGN 16
+DyadicBilinearDownsamplerWidthx32_ssse3:
+ push ebx
+ push edx
+ push esi
+ push edi
+ push ebp
+
+ ; 5 pushes (20 bytes) + return address: first argument is at esp+24
+ mov edi, [esp+24] ; pDst
+ mov edx, [esp+28] ; iDstStride
+ mov esi, [esp+32] ; pSrc
+ mov ecx, [esp+36] ; iSrcStride
+ mov ebp, [esp+44] ; iSrcHeight
+
+ sar ebp, $1 ; iSrcHeight >> 1
+
+ movdqa xmm7, [shufb_mask_low] ; mask low
+ movdqa xmm6, [shufb_mask_high] ; mask high
+
+.yloops:
+ ; iSrcWidth is re-read from the stack for every line pair because eax
+ ; doubles as the inner loop counter
+ mov eax, [esp+40] ; iSrcWidth
+ sar eax, $1 ; iSrcWidth >> 1
+ mov ebx, eax ; iDstWidth restored at ebx
+ sar eax, $4 ; (iSrcWidth >> 1) / 16 ; loop count = num_of_mb
+ neg ebx ; - (iSrcWidth >> 1)
+ ; each loop = source bandwidth: 32 bytes
+.xloops:
+ ; 1st part horizontal loop: x16 bytes
+ ; mem hi<- ->lo
+ ;1st Line Src: xmm0: h H g G f F e E d D c C b B a A
+ ; xmm1: p P o O n N m M l L k K j J i I
+ ;2nd Line Src: xmm2: h H g G f F e E d D c C b B a A
+ ; xmm3: p P o O n N m M l L k K j J i I
+ ;=> target:
+ ;: P O N M L K J I H G F E D C B A
+ ;: p o n m l k j i h g f e d c b a
+ ;: P .. A
+ ;: p .. a
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movdqa xmm0, [esi] ; 1st_src_line
+ movdqa xmm1, [esi+16] ; 1st_src_line + 16
+ movdqa xmm2, [esi+ecx] ; 2nd_src_line
+ movdqa xmm3, [esi+ecx+16] ; 2nd_src_line + 16
+
+ ; packing & avg
+ ; each 16-byte load is split into two shuffled copies which are then
+ ; averaged byte-wise: this is the horizontal pair average
+ movdqa xmm4, xmm0 ; h H g G f F e E d D c C b B a A
+ pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
+ pshufb xmm4, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+ ; another implementation for xmm4 high bits
+; psubb xmm4, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
+; psrlw xmm4, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+ pavgb xmm0, xmm4
+
+ movdqa xmm5, xmm1
+ pshufb xmm1, xmm7
+ pshufb xmm5, xmm6
+; psubb xmm5, xmm1
+; psrlw xmm5, 8
+ pavgb xmm1, xmm5
+
+ movdqa xmm4, xmm2
+ pshufb xmm2, xmm7
+ pshufb xmm4, xmm6
+; psubb xmm4, xmm2
+; psrlw xmm4, 8
+ pavgb xmm2, xmm4
+
+ movdqa xmm5, xmm3
+ pshufb xmm3, xmm7
+ pshufb xmm5, xmm6
+; psubb xmm5, xmm3
+; psrlw xmm5, 8
+ pavgb xmm3, xmm5
+
+ ; narrow the word lanes back to bytes (16 per line), then average the
+ ; two lines vertically
+ packuswb xmm0, xmm1
+ packuswb xmm2, xmm3
+ pavgb xmm0, xmm2
+
+ ; write pDst
+ movdqa [edi], xmm0
+
+ ; next SMB
+ lea esi, [esi+32]
+ lea edi, [edi+16]
+
+ dec eax
+ jg near .xloops
+
+ ; next line
+ ; esi moves down two source lines and back by the 2*iDstWidth bytes
+ ; just consumed; edi moves down one destination line and back by
+ ; iDstWidth (ebx holds -iDstWidth)
+ lea esi, [esi+2*ecx] ; next end of lines
+ lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
+ lea edi, [edi+edx]
+ lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
+
+ dec ebp
+ jg near .yloops
+
+ pop ebp
+ pop edi
+ pop esi
+ pop edx
+ pop ebx
+ ret
+
+WELS_EXTERN DyadicBilinearDownsamplerWidthx16_ssse3
+;***********************************************************************
+; void DyadicBilinearDownsamplerWidthx16_ssse3( unsigned char* pDst, const int iDstStride,
+; unsigned char* pSrc, const int iSrcStride,
+; const int iSrcWidth, const int iSrcHeight );
+;
+; 2:1 downsample in both directions, 16 source bytes per line per
+; iteration (8 destination bytes). For every 2x2 source cell:
+;     out = avg( avg(t0, t1), avg(b0, b1) ),  avg(x, y) = (x + y + 1) >> 1 (pavgb)
+;
+; Requirements visible in the code:
+;  - source loads are movdqa, so [esi] and [esi+ecx] must be 16-byte
+;    aligned; the destination is written with an 8-byte movq;
+;  - both loops are bottom-tested (dec / jg): one chunk and one line pair
+;    are always processed, even when the computed count is 0.
+; shufb_mask_low / shufb_mask_high are defined outside this chunk; per the
+; lane diagrams below they presumably spread the even / odd bytes into
+; zero-extended words -- TODO confirm against their definition.
+;***********************************************************************
+ALIGN 16
+DyadicBilinearDownsamplerWidthx16_ssse3:
+ push ebx
+ push edx
+ push esi
+ push edi
+ push ebp
+
+ ; 5 pushes (20 bytes) + return address: first argument is at esp+24
+ mov edi, [esp+24] ; pDst
+ mov edx, [esp+28] ; iDstStride
+ mov esi, [esp+32] ; pSrc
+ mov ecx, [esp+36] ; iSrcStride
+ mov ebp, [esp+44] ; iSrcHeight
+
+ sar ebp, $1 ; iSrcHeight >> 1
+ movdqa xmm7, [shufb_mask_low] ; mask low
+ movdqa xmm6, [shufb_mask_high] ; mask high
+
+.yloops:
+ ; eax is the inner loop counter, so the width is reloaded per line pair
+ mov eax, [esp+40] ; iSrcWidth
+ sar eax, $1 ; iSrcWidth >> 1
+ mov ebx, eax ; iDstWidth restored at ebx
+ sar eax, $3 ; (iSrcWidth >> 1) / 8 ; loop count = num_of_mb
+ neg ebx ; - (iSrcWidth >> 1)
+ ; each loop = source bandwidth: 16 bytes
+.xloops:
+ ; horizontal loop: x16 bytes by source
+ ; mem hi<- ->lo
+ ;1st line pSrc: xmm0: h H g G f F e E d D c C b B a A
+ ;2nd line pSrc: xmm1: p P o O n N m M l L k K j J i I
+ ;=> target:
+ ;: H G F E D C B A, P O N M L K J I
+ ;: h g f e d c b a, p o n m l k j i
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movdqa xmm0, [esi] ; 1st_src_line
+ movdqa xmm1, [esi+ecx] ; 2nd_src_line
+
+ ; packing & avg
+ movdqa xmm2, xmm0 ; h H g G f F e E d D c C b B a A
+ pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
+ pshufb xmm2, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+ ; another implementation for xmm2 high bits
+; psubb xmm2, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
+; psrlw xmm2, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+ pavgb xmm0, xmm2
+
+ movdqa xmm3, xmm1
+ pshufb xmm1, xmm7
+ pshufb xmm3, xmm6
+; psubb xmm3, xmm1
+; psrlw xmm3, 8
+ pavgb xmm1, xmm3
+
+ ; vertical average of the two lines, then narrow to bytes; only the low
+ ; 8 bytes (the packed xmm0 words) are stored below
+ pavgb xmm0, xmm1
+ packuswb xmm0, xmm1
+
+ ; write pDst
+ movq [edi], xmm0
+
+ ; next SMB
+ lea esi, [esi+16]
+ lea edi, [edi+8]
+
+ dec eax
+ jg near .xloops
+
+ ; next line
+ ; ebx = -iDstWidth rewinds both cursors to the start of the line
+ lea esi, [esi+2*ecx] ; next end of lines
+ lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
+ lea edi, [edi+edx]
+ lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
+
+ dec ebp
+ jg near .yloops
+
+ pop ebp
+ pop edi
+ pop esi
+ pop edx
+ pop ebx
+ ret
+
+; got about 65% improvement over DyadicBilinearDownsamplerWidthx32_sse
+WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse4
+;***********************************************************************
+; void DyadicBilinearDownsamplerWidthx32_sse4( unsigned char* pDst, const int iDstStride,
+; unsigned char* pSrc, const int iSrcStride,
+; const int iSrcWidth, const int iSrcHeight );
+;
+; 2:1 downsample in both directions, 32 source bytes per line per
+; iteration (16 destination bytes). Same arithmetic as the code below
+; shows for every 2x2 source cell:
+;     out = avg( avg(t0, t1), avg(b0, b1) ),  avg(x, y) = (x + y + 1) >> 1 (pavgb)
+; The source is fetched with movntdqa (SSE4.1 streaming load).
+;
+; Requirements visible in the code:
+;  - movntdqa loads and the movdqa store need 16-byte aligned addresses
+;    ([esi], [esi+16], [esi+ecx], [esi+ecx+16], [edi]);
+;  - both loops are bottom-tested (dec / jg): one chunk and one line pair
+;    are always processed, even when the computed count is 0.
+; shufb_mask_low / shufb_mask_high are defined outside this chunk; per the
+; lane diagrams below they presumably spread the even / odd bytes into
+; zero-extended words -- TODO confirm against their definition.
+;***********************************************************************
+ALIGN 16
+DyadicBilinearDownsamplerWidthx32_sse4:
+ push ebx
+ push edx
+ push esi
+ push edi
+ push ebp
+
+ ; 5 pushes (20 bytes) + return address: first argument is at esp+24
+ mov edi, [esp+24] ; pDst
+ mov edx, [esp+28] ; iDstStride
+ mov esi, [esp+32] ; pSrc
+ mov ecx, [esp+36] ; iSrcStride
+ mov ebp, [esp+44] ; iSrcHeight
+
+ sar ebp, $1 ; iSrcHeight >> 1
+
+ movdqa xmm7, [shufb_mask_low] ; mask low
+ movdqa xmm6, [shufb_mask_high] ; mask high
+
+.yloops:
+ ; eax is the inner loop counter, so the width is reloaded per line pair
+ mov eax, [esp+40] ; iSrcWidth
+ sar eax, $1 ; iSrcWidth >> 1
+ mov ebx, eax ; iDstWidth restored at ebx
+ sar eax, $4 ; (iSrcWidth >> 1) / 16 ; loop count = num_of_mb
+ neg ebx ; - (iSrcWidth >> 1)
+ ; each loop = source bandwidth: 32 bytes
+.xloops:
+ ; 1st part horizontal loop: x16 bytes
+ ; mem hi<- ->lo
+ ;1st Line Src: xmm0: h H g G f F e E d D c C b B a A
+ ; xmm1: p P o O n N m M l L k K j J i I
+ ;2nd Line Src: xmm2: h H g G f F e E d D c C b B a A
+ ; xmm3: p P o O n N m M l L k K j J i I
+ ;=> target:
+ ;: P O N M L K J I H G F E D C B A
+ ;: p o n m l k j i h g f e d c b a
+ ;: P .. A
+ ;: p .. a
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movntdqa xmm0, [esi] ; 1st_src_line
+ movntdqa xmm1, [esi+16] ; 1st_src_line + 16
+ movntdqa xmm2, [esi+ecx] ; 2nd_src_line
+ movntdqa xmm3, [esi+ecx+16] ; 2nd_src_line + 16
+
+ ; packing & avg
+ ; horizontal pair average of each 16-byte load
+ movdqa xmm4, xmm0 ; h H g G f F e E d D c C b B a A
+ pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
+ pshufb xmm4, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+; psubb xmm4, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
+; psrlw xmm4, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+ pavgb xmm0, xmm4
+
+ movdqa xmm5, xmm1
+ pshufb xmm1, xmm7
+ pshufb xmm5, xmm6
+; psubb xmm5, xmm1
+; psrlw xmm5, 8
+ pavgb xmm1, xmm5
+
+ movdqa xmm4, xmm2
+ pshufb xmm2, xmm7
+ pshufb xmm4, xmm6
+; psubb xmm4, xmm2
+; psrlw xmm4, 8
+ pavgb xmm2, xmm4
+
+ movdqa xmm5, xmm3
+ pshufb xmm3, xmm7
+ pshufb xmm5, xmm6
+; psubb xmm5, xmm3
+; psrlw xmm5, 8
+ pavgb xmm3, xmm5
+
+ ; narrow the word lanes back to bytes (16 per line), then average the
+ ; two lines vertically
+ packuswb xmm0, xmm1
+ packuswb xmm2, xmm3
+ pavgb xmm0, xmm2
+
+ ; write pDst
+ movdqa [edi], xmm0
+
+ ; next SMB
+ lea esi, [esi+32]
+ lea edi, [edi+16]
+
+ dec eax
+ jg near .xloops
+
+ ; next line
+ ; ebx = -iDstWidth rewinds both cursors to the start of the line
+ lea esi, [esi+2*ecx] ; next end of lines
+ lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
+ lea edi, [edi+edx]
+ lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
+
+ dec ebp
+ jg near .yloops
+
+ pop ebp
+ pop edi
+ pop esi
+ pop edx
+ pop ebx
+ ret
+
+WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse4
+;***********************************************************************
+; void DyadicBilinearDownsamplerWidthx16_sse4( unsigned char* pDst, const int iDstStride,
+; unsigned char* pSrc, const int iSrcStride,
+; const int iSrcWidth, const int iSrcHeight );
+;
+; 2:1 downsample in both directions, 16 source bytes per line per
+; iteration (8 destination bytes), source fetched with movntdqa
+; (SSE4.1 streaming load). For every 2x2 source cell:
+;     out = avg( avg(t0, t1), avg(b0, b1) ),  avg(x, y) = (x + y + 1) >> 1 (pavgb)
+;
+; Requirements visible in the code:
+;  - movntdqa needs 16-byte aligned addresses ([esi], [esi+ecx]); the
+;    destination is written with an 8-byte movq;
+;  - both loops are bottom-tested (dec / jg): one chunk and one line pair
+;    are always processed, even when the computed count is 0.
+; shufb_mask_low / shufb_mask_high are defined outside this chunk; per the
+; lane diagrams below they presumably spread the even / odd bytes into
+; zero-extended words -- TODO confirm against their definition.
+;***********************************************************************
+ALIGN 16
+DyadicBilinearDownsamplerWidthx16_sse4:
+ push ebx
+ push edx
+ push esi
+ push edi
+ push ebp
+
+ ; 5 pushes (20 bytes) + return address: first argument is at esp+24
+ mov edi, [esp+24] ; pDst
+ mov edx, [esp+28] ; iDstStride
+ mov esi, [esp+32] ; pSrc
+ mov ecx, [esp+36] ; iSrcStride
+ mov ebp, [esp+44] ; iSrcHeight
+
+ sar ebp, $1 ; iSrcHeight >> 1
+ movdqa xmm7, [shufb_mask_low] ; mask low
+ movdqa xmm6, [shufb_mask_high] ; mask high
+
+.yloops:
+ ; eax is the inner loop counter, so the width is reloaded per line pair
+ mov eax, [esp+40] ; iSrcWidth
+ sar eax, $1 ; iSrcWidth >> 1
+ mov ebx, eax ; iDstWidth restored at ebx
+ sar eax, $3 ; (iSrcWidth >> 1) / 8 ; loop count = num_of_mb
+ neg ebx ; - (iSrcWidth >> 1)
+ ; each loop = source bandwidth: 16 bytes
+.xloops:
+ ; horizontal loop: x16 bytes by source
+ ; mem hi<- ->lo
+ ;1st line pSrc: xmm0: h H g G f F e E d D c C b B a A
+ ;2nd line pSrc: xmm1: p P o O n N m M l L k K j J i I
+ ;=> target:
+ ;: H G F E D C B A, P O N M L K J I
+ ;: h g f e d c b a, p o n m l k j i
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movntdqa xmm0, [esi] ; 1st_src_line
+ movntdqa xmm1, [esi+ecx] ; 2nd_src_line
+
+ ; packing & avg
+ movdqa xmm2, xmm0 ; h H g G f F e E d D c C b B a A
+ pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
+ pshufb xmm2, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+; psubb xmm2, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
+; psrlw xmm2, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+ pavgb xmm0, xmm2
+
+ movdqa xmm3, xmm1
+ pshufb xmm1, xmm7
+ pshufb xmm3, xmm6
+; psubb xmm3, xmm1
+; psrlw xmm3, 8
+ pavgb xmm1, xmm3
+
+ ; vertical average of the two lines, then narrow to bytes; only the low
+ ; 8 bytes (the packed xmm0 words) are stored below
+ pavgb xmm0, xmm1
+ packuswb xmm0, xmm1
+
+ ; write pDst
+ movq [edi], xmm0
+
+ ; next SMB
+ lea esi, [esi+16]
+ lea edi, [edi+8]
+
+ dec eax
+ jg near .xloops
+
+ ; next line
+ ; ebx = -iDstWidth rewinds both cursors to the start of the line
+ lea esi, [esi+2*ecx] ; next end of lines
+ lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
+ lea edi, [edi+edx]
+ lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
+
+ dec ebp
+ jg near .yloops
+
+ pop ebp
+ pop edi
+ pop esi
+ pop edx
+ pop ebx
+ ret
+
+
+
+
+
+WELS_EXTERN GeneralBilinearAccurateDownsampler_sse2
+;**************************************************************************************************************
+;int GeneralBilinearAccurateDownsampler_sse2( unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
+; unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,
+; unsigned int uiScaleX, unsigned int uiScaleY );
+;
+; Arbitrary-ratio bilinear downsampler (32-bit, stack arguments).
+;  - uiScaleX / uiScaleY are the source steps per destination pixel in
+;    fixed point with 15 fractional bits: the source column / row is
+;    xInverse >> 15 / yInverse >> 15, both starting at 16384 (0.5).
+;  - Interior pixels blend the 2x2 source neighbourhood {a b / c d} with
+;    15-bit weights u, v; the four 30-bit weight products are multiplied
+;    by the pixels in 64-bit lanes, summed, and rounded: ((sum >> 29) + 1) >> 1.
+;  - The last destination column and the last destination row are plain
+;    copies of the nearest source sample, so the blend never needs the
+;    neighbour to the right of / below them.
+;  - iDstWidth and iDstHeight must be >= 1. A width or height of exactly 1
+;    is handled by skipping the blended part (the original bottom-tested
+;    loops wrote an extra row for height 1 and let `loop` wrap around for
+;    width 1).
+;  - iSrcWidth / iSrcHeight are not read by this routine.
+; NOTE(review): the prototype above says int, but eax is only used as a
+; scratch register and no return value is set up -- confirm the C
+; declaration treats this as void.
+; NOTE(review): the movd loads in the inner loop fetch 4 bytes although
+; only 2 are used; confirm the source buffer tolerates a 2-byte over-read.
+;**************************************************************************************************************
+
+ALIGN 16
+GeneralBilinearAccurateDownsampler_sse2:
+ push ebp
+ push esi
+ push edi
+ push ebx
+; stack layout after `sub esp, localsize`:
+;   [esp + 0 .. 27]                      locals
+;   [esp + localsize .. + pushsize - 1]  saved ebx, edi, esi, ebp
+;   [esp + localsize + pushsize]         return address
+;   [esp + localsize + pushsize + 4 ..]  arguments
+%define pushsize 16
+%define localsize 28
+%define pDstData esp + pushsize + localsize + 4
+%define dwDstStride esp + pushsize + localsize + 8
+%define dwDstWidth esp + pushsize + localsize + 12
+%define dwDstHeight esp + pushsize + localsize + 16
+%define pSrcData esp + pushsize + localsize + 20
+%define dwSrcStride esp + pushsize + localsize + 24
+%define dwSrcWidth esp + pushsize + localsize + 28
+%define dwSrcHeight esp + pushsize + localsize + 32
+%define scale esp + 0
+%define uiScaleX esp + pushsize + localsize + 36
+%define uiScaleY esp + pushsize + localsize + 40
+%define tmpHeight esp + 12
+%define yInverse esp + 16
+%define xInverse esp + 20
+%define dstStep esp + 24
+ sub esp, localsize
+
+ ; xmm7 = per-pixel increment of the horizontal weights
+ pxor xmm0, xmm0
+ mov eax, [uiScaleX]
+ and eax, 32767
+ mov ebx, eax
+ neg ebx
+ and ebx, 32767
+ movd xmm1, eax ; uinc(uiScaleX mod 32768)
+ movd xmm2, ebx ; -uinc (mod 32768)
+ psllq xmm1, 32
+ por xmm1, xmm2 ; 0 0 uinc -uinc (dword)
+ pshufd xmm7, xmm1, 01000100b ; xmm7: uinc -uinc uinc -uinc
+
+ ; xmm6 = per-row increment of the vertical weights
+ mov eax, [uiScaleY]
+ and eax, 32767
+ mov ebx, eax
+ neg ebx
+ and ebx, 32767
+ movd xmm6, eax ; vinc(uiScaleY mod 32768)
+ movd xmm2, ebx ; -vinc (mod 32768)
+ psllq xmm6, 32
+ por xmm6, xmm2 ; 0 0 vinc -vinc (dword)
+ pshufd xmm6, xmm6, 01010000b ; xmm6: vinc vinc -vinc -vinc
+
+ ; xmm5 = initial weight pair (16384, 16383), one per dword
+ mov edx, 40003fffh
+ movd xmm5, edx
+ punpcklwd xmm5, xmm0 ; 16384 16383
+ pshufd xmm5, xmm5, 01000100b ; xmm5: 16384 16383 16384 16383
+
+
+DOWNSAMPLE:
+
+ mov eax, [dwDstHeight]
+ mov edi, [pDstData]
+ mov edx, [dwDstStride]
+ mov ecx, [dwDstWidth]
+ sub edx, ecx
+ mov [dstStep], edx ; stride - width
+ dec eax
+ mov [tmpHeight], eax ; number of blended rows (all but the last)
+ mov eax, 16384
+ mov [yInverse], eax
+
+ pshufd xmm4, xmm5, 01010000b ; initial v to 16384 16384 16383 16383
+
+ ; The row loop below is bottom-tested; with no blended row to produce
+ ; (iDstHeight == 1) go straight to the copied last row instead of
+ ; emitting one row too many.
+ cmp dword [tmpHeight], 0
+ jle near LAST_ROW
+
+HEIGHT:
+ mov eax, [yInverse]
+ mov esi, [pSrcData]
+ shr eax, 15
+ mul dword [dwSrcStride] ; clobbers edx
+ add esi, eax ; get current row address
+ mov ebp, esi
+ add ebp, [dwSrcStride] ; ebp = row below
+
+ mov eax, 16384
+ mov [xInverse], eax
+
+ movdqa xmm3, xmm5 ; initial u to 16384 16383 16384 16383
+
+ ; `loop` decrements ecx before testing it, so entering WIDTH with
+ ; ecx == 0 would run 2^32 iterations; with no blended column
+ ; (iDstWidth == 1) only the copied last column is produced.
+ mov ecx, [dwDstWidth]
+ dec ecx
+ jle near WIDTH_END
+
+WIDTH:
+ mov eax, [xInverse]
+ shr eax, 15
+
+ movd xmm1, [esi+eax] ; xxxxxxba
+ movd xmm2, [ebp+eax] ; xxxxxxdc
+ pxor xmm0, xmm0 ; xmm0 is reused as scratch below, re-zero it
+ punpcklwd xmm1, xmm2 ; xxxxdcba
+ punpcklbw xmm1, xmm0 ; 0d0c0b0a
+ punpcklwd xmm1, xmm0 ; 000d000c000b000a
+
+ movdqa xmm2, xmm4 ; xmm2: vv(1-v)(1-v) tmpv
+ pmaddwd xmm2, xmm3 ; mul u(1-u)u(1-u) on xmm2
+ movdqa xmm0, xmm2
+ pmuludq xmm2, xmm1 ; even dwords: weight * pixel (64-bit)
+ psrlq xmm0, 32
+ psrlq xmm1, 32
+ pmuludq xmm0, xmm1 ; odd dwords: weight * pixel (64-bit)
+ paddq xmm2, xmm0
+ pshufd xmm1, xmm2, 00001110b
+ paddq xmm2, xmm1 ; low qword = sum of the four products
+ psrlq xmm2, 29
+
+ movd eax, xmm2
+ inc eax ; round to nearest: (sum >> 29 + 1) >> 1
+ shr eax, 1
+ mov [edi], al
+ inc edi
+
+ mov eax, [uiScaleX]
+ add [xInverse], eax
+
+ paddw xmm3, xmm7 ; inc u
+ psllw xmm3, 1 ; drop bit 15: keep the weights mod 32768
+ psrlw xmm3, 1
+
+ loop WIDTH
+
+WIDTH_END:
+ ; last column of the row: copy the nearest source sample
+ mov eax, [xInverse]
+ shr eax, 15
+ mov cl, [esi+eax]
+ mov [edi], cl
+ inc edi
+
+ mov eax, [uiScaleY]
+ add [yInverse], eax
+ add edi, [dstStep]
+
+ paddw xmm4, xmm6 ; inc v
+ psllw xmm4, 1 ; drop bit 15: keep the weights mod 32768
+ psrlw xmm4, 1
+
+ dec dword [tmpHeight]
+ jg HEIGHT
+
+
+LAST_ROW:
+ ; last destination row: nearest-sample copy, no vertical blend
+ mov eax, [yInverse]
+ mov esi, [pSrcData]
+ shr eax, 15
+ mul dword [dwSrcStride]
+ add esi, eax ; get current row address
+
+ mov eax, 16384
+ mov [xInverse], eax
+ mov ecx, [dwDstWidth]
+
+LAST_ROW_WIDTH:
+ mov eax, [xInverse]
+ shr eax, 15
+
+ mov al, [esi+eax]
+ mov [edi], al
+ inc edi
+
+ mov eax, [uiScaleX]
+ add [xInverse], eax
+
+ loop LAST_ROW_WIDTH
+
+LAST_ROW_END:
+
+ add esp, localsize
+ pop ebx
+ pop edi
+ pop esi
+ pop ebp
+%undef pushsize
+%undef localsize
+%undef pSrcData
+%undef dwSrcWidth
+%undef dwSrcHeight
+%undef dwSrcStride
+%undef pDstData
+%undef dwDstWidth
+%undef dwDstHeight
+%undef dwDstStride
+%undef scale
+%undef uiScaleX
+%undef uiScaleY
+%undef tmpHeight
+%undef yInverse
+%undef xInverse
+%undef dstStep
+ ret
+
+
+
+
+WELS_EXTERN GeneralBilinearFastDownsampler_sse2
+;**************************************************************************************************************
+;int GeneralBilinearFastDownsampler_sse2( unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
+; unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,
+; unsigned int uiScaleX, unsigned int uiScaleY );
+;
+; Arbitrary-ratio bilinear downsampler, reduced-precision variant
+; (32-bit, stack arguments).
+;  - uiScaleX is a fixed-point step with 16 fractional bits (source column
+;    = xInverse >> 16, start 32768); uiScaleY uses 15 fractional bits
+;    (source row = yInverse >> 15, start 16384).
+;  - Interior pixels blend the 2x2 neighbourhood {a b / c d}: the u and v
+;    weights are combined with pmulhuw (high 16 bits of the product),
+;    multiplied with the pixels by pmaddwd, summed, and rounded:
+;    (sum + 16384) >> 15.
+;  - The last destination column and the last destination row are plain
+;    copies of the nearest source sample.
+;  - iDstWidth and iDstHeight must be >= 1. A width or height of exactly 1
+;    is handled by skipping the blended part (the original bottom-tested
+;    loops wrote an extra row for height 1 and let `loop` wrap around for
+;    width 1).
+;  - iSrcWidth / iSrcHeight are not read by this routine.
+; Registers held for the whole call: xmm0 = 0, ebx = 16384 (rounding term).
+; NOTE(review): the prototype above says int, but eax is only used as a
+; scratch register and no return value is set up -- confirm the C
+; declaration treats this as void.
+; NOTE(review): the movd loads in the inner loop fetch 4 bytes although
+; only 2 are used; confirm the source buffer tolerates a 2-byte over-read.
+;**************************************************************************************************************
+
+ALIGN 16
+GeneralBilinearFastDownsampler_sse2:
+ push ebp
+ push esi
+ push edi
+ push ebx
+; stack layout after `sub esp, localsize`:
+;   [esp + 0 .. 27]                      locals
+;   [esp + localsize .. + pushsize - 1]  saved ebx, edi, esi, ebp
+;   [esp + localsize + pushsize]         return address
+;   [esp + localsize + pushsize + 4 ..]  arguments
+%define pushsize 16
+%define localsize 28
+%define pDstData esp + pushsize + localsize + 4
+%define dwDstStride esp + pushsize + localsize + 8
+%define dwDstWidth esp + pushsize + localsize + 12
+%define dwDstHeight esp + pushsize + localsize + 16
+%define pSrcData esp + pushsize + localsize + 20
+%define dwSrcStride esp + pushsize + localsize + 24
+%define dwSrcWidth esp + pushsize + localsize + 28
+%define dwSrcHeight esp + pushsize + localsize + 32
+%define scale esp + 0
+%define uiScaleX esp + pushsize + localsize + 36
+%define uiScaleY esp + pushsize + localsize + 40
+%define tmpHeight esp + 12
+%define yInverse esp + 16
+%define xInverse esp + 20
+%define dstStep esp + 24
+ sub esp, localsize
+
+ ; xmm7 = per-pixel increment of the horizontal weights (16-bit words)
+ pxor xmm0, xmm0 ; stays zero for the whole routine
+ mov edx, 65535
+ mov eax, [uiScaleX]
+ and eax, edx
+ mov ebx, eax
+ neg ebx
+ and ebx, 65535
+ movd xmm1, eax ; uinc(uiScaleX mod 65536)
+ movd xmm2, ebx ; -uinc
+ psllq xmm1, 32
+ por xmm1, xmm2 ; 0 uinc 0 -uinc
+ pshuflw xmm7, xmm1, 10001000b ; xmm7: uinc -uinc uinc -uinc
+
+ ; xmm6 = per-row increment of the vertical weights (15-bit)
+ mov eax, [uiScaleY]
+ and eax, 32767
+ mov ebx, eax
+ neg ebx
+ and ebx, 32767
+ movd xmm6, eax ; vinc(uiScaleY mod 32768)
+ movd xmm2, ebx ; -vinc (mod 32768)
+ psllq xmm6, 32
+ por xmm6, xmm2 ; 0 vinc 0 -vinc
+ pshuflw xmm6, xmm6, 10100000b ; xmm6: vinc vinc -vinc -vinc
+
+ mov edx, 80007fffh ; 32768 32767
+ movd xmm5, edx
+ pshuflw xmm5, xmm5, 01000100b ; 32768 32767 32768 32767
+ mov ebx, 16384 ; rounding term added before the final >> 15
+
+
+FAST_DOWNSAMPLE:
+
+ mov eax, [dwDstHeight]
+ mov edi, [pDstData]
+ mov edx, [dwDstStride]
+ mov ecx, [dwDstWidth]
+ sub edx, ecx
+ mov [dstStep], edx ; stride - width
+ dec eax
+ mov [tmpHeight], eax ; number of blended rows (all but the last)
+ mov eax, 16384
+ mov [yInverse], eax
+
+ pshuflw xmm4, xmm5, 01010000b
+ psrlw xmm4, 1 ; initial v to 16384 16384 16383 16383
+
+ ; The row loop below is bottom-tested; with no blended row to produce
+ ; (iDstHeight == 1) go straight to the copied last row instead of
+ ; emitting one row too many.
+ cmp dword [tmpHeight], 0
+ jle near FAST_LAST_ROW
+
+FAST_HEIGHT:
+ mov eax, [yInverse]
+ mov esi, [pSrcData]
+ shr eax, 15
+ mul dword [dwSrcStride] ; clobbers edx
+ add esi, eax ; get current row address
+ mov ebp, esi
+ add ebp, [dwSrcStride] ; ebp = row below
+
+ mov eax, 32768
+ mov [xInverse], eax
+
+ movdqa xmm3, xmm5 ; initial u to 32768 32767 32768 32767
+
+ ; `loop` decrements ecx before testing it, so entering FAST_WIDTH with
+ ; ecx == 0 would run 2^32 iterations; with no blended column
+ ; (iDstWidth == 1) only the copied last column is produced.
+ mov ecx, [dwDstWidth]
+ dec ecx
+ jle near FAST_WIDTH_END
+
+FAST_WIDTH:
+ mov eax, [xInverse]
+ shr eax, 16
+
+ movd xmm1, [esi+eax] ; xxxxxxba
+ movd xmm2, [ebp+eax] ; xxxxxxdc
+ punpcklwd xmm1, xmm2 ; xxxxdcba
+ punpcklbw xmm1, xmm0 ; 0d0c0b0a
+
+ movdqa xmm2, xmm4 ; xmm2: vv(1-v)(1-v) tmpv
+ pmulhuw xmm2, xmm3 ; mul u(1-u)u(1-u) on xmm2
+ pmaddwd xmm2, xmm1 ; two dwords: weighted top pair, weighted bottom pair
+ pshufd xmm1, xmm2, 00000001b
+ paddd xmm2, xmm1 ; low dword = sum of the four weighted pixels
+ movd xmm1, ebx
+ paddd xmm2, xmm1 ; + 16384 for round-to-nearest
+ psrld xmm2, 15
+
+ packuswb xmm2, xmm0
+ movd eax, xmm2
+ mov [edi], al
+ inc edi
+
+ mov eax, [uiScaleX]
+ add [xInverse], eax
+
+ paddw xmm3, xmm7 ; inc u
+
+ loop FAST_WIDTH
+
+FAST_WIDTH_END:
+ ; last column of the row: copy the nearest source sample
+ mov eax, [xInverse]
+ shr eax, 16
+ mov cl, [esi+eax]
+ mov [edi], cl
+ inc edi
+
+ mov eax, [uiScaleY]
+ add [yInverse], eax
+ add edi, [dstStep]
+
+ paddw xmm4, xmm6 ; inc v
+ psllw xmm4, 1 ; drop bit 15: keep the weights mod 32768
+ psrlw xmm4, 1
+
+ dec dword [tmpHeight]
+ jg FAST_HEIGHT
+
+
+FAST_LAST_ROW:
+ ; last destination row: nearest-sample copy, no vertical blend
+ mov eax, [yInverse]
+ mov esi, [pSrcData]
+ shr eax, 15
+ mul dword [dwSrcStride]
+ add esi, eax ; get current row address
+
+ mov eax, 32768
+ mov [xInverse], eax
+ mov ecx, [dwDstWidth]
+
+FAST_LAST_ROW_WIDTH:
+ mov eax, [xInverse]
+ shr eax, 16
+
+ mov al, [esi+eax]
+ mov [edi], al
+ inc edi
+
+ mov eax, [uiScaleX]
+ add [xInverse], eax
+
+ loop FAST_LAST_ROW_WIDTH
+
+FAST_LAST_ROW_END:
+
+ add esp, localsize
+ pop ebx
+ pop edi
+ pop esi
+ pop ebp
+%undef pushsize
+%undef localsize
+%undef pSrcData
+%undef dwSrcWidth
+%undef dwSrcHeight
+%undef dwSrcStride
+%undef pDstData
+%undef dwDstWidth
+%undef dwDstHeight
+%undef dwDstStride
+%undef scale
+%undef uiScaleX
+%undef uiScaleY
+%undef tmpHeight
+%undef yInverse
+%undef xInverse
+%undef dstStep
+ ret
+%endif
--- /dev/null
+++ b/codec/processing/src/asm/intra_pred.asm
@@ -1,0 +1,1505 @@
+;*!
+;* \copy
+;* Copyright (c) 2009-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;* intra_pred.asm
+;*
+;* Abstract
+;* sse2 function for intra predict operations
+;*
+;* History
+;* 18/09/2009 Created
+;*
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+
+;***********************************************************************
+; Local Data (Read Only)
+;***********************************************************************
+
+; Constant tables for the intra predictors in this file.
+%ifdef FORMAT_COFF
+SECTION .rodata pData
+%else
+SECTION .rodata align=16
+%endif
+
+; 16x16 plane mode: per-column multipliers of the gradient b for the left
+; 8 pixels of a row (x - 7 for x = 0..7)
+align 16
+sse2_plane_inc_minus dw -7, -6, -5, -4, -3, -2, -1, 0
+; 16x16 plane mode: weights 1..8, used both for the H/V sums and as the
+; multipliers of b for the right 8 pixels of a row
+align 16
+sse2_plane_inc dw 1, 2, 3, 4, 5, 6, 7, 8
+; 16x16 plane mode: weights 8..1 for the mirrored half of the H/V sums
+align 16
+sse2_plane_dec dw 8, 7, 6, 5, 4, 3, 2, 1
+
+; for chroma plane mode
+; (these two 8-byte tables are read with movq into MMX registers)
+sse2_plane_inc_c dw 1, 2, 3, 4
+sse2_plane_dec_c dw 4, 3, 2, 1
+; chroma plane mode: per-column multipliers of b (x - 3 for x = 0..7)
+align 16
+sse2_plane_mul_b_c dw -3, -2, -1, 0, 1, 2, 3, 4
+
+; 16 bytes of 0x01: as a pmuludq multiplier (dword 0x01010101) it
+; replicates a byte four times; also used as a per-byte bit-0 mask
+align 16
+mmx_01bytes: times 16 db 1
+;align 16
+;sse_0x0004bytes: times 8 dw 4
+;ALIGN 16
+;sse_f000 db 255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+
+; NOTE(review): not referenced in the part of the file visible here
+align 16
+mmx_0x02: dw 0x02, 0x00, 0x00, 0x00
+
+
+;***********************************************************************
+; macros
+;***********************************************************************
+; SSE_DB_1_2REG dst, tmp
+; Fills every byte of %1 with 1 without touching memory:
+; %1 = 0, %2 = all ones (-1 per byte), %1 = 0 - (-1) = 1 per byte.
+;dB 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
+;%1 will keep the last result
+; %2 is clobbered (left as all ones)
+%macro SSE_DB_1_2REG 2
+ pxor %1, %1
+ pcmpeqw %2, %2
+ psubb %1, %2
+%endmacro
+
+; SSE2_PRED_H_4X4_TWO_LINE out, tmp1, tmp2, ptr, stride
+; Builds two rows of a 4x4 horizontal prediction: the byte at [%4-1] is
+; replicated 4 times into the low dword of %1 and the byte at [%4+%5-1]
+; 4 times into the next dword. %2 and %3 are scratch; %4 is not modified.
+;xmm0, xmm1, xmm2, eax, ecx
+;lower 64 bits of xmm0 save the result
+%macro SSE2_PRED_H_4X4_TWO_LINE 5
+ movd %1, [%4-1] ; byte 0 = left neighbour of the first row
+ movdqa %3, %1
+ punpcklbw %1, %3 ; each byte doubled
+ movdqa %3, %1
+ punpcklbw %1, %3 ; byte 0 now fills the low dword
+
+ ;add %4, %5
+ movd %2, [%4+%5-1] ; same for the second row
+ movdqa %3, %2
+ punpcklbw %2, %3
+ movdqa %3, %2
+ punpcklbw %2, %3
+ punpckldq %1, %2 ; low qword = row0 x4, row1 x4
+%endmacro
+
+; SUMW_HORIZON1 acc, tmp
+; Horizontal sum of the eight words of %1 using unsigned saturating adds
+; (paddusw); the total ends up in the lowest word of %1, the other words
+; hold partial sums. %2 is scratch.
+%macro SUMW_HORIZON1 2
+ movdqa %2, %1
+ psrldq %2, 8 ; fold upper 4 words onto lower 4
+ paddusw %1, %2
+ movdqa %2, %1
+ psrldq %2, 4 ; fold 2 words onto 2
+ paddusw %1, %2
+ movdqa %2, %1
+ psrldq %2, 2 ; fold the last pair
+ paddusw %1, %2
+%endmacro
+
+; LOAD_COLUMN out, tmp1, tmp2, tmp3, ptr, stride
+; Gathers a vertical run of 8 bytes: reads 4 bytes from each of 8
+; consecutive rows starting at [%5] and transposes them so that the
+; upper 8 bytes of %1 hold byte 3 of rows 0..7, in row order (the lower
+; 8 bytes hold byte 2 of the same rows). Callers in this file point %5
+; three bytes left of the wanted column and then take the upper half
+; with punpckhbw.
+; %2, %3, %4 are scratch; %5 is advanced by 8 * %6.
+%macro LOAD_COLUMN 6
+ movd %1, [%5]
+ movd %2, [%5+%6]
+ punpcklbw %1, %2 ; rows 0,1 interleaved byte-wise
+ lea %5, [%5+2*%6]
+ movd %3, [%5]
+ movd %2, [%5+%6]
+ punpcklbw %3, %2 ; rows 2,3
+ punpcklwd %1, %3 ; dword k = byte k of rows 0..3
+ lea %5, [%5+2*%6]
+ movd %4, [%5]
+ movd %2, [%5+%6]
+ punpcklbw %4, %2 ; rows 4,5
+ lea %5, [%5+2*%6]
+ movd %3, [%5]
+ movd %2, [%5+%6]
+ lea %5, [%5+2*%6]
+ punpcklbw %3, %2 ; rows 6,7
+ punpcklwd %4, %3 ; dword k = byte k of rows 4..7
+ punpckhdq %1, %4 ; upper qword = byte 3 of rows 0..7
+%endmacro
+
+; SUMW_HORIZON acc, tmp, hi
+; Horizontal sum of the eight words d0..d7 of %1 into its lowest dword.
+; %3 supplies the upper 16 bits of each widened dword (punpcklwd), so
+; the low dword is the plain zero-extended sum only when %3 is zero;
+; regardless of %3, the low 16 bits of the result are the sum of the
+; words modulo 65536. %2 is scratch.
+%macro SUMW_HORIZON 3
+ movhlps %2, %1 ; x2 = xx xx xx xx d7 d6 d5 d4
+ paddw %1, %2 ; x1 = xx xx xx xx d37 d26 d15 d04
+ punpcklwd %1, %3 ; x1 = d37 d26 d15 d04
+ movhlps %2, %1 ; x2 = xxxx xxxx d37 d26
+ paddd %1, %2 ; x1 = xxxx xxxx d1357 d0246
+ pshuflw %2, %1, 0x4e ; x2 = xxxx xxxx d0246 d1357
+ paddd %1, %2 ; x1 = xxxx xxxx xxxx d01234567
+%endmacro
+
+
+; COPY_16_TIMES ptr, out
+; Broadcasts the byte at [%1-1] to all 16 bytes of %2.
+; The byte is fetched as the top byte of an aligned 16-byte load from
+; [%1-16], so %1 must be 16-byte aligned (movdqa).
+%macro COPY_16_TIMES 2
+ movdqa %2, [%1-16]
+ psrldq %2, 15 ; keep only byte 15, i.e. [%1-1]
+ pmuludq %2, [mmx_01bytes] ; * 0x01010101: byte repeated in the low dword
+ pshufd %2, %2, 0 ; low dword to all four dwords
+%endmacro
+
+; COPY_16_TIMESS ptr, out, offset
+; Broadcasts the byte at [%1+%3-1] to all 16 bytes of %2.
+; The byte is fetched as the top byte of an aligned 16-byte load from
+; [%1+%3-16], so that address must be 16-byte aligned (movdqa).
+%macro COPY_16_TIMESS 3
+ movdqa %2, [%1+%3-16]
+ psrldq %2, 15 ; keep only byte 15, i.e. [%1+%3-1]
+ pmuludq %2, [mmx_01bytes] ; * 0x01010101: byte repeated in the low dword
+ pshufd %2, %2, 0 ; low dword to all four dwords
+%endmacro
+
+; LOAD_COLUMN_C out, tmp1, tmp2, (unused), ptr, stride
+; Chroma-sized counterpart of LOAD_COLUMN: reads 4 bytes from each of 4
+; consecutive rows starting at [%5]; afterwards the upper 4 bytes of %1
+; hold byte 3 of rows 0..3, in row order. Used with MMX registers in this
+; file. %2 and %3 are scratch, %4 is accepted but never referenced, and
+; %5 is advanced by 4 * %6.
+%macro LOAD_COLUMN_C 6
+ movd %1, [%5]
+ movd %2, [%5+%6]
+ punpcklbw %1,%2 ; rows 0,1 interleaved byte-wise
+ lea %5, [%5+2*%6]
+ movd %3, [%5]
+ movd %2, [%5+%6]
+ punpcklbw %3, %2 ; rows 2,3
+ punpckhwd %1, %3 ; upper dword = byte 3 of rows 0..3
+ lea %5, [%5+2*%6]
+%endmacro
+
+; LOAD_2_LEFT_AND_ADD
+; Operates on fixed registers: advances r1 by two rows (2 * r2) and adds
+; the bytes at [r1-1] and [r1+r2-1] to the running sum in r3.
+; r4 is clobbered as scratch.
+%macro LOAD_2_LEFT_AND_ADD 0
+ lea r1, [r1+2*r2]
+ movzx r4, byte [r1-0x01]
+ add r3, r4
+ movzx r4, byte [r1+r2-0x01]
+ add r3, r4
+%endmacro
+
+;***********************************************************************
+; Code
+;***********************************************************************
+
+SECTION .text
+; Entry points exported up front; the remaining routines in this file
+; carry their WELS_EXTERN line directly above their label.
+; (WELS_EXTERN is a macro defined outside this chunk.)
+WELS_EXTERN WelsI4x4LumaPredH_sse2
+WELS_EXTERN WelsI4x4LumaPredDDR_mmx
+WELS_EXTERN WelsI4x4LumaPredDc_sse2
+WELS_EXTERN WelsI16x16LumaPredPlane_sse2
+
+ALIGN 16
+;***********************************************************************
+; void __cdecl WelsI4x4LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
+;
+; 4x4 horizontal intra prediction: each of the 4 output rows is the
+; byte immediately left of the corresponding pRef row, repeated 4 times.
+; The 16 result bytes are written contiguously (row after row) to pred.
+;
+; pred must align to 16
+; (the result is stored with a single movdqa)
+; r0/r1/r2 = pred/pRef/stride after LOAD_3_PARA (macro defined outside
+; this chunk -- presumably loads the three arguments; TODO confirm).
+; r3 is used as scratch and preserved.
+;***********************************************************************
+WelsI4x4LumaPredH_sse2:
+ push r3
+ %assign push_num 1
+ LOAD_3_PARA
+ %ifndef X86_32
+ movsx r2, r2d
+ %endif
+ movzx r3, byte [r1-1] ; left neighbour of row 0
+ movd xmm0, r3d
+ pmuludq xmm0, [mmx_01bytes] ; * 0x01010101: byte repeated 4 times
+
+ movzx r3, byte [r1+r2-1] ; left neighbour of row 1
+ movd xmm1, r3d
+ pmuludq xmm1, [mmx_01bytes]
+
+ unpcklps xmm0, xmm1 ; low qword = row0, row1
+
+ lea r1, [r1+r2*2]
+ movzx r3, byte [r1-1] ; left neighbour of row 2
+ movd xmm2, r3d
+ pmuludq xmm2, [mmx_01bytes]
+
+ movzx r3, byte [r1+r2-1] ; left neighbour of row 3
+ movd xmm3, r3d
+ pmuludq xmm3, [mmx_01bytes]
+
+ unpcklps xmm2, xmm3 ; low qword = row2, row3
+ unpcklpd xmm0, xmm2 ; row0 row1 row2 row3
+
+ movdqa [r0], xmm0
+ pop r3
+ ret
+
+;***********************************************************************
+; void WelsI16x16LumaPredPlane_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
+;
+; 16x16 plane intra prediction. With top[] the row above pRef and
+; left[] the column left of it, the code computes
+;     H = sum_{i=0..7} (i + 1) * (top[8 + i]  - top[6 - i])
+;     V = sum_{i=0..7} (i + 1) * (left[8 + i] - left[6 - i])
+;     b = (5 * H + 32) >> 6,  c = (5 * V + 32) >> 6
+;     a = (left[15] + top[15]) << 4
+;     pred[y][x] = clip_0_255( (a + 16 + b * (x - 7) + c * (y - 7)) >> 5 )
+; (index -1 of top[]/left[] is the top-left corner pixel).
+; The 16 rows are written contiguously, 16 bytes each, with movdqa, so
+; pred must be 16-byte aligned.
+; r0/r1/r2 = pred/pRef/stride after LOAD_3_PARA (macro defined outside
+; this chunk -- presumably loads the three arguments; TODO confirm).
+; r3, r4 are scratch and preserved.
+;***********************************************************************
+WelsI16x16LumaPredPlane_sse2:
+ ;%define pushsize 4
+ ;push esi
+ ;mov esi, [esp + pushsize + 8]
+ ;mov ecx, [esp + pushsize + 12]
+ push r3
+ push r4
+ %assign push_num 2
+ LOAD_3_PARA
+ %ifndef X86_32
+ movsx r2, r2d
+ %endif
+ ; r1 -> top-left corner pixel (one row up, one column left)
+ sub r1, 1
+ sub r1, r2
+
+ ;for H
+ pxor xmm7, xmm7
+ movq xmm0, [r1] ; corner, top[0..6]
+ movdqa xmm5, [sse2_plane_dec]
+ punpcklbw xmm0, xmm7
+ pmullw xmm0, xmm5 ; weights 8..1
+ movq xmm1, [r1 + 9] ; top[8..15]
+ movdqa xmm6, [sse2_plane_inc]
+ punpcklbw xmm1, xmm7
+ pmullw xmm1, xmm6 ; weights 1..8
+ psubw xmm1, xmm0
+
+ ; xmm2 is not zeroed here; only the low 16 bits of the sum are kept by
+ ; the movsx below, and those do not depend on xmm2
+ SUMW_HORIZON xmm1,xmm0,xmm2
+ movd r3d, xmm1 ; H += (i + 1) * (top[8 + i] - top[6 - i]);
+ movsx r3, r3w
+ imul r3, 5
+ add r3, 32
+ sar r3, 6 ; b = (5 * H + 32) >> 6;
+ SSE2_Copy8Times xmm1, r3d ; xmm1 = b,b,b,b,b,b,b,b
+
+ movzx r4, BYTE [r1+16] ; top[15]
+ ; LOAD_COLUMN returns byte 3 of each row, hence the -3 bias
+ sub r1, 3
+ LOAD_COLUMN xmm0, xmm2, xmm3, xmm4, r1, r2 ; corner, left[0..6]; r1 += 8 rows
+
+ add r1, 3
+ movzx r3, BYTE [r1+8*r2] ; left[15]
+ add r4, r3
+ shl r4, 4 ; a = (left[15*stride] + top[15]) << 4;
+
+ ; skip left[7], then gather left[8..15]
+ sub r1, 3
+ add r1, r2
+ LOAD_COLUMN xmm7, xmm2, xmm3, xmm4, r1, r2
+ pxor xmm4, xmm4
+ punpckhbw xmm0, xmm4
+ pmullw xmm0, xmm5 ; weights 8..1
+ punpckhbw xmm7, xmm4
+ pmullw xmm7, xmm6 ; weights 1..8
+ psubw xmm7, xmm0
+
+ SUMW_HORIZON xmm7,xmm0,xmm2
+ movd r3d, xmm7 ; V
+ movsx r3, r3w
+ imul r3, 5
+ add r3, 32
+ sar r3, 6 ; c = (5 * V + 32) >> 6;
+ SSE2_Copy8Times xmm4, r3d ; xmm4 = c,c,c,c,c,c,c,c
+
+ ;mov esi, [esp + pushsize + 4]
+ add r4, 16
+ imul r3, -7
+ add r3, r4 ; s = a + 16 + (-7)*c
+ SSE2_Copy8Times xmm0, r3d ; xmm0 = s,s,s,s,s,s,s,s
+
+ xor r3, r3 ; row counter
+ movdqa xmm5, [sse2_plane_inc_minus]
+
+ ; per row: left half uses b * (-7..0), right half b * (1..8); packuswb
+ ; saturates to 0..255; xmm0 (s) grows by c for the next row
+get_i16x16_luma_pred_plane_sse2_1:
+ movdqa xmm2, xmm1
+ pmullw xmm2, xmm5
+ paddw xmm2, xmm0
+ psraw xmm2, 5
+ movdqa xmm3, xmm1
+ pmullw xmm3, xmm6
+ paddw xmm3, xmm0
+ psraw xmm3, 5
+ packuswb xmm2, xmm3
+ movdqa [r0], xmm2
+ paddw xmm0, xmm4
+ add r0, 16
+ inc r3
+ cmp r3, 16
+ jnz get_i16x16_luma_pred_plane_sse2_1
+ pop r4
+ pop r3
+ ret
+
+;***********************************************************************
+; void WelsI16x16LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
+;
+; 16x16 horizontal intra prediction: every output row is the byte left of
+; the matching pRef row, repeated 16 times. Rows are written back to
+; back (16 bytes apart) with movdqa, so pred must be 16-byte aligned.
+;***********************************************************************
+
+; One further row: step pred by 16 and the left-column cursor by one
+; stride, then broadcast the byte found there and store it.
+%macro SSE2_PRED_H_16X16_ONE_LINE 0
+ add r0, 16
+ add r1, r2
+ movzx r3, byte [r1]
+ SSE2_Copy16Times xmm0, r3d
+ movdqa [r0], xmm0
+%endmacro
+
+WELS_EXTERN WelsI16x16LumaPredH_sse2
+WelsI16x16LumaPredH_sse2:
+ push r3
+ %assign push_num 1
+ LOAD_3_PARA
+ %ifndef X86_32
+ movsx r2, r2d
+ %endif
+ ; row 0: r1 is moved onto the left-neighbour column first
+ dec r1
+ movzx r3, byte [r1]
+ SSE2_Copy16Times xmm0, r3d
+ movdqa [r0], xmm0
+ ; rows 1..15
+ %rep 15
+ SSE2_PRED_H_16X16_ONE_LINE
+ %endrep
+ pop r3
+ ret
+
+;***********************************************************************
+; void WelsI16x16LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
+;
+; 16x16 vertical intra prediction: the 16 bytes of the row directly above
+; pRef are copied into each of the 16 output rows (256 contiguous bytes).
+; Both the load and the stores are movdqa, so pRef - stride and pred must
+; be 16-byte aligned.
+;***********************************************************************
+WELS_EXTERN WelsI16x16LumaPredV_sse2
+WelsI16x16LumaPredV_sse2:
+ %assign push_num 0
+ LOAD_3_PARA
+ %ifndef X86_32
+ movsx r2, r2d
+ %endif
+ ; fetch the row above the block
+ sub r1, r2
+ movdqa xmm0, [r1]
+
+ ; row 0, then rows 1..15 at byte offsets 16, 32, ..., 240
+ movdqa [r0], xmm0
+ %assign pred_v_row_off 16
+ %rep 15
+ movdqa [r0+pred_v_row_off], xmm0
+ %assign pred_v_row_off pred_v_row_off+16
+ %endrep
+
+ ret
+
+;***********************************************************************
+; void WelsIChromaPredPlane_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
+;
+; 8x8 chroma plane intra prediction. With top[] the row above pRef and
+; left[] the column left of it (index -1 = top-left corner), the code
+; computes
+;     H = sum_{i=0..3} (i + 1) * (top[4 + i]  - top[2 - i])
+;     V = sum_{i=0..3} (i + 1) * (left[4 + i] - left[2 - i])
+;     b = (17 * H + 16) >> 5,  c = (17 * V + 16) >> 5
+;     a = (left[7] + top[7]) << 4
+;     pred[y][x] = clip_0_255( (a + 16 + b * (x - 3) + c * (y - 3)) >> 5 )
+; The 8 rows are written contiguously, 8 bytes each (movq).
+; MMX registers are used for the H/V sums; WELSEMMS (macro defined
+; outside this chunk) is invoked before returning.
+; r0/r1/r2 = pred/pRef/stride after LOAD_3_PARA (macro defined outside
+; this chunk -- presumably loads the three arguments; TODO confirm).
+; r3, r4 are scratch and preserved.
+;***********************************************************************
+WELS_EXTERN WelsIChromaPredPlane_sse2
+WelsIChromaPredPlane_sse2:
+ ;%define pushsize 4
+ ;push esi
+ ;mov esi, [esp + pushsize + 8] ;pRef
+ ;mov ecx, [esp + pushsize + 12] ;stride
+ push r3
+ push r4
+ %assign push_num 2
+ LOAD_3_PARA
+ %ifndef X86_32
+ movsx r2, r2d
+ %endif
+ ; r1 -> top-left corner pixel (one row up, one column left)
+ sub r1, 1
+ sub r1, r2
+
+ pxor mm7, mm7
+ movq mm0, [r1] ; corner, top[0..2] in the low 4 bytes
+ movq mm5, [sse2_plane_dec_c]
+ punpcklbw mm0, mm7
+ pmullw mm0, mm5 ; weights 4..1
+ movq mm1, [r1 + 5] ; top[4..7] in the low 4 bytes
+ movq mm6, [sse2_plane_inc_c]
+ punpcklbw mm1, mm7
+ pmullw mm1, mm6 ; weights 1..4
+ psubw mm1, mm0
+
+ movq2dq xmm1, mm1
+ pxor xmm2, xmm2
+ SUMW_HORIZON xmm1,xmm0,xmm2
+ movd r3d, xmm1
+ movsx r3, r3w
+ imul r3, 17
+ add r3, 16
+ sar r3, 5 ; b = (17 * H + 16) >> 5;
+ SSE2_Copy8Times xmm1, r3d ; xmm1 = b,b,b,b,b,b,b,b
+
+ movzx r3, BYTE [r1+8] ; top[7]
+ ; LOAD_COLUMN_C returns byte 3 of each row, hence the -3 bias
+ sub r1, 3
+ LOAD_COLUMN_C mm0, mm2, mm3, mm4, r1, r2 ; corner, left[0..2]; r1 += 4 rows
+
+ add r1, 3
+ movzx r4, BYTE [r1+4*r2] ; left[7]
+ add r4, r3
+ shl r4, 4 ; a = (left[7*stride] + top[7]) << 4;
+
+ ; skip left[3], then gather left[4..7]
+ sub r1, 3
+ add r1, r2
+ LOAD_COLUMN_C mm7, mm2, mm3, mm4, r1, r2
+ pxor mm4, mm4
+ punpckhbw mm0, mm4
+ pmullw mm0, mm5 ; weights 4..1
+ punpckhbw mm7, mm4
+ pmullw mm7, mm6 ; weights 1..4
+ psubw mm7, mm0
+
+ movq2dq xmm7, mm7
+ pxor xmm2, xmm2
+ SUMW_HORIZON xmm7,xmm0,xmm2
+ movd r3d, xmm7 ; V
+ movsx r3, r3w
+ imul r3, 17
+ add r3, 16
+ sar r3, 5 ; c = (17 * V + 16) >> 5;
+ SSE2_Copy8Times xmm4, r3d ; xmm4 = c,c,c,c,c,c,c,c
+
+ ;mov esi, [esp + pushsize + 4]
+ add r4, 16
+ imul r3, -3
+ add r3, r4 ; s = a + 16 + (-3)*c
+ SSE2_Copy8Times xmm0, r3d ; xmm0 = s,s,s,s,s,s,s,s
+
+ xor r3, r3 ; row counter
+ movdqa xmm5, [sse2_plane_mul_b_c]
+
+ ; per row: (s + b * (-3..4)) >> 5, saturated to bytes; s += c afterwards
+get_i_chroma_pred_plane_sse2_1:
+ movdqa xmm2, xmm1
+ pmullw xmm2, xmm5
+ paddw xmm2, xmm0
+ psraw xmm2, 5
+ packuswb xmm2, xmm2
+ movq [r0], xmm2
+ paddw xmm0, xmm4
+ add r0, 8
+ inc r3
+ cmp r3, 8
+ jnz get_i_chroma_pred_plane_sse2_1
+ pop r4
+ pop r3
+ WELSEMMS
+ ret
+
+ALIGN 16
+;***********************************************************************
+; 0 |1 |2 |3 |4 |
+; 6 |7 |8 |9 |10|
+; 11|12|13|14|15|
+; 16|17|18|19|20|
+; 21|22|23|24|25|
+; 7 is the start pixel of current 4x4 block
+; pred[7] = ([6]+[0]*2+[1]+2)/4
+;
+; void __cdecl WelsI4x4LumaPredDDR_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
+;
+; 4x4 diagonal-down-right intra prediction.
+; The border pixels are lined up in one MMX register as
+;     [21] [16] [11] [6] [0] [1] [2] [3]        (low byte first)
+; and every output pixel is the 3-tap filter (x + 2*y + z + 2) >> 2 over
+; three consecutive entries of that sequence. The filter is evaluated as
+; pavgb( floor((x + z) / 2), y ): the first pavgb rounds up, so 1 is
+; subtracted wherever x ^ z has bit 0 set.
+; The four rows are stored 4 bytes apart at pred+0 .. pred+12; each row is
+; the 4-byte window of the filtered sequence shifted by one byte relative
+; to the row below.
+; r0/r1/r2 = pred/pRef/stride after LOAD_3_PARA (macro defined outside
+; this chunk -- presumably loads the three arguments; TODO confirm).
+; In the comments below mmN[k] is the k-th byte of mmN, counted from 1 at
+; the low end, and [n] is the pixel numbered n in the grid above.
+;***********************************************************************
+WelsI4x4LumaPredDDR_mmx:
+ ;mov edx,[esp+4] ;pred
+ ;mov eax,[esp+8] ;pRef
+ ;mov ecx,[esp+12] ;stride
+ %assign push_num 0
+ LOAD_3_PARA
+ %ifndef X86_32
+ movsx r2, r2d
+ %endif
+ movq mm1,[r1+r2-8] ;get value of 11; loading the 8 bytes that end at the pixel puts it in the top byte: mm1[8] = [11]
+ movq mm2,[r1-8] ;get value of 6 mm2[8] = 6
+ sub r1, r2 ;move r1 to the line above the current block (position of 1)
+ punpckhbw mm2,[r1-8] ;mm2[8](high 8th byte of mm2) = [0](value of 0), mm2[7]= [6]
+ movd mm3,[r1] ;get value 1, mm3[1] = [1],mm3[2]=[2],mm3[3]=[3]
+ punpckhwd mm1,mm2 ;mm1[8]=[0],mm1[7]=[6],mm1[6]=[11]
+ psllq mm3,18h ;mm3[5]=[1]
+ psrlq mm1,28h ;mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
+ por mm3,mm1 ;mm3[6]=[3],mm3[5]=[2],mm3[4]=[1],mm3[3]=[0],mm3[2]=[6],mm3[1]=[11]
+ movq mm1,mm3 ;mm1[6]=[3],mm1[5]=[2],mm1[4]=[1],mm1[3]=[0],mm1[2]=[6],mm1[1]=[11]
+ lea r1,[r1+r2*2-8h] ;set r1 to the 8 bytes ending at 11 (the pixel left of 12)
+ movq mm4,[r1+r2] ;get value of 16, mm4[8]=[16]
+ psllq mm3,8 ;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=0
+ psrlq mm4,38h ;mm4[1]=[16]
+ por mm3,mm4 ;mm3[7]=[3],mm3[6]=[2],mm3[5]=[1],mm3[4]=[0],mm3[3]=[6],mm3[2]=[11],mm3[1]=[16]
+ movq mm2,mm3 ;mm2[7]=[3],mm2[6]=[2],mm2[5]=[1],mm2[4]=[0],mm2[3]=[6],mm2[2]=[11],mm2[1]=[16]
+ movq mm4,[r1+r2*2] ;mm4[8]=[21]
+ psllq mm3,8 ;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=0
+ psrlq mm4,38h ;mm4[1]=[21]
+ por mm3,mm4 ;mm3[8]=[3],mm3[7]=[2],mm3[6]=[1],mm3[5]=[0],mm3[4]=[6],mm3[3]=[11],mm3[2]=[16],mm3[1]=[21]
+ movq mm4,mm3 ;mm4[8]=[3],mm4[7]=[2],mm4[6]=[1],mm4[5]=[0],mm4[4]=[6],mm4[3]=[11],mm4[2]=[16],mm4[1]=[21]
+ pavgb mm3,mm1 ;mm3=([11]+[21]+1)/2
+ pxor mm1,mm4 ;find odd value in the lowest bit of each byte
+ pand mm1,[mmx_01bytes] ;set the odd bit
+ psubusb mm3,mm1 ;decrease 1 from odd bytes
+ pavgb mm2,mm3 ;mm2=(([11]+[21]+1)/2+1+[16])/2
+
+ ; rows are emitted bottom-up: the window moves one byte per row
+ movd [r0+12],mm2
+ psrlq mm2,8
+ movd [r0+8],mm2
+ psrlq mm2,8
+ movd [r0+4],mm2
+ psrlq mm2,8
+ movd [r0],mm2
+ WELSEMMS
+ ret
+
+ALIGN 16
+;***********************************************************************
+; 0 |1 |2 |3 |4 |
+; 5 |6 |7 |8 |9 |
+; 10|11|12|13|14|
+; 15|16|17|18|19|
+; 20|21|22|23|24|
+; 6 is the start pixel of current 4x4 block
+; pred[6] = ([1]+[2]+[3]+[4]+[5]+[10]+[15]+[20]+4)/8
+;
+; void __cdecl WelsI4x4LumaPredDc_sse2(uint8_t *pred,uint8_t *pRef,int32_t stride)
+;
+;***********************************************************************
+WelsI4x4LumaPredDc_sse2:
+ ; 4x4 luma DC prediction: r3 accumulates the four top and the four left neighbours,
+ ; then (sum + 4) >> 3 is replicated into all 16 bytes stored at [r0].
+ ; r0/r1/r2 are used as pred/pRef/stride after LOAD_3_PARA (macro defined outside this
+ ; section); r3 and r4 are scratch registers saved and restored here.
+ push r3
+ push r4
+ ; two registers were pushed above
+ %assign push_num 2
+ LOAD_3_PARA
+ ; widen the 32-bit stride to full register width on non-X86_32 builds
+ %ifndef X86_32
+ movsx r2, r2d
+ %endif
+ movzx r4, byte [r1-1h] ; left neighbour of row 0
+ sub r1, r2 ; r1 -> row above the block
+ movd xmm0, [r1] ; the four top neighbours
+ pxor xmm1, xmm1
+ psadbw xmm0, xmm1 ; SAD against zero = sum of the four top bytes
+ xor r3, r3
+ movd r3d, xmm0 ; r3 = top sum
+ add r3, r4
+ movzx r4, byte [r1+r2*2-1h] ; left neighbour of row 1
+ add r3, r4
+
+ lea r1, [r1+r2*2-1] ; r1 -> left neighbour of row 1
+ movzx r4, byte [r1+r2] ; left neighbour of row 2
+ add r3, r4
+
+ movzx r4, byte [r1+r2*2] ; left neighbour of row 3
+ add r3, r4
+ add r3, 4 ; rounding term
+ sar r3, 3 ; divide the 8-sample sum by 8
+ imul r3, 0x01010101 ; copy the DC byte into all four bytes of the low dword
+
+ movd xmm0, r3d
+ pshufd xmm0, xmm0, 0 ; broadcast the dword to the whole register
+ movdqa [r0], xmm0 ; aligned 16-byte store of the 4x4 block
+ pop r4
+ pop r3
+ ret
+
+ALIGN 16
+;***********************************************************************
+; void __cdecl WelsIChromaPredH_mmx(uint8_t *pred, uint8_t *pRef, int32_t stride)
+; copy 8 pixel of 8 line from left
+;***********************************************************************
+; MMX_PRED_H_8X8_ONE_LINE dst_mm, unused, src_row, dst_addr
+; Fills the 8 bytes at [%4] with the byte located at [%3-1] (the pixel to the left of the
+; row that %3 points to). %1 is the working MMX register; %2 is not referenced by the body.
+%macro MMX_PRED_H_8X8_ONE_LINE 4
+ movq %1, [%3-8]
+ psrlq %1, 38h
+ ; the top byte of the 8 loaded bytes (the byte at [%3-1]) is now in the low byte
+
+ ;pmuludq %1, [mmx_01bytes] ;extend to 4 bytes
+ ; word multiply: with a multiplier word of 0x0101 (mmx_01bytes is presumably 0x01 in
+ ; every byte - defined outside this section) word 0 holds the pixel in both of its bytes
+ pmullw %1, [mmx_01bytes]
+ pshufw %1, %1, 0
+ ; pshufw with selector 0 copies word 0 into all four words
+ movq [%4], %1
+%endmacro
+
+; MMX_PRED_H_8X8_ONE_LINEE dst_mm, unused, src_row, dst_addr
+; Same as MMX_PRED_H_8X8_ONE_LINE, but reads the left neighbour of the row one stride (r2)
+; below %3, i.e. the byte at [%3+r2-1]. %2 is not referenced by the body.
+%macro MMX_PRED_H_8X8_ONE_LINEE 4
+ movq %1, [%3+r2-8]
+ psrlq %1, 38h
+
+ ;pmuludq %1, [mmx_01bytes] ;extend to 4 bytes
+ pmullw %1, [mmx_01bytes]
+ pshufw %1, %1, 0
+ movq [%4], %1
+%endmacro
+
+WELS_EXTERN WelsIChromaPredH_mmx
+WelsIChromaPredH_mmx:
+ ; 8x8 chroma horizontal prediction: every output row (8 bytes, rows packed back to back
+ ; at r0) is filled with the pixel immediately to the left of the matching source row.
+ ; r0 = pred, r1 = pRef, r2 = stride once LOAD_3_PARA has run.
+ %assign push_num 0
+ LOAD_3_PARA
+ %ifndef X86_32
+ movsx r2, r2d
+ %endif
+
+ ; rows 0 and 1 (ONE_LINE reads the row at r1, ONE_LINEE the row at r1+r2)
+ MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r1,r0
+ MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r1,r0+8
+
+ ; rows 2 and 3
+ lea r1,[r1+r2*2]
+ MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r1,r0+16
+ MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r1,r0+24
+
+ ; rows 4 and 5
+ lea r1,[r1+r2*2]
+ MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r1,r0+32
+ MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r1,r0+40
+
+ ; rows 6 and 7
+ lea r1,[r1+r2*2]
+ MMX_PRED_H_8X8_ONE_LINE mm0, mm1, r1,r0+48
+ MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, r1,r0+56
+
+ WELSEMMS
+ ret
+
+ALIGN 16
+;***********************************************************************
+; void __cdecl WelsI4x4LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
+; copy pixels from top 4 pixels
+;***********************************************************************
+WELS_EXTERN WelsI4x4LumaPredV_sse2
+WelsI4x4LumaPredV_sse2:
+ ; 4x4 luma vertical prediction: the four pixels above the block are copied into each of
+ ; the four output rows (16 bytes at r0). r0/r1/r2 are used as pred/pRef/stride after
+ ; LOAD_3_PARA (macro defined outside this section).
+ %assign push_num 0
+ LOAD_3_PARA
+ ; widen the 32-bit stride to full register width on non-X86_32 builds
+ %ifndef X86_32
+ movsx r2, r2d
+ %endif
+ sub r1, r2 ; r1 -> row above the block
+ movd xmm0, [r1] ; four top pixels in the low dword
+ pshufd xmm0, xmm0, 0 ; replicate that dword four times (one per output row)
+ movdqa [r0], xmm0 ; aligned 16-byte store
+ ret
+
+ALIGN 16
+;***********************************************************************
+; void __cdecl WelsIChromaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
+; copy 8 pixels from top 8 pixels
+;***********************************************************************
+WELS_EXTERN WelsIChromaPredV_sse2
+WelsIChromaPredV_sse2:
+ ; 8x8 chroma vertical prediction: the eight pixels above the block are copied into each
+ ; of the eight output rows (64 bytes at r0, written with aligned 16-byte stores).
+ ; r0/r1/r2 are used as pred/pRef/stride after LOAD_3_PARA (macro defined outside this
+ ; section).
+ %assign push_num 0
+ LOAD_3_PARA
+ ; widen the 32-bit stride to full register width on non-X86_32 builds
+ %ifndef X86_32
+ movsx r2, r2d
+ %endif
+ sub r1, r2 ; r1 -> row above the block
+ movq xmm0, [r1] ; eight top pixels in the low quadword, high quadword zeroed
+ punpcklqdq xmm0, xmm0 ; duplicate the low quadword: xmm0 = two copies of the top row
+ movdqa [r0], xmm0 ; rows 0-1
+ movdqa [r0+16], xmm0 ; rows 2-3
+ movdqa [r0+32], xmm0 ; rows 4-5
+ movdqa [r0+48], xmm0 ; rows 6-7
+ ret
+
+ ALIGN 16
+;***********************************************************************
+; lt|t0|t1|t2|t3|
+; l0|
+; l1|
+; l2|
+; l3|
+; t3 is never used
+; destination:
+; |a |b |c |d |
+; |e |f |a |b |
+; |g |h |e |f |
+; |i |j |g |h |
+
+; a = (1 + lt + l0)>>1
+; e = (1 + l0 + l1)>>1
+; g = (1 + l1 + l2)>>1
+; i = (1 + l2 + l3)>>1
+
+; d = (2 + t0 + (t1<<1) + t2)>>2
+; c = (2 + lt + (t0<<1) + t1)>>2
+; b = (2 + l0 + (lt<<1) + t0)>>2
+
+; f = (2 + l1 + (l0<<1) + lt)>>2
+; h = (2 + l2 + (l1<<1) + l0)>>2
+; j = (2 + l3 + (l2<<1) + l1)>>2
+; [b a f e h g j i] + [d c b a] --> mov to memory
+;
+; void WelsI4x4LumaPredHD_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
+;***********************************************************************
+WELS_EXTERN WelsI4x4LumaPredHD_mmx
+WelsI4x4LumaPredHD_mmx:
+ ; 4x4 luma horizontal-down prediction (letters a..j as defined in the header above).
+ ; Register pictures in the comments list bytes from most to least significant.
+ ; r0/r1/r2 are used as pred/pRef/stride after LOAD_3_PARA (macro defined outside this
+ ; section).
+ %assign push_num 0
+ LOAD_3_PARA
+ ; widen the 32-bit stride to full register width on non-X86_32 builds
+ %ifndef X86_32
+ movsx r2, r2d
+ %endif
+ sub r1, r2 ; r1 -> row above the block
+ movd mm0, [r1-1] ; mm0 = [xx xx xx xx t2 t1 t0 lt]
+ psllq mm0, 20h ; mm0 = [t2 t1 t0 lt xx xx xx xx]
+
+ ; gather the left column: each 4-byte load ends at the left neighbour of its row
+ movd mm1, [r1+2*r2-4]
+ punpcklbw mm1, [r1+r2-4] ; mm1[7] = l0, mm1[6] = l1
+ lea r1, [r1+2*r2]
+ movd mm2, [r1+2*r2-4]
+ punpcklbw mm2, [r1+r2-4] ; mm2[7] = l2, mm2[6] = l3
+ punpckhwd mm2, mm1 ; mm2 = [l0 l1 l2 l3 xx xx xx xx]
+ psrlq mm2, 20h
+ pxor mm0, mm2 ; mm0 = [t2 t1 t0 lt l0 l1 l2 l3]
+
+ movq mm1, mm0
+ psrlq mm1, 10h ; mm1 = [xx xx t2 t1 t0 lt l0 l1]
+ movq mm2, mm0
+ psrlq mm2, 8h ; mm2 = [xx t2 t1 t0 lt l0 l1 l2]
+ movq mm3, mm2
+ movq mm4, mm1
+ pavgb mm1, mm0
+
+ ; pavgb rounds up; drop that rounding so the next pavgb gives (x + 2*y + z + 2) >> 2
+ pxor mm4, mm0 ; find odd value in the lowest bit of each byte
+ pand mm4, [mmx_01bytes] ; set the odd bit
+ psubusb mm1, mm4 ; decrease 1 from odd bytes
+
+ pavgb mm2, mm1 ; mm2 = [xx xx d c b f h j]
+
+ movq mm4, mm0
+ pavgb mm3, mm4 ; mm3 = [xx xx xx xx a e g i]
+ punpcklbw mm3, mm2 ; mm3 = [b a f e h g j i]
+
+ psrlq mm2, 20h
+ psllq mm2, 30h ; mm2 = [d c 0 0 0 0 0 0]
+ movq mm4, mm3
+ psrlq mm4, 10h ; mm4 = [0 0 b a f e h g]
+ pxor mm2, mm4 ; mm2 = [d c b a xx xx xx xx]
+ psrlq mm2, 20h ; mm2 = [xx xx xx xx d c b a]
+
+ movd [r0], mm2 ; row 0 = a b c d
+ movd [r0+12], mm3 ; row 3 = i j g h
+ psrlq mm3, 10h
+ movd [r0+8], mm3 ; row 2 = g h e f
+ psrlq mm3, 10h
+ movd [r0+4], mm3 ; row 1 = e f a b
+ WELSEMMS
+ ret
+
+ALIGN 16
+;***********************************************************************
+; lt|t0|t1|t2|t3|
+; l0|
+; l1|
+; l2|
+; l3|
+; lt and t0..t3 are never used (only the left column l0..l3 is read)
+; destination:
+; |a |b |c |d |
+; |c |d |e |f |
+; |e |f |g |g |
+; |g |g |g |g |
+
+; a = (1 + l0 + l1)>>1
+; c = (1 + l1 + l2)>>1
+; e = (1 + l2 + l3)>>1
+; g = l3
+
+; b = (2 + l0 + (l1<<1) + l2)>>2
+; d = (2 + l1 + (l2<<1) + l3)>>2
+; f = (2 + l2 + (l3<<1) + l3)>>2
+
+; [g g f e d c b a] + [g g g g] --> mov to memory
+;
+; void WelsI4x4LumaPredHU_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
+;***********************************************************************
+WELS_EXTERN WelsI4x4LumaPredHU_mmx
+WelsI4x4LumaPredHU_mmx:
+ ; 4x4 luma horizontal-up prediction (letters a..g as defined in the header above).
+ ; Only the left column l0..l3 is loaded. Register pictures in the comments list bytes
+ ; from most to least significant. r0/r1/r2 are used as pred/pRef/stride after
+ ; LOAD_3_PARA (macro defined outside this section).
+ %assign push_num 0
+ LOAD_3_PARA
+ ; widen the 32-bit stride to full register width on non-X86_32 builds
+ %ifndef X86_32
+ movsx r2, r2d
+ %endif
+ movd mm0, [r1-4] ; mm0[3] = l0
+ punpcklbw mm0, [r1+r2-4] ; mm0[7] = l1, mm0[6] = l0
+ lea r1, [r1+2*r2]
+ movd mm2, [r1-4] ; mm2[3] = l2
+ movd mm4, [r1+r2-4] ; mm4[3] = l3
+ punpcklbw mm2, mm4
+ punpckhwd mm0, mm2 ; mm0 = [l3 l2 l1 l0 xx xx xx xx]
+
+ psrlq mm4, 18h
+ psllq mm4, 38h ; mm4 = [l3 xx xx xx xx xx xx xx]
+ psrlq mm0, 8h
+ pxor mm0, mm4 ; mm0 = [l3 l3 l2 l1 l0 xx xx xx]
+
+ movq mm1, mm0
+ psllq mm1, 8h ; mm1 = [l3 l2 l1 l0 xx xx xx xx]
+ movq mm3, mm1 ; mm3 = [l3 l2 l1 l0 xx xx xx xx]
+ pavgb mm1, mm0 ; mm1 = [g e c a xx xx xx xx]
+
+ movq mm2, mm0
+ psllq mm2, 10h ; mm2 = [l2 l1 l0 xx xx xx xx xx]
+ movq mm5, mm2
+ pavgb mm2, mm0
+
+ ; pavgb rounds up; drop that rounding so the next pavgb gives (x + 2*y + z + 2) >> 2
+ pxor mm5, mm0 ; find odd value in the lowest bit of each byte
+ pand mm5, [mmx_01bytes] ; set the odd bit
+ psubusb mm2, mm5 ; decrease 1 from odd bytes
+
+ pavgb mm2, mm3 ; mm2 = [f d b xx xx xx xx xx]
+
+ psrlq mm2, 8h
+ pxor mm2, mm4 ; mm2 = [g f d b xx xx xx xx]
+
+ punpckhbw mm1, mm2 ; mm1 = [g g f e d c b a]
+ punpckhbw mm4, mm4 ; mm4 = [g g xx xx xx xx xx xx]
+ punpckhbw mm4, mm4 ; mm4 = [g g g g xx xx xx xx]
+
+ psrlq mm4, 20h
+ movd [r0+12], mm4 ; row 3 = g g g g
+
+ movd [r0], mm1 ; row 0 = a b c d
+ psrlq mm1, 10h
+ movd [r0+4], mm1 ; row 1 = c d e f
+ psrlq mm1, 10h
+ movd [r0+8], mm1 ; row 2 = e f g g
+ WELSEMMS
+ ret
+
+
+
+ALIGN 16
+;***********************************************************************
+; lt|t0|t1|t2|t3|
+; l0|
+; l1|
+; l2|
+; l3|
+; l3 is never used
+; destination:
+; |a |b |c |d |
+; |e |f |g |h |
+; |i |a |b |c |
+; |j |e |f |g |
+
+; a = (1 + lt + t0)>>1
+; b = (1 + t0 + t1)>>1
+; c = (1 + t1 + t2)>>1
+; d = (1 + t2 + t3)>>1
+
+; e = (2 + l0 + (lt<<1) + t0)>>2
+; f = (2 + lt + (t0<<1) + t1)>>2
+; g = (2 + t0 + (t1<<1) + t2)>>2
+
+; h = (2 + t1 + (t2<<1) + t3)>>2
+; i = (2 + lt + (l0<<1) + l1)>>2
+; j = (2 + l0 + (l1<<1) + l2)>>2
+;
+; void WelsI4x4LumaPredVR_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
+;***********************************************************************
+WELS_EXTERN WelsI4x4LumaPredVR_mmx
+WelsI4x4LumaPredVR_mmx:
+ ; 4x4 luma vertical-right prediction (letters a..j as defined in the header above).
+ ; Register pictures in the comments list bytes from most to least significant.
+ ; r0/r1/r2 are used as pred/pRef/stride after LOAD_3_PARA (macro defined outside this
+ ; section).
+ %assign push_num 0
+ LOAD_3_PARA
+ ; widen the 32-bit stride to full register width on non-X86_32 builds
+ %ifndef X86_32
+ movsx r2, r2d
+ %endif
+ sub r1, r2 ; r1 -> row above the block
+ movq mm0, [r1-1] ; mm0 = [xx xx xx t3 t2 t1 t0 lt]
+ psllq mm0, 18h ; mm0 = [t3 t2 t1 t0 lt xx xx xx]
+
+ ; gather l0..l2: each load ends at the left neighbour of its row
+ movd mm1, [r1+2*r2-4]
+ punpcklbw mm1, [r1+r2-4] ; mm1[7] = l0, mm1[6] = l1
+ lea r1, [r1+2*r2]
+ movq mm2, [r1+r2-8] ; mm2[7] = l2
+ punpckhwd mm2, mm1 ; mm2 = [l0 l1 l2 xx xx xx xx xx]
+ psrlq mm2, 28h
+ pxor mm0, mm2 ; mm0 = [t3 t2 t1 t0 lt l0 l1 l2]
+
+ movq mm1, mm0
+ psllq mm1, 8h ; mm1 = [t2 t1 t0 lt l0 l1 l2 xx]
+ pavgb mm1, mm0 ; mm1 = [d c b a xx xx xx xx]
+
+ movq mm2, mm0
+ psllq mm2, 10h ; mm2 = [t1 t0 lt l0 l1 l2 xx xx]
+ movq mm3, mm2
+ pavgb mm2, mm0
+
+ ; pavgb rounds up; drop that rounding so the next pavgb gives (x + 2*y + z + 2) >> 2
+ pxor mm3, mm0 ; find odd value in the lowest bit of each byte
+ pand mm3, [mmx_01bytes] ; set the odd bit
+ psubusb mm2, mm3 ; decrease 1 from odd bytes
+
+ movq mm3, mm0
+ psllq mm3, 8h ; mm3 = [t2 t1 t0 lt l0 l1 l2 xx]
+ pavgb mm3, mm2 ; mm3 = [h g f e i j xx xx]
+ movq mm2, mm3
+
+ psrlq mm1, 20h ; mm1 = [xx xx xx xx d c b a]
+ movd [r0], mm1 ; row 0 = a b c d
+
+ psrlq mm2, 20h ; mm2 = [xx xx xx xx h g f e]
+ movd [r0+4], mm2 ; row 1 = e f g h
+
+ movq mm4, mm3
+ psllq mm4, 20h
+ psrlq mm4, 38h ; mm4 = [xx xx xx xx xx xx xx i]
+
+ movq mm5, mm3
+ psllq mm5, 28h
+ psrlq mm5, 38h ; mm5 = [xx xx xx xx xx xx xx j]
+
+ psllq mm1, 8h
+ pxor mm4, mm1 ; mm4 = [xx xx xx xx c b a i]
+ movd [r0+8], mm4 ; row 2 = i a b c
+
+ psllq mm2, 8h
+ pxor mm5, mm2 ; mm5 = [xx xx xx xx g f e j]
+ movd [r0+12], mm5 ; row 3 = j e f g
+ WELSEMMS
+ ret
+
+ALIGN 16
+;***********************************************************************
+; lt|t0|t1|t2|t3|t4|t5|t6|t7
+; l0|
+; l1|
+; l2|
+; l3|
+; lt and l0..l3 are never used (only the top row t0..t7 is read)
+; destination:
+; |a |b |c |d |
+; |b |c |d |e |
+; |c |d |e |f |
+; |d |e |f |g |
+
+; a = (2 + t0 + t2 + (t1<<1))>>2
+; b = (2 + t1 + t3 + (t2<<1))>>2
+; c = (2 + t2 + t4 + (t3<<1))>>2
+; d = (2 + t3 + t5 + (t4<<1))>>2
+
+; e = (2 + t4 + t6 + (t5<<1))>>2
+; f = (2 + t5 + t7 + (t6<<1))>>2
+; g = (2 + t6 + t7 + (t7<<1))>>2
+
+; [g f e d c b a] --> mov to memory
+;
+; void WelsI4x4LumaPredDDL_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
+;***********************************************************************
+WELS_EXTERN WelsI4x4LumaPredDDL_mmx
+WelsI4x4LumaPredDDL_mmx:
+ ; 4x4 luma diagonal-down-left prediction (letters a..g as defined in the header above).
+ ; Only the eight pixels of the row above the block are loaded. Register pictures in the
+ ; comments list bytes from most to least significant. r0/r1/r2 are used as
+ ; pred/pRef/stride after LOAD_3_PARA (macro defined outside this section).
+ %assign push_num 0
+ LOAD_3_PARA
+ ; widen the 32-bit stride to full register width on non-X86_32 builds
+ %ifndef X86_32
+ movsx r2, r2d
+ %endif
+ sub r1, r2 ; r1 -> row above the block
+ movq mm0, [r1] ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
+ movq mm1, mm0
+ movq mm2, mm0
+
+ movq mm3, mm0
+ psrlq mm3, 38h
+ psllq mm3, 38h ; mm3 = [t7 xx xx xx xx xx xx xx]
+
+ psllq mm1, 8h ; mm1 = [t6 t5 t4 t3 t2 t1 t0 xx]
+ psrlq mm2, 8h
+ pxor mm2, mm3 ; mm2 = [t7 t7 t6 t5 t4 t3 t2 t1]
+
+ ; pavgb rounds up; drop that rounding so the next pavgb gives (x + 2*y + z + 2) >> 2
+ movq mm3, mm1
+ pavgb mm1, mm2
+ pxor mm3, mm2 ; find odd value in the lowest bit of each byte
+ pand mm3, [mmx_01bytes] ; set the odd bit
+ psubusb mm1, mm3 ; decrease 1 from odd bytes
+
+ pavgb mm0, mm1 ; mm0 = [g f e d c b a xx]
+
+ ; each row is the filtered sequence advanced by one more byte
+ psrlq mm0, 8h
+ movd [r0], mm0 ; row 0 = a b c d
+ psrlq mm0, 8h
+ movd [r0+4], mm0 ; row 1 = b c d e
+ psrlq mm0, 8h
+ movd [r0+8], mm0 ; row 2 = c d e f
+ psrlq mm0, 8h
+ movd [r0+12], mm0 ; row 3 = d e f g
+ WELSEMMS
+ ret
+
+
+ALIGN 16
+;***********************************************************************
+; lt|t0|t1|t2|t3|t4|t5|t6|t7
+; l0|
+; l1|
+; l2|
+; l3|
+; lt and l0..l3 are never used (only the top row t0..t7 is read)
+; destination:
+; |a |b |c |d |
+; |e |f |g |h |
+; |b |c |d |i |
+; |f |g |h |j |
+
+; a = (1 + t0 + t1)>>1
+; b = (1 + t1 + t2)>>1
+; c = (1 + t2 + t3)>>1
+; d = (1 + t3 + t4)>>1
+; i = (1 + t4 + t5)>>1
+
+; e = (2 + t0 + (t1<<1) + t2)>>2
+; f = (2 + t1 + (t2<<1) + t3)>>2
+; g = (2 + t2 + (t3<<1) + t4)>>2
+; h = (2 + t3 + (t4<<1) + t5)>>2
+; j = (2 + t4 + (t5<<1) + t6)>>2
+
+; [i d c b a] + [j h g f e] --> mov to memory
+;
+; void WelsI4x4LumaPredVL_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
+;***********************************************************************
+WELS_EXTERN WelsI4x4LumaPredVL_mmx
+WelsI4x4LumaPredVL_mmx:
+ ; 4x4 luma vertical-left prediction (letters a..j as defined in the header above).
+ ; Only the eight pixels of the row above the block are loaded. Register pictures in the
+ ; comments list bytes from most to least significant. r0/r1/r2 are used as
+ ; pred/pRef/stride after LOAD_3_PARA (macro defined outside this section).
+ %assign push_num 0
+ LOAD_3_PARA
+ ; widen the 32-bit stride to full register width on non-X86_32 builds
+ %ifndef X86_32
+ movsx r2, r2d
+ %endif
+ sub r1, r2 ; r1 -> row above the block
+ movq mm0, [r1] ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
+ movq mm1, mm0
+ movq mm2, mm0
+
+ psrlq mm1, 8h ; mm1 = [xx t7 t6 t5 t4 t3 t2 t1]
+ psrlq mm2, 10h ; mm2 = [xx xx t7 t6 t5 t4 t3 t2]
+
+ ; two-tap averages (x + y + 1) >> 1
+ movq mm3, mm1
+ pavgb mm3, mm0 ; mm3 = [xx xx xx i d c b a]
+
+ ; pavgb rounds up; drop that rounding so the next pavgb gives (x + 2*y + z + 2) >> 2
+ movq mm4, mm2
+ pavgb mm2, mm0
+ pxor mm4, mm0 ; find odd value in the lowest bit of each byte
+ pand mm4, [mmx_01bytes] ; set the odd bit
+ psubusb mm2, mm4 ; decrease 1 from odd bytes
+
+ pavgb mm2, mm1 ; mm2 = [xx xx xx j h g f e]
+
+ movd [r0], mm3 ; row 0 = a b c d
+ psrlq mm3, 8h
+ movd [r0+8], mm3 ; row 2 = b c d i
+
+ movd [r0+4], mm2 ; row 1 = e f g h
+ psrlq mm2, 8h
+ movd [r0+12], mm2 ; row 3 = f g h j
+ WELSEMMS
+ ret
+
+ALIGN 16
+;***********************************************************************
+;
+; void WelsIChromaPredDc_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
+;***********************************************************************
+WELS_EXTERN WelsIChromaPredDc_sse2
+WelsIChromaPredDc_sse2:
+ ; 8x8 chroma DC prediction. The block is treated as four 4x4 quadrants, each filled with
+ ; its own DC value:
+ ;   top-left     = (top[0..3] + left[0..3] + 4) >> 3
+ ;   top-right    = (top[4..7] + 2) >> 2
+ ;   bottom-left  = (left[4..7] + 2) >> 2
+ ;   bottom-right = (top[4..7] + left[4..7] + 4) >> 3
+ ; Output is 8 rows of 8 bytes at r0. r0/r1/r2 are used as pred/pRef/stride after
+ ; LOAD_3_PARA (macro defined outside this section); r3 and r4 are scratch registers saved
+ ; and restored here.
+ push r3
+ push r4
+ ; two registers were pushed above
+ %assign push_num 2
+ LOAD_3_PARA
+ ; widen the 32-bit stride to full register width on non-X86_32 builds
+ %ifndef X86_32
+ movsx r2, r2d
+ %endif
+ sub r1, r2 ; r1 -> row above the block
+ movq mm0, [r1] ; eight top neighbours
+
+ ; l1..l8 below are the left neighbours of block rows 0..7
+ movzx r3, byte [r1+r2-0x01] ; l1
+ lea r1, [r1+2*r2]
+ movzx r4, byte [r1-0x01] ; l2
+ add r3, r4
+ movzx r4, byte [r1+r2-0x01] ; l3
+ add r3, r4
+ lea r1, [r1+2*r2]
+ movzx r4, byte [r1-0x01] ; l4
+ add r3, r4
+ movd mm1, r3d ; mm1 = l1+l2+l3+l4
+
+ movzx r3, byte [r1+r2-0x01] ; l5
+ lea r1, [r1+2*r2]
+ movzx r4, byte [r1-0x01] ; l6
+ add r3, r4
+ movzx r4, byte [r1+r2-0x01] ; l7
+ add r3, r4
+ lea r1, [r1+2*r2]
+ movzx r4, byte [r1-0x01] ; l8
+ add r3, r4
+ movd mm2, r3d ; mm2 = l5+l6+l7+l8
+
+ ; split the top row into its two halves and sum each with psadbw against zero
+ movq mm3, mm0
+ psrlq mm0, 0x20 ; mm0 = top[4..7]
+ psllq mm3, 0x20
+ psrlq mm3, 0x20 ; mm3 = top[0..3]
+ pxor mm4, mm4
+ psadbw mm0, mm4
+ psadbw mm3, mm4 ; sum1 = mm3+mm1, sum2 = mm0, sum3 = mm2
+
+ paddq mm3, mm1
+ movq mm1, mm2
+ paddq mm1, mm0; ; sum1 = mm3, sum2 = mm0, sum3 = mm2, sum4 = mm1
+
+ ; mmx_0x02 is defined outside this section; presumably the constant 2 used for rounding
+ movq mm4, [mmx_0x02]
+
+ paddq mm0, mm4
+ psrlq mm0, 0x02 ; (sum2 + 2) >> 2
+
+ paddq mm2, mm4
+ psrlq mm2, 0x02 ; (sum3 + 2) >> 2
+
+ paddq mm3, mm4
+ paddq mm3, mm4
+ psrlq mm3, 0x03 ; (sum1 + 4) >> 3
+
+ paddq mm1, mm4
+ paddq mm1, mm4
+ psrlq mm1, 0x03 ; (sum4 + 4) >> 3
+
+ ; replicate each DC byte across a dword (mmx_01bytes is presumably 0x01 in every byte),
+ ; then pack left-quadrant and right-quadrant dwords into one 8-byte row pattern
+ pmuludq mm0, [mmx_01bytes]
+ pmuludq mm3, [mmx_01bytes]
+ psllq mm0, 0x20
+ pxor mm0, mm3 ; mm0 = m_up
+
+ pmuludq mm2, [mmx_01bytes]
+ pmuludq mm1, [mmx_01bytes]
+ psllq mm1, 0x20
+ pxor mm1, mm2 ; mm1 = m_down
+
+ ; rows 0..3 use the upper pattern
+ movq [r0], mm0
+ movq [r0+0x08], mm0
+ movq [r0+0x10], mm0
+ movq [r0+0x18], mm0
+
+ ; rows 4..7 use the lower pattern
+ movq [r0+0x20], mm1
+ movq [r0+0x28], mm1
+ movq [r0+0x30], mm1
+ movq [r0+0x38], mm1
+
+ pop r4
+ pop r3
+ WELSEMMS
+ ret
+
+
+
+ALIGN 16
+;***********************************************************************
+;
+; void WelsI16x16LumaPredDc_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
+;***********************************************************************
+WELS_EXTERN WelsI16x16LumaPredDc_sse2
+WelsI16x16LumaPredDc_sse2:
+ ; 16x16 luma DC prediction: (sum of the top row + sum of the left column + 16) >> 5,
+ ; replicated into all 256 output bytes at r0 with aligned 16-byte stores.
+ ; r0/r1/r2 are used as pred/pRef/stride after LOAD_3_PARA (macro defined outside this
+ ; section); r3 and r4 are scratch registers saved and restored here.
+ push r3
+ push r4
+ ; two registers were pushed above
+ %assign push_num 2
+ LOAD_3_PARA
+ ; widen the 32-bit stride to full register width on non-X86_32 builds
+ %ifndef X86_32
+ movsx r2, r2d
+ %endif
+ sub r1, r2 ; r1 -> row above the block
+ movdqa xmm0, [r1] ; read one row (aligned load)
+ pxor xmm1, xmm1
+ psadbw xmm0, xmm1 ; two partial sums, one per 8-byte half
+ movdqa xmm1, xmm0
+ psrldq xmm1, 0x08 ; xmm1 = upper-half sum moved to the low quadword
+ pslldq xmm0, 0x08
+ psrldq xmm0, 0x08 ; clear the upper quadword of xmm0
+ paddw xmm0, xmm1 ; xmm0 = sum of all 16 top pixels
+
+ movzx r3, byte [r1+r2-0x01] ; left neighbour of row 0
+ movzx r4, byte [r1+2*r2-0x01] ; left neighbour of row 1
+ add r3, r4
+ lea r1, [r1+r2]
+ ; LOAD_2_LEFT_AND_ADD is defined outside this section; presumably each invocation adds
+ ; the next two left neighbours to r3 (7 invocations for rows 2..15) - TODO confirm
+ LOAD_2_LEFT_AND_ADD
+ LOAD_2_LEFT_AND_ADD
+ LOAD_2_LEFT_AND_ADD
+ LOAD_2_LEFT_AND_ADD
+ LOAD_2_LEFT_AND_ADD
+ LOAD_2_LEFT_AND_ADD
+ LOAD_2_LEFT_AND_ADD
+ add r3, 0x10 ; rounding term
+ movd xmm1, r3d
+ paddw xmm0, xmm1 ; top sum + left sum + 16
+ psrld xmm0, 0x05 ; divide by 32
+ pmuludq xmm0, [mmx_01bytes] ; replicate the DC byte across a dword (constant presumably 0x01 per byte)
+ pshufd xmm0, xmm0, 0 ; broadcast that dword to the whole register
+
+ ; 16 rows of 16 bytes
+ movdqa [r0], xmm0
+ movdqa [r0+0x10], xmm0
+ movdqa [r0+0x20], xmm0
+ movdqa [r0+0x30], xmm0
+ movdqa [r0+0x40], xmm0
+ movdqa [r0+0x50], xmm0
+ movdqa [r0+0x60], xmm0
+ movdqa [r0+0x70], xmm0
+ movdqa [r0+0x80], xmm0
+ movdqa [r0+0x90], xmm0
+ movdqa [r0+0xa0], xmm0
+ movdqa [r0+0xb0], xmm0
+ movdqa [r0+0xc0], xmm0
+ movdqa [r0+0xd0], xmm0
+ movdqa [r0+0xe0], xmm0
+ movdqa [r0+0xf0], xmm0
+
+ pop r4
+ pop r3
+ ret
+
+;***********************************************************************
+;
+;int32_t WelsSmpleSatdThree4x4_sse2( uint8_t *pDec, int32_t iLineSizeDec, uint8_t *pEnc, int32_t iLinesizeEnc,
+; uint8_t* pRed, int32_t* pBestMode, int32_t, int32_t, int32_t);
+;
+;***********************************************************************
+%ifdef X86_ASM
+; 32-bit only implementation. It evaluates three 4x4 intra candidates (DC, H, V) against
+; the source block, writes the winning 16-byte prediction to pRed, stores the winning mode
+; id to *pBestMode (0x02 for DC, 0x01 for H, 0x00 for V) and returns the winning cost in eax.
+; Three registers are pushed on entry, so the first argument is at [esp+16]:
+;   [esp+16] pDec   [esp+20] iLineSizeDec   [esp+24] pEnc   [esp+28] iLinesizeEnc
+;   [esp+32] pRed   [esp+36] pBestMode      [esp+40]/[esp+44]/[esp+48] values added to the
+;   DC / H / V costs respectively before the comparison.
+WELS_EXTERN WelsSmpleSatdThree4x4_sse2
+align 16
+WelsSmpleSatdThree4x4_sse2:
+ push ebx
+ push esi
+ push edi
+ mov eax, [esp+24];p_enc
+ mov ebx, [esp+28];linesize_enc
+
+ ; load source 4x4 samples and Hadamard transform
+ movd xmm0, [eax]
+ movd xmm1, [eax+ebx]
+ lea eax , [eax+2*ebx]
+ movd xmm2, [eax]
+ movd xmm3, [eax+ebx]
+ punpckldq xmm0, xmm2
+ punpckldq xmm1, xmm3
+
+ pxor xmm6, xmm6
+ punpcklbw xmm0, xmm6
+ punpcklbw xmm1, xmm6
+
+ movdqa xmm2, xmm0
+ paddw xmm0, xmm1
+ psubw xmm2, xmm1
+ SSE2_XSawp qdq, xmm0, xmm2, xmm3
+
+ movdqa xmm4, xmm0
+ paddw xmm0, xmm3
+ psubw xmm4, xmm3
+
+ movdqa xmm2, xmm0
+ punpcklwd xmm0, xmm4
+ punpckhwd xmm4, xmm2
+
+ SSE2_XSawp dq, xmm0, xmm4, xmm3
+ SSE2_XSawp qdq, xmm0, xmm3, xmm5
+
+ movdqa xmm7, xmm0
+ paddw xmm0, xmm5
+ psubw xmm7, xmm5
+
+ SSE2_XSawp qdq, xmm0, xmm7, xmm1
+
+ ; Hadamard transform results are saved in xmm0 and xmm2
+ movdqa xmm2, xmm0
+ paddw xmm0, xmm1
+ psubw xmm2, xmm1
+
+ ; load top boundary samples: [a b c d]
+ mov eax, [esp+16];p_dec
+ sub eax, [esp+20];linesize_dec
+ movzx ecx, byte [eax]
+ movzx edx, byte [eax+1]
+ movzx esi, byte [eax+2]
+ movzx edi, byte [eax+3]
+
+ ; get the transform results of top boundary samples: [a b c d]
+ add edx, ecx ; edx = a + b
+ add edi, esi ; edi = c + d
+ add ecx, ecx ; ecx = a + a
+ add esi, esi ; esi = c + c
+ sub ecx, edx ; ecx = a + a - a - b = a - b
+ sub esi, edi ; esi = c + c - c - d = c - d
+ add edi, edx ; edi = (a + b) + (c + d)
+ add edx, edx
+ sub edx, edi ; edx = (a + b) - (c + d)
+ add esi, ecx ; esi = (a - b) + (c - d)
+ add ecx, ecx
+ sub ecx, esi ; ecx = (a - b) - (c - d) ; [edi edx ecx esi]
+
+ ; keep untouched copies of the source transform for the H and DC candidates
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm2
+ movd xmm5, edi ; store the edi for DC mode
+ pxor xmm3, xmm3
+ pxor xmm4, xmm4
+ pinsrw xmm3, edi, 0
+ pinsrw xmm3, esi, 4
+ psllw xmm3, 2
+ pinsrw xmm4, edx, 0
+ pinsrw xmm4, ecx, 4
+ psllw xmm4, 2
+
+ ; get the satd of V (prediction built from the top boundary samples)
+ psubw xmm0, xmm3
+ psubw xmm2, xmm4
+
+ WELS_AbsW xmm0, xmm1
+ WELS_AbsW xmm2, xmm1
+ paddusw xmm0, xmm2
+ SUMW_HORIZON1 xmm0, xmm1 ; satd of V is stored in xmm0
+
+ ; load left boundary samples: [a b c d]'
+ mov eax, [esp+16]
+ mov ebx, [esp+20]
+ movzx ecx, byte [eax-1]
+ movzx edx, byte [eax+ebx-1]
+ lea eax , [eax+2*ebx]
+ movzx esi, byte [eax-1]
+ movzx edi, byte [eax+ebx-1]
+
+ ; get the transform results of left boundary samples: [a b c d]'
+ add edx, ecx ; edx = a + b
+ add edi, esi ; edi = c + d
+ add ecx, ecx ; ecx = a + a
+ add esi, esi ; esi = c + c
+ sub ecx, edx ; ecx = a + a - a - b = a - b
+ sub esi, edi ; esi = c + c - c - d = c - d
+ add edi, edx ; edi = (a + b) + (c + d)
+ add edx, edx
+ sub edx, edi ; edx = (a + b) - (c + d)
+ add esi, ecx ; esi = (a - b) + (c - d)
+ add ecx, ecx
+ sub ecx, esi ; ecx = (a - b) - (c - d) ; [edi edx ecx esi]'
+
+ ; store the transform results in xmm3
+ movd xmm3, edi
+ pinsrw xmm3, edx, 1
+ pinsrw xmm3, ecx, 2
+ pinsrw xmm3, esi, 3
+ psllw xmm3, 2
+
+ ; get the satd of H (prediction built from the left boundary samples)
+ movdqa xmm2, xmm6
+ movdqa xmm4, xmm7
+ psubw xmm2, xmm3
+ WELS_AbsW xmm2, xmm1
+ WELS_AbsW xmm4, xmm1
+ paddusw xmm2, xmm4
+ SUMW_HORIZON1 xmm2, xmm1 ; satd of H is stored in xmm2
+
+ ; DC result is stored in xmm1
+ ; edi = sum of the left samples, xmm5 = sum of the top samples: dc = (top + left + 4) >> 3
+ add edi, 4
+ movd xmm1, edi
+ paddw xmm1, xmm5
+ psrlw xmm1, 3
+ movdqa xmm5, xmm1
+ psllw xmm1, 4
+
+ ; get the satd of DC
+ psubw xmm6, xmm1
+ WELS_AbsW xmm6, xmm1
+ WELS_AbsW xmm7, xmm1
+ paddusw xmm6, xmm7
+ SUMW_HORIZON1 xmm6, xmm1 ; satd of DC is stored in xmm6
+
+ ; comparing order: DC H V
+ ; eax = DC cost, edi = H cost, esi = V cost; each is the low 16 bits of the sum, halved,
+ ; plus the corresponding stack argument
+ mov edx, [esp+32]
+ movd eax, xmm6
+ movd edi, xmm2
+ movd esi, xmm0
+ and eax, 0xffff
+ shr eax, 1
+ and edi, 0xffff
+ shr edi, 1
+ and esi, 0xffff
+ shr esi, 1
+ add eax, [esp+40]
+ add edi, [esp+44]
+ add esi, [esp+48]
+ ; NOTE(review): the costs are compared as signed 16-bit values (ax/di/si) although 32-bit
+ ; stack values were just added and the full eax is returned - confirm the totals always
+ ; fit in 15 bits.
+ cmp ax, di
+ jg near not_dc
+ cmp ax, si
+ jg near not_dc_h
+
+ ; for DC mode
+ ; replicate the DC byte to all 16 bytes of the prediction
+ movd ebx, xmm5
+ imul ebx, 0x01010101
+ movd xmm5, ebx
+ pshufd xmm5, xmm5, 0
+ movdqa [edx], xmm5
+ mov ebx, [esp+36]
+ mov dword [ebx], 0x02
+ pop edi
+ pop esi
+ pop ebx
+ ret
+
+not_dc:
+ cmp di, si
+ jg near not_dc_h
+
+ ; for H mode
+ ; each output row is its left neighbour replicated four times; SSE_DB_1_2REG is defined
+ ; outside this section and presumably loads 0x01 bytes into xmm6 so that pmuludq
+ ; replicates the pixel - TODO confirm
+ SSE_DB_1_2REG xmm6, xmm7
+ mov eax, [esp+16]
+ mov ebx, [esp+20]
+ movzx ecx, byte [eax-1]
+ movd xmm0, ecx
+ pmuludq xmm0, xmm6
+
+ movzx ecx, byte [eax+ebx-1]
+ movd xmm1, ecx
+ pmuludq xmm1, xmm6
+%if 1
+ punpckldq xmm0, xmm1
+%else
+ unpcklps xmm0, xmm1
+%endif
+ lea eax, [eax+ebx*2]
+ movzx ecx, byte [eax-1]
+ movd xmm2, ecx
+ pmuludq xmm2, xmm6
+
+ movzx ecx, byte [eax+ebx-1]
+ movd xmm3, ecx
+ pmuludq xmm3, xmm6
+%if 1
+ punpckldq xmm2, xmm3
+ punpcklqdq xmm0, xmm2
+%else
+ unpcklps xmm2, xmm3
+ unpcklpd xmm0, xmm2
+%endif
+ movdqa [edx],xmm0
+
+ ; return the H cost
+ mov eax, edi
+ mov ebx, [esp+36]
+ mov dword [ebx], 0x01
+
+ pop edi
+ pop esi
+ pop ebx
+ ret
+not_dc_h:
+ ; for V mode
+ ; every output row is a copy of the four pixels above the block
+ mov eax, [esp+16]
+ sub eax, [esp+20]
+ movd xmm0, [eax]
+ pshufd xmm0, xmm0, 0
+ movdqa [edx],xmm0
+
+ ; return the V cost
+ mov eax, esi
+ mov ebx, [esp+36]
+ mov dword [ebx], 0x00
+
+ pop edi
+ pop esi
+ pop ebx
+ ret
+%endif
+
--- /dev/null
+++ b/codec/processing/src/asm/sad.asm
@@ -1,0 +1,220 @@
+;*!
+;* \copy
+;* Copyright (c) 2009-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;* sad.asm
+;*
+;* Abstract
+;* WelsSampleSad8x8_sse21
+;*
+;* History
+;* 8/5/2009 Created
+;*
+;*
+;*************************************************************************/
+
+%include "asm_inc.asm"
+
+;***********************************************************************
+; Macros and other preprocessor constants
+;***********************************************************************
+; CACHE_SPLIT_CHECK address, width, cacheline
+; Masks %1 (destroying it) and compares it against a threshold, leaving the result in the
+; flags for the caller's conditional jump. With width = 8 and cacheline = 64 this expands to
+; "and %1, 0x3f" / "cmp %1, 56": a following jle is taken when an 8-byte read starting at
+; the original address stays inside one 64-byte line.
+%macro CACHE_SPLIT_CHECK 3 ; address, width, cacheline
+and %1, 0x1f|(%3>>1)
+cmp %1, (32-%2)|(%3>>1)
+%endmacro
+
+; SSE2_GetSad8x4
+; Adds the SAD of four 8-pixel rows to xmm6 (two psadbw partial sums, one per quadword).
+; Inputs: r0/r1 = first block pointer/stride, r2/r3 = second block pointer/stride.
+; On exit r0 and r2 have advanced by two rows only, so the caller must advance them by
+; two more rows before processing the next four. Clobbers xmm0-xmm3.
+%macro SSE2_GetSad8x4 0
+ movq xmm0, [r0]
+ movq xmm1, [r0+r1]
+ lea r0, [r0+2*r1]
+ movhps xmm0, [r0]
+ movhps xmm1, [r0+r1]
+
+ movq xmm2, [r2]
+ movq xmm3, [r2+r3]
+ lea r2, [r2+2*r3]
+ movhps xmm2, [r2]
+ movhps xmm3, [r2+r3]
+ psadbw xmm0, xmm2
+ psadbw xmm1, xmm3
+ paddw xmm6, xmm0
+ paddw xmm6, xmm1
+%endmacro
+
+
+;***********************************************************************
+; Code
+;***********************************************************************
+SECTION .text
+
+; Helper for the cache-line-split path of WelsSampleSad8x8_sse21: SAD of two 8-pixel rows.
+; r0/r1 = first block pointer/stride. The second block is read through two 8-byte aligned
+; pointers, r2 (aligned-down address) and r5 (= r2 + 8), both with stride r3; the wanted
+; unaligned 8 bytes are rebuilt as (lo >> xmm5) | (hi << xmm6), the shift counts being in
+; bits. The two per-row SADs are added to the two quadword lanes of xmm7.
+%macro SSE2_SAD8X2_CACHE_SPLIT 0
+ movq xmm0, [r0]
+ movhps xmm0, [r0+r1]
+
+ movq xmm1, [r2]
+ movq xmm2, [r5]
+ movhps xmm1, [r2+r3]
+ movhps xmm2, [r5+r3]
+ psrlq xmm1, xmm5
+ psllq xmm2, xmm6
+ por xmm1, xmm2
+
+ psadbw xmm0, xmm1
+ paddw xmm7, xmm0
+%endmacro
+
+; Helper for the cache-line-split path: move all three row pointers down by two rows.
+%macro SSE2_SAD8X2_CACHE_SPLIT_NEXT 0
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ lea r5, [r5+2*r3]
+%endmacro
+
+WELS_EXTERN WelsSampleSad8x8_sse21
+WelsSampleSad8x8_sse21:
+ ; 8x8 SAD between the block at arg1 (stride arg2) and the block at arg3 (stride arg4);
+ ; the result is returned in retrd. When an 8-byte row of the second block would cross a
+ ; 64-byte line, the rows are assembled from two aligned loads instead of one unaligned load.
+
+ ; CACHE_SPLIT_CHECK destroys its address operand, so keep a copy of arg3 on the stack
+ %assign push_num 0
+ mov r2, arg3
+ push r2
+ CACHE_SPLIT_CHECK r2, 8, 64
+ jle near .pixel_sad_8x8_nsplit
+
+ ; ---- split path ----
+ pop r2
+%ifdef X86_32
+ push r3
+ push r4
+ push r5
+%endif
+ %assign push_num 3
+ mov r0, arg1
+ mov r1, arg2
+ SIGN_EXTENTION r1, r1d
+ pxor xmm7, xmm7
+
+ ; r5 = misalignment of the second block within 8 bytes, r2 = aligned-down pointer,
+ ; r4 = 8 - misalignment
+ mov r5, r2
+ and r5, 0x07
+ sub r2, r5
+ mov r4, 8
+ sub r4, r5
+
+ ; convert both byte counts to bit counts and park them in xmm5 / xmm6
+ shl r5, 3
+ shl r4, 3
+ movd xmm5, r5d
+ movd xmm6, r4d
+ ; r5 = next aligned 8-byte chunk
+ mov r5, 8
+ add r5, r2
+ mov r3, arg4
+ SIGN_EXTENTION r3, r3d
+
+ ; rows 0-1, 2-3, 4-5, 6-7
+ SSE2_SAD8X2_CACHE_SPLIT
+ SSE2_SAD8X2_CACHE_SPLIT_NEXT
+ SSE2_SAD8X2_CACHE_SPLIT
+ SSE2_SAD8X2_CACHE_SPLIT_NEXT
+ SSE2_SAD8X2_CACHE_SPLIT
+ SSE2_SAD8X2_CACHE_SPLIT_NEXT
+ SSE2_SAD8X2_CACHE_SPLIT
+
+ ; fold the two quadword partial sums together
+ movhlps xmm0, xmm7
+ paddw xmm0, xmm7
+ movd retrd, xmm0
+%ifdef X86_32
+ pop r5
+ pop r4
+ pop r3
+%endif
+ jmp .return
+
+.pixel_sad_8x8_nsplit:
+ ; ---- no split: plain loads, four rows per SSE2_GetSad8x4 ----
+ pop r2
+ %assign push_num 0
+ LOAD_4_PARA
+ SIGN_EXTENTION r1, r1d
+ SIGN_EXTENTION r3, r3d
+ pxor xmm6, xmm6
+ SSE2_GetSad8x4
+ ; SSE2_GetSad8x4 advanced the pointers by two rows only; skip the remaining two
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ SSE2_GetSad8x4
+ movhlps xmm0, xmm6
+ paddw xmm0, xmm6
+ movd retrd, xmm0
+ LOAD_4_PARA_POP
+.return:
+ ret
\ No newline at end of file
--- /dev/null
+++ b/codec/processing/src/asm/vaa.asm
@@ -1,0 +1,1414 @@
+;*!
+;* \copy
+;* Copyright (c) 2010-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;* vaa.asm
+;*
+;* Abstract
+;* sse2 for pVaa routines
+;*
+;* History
+;* 04/14/2010 Created
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+%ifdef X86_32
+;***********************************************************************
+; Macros and other preprocessor constants
+;***********************************************************************
+
+;%macro SUM_SSE2 4 ; dst, pSrc, zero, pack1_8x2
+; movdqa %1, %2
+; punpcklbw %1, %3
+; punpckhbw %2, %3
+; paddw %1, %2
+; pmaddwd %1, %4
+; pshufd %2, %1, 04Eh ; 01001110 B
+; paddd %1, %2
+; pshufd %2, %1, 0B1h ; 10110001 B
+; paddd %1, %2
+;%endmacro ; END OF SUM_SSE2
+
+; Horizontal add of the eight 16-bit words held in %1.
+; After the macro, word 0 of %1 contains the sum of all eight words (modulo 2^16);
+; the other words of %1 hold intermediate values. %2 is scratch.
+; Per the original author's comparison, this sequence outperforms the phaddw (SSSE3) variant.
+%macro SUM_WORD_8x2_SSE2 2 ; dst(pSrc), tmp
+ ; @sum_8x2 begin
+ pshufd %2, %1, 04Eh ; 01001110 B: swap the two qwords
+ paddw %1, %2
+ pshuflw %2, %1, 04Eh ; 01001110 B: swap the two dwords of the low qword
+ paddw %1, %2
+ pshuflw %2, %1, 0B1h ; 10110001 B: swap adjacent words of the low qword
+ paddw %1, %2
+ ; end of @sum_8x2
+%endmacro ; END of SUM_WORD_8x2_SSE2
+
+; Sum of squares of the 16 unsigned bytes in %2.
+; %1 (dst) : on exit every dword holds the total sum of squares.
+; %2 (pSrc): input bytes; clobbered.
+; %3 (zero): must contain all zeros (used to widen bytes to words).
+%macro SUM_SQR_SSE2 3 ; dst, pSrc, zero
+ movdqa %1, %2
+ punpcklbw %1, %3 ; low 8 bytes -> 8 words
+ punpckhbw %2, %3 ; high 8 bytes -> 8 words
+ pmaddwd %1, %1 ; square each word and add adjacent pairs -> 4 dwords
+ pmaddwd %2, %2
+ paddd %1, %2
+ pshufd %2, %1, 04Eh ; 01001110 B: swap the two qwords
+ paddd %1, %2
+ pshufd %2, %1, 0B1h ; 10110001 B: swap adjacent dwords
+ paddd %1, %2
+%endmacro ; END OF SUM_SQR_SSE2
+
+; Averages four horizontally adjacent 4x4 pixel blocks of a 16-pixel-wide, 4-row strip.
+; Rows are loaded from [esi], [esi+ecx], [esi+ebx] and [esi+edx]; the caller sets up
+; ecx/ebx/edx (presumably 1x/2x/3x the stride -- confirm at the call site).
+; xmm7 must be zero. The movdqa loads require 16-byte-aligned row addresses.
+; Result: words 0..3 of %1 hold (sum of the block's 16 pixels) >> 4 for blocks 0..3,
+; and words 4..7 repeat the same four values. %2..%6 are scratch.
+%macro VAA_AVG_BLOCK_SSE2 6 ; dst, t0, t1, t2, t3, t4
+ movdqa %1, [esi ] ; line 0
+ movdqa %2, [esi+ecx] ; line 1
+ movdqa %3, %1
+ punpcklbw %1, xmm7
+ punpckhbw %3, xmm7
+ movdqa %4, %2
+ punpcklbw %4, xmm7
+ punpckhbw %2, xmm7
+ paddw %1, %4 ; line 0 + line 1, columns 0..7
+ paddw %2, %3 ; line 0 + line 1, columns 8..15
+ movdqa %3, [esi+ebx] ; line 2
+ movdqa %4, [esi+edx] ; line 3
+ movdqa %5, %3
+ punpcklbw %3, xmm7
+ punpckhbw %5, xmm7
+ movdqa %6, %4
+ punpcklbw %6, xmm7
+ punpckhbw %4, xmm7
+ paddw %3, %6 ; line 2 + line 3, columns 0..7
+ paddw %4, %5 ; line 2 + line 3, columns 8..15
+ paddw %1, %3 ; block 0, 1
+ paddw %2, %4 ; block 2, 3
+ pshufd %3, %1, 0B1h ; swap adjacent dwords
+ pshufd %4, %2, 0B1h
+ paddw %1, %3
+ paddw %2, %4
+ movdqa %3, %1
+ movdqa %4, %2
+ pshuflw %5, %1, 0B1h ; swap adjacent words (low qword)
+ pshufhw %6, %3, 0B1h ; swap adjacent words (high qword)
+ paddw %1, %5 ; low words of %1: block 0 sum
+ paddw %3, %6 ; high words of %3: block 1 sum
+ pshuflw %5, %2, 0B1h
+ pshufhw %6, %4, 0B1h
+ paddw %2, %5 ; low words of %2: block 2 sum
+ paddw %4, %6 ; high words of %4: block 3 sum
+ punpcklwd %1, %2 ; b0 b2 b0 b2 ...
+ punpckhwd %3, %4 ; b1 b3 b1 b3 ...
+ punpcklwd %1, %3 ; b0 b1 b2 b3 b0 b1 b2 b3
+ psraw %1, $4 ; divide by 16 pixels per block
+%endmacro
+
+; SSSE3 variant of the 4x4 block averaging: same loads and vertical sums as the SSE2
+; version, but the horizontal reduction uses phaddw.
+; Rows are loaded from [esi], [esi+ecx], [esi+ebx] and [esi+edx]; the caller sets up
+; ecx/ebx/edx (presumably 1x/2x/3x the stride -- confirm at the call site).
+; xmm7 must be zero. The movdqa loads require 16-byte-aligned row addresses.
+; Result: words 0..3 of %1 hold (sum of the block's 16 pixels) >> 4 for blocks 0..3;
+; words 4..7 come from the phaddw with xmm7 and are therefore zero.
+; %2..%6 are scratch.
+%macro VAA_AVG_BLOCK_SSSE3 6 ; dst, t0, t1, t2, t3, t4
+ movdqa %1, [esi ] ; line 0
+ movdqa %2, [esi+ecx] ; line 1
+ movdqa %3, %1
+ punpcklbw %1, xmm7
+ punpckhbw %3, xmm7
+ movdqa %4, %2
+ punpcklbw %4, xmm7
+ punpckhbw %2, xmm7
+ paddw %1, %4 ; line 0 + line 1, columns 0..7
+ paddw %2, %3 ; line 0 + line 1, columns 8..15
+ movdqa %3, [esi+ebx] ; line 2
+ movdqa %4, [esi+edx] ; line 3
+ movdqa %5, %3
+ punpcklbw %3, xmm7
+ punpckhbw %5, xmm7
+ movdqa %6, %4
+ punpcklbw %6, xmm7
+ punpckhbw %4, xmm7
+ paddw %3, %6 ; line 2 + line 3, columns 0..7
+ paddw %4, %5 ; line 2 + line 3, columns 8..15
+ paddw %1, %3 ; block 0, 1
+ paddw %2, %4 ; block 2, 3
+ phaddw %1, %2 ; block[0]: 0-15, 16-31; block[1]: 32-47, 48-63; ..
+ phaddw %1, xmm7 ; block[0]: 0-15; block[1]: 16-31; block[2]: 32-47; block[3]: 48-63; ....
+ psraw %1, $4 ; divide by 16 pixels per block
+%endmacro
+
+; Adds the SAD of two 16-byte rows ([esi] vs [edi] and [esi+ebx] vs [edi+ebx]) to xmm6,
+; then advances esi and edi by 2*ebx.
+; psadbw yields one sum per 8-byte half, so xmm6 carries two partial sums:
+; dword 0 for bytes 0..7 and dword 2 for bytes 8..15.
+; Clobbers xmm1-xmm4. The movdqa loads require 16-byte-aligned addresses.
+%macro WELS_SAD_16x2_SSE2 0
+ movdqa xmm1, [esi]
+ movdqa xmm2, [edi]
+ movdqa xmm3, [esi+ebx]
+ movdqa xmm4, [edi+ebx]
+ psadbw xmm1, xmm2
+ psadbw xmm3, xmm4
+ paddd xmm6, xmm1
+ paddd xmm6, xmm3
+ lea esi, [esi+ebx*2]
+ lea edi, [edi+ebx*2]
+%endmacro
+
+; Processes one 16-byte row at [esi] against the row at [edi] and accumulates:
+; xmm6 += SAD([esi], [edi]) (dword 0: bytes 0..7, dword 2: bytes 8..15)
+; xmm5 += sum of the [esi] bytes (same two-half layout; psadbw against zero)
+; xmm4 += squares of the [esi] bytes (four dword partial sums)
+; xmm0 must be zero. Clobbers xmm1-xmm3. Advances esi and edi by ebx.
+; The movdqa loads require 16-byte-aligned addresses.
+%macro WELS_SAD_SUM_SQSUM_16x1_SSE2 0
+ movdqa xmm1, [esi]
+ movdqa xmm2, [edi]
+ movdqa xmm3, xmm1
+ psadbw xmm3, xmm2
+ paddd xmm6, xmm3 ; sad
+
+ movdqa xmm3, xmm1
+ psadbw xmm3, xmm0
+ paddd xmm5, xmm3 ; sum
+
+ movdqa xmm2, xmm1
+ punpcklbw xmm1, xmm0
+ punpckhbw xmm2, xmm0
+ pmaddwd xmm1, xmm1
+ pmaddwd xmm2, xmm2
+ paddd xmm4, xmm1
+ paddd xmm4, xmm2 ; sqsum
+
+ add esi, ebx
+ add edi, ebx
+%endmacro
+
+; Processes one 16-byte row at [esi] against the row at [edi] and accumulates:
+; xmm7 += SAD([esi], [edi]) (dword 0: bytes 0..7, dword 2: bytes 8..15)
+; xmm6 += sum of the [esi] bytes (same two-half layout)
+; xmm5 += squares of the [esi] bytes (four dword partial sums)
+; xmm4 += squares of |[esi] - [edi]| (four dword partial sums)
+; xmm0 must be zero. Clobbers xmm1-xmm3. Advances esi and edi by ebx.
+; The movdqa loads require 16-byte-aligned addresses.
+%macro WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 0
+ movdqa xmm1, [esi]
+ movdqa xmm2, [edi]
+ movdqa xmm3, xmm1
+ psadbw xmm3, xmm2
+ paddd xmm7, xmm3 ; sad
+
+ movdqa xmm3, xmm1
+ pmaxub xmm3, xmm2
+ pminub xmm2, xmm1
+ psubb xmm3, xmm2 ; diff: max - min = per-byte absolute difference
+
+ movdqa xmm2, xmm1
+ psadbw xmm2, xmm0
+ paddd xmm6, xmm2 ; sum
+
+ movdqa xmm2, xmm1
+ punpcklbw xmm1, xmm0
+ punpckhbw xmm2, xmm0
+ pmaddwd xmm1, xmm1
+ pmaddwd xmm2, xmm2
+ paddd xmm5, xmm1
+ paddd xmm5, xmm2 ; sqsum
+
+ movdqa xmm1, xmm3
+ punpcklbw xmm1, xmm0
+ punpckhbw xmm3, xmm0
+ pmaddwd xmm1, xmm1
+ pmaddwd xmm3, xmm3
+ paddd xmm4, xmm1
+ paddd xmm4, xmm3 ; sqdiff
+
+ add esi, ebx
+ add edi, ebx
+%endmacro
+
+; Processes one 16-byte row at [esi] against the row at [edi] and accumulates:
+; %1 (sad_reg) += SAD([esi], [edi]) (dword 0: bytes 0..7, dword 2: bytes 8..15)
+; %2 (sum_cur_reg) += sum of the [esi] bytes (same two-half layout)
+; %3 (sum_ref_reg) += sum of the [edi] bytes (same two-half layout)
+; %4 (mad_reg) = per-byte running maximum of |[esi] - [edi]|
+; xmm0 must be zero. Clobbers xmm1-xmm3. Advances esi and edi by ebx.
+; The movdqa loads require 16-byte-aligned addresses.
+; Note: the %define aliases below are ordinary single-line macros; they stay defined
+; after the expansion and are simply redefined by the next one.
+%macro WELS_SAD_SD_MAD_16x1_SSE2 4
+%define sad_reg %1
+%define sum_cur_reg %2
+%define sum_ref_reg %3
+%define mad_reg %4
+ movdqa xmm1, [esi]
+ movdqa xmm2, [edi]
+ movdqa xmm3, xmm1
+ psadbw xmm3, xmm0
+ paddd sum_cur_reg, xmm3 ; sum_cur
+ movdqa xmm3, xmm2
+ psadbw xmm3, xmm0
+ paddd sum_ref_reg, xmm3 ; sum_ref
+
+ movdqa xmm3, xmm1
+ pmaxub xmm3, xmm2
+ pminub xmm2, xmm1
+ psubb xmm3, xmm2 ; abs diff
+ pmaxub mad_reg, xmm3 ; max abs diff
+
+ psadbw xmm3, xmm0 ; summing the abs diffs gives the SAD
+ paddd sad_reg, xmm3 ; sad
+
+ add esi, ebx
+ add edi, ebx
+%endmacro
+
+
+; Horizontal byte maximum within each 8-byte half of %1.
+; After the macro, byte 0 of %1 is the maximum of original bytes 0..7 and byte 8 is the
+; maximum of original bytes 8..15 (psrldq shifts in zeros, so the upper half never sees
+; data from outside the register). The remaining bytes hold intermediate values.
+; Clobbers xmm1.
+%macro WELS_MAX_REG_SSE2 1 ; xmm1, xmm2, xmm3 can be used
+%define max_reg %1
+ movdqa xmm1, max_reg
+ psrldq xmm1, 4
+ pmaxub max_reg, xmm1 ; byte i = max(b[i], b[i+4])
+ movdqa xmm1, max_reg
+ psrldq xmm1, 2
+ pmaxub max_reg, xmm1 ; byte i = max over b[i], b[i+2], b[i+4], b[i+6]
+ movdqa xmm1, max_reg
+ psrldq xmm1, 1
+ pmaxub max_reg, xmm1 ; byte i = max over b[i..i+7]
+%endmacro
+
+; Processes one 16-byte row at [esi] against the row at [edi] and accumulates four
+; statistics, two of them interleaved into a single register:
+; %1 (sad_reg) : dwords 0/2 += SAD of bytes 0..7 / 8..15;
+; dwords 1/3 += two partial sums of squares of the [esi] row
+; (the two partials add up to the full row square sum).
+; %2 (sum_reg) : dwords 0/2 += sum of [esi] bytes 0..7 / 8..15;
+; dwords 1/3 += sum of [edi] bytes 0..7 / 8..15.
+; %3 (mad_reg) : per-byte running maximum of |[esi] - [edi]|.
+; %4 (sqdiff_reg) : four dword partial sums of squared absolute differences.
+; xmm0 must be zero. Clobbers xmm1-xmm3. Advances esi and edi by ebx.
+; The movdqa loads require 16-byte-aligned addresses.
+%macro WELS_SAD_BGD_SQDIFF_16x1_SSE2 4
+%define sad_reg %1
+%define sum_reg %2
+%define mad_reg %3
+%define sqdiff_reg %4
+ movdqa xmm1, [esi]
+ movdqa xmm2, xmm1
+ movdqa xmm3, xmm1
+ punpcklbw xmm2, xmm0
+ punpckhbw xmm3, xmm0
+ pmaddwd xmm2, xmm2
+ pmaddwd xmm3, xmm3
+ paddd xmm2, xmm3
+ movdqa xmm3, xmm2
+ psllq xmm2, 32 ; move dwords 0/2 up into dwords 1/3
+ psrlq xmm3, 32
+ psllq xmm3, 32 ; keep only dwords 1/3
+ paddd xmm2, xmm3 ; dwords 1/3 = partial square sums, dwords 0/2 = 0
+ paddd sad_reg, xmm2 ; sqsum
+
+ movdqa xmm2, [edi]
+ movdqa xmm3, xmm1
+ psadbw xmm3, xmm0
+ paddd sum_reg, xmm3 ; sum_cur
+ movdqa xmm3, xmm2
+ psadbw xmm3, xmm0
+ pslldq xmm3, 4 ; shift the two sums into dwords 1/3
+ paddd sum_reg, xmm3 ; sum_ref
+
+ movdqa xmm3, xmm1
+ pmaxub xmm3, xmm2
+ pminub xmm2, xmm1
+ psubb xmm3, xmm2 ; abs diff
+ pmaxub mad_reg, xmm3 ; max abs diff
+
+ movdqa xmm1, xmm3
+ psadbw xmm3, xmm0
+ paddd sad_reg, xmm3 ; sad
+
+ movdqa xmm3, xmm1
+ punpcklbw xmm1, xmm0
+ punpckhbw xmm3, xmm0
+ pmaddwd xmm1, xmm1
+ pmaddwd xmm3, xmm3
+ paddd sqdiff_reg, xmm1
+ paddd sqdiff_reg, xmm3 ; sqdiff
+
+ add esi, ebx
+ add edi, ebx
+%endmacro
+
+
+;***********************************************************************
+; Local Data (Read Only)
+;***********************************************************************
+
+;SECTION .rodata align=16
+
+;ALIGN 16
+;pack1_8x2:
+; dw 1, 1, 1, 1, 1, 1, 1, 1
+
+;***********************************************************************
+; Code
+;***********************************************************************
+
+SECTION .text
+
+WELS_EXTERN rc_sad_frame_sse2
+;***********************************************************************
+; uint32_t rc_sad_frame_sse2( uint8_t *ref_orig, uint8_t *cur_orig, const int mb_width, const int iPicHeight, const int iPicStride );
+;
+; Returns in eax the sum of absolute differences between two pictures.
+; Each of the iPicHeight rows is processed as mb_width chunks of 16 bytes; both
+; pointers then advance by iPicStride bytes to the next row.
+; Arguments are read from the stack (32-bit build only).
+; The movdqa loads require 16-byte-aligned row addresses.
+; Both loops test their counter at the bottom, so mb_width and iPicHeight are
+; assumed to be >= 1.
+;***********************************************************************
+ALIGN 16
+rc_sad_frame_sse2:
+ push esi
+ push edi
+ push ebp
+ push ebx
+ push edx
+
+ ; five pushes (20 bytes) + return address (4) => first argument at esp+24
+ mov esi, [esp+24] ; ref_orig
+ mov edi, [esp+28] ; cur_orig
+ mov ebx, [esp+32] ; mb_width
+ mov ecx, [esp+36] ; iPicHeight (row counter)
+ mov edx, [esp+40] ; iPicStride
+ pxor xmm0, xmm0 ; frame SAD accumulator
+.hloop:
+ mov eax, ebx ; 16-byte chunks left in this row
+ mov ebp, $0 ; byte offset inside the row
+.wloop:
+ movdqa xmm1, [esi+ebp]
+ movdqa xmm2, [edi+ebp]
+ psadbw xmm1, xmm2
+ pshufd xmm2, xmm1, 0f6h ; 11110110 B ; movhlps for float (brings the high-half SAD into dword 0)
+ paddd xmm1, xmm2 ; dword 0 = SAD of all 16 bytes
+ paddd xmm0, xmm1
+ add ebp, 010h
+ dec eax
+ jnz near .wloop
+ lea esi, [esi+edx]
+ lea edi, [edi+edx]
+ dec ecx
+ jnz near .hloop
+
+ movd eax, xmm0 ; return value: dword 0 of the accumulator
+ pop edx
+ pop ebx
+ pop ebp
+ pop edi
+ pop esi
+ ret
+
+
+WELS_EXTERN SampleVariance16x16_sse2
+;***********************************************************************
+; void SampleVariance16x16_sse2( uint8_t * y_ref, int32_t y_ref_stride, uint8_t * y_src, int32_t y_src_stride,SMotionTextureUnit* pMotionTexture );
+;
+; For a 16x16 block, accumulates over all 256 pixels:
+; SUM = sum of |y_ref - y_src| SQR = sum of |y_ref - y_src|^2
+; SUM_CUR = sum of y_src SQR_CUR = sum of y_src^2
+; and stores two 16-bit results through pMotionTexture:
+; [pMotionTexture+0] = (SQR >> 8) - (SUM >> 8)^2 (uiMotionIndex)
+; [pMotionTexture+2] = (SQR_CUR >> 8) - (SUM_CUR >> 8)^2 (uiTextureIndex)
+; The >> 8 divides by the 256 pixels of the block.
+; Arguments are read from the stack (32-bit build only).
+; The movdqa loads require every row of y_ref and y_src to be 16-byte aligned.
+;***********************************************************************
+ALIGN 16
+SampleVariance16x16_sse2:
+ push esi
+ push edi
+ push ebx
+
+ sub esp, 16 ; four dword locals, see the defines below
+ %define SUM [esp]
+ %define SUM_CUR [esp+4]
+ %define SQR [esp+8]
+ %define SQR_CUR [esp+12]
+ %define PUSH_SIZE 28 ; 12 + 16
+
+ mov edi, [esp+PUSH_SIZE+4] ; y_ref
+ mov edx, [esp+PUSH_SIZE+8] ; y_ref_stride
+ mov esi, [esp+PUSH_SIZE+12] ; y_src
+ mov eax, [esp+PUSH_SIZE+16] ; y_src_stride
+ mov ecx, 010h ; height = 16
+
+ pxor xmm7, xmm7
+ movdqu SUM, xmm7 ; one 16-byte store zeroes all four locals
+
+.hloops:
+ movdqa xmm0, [edi] ; y_ref
+ movdqa xmm1, [esi] ; y_src
+ movdqa xmm2, xmm0 ; store first for future process
+ movdqa xmm3, xmm1
+ ; sum += diff;
+ movdqa xmm4, xmm0
+ psadbw xmm4, xmm1 ; 2 parts, [0,..,15], [64,..,79]
+ ; to be continued for sum
+ pshufd xmm5, xmm4, 0C6h ; 11000110 B (swaps dword 0 and dword 2)
+ paddw xmm4, xmm5 ; dword 0 = SAD of the whole row
+ movd ebx, xmm4
+ add SUM, ebx
+
+ ; sqr += diff * diff;
+ pmaxub xmm0, xmm1
+ pminub xmm1, xmm2
+ psubb xmm0, xmm1 ; diff
+ SUM_SQR_SSE2 xmm1, xmm0, xmm7 ; dst, pSrc, zero
+ movd ebx, xmm1
+ add SQR, ebx
+
+ ; sum_cur += y_src[x];
+ movdqa xmm0, xmm3 ; cur_orig
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm7
+ punpckhbw xmm1, xmm7
+ paddw xmm0, xmm1 ; 8x2
+ SUM_WORD_8x2_SSE2 xmm0, xmm1
+ movd ebx, xmm0
+ and ebx, 0ffffh ; only word 0 holds the row sum
+ add SUM_CUR, ebx
+
+ ; sqr_cur += y_src[x] * y_src[x];
+ SUM_SQR_SSE2 xmm0, xmm3, xmm7 ; dst, pSrc, zero
+ movd ebx, xmm0
+ add SQR_CUR, ebx
+
+ lea edi, [edi+edx]
+ lea esi, [esi+eax]
+ dec ecx
+ jnz near .hloops
+
+ mov ebx, 0
+ mov bx, word SUM ; only the low 16 bits are read (256 * 255 fits in 16 bits)
+ sar ebx, 8
+ imul ebx, ebx
+ mov ecx, SQR
+ sar ecx, 8
+ sub ecx, ebx
+ mov edi, [esp+PUSH_SIZE+20] ; pMotionTexture
+ mov [edi], cx ; to store uiMotionIndex
+ mov ebx, 0
+ mov bx, word SUM_CUR
+ sar ebx, 8
+ imul ebx, ebx
+ mov ecx, SQR_CUR
+ sar ecx, 8
+ sub ecx, ebx
+ mov [edi+2], cx ; to store uiTextureIndex
+
+ %undef SUM
+ %undef SUM_CUR
+ %undef SQR
+ %undef SQR_CUR
+ %undef PUSH_SIZE
+
+ add esp, 16
+ pop ebx
+ pop edi
+ pop esi
+
+ ret
+
+; , 6/7/2010
+
+
+WELS_EXTERN abs_difference_mbrow_sse2
+;*************************************************************************************************************
+;void abs_difference_mbrow_sse2( uint8_t *ref_orig, uint8_t *cur_orig, int32_t iPicStride,
+; int32_t gom_pixel_num, int32_t *pSum)
+;
+; Accumulates the sum of absolute differences between ref_orig and cur_orig over
+; 16 rows of gom_pixel_num bytes each (rows are iPicStride bytes apart) and ADDS the
+; result to *pSum (the previous value of *pSum is kept).
+; Each row is consumed in 16-byte steps and the end test is at the bottom of the loop,
+; so at least 16 bytes per row are always processed.
+; Arguments are read from the stack (32-bit build only).
+; The movdqa loads require 16-byte-aligned addresses.
+;*************************************************************************************************************
+ALIGN 16
+abs_difference_mbrow_sse2:
+%define ref_orig esp + pushsize + 4
+%define cur_orig esp + pushsize + 8
+%define iPicStride esp + pushsize + 12
+%define gom_pixel_num esp + pushsize + 16
+%define pSum esp + pushsize + 20
+%define pushsize 12
+ push esi
+ push edi
+ push ebx
+ mov esi, [ref_orig]
+ mov edi, [cur_orig]
+ mov ebx, [iPicStride]
+ mov eax, [gom_pixel_num]
+ mov ecx, 16 ;MB_WIDTH_LUMA
+ pxor xmm0, xmm0 ; SAD accumulator (two qword partial sums)
+mb_width_loop_p:
+ mov edx, esi
+ add edx, eax ; end address
+gom_row_loop_p:
+ movdqa xmm1, [esi]
+ movdqa xmm2, [edi]
+ psadbw xmm1, xmm2
+ paddd xmm0, xmm1
+ add esi, 16
+ add edi, 16
+ cmp esi, edx
+ jb gom_row_loop_p ; addresses are unsigned: a signed jl breaks when a row straddles 0x80000000
+
+ sub esi, eax ; back to the start of the row ...
+ sub edi, eax
+ add esi, ebx ; ... and down one line
+ add edi, ebx
+ loop mb_width_loop_p
+
+ movdqa xmm1, xmm0
+ psrldq xmm1, 8
+ paddd xmm1, xmm0 ; low-half SAD + high-half SAD
+ movd eax, xmm1
+ mov edx, [pSum] ; pSum
+ add [edx], eax
+
+%undef ref_orig
+%undef cur_orig
+%undef iPicStride
+%undef gom_pixel_num
+%undef pSum
+%undef pushsize
+ pop ebx
+ pop edi
+ pop esi
+ ret
+
+
+
+
+WELS_EXTERN sum_sqrsum_mbrow_sse2
+;*************************************************************************************************************
+;void sum_sqrsum_mbrow_sse2( uint8_t *cur_orig, int32_t iPicStride,
+; int32_t gom_pixel_num, int32_t *pSum, int32_t *pSqrSum)
+;
+; Over 16 rows of gom_pixel_num bytes each (rows are iPicStride bytes apart) computes
+; the sum of the pixels and the sum of their squares, and ADDS them to *pSum and
+; *pSqrSum respectively (previous contents are kept).
+; Each row is consumed in 16-byte steps and the end test is at the bottom of the loop,
+; so at least 16 bytes per row are always processed.
+; Arguments are read from the stack (32-bit build only).
+; The movdqa loads require 16-byte-aligned addresses.
+;*************************************************************************************************************
+ALIGN 16
+sum_sqrsum_mbrow_sse2:
+%define cur_orig esp + pushsize + 4
+%define iPicStride esp + pushsize + 8
+%define gom_pixel_num esp + pushsize + 12
+%define pSum esp + pushsize + 16
+%define pSqrSum esp + pushsize + 20
+%define pushsize 8
+ push esi
+ push ebx
+ mov esi, [cur_orig]
+ mov eax, [gom_pixel_num]
+ mov ebx, [iPicStride]
+ mov ecx, 16 ;MB_WIDTH_LUMA
+ pxor xmm0, xmm0 ; zero
+ pxor xmm1, xmm1 ; sum
+ pxor xmm2, xmm2 ; sqr sum
+mb_width_loop_i:
+ mov edx, esi
+ add edx, eax ; end address
+gom_row_loop_i:
+ movdqa xmm3, [esi]
+ movdqa xmm4, xmm3
+ psadbw xmm4, xmm0 ; SAD against zero = sum of the bytes
+ paddd xmm1, xmm4
+ movdqa xmm4, xmm3
+ punpcklbw xmm4, xmm0
+ punpckhbw xmm3, xmm0
+ pmaddwd xmm4, xmm4
+ pmaddwd xmm3, xmm3
+ paddd xmm2, xmm3
+ paddd xmm2, xmm4
+ add esi, 16
+ cmp esi, edx
+ jb gom_row_loop_i ; addresses are unsigned: a signed jl breaks when a row straddles 0x80000000
+
+ sub esi, eax ; back to the start of the row ...
+ add esi, ebx ; ... and down one line
+ loop mb_width_loop_i
+
+ ; fold the two psadbw halves
+ movdqa xmm3, xmm1
+ psrldq xmm3, 8
+ paddd xmm1, xmm3
+ movd eax, xmm1
+ mov edx, [pSum]
+ add [edx], eax
+
+ ; fold the four dword partial square sums
+ movdqa xmm3, xmm2
+ psrldq xmm3, 8
+ paddd xmm2, xmm3
+ movdqa xmm3, xmm2
+ psrldq xmm3, 4
+ paddd xmm2, xmm3
+ movd eax, xmm2
+ mov edx, [pSqrSum]
+ add [edx], eax
+
+
+%undef cur_orig
+%undef iPicStride
+%undef gom_pixel_num
+%undef pSum
+%undef pSqrSum
+%undef pushsize
+ pop ebx
+ pop esi
+ ret
+
+
+
+WELS_EXTERN VAACalcSad_sse2
+;*************************************************************************************************************
+;void VAACalcSad_sse2( uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight
+; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8)
+;
+; Walks the picture in 16x16 macroblocks (iPicWidth/16 by iPicHeight/16) and writes:
+; psad8x8 : four dwords per macroblock, the SADs of its 8x8 quadrants in the order
+; top-left, top-right, bottom-left, bottom-right;
+; *psadframe : the sum of all those SADs.
+; Arguments are read from the stack (32-bit build only); the iPicWidth and iPicHeight
+; argument slots are overwritten in place and used as loop counters.
+; The movdqa loads require 16-byte-aligned rows. Both loops test their counter at
+; the bottom, so width and height are assumed to be at least 16.
+;*************************************************************************************************************
+
+
+ALIGN 16
+VAACalcSad_sse2:
+%define cur_data esp + pushsize + 4
+%define ref_data esp + pushsize + 8
+%define iPicWidth esp + pushsize + 12
+%define iPicHeight esp + pushsize + 16
+%define iPicStride esp + pushsize + 20
+%define psadframe esp + pushsize + 24
+%define psad8x8 esp + pushsize + 28
+%define pushsize 12
+ push esi
+ push edi
+ push ebx
+ mov esi, [cur_data]
+ mov edi, [ref_data]
+ mov ebx, [iPicStride]
+ mov edx, [psad8x8]
+ mov eax, ebx
+
+ shr dword [iPicWidth], 4 ; iPicWidth/16
+ shr dword [iPicHeight], 4 ; iPicHeight/16
+ shl eax, 4 ; iPicStride*16
+ pxor xmm0, xmm0
+ pxor xmm7, xmm7 ; iFrameSad
+height_loop:
+ mov ecx, dword [iPicWidth]
+ ; Between this push and the matching pop the esp-relative argument defines are
+ ; off by 8; no stack argument is referenced inside width_loop.
+ push esi
+ push edi
+width_loop:
+ pxor xmm6, xmm6 ;
+ ; rows 0..7 of the macroblock
+ WELS_SAD_16x2_SSE2
+ WELS_SAD_16x2_SSE2
+ WELS_SAD_16x2_SSE2
+ WELS_SAD_16x2_SSE2
+ paddd xmm7, xmm6
+ movd [edx], xmm6 ; top-left 8x8
+ psrldq xmm6, 8
+ movd [edx+4], xmm6 ; top-right 8x8
+
+ pxor xmm6, xmm6
+ ; rows 8..15 of the macroblock
+ WELS_SAD_16x2_SSE2
+ WELS_SAD_16x2_SSE2
+ WELS_SAD_16x2_SSE2
+ WELS_SAD_16x2_SSE2
+ paddd xmm7, xmm6
+ movd [edx+8], xmm6 ; bottom-left 8x8
+ psrldq xmm6, 8
+ movd [edx+12], xmm6 ; bottom-right 8x8
+
+ add edx, 16
+ sub esi, eax ; undo the 16-row advance ...
+ sub edi, eax
+ add esi, 16 ; ... and step to the next macroblock on the right
+ add edi, 16
+
+ dec ecx
+ jnz width_loop
+
+ pop edi
+ pop esi
+ add esi, eax ; next macroblock row
+ add edi, eax
+
+ dec dword [iPicHeight]
+ jnz height_loop
+
+ mov edx, [psadframe]
+ movdqa xmm5, xmm7
+ psrldq xmm7, 8
+ paddd xmm7, xmm5 ; fold the two partial frame sums
+ movd [edx], xmm7
+
+%undef cur_data
+%undef ref_data
+%undef iPicWidth
+%undef iPicHeight
+%undef iPicStride
+%undef psadframe
+%undef psad8x8
+%undef pushsize
+ pop ebx
+ pop edi
+ pop esi
+ ret
+
+
+WELS_EXTERN VAACalcSadVar_sse2
+;*************************************************************************************************************
+;void VAACalcSadVar_sse2( uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight
+; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16)
+;
+; Walks the picture in 16x16 macroblocks (iPicWidth/16 by iPicHeight/16) and writes:
+; psad8x8 : four dwords per macroblock, SADs of the 8x8 quadrants in the order
+; top-left, top-right, bottom-left, bottom-right;
+; psum16x16 : one dword per macroblock, sum of the cur_data pixels;
+; psqsum16x16 : one dword per macroblock, sum of the squared cur_data pixels;
+; *psadframe : sum of all 8x8 SADs.
+; Arguments are read from the stack (32-bit build only). The iPicWidth/iPicHeight slots
+; are overwritten and used as counters; the psum16x16/psqsum16x16 slots are advanced
+; in place as output cursors.
+; The movdqa loads require 16-byte-aligned rows. Both loops test their counter at
+; the bottom, so width and height are assumed to be at least 16.
+;*************************************************************************************************************
+
+
+ALIGN 16
+VAACalcSadVar_sse2:
+%define localsize 8
+%define cur_data esp + pushsize + localsize + 4
+%define ref_data esp + pushsize + localsize + 8
+%define iPicWidth esp + pushsize + localsize + 12
+%define iPicHeight esp + pushsize + localsize + 16
+%define iPicStride esp + pushsize + localsize + 20
+%define psadframe esp + pushsize + localsize + 24
+%define psad8x8 esp + pushsize + localsize + 28
+%define psum16x16 esp + pushsize + localsize + 32
+%define psqsum16x16 esp + pushsize + localsize + 36
+%define tmp_esi esp + 0
+%define tmp_edi esp + 4
+%define pushsize 16
+ push ebp
+ push esi
+ push edi
+ push ebx
+ sub esp, localsize
+ mov esi, [cur_data]
+ mov edi, [ref_data]
+ mov ebx, [iPicStride]
+ mov edx, [psad8x8]
+ mov eax, ebx
+
+ shr dword [iPicWidth], 4 ; iPicWidth/16
+ shr dword [iPicHeight], 4 ; iPicHeight/16
+ shl eax, 4 ; iPicStride*16
+ pxor xmm0, xmm0 ; zero register required by the row macro
+ pxor xmm7, xmm7 ; iFrameSad
+var_height_loop:
+ mov ecx, dword [iPicWidth]
+ mov [tmp_esi], esi ; remember the start of this macroblock row
+ mov [tmp_edi], edi
+var_width_loop:
+ pxor xmm6, xmm6 ; hiQuad_loQuad pSad8x8
+ pxor xmm5, xmm5 ; pSum16x16
+ pxor xmm4, xmm4 ; sqsum_16x16
+ ; rows 0..7 of the macroblock
+ WELS_SAD_SUM_SQSUM_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_16x1_SSE2
+ paddd xmm7, xmm6
+ movd [edx], xmm6 ; top-left 8x8
+ psrldq xmm6, 8
+ movd [edx+4], xmm6 ; top-right 8x8
+
+ pxor xmm6, xmm6
+ ; rows 8..15 of the macroblock
+ WELS_SAD_SUM_SQSUM_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_16x1_SSE2
+ paddd xmm7, xmm6
+ movd [edx+8], xmm6 ; bottom-left 8x8
+ psrldq xmm6, 8
+ movd [edx+12], xmm6 ; bottom-right 8x8
+
+ mov ebp, [psum16x16]
+ movdqa xmm1, xmm5
+ psrldq xmm1, 8
+ paddd xmm5, xmm1 ; fold the two half-row sums
+ movd [ebp], xmm5
+ add dword [psum16x16], 4
+
+ ; fold the four dword partial square sums
+ movdqa xmm5, xmm4
+ psrldq xmm5, 8
+ paddd xmm4, xmm5
+ movdqa xmm3, xmm4
+ psrldq xmm3, 4
+ paddd xmm4, xmm3
+
+ mov ebp, [psqsum16x16]
+ movd [ebp], xmm4
+ add dword [psqsum16x16], 4
+
+ add edx, 16
+ sub esi, eax ; undo the 16-row advance ...
+ sub edi, eax
+ add esi, 16 ; ... and step to the next macroblock on the right
+ add edi, 16
+
+ dec ecx
+ jnz var_width_loop
+
+ mov esi, [tmp_esi]
+ mov edi, [tmp_edi]
+ add esi, eax ; next macroblock row
+ add edi, eax
+
+ dec dword [iPicHeight]
+ jnz var_height_loop
+
+ mov edx, [psadframe]
+ movdqa xmm5, xmm7
+ psrldq xmm7, 8
+ paddd xmm7, xmm5 ; fold the two partial frame sums
+ movd [edx], xmm7
+
+ add esp, localsize
+ pop ebx
+ pop edi
+ pop esi
+ pop ebp
+%undef cur_data
+%undef ref_data
+%undef iPicWidth
+%undef iPicHeight
+%undef iPicStride
+%undef psadframe
+%undef psad8x8
+%undef psum16x16
+%undef psqsum16x16
+%undef tmp_esi
+%undef tmp_edi
+%undef pushsize
+%undef localsize
+ ret
+
+
+
+WELS_EXTERN VAACalcSadSsd_sse2
+;*************************************************************************************************************
+;void VAACalcSadSsd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+; int32_t iPicStride,int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, int32_t *psqdiff16x16)
+;
+; Walks the picture in 16x16 macroblocks (iPicWidth/16 by iPicHeight/16) and writes:
+; psad8x8 : four dwords per macroblock, SADs of the 8x8 quadrants in the order
+; top-left, top-right, bottom-left, bottom-right;
+; psum16x16 : one dword per macroblock, sum of the cur_data pixels;
+; psqsum16x16 : one dword per macroblock, sum of the squared cur_data pixels;
+; psqdiff16x16 : one dword per macroblock, sum of squared |cur - ref|;
+; *psadframe : sum of all 8x8 SADs.
+; Arguments are read from the stack (32-bit build only). The iPicWidth/iPicHeight slots
+; are overwritten and used as counters; the psum16x16/psqsum16x16/psqdiff16x16 slots are
+; advanced in place as output cursors.
+; The movdqa loads require 16-byte-aligned rows. Both loops test their counter at
+; the bottom, so width and height are assumed to be at least 16.
+;*************************************************************************************************************
+
+
+ALIGN 16
+VAACalcSadSsd_sse2:
+%define localsize 12
+%define cur_data esp + pushsize + localsize + 4
+%define ref_data esp + pushsize + localsize + 8
+%define iPicWidth esp + pushsize + localsize + 12
+%define iPicHeight esp + pushsize + localsize + 16
+%define iPicStride esp + pushsize + localsize + 20
+%define psadframe esp + pushsize + localsize + 24
+%define psad8x8 esp + pushsize + localsize + 28
+%define psum16x16 esp + pushsize + localsize + 32
+%define psqsum16x16 esp + pushsize + localsize + 36
+%define psqdiff16x16 esp + pushsize + localsize + 40
+%define tmp_esi esp + 0
+%define tmp_edi esp + 4
+%define tmp_sadframe esp + 8
+%define pushsize 16
+ push ebp
+ push esi
+ push edi
+ push ebx
+ sub esp, localsize
+ mov esi, [cur_data]
+ mov edi, [ref_data]
+ mov ebx, [iPicStride]
+ mov edx, [psad8x8]
+ mov eax, ebx
+
+ shr dword [iPicWidth], 4 ; iPicWidth/16
+ shr dword [iPicHeight], 4 ; iPicHeight/16
+ shl eax, 4 ; iPicStride*16
+ pxor xmm0, xmm0 ; zero register required by the row macro
+ movd [tmp_sadframe], xmm0 ; frame SAD is accumulated in a stack local
+sqdiff_height_loop:
+ mov ecx, dword [iPicWidth]
+ mov [tmp_esi], esi ; remember the start of this macroblock row
+ mov [tmp_edi], edi
+sqdiff_width_loop:
+ pxor xmm7, xmm7 ; hiQuad_loQuad pSad8x8
+ pxor xmm6, xmm6 ; pSum16x16
+ pxor xmm5, xmm5 ; sqsum_16x16 four dword
+ pxor xmm4, xmm4 ; sqdiff_16x16 four Dword
+ ; rows 0..7 of the macroblock
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+ movdqa xmm1, xmm7
+ movd [edx], xmm7 ; top-left 8x8
+ psrldq xmm7, 8
+ paddd xmm1, xmm7
+ movd [edx+4], xmm7 ; top-right 8x8
+ movd ebp, xmm1
+ add [tmp_sadframe], ebp
+
+ pxor xmm7, xmm7
+ ; rows 8..15 of the macroblock
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+ movdqa xmm1, xmm7
+ movd [edx+8], xmm7 ; bottom-left 8x8
+ psrldq xmm7, 8
+ paddd xmm1, xmm7
+ movd [edx+12], xmm7 ; bottom-right 8x8
+ movd ebp, xmm1
+ add [tmp_sadframe], ebp
+
+ mov ebp, [psum16x16]
+ movdqa xmm1, xmm6
+ psrldq xmm1, 8
+ paddd xmm6, xmm1 ; fold the two half-row sums
+ movd [ebp], xmm6
+ add dword [psum16x16], 4
+
+ mov ebp, [psqsum16x16]
+ pshufd xmm6, xmm5, 14 ;00001110
+ paddd xmm6, xmm5
+ pshufd xmm5, xmm6, 1 ;00000001
+ paddd xmm5, xmm6 ; dword 0 = sum of the four partials
+ movd [ebp], xmm5
+ add dword [psqsum16x16], 4
+
+ mov ebp, [psqdiff16x16]
+ pshufd xmm5, xmm4, 14 ; 00001110
+ paddd xmm5, xmm4
+ pshufd xmm4, xmm5, 1 ; 00000001
+ paddd xmm4, xmm5 ; dword 0 = sum of the four partials
+ movd [ebp], xmm4
+ add dword [psqdiff16x16], 4
+
+ add edx, 16
+ sub esi, eax ; undo the 16-row advance ...
+ sub edi, eax
+ add esi, 16 ; ... and step to the next macroblock on the right
+ add edi, 16
+
+ dec ecx
+ jnz sqdiff_width_loop
+
+ mov esi, [tmp_esi]
+ mov edi, [tmp_edi]
+ add esi, eax ; next macroblock row
+ add edi, eax
+
+ dec dword [iPicHeight]
+ jnz sqdiff_height_loop
+
+ mov ebx, [tmp_sadframe]
+ mov eax, [psadframe]
+ mov [eax], ebx
+
+ add esp, localsize
+ pop ebx
+ pop edi
+ pop esi
+ pop ebp
+%undef cur_data
+%undef ref_data
+%undef iPicWidth
+%undef iPicHeight
+%undef iPicStride
+%undef psadframe
+%undef psad8x8
+%undef psum16x16
+%undef psqsum16x16
+%undef psqdiff16x16
+%undef tmp_esi
+%undef tmp_edi
+%undef tmp_sadframe
+%undef pushsize
+%undef localsize
+ ret
+
+
+
+
+
+WELS_EXTERN VAACalcSadBgd_sse2
+;*************************************************************************************************************
+;void VAACalcSadBgd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *p_sd8x8, uint8_t *p_mad8x8)
+;
+; Walks the picture in 16x16 macroblocks (iPicWidth/16 by iPicHeight/16) and writes,
+; per macroblock, one value for each 8x8 quadrant in the order
+; top-left (D0), top-right (D1), bottom-left (D2), bottom-right (D3):
+; psad8x8 : 4 dwords, sum of |cur - ref|;
+; p_sd8x8 : 4 dwords, (sum of cur) - (sum of ref);
+; p_mad8x8 : 4 bytes, maximum of |cur - ref|.
+; *psadframe receives the sum of all 8x8 SADs.
+; Arguments are read from the stack (32-bit build only). The iPicWidth/iPicHeight slots
+; are overwritten and used as counters; the psad8x8/p_sd8x8/p_mad8x8 slots are advanced
+; in place as output cursors.
+; Alignment: rows are loaded with movdqa, and psad8x8/p_sd8x8 are STORED with movdqa,
+; so those two output buffers must be 16-byte aligned as well.
+; Both loops test their counter at the bottom, so width and height are assumed >= 16.
+;*************************************************************************************************************
+
+
+ALIGN 16
+VAACalcSadBgd_sse2:
+%define localsize 12
+%define cur_data esp + pushsize + localsize + 4
+%define ref_data esp + pushsize + localsize + 8
+%define iPicWidth esp + pushsize + localsize + 12
+%define iPicHeight esp + pushsize + localsize + 16
+%define iPicStride esp + pushsize + localsize + 20
+%define psadframe esp + pushsize + localsize + 24
+%define psad8x8 esp + pushsize + localsize + 28
+%define p_sd8x8 esp + pushsize + localsize + 32
+%define p_mad8x8 esp + pushsize + localsize + 36
+%define tmp_esi esp + 0
+%define tmp_edi esp + 4
+%define tmp_ecx esp + 8
+%define pushsize 16
+ push ebp
+ push esi
+ push edi
+ push ebx
+ sub esp, localsize
+ mov esi, [cur_data]
+ mov edi, [ref_data]
+ mov ebx, [iPicStride]
+ mov eax, ebx
+
+ shr dword [iPicWidth], 4 ; iPicWidth/16
+ shr dword [iPicHeight], 4 ; iPicHeight/16
+ shl eax, 4 ; iPicStride*16
+ xor ebp, ebp ; frame SAD accumulator
+ pxor xmm0, xmm0 ; zero register required by the row macro
+bgd_height_loop:
+ mov ecx, dword [iPicWidth]
+ mov [tmp_esi], esi ; remember the start of this macroblock row
+ mov [tmp_edi], edi
+bgd_width_loop:
+ pxor xmm7, xmm7 ; pSad8x8
+ pxor xmm6, xmm6 ; sum_cur_8x8
+ pxor xmm5, xmm5 ; sum_ref_8x8
+ pxor xmm4, xmm4 ; pMad8x8
+ ; rows 0..7 of the macroblock
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+
+
+ mov edx, [p_mad8x8]
+ WELS_MAX_REG_SSE2 xmm4 ; byte 0 = max of left 8x8, byte 8 = max of right 8x8
+
+ mov [tmp_ecx], ecx ; ecx is used as scratch below; restored before the loop test
+ movhlps xmm1, xmm4
+ movd ecx, xmm4
+ mov [edx], cl ; MAD of top-left 8x8
+ movd ecx, xmm1
+ mov [edx+1],cl ; MAD of top-right 8x8
+ add edx, 2
+ mov [p_mad8x8], edx
+
+
+ ; move the top-half results from dwords 0/2 to dwords 1/3 so that the
+ ; bottom-half results can accumulate in dwords 0/2
+ pslldq xmm7, 4
+ pslldq xmm6, 4
+ pslldq xmm5, 4
+
+
+ pxor xmm4, xmm4 ; pMad8x8
+ ; rows 8..15 of the macroblock
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+
+ mov edx, [p_mad8x8]
+ WELS_MAX_REG_SSE2 xmm4
+
+ movhlps xmm1, xmm4
+ movd ecx, xmm4
+ mov [edx], cl ; MAD of bottom-left 8x8
+ movd ecx, xmm1
+ mov [edx+1],cl ; MAD of bottom-right 8x8
+ add edx, 2
+ mov [p_mad8x8], edx
+
+ ; data in xmm7, xmm6, xmm5: D1 D3 D0 D2
+
+ mov edx, [psad8x8]
+ pshufd xmm1, xmm7, 10001101b ; D3 D2 D1 D0
+ movdqa [edx], xmm1 ; aligned store: psad8x8 must be 16-byte aligned
+ add edx, 16
+ mov [psad8x8], edx ; sad8x8
+
+ paddd xmm1, xmm7 ; D1+3 D3+2 D0+1 D2+0
+ pshufd xmm2, xmm1, 00000011b
+ paddd xmm1, xmm2 ; dword 0 = D0+D1+D2+D3
+ movd edx, xmm1
+ add ebp, edx ; sad frame
+
+ mov edx, [p_sd8x8]
+ psubd xmm6, xmm5 ; sum_cur - sum_ref per 8x8
+ pshufd xmm1, xmm6, 10001101b
+ movdqa [edx], xmm1 ; aligned store: p_sd8x8 must be 16-byte aligned
+ add edx, 16
+ mov [p_sd8x8], edx
+
+
+ sub esi, eax ; undo the 16-row advance ...
+ sub edi, eax
+ add esi, 16 ; ... and step to the next macroblock on the right
+ add edi, 16
+
+ mov ecx, [tmp_ecx]
+ dec ecx
+ jnz bgd_width_loop
+
+ mov esi, [tmp_esi]
+ mov edi, [tmp_edi]
+ add esi, eax ; next macroblock row
+ add edi, eax
+
+ dec dword [iPicHeight]
+ jnz bgd_height_loop
+
+ mov edx, [psadframe]
+ mov [edx], ebp
+
+ add esp, localsize
+ pop ebx
+ pop edi
+ pop esi
+ pop ebp
+%undef cur_data
+%undef ref_data
+%undef iPicWidth
+%undef iPicHeight
+%undef iPicStride
+%undef psadframe
+%undef psad8x8
+%undef p_sd8x8
+%undef p_mad8x8
+%undef tmp_esi
+%undef tmp_edi
+%undef tmp_ecx
+%undef pushsize
+%undef localsize
+ ret
+
+
+
+WELS_EXTERN VAACalcSadSsdBgd_sse2
+;*************************************************************************************************************
+;void VAACalcSadSsdBgd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16,
+; int32_t *psqdiff16x16, int32_t *p_sd8x8, uint8_t *p_mad8x8)
+;
+; Walks the picture in 16x16 macroblocks (iPicWidth/16 by iPicHeight/16) and writes,
+; per macroblock:
+; psad8x8 : 4 dwords, sum of |cur - ref| for each 8x8 quadrant
+; (top-left, top-right, bottom-left, bottom-right);
+; p_sd8x8 : 4 dwords, (sum of cur) - (sum of ref) per quadrant, same order;
+; p_mad8x8 : 4 bytes, maximum of |cur - ref| per quadrant, same order;
+; psum16x16 : 1 dword, sum of the cur_data pixels;
+; psqsum16x16 : 1 dword, sum of the squared cur_data pixels;
+; psqdiff16x16 : 1 dword, sum of squared |cur - ref|.
+; *psadframe receives the sum of all 8x8 SADs.
+; Arguments are read from the stack (32-bit build only). The iPicWidth/iPicHeight slots
+; are overwritten and used as counters; every output-pointer slot is advanced in place.
+; The movdqa loads require 16-byte-aligned rows. Both loops test their counter at
+; the bottom, so width and height are assumed to be at least 16.
+;*************************************************************************************************************
+
+
+ALIGN 16
+VAACalcSadSsdBgd_sse2:
+%define localsize 16
+%define cur_data esp + pushsize + localsize + 4
+%define ref_data esp + pushsize + localsize + 8
+%define iPicWidth esp + pushsize + localsize + 12
+%define iPicHeight esp + pushsize + localsize + 16
+%define iPicStride esp + pushsize + localsize + 20
+%define psadframe esp + pushsize + localsize + 24
+%define psad8x8 esp + pushsize + localsize + 28
+%define psum16x16 esp + pushsize + localsize + 32
+%define psqsum16x16 esp + pushsize + localsize + 36
+%define psqdiff16x16 esp + pushsize + localsize + 40
+%define p_sd8x8 esp + pushsize + localsize + 44
+%define p_mad8x8 esp + pushsize + localsize + 48
+%define tmp_esi esp + 0
+%define tmp_edi esp + 4
+%define tmp_sadframe esp + 8
+%define tmp_ecx esp + 12
+%define pushsize 16
+ push ebp
+ push esi
+ push edi
+ push ebx
+ sub esp, localsize
+ mov esi, [cur_data]
+ mov edi, [ref_data]
+ mov ebx, [iPicStride]
+ mov eax, ebx
+
+ shr dword [iPicWidth], 4 ; iPicWidth/16
+ shr dword [iPicHeight], 4 ; iPicHeight/16
+ shl eax, 4 ; iPicStride*16
+ pxor xmm0, xmm0 ; zero register required by the row macro
+ movd [tmp_sadframe], xmm0 ; frame SAD is accumulated in a stack local
+sqdiff_bgd_height_loop:
+ mov ecx, dword [iPicWidth]
+ mov [tmp_esi], esi ; remember the start of this macroblock row
+ mov [tmp_edi], edi
+sqdiff_bgd_width_loop:
+ pxor xmm7, xmm7 ; pSad8x8 interleaves sqsum16x16: sqsum1 sad1 sqsum0 sad0
+ pxor xmm6, xmm6 ; sum_8x8 interleaves cur and pRef in Dword, Sref1 Scur1 Sref0 Scur0
+ pxor xmm5, xmm5 ; pMad8x8
+ pxor xmm4, xmm4 ; sqdiff_16x16 four Dword
+ ; rows 0..7 of the macroblock
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+
+ mov edx, [psad8x8]
+ movdqa xmm2, xmm7
+ pshufd xmm1, xmm2, 00001110b ; bring dwords 2/3 down to dwords 0/1
+ movd [edx], xmm2 ; SAD of top-left 8x8
+ movd [edx+4], xmm1 ; SAD of top-right 8x8
+ add edx, 8
+ mov [psad8x8], edx ; sad8x8
+
+ paddd xmm1, xmm2
+ movd edx, xmm1
+ add [tmp_sadframe], edx ; iFrameSad
+
+ mov edx, [psum16x16]
+ movdqa xmm1, xmm6
+ pshufd xmm2, xmm1, 00001110b
+ paddd xmm1, xmm2 ; dword 0 = Scur0 + Scur1
+ movd [edx], xmm1 ; sum (top half; the bottom half is added further down)
+
+ mov edx, [p_sd8x8]
+ pshufd xmm1, xmm6, 11110101b ; Sref1 Sref1 Sref0 Sref0
+ psubd xmm6, xmm1 ; 00 diff1 00 diff0
+ pshufd xmm1, xmm6, 00001000b ; xx xx diff1 diff0
+ movq [edx], xmm1
+ add edx, 8
+ mov [p_sd8x8], edx
+
+ mov edx, [p_mad8x8]
+ WELS_MAX_REG_SSE2 xmm5 ; byte 0 = max of left 8x8, byte 8 = max of right 8x8
+ mov [tmp_ecx], ecx ; ecx is used as scratch below; restored before the loop test
+ movhlps xmm1, xmm5
+ movd ecx, xmm5
+ mov [edx], cl ; MAD of top-left 8x8
+ movd ecx, xmm1
+ mov [edx+1],cl ; MAD of top-right 8x8
+ add edx, 2
+ mov [p_mad8x8], edx
+
+ psrlq xmm7, 32
+ psllq xmm7, 32 ; clear sad
+ pxor xmm6, xmm6 ; sum_8x8 interleaves cur and pRef in Dword, Sref1 Scur1 Sref0 Scur0
+ pxor xmm5, xmm5 ; pMad8x8
+ ; rows 8..15 of the macroblock (sqsum in xmm7 and sqdiff in xmm4 keep accumulating)
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+
+ mov edx, [psad8x8]
+ movdqa xmm2, xmm7
+ pshufd xmm1, xmm2, 00001110b
+ movd [edx], xmm2 ; SAD of bottom-left 8x8
+ movd [edx+4], xmm1 ; SAD of bottom-right 8x8
+ add edx, 8
+ mov [psad8x8], edx ; sad8x8
+
+ paddd xmm1, xmm2
+ movd edx, xmm1
+ add [tmp_sadframe], edx ; iFrameSad
+
+ mov edx, [psum16x16]
+ movdqa xmm1, xmm6
+ pshufd xmm2, xmm1, 00001110b
+ paddd xmm1, xmm2
+ movd ebp, xmm1 ; sum
+ add [edx], ebp ; add the bottom half to the top half stored earlier
+ add edx, 4
+ mov [psum16x16], edx
+
+ mov edx, [psqsum16x16]
+ psrlq xmm7, 32 ; bring the sqsum partials down to dwords 0/2
+ pshufd xmm2, xmm7, 00001110b
+ paddd xmm2, xmm7
+ movd [edx], xmm2 ; sqsum
+ add edx, 4
+ mov [psqsum16x16], edx
+
+ mov edx, [p_sd8x8]
+ pshufd xmm1, xmm6, 11110101b ; Sref1 Sref1 Sref0 Sref0
+ psubd xmm6, xmm1 ; 00 diff1 00 diff0
+ pshufd xmm1, xmm6, 00001000b ; xx xx diff1 diff0
+ movq [edx], xmm1
+ add edx, 8
+ mov [p_sd8x8], edx
+
+ mov edx, [p_mad8x8]
+ WELS_MAX_REG_SSE2 xmm5
+ movhlps xmm1, xmm5
+ movd ecx, xmm5
+ mov [edx], cl ; MAD of bottom-left 8x8
+ movd ecx, xmm1
+ mov [edx+1],cl ; MAD of bottom-right 8x8
+ add edx, 2
+ mov [p_mad8x8], edx
+
+ mov edx, [psqdiff16x16]
+ pshufd xmm1, xmm4, 00001110b
+ paddd xmm4, xmm1
+ pshufd xmm1, xmm4, 00000001b
+ paddd xmm4, xmm1 ; dword 0 = sum of the four partials
+ movd [edx], xmm4
+ add edx, 4
+ mov [psqdiff16x16], edx
+
+ sub esi, eax ; undo the 16-row advance ...
+ sub edi, eax
+ add esi, 16 ; ... and step to the next macroblock on the right
+ add edi, 16
+
+ mov ecx, [tmp_ecx]
+ dec ecx
+ jnz sqdiff_bgd_width_loop
+
+ mov esi, [tmp_esi]
+ mov edi, [tmp_edi]
+ add esi, eax ; next macroblock row
+ add edi, eax
+
+ dec dword [iPicHeight]
+ jnz sqdiff_bgd_height_loop
+
+ mov edx, [psadframe]
+ mov ebp, [tmp_sadframe]
+ mov [edx], ebp
+
+ add esp, localsize
+ pop ebx
+ pop edi
+ pop esi
+ pop ebp
+%undef cur_data
+%undef ref_data
+%undef iPicWidth
+%undef iPicHeight
+%undef iPicStride
+%undef psadframe
+%undef psad8x8
+%undef psum16x16
+%undef psqsum16x16
+%undef psqdiff16x16
+%undef p_sd8x8
+%undef p_mad8x8
+%undef tmp_esi
+%undef tmp_edi
+%undef tmp_sadframe
+%undef tmp_ecx
+%undef pushsize
+%undef localsize
+ ret
+%endif
--- /dev/null
+++ b/codec/processing/src/backgounddetection/BackgroundDetection.cpp
@@ -1,0 +1,389 @@
+/*!
+ * \copy
+ * Copyright (c) 2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "BackgroundDetection.h"
+#include "../common/cpu.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+#define LOG2_BGD_OU_SIZE (4) // log2 of the luma edge length of one OU (the unit classified as background/foreground)
+#define LOG2_BGD_OU_SIZE_UV (LOG2_BGD_OU_SIZE-1) // log2 of the chroma edge length of one OU
+#define BGD_OU_SIZE (1<<LOG2_BGD_OU_SIZE) // luma edge length of one OU: 16 samples
+#define BGD_OU_SIZE_UV (BGD_OU_SIZE>>1) // chroma edge length of one OU: 8 samples
+#define BGD_THD_SAD (2*BGD_OU_SIZE*BGD_OU_SIZE) // SAD threshold of the coarse division: 512
+#define BGD_THD_ASD_UV (4*BGD_OU_SIZE_UV) // threshold on the absolute summed chroma edge difference: 32
+#define LOG2_MB_SIZE (4) // log2 of the macroblock edge length
+#define OU_SIZE_IN_MB (BGD_OU_SIZE >> 4) // macroblocks per OU edge: 1
+#define Q_FACTOR (8) // BGD_OU_SIZE * Q_FACTOR (128) is the low SAD / SD-spread threshold used below
+#define BGD_DELTA_QP_THD (3) // NOTE(review): not referenced in this file - confirm it is still needed
+// Bit flags naming the four sides of an OU; combined into the mask given to ForegroundDilation23Chroma().
+#define OU_LEFT (0x01)
+#define OU_RIGHT (0x02)
+#define OU_TOP (0x04)
+#define OU_BOTTOM (0x08)
+
+CBackgroundDetection::CBackgroundDetection (int32_t iCpuFlag) : m_iLargestFrameSize (0) {
+  // iCpuFlag is not used here. All of m_BgdParam starts zeroed: no OU array, no buffers from Set().
+  WelsMemset (&m_BgdParam, 0, sizeof (m_BgdParam));
+  m_eMethod = METHOD_BACKGROUND_DETECTION;
+}
+
+CBackgroundDetection::~CBackgroundDetection() {
+ FreeOUArrayMemory(); // releases m_BgdParam.pOU_array, the only buffer this object allocates itself
+}
+
+// Classifies one frame: binds the plane pointers, size and strides of the current (pSrcPixMap) and
+// reference (pRefPixMap) pictures, makes sure the OU array is large enough, then runs both passes.
+// iType is ignored. Returns RET_INVALIDPARAM for a NULL pixmap, when Set() has not yet supplied the
+// statistics/flag buffers, or when the OU array cannot be allocated; RET_SUCCESS otherwise.
+EResult CBackgroundDetection::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pRefPixMap) {
+  if (pSrcPixMap == NULL || pRefPixMap == NULL)
+    return RET_INVALIDPARAM;
+  // Both pointers are NULL after construction and are dereferenced by the detection passes.
+  if (m_BgdParam.pCalcRes == NULL || m_BgdParam.pBackgroundMbFlag == NULL)
+    return RET_INVALIDPARAM;
+
+  for (int32_t i = 0; i < 3; i++) {
+    m_BgdParam.pCur[i] = (uint8_t*)pSrcPixMap->pPixel[i];
+    m_BgdParam.pRef[i] = (uint8_t*)pRefPixMap->pPixel[i];
+    m_BgdParam.iStride[i] = pSrcPixMap->iStride[i];
+  }
+  m_BgdParam.iBgdWidth = pSrcPixMap->sRect.iRectWidth;
+  m_BgdParam.iBgdHeight = pSrcPixMap->sRect.iRectHeight;
+
+  // The OU array only grows; a failed allocation is not remembered as the largest size served.
+  const int32_t iCurFrameSize = m_BgdParam.iBgdWidth * m_BgdParam.iBgdHeight;
+  if (m_BgdParam.pOU_array == NULL || iCurFrameSize > m_iLargestFrameSize) {
+    FreeOUArrayMemory();
+    m_BgdParam.pOU_array = AllocateOUArrayMemory (m_BgdParam.iBgdWidth, m_BgdParam.iBgdHeight);
+    m_iLargestFrameSize = (m_BgdParam.pOU_array != NULL) ? iCurFrameSize : 0;
+  }
+  if (m_BgdParam.pOU_array == NULL)
+    return RET_INVALIDPARAM;
+
+  BackgroundDetection (&m_BgdParam);
+  return RET_SUCCESS;
+}
+
+// Registers the caller's buffers: pParam is read as an SBGDInterface whose pCalcRes and
+// pBackgroundMbFlag pointers are stored (not copied) for later Process() calls. iType is ignored.
+EResult CBackgroundDetection::Set (int32_t iType, void* pParam) {
+  SBGDInterface* pInterface = (SBGDInterface*)pParam;
+  if (pInterface == NULL)
+    return RET_INVALIDPARAM;
+
+  m_BgdParam.pCalcRes = pInterface->pCalcRes;
+  m_BgdParam.pBackgroundMbFlag = (int8_t*)pInterface->pBackgroundMbFlag;
+
+  return RET_SUCCESS;
+}
+
+inline SBackgroundOU* CBackgroundDetection::AllocateOUArrayMemory (int32_t iWidth, int32_t iHeight) {
+  // One SBackgroundOU per OU, with both picture dimensions rounded up to whole OUs.
+  const int32_t iOUCount = ((BGD_OU_SIZE - 1 + iWidth) >> LOG2_BGD_OU_SIZE) * ((BGD_OU_SIZE - 1 + iHeight) >> LOG2_BGD_OU_SIZE);
+  return (SBackgroundOU*)WelsMalloc (iOUCount * sizeof (SBackgroundOU));
+}
+
+inline void CBackgroundDetection::FreeOUArrayMemory() {
+ _SafeFree (m_BgdParam.pOU_array); // NOTE(review): _SafeFree is defined elsewhere; presumably frees a non-NULL pointer and resets it - confirm
+}
+
+// Derives the statistics of one OU from the four 8x8 sub-block entries stored at row iMbIndex of the
+// pSad8x8, pSumOfDiff8x8 and pMad8x8 tables in *sVaaCalcInfo, and writes them to *pBgdOU:
+//   iSAD          - sum of the four sub-block SADs
+//   iSD           - absolute value of the sum of the four sub-block sums of differences
+//   iMAD          - largest of the four sub-block MADs
+//   iMinSubMad    - smallest of the four sub-block MADs
+//   iMaxDiffSubSd - largest minus smallest (signed) sub-block sum of differences
+// iMbWidth is not used; pBgdOU->iBackgroundFlag is left untouched.
+void CBackgroundDetection::GetOUParameters (SVAACalcResult* sVaaCalcInfo, int32_t iMbIndex, int32_t iMbWidth,
+    SBackgroundOU* pBgdOU) {
+  int32_t (*pSad8x8)[4] = sVaaCalcInfo->pSad8x8;
+  int32_t (*pSd8x8)[4] = sVaaCalcInfo->pSumOfDiff8x8;
+  uint8_t (*pMad8x8)[4] = sVaaCalcInfo->pMad8x8;
+
+  // Seed the running extremes with sub-block 0 so the loop needs no sentinel values.
+  int32_t iSadSum = 0;
+  int32_t iSdSum = 0;
+  int32_t iSdMax = pSd8x8[iMbIndex][0];
+  int32_t iSdMin = iSdMax;
+  int32_t iMadMax = pMad8x8[iMbIndex][0];
+  int32_t iMadMin = iMadMax;
+
+  for (int32_t k = 0; k < 4; k++) {
+    const int32_t iSd = pSd8x8[iMbIndex][k];
+    const int32_t iMad = pMad8x8[iMbIndex][k];
+
+    iSadSum += pSad8x8[iMbIndex][k];
+    iSdSum += iSd;
+    if (iSd > iSdMax) iSdMax = iSd;
+    if (iSd < iSdMin) iSdMin = iSd;
+    if (iMad > iMadMax) iMadMax = iMad;
+    if (iMad < iMadMin) iMadMin = iMad;
+  }
+
+  // All inputs have been read; publish the results.
+  pBgdOU->iSAD = iSadSum;
+  pBgdOU->iSD = WELS_ABS (iSdSum);
+  pBgdOU->iMAD = iMadMax;
+  pBgdOU->iMinSubMad = iMadMin;
+  pBgdOU->iMaxDiffSubSd = iSdMax - iSdMin;
+}
+
+// First pass: fills every OU of pBgdParam->pOU_array from the VAA statistics and gives it an initial
+// iBackgroundFlag (1 = background, 0 = foreground). Only whole OUs are visited (dimensions round down).
+void CBackgroundDetection::ForegroundBackgroundDivision (vBGDParam* pBgdParam) {
+  const int32_t kiWidthInOU = pBgdParam->iBgdWidth >> LOG2_BGD_OU_SIZE;
+  const int32_t kiHeightInOU = pBgdParam->iBgdHeight >> LOG2_BGD_OU_SIZE;
+  const int32_t kiWidthInMb = (15 + pBgdParam->iBgdWidth) >> 4;
+  SBackgroundOU* pOU = pBgdParam->pOU_array;
+
+  for (int32_t iRow = 0; iRow < kiHeightInOU; iRow++) {
+    for (int32_t iCol = 0; iCol < kiWidthInOU; iCol++, pOU++) {
+      const int32_t kiMbIndex = (iRow * kiWidthInMb + iCol) << (LOG2_BGD_OU_SIZE - LOG2_MB_SIZE);
+      GetOUParameters (pBgdParam->pCalcRes, kiMbIndex, kiWidthInMb, pOU);
+
+      pOU->iBackgroundFlag = 0;  // foreground unless one of the rules below says otherwise
+      if (pOU->iMAD > 63)
+        continue;
+
+      // A background candidate needs a small spread of sub-block SDs and a SAD below 2 * BGD_THD_SAD.
+      const bool bSmallSdSpread = pOU->iMaxDiffSubSd <= (pOU->iSAD >> 3) || pOU->iMaxDiffSubSd <= (BGD_OU_SIZE * Q_FACTOR);
+      if (!bSmallSdSpread || pOU->iSAD >= (BGD_THD_SAD << 1))
+        continue;
+
+      if (pOU->iSAD <= BGD_OU_SIZE * Q_FACTOR) {
+        pOU->iBackgroundFlag = 1;
+      } else if (pOU->iSAD < BGD_THD_SAD) {
+        pOU->iBackgroundFlag = pOU->iSD < ((pOU->iSAD * 3) >> 2);
+      } else {
+        pOU->iBackgroundFlag = (pOU->iSD << 1) < pOU->iSAD;
+      }
+    }
+  }
+}
+inline int32_t CBackgroundDetection::CalculateAsdChromaEdge (uint8_t* pOriRef, uint8_t* pOriCur, int32_t iStride) {
+  // Absolute value of the summed (cur - ref) differences over BGD_OU_SIZE_UV samples spaced iStride apart.
+  int32_t iSumDiff = 0;
+  for (int32_t iLeft = BGD_OU_SIZE_UV; iLeft > 0; iLeft--) {
+    iSumDiff += *pOriCur - *pOriRef;
+    pOriCur += iStride;
+    pOriRef += iStride;
+  }
+  return WELS_ABS (iSumDiff);
+}
+
+// Luma test shared by dilation and erosion. Returns 0 unless the OU's MAD exceeds twice its smallest
+// sub-block MAD; otherwise compares the OU against the largest MAD found among its foreground and
+// among its background neighbours (order in pOUNeighbours: left, right, top, bottom).
+inline bool_t CBackgroundDetection::ForegroundDilation23Luma (SBackgroundOU* pBackgroundOU,
+    SBackgroundOU* pOUNeighbours[]) {
+  const int32_t kiMad = pBackgroundOU->iMAD;
+  const int32_t kiMinSubMad = pBackgroundOU->iMinSubMad;
+  if (kiMad <= (kiMinSubMad << 1))
+    return 0;
+
+  // For flags of 0 and 1, (flag - 1) is an all-ones mask when flag == 0 and zero when flag == 1, so
+  // each neighbour's MAD feeds exactly one of the two maxima and contributes 0 to the other.
+  int32_t iMaxNbrForegroundMad = (pOUNeighbours[0]->iBackgroundFlag - 1) & pOUNeighbours[0]->iMAD;
+  int32_t iMaxNbrBackgroundMad = ((!pOUNeighbours[0]->iBackgroundFlag) - 1) & pOUNeighbours[0]->iMAD;
+  for (int32_t k = 1; k < 4; k++) {
+    const SBackgroundOU* pNbr = pOUNeighbours[k];
+    const int32_t iFgMad = (pNbr->iBackgroundFlag - 1) & pNbr->iMAD;
+    const int32_t iBgMad = ((!pNbr->iBackgroundFlag) - 1) & pNbr->iMAD;
+    if (iFgMad > iMaxNbrForegroundMad)
+      iMaxNbrForegroundMad = iFgMad;
+    if (iBgMad > iMaxNbrBackgroundMad)
+      iMaxNbrBackgroundMad = iBgMad;
+  }
+
+  // Nonzero when a foreground neighbour's MAD exceeds four times this OU's smallest sub-block MAD, or
+  // when this OU's MAD is above twice the background maximum and at most 1.5x the foreground maximum.
+  const bool bStrongForegroundNbr = iMaxNbrForegroundMad > (kiMinSubMad << 2);
+  const bool bBetweenNbrs = kiMad > (iMaxNbrBackgroundMad << 1)
+                            && kiMad <= ((iMaxNbrForegroundMad * 3) >> 1);
+
+  return bStrongForegroundNbr || bBetweenNbrs;
+}
+
+// Chroma edge test. For every side whose bit is set in iNeighbourForegroundFlags, the matching edge
+// row/column of the chroma OU starting at iStartSamplePos is compared between the reference and the
+// current picture. Returns 1 as soon as one edge differs by more than BGD_THD_ASD_UV, otherwise 0.
+inline bool_t CBackgroundDetection::ForegroundDilation23Chroma (int8_t iNeighbourForegroundFlags,
+    int32_t iStartSamplePos, int32_t iPicStrideUV, vBGDParam* pBgdParam) {
+  static const int8_t kaOUPos[4] = {OU_LEFT, OU_RIGHT, OU_TOP, OU_BOTTOM};
+  // V component (plane 2) first, high probability because V stands for red color and human skin
+  // colors have more weight on this component; then U (plane 1), which stands for blue color.
+  static const int32_t kaPlaneOrder[2] = {2, 1};
+  // Offset of the first sample of the left, right, top and bottom edge inside the chroma OU.
+  const int32_t aEdgeOffset[4] = {0, BGD_OU_SIZE_UV - 1, 0, iPicStrideUV* (BGD_OU_SIZE_UV - 1)};
+  // Left/right edges step by one picture row per sample, top/bottom edges by one sample.
+  const int32_t aSampleStep[4] = {iPicStrideUV, iPicStrideUV, 1, 1};
+
+  for (int32_t p = 0; p < 2; p++) {
+    const int32_t iPlane = kaPlaneOrder[p];
+    for (int32_t iSide = 0; iSide < 4; iSide++) {
+      if (! (iNeighbourForegroundFlags & kaOUPos[iSide]))
+        continue;
+      uint8_t* pRefC = pBgdParam->pRef[iPlane] + iStartSamplePos + aEdgeOffset[iSide];
+      uint8_t* pCurC = pBgdParam->pCur[iPlane] + iStartSamplePos + aEdgeOffset[iSide];
+      if (CalculateAsdChromaEdge (pRefC, pCurC, aSampleStep[iSide]) > BGD_THD_ASD_UV)
+        return 1;
+    }
+  }
+
+  return 0;
+}
+
+// Dilation step for an OU that the caller found flagged as background. OUs whose SAD does not exceed
+// BGD_OU_SIZE * Q_FACTOR are left alone. Otherwise the number of background neighbours decides:
+// 0 or 1 -> the OU becomes foreground; 2 or 3 -> the luma test and, if that still says background,
+// the chroma edge test decide; any other count leaves the flag unchanged.
+inline void CBackgroundDetection::ForegroundDilation (SBackgroundOU* pBackgroundOU, SBackgroundOU* pOUNeighbours[],
+    vBGDParam* pBgdParam, int32_t iChromaSampleStartPos) {
+  if (pBackgroundOU->iSAD <= BGD_OU_SIZE * Q_FACTOR)
+    return;
+
+  const int32_t kiBgNeighbours = pOUNeighbours[0]->iBackgroundFlag + pOUNeighbours[1]->iBackgroundFlag +
+                                 pOUNeighbours[2]->iBackgroundFlag + pOUNeighbours[3]->iBackgroundFlag;
+  if (kiBgNeighbours == 0 || kiBgNeighbours == 1) {
+    pBackgroundOU->iBackgroundFlag = 0;
+    return;
+  }
+  if (kiBgNeighbours != 2 && kiBgNeighbours != 3)
+    return;
+
+  pBackgroundOU->iBackgroundFlag = !ForegroundDilation23Luma (pBackgroundOU, pOUNeighbours);
+  if (pBackgroundOU->iBackgroundFlag != 1)
+    return;
+
+  // chroma component check: bit k of the mask is set when neighbour k is foreground
+  int8_t iNeighbourForegroundFlags = 0;
+  for (int32_t k = 0; k < 4; k++)
+    iNeighbourForegroundFlags |= (!pOUNeighbours[k]->iBackgroundFlag) << k;
+  pBackgroundOU->iBackgroundFlag = !ForegroundDilation23Chroma (iNeighbourForegroundFlags, iChromaSampleStartPos,
+                                   pBgdParam->iStride[1], pBgdParam);
+}
+// Erosion step for an OU that the caller found flagged as foreground: it may be turned into
+// background when its SD spread is small and its SAD is low relative to its background neighbours.
+inline void CBackgroundDetection::BackgroundErosion (SBackgroundOU* pBackgroundOU, SBackgroundOU* pOUNeighbours[]) {
+  if (pBackgroundOU->iMaxDiffSubSd > (BGD_OU_SIZE * Q_FACTOR))
+    return;
+
+  int32_t iBgCount = 0, iBgSadSum = 0;
+  for (int32_t k = 0; k < 4; k++) {
+    iBgCount += pOUNeighbours[k]->iBackgroundFlag;
+    iBgSadSum += pOUNeighbours[k]->iSAD & (-pOUNeighbours[k]->iBackgroundFlag);  // keeps the SAD only when the flag is 1
+  }
+  if (pBackgroundOU->iSAD * iBgCount > ((3 * iBgSadSum) >> 1))
+    return;
+
+  if (iBgCount == 4) {
+    pBackgroundOU->iBackgroundFlag = 1;
+  } else if ((pOUNeighbours[0]->iBackgroundFlag & pOUNeighbours[1]->iBackgroundFlag) || (pOUNeighbours[2]->iBackgroundFlag & pOUNeighbours[3]->iBackgroundFlag)) {
+    pBackgroundOU->iBackgroundFlag = !ForegroundDilation23Luma (pBackgroundOU, pOUNeighbours);
+  }
+}
+
+inline void CBackgroundDetection::SetBackgroundMbFlag (int8_t* pBackgroundMbFlag, int32_t iPicWidthInMb,
+    int32_t iBackgroundMbFlag) {
+  pBackgroundMbFlag[0] = (int8_t)iBackgroundMbFlag;  // writes a single flag byte; iPicWidthInMb is not used
+}
+
+inline void CBackgroundDetection::UpperOUForegroundCheck (SBackgroundOU* pCurOU, int8_t* pBackgroundMbFlag,
+    int32_t iPicWidthInOU, int32_t iPicWidthInMb) {
+  // Clears the flag of pCurOU (and the byte at pBackgroundMbFlag) when its SAD exceeds
+  // BGD_OU_SIZE * Q_FACTOR and at most one of its left/right/upper/lower array neighbours is background.
+  if (pCurOU->iSAD <= BGD_OU_SIZE * Q_FACTOR)
+    return;
+  const int32_t kiBgNeighbours = (pCurOU - 1)->iBackgroundFlag + (pCurOU + 1)->iBackgroundFlag +
+                                 (pCurOU - iPicWidthInOU)->iBackgroundFlag + (pCurOU + iPicWidthInOU)->iBackgroundFlag;
+  if (kiBgNeighbours <= 1) {
+    pCurOU->iBackgroundFlag = 0;
+    SetBackgroundMbFlag (pBackgroundMbFlag, iPicWidthInMb, 0);
+  }
+}
+
+void CBackgroundDetection::ForegroundDilationAndBackgroundErosion (vBGDParam* pBgdParam) { // second pass: refine every OU flag from its four neighbours and write it to pBackgroundMbFlag
+ int32_t iPicStrideUV = pBgdParam->iStride[1];
+ int32_t iPicWidthInOU = pBgdParam->iBgdWidth >> LOG2_BGD_OU_SIZE;
+ int32_t iPicHeightInOU = pBgdParam->iBgdHeight >> LOG2_BGD_OU_SIZE;
+ int32_t iOUStrideUV = iPicStrideUV << (LOG2_BGD_OU_SIZE - 1); // chroma sample offset between two vertically adjacent OUs
+ int32_t iPicWidthInMb = (15 + pBgdParam->iBgdWidth) >> 4;
+ // OUs are visited in raster order; at a picture border the missing neighbour is replaced by the OU itself.
+ SBackgroundOU* pBackgroundOU = pBgdParam->pOU_array;
+ int8_t* pVaaBackgroundMbFlag = (int8_t*)pBgdParam->pBackgroundMbFlag;
+ SBackgroundOU* pOUNeighbours[4];//0: left; 1: right; 2: top; 3: bottom
+ // (the assignment below repeats the initialisation of pBackgroundOU above)
+ pBackgroundOU = pBgdParam->pOU_array;
+ pOUNeighbours[2] = pBackgroundOU;//top OU: advances in step with pBackgroundOU, so in row 0 it is the OU itself
+ for (int32_t j = 0; j < iPicHeightInOU; j ++) {
+ int8_t* pRowSkipFlag = pVaaBackgroundMbFlag; // output flag of the current OU within this row
+ pOUNeighbours[0] = pBackgroundOU;//left OU: the OU itself in the first column
+ pOUNeighbours[3] = pBackgroundOU + (iPicWidthInOU & ((j == iPicHeightInOU - 1) - 1)); //bottom OU: the mask is 0 on the last row (OU itself), all ones otherwise (next row)
+ for (int32_t i = 0; i < iPicWidthInOU; i++) {
+ pOUNeighbours[1] = pBackgroundOU + (i < iPicWidthInOU - 1); //right OU: the OU itself in the last column
+ // OUs currently marked background may be dilated to foreground; foreground OUs may be eroded to background
+ if (pBackgroundOU->iBackgroundFlag)
+ ForegroundDilation (pBackgroundOU, pOUNeighbours, pBgdParam, j * iOUStrideUV + (i << LOG2_BGD_OU_SIZE_UV));
+ else
+ BackgroundErosion (pBackgroundOU, pOUNeighbours);
+
+ // check the up OU again now that the OU below it is decided: only from the third row on and for interior columns
+ if (j > 1 && i > 0 && i < iPicWidthInOU - 1 && pOUNeighbours[2]->iBackgroundFlag == 1) {
+ UpperOUForegroundCheck (pOUNeighbours[2], pRowSkipFlag - OU_SIZE_IN_MB * iPicWidthInMb, iPicWidthInOU, iPicWidthInMb); // second argument addresses the flag one row up
+ }
+ // publish the flag of the current OU
+ SetBackgroundMbFlag (pRowSkipFlag, iPicWidthInMb, pBackgroundOU->iBackgroundFlag);
+
+ // preparation for the next OU: the current OU becomes the left neighbour, the other pointers move one OU right
+ pRowSkipFlag += OU_SIZE_IN_MB;
+ pOUNeighbours[0] = pBackgroundOU;
+ pOUNeighbours[2]++;
+ pOUNeighbours[3]++;
+ pBackgroundOU++;
+ }
+ pOUNeighbours[2] = pBackgroundOU - iPicWidthInOU; // top neighbour for the next row = first OU of the row just finished
+ pVaaBackgroundMbFlag += OU_SIZE_IN_MB * iPicWidthInMb; // first output flag of the next row
+ }
+}
+
+void CBackgroundDetection::BackgroundDetection (vBGDParam* pBgdParam) {
+ // Two passes over pBgdParam->pOU_array; the second one also writes pBgdParam->pBackgroundMbFlag.
+ // 1st step: foreground/background coarse division from the per-macroblock statistics in pCalcRes
+ ForegroundBackgroundDivision (pBgdParam);
+ // 2nd step: foreground dilation and background erosion based on the four neighbouring OUs
+ ForegroundDilationAndBackgroundErosion (pBgdParam);
+}
+
+WELSVP_NAMESPACE_END
--- /dev/null
+++ b/codec/processing/src/backgounddetection/BackgroundDetection.h
@@ -1,0 +1,106 @@
+/*!
+ * \copy
+ * Copyright (c) 2011-2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file : BackgroundDetection.h
+ *
+ * \brief : background detection class of wels video processor class
+ *
+ * \date : 2011/03/17
+ *
+ * \description : 1. rewrite the package code of background detection class
+ *
+ */
+
+#ifndef WELSVP_BACKGROUNDDETECTION_H
+#define WELSVP_BACKGROUNDDETECTION_H
+
+#include "../common/util.h"
+#include "../common/memory.h"
+#include "../common/WelsFrameWork.h"
+#include "../../interface/IWelsVP.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+typedef struct { // per-OU statistics and classification result used by CBackgroundDetection
+ int32_t iBackgroundFlag; // 1 = background, 0 = foreground
+ int32_t iSAD; // sum of the four 8x8 sub-block SAD values
+ int32_t iSD; // absolute value of the sum of the four 8x8 sub-block sums of differences
+ int32_t iMAD; // largest of the four 8x8 sub-block MAD values
+ int32_t iMinSubMad; // smallest of the four 8x8 sub-block MAD values
+ int32_t iMaxDiffSubSd; // largest minus smallest (signed) 8x8 sub-block sum of differences
+} SBackgroundOU;
+
+class CBackgroundDetection : public IStrategy { // strategy that marks each OU of a frame as background or foreground
+ public:
+ CBackgroundDetection (int32_t iCpuFlag); // iCpuFlag is accepted but not used by the implementation
+ ~CBackgroundDetection(); // frees the OU array
+ // Process: pSrc is the current picture, pRef the reference picture; Set: pParam is read as an SBGDInterface.
+ EResult Process (int32_t iType, SPixMap* pSrc, SPixMap* pRef);
+ EResult Set (int32_t iType, void* pParam);
+
+ private:
+ struct vBGDParam { // everything the detection passes need for one frame
+ uint8_t* pCur[3]; // plane pointers of the current picture
+ uint8_t* pRef[3]; // plane pointers of the reference picture
+ int32_t iBgdWidth; // picture width taken from the source rectangle
+ int32_t iBgdHeight; // picture height taken from the source rectangle
+ int32_t iStride[3]; // per-plane strides of the current picture (also used to address the reference planes)
+ SBackgroundOU* pOU_array; // one entry per OU in raster order; allocated and freed by this class
+ int8_t* pBackgroundMbFlag; // output flags, supplied through Set()
+ SVAACalcResult* pCalcRes; // input statistics, supplied through Set()
+ } m_BgdParam;
+ // width * height of the frame for which pOU_array was last (re)allocated
+ int32_t m_iLargestFrameSize;
+
+ private:
+ inline SBackgroundOU* AllocateOUArrayMemory (int32_t iWidth, int32_t iHeight); // one entry per OU, dimensions rounded up
+ inline void FreeOUArrayMemory();
+ inline int32_t CalculateAsdChromaEdge (uint8_t* pOriRef, uint8_t* pOriCur, int32_t iStride); // |sum(cur - ref)| along one chroma edge
+ inline bool_t ForegroundDilation23Luma (SBackgroundOU* pBackgroundOU,
+ SBackgroundOU* pOUNeighbours[]); // luma MAD test, used for 2 or 3 background neighbours (Foreground_Dilation_2_3_Luma)
+ inline bool_t ForegroundDilation23Chroma (int8_t iNeighbourForegroundFlags, int32_t iStartSamplePos,
+ int32_t iPicStrideUV, vBGDParam* pBgdParam);// chroma edge test on the sides flagged in the mask (Foreground_Dilation_2_3_Chroma)
+ inline void ForegroundDilation (SBackgroundOU* pBackgroundOU, SBackgroundOU* pOUNeighbours[], vBGDParam* pBgdParam,
+ int32_t iChromaSampleStartPos);
+ inline void BackgroundErosion (SBackgroundOU* pBackgroundOU, SBackgroundOU* pOUNeighbours[]);
+ inline void SetBackgroundMbFlag (int8_t* pBackgroundMbFlag, int32_t iPicWidthInMb, int32_t iBackgroundMbFlag);
+ inline void UpperOUForegroundCheck (SBackgroundOU* pCurOU, int8_t* pBackgroundMbFlag, int32_t iPicWidthInOU,
+ int32_t iPicWidthInMb);
+ // Per-frame passes, called in this order by BackgroundDetection().
+ void GetOUParameters (SVAACalcResult* sVaaCalcInfo, int32_t iMbIndex, int32_t iMbWidth,
+ SBackgroundOU* pBackgroundOU);
+ void ForegroundBackgroundDivision (vBGDParam* pBgdParam);
+ void ForegroundDilationAndBackgroundErosion (vBGDParam* pBgdParam);
+ void BackgroundDetection (vBGDParam* pBgdParam);
+};
+
+WELSVP_NAMESPACE_END
+
+#endif
--- /dev/null
+++ b/codec/processing/src/common/WelsFrameWork.cpp
@@ -1,0 +1,301 @@
+/*!
+ * \copy
+ * Copyright (c) 2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "WelsFrameWork.h"
+#include "cpu.h"
+#include "../denoise/denoise.h"
+#include "../downsample/downsample.h"
+#include "../scenechangedetection/SceneChangeDetection.h"
+#include "../vaacalc/vaacalculation.h"
+#include "../backgounddetection/BackgroundDetection.h"
+#include "../adaptivequantization/AdaptiveQuantization.h"
+#include "../complexityanalysis/ComplexityAnalysis.h"
+#include "../imagerotate/imagerotate.h"
+
+
+/* interface API implement */
+// Creates a processing context in *ppCtx: bit 0x8000 of iVersion selects the IWelsVP overload, any of
+// the bits 0x7fff the IWelsVPc overload. Returns RET_INVALIDPARAM for a NULL ppCtx or when neither is set.
+EResult WELSAPI CreateVpInterface (void** ppCtx, int iVersion) {
+  if (ppCtx == NULL || (iVersion & 0xffff) == 0)  // nowhere to store the context, or no known version bit
+    return RET_INVALIDPARAM;
+  if (iVersion & 0x8000)
+    return nsWelsVP::CreateSpecificVpInterface ((IWelsVP**)ppCtx);
+  return nsWelsVP::CreateSpecificVpInterface ((IWelsVPc**)ppCtx);
+}
+
+EResult WELSAPI DestroyVpInterface (void* pCtx, int iVersion) {
+  // Same version dispatch as CreateVpInterface: 0x8000 -> IWelsVP, else any of 0x7fff -> IWelsVPc.
+  if ((iVersion & 0xffff) == 0)
+    return RET_INVALIDPARAM;
+  if (iVersion & 0x8000)
+    return nsWelsVP::DestroySpecificVpInterface ((IWelsVP*)pCtx);
+  return nsWelsVP::DestroySpecificVpInterface ((IWelsVPc*)pCtx);
+}
+
+WELSVP_NAMESPACE_BEGIN
+
+///////////////////////////////////////////////////////////////////////
+
+EResult CreateSpecificVpInterface (IWelsVP** ppCtx) {
+  // Builds a CVpFrameWork (thread count argument fixed to 1) and hands it out through *ppCtx.
+  // ppCtx itself is not checked here.
+  EResult eStatus = RET_FAILED;
+  CVpFrameWork* pFramework = new CVpFrameWork (1, eStatus);
+  if (pFramework == NULL)
+    return eStatus;
+
+  *ppCtx = (IWelsVP*)pFramework;
+  return RET_SUCCESS;
+}
+
+EResult DestroySpecificVpInterface (IWelsVP* pCtx) {
+ _SafeDelete (pCtx);
+ // NOTE(review): _SafeDelete is defined elsewhere; presumably deletes a non-NULL pointer - confirm. Success is always reported.
+ return RET_SUCCESS;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Creates one strategy per processing method and the mutex guarding them. uiThreadsNum is not used.
+CVpFrameWork::CVpFrameWork (uint32_t uiThreadsNum, EResult& eReturn) {
+  int32_t iCoreNum = 1;
+#ifndef X86_ASM
+  uint32_t uiCPUFlag = 0;
+#else
+  uint32_t uiCPUFlag = WelsCPUFeatureDetect (&iCoreNum);
+#endif
+
+  // Slot i holds the strategy of method id i + 1. CreateStrategy() yields NULL for methods it does not
+  // implement, so every slot is written here without first reading its indeterminate initial value.
+  for (int32_t i = 0; i < MAX_STRATEGY_NUM; i++)
+    m_pStgChain[i] = CreateStrategy (WelsStaticCast (EMethods, i + 1), uiCPUFlag);
+
+  WelsMutexInit (&m_mutes);
+
+  eReturn = RET_SUCCESS;
+}
+
+CVpFrameWork::~CVpFrameWork() {
+  // Uninit and delete every strategy that was created, then destroy the mutex.
+  for (int32_t i = 0; i < MAX_STRATEGY_NUM; i++) {
+    if (m_pStgChain[i] == NULL)
+      continue;
+    Uninit (m_pStgChain[i]->m_eMethod);
+    _SafeDelete (m_pStgChain[i]);
+  }
+  WelsMutexDestroy (&m_mutes);
+}
+
+// (Re)initialises the strategy selected by iType with pCfg; the strategy itself is called with type 0.
+EResult CVpFrameWork::Init (int32_t iType, void* pCfg) {
+  const int32_t iCurIdx = WelsStaticCast (int32_t, WelsVpGetValidMethod (iType)) - 1;
+  // METHOD_NULL would give index -1; never index m_pStgChain outside its bounds.
+  if (iCurIdx < 0 || iCurIdx >= MAX_STRATEGY_NUM)
+    return RET_INVALIDPARAM;
+
+  Uninit (iType);  // drop previous state first; Uninit locks and unlocks m_mutes itself
+
+  EResult eReturn = RET_SUCCESS;
+  WelsMutexLock (&m_mutes);
+  if (m_pStgChain[iCurIdx])
+    eReturn = m_pStgChain[iCurIdx]->Init (0, pCfg);
+  WelsMutexUnlock (&m_mutes);
+  return eReturn;
+}
+
+// Uninitialises the strategy selected by iType (called with type 0); RET_SUCCESS when the slot is empty.
+EResult CVpFrameWork::Uninit (int32_t iType) {
+  const int32_t iCurIdx = WelsStaticCast (int32_t, WelsVpGetValidMethod (iType)) - 1;
+  if (iCurIdx < 0 || iCurIdx >= MAX_STRATEGY_NUM)  // e.g. METHOD_NULL would give -1
+    return RET_INVALIDPARAM;
+
+  EResult eReturn = RET_SUCCESS;
+  WelsMutexLock (&m_mutes);
+  if (m_pStgChain[iCurIdx])
+    eReturn = m_pStgChain[iCurIdx]->Uninit (0);
+  WelsMutexUnlock (&m_mutes);
+
+  return eReturn;
+}
+
+EResult CVpFrameWork::Flush (int32_t iType) {
+  // No-op at framework level: iType is ignored and no strategy is called.
+
+  return RET_SUCCESS;
+}
+
+// Validates the pixmaps for the method selected by iType and runs that strategy (called with type 0)
+// under m_mutes. The strategy receives local copies of the SPixMap descriptors, not the caller's.
+EResult CVpFrameWork::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pDstPixMap) {
+  const EMethods eMethod = WelsVpGetValidMethod (iType);
+  // Zero-filled copies: a NULL argument leaves its copy all zero.
+  SPixMap sSrcPic;
+  SPixMap sDstPic;
+  memset (&sSrcPic, 0, sizeof (sSrcPic)); // confirmed_safe_unsafe_usage
+  memset (&sDstPic, 0, sizeof (sDstPic)); // confirmed_safe_unsafe_usage
+  if (pSrcPixMap != NULL)
+    sSrcPic = *pSrcPixMap;
+  if (pDstPixMap != NULL)
+    sDstPic = *pDstPixMap;
+  if (!CheckValid (eMethod, sSrcPic, sDstPic))
+    return RET_INVALIDPARAM;
+
+  EResult eReturn = RET_NOTSUPPORTED;  // kept when no strategy exists for the method
+  WelsMutexLock (&m_mutes);
+  IStrategy* pStrategy = m_pStgChain[WelsStaticCast (int32_t, eMethod) - 1];
+  if (pStrategy != NULL)
+    eReturn = pStrategy->Process (0, &sSrcPic, &sDstPic);
+  WelsMutexUnlock (&m_mutes);
+  return eReturn;
+}
+
+// Forwards pParam to Get() of the strategy selected by iType (called with type 0) under m_mutes.
+// Returns RET_INVALIDPARAM for a NULL pParam or an out-of-range method, RET_SUCCESS for an empty slot.
+EResult CVpFrameWork::Get (int32_t iType, void* pParam) {
+  if (!pParam)
+    return RET_INVALIDPARAM;
+  const int32_t iCurIdx = WelsStaticCast (int32_t, WelsVpGetValidMethod (iType)) - 1;
+  if (iCurIdx < 0 || iCurIdx >= MAX_STRATEGY_NUM)  // e.g. METHOD_NULL would give -1
+    return RET_INVALIDPARAM;
+
+  EResult eReturn = RET_SUCCESS;
+  WelsMutexLock (&m_mutes);
+  IStrategy* pStrategy = m_pStgChain[iCurIdx];
+  if (pStrategy)
+    eReturn = pStrategy->Get (0, pParam);
+  WelsMutexUnlock (&m_mutes);
+  return eReturn;
+}
+
+// Forwards pParam to Set() of the strategy selected by iType (called with type 0) under m_mutes.
+// Returns RET_INVALIDPARAM for a NULL pParam or an out-of-range method, RET_SUCCESS for an empty slot.
+EResult CVpFrameWork::Set (int32_t iType, void* pParam) {
+  if (!pParam)
+    return RET_INVALIDPARAM;
+  const int32_t iCurIdx = WelsStaticCast (int32_t, WelsVpGetValidMethod (iType)) - 1;
+  if (iCurIdx < 0 || iCurIdx >= MAX_STRATEGY_NUM)  // e.g. METHOD_NULL would give -1
+    return RET_INVALIDPARAM;
+
+  EResult eReturn = RET_SUCCESS;
+  WelsMutexLock (&m_mutes);
+  IStrategy* pStrategy = m_pStgChain[iCurIdx];
+  if (pStrategy)
+    eReturn = pStrategy->Set (0, pParam);
+  WelsMutexUnlock (&m_mutes);
+  return eReturn;
+}
+
+EResult CVpFrameWork::SpecialFeature (int32_t iType, void* pIn, void* pOut) {
+  // No-op at framework level: all arguments are ignored and no strategy is called.
+
+  return RET_SUCCESS;
+}
+
+// Rectangle test applied to a pixmap that carries pixel data: width and height must lie in
+// (0, MAX_WIDTH] / (0, MAX_HEIGHT], top/left must be below height/width, width must fit in stride 0.
+static bool_t IsPixMapGeometryValid (const SPixMap& sPixMap) {
+  if (sPixMap.sRect.iRectWidth <= 0 || sPixMap.sRect.iRectWidth > MAX_WIDTH)
+    return FALSE;
+  if (sPixMap.sRect.iRectHeight <= 0 || sPixMap.sRect.iRectHeight > MAX_HEIGHT)
+    return FALSE;
+  if (sPixMap.sRect.iRectTop >= sPixMap.sRect.iRectHeight || sPixMap.sRect.iRectLeft >= sPixMap.sRect.iRectWidth)
+    return FALSE;
+  if (sPixMap.sRect.iRectWidth > sPixMap.iStride[0])
+    return FALSE;
+  return TRUE;
+}
+
+// Decides whether Process() may run eMethod on the given pixmaps. METHOD_NULL is always rejected.
+// A pixmap whose first plane pointer is NULL is treated as absent and is not checked. Unless the
+// method is METHOD_COLORSPACE_CONVERT, a present source must be I420 or YV12 and a present
+// destination must have the source's format.
+bool_t CVpFrameWork::CheckValid (EMethods eMethod, SPixMap& pSrcPixMap, SPixMap& pDstPixMap) {
+  if (eMethod == METHOD_NULL)
+    return FALSE;
+
+  const bool bHasSrc = pSrcPixMap.pPixel[0] != NULL;
+  const bool bHasDst = pDstPixMap.pPixel[0] != NULL;
+
+  if (eMethod != METHOD_COLORSPACE_CONVERT) {
+    if (bHasSrc && pSrcPixMap.eFormat != VIDEO_FORMAT_I420 && pSrcPixMap.eFormat != VIDEO_FORMAT_YV12)
+      return FALSE;
+    if (bHasSrc && bHasDst && pDstPixMap.eFormat != pSrcPixMap.eFormat)
+      return FALSE;
+  }
+
+  if (bHasSrc && !IsPixMapGeometryValid (pSrcPixMap))
+    return FALSE;
+  if (bHasDst && !IsPixMapGeometryValid (pDstPixMap))
+    return FALSE;
+  return TRUE;
+}
+
+// Factory for the strategy object of one processing method. iCpuFlag is passed to the strategy's
+// constructor. Returns NULL for METHOD_COLORSPACE_CONVERT (not supported yet) and for any method id
+// without a case below.
+IStrategy* CVpFrameWork::CreateStrategy (EMethods m_eMethod, int32_t iCpuFlag) {
+  switch (m_eMethod) {
+  case METHOD_DENOISE:
+    return WelsDynamicCast (IStrategy*, new CDenoiser (iCpuFlag));
+
+  case METHOD_SCENE_CHANGE_DETECTION:
+    return WelsDynamicCast (IStrategy*, new CSceneChangeDetection (iCpuFlag));
+
+  case METHOD_DOWNSAMPLE:
+    return WelsDynamicCast (IStrategy*, new CDownsampling (iCpuFlag));
+
+  case METHOD_VAA_STATISTICS:
+    return WelsDynamicCast (IStrategy*, new CVAACalculation (iCpuFlag));
+
+  case METHOD_BACKGROUND_DETECTION:
+    return WelsDynamicCast (IStrategy*, new CBackgroundDetection (iCpuFlag));
+
+  case METHOD_ADAPTIVE_QUANT:
+    return WelsDynamicCast (IStrategy*, new CAdaptiveQuantization (iCpuFlag));
+
+  case METHOD_COMPLEXITY_ANALYSIS:
+    return WelsDynamicCast (IStrategy*, new CComplexityAnalysis (iCpuFlag));
+
+  case METHOD_IMAGE_ROTATE:
+    return WelsDynamicCast (IStrategy*, new CImageRotating (iCpuFlag));
+
+  case METHOD_COLORSPACE_CONVERT:  //not support yet
+  default:
+    break;
+  }
+
+  // no strategy available for this method
+  return NULL;
+}
+
+WELSVP_NAMESPACE_END
--- /dev/null
+++ b/codec/processing/src/common/WelsFrameWork.h
@@ -1,0 +1,130 @@
+/*!
+ * \copy
+ * Copyright (c) 2011-2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file : WelsFrameWork.h
+ *
+ * \brief : framework of wels video processor class
+ *
+ * \date : 2011/01/04
+ *
+ * \description :
+ *
+ *************************************************************************************
+ */
+
+#ifndef WELSVP_WELSFRAMEWORK_H
+#define WELSVP_WELSFRAMEWORK_H
+
+#include "../../interface/IWelsVP.h"
+#include "util.h"
+#include "thread.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+EResult CreateSpecificVpInterface (IWelsVP** ppCtx); // creates a CVpFrameWork and stores it in *ppCtx
+EResult DestroySpecificVpInterface (IWelsVP* pCtx); // deletes a context obtained from the overload above
+// IWelsVPc counterparts of the two functions above; CreateVpInterface/DestroyVpInterface pick them via version bits 0x7fff.
+EResult CreateSpecificVpInterface (IWelsVPc** ppCtx);
+EResult DestroySpecificVpInterface (IWelsVPc* pCtx);
+// Number of slots in CVpFrameWork::m_pStgChain; slot i serves method id i + 1.
+#define MAX_STRATEGY_NUM (METHOD_MASK - 1)
+
+// Base class of the processing strategies created by CVpFrameWork. Every IWelsVP entry point except
+// Process() has a default implementation that ignores its arguments and reports RET_SUCCESS, so a
+// concrete strategy only has to override what it needs.
+class IStrategy : public IWelsVP {
+ public:
+  IStrategy()
+    : m_eMethod (METHOD_NULL), m_eFormat (VIDEO_FORMAT_I420), m_iIndex (0), m_bInit (FALSE) {
+  }
+
+  virtual ~IStrategy() {}
+
+ public:
+  // Optional hooks; the defaults below do nothing and return RET_SUCCESS.
+  virtual EResult Init (int32_t iType, void* pCfg) { return RET_SUCCESS; }
+
+  virtual EResult Uninit (int32_t iType) { return RET_SUCCESS; }
+
+  virtual EResult Flush (int32_t iType) { return RET_SUCCESS; }
+
+  virtual EResult Get (int32_t iType, void* pParam) { return RET_SUCCESS; }
+
+  virtual EResult Set (int32_t iType, void* pParam) { return RET_SUCCESS; }
+
+  virtual EResult SpecialFeature (int32_t iType, void* pIn, void* pOut) { return RET_SUCCESS; }
+
+  // The one mandatory operation: every strategy must implement it.
+  virtual EResult Process (int32_t iType, SPixMap* pSrc, SPixMap* pDst) = 0;
+
+ public:
+  // Public state. m_eMethod is expected to be overwritten by the concrete strategy's constructor;
+  // CVpFrameWork reads it when tearing the strategy down.
+  EMethods m_eMethod;       // METHOD_NULL until a derived class sets its own id
+  EVideoFormat m_eFormat;   // defaults to VIDEO_FORMAT_I420
+  int32_t m_iIndex;         // defaults to 0
+  bool_t m_bInit;           // defaults to FALSE
+};
+
+
+
+class CVpFrameWork : public IWelsVP { // dispatcher that owns one strategy per processing method
+ public:
+ CVpFrameWork (uint32_t uiThreadsNum, EResult& ret); // creates all strategies; uiThreadsNum is unused, ret is set to RET_SUCCESS
+ ~CVpFrameWork(); // uninitialises and deletes every strategy
+ // In all calls below the method is derived from iType via WelsVpGetValidMethod(); the strategy itself is called with type 0.
+ public:
+ EResult Init (int32_t iType, void* pCfg);
+ // Uninit is also invoked by Init() before re-initialising and by the destructor.
+ EResult Uninit (int32_t iType);
+ // Flush does nothing at framework level and returns RET_SUCCESS.
+ EResult Flush (int32_t iType);
+ // Process validates copies of both pixmaps with CheckValid() and passes those copies to the strategy.
+ EResult Process (int32_t iType, SPixMap* pSrc, SPixMap* pDst);
+ // Get rejects a NULL pParam with RET_INVALIDPARAM.
+ EResult Get (int32_t iType, void* pParam);
+ // Set rejects a NULL pParam with RET_INVALIDPARAM.
+ EResult Set (int32_t iType, void* pParam);
+ // SpecialFeature does nothing at framework level and returns RET_SUCCESS.
+ EResult SpecialFeature (int32_t iType, void* pIn, void* pOut);
+
+ private:
+ bool_t CheckValid (EMethods eMethod, SPixMap& sSrc, SPixMap& sDst); // format and rectangle checks used by Process()
+ IStrategy* CreateStrategy (EMethods eMethod, int32_t iCpuFlag); // returns NULL for methods without an implementation
+
+ private:
+ IStrategy* m_pStgChain[MAX_STRATEGY_NUM]; // slot i holds the strategy of method id i + 1, or NULL
+ // Locked around every strategy call made by Init/Uninit/Process/Get/Set.
+ WELS_MUTEX m_mutes;
+};
+
+WELSVP_NAMESPACE_END
+
+#endif
--- /dev/null
+++ b/codec/processing/src/common/WelsFrameWorkEx.cpp
@@ -1,0 +1,96 @@
+/*!
+ * \copy
+ * Copyright (c) 2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "WelsFrameWork.h"
+
+///////////////////////////////////////////////////////////////////////
+
+WELSVP_NAMESPACE_BEGIN
+
+// C-style trampolines stored in the IWelsVPc function table. Each one treats
+// pCtx as an IWelsVP*, rejects a NULL context with RET_INVALIDPARAM and
+// otherwise forwards to the member function of the same name.
+
+EResult Init (void* pCtx, int32_t iType, void* pCfg) {
+  if (!pCtx)
+    return RET_INVALIDPARAM;
+  return WelsStaticCast (IWelsVP*, pCtx)->Init (iType, pCfg);
+}
+
+EResult Uninit (void* pCtx, int32_t iType) {
+  if (!pCtx)
+    return RET_INVALIDPARAM;
+  return WelsStaticCast (IWelsVP*, pCtx)->Uninit (iType);
+}
+
+EResult Flush (void* pCtx, int32_t iType) {
+  if (!pCtx)
+    return RET_INVALIDPARAM;
+  return WelsStaticCast (IWelsVP*, pCtx)->Flush (iType);
+}
+
+EResult Process (void* pCtx, int32_t iType, SPixMap* pSrc, SPixMap* dst) {
+  if (!pCtx)
+    return RET_INVALIDPARAM;
+  return WelsStaticCast (IWelsVP*, pCtx)->Process (iType, pSrc, dst);
+}
+
+EResult Get (void* pCtx, int32_t iType, void* pParam) {
+  if (!pCtx)
+    return RET_INVALIDPARAM;
+  return WelsStaticCast (IWelsVP*, pCtx)->Get (iType, pParam);
+}
+
+EResult Set (void* pCtx, int32_t iType, void* pParam) {
+  if (!pCtx)
+    return RET_INVALIDPARAM;
+  return WelsStaticCast (IWelsVP*, pCtx)->Set (iType, pParam);
+}
+
+EResult SpecialFeature (void* pCtx, int32_t iType, void* pIn, void* pOut) {
+  if (!pCtx)
+    return RET_INVALIDPARAM;
+  return WelsStaticCast (IWelsVP*, pCtx)->SpecialFeature (iType, pIn, pOut);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+/*
+ * Build the C-style interface (IWelsVPc) on top of a newly created IWelsVP.
+ *
+ * pCtx    [out] receives the new function table on success; it is left
+ *               untouched on every failure path.
+ * returns RET_INVALIDPARAM when pCtx is NULL, the error reported by the
+ *         IWelsVP factory when that fails, RET_OUTOFMEMORY when the table
+ *         cannot be allocated, RET_SUCCESS otherwise.
+ */
+EResult CreateSpecificVpInterface (IWelsVPc** pCtx) {
+  // Nothing can be handed back without somewhere to store it.
+  if (NULL == pCtx)
+    return RET_INVALIDPARAM;
+
+  IWelsVP* pWelsVP = NULL;
+  const EResult ret = CreateSpecificVpInterface (&pWelsVP);
+  if (ret != RET_SUCCESS)
+    return ret;
+
+  IWelsVPc* pVPc = new IWelsVPc;
+  if (NULL == pVPc) {
+    // Release the C++ object again, otherwise it would be orphaned here.
+    DestroySpecificVpInterface (pWelsVP);
+    return RET_OUTOFMEMORY;
+  }
+
+  // Wire the trampolines and stash the C++ object as their opaque context.
+  pVPc->Init           = Init;
+  pVPc->Uninit         = Uninit;
+  pVPc->Flush          = Flush;
+  pVPc->Process        = Process;
+  pVPc->Get            = Get;
+  pVPc->Set            = Set;
+  pVPc->SpecialFeature = SpecialFeature;
+  pVPc->pCtx           = WelsStaticCast (void*, pWelsVP);
+
+  *pCtx = pVPc;
+  return ret;
+}
+
+/*
+ * Tear down a C-style interface: destroy the wrapped IWelsVP first, then the
+ * function table itself. A NULL argument is accepted. Always returns
+ * RET_SUCCESS.
+ */
+EResult DestroySpecificVpInterface (IWelsVPc* pCtx) {
+  if (!pCtx)
+    return RET_SUCCESS;
+
+  IWelsVP* pWrapped = WelsStaticCast (IWelsVP*, pCtx->pCtx);
+  DestroySpecificVpInterface (pWrapped);
+  _SafeDelete (pCtx);
+
+  return RET_SUCCESS;
+}
+
+WELSVP_NAMESPACE_END
binary files /dev/null b/codec/processing/src/common/WelsVP.aps differ
--- /dev/null
+++ b/codec/processing/src/common/WelsVP.def
@@ -1,0 +1,36 @@
+;*!
+;* \copy
+;* Copyright (c) 2011-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+
+; Module definition for welsvp.dll: names the library and lists the two
+; entry points that are exported from it.
+LIBRARY welsvp.dll
+EXPORTS
+ CreateVpInterface PRIVATE
+ DestroyVpInterface PRIVATE
\ No newline at end of file
--- /dev/null
+++ b/codec/processing/src/common/WelsVP.rc
@@ -1,0 +1,115 @@
+// Microsoft Visual C++ generated resource script.
+//
+#include "resource.h"
+
+#define APSTUDIO_READONLY_SYMBOLS
+/////////////////////////////////////////////////////////////////////////////
+//
+// Generated from the TEXTINCLUDE 2 resource.
+//
+#include "windows.h"
+
+/////////////////////////////////////////////////////////////////////////////
+#undef APSTUDIO_READONLY_SYMBOLS
+
+/////////////////////////////////////////////////////////////////////////////
+// Chinese (P.R.C.) resources
+
+#if !defined(AFX_RESOURCE_DLL) || defined(AFX_TARG_CHS)
+#ifdef _WIN32
+LANGUAGE LANG_CHINESE, SUBLANG_CHINESE_SIMPLIFIED
+#pragma code_page(936)
+#endif //_WIN32
+
+#ifdef APSTUDIO_INVOKED
+/////////////////////////////////////////////////////////////////////////////
+//
+// TEXTINCLUDE
+//
+
+1 TEXTINCLUDE
+BEGIN
+ "resource.h\0"
+END
+
+2 TEXTINCLUDE
+BEGIN
+ "#include ""windows.h""\r\n"
+ "\0"
+END
+
+3 TEXTINCLUDE
+BEGIN
+ "\r\n"
+ "\0"
+END
+
+#endif // APSTUDIO_INVOKED
+
+#endif // Chinese (P.R.C.) resources
+/////////////////////////////////////////////////////////////////////////////
+
+
+/////////////////////////////////////////////////////////////////////////////
+// English (U.S.) resources
+
+#if !defined(AFX_RESOURCE_DLL) || defined(AFX_TARG_ENU)
+#ifdef _WIN32
+LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_US
+#pragma code_page(1252)
+#endif //_WIN32
+
+/////////////////////////////////////////////////////////////////////////////
+//
+// Version
+//
+
+VS_VERSION_INFO VERSIONINFO
+ FILEVERSION 0,0,0,0
+ PRODUCTVERSION 0,0,0,0
+ FILEFLAGSMASK 0x3fL
+#ifdef _DEBUG
+ FILEFLAGS 0x1L
+#else
+ FILEFLAGS 0x0L
+#endif
+ FILEOS 0x40004L
+ FILETYPE 0x2L
+ FILESUBTYPE 0x0L
+BEGIN
+ BLOCK "StringFileInfo"
+ BEGIN
+ BLOCK "040904b0"
+ BEGIN
+ VALUE "Comments", "Cisco OpenH264 video preprocessing"
+ VALUE "CompanyName", "Cisco Systems"
+ VALUE "FileDescription", "Cisco OpenH264 video preprocessing"
+ VALUE "FileVersion", "0, 0, 0, 0"
+ VALUE "InternalName", "welsvp.dll"
+ VALUE "LegalCopyright", "(C) 2011-2015 Cisco and/or its affiliates. All rights reserved."
+ VALUE "OriginalFilename", "welsvp.dll"
+ VALUE "ProductName", "Cisco OpenH264 video preprocessing"
+ VALUE "ProductVersion", "0, 0, 0, 0"
+ END
+ END
+ BLOCK "VarFileInfo"
+ BEGIN
+ VALUE "Translation", 0x409, 1200
+ END
+END
+
+#endif // English (U.S.) resources
+/////////////////////////////////////////////////////////////////////////////
+
+
+
+#ifndef APSTUDIO_INVOKED
+/////////////////////////////////////////////////////////////////////////////
+//
+// Generated from the TEXTINCLUDE 3 resource.
+//
+
+
+/////////////////////////////////////////////////////////////////////////////
+#endif // not APSTUDIO_INVOKED
+
--- /dev/null
+++ b/codec/processing/src/common/cpu.cpp
@@ -1,0 +1,196 @@
+/*!
+ * \copy
+ * Copyright (c) 2009-2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file cpu.cpp
+ *
+ * \brief CPU compatibility detection
+ *
+ * \date 04/29/2009 Created
+ *
+ *************************************************************************************
+ */
+
+#include "util.h"
+#include "cpu.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+#define CPU_Vender_AMD "AuthenticAMD"
+#define CPU_Vender_INTEL "GenuineIntel"
+#define CPU_Vender_CYRIX "CyrixInstead"
+
+
+#if defined(X86_ASM)
+
+/*!
+ * \brief Query CPUID and translate the answers into a WELS_CPU_* bitmask.
+ *
+ * \param pNumberOfLogicProcessors optional output, may be NULL. When non-NULL
+ *        and detection gets past the MMX check, it receives bits 23-16 of the
+ *        second register returned for CPUID index 1. It is not written on the
+ *        early-return paths.
+ *
+ * \return OR-ed WELS_CPU_* flags; 0 when WelsCPUIdVerify() fails, when CPUID
+ *         index 0 reports 0 as its first value, or when the MMX bit is clear.
+ */
+uint32_t WelsCPUFeatureDetect (int32_t* pNumberOfLogicProcessors) {
+ uint32_t uiCPU = 0;
+ uint32_t uiFeatureA = 0, uiFeatureB = 0, uiFeatureC = 0, uiFeatureD = 0;
+ int32_t CacheLineSize = 0;
+ // 16 zeroed bytes; only the first 12 are filled below, so the buffer stays NUL-terminated for strcmp.
+ int8_t chVenderName[16] = { 0 };
+
+ if (!WelsCPUIdVerify()) {
+ /* cpuid is not supported in cpu */
+ return 0;
+ }
+
+ // Index 0: the three vendor-string words land at offsets 0, 8 and 4 of chVenderName
+ // (argument order B, C, D), which yields e.g. "GenuineIntel" in reading order.
+ WelsCPUId (0, &uiFeatureA, (uint32_t*)&chVenderName[0], (uint32_t*)&chVenderName[8], (uint32_t*)&chVenderName[4]);
+ if (uiFeatureA == 0) {
+ /* maximum input value for basic cpuid information */
+ return 0;
+ }
+
+ // Index 1: the feature words tested below (uiFeatureD and uiFeatureC).
+ WelsCPUId (1, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
+ if ((uiFeatureD & 0x00800000) == 0) {
+ /* Basic MMX technology is not supported by this CPU; nothing else is useful to us, so return here */
+ return 0;
+ }
+
+ uiCPU = WELS_CPU_MMX;
+ if (uiFeatureD & 0x02000000) {
+ /* SSE technology is identical to AMD MMX extensions */
+ uiCPU |= WELS_CPU_MMXEXT | WELS_CPU_SSE;
+ }
+ if (uiFeatureD & 0x04000000) {
+ /* SSE2 support here */
+ uiCPU |= WELS_CPU_SSE2;
+ }
+ if (uiFeatureD & 0x00000001) {
+ /* x87 FPU on-chip checking */
+ uiCPU |= WELS_CPU_FPU;
+ }
+ if (uiFeatureD & 0x00008000) {
+ /* CMOV instruction checking */
+ uiCPU |= WELS_CPU_CMOV;
+ }
+ // The HTT flag is only honoured when the vendor string is "GenuineIntel".
+ if (!strcmp ((const str_t*)chVenderName, CPU_Vender_INTEL)) { // confirmed_safe_unsafe_usage
+ if (uiFeatureD & 0x10000000) {
+ /* Multi-Threading checking: contains of multiple logic processors */
+ uiCPU |= WELS_CPU_HTT;
+ }
+ }
+
+ if (uiFeatureC & 0x00000001) {
+ /* SSE3 support here */
+ uiCPU |= WELS_CPU_SSE3;
+ }
+ if (uiFeatureC & 0x00000200) {
+ /* SSSE3 support here */
+ uiCPU |= WELS_CPU_SSSE3;
+ }
+ if (uiFeatureC & 0x00080000) {
+ /* SSE4.1 support here, 45nm Penryn processor */
+ uiCPU |= WELS_CPU_SSE41;
+ }
+ if (uiFeatureC & 0x00100000) {
+ /* SSE4.2 support here, next generation Nehalem processor */
+ uiCPU |= WELS_CPU_SSE42;
+ }
+ // AVX and FMA are decided by external helpers that receive the index-1 A and C words.
+ if (WelsCPUSupportAVX (uiFeatureA, uiFeatureC)) { //
+ /* AVX supported */
+ uiCPU |= WELS_CPU_AVX;
+ }
+ if (WelsCPUSupportFMA (uiFeatureA, uiFeatureC)) { //
+ /* AVX FMA supported */
+ uiCPU |= WELS_CPU_FMA;
+ }
+ if (uiFeatureC & 0x02000000) {
+ /* AES checking */
+ uiCPU |= WELS_CPU_AES;
+ }
+ if (uiFeatureC & 0x00400000) {
+ /* MOVBE checking */
+ uiCPU |= WELS_CPU_MOVBE;
+ }
+
+ // Must run before the next WelsCPUId call, which overwrites uiFeatureB.
+ if (pNumberOfLogicProcessors != NULL) {
+ // HTT enabled on chip
+ *pNumberOfLogicProcessors = (uiFeatureB & 0x00ff0000) >> 16; // feature bits: 23-16 on returned EBX
+ }
+
+ // Index 0x80000000: the first returned word is compared against 0x80000001 below
+ // before that extended index is queried.
+ WelsCPUId (0x80000000, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
+
+ // AMD only: pick up MMXEXT and 3DNow! from extended index 0x80000001.
+ if ((!strcmp ((const str_t*)chVenderName, CPU_Vender_AMD))
+ && (uiFeatureA >= 0x80000001)) { // confirmed_safe_unsafe_usage
+ WelsCPUId (0x80000001, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
+ if (uiFeatureD & 0x00400000) {
+ uiCPU |= WELS_CPU_MMXEXT;
+ }
+ if (uiFeatureD & 0x80000000) {
+ uiCPU |= WELS_CPU_3DNOW;
+ }
+ }
+
+ // Intel only: re-read index 1 and clear SSE2/SSE3 for family 6, models 9, 13 and 14.
+ // NOTE(review): the reason for masking these models is not stated in this file - confirm.
+ if (!strcmp ((const str_t*)chVenderName, CPU_Vender_INTEL)) { // confirmed_safe_unsafe_usage
+ int32_t family, model;
+
+ WelsCPUId (1, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
+ // family = bits 11-8 plus bits 27-20; model = bits 7-4 plus (bits 19-16 shifted into the high nibble).
+ family = ((uiFeatureA >> 8) & 0xf) + ((uiFeatureA >> 20) & 0xff);
+ model = ((uiFeatureA >> 4) & 0xf) + ((uiFeatureA >> 12) & 0xf0);
+
+ if ((family == 6) && (model == 9 || model == 13 || model == 14)) {
+ uiCPU &= ~ (WELS_CPU_SSE2 | WELS_CPU_SSE3);
+ }
+ }
+
+ // get cache line size (only evaluated for the Intel and Cyrix vendor strings)
+ if ((!strcmp ((const str_t*)chVenderName, CPU_Vender_INTEL))
+ || ! (strcmp ((const str_t*)chVenderName, CPU_Vender_CYRIX))) { // confirmed_safe_unsafe_usage
+ WelsCPUId (1, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
+
+ // Bits 15-8 of the B word, multiplied by 8 (shift right by 8, then left by 3).
+ CacheLineSize = (uiFeatureB & 0xff00) >>
+ 5; // ((clflush_line_size >> 8) << 3), CLFLUSH_line_size * 8 = CacheLineSize_in_byte
+
+ // Sizes other than 128/64/32/16 leave all cache-line flags clear.
+ if (CacheLineSize == 128) {
+ uiCPU |= WELS_CPU_CACHELINE_128;
+ } else if (CacheLineSize == 64) {
+ uiCPU |= WELS_CPU_CACHELINE_64;
+ } else if (CacheLineSize == 32) {
+ uiCPU |= WELS_CPU_CACHELINE_32;
+ } else if (CacheLineSize == 16) {
+ uiCPU |= WELS_CPU_CACHELINE_16;
+ }
+ }
+
+ return uiCPU;
+}
+
+
+/*
+ * Calls WelsEmms() when the supplied feature mask contains any of the
+ * MMX / MMXEXT / 3DNow! / 3DNow!-ext flags; otherwise does nothing.
+ */
+void WelsCPURestore (const uint32_t kuiCPU) {
+  const uint32_t kuiMmxFamilyMask = WELS_CPU_MMX | WELS_CPU_MMXEXT | WELS_CPU_3DNOW | WELS_CPU_3DNOWEXT;
+
+  if ((kuiCPU & kuiMmxFamilyMask) == 0)
+    return;
+
+  WelsEmms();
+}
+
+#endif
+
+
+WELSVP_NAMESPACE_END
+
+
--- /dev/null
+++ b/codec/processing/src/common/cpu.h
@@ -1,0 +1,102 @@
+/*!
+ * \copy
+ * Copyright (c) 2009-2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file cpu.h
+ *
+ * \brief CPU feature compatibility detection
+ *
+ * \date 04/29/2009 Created
+ *
+ *************************************************************************************
+ */
+
+#ifndef WELSVP_CPU_H
+#define WELSVP_CPU_H
+
+#include "typedef.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+/*
+ * WELS CPU feature flags. Each value is a distinct bit so they can be OR-ed
+ * into the mask returned by WelsCPUFeatureDetect(). The detector in cpu.cpp
+ * never sets WELS_CPU_3DNOWEXT or WELS_CPU_ALTIVEC.
+ */
+#define WELS_CPU_MMX 0x00000001 /* mmx */
+#define WELS_CPU_MMXEXT 0x00000002 /* mmx-ext */
+#define WELS_CPU_SSE 0x00000004 /* sse */
+#define WELS_CPU_SSE2 0x00000008 /* sse 2 */
+#define WELS_CPU_SSE3 0x00000010 /* sse 3 */
+#define WELS_CPU_SSE41 0x00000020 /* sse 4.1 */
+#define WELS_CPU_3DNOW 0x00000040 /* 3dnow! */
+#define WELS_CPU_3DNOWEXT 0x00000080 /* 3dnow! ext */
+#define WELS_CPU_ALTIVEC 0x00000100 /* altivec */
+#define WELS_CPU_SSSE3 0x00000200 /* ssse3 */
+#define WELS_CPU_SSE42 0x00000400 /* sse 4.2 */
+
+/* CPU features application extensive */
+#define WELS_CPU_AVX 0x00000800 /* Advanced Vector eXtensions */
+#define WELS_CPU_FPU 0x00001000 /* x87-FPU on chip */
+#define WELS_CPU_HTT 0x00002000 /* Hyper-Threading Technology (HTT), Multi-threading enabled feature:
+ physical processor package is capable of supporting more than one logic processor
+ */
+#define WELS_CPU_CMOV 0x00004000 /* Conditional Move Instructions,
+ also if x87-FPU is present as indicated by the CPUID.FPU feature bit, then FCOMI and FCMOV are supported
+ */
+#define WELS_CPU_MOVBE 0x00008000 /* MOVBE instruction */
+#define WELS_CPU_AES 0x00010000 /* AES instruction extensions */
+#define WELS_CPU_FMA 0x00020000 /* AVX VEX FMA instruction sets */
+
+/* Cache-line size flags; the detector sets at most one of them. */
+#define WELS_CPU_CACHELINE_16 0x10000000 /* CacheLine Size 16 */
+#define WELS_CPU_CACHELINE_32 0x20000000 /* CacheLine Size 32 */
+#define WELS_CPU_CACHELINE_64 0x40000000 /* CacheLine Size 64 */
+#define WELS_CPU_CACHELINE_128 0x80000000 /* CacheLine Size 128 */
+
+/*
+ * Interfaces for CPU core feature detection as below
+ */
+
+/* Primitives with C linkage, only declared when X86_ASM is defined.
+ * NOTE(review): presumably implemented in assembly - confirm against the asm sources. */
+#ifdef X86_ASM
+WELSVP_EXTERN_C_BEGIN
+
+/* A zero result is treated by WelsCPUFeatureDetect() as "cpuid not supported". */
+int32_t WelsCPUIdVerify();
+
+/* Runs the query selected by uiIndex and stores four 32-bit results through
+ * the pointers. cpu.cpp uses B/D/C of index 0 as the vendor string and C/D of
+ * index 1 as feature words. */
+void WelsCPUId (uint32_t uiIndex, uint32_t* pFeatureA, uint32_t* pFeatureB, uint32_t* pFeatureC, uint32_t* pFeatureD);
+/* Non-zero results make the detector set WELS_CPU_AVX / WELS_CPU_FMA respectively. */
+int32_t WelsCPUSupportAVX (uint32_t eax, uint32_t ecx);
+int32_t WelsCPUSupportFMA (uint32_t eax, uint32_t ecx);
+
+/* Called by WelsCPURestore() when an MMX/3DNow! family flag is present. */
+void WelsEmms();
+
+WELSVP_EXTERN_C_END
+#endif
+
+/* Returns the OR-ed WELS_CPU_* mask. The parameter is an optional output for
+ * the logical-processor count. The declaration is unconditional, but the
+ * definition in cpu.cpp only exists when X86_ASM is defined. */
+uint32_t WelsCPUFeatureDetect (int32_t* pNumberOfLogicProcessors);
+
+WELSVP_NAMESPACE_END
+
+#endif
--- /dev/null
+++ b/codec/processing/src/common/memory.cpp
@@ -1,0 +1,117 @@
+/*!
+ * \copy
+ * Copyright (c) 2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "memory.h"
+
+WELSVP_NAMESPACE_BEGIN
+/////////////////////////////////////////////////////////////////////////////////
+
+/*
+ * Allocate a zero-filled block of kuiSize bytes whose address is rounded down
+ * to an ALIGNBYTES boundary (ALIGNBYTES is expected to be a power of two).
+ *
+ * Layout of the underlying malloc() block:
+ *   [padding][int32_t requested size][void* raw malloc pointer][user data ...]
+ * The two header fields sit directly below the returned pointer; WelsFree()
+ * reads the raw pointer, InternalReallocate() reads the size.
+ *
+ * kuiSize  number of usable bytes requested
+ * pTag     not used by this implementation
+ * returns  the aligned pointer, or NULL when malloc() fails or the request is
+ *          so large that adding the header would overflow 32 bits
+ */
+void* WelsMalloc (const uint32_t kuiSize, str_t* pTag) {
+  const int32_t kiSizeVoidPointer = sizeof (void**);
+  const int32_t kiSizeInt32       = sizeof (int32_t);
+  const int32_t kiAlignedBytes    = ALIGNBYTES - 1;
+
+  // Computed once, in 32-bit unsigned arithmetic exactly like the size argument.
+  const uint32_t kuiTotalSize = kuiSize + kiAlignedBytes + kiSizeVoidPointer + kiSizeInt32;
+
+  // A wrapped sum would give an undersized block and the header writes below
+  // would then land outside of it.
+  if (kuiTotalSize < kuiSize)
+    return NULL;
+
+  uint8_t* pBuf = (uint8_t*) ::malloc (kuiTotalSize);
+  if (NULL == pBuf)
+    return NULL;
+
+  // to fill zero values
+  WelsMemset (pBuf, 0, kuiTotalSize);
+
+  // Skip the worst-case header, then round down to the alignment boundary;
+  // at least pointer + int32 bytes remain below the result.
+  uint8_t* pAlignedBuf = pBuf + kiAlignedBytes + kiSizeVoidPointer + kiSizeInt32;
+  pAlignedBuf -= WelsCastFromPointer (pAlignedBuf) & kiAlignedBytes;
+
+  * ((void**) (pAlignedBuf - kiSizeVoidPointer)) = pBuf;
+  * ((int32_t*) (pAlignedBuf - (kiSizeVoidPointer + kiSizeInt32))) = kuiSize;
+
+  return (pAlignedBuf);
+}
+
+/////////////////////////////////////////////////////////////////////////////
+
+/*
+ * Release a block obtained from WelsMalloc(). The raw malloc() pointer is
+ * stored in the pointer-sized slot immediately below pPointer. NULL is
+ * ignored. pTag is not used.
+ */
+void WelsFree (void* pPointer, str_t* pTag) {
+  if (NULL == pPointer)
+    return;
+
+  void** ppUserBlock = (void**) pPointer;
+  void* pRawBlock = ppUserBlock[-1];
+  ::free (pRawBlock);
+}
+
+/////////////////////////////////////////////////////////////////////////////
+
+/*
+ * Move a WelsMalloc() block into a newly allocated block of kuiSize bytes.
+ *
+ * pPointer  existing block, or NULL (then this is a plain WelsMalloc())
+ * kuiSize   requested size of the new block
+ * pTag      passed through to WelsMalloc()/WelsFree()
+ *
+ * returns   the new block on success (the old one has been freed);
+ *           pPointer itself when allocation fails but the old block is
+ *           already at least kuiSize bytes;
+ *           NULL otherwise - the old block is then still valid and still
+ *           owned by the caller.
+ *
+ * A recorded old size of 0 or a requested size of 0 is reported as failure,
+ * as before; the check is now made before allocating, so no block is leaked
+ * on that path.
+ */
+void* InternalReallocate (void* pPointer, const uint32_t kuiSize, str_t* pTag) {
+  if (NULL == pPointer)
+    return WelsMalloc (kuiSize, pTag);
+
+  // Size recorded by WelsMalloc() in the int32 slot below the raw-pointer slot.
+  const uint32_t kuiOldSize = * ((int32_t*) ((uint8_t*) pPointer - sizeof (void**) - sizeof (int32_t)));
+
+  // Nothing to copy from or nothing to copy into: fail without allocating.
+  if (0 == kuiOldSize || 0 == kuiSize)
+    return 0;
+
+  uint8_t* pNew = (uint8_t*) WelsMalloc (kuiSize, pTag);
+  if (0 == pNew) {
+    // A shrinking request can still be served by the existing block.
+    if (kuiOldSize >= kuiSize)
+      return (pPointer);
+    return 0;
+  }
+
+  memcpy (pNew, pPointer, (kuiOldSize < kuiSize) ? kuiOldSize : kuiSize);
+
+  WelsFree (pPointer, pTag);
+  return (pNew);
+}
+
+/////////////////////////////////////////////////////////////////////////////
+
+/*
+ * Grow a block when the caller-tracked capacity is too small.
+ *
+ * pPointer   current block (may be NULL)
+ * pRealSize  [in/out] capacity the caller believes the block has; updated to
+ *            the padded capacity when a new block is returned
+ * kuiSize    capacity required now
+ * pTag       passed through to the allocator
+ *
+ * returns    pPointer unchanged when *pRealSize >= kuiSize; otherwise the
+ *            result of reallocating to (kuiSize rounded up to a multiple of
+ *            16) + 32 bytes. NULL means failure: *pRealSize is then not
+ *            modified. NULL is also returned when pRealSize is NULL or when
+ *            the padded size would not fit in 32 bits.
+ */
+void* WelsRealloc (void* pPointer, uint32_t* pRealSize, const uint32_t kuiSize, str_t* pTag) {
+  if (NULL == pRealSize)
+    return NULL;
+
+  if (*pRealSize >= kuiSize) // large enough of original block, so do nothing
+    return (pPointer);
+
+  // Padding adds at most 15 + 32 bytes; refuse requests that would wrap.
+  const uint32_t kuiMaxRequest = (uint32_t) (-1) - 47;
+  if (kuiSize > kuiMaxRequest)
+    return NULL;
+
+  // new request: round up to a multiple of 16, then add 32 bytes of slack
+  uint32_t uiNewSize = kuiSize + 15;
+  uiNewSize -= (uiNewSize & 15);
+  uiNewSize += 32;
+
+  void* pLocalPointer = InternalReallocate (pPointer, uiNewSize, pTag);
+  if (NULL != pLocalPointer)
+    *pRealSize = uiNewSize;
+
+  return pLocalPointer;
+}
+
+WELSVP_NAMESPACE_END
--- /dev/null
+++ b/codec/processing/src/common/memory.h
@@ -1,0 +1,110 @@
+/*!
+ * \copy
+ * Copyright (c) 2011-2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file : memory.h
+ *
+ * \brief : memory definition for wels video processor class
+ *
+ * \date : 2011/02/22
+ *
+ * \description :
+ *
+ *************************************************************************************
+ */
+
+#ifndef WELSVP_MEMORY_H
+#define WELSVP_MEMORY_H
+
+#include "util.h"
+#include "typedef.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+// Thin wrappers over the global C memory routines.
+
+// Fill uiSize bytes at pPointer with iValue; returns what ::memset returns.
+inline_t void* WelsMemset (void* pPointer, int32_t iValue, uint32_t uiSize) {
+  void* const pFilled = ::memset (pPointer, iValue, uiSize);
+  return pFilled;
+}
+
+// Copy uiSize bytes from kpSrc to pDst; returns what ::memcpy returns.
+inline_t void* WelsMemcpy (void* pDst, const void* kpSrc, uint32_t uiSize) {
+  void* const pCopied = ::memcpy (pDst, kpSrc, uiSize);
+  return pCopied;
+}
+
+// Compare uiSize bytes of the two buffers; returns what ::memcmp returns.
+inline_t int32_t WelsMemcmp (const void* kpBuf1, const void* kpBuf2, uint32_t uiSize) {
+  const int32_t kiOrder = ::memcmp (kpBuf1, kpBuf2, uiSize);
+  return kiOrder;
+}
+
+/*!
+*************************************************************************************
+* \brief zero-filled, aligned allocation used throughout Wels
+*
+* \param kuiSize size in bytes of the memory block required
+* \param pTag optional tag; not used by the implementation in memory.cpp
+*
+* \return pointer to the allocated block (aligned on an ALIGNBYTES boundary),
+* or NULL on failure
+*
+* \note release the block with WelsFree()
+*************************************************************************************
+*/
+void* WelsMalloc (const uint32_t kuiSize, str_t* pTag = NULL);
+
+/*!
+*************************************************************************************
+* \brief free a block obtained from WelsMalloc() / WelsRealloc()
+*
+* \param pPointer the block itself (the value returned by the allocator, not
+* the address of the variable holding it); NULL is ignored
+* \param pTag optional tag; not used by the implementation in memory.cpp
+*
+* \return NONE
+*
+* \note N/A
+*************************************************************************************
+*/
+void WelsFree (void* pPointer, str_t* pTag = NULL);
+
+/*!
+*************************************************************************************
+* \brief reallocation in Wels. Does nothing and keeps using the old block
+* when *pRealSize is already at least kuiSize
+*
+* \param pPointer memory block allocated earlier (may be NULL)
+* \param pRealSize in: capacity currently tracked for the block;
+* out: padded capacity of the block returned after growing
+* \param kuiSize new size of memory block requested
+* \param pTag optional tag, passed on to the allocator
+*
+* \return pPointer itself when no growth is needed, otherwise the reallocated
+* block; NULL on failure
+*
+* \note when growing, the request is rounded up to a multiple of 16 bytes and
+* 32 extra bytes are added before allocating
+*************************************************************************************
+*/
+void* WelsRealloc (void* pPointer, uint32_t* pRealSize, const uint32_t kuiSize, str_t* pTag = NULL);
+
+//////////////////////////////////////////////////////////////////////////////////////
+WELSVP_NAMESPACE_END
+
+#endif
+
+
--- /dev/null
+++ b/codec/processing/src/common/resource.h
@@ -1,0 +1,15 @@
+//{{NO_DEPENDENCIES}}
+// Microsoft Visual C++ generated include file.
+// Used by WelsVP.rc
+//
+
+// Next default values for new objects
+//
+#ifdef APSTUDIO_INVOKED
+#ifndef APSTUDIO_READONLY_SYMBOLS
+#define _APS_NEXT_RESOURCE_VALUE 101
+#define _APS_NEXT_COMMAND_VALUE 40001
+#define _APS_NEXT_CONTROL_VALUE 1000
+#define _APS_NEXT_SYMED_VALUE 101
+#endif
+#endif
--- /dev/null
+++ b/codec/processing/src/common/thread.cpp
@@ -1,0 +1,93 @@
+/*!
+ * \copy
+ * Copyright (c) 2009-2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file thread.cpp
+ *
+ * \brief Interfaces introduced in thread programming
+ *
+ * \date 11/17/2009 Created
+ *
+ *************************************************************************************
+ */
+
+#include "thread.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+#if defined(_WIN32)
+
+// Win32 flavour: WELS_MUTEX is a CRITICAL_SECTION. The underlying calls return
+// nothing, so every wrapper reports WELS_THREAD_ERROR_OK.
+
+WELS_THREAD_ERROR_CODE WelsMutexInit (WELS_MUTEX* mutex) {
+  InitializeCriticalSection (mutex);
+  return WELS_THREAD_ERROR_OK;
+}
+
+WELS_THREAD_ERROR_CODE WelsMutexLock (WELS_MUTEX* mutex) {
+  EnterCriticalSection (mutex);
+  return WELS_THREAD_ERROR_OK;
+}
+
+WELS_THREAD_ERROR_CODE WelsMutexUnlock (WELS_MUTEX* mutex) {
+  LeaveCriticalSection (mutex);
+  return WELS_THREAD_ERROR_OK;
+}
+
+WELS_THREAD_ERROR_CODE WelsMutexDestroy (WELS_MUTEX* mutex) {
+  DeleteCriticalSection (mutex);
+  return WELS_THREAD_ERROR_OK;
+}
+
+#elif defined(__GNUC__)
+
+// pthread flavour: WELS_MUTEX is a pthread_mutex_t and each wrapper hands the
+// pthread return code straight back to the caller.
+
+WELS_THREAD_ERROR_CODE WelsMutexInit (WELS_MUTEX* mutex) {
+  const WELS_THREAD_ERROR_CODE kiStatus = pthread_mutex_init (mutex, NULL);
+  return kiStatus;
+}
+
+WELS_THREAD_ERROR_CODE WelsMutexLock (WELS_MUTEX* mutex) {
+  const WELS_THREAD_ERROR_CODE kiStatus = pthread_mutex_lock (mutex);
+  return kiStatus;
+}
+
+WELS_THREAD_ERROR_CODE WelsMutexUnlock (WELS_MUTEX* mutex) {
+  const WELS_THREAD_ERROR_CODE kiStatus = pthread_mutex_unlock (mutex);
+  return kiStatus;
+}
+
+WELS_THREAD_ERROR_CODE WelsMutexDestroy (WELS_MUTEX* mutex) {
+  const WELS_THREAD_ERROR_CODE kiStatus = pthread_mutex_destroy (mutex);
+  return kiStatus;
+}
+
+#endif
+
+WELSVP_NAMESPACE_END
+
+
+
--- /dev/null
+++ b/codec/processing/src/common/thread.h
@@ -1,0 +1,89 @@
+/*!
+ * \copy
+ * Copyright (c) 2009-2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file thread.h
+ *
+ * \brief Interfaces introduced in thread programming
+ *
+ * \date 11/17/2009 Created
+ *
+ *************************************************************************************
+ */
+
+#ifndef WELSVP_THREAD_H
+#define WELSVP_THREAD_H
+
+#include "typedef.h"
+
+#if defined(_WIN32)
+
+#include <windows.h>
+
+#elif defined(__GNUC__)
+
+#include <stdlib.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <semaphore.h>
+#include <signal.h>
+#include <errno.h>
+
+#endif//WIN32
+
+WELSVP_NAMESPACE_BEGIN
+
+// Platform mapping of the thread handle and mutex types.
+#if defined(_WIN32)
+
+typedef HANDLE WELS_THREAD_HANDLE;
+typedef CRITICAL_SECTION WELS_MUTEX;
+
+#elif defined(__GNUC__)
+
+typedef pthread_t WELS_THREAD_HANDLE;
+typedef pthread_mutex_t WELS_MUTEX;
+
+#endif
+
+// Status type returned by the mutex wrappers below.
+typedef long_t WELS_THREAD_ERROR_CODE;
+
+// Status values. WELS_THREAD_ERROR_OK is the success value; the "GENERIAL"
+// spelling is part of the macro name and therefore kept.
+#define WELS_THREAD_ERROR_OK 0
+#define WELS_THREAD_ERROR_GENERIAL ((unsigned long)(-1))
+#define WELS_THREAD_ERROR_WAIT_OBJECT_0 0
+#define WELS_THREAD_ERROR_WAIT_TIMEOUT ((unsigned long)0x00000102L)
+#define WELS_THREAD_ERROR_WAIT_FAILED WELS_THREAD_ERROR_GENERIAL
+
+// Mutex wrappers implemented in thread.cpp. The Win32 variants always return
+// WELS_THREAD_ERROR_OK; the pthread variants return the pthread_mutex_* result.
+WELS_THREAD_ERROR_CODE WelsMutexInit (WELS_MUTEX* mutex);
+WELS_THREAD_ERROR_CODE WelsMutexLock (WELS_MUTEX* mutex);
+WELS_THREAD_ERROR_CODE WelsMutexUnlock (WELS_MUTEX* mutex);
+WELS_THREAD_ERROR_CODE WelsMutexDestroy (WELS_MUTEX* mutex);
+
+WELSVP_NAMESPACE_END
+
+#endif
--- /dev/null
+++ b/codec/processing/src/common/typedef.h
@@ -1,0 +1,102 @@
+/*!
+ * \copy
+ * Copyright (c) 2011-2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file : typedef.h
+ *
+ * \brief : basic type definition
+ *
+ * \date : 2011/01/04
+ *
+ * \description : 1. Define basic type with platform-independent;
+ * 2. Define specific namespace to avoid name pollution;
+ * 3. C++ ONLY;
+ *
+ *************************************************************************************
+ */
+
+#ifndef WELSVP_TYPEDEF_H
+#define WELSVP_TYPEDEF_H
+
+#define WELSVP_EXTERN_C_BEGIN extern "C" {
+#define WELSVP_EXTERN_C_END }
+// Open / close the nsWelsVP namespace.
+#define WELSVP_NAMESPACE_BEGIN namespace nsWelsVP {
+#define WELSVP_NAMESPACE_END }
+
+WELSVP_NAMESPACE_BEGIN
+// Fixed-width integer aliases, declared inside the namespace opened above.
+#if defined(_WIN32) && defined(_MSC_VER)   // Microsoft compiler
+
+typedef char                int8_t;
+typedef unsigned char       uint8_t;
+typedef short               int16_t;
+typedef unsigned short      uint16_t;
+typedef int                 int32_t;
+typedef unsigned int        uint32_t;
+typedef __int64             int64_t;
+typedef unsigned __int64    uint64_t;
+#define inline_t _inline
+
+#else // GCC
+
+// [comment]: some compilers treat plain "char" as unsigned by default, so spell out "signed"
+typedef signed char         int8_t;
+typedef unsigned char       uint8_t;
+typedef signed short        int16_t;
+typedef unsigned short      uint16_t;
+typedef signed int          int32_t;
+typedef unsigned int        uint32_t;
+typedef long long           int64_t;
+typedef unsigned long long  uint64_t;
+#define inline_t inline
+
+#endif
+
+typedef char                str_t;   // [comment]: plain char is used only for character parameters
+typedef long                long_t;
+typedef int32_t             bool_t;
+
+#if defined(_WIN32) || defined(_MACH_PLATFORM) || defined(__GNUC__)
+typedef float               float_t;
+typedef double              double_t;
+#endif
+
+#ifndef NULL
+#define NULL 0
+#endif
+
+enum {
+  FALSE = 0,
+  TRUE  = 1   // same value as !FALSE
+};
+
+WELSVP_NAMESPACE_END
+
+#endif
--- /dev/null
+++ b/codec/processing/src/common/util.cpp
@@ -1,0 +1,45 @@
+/*!
+ * \copy
+ * Copyright (c) 2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "util.h"
+
+WELSVP_NAMESPACE_BEGIN
+/////////////////////////////////////////////////////////////////////////////////
+
+// Compares two strings by forwarding to the C library strcmp() and returning its result.
+int32_t WelsStrCmp (const str_t* kpStr1, const str_t* kpStr2) {
+ return ::strcmp (kpStr1, kpStr2);
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////
+WELSVP_NAMESPACE_END
--- /dev/null
+++ b/codec/processing/src/common/util.h
@@ -1,0 +1,107 @@
+/*!
+ * \copy
+ * Copyright (c) 2011-2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file : util.h
+ *
+ * \brief : utils for wels video processor class
+ *
+ * \date : 2011/01/04
+ *
+ * \description :
+ *
+ *************************************************************************************
+ */
+
+#ifndef WELSVP_UTIL_H
+#define WELSVP_UTIL_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdarg.h>
+#include <assert.h>
+
+#include "typedef.h"
+#include "memory.h"
+#include "../../interface/IWelsVP.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+#define MAX_WIDTH (4096)
+#define MAX_HEIGHT (2304)//MAX_FS_LEVEL51 (36864); MAX_FS_LEVEL51*256/4096 = 2304
+#define MB_WIDTH_LUMA (16)
+#define PESN (1e-6) // desired float precision
+// Macroblock type bits (intra variants only) and a test for any of them.
+#define MB_TYPE_INTRA4x4 0x00000001
+#define MB_TYPE_INTRA16x16 0x00000002
+#define MB_TYPE_INTRA_PCM 0x00000004
+#define MB_TYPE_INTRA (MB_TYPE_INTRA4x4 | MB_TYPE_INTRA16x16 | MB_TYPE_INTRA_PCM)
+#define IS_INTRA(type) ((type)&MB_TYPE_INTRA)
+// NOTE: WELS_MAX / WELS_MIN / WELS_ABS / WELS_CLAMP evaluate their arguments more than once.
+#define WELS_MAX(x, y) ((x) > (y) ? (x) : (y))
+#define WELS_MIN(x, y) ((x) < (y) ? (x) : (y))
+#define WELS_SIGN(a) ((long_t)(a) >> 31)
+#define WELS_ABS(a) ((WELS_SIGN(a) ^ (long_t)(a)) - WELS_SIGN(a))
+#define WELS_CLAMP(x, minv, maxv) WELS_MIN(WELS_MAX(x, minv), maxv)
+// Round iInput up to a multiple of ALIGNBYTES / 2 / 4 / 8; the argument is parenthesized so expressions expand safely.
+#define ALIGNBYTES (16) /* Worst case is requiring alignment to a 16 byte boundary */
+#define WELS_ALIGN(iInput) (((iInput)+(ALIGNBYTES-1)) & ~(ALIGNBYTES-1))
+#define WELS_ALIGN2(iInput) (((iInput)+1) & ~1)
+#define WELS_ALIGN4(iInput) (((iInput)+3) & ~3)
+#define WELS_ALIGN8(iInput) (((iInput)+7) & ~7)
+// Thin wrappers around the C++ cast operators.
+#define WelsCastFromPointer(p) (reinterpret_cast<long_t>(p))
+#define WelsStaticCast(type, p) (static_cast<type>(p))
+#define WelsDynamicCast(type, p) (dynamic_cast<type>(p))
+// A method word carries the method id in bits 0-7 and a special flag in bits 8-15.
+#define GET_METHOD(x) ((x) & 0xff) // mask method as the lowest 8bits
+#define GET_SPECIAL(x) (((x) >> 8) & 0xff) // mask special flag as 8bits
+
+inline_t EMethods WelsVpGetValidMethod (int32_t a) {
+  const int32_t iLowByte = GET_METHOD (a);   // method id = low 8 bits, clamped below to (METHOD_NULL, METHOD_MASK)
+  return WelsStaticCast (EMethods, WELS_CLAMP (iLowByte, METHOD_NULL + 1, METHOD_MASK - 1));
+}
+
+// Release p via WelsFree / delete when non-null, then reset it to NULL. NOTE(review): each expands to a bare `if`, so a following `else` would bind to it.
+#define _SafeFree(p) if (p) { WelsFree(p); (p) = NULL; }
+#define _SafeDelete(p) if (p) { delete (p); (p) = NULL; }
+
+
+//////////////////////////////////////////////////////////////////////////////////////
+
+int32_t WelsStrCmp (const str_t* kpStr1, const str_t* kpStr2);
+
+
+//////////////////////////////////////////////////////////////////////////////////////
+WELSVP_NAMESPACE_END
+
+#endif
+
+
--- /dev/null
+++ b/codec/processing/src/complexityanalysis/ComplexityAnalysis.cpp
@@ -1,0 +1,304 @@
+/*!
+ * \copy
+ * Copyright (c) 2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "ComplexityAnalysis.h"
+#include "../common/cpu.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+CComplexityAnalysis::CComplexityAnalysis (int32_t iCpuFlag) {   // iCpuFlag is not consulted here
+  WelsMemset (&m_sComplexityAnalysisParam, 0, sizeof (m_sComplexityAnalysisParam));
+  m_pfGomSad = NULL;   // selected later through InitGomSadFunc()
+  m_eMethod  = METHOD_COMPLEXITY_ANALYSIS;
+}
+
+CComplexityAnalysis::~CComplexityAnalysis() {
+}
+
+EResult CComplexityAnalysis::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pRefPixMap) {
+  // Dispatch on the analysis mode held in m_sComplexityAnalysisParam.
+  // iType is not consulted.
+  switch (m_sComplexityAnalysisParam.iComplexityAnalysisMode) {
+  case FRAME_SAD:
+    AnalyzeFrameComplexityViaSad (pSrcPixMap, pRefPixMap);
+    return RET_SUCCESS;
+  case GOM_SAD:
+    AnalyzeGomComplexityViaSad (pSrcPixMap, pRefPixMap);
+    return RET_SUCCESS;
+  case GOM_VAR:
+    AnalyzeGomComplexityViaVar (pSrcPixMap, pRefPixMap);
+    return RET_SUCCESS;
+  default:
+    break;
+  }
+
+  // Any other mode value is rejected.
+  return RET_INVALIDPARAM;
+}
+
+
+EResult CComplexityAnalysis::Set (int32_t iType, void* pParam) {
+  // pParam is read as an SComplexityAnalysisParam and copied wholesale; iType is ignored.
+  SComplexityAnalysisParam* pNewParam = (SComplexityAnalysisParam*)pParam;
+  if (pNewParam == NULL)
+    return RET_INVALIDPARAM;
+
+  m_sComplexityAnalysisParam = *pNewParam;
+  return RET_SUCCESS;
+}
+
+EResult CComplexityAnalysis::Get (int32_t iType, void* pParam) {
+  // Only iFrameComplexity is written back; the caller's other fields are left untouched.
+  // iType is ignored.
+  if (pParam != NULL) {
+    SComplexityAnalysisParam* pOut = (SComplexityAnalysisParam*)pParam;
+    pOut->iFrameComplexity = m_sComplexityAnalysisParam.iFrameComplexity;
+    return RET_SUCCESS;
+  }
+
+  return RET_INVALIDPARAM;
+}
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////
+void CComplexityAnalysis::AnalyzeFrameComplexityViaSad (SPixMap* pSrcPixMap, SPixMap* pRefPixMap) {
+  // Default: take the frame SAD already present in the VAA calculation results.
+  const SVAACalcResult* pCalc = m_sComplexityAnalysisParam.pCalcResult;
+  m_sComplexityAnalysisParam.iFrameComplexity = pCalc->iFrameSad;
+
+  // With background detection (iCalcBgd) enabled, replace it with the SAD summed by
+  // GetFrameSadExcludeBackground().
+  if (m_sComplexityAnalysisParam.iCalcBgd)
+    m_sComplexityAnalysisParam.iFrameComplexity = (int32_t)GetFrameSadExcludeBackground (pSrcPixMap, pRefPixMap);
+}
+
+int32_t CComplexityAnalysis::GetFrameSadExcludeBackground (SPixMap* pSrcPixMap, SPixMap* pRefPixMap) {
+  // Sums the four 8x8 SADs of every MB that is not flagged as background (or whose reference MB type is intra)
+  // and bumps the per-GOM foreground counter. pSrcPixMap supplies only the rectangle size; pRefPixMap is unused.
+  const int32_t iMbWidth  = pSrcPixMap->sRect.iRectWidth >> 4;
+  const int32_t iMbHeight = pSrcPixMap->sRect.iRectHeight >> 4;
+  const int32_t iMbNum    = iMbWidth * iMbHeight;
+
+  // A GOM is a run of iMbNumInGom consecutive MB indices; the last one may be shorter.
+  const int32_t iMbNumInGom = m_sComplexityAnalysisParam.iMbNumInGom;
+  const int32_t iGomMbNum   = (iMbNum + iMbNumInGom - 1) / iMbNumInGom;
+
+  uint8_t* pBackgroundMbFlag = (uint8_t*)m_sComplexityAnalysisParam.pBackgroundMbFlag;
+  uint32_t* uiRefMbType = (uint32_t*)m_sComplexityAnalysisParam.uiRefMbType;
+  SVAACalcResult* pVaaCalcResults = m_sComplexityAnalysisParam.pCalcResult;
+  int32_t* pGomForegroundBlockNum = m_sComplexityAnalysisParam.pGomForegroundBlockNum;
+
+  uint32_t uiFrameSad = 0;
+  for (int32_t iGom = 0; iGom < iGomMbNum; ++iGom) {
+    const int32_t iFirstMb = iGom * iMbNumInGom;
+    const int32_t iLastMb  = WELS_MIN ((iGom + 1) * iMbNumInGom, iMbNum);
+
+    for (int32_t iMb = iFirstMb; iMb < iLastMb; ++iMb) {
+      // Skip MBs flagged as background unless their reference MB type is intra.
+      if (pBackgroundMbFlag[iMb] != 0 && !IS_INTRA (uiRefMbType[iMb]))
+        continue;
+
+      ++pGomForegroundBlockNum[iGom];
+      for (int32_t iBlk = 0; iBlk < 4; ++iBlk)
+        uiFrameSad += pVaaCalcResults->pSad8x8[iMb][iBlk];
+    }
+  }
+
+  return (uiFrameSad);
+}
+
+
+void InitGomSadFunc (PGOMSadFunc& pfGomSad, uint8_t iCalcBgd) {
+  // Select the per-macroblock SAD accumulator according to iCalcBgd:
+  //   non-zero -> GomSampleSadExceptBackground (skips MBs whose background flag is set)
+  //   zero     -> GomSampleSad (accumulates every MB)
+  PGOMSadFunc pfChosen = iCalcBgd ? GomSampleSadExceptBackground : GomSampleSad;
+  pfGomSad = pfChosen;
+}
+
+void GomSampleSad (uint32_t* pGomSad, int32_t* pGomForegroundBlockNum, int32_t* pSad8x8, uint8_t pBackgroundMbFlag) {
+  // Unconditional variant: pBackgroundMbFlag is ignored and every macroblock is counted.
+  ++ (*pGomForegroundBlockNum);
+  for (int32_t iBlk = 0; iBlk < 4; ++iBlk) {   // the four 8x8 SAD entries of one macroblock
+    *pGomSad += pSad8x8[iBlk];
+  }
+}
+
+void GomSampleSadExceptBackground (uint32_t* pGomSad, int32_t* pGomForegroundBlockNum, int32_t* pSad8x8,
+                                   uint8_t pBackgroundMbFlag) {
+  // Macroblocks with a non-zero background flag contribute nothing.
+  if (pBackgroundMbFlag != 0)
+    return;
+
+  ++ (*pGomForegroundBlockNum);
+  for (int32_t iBlk = 0; iBlk < 4; ++iBlk)
+    *pGomSad += pSad8x8[iBlk];
+}
+// Per-GOM SAD complexity: fills pGomComplexity[] and pGomForegroundBlockNum[] and stores the frame total in iFrameComplexity.
+void CComplexityAnalysis::AnalyzeGomComplexityViaSad (SPixMap* pSrcPixMap, SPixMap* pRefPixMap) {
+ int32_t iWidth = pSrcPixMap->sRect.iRectWidth;
+ int32_t iHeight = pSrcPixMap->sRect.iRectHeight;
+ int32_t iMbWidth = iWidth >> 4;
+ int32_t iMbHeight = iHeight >> 4;
+ int32_t iMbNum = iMbWidth * iMbHeight;
+ // A GOM is a run of iMbNumInGom consecutive MB indices; the last one may be shorter.
+ int32_t iMbNumInGom = m_sComplexityAnalysisParam.iMbNumInGom;
+ int32_t iGomMbNum = (iMbNum + iMbNumInGom - 1) / iMbNumInGom;
+
+ int32_t iGomMbStartIndex = 0, iGomMbEndIndex = 0, iGomMbRowNum = 0;
+ int32_t iMbStartIndex = 0, iMbEndIndex = 0;
+ int32_t iStartSampleIndex = 0;
+
+ uint8_t* pBackgroundMbFlag = (uint8_t*)m_sComplexityAnalysisParam.pBackgroundMbFlag;
+ uint32_t* uiRefMbType = (uint32_t*)m_sComplexityAnalysisParam.uiRefMbType;
+ SVAACalcResult* pVaaCalcResults = m_sComplexityAnalysisParam.pCalcResult;
+ int32_t* pGomForegroundBlockNum = (int32_t*)m_sComplexityAnalysisParam.pGomForegroundBlockNum;
+ int32_t* pGomComplexity = (int32_t*)m_sComplexityAnalysisParam.pGomComplexity;
+ // NOTE(review): pRefTmp and pCurTmp are assigned but never read, so pRefY/pSrcY, the strides and iStartSampleIndex do not affect the result.
+ uint8_t* pRefY = NULL, *pSrcY = NULL;
+ int32_t iRefStride = 0, iCurStride = 0;
+
+ uint8_t* pRefTmp = NULL, *pCurTmp = NULL;
+ uint32_t uiGomSad = 0, uiFrameSad = 0;
+
+ pRefY = (uint8_t*)pRefPixMap->pPixel[0];
+ pSrcY = (uint8_t*)pSrcPixMap->pPixel[0];
+
+ iRefStride = pRefPixMap->iStride[0];
+ iCurStride = pSrcPixMap->iStride[0];
+ // Pick the accumulator: the background-skipping variant when iCalcBgd is non-zero.
+ InitGomSadFunc (m_pfGomSad, m_sComplexityAnalysisParam.iCalcBgd);
+
+ for (int32_t j = 0; j < iGomMbNum; j ++) {
+ uiGomSad = 0;
+
+ iGomMbStartIndex = j * iMbNumInGom;
+ iGomMbEndIndex = WELS_MIN ((j + 1) * iMbNumInGom, iMbNum);
+ iGomMbRowNum = (iGomMbEndIndex + iMbWidth - 1) / iMbWidth - iGomMbStartIndex / iMbWidth;
+ // The first pass runs from the GOM start to the end of that MB row (or the GOM end, if sooner).
+ iMbStartIndex = iGomMbStartIndex;
+ iMbEndIndex = WELS_MIN ((iMbStartIndex / iMbWidth + 1) * iMbWidth, iGomMbEndIndex);
+
+ iStartSampleIndex = (iMbStartIndex / iMbWidth) * MB_WIDTH_LUMA * iRefStride + (iMbStartIndex % iMbWidth) *
+ MB_WIDTH_LUMA;
+
+ do {
+ pRefTmp = pRefY + iStartSampleIndex;
+ pCurTmp = pSrcY + iStartSampleIndex;
+ // An MB is passed as background only if it is flagged and its reference MB type is not intra.
+ for (int32_t i = iMbStartIndex; i < iMbEndIndex; i ++) {
+ m_pfGomSad (&uiGomSad, pGomForegroundBlockNum + j, pVaaCalcResults->pSad8x8[i], pBackgroundMbFlag[i]
+ && !IS_INTRA (uiRefMbType[i]));
+ }
+ // Advance to the next MB row of this GOM, clipped to the GOM end.
+ iMbStartIndex = iMbEndIndex;
+ iMbEndIndex = WELS_MIN (iMbEndIndex + iMbWidth , iGomMbEndIndex);
+
+ iStartSampleIndex = (iMbStartIndex / iMbWidth) * MB_WIDTH_LUMA * iRefStride + (iMbStartIndex % iMbWidth) *
+ MB_WIDTH_LUMA;
+
+ } while (--iGomMbRowNum);
+ // Record this GOM's SAD and add it to the frame total.
+ pGomComplexity[j] = uiGomSad;
+ uiFrameSad += pGomComplexity[j];
+ }
+
+ m_sComplexityAnalysisParam.iFrameComplexity = uiFrameSad;
+}
+
+// Per-GOM luma variance: pGomComplexity[j] = sum(x^2) - sum(x)^2 / N, from the per-MB pSum16x16 / pSumOfSquare16x16 tables.
+void CComplexityAnalysis::AnalyzeGomComplexityViaVar (SPixMap* pSrcPixMap, SPixMap* pRefPixMap) {
+  int32_t iWidth = pSrcPixMap->sRect.iRectWidth;
+  int32_t iHeight = pSrcPixMap->sRect.iRectHeight;
+  int32_t iMbWidth = iWidth >> 4;
+  int32_t iMbHeight = iHeight >> 4;
+  int32_t iMbNum = iMbWidth * iMbHeight;
+  // A GOM is a run of iMbNumInGom consecutive MB indices; the last one may be shorter.
+  int32_t iMbNumInGom = m_sComplexityAnalysisParam.iMbNumInGom;
+  int32_t iGomMbNum = (iMbNum + iMbNumInGom - 1) / iMbNumInGom;
+  int32_t iGomSampleNum = 0;
+
+  int32_t iGomMbStartIndex = 0, iGomMbEndIndex = 0, iGomMbRowNum = 0;
+  int32_t iMbStartIndex = 0, iMbEndIndex = 0;
+  int32_t iStartSampleIndex = 0;
+
+  SVAACalcResult* pVaaCalcResults = m_sComplexityAnalysisParam.pCalcResult;
+  int32_t* pGomComplexity = (int32_t*)m_sComplexityAnalysisParam.pGomComplexity;
+  // NOTE(review): pSrcY, iCurStride, pCurTmp and iStartSampleIndex below never influence the result.
+  uint8_t* pSrcY = NULL;
+  int32_t iCurStride = 0;
+
+  uint8_t* pCurTmp = NULL;
+  uint32_t uiSampleSum = 0, uiSquareSum = 0;
+
+  pSrcY = (uint8_t*)pSrcPixMap->pPixel[0];
+  iCurStride = pSrcPixMap->iStride[0];
+
+  for (int32_t j = 0; j < iGomMbNum; j ++) {
+    uiSampleSum = 0;
+    uiSquareSum = 0;
+
+    iGomMbStartIndex = j * iMbNumInGom;
+    iGomMbEndIndex = WELS_MIN ((j + 1) * iMbNumInGom, iMbNum);
+    iGomMbRowNum = (iGomMbEndIndex + iMbWidth - 1) / iMbWidth - iGomMbStartIndex / iMbWidth;
+
+    iMbStartIndex = iGomMbStartIndex;
+    iMbEndIndex = WELS_MIN ((iMbStartIndex / iMbWidth + 1) * iMbWidth, iGomMbEndIndex);
+    // N must cover every MB of the GOM, because the sums below span all of its MB rows (not just the first).
+    iGomSampleNum = (iGomMbEndIndex - iGomMbStartIndex) * MB_WIDTH_LUMA * MB_WIDTH_LUMA;
+    iStartSampleIndex = (iMbStartIndex / iMbWidth) * MB_WIDTH_LUMA * iCurStride + (iMbStartIndex % iMbWidth) *
+                        MB_WIDTH_LUMA;
+
+    do {
+      pCurTmp = pSrcY + iStartSampleIndex;
+
+      for (int32_t i = iMbStartIndex; i < iMbEndIndex; i ++) {
+        uiSampleSum += pVaaCalcResults->pSum16x16[i];
+        uiSquareSum += pVaaCalcResults->pSumOfSquare16x16[i];
+      }
+
+      iMbStartIndex = iMbEndIndex;
+      iMbEndIndex = WELS_MIN (iMbEndIndex + iMbWidth, iGomMbEndIndex);
+
+      iStartSampleIndex = (iMbStartIndex / iMbWidth) * MB_WIDTH_LUMA * iCurStride + (iMbStartIndex % iMbWidth) *
+                          MB_WIDTH_LUMA;
+    } while (--iGomMbRowNum);
+    // Square in 64 bits: uiSampleSum * uiSampleSum wraps a 32-bit unsigned once the sum exceeds 65535.
+    pGomComplexity[j] = uiSquareSum - (uint32_t) ((uint64_t)uiSampleSum * uiSampleSum / iGomSampleNum);
+  }
+}
+
+
+WELSVP_NAMESPACE_END
--- /dev/null
+++ b/codec/processing/src/complexityanalysis/ComplexityAnalysis.h
@@ -1,0 +1,83 @@
+/*!
+ * \copy
+ * Copyright (c) 2011-2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+* \file : ComplexityAnalysis.h
+*
+* \brief : complexity analysis class of wels video processor class
+*
+* \date : 2011/03/28
+*
+* \description : 1. rewrite the package code of complexity analysis class
+*
+*************************************************************************************
+*/
+
+#ifndef WELSVP_COMPLEXITYANALYSIS_H
+#define WELSVP_COMPLEXITYANALYSIS_H
+
+#include "../common/util.h"
+#include "../common/memory.h"
+#include "../common/WelsFrameWork.h"
+#include "../../interface/IWelsVP.h"
+
+WELSVP_NAMESPACE_BEGIN
+// Signature shared by the per-macroblock SAD accumulators used by CComplexityAnalysis.
+typedef void (GOMSadFunc) (uint32_t* pGomSad, int32_t* pGomForegroundBlockNum, int32_t* pSad8x8,
+ uint8_t pBackgroundMbFlag);
+// Pointer form, as stored in CComplexityAnalysis::m_pfGomSad.
+typedef GOMSadFunc* PGOMSadFunc;
+// The two accumulators: one counts every MB, the other skips MBs whose background flag is non-zero.
+GOMSadFunc GomSampleSad;
+GOMSadFunc GomSampleSadExceptBackground;
+// IStrategy implementation that measures frame / GOM complexity from the parameters passed to Set().
+class CComplexityAnalysis : public IStrategy {
+ public:
+ CComplexityAnalysis (int32_t iCpuFlag);
+ ~CComplexityAnalysis();
+ // Process dispatches on iComplexityAnalysisMode; Set copies the parameter block in; Get reports iFrameComplexity.
+ EResult Process (int32_t iType, SPixMap* pSrc, SPixMap* pRef);
+ EResult Set (int32_t iType, void* pParam);
+ EResult Get (int32_t iType, void* pParam);
+
+ private:
+ void AnalyzeFrameComplexityViaSad (SPixMap* pSrc, SPixMap* pRef);
+ int32_t GetFrameSadExcludeBackground (SPixMap* pSrc, SPixMap* pRef);
+
+ void AnalyzeGomComplexityViaSad (SPixMap* pSrc, SPixMap* pRef);
+ void AnalyzeGomComplexityViaVar (SPixMap* pSrc, SPixMap* pRef);
+
+ private:
+ PGOMSadFunc m_pfGomSad;
+ SComplexityAnalysisParam m_sComplexityAnalysisParam;
+};
+
+WELSVP_NAMESPACE_END
+
+#endif
--- /dev/null
+++ b/codec/processing/src/denoise/denoise.cpp
@@ -1,0 +1,124 @@
+/*!
+ * \copy
+ * Copyright (c) 2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "denoise.h"
+#include "../common/cpu.h"
+
+WELSVP_NAMESPACE_BEGIN
+// Bytes per row for iWidth pixels of iBitcount bits each, with the bit count rounded up to a multiple of 32.
+#define CALC_BI_STRIDE(iWidth, iBitcount) ((((iWidth) * (iBitcount) + 31) & ~31) >> 3)
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+CDenoiser::CDenoiser (int32_t iCpuFlag) {
+  m_CPUFlag = iCpuFlag;
+  m_eMethod = METHOD_DENOISE;
+  WelsMemset (&m_pfDenoise, 0, sizeof (m_pfDenoise));
+  m_pGreyWeightTable = NULL;   // no table is allocated here; keep the pointer in a defined state
+  m_uiFilterWindow = 0;        // not read by the member functions in this file; keep it defined
+  m_uiSpaceRadius = DENOISE_GRAY_RADIUS;
+  m_fSigmaGrey = DENOISE_GRAY_SIGMA;
+  m_uiType = DENOISE_ALL_COMPONENT;
+  InitDenoiseFunc (m_pfDenoise, m_CPUFlag);   // installs the C (or, when available, SSE2) filter pointers
+}
+
+CDenoiser::~CDenoiser() {}
+
+void CDenoiser::InitDenoiseFunc (SDenoiseFuncs& denoiser, int32_t iCpuFlag) {
+  denoiser.pfWaverageChromaFilter8 = WaverageChromaFilter8_c;   // portable C defaults
+  denoiser.pfBilateralLumaFilter8  = BilateralLumaFilter8_c;
+#ifdef X86_ASM
+  if ((iCpuFlag & WELS_CPU_SSE2) != 0) {   // SSE2 reported: switch both filters to the SSE2 versions
+    denoiser.pfWaverageChromaFilter8 = WaverageChromaFilter8_sse2;
+    denoiser.pfBilateralLumaFilter8  = BilateralLumaFilter8_sse2;
+  }
+#endif
+}
+
+EResult CDenoiser::Process (int32_t iType, SPixMap* pSrc, SPixMap* dst) {
+  // Denoises, in place, the planes of pSrc selected by m_uiType; iType and dst are not used.
+  // Returns RET_INVALIDPARAM when pSrc or any of its three plane pointers is NULL.
+  if (pSrc == NULL || pSrc->pPixel[0] == NULL || pSrc->pPixel[1] == NULL || pSrc->pPixel[2] == NULL) {
+    return RET_INVALIDPARAM;
+  }
+
+  uint8_t* pSrcY = (uint8_t*)pSrc->pPixel[0];
+  uint8_t* pSrcU = (uint8_t*)pSrc->pPixel[1];
+  uint8_t* pSrcV = (uint8_t*)pSrc->pPixel[2];
+  int32_t iWidthY = pSrc->sRect.iRectWidth;
+  int32_t iHeightY = pSrc->sRect.iRectHeight;
+  int32_t iWidthUV = iWidthY >> 1;    // chroma planes are processed at half the luma width and height
+  int32_t iHeightUV = iHeightY >> 1;
+
+  if (m_uiType & DENOISE_Y_COMPONENT)
+    BilateralDenoiseLuma (pSrcY, iWidthY, iHeightY, pSrc->iStride[0]);
+  if (m_uiType & DENOISE_U_COMPONENT)
+    WaverageDenoiseChroma (pSrcU, iWidthUV, iHeightUV, pSrc->iStride[1]);
+  if (m_uiType & DENOISE_V_COMPONENT)
+    WaverageDenoiseChroma (pSrcV, iWidthUV, iHeightUV, pSrc->iStride[2]);
+
+  return RET_SUCCESS;
+}
+
+void CDenoiser::BilateralDenoiseLuma (uint8_t* pSrcY, int32_t iWidth, int32_t iHeight, int32_t iStride) {
+  // Filters the plane in place, leaving a border of m_uiSpaceRadius pixels on every side untouched.
+  int32_t w;
+  pSrcY = pSrcY + m_uiSpaceRadius * iStride;
+  for (int32_t h = m_uiSpaceRadius; h < iHeight - m_uiSpaceRadius; h++) {
+    for (w = m_uiSpaceRadius; w < iWidth - m_uiSpaceRadius - TAIL_OF_LINE8; w += 8)
+      m_pfDenoise.pfBilateralLumaFilter8 (pSrcY + w, iStride);
+    // Resume where the 8-wide loop stopped; adding TAIL_OF_LINE8 to w here would always overshoot and skip the tail.
+    for (; w < iWidth - m_uiSpaceRadius; w++) {
+      Gauss3x3Filter (pSrcY + w, iStride);
+    }
+    pSrcY += iStride;
+  }
+}
+
+void CDenoiser::WaverageDenoiseChroma (uint8_t* pSrcUV, int32_t iWidth, int32_t iHeight, int32_t iStride) {
+  // Filters one chroma plane in place, leaving a border of UV_WINDOWS_RADIUS pixels on every side untouched.
+  int32_t w;
+  pSrcUV = pSrcUV + UV_WINDOWS_RADIUS * iStride;
+  for (int32_t h = UV_WINDOWS_RADIUS; h < iHeight - UV_WINDOWS_RADIUS; h++) {
+    for (w = UV_WINDOWS_RADIUS; w < iWidth - UV_WINDOWS_RADIUS - TAIL_OF_LINE8; w += 8)
+      m_pfDenoise.pfWaverageChromaFilter8 (pSrcUV + w, iStride);
+    // Resume where the 8-wide loop stopped. Adding TAIL_OF_LINE8 to w here would always
+    // overshoot the bound, so the remaining columns of the row would never be filtered.
+    for (; w < iWidth - UV_WINDOWS_RADIUS; w++) {
+      Gauss3x3Filter (pSrcUV + w, iStride);
+    }
+    pSrcUV += iStride;
+  }
+}
+
+
+WELSVP_NAMESPACE_END
--- /dev/null
+++ b/codec/processing/src/denoise/denoise.h
@@ -1,0 +1,111 @@
+/*!
+ * \copy
+ * Copyright (c) 2011-2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file : denoise.h
+ *
+ * \brief : denoise class of wels video processor class
+ *
+ * \date : 2011/03/15
+ *
+ * \description : 1. rewrite the package code of denoise class
+ *
+ *************************************************************************************
+ */
+
+#ifndef WELSVP_DENOISE_H
+#define WELSVP_DENOISE_H
+
+#include "../common/util.h"
+#include "../common/memory.h"
+#include "../common/WelsFrameWork.h"
+#include "../../interface/IWelsVP.h"
+
+// Defaults loaded by the CDenoiser constructor: luma filter radius and grey-level sigma.
+#define DENOISE_GRAY_RADIUS (1)
+#define DENOISE_GRAY_SIGMA (2)
+// Border width skipped by the chroma pass, and the extra columns an 8-wide filter call needs past its start.
+#define UV_WINDOWS_RADIUS (2)
+#define TAIL_OF_LINE8 (7)
+// Bit flags for m_uiType selecting which planes CDenoiser::Process() filters.
+#define DENOISE_Y_COMPONENT (1)
+#define DENOISE_U_COMPONENT (2)
+#define DENOISE_V_COMPONENT (4)
+#define DENOISE_ALL_COMPONENT (7)
+
+
+WELSVP_NAMESPACE_BEGIN
+// Single-pixel 3x3 filter; CDenoiser uses it for the columns left over after the 8-wide filters.
+void Gauss3x3Filter (uint8_t* pixels, int32_t stride);
+// Common signature of the denoise kernels: a pointer into the plane plus the row stride.
+typedef void (DenoiseFilterFunc) (uint8_t* pixels, int32_t stride);
+
+typedef DenoiseFilterFunc* DenoiseFilterFuncPtr;
+// Portable C kernels, always available.
+DenoiseFilterFunc BilateralLumaFilter8_c;
+DenoiseFilterFunc WaverageChromaFilter8_c;
+// SSE2 kernels, declared with C linkage and only when X86_ASM is defined.
+#ifdef X86_ASM
+WELSVP_EXTERN_C_BEGIN
+DenoiseFilterFunc BilateralLumaFilter8_sse2 ;
+DenoiseFilterFunc WaverageChromaFilter8_sse2 ;
+WELSVP_EXTERN_C_END
+#endif
+// Kernel pointers chosen by CDenoiser::InitDenoiseFunc().
+typedef struct TagDenoiseFuncs {
+ DenoiseFilterFuncPtr pfBilateralLumaFilter8;//on 8 samples
+ DenoiseFilterFuncPtr pfWaverageChromaFilter8;//on 8 samples
+} SDenoiseFuncs;
+// IStrategy implementation that denoises the Y/U/V planes of a picture in place.
+class CDenoiser : public IStrategy {
+ public:
+ CDenoiser (int32_t iCpuFlag);
+ ~CDenoiser();
+ // Filters the planes of pSrc selected by m_uiType; the implementation does not use iType or dst.
+ EResult Process (int32_t iType, SPixMap* pSrc, SPixMap* dst);
+
+ private:
+ void InitDenoiseFunc (SDenoiseFuncs& pf, int32_t cpu);
+ void BilateralDenoiseLuma (uint8_t* p_y_data, int32_t width, int32_t height, int32_t stride);
+ void WaverageDenoiseChroma (uint8_t* pSrcUV, int32_t width, int32_t height, int32_t stride);
+ // NOTE(review): m_uiFilterWindow and m_pGreyWeightTable are not read by the member functions in denoise.cpp.
+ private:
+ float_t m_fSigmaGrey; //sigma for grey scale similarity, suggestion 2.5-3
+ uint32_t m_uiFilterWindow; //filter window diameter
+ uint16_t m_uiSpaceRadius; //filter windows radius: 1-3x3, 2-5x5,3-7x7. Larger size, slower speed
+ uint16_t m_uiType; //do denoising on which component 1-Y, 2-U, 4-V; 7-YUV, 3-YU, 5-YV, 6-UV
+ uint32_t* m_pGreyWeightTable; //weight table for grey scale
+
+ SDenoiseFuncs m_pfDenoise;
+ int32_t m_CPUFlag;
+};
+
+WELSVP_NAMESPACE_END
+
+#endif
--- /dev/null
+++ b/codec/processing/src/denoise/denoise_filter.cpp
@@ -1,0 +1,127 @@
+/*!
+ * \copy
+ * Copyright (c) 2010-2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file denoise_filter.cpp
+ *
+ * \brief svc denoising
+ *
+ * \date 4/1/2010 Created
+ *
+ */
+
+#include "denoise.h"
+#include "../common/typedef.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+void BilateralLumaFilter8_c (uint8_t* pSample, int32_t iStride) {
+  // Filters the 8 luma samples starting at pSample with a 3x3 window whose
+  // neighbour weights fall off with the grey-level distance to the centre.
+  // Results are staged in aFiltered and written back in one go, so no sample
+  // of this run is computed from an already-filtered neighbour in its row.
+  uint8_t aFiltered[8];
+
+  for (int32_t iIdx = 0; iIdx < 8; iIdx++) {
+    uint8_t* pCentre = pSample + iIdx;
+    const int32_t kiCentre = *pCentre;
+    // top-left corner of the window around the current sample
+    uint8_t* pRow = pCentre - iStride - DENOISE_GRAY_RADIUS;
+    int32_t iWeightedSum = 0;
+    int32_t iNeighbourWeight = 0;
+    for (int32_t iRow = 0; iRow < 3; iRow++, pRow += iStride) {
+      for (int32_t iCol = 0; iCol < 3; iCol++) {
+        if (iRow == 1 && iCol == 1)
+          continue; // the centre receives the left-over weight below
+        const int32_t kiNeighbour = pRow[iCol];
+        const int32_t kiDistance = WELS_ABS (kiNeighbour - kiCentre);
+        if (kiDistance > 32)
+          continue; // too different from the centre: contributes nothing
+        const int32_t kiWeight = ((32 - kiDistance) * (32 - kiDistance)) >> 5;
+        iWeightedSum += kiNeighbour * kiWeight;
+        iNeighbourWeight += kiWeight;
+      }
+    }
+    // weights are normalised to 256, the centre takes whatever remains
+    iWeightedSum += kiCentre * (256 - iNeighbourWeight);
+    aFiltered[iIdx] = iWeightedSum >> 8;
+  }
+  WelsMemcpy (pSample, aFiltered, 8);
+}
+
+
+/***************************************************************************
+5x5 filter:
+1 1 2 1 1
+1 2 4 2 1
+2 4 20 4 2
+1 2 4 2 1
+1 1 2 1 1
+***************************************************************************/
+#define SUM_LINE1(pSample) (pSample[0] +(pSample[1]) +(pSample[2]<<1) + pSample[3] + pSample[4])
+#define SUM_LINE2(pSample) (pSample[0] +(pSample[1]<<1) +(pSample[2]<<2) +(pSample[3]<<1) +pSample[4])
+#define SUM_LINE3(pSample) ((pSample[0]<<1) +(pSample[1]<<2) +(pSample[2]*20) +(pSample[3]<<2) +(pSample[4]<<1))
+void WaverageChromaFilter8_c (uint8_t* pSample, int32_t iStride) {
+  // Smooths the 8 chroma samples starting at pSample with the fixed 5x5 kernel
+  // built from the SUM_LINE macros (weights add up to 64, hence the >> 6).
+  // Output is staged in aFiltered and written back once all 8 are computed.
+  uint8_t aFiltered[8];
+  // pRow[0..4]: left edge of the five window rows belonging to sample 0
+  uint8_t* pRow[5];
+  pRow[0] = pSample - UV_WINDOWS_RADIUS * iStride - UV_WINDOWS_RADIUS;
+  for (int32_t iRow = 1; iRow < 5; iRow++)
+    pRow[iRow] = pRow[iRow - 1] + iStride;
+
+  for (int32_t iIdx = 0; iIdx < 8; iIdx++) {
+    const int32_t kiOuter = SUM_LINE1 ((pRow[0] + iIdx)) + SUM_LINE1 ((pRow[4] + iIdx));
+    const int32_t kiInner = SUM_LINE2 ((pRow[1] + iIdx)) + SUM_LINE2 ((pRow[3] + iIdx));
+    aFiltered[iIdx] = (kiOuter + kiInner + SUM_LINE3 ((pRow[2] + iIdx))) >> 6;
+  }
+  WelsMemcpy (pSample, aFiltered, 8);
+}
+
+/***************************************************************************
+edge of y/uv use a 3x3 Gauss filter, radius = 1:
+1 2 1
+2 4 2
+1 2 1
+***************************************************************************/
+void Gauss3x3Filter (uint8_t* pSrc, int32_t iStride) {
+  // Replaces *pSrc with the weighted average of itself and its eight
+  // neighbours: corners x1, edge neighbours x2, centre x4 (weights total 16).
+  const uint8_t* pAbove = pSrc - iStride;
+  const uint8_t* pBelow = pSrc + iStride;
+  const int32_t kiCorners = pAbove[-1] + pAbove[1] + pBelow[-1] + pBelow[1];
+  const int32_t kiEdges = pAbove[0] + pSrc[-1] + pSrc[1] + pBelow[0];
+
+  // doubling the edge sum once equals doubling each edge sample
+  *pSrc = (kiCorners + (kiEdges << 1) + (pSrc[0] << 2)) >> 4;
+}
+
+WELSVP_NAMESPACE_END
--- /dev/null
+++ b/codec/processing/src/downsample/downsample.cpp
@@ -1,0 +1,135 @@
+/*!
+ * \copy
+ * Copyright (c) 2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "downsample.h"
+#include "../common/cpu.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+CDownsampling::CDownsampling (int32_t iCpuFlag) {
+ m_iCPUFlag = iCpuFlag; // remembered and handed to InitDownsampleFuncs below
+ m_eMethod = METHOD_DOWNSAMPLE; // not declared in CDownsampling itself -- presumably inherited from IStrategy
+ WelsMemset (&m_pfDownsample, 0, sizeof (m_pfDownsample)); // clear the function table before it is populated
+ InitDownsampleFuncs (m_pfDownsample, m_iCPUFlag);
+}
+
+CDownsampling::~CDownsampling() { // empty: nothing is released here
+}
+
+void CDownsampling::InitDownsampleFuncs (SDownsampleFuncs& sDownsampleFunc, int32_t iCpuFlag) {
+  // every width-alignment slot starts out on the portable C dyadic downsampler
+  for (int32_t iSlot = 0; iSlot < 4; iSlot++)
+    sDownsampleFunc.pfHalfAverage[iSlot] = DyadicBilinearDownsampler_c;
+  sDownsampleFunc.pfGeneralRatioLuma = GeneralBilinearFastDownsampler_c;
+  sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsampler_c;
+#if defined(X86_ASM)
+  // All SIMD overrides below are commented out, so the C routines registered
+  // above stay in place whatever iCpuFlag reports.
+  if (iCpuFlag & WELS_CPU_SSE) {
+    /* sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsamplerWidthx32_sse;
+    sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsamplerWidthx16_sse;
+    sDownsampleFunc.pfHalfAverage[2] = DyadicBilinearDownsamplerWidthx8_sse;*/
+  }
+  if (iCpuFlag & WELS_CPU_SSE2) {
+    // sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_sse2;
+    // sDownsampleFunc.pfGeneralRatioLuma = GeneralBilinearFastDownsamplerWrap_sse2;
+  }
+  if (iCpuFlag & WELS_CPU_SSSE3) {
+    // sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsamplerWidthx32_ssse3;
+    // sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsamplerWidthx16_ssse3;
+  }
+  if (iCpuFlag & WELS_CPU_SSE41) {
+    // sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsamplerWidthx32_sse4;
+    // sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsamplerWidthx16_sse4;
+  }
+#endif//X86_ASM
+}
+
+EResult CDownsampling::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pDstPixMap) {
+  // Scales the three planes of pSrcPixMap down into pDstPixMap. Plane 0 uses the
+  // rectangle size as is; planes 1 and 2 use half of it in each direction.
+  // iType is not consulted. The destination must be strictly smaller than the
+  // source in both dimensions, otherwise RET_INVALIDPARAM is returned.
+  const int32_t kiSrcLumaW = pSrcPixMap->sRect.iRectWidth;
+  const int32_t kiSrcLumaH = pSrcPixMap->sRect.iRectHeight;
+  const int32_t kiDstLumaW = pDstPixMap->sRect.iRectWidth;
+  const int32_t kiDstLumaH = pDstPixMap->sRect.iRectHeight;
+
+  if (kiSrcLumaW <= kiDstLumaW || kiSrcLumaH <= kiDstLumaH)
+    return RET_INVALIDPARAM;
+
+  // an exact 2:1 reduction goes through the half-average routines
+  const bool kbDyadic = (kiSrcLumaW >> 1) == kiDstLumaW && (kiSrcLumaH >> 1) == kiDstLumaH;
+
+  for (int32_t iPlane = 0; iPlane < 3; iPlane++) {
+    const int32_t kiShift = (iPlane == 0) ? 0 : 1; // planes 1 and 2 are half size
+    const int32_t kiSrcW = kiSrcLumaW >> kiShift;
+    const int32_t kiSrcH = kiSrcLumaH >> kiShift;
+    const int32_t kiDstW = kiDstLumaW >> kiShift;
+    const int32_t kiDstH = kiDstLumaH >> kiShift;
+    uint8_t* pDst = (uint8_t*)pDstPixMap->pPixel[iPlane];
+    uint8_t* pSrc = (uint8_t*)pSrcPixMap->pPixel[iPlane];
+    const int32_t kiDstStride = pDstPixMap->iStride[iPlane];
+    const int32_t kiSrcStride = pSrcPixMap->iStride[iPlane];
+
+    if (kbDyadic) {
+      // the routine is picked by how the source width of this plane is aligned
+      m_pfDownsample.pfHalfAverage[GetAlignedIndex (kiSrcW)] (pDst, kiDstStride, pSrc, kiSrcStride, kiSrcW, kiSrcH);
+    } else if (iPlane == 0) {
+      m_pfDownsample.pfGeneralRatioLuma (pDst, kiDstStride, kiDstW, kiDstH,
+                                         pSrc, kiSrcStride, kiSrcW, kiSrcH);
+    } else {
+      m_pfDownsample.pfGeneralRatioChroma (pDst, kiDstStride, kiDstW, kiDstH,
+                                           pSrc, kiSrcStride, kiSrcW, kiSrcH);
+    }
+  }
+  return RET_SUCCESS;
+}
+
+int32_t CDownsampling::GetAlignedIndex (const int32_t kiSrcWidth) {
+  // Returns the pfHalfAverage slot matching the alignment of kiSrcWidth:
+  //   0 - multiple of 32, 1 - multiple of 16, 2 - multiple of 8,
+  //   3 - anything else.
+  // The masks are ordered from the strictest alignment to the loosest, so
+  // the first one that leaves no remainder decides the slot.
+  const int32_t kiAlignMask[3] = { 0x1f, 0x0f, 0x07 };
+  int32_t iSlot = 0;
+  while (iSlot < 3 && (kiSrcWidth & kiAlignMask[iSlot]) != 0)
+    ++iSlot;
+  return iSlot;
+}
+
+
+WELSVP_NAMESPACE_END
--- /dev/null
+++ b/codec/processing/src/downsample/downsample.h
@@ -1,0 +1,128 @@
+/*!
+ * \copy
+ * Copyright (c) 2011-2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file : downsample.h
+ *
+ * \brief : downsample class of wels video processor class
+ *
+ * \date : 2011/03/33
+ *
+ * \description : 1. rewrite the package code of downsample class
+ *
+ *************************************************************************************
+ */
+
+#ifndef WELSVP_DOWNSAMPLE_H
+#define WELSVP_DOWNSAMPLE_H
+
+#include "../common/util.h"
+#include "../common/WelsFrameWork.h"
+#include "../../interface/IWelsVP.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+
+typedef void (HalveDownsampleFunc) (uint8_t* pDst, const int32_t kiDstStride, // signature of the exact 2:1 downsamplers
+ uint8_t* pSrc, const int32_t kiSrcStride,
+ const int32_t kiSrcWidth, const int32_t kiSrcHeight);
+
+typedef void (GeneralDownsampleFunc) (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, // signature of the arbitrary-ratio downsamplers
+ const int32_t kiDstHeight,
+ uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight);
+
+typedef HalveDownsampleFunc* PHalveDownsampleFunc;
+typedef GeneralDownsampleFunc* PGeneralDownsampleFunc;
+
+HalveDownsampleFunc DyadicBilinearDownsampler_c; // portable C implementations
+GeneralDownsampleFunc GeneralBilinearFastDownsampler_c;
+GeneralDownsampleFunc GeneralBilinearAccurateDownsampler_c;
+
+typedef struct {
+ // indexed by source-width alignment: 0 = multiple of 32; 1 = multiple of 16; 2 = multiple of 8; 3 = any other width
+ PHalveDownsampleFunc pfHalfAverage[4];
+ PGeneralDownsampleFunc pfGeneralRatioLuma; // arbitrary-ratio routine applied to plane 0
+ PGeneralDownsampleFunc pfGeneralRatioChroma; // arbitrary-ratio routine applied to planes 1 and 2
+} SDownsampleFuncs;
+
+
+#ifdef X86_ASM
+WELSVP_EXTERN_C_BEGIN
+// used when the source width is a multiple of 8 pixels
+HalveDownsampleFunc DyadicBilinearDownsamplerWidthx8_sse;
+// source width is a multiple of 16 pixels
+HalveDownsampleFunc DyadicBilinearDownsamplerWidthx16_sse;
+// source width is a multiple of 32 pixels
+HalveDownsampleFunc DyadicBilinearDownsamplerWidthx32_sse;
+// used when the source width is a multiple of 16 pixels
+HalveDownsampleFunc DyadicBilinearDownsamplerWidthx16_ssse3;
+// source width is a multiple of 32 pixels
+HalveDownsampleFunc DyadicBilinearDownsamplerWidthx32_ssse3;
+// source width is a multiple of 16 pixels
+HalveDownsampleFunc DyadicBilinearDownsamplerWidthx16_sse4;
+// source width is a multiple of 32 pixels
+HalveDownsampleFunc DyadicBilinearDownsamplerWidthx32_sse4;
+
+GeneralDownsampleFunc GeneralBilinearFastDownsamplerWrap_sse2; // same signature as the C routines
+GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_sse2;
+
+void GeneralBilinearFastDownsampler_sse2 (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, // takes two extra scale arguments
+ const int32_t kiDstHeight,
+ uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight,
+ const uint32_t kuiScaleX, const uint32_t kuiScaleY);
+void GeneralBilinearAccurateDownsampler_sse2 (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, // takes two extra scale arguments
+ const int32_t kiDstHeight,
+ uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight,
+ const uint32_t kuiScaleX, const uint32_t kuiScaleY);
+WELSVP_EXTERN_C_END
+#endif
+
+
+
+
+class CDownsampling : public IStrategy { // downsampling strategy built on the SDownsampleFuncs table
+ public:
+ CDownsampling (int32_t iCpuFlag); // stores iCpuFlag and populates the function table
+ ~CDownsampling();
+
+ EResult Process (int32_t iType, SPixMap* pSrc, SPixMap* pDst); // downsamples pSrc into pDst; RET_INVALIDPARAM unless pDst is smaller in both dimensions
+
+ private:
+ void InitDownsampleFuncs (SDownsampleFuncs& sDownsampleFunc, int32_t iCpuFlag); // registers the downsample routines
+
+ int32_t GetAlignedIndex (const int32_t kiSrcWidth); // pfHalfAverage slot for a width: 0 = x32, 1 = x16, 2 = x8, 3 = other
+
+ private:
+ SDownsampleFuncs m_pfDownsample; // routines called by Process
+ int32_t m_iCPUFlag; // value passed to the constructor
+};
+
+WELSVP_NAMESPACE_END
+
+#endif
--- /dev/null
+++ b/codec/processing/src/downsample/downsamplefuncs.cpp
@@ -1,0 +1,234 @@
+/*!
+ * \copy
+ * Copyright (c) 2008-2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * downsample_yuv.c
+ *
+ * Abstract
+ * Implementation for source yuv data downsampling used before spatial encoding.
+ *
+ * History
+ * 10/24/2008 Created
+ *
+ *****************************************************************************/
+
+#include "../common/typedef.h"
+#include "../common/util.h"
+#include "downsample.h"
+
+
+WELSVP_NAMESPACE_BEGIN
+
+
+void DyadicBilinearDownsampler_c (uint8_t* pDst, const int32_t kiDstStride,
+                                  uint8_t* pSrc, const int32_t kiSrcStride,
+                                  const int32_t kiSrcWidth, const int32_t kiSrcHeight)
+// Halves a plane in both directions: each output sample is the rounded average
+// of a 2x2 source block (each row pair is averaged first, then the two results).
+{
+  const int32_t kiOutWidth = kiSrcWidth >> 1;
+  const int32_t kiOutHeight = kiSrcHeight >> 1;
+  const uint8_t* pUpperRow = pSrc;
+  uint8_t* pOutRow = pDst;
+
+  for (int32_t iRow = 0; iRow < kiOutHeight; iRow++) {
+    for (int32_t iCol = 0; iCol < kiOutWidth; iCol++) {
+      const int32_t kiLeft = iCol << 1; // left column of the 2x2 block
+      const int32_t kiBelow = kiLeft + kiSrcStride; // same column, next source row
+      const int32_t kiUpperAvg = (pUpperRow[kiLeft] + pUpperRow[kiLeft + 1] + 1) >> 1;
+      const int32_t kiLowerAvg = (pUpperRow[kiBelow] + pUpperRow[kiBelow + 1] + 1) >> 1;
+      pOutRow[iCol] = (uint8_t) ((kiUpperAvg + kiLowerAvg + 1) >> 1);
+    }
+    pOutRow += kiDstStride;
+    pUpperRow += kiSrcStride << 1; // skip the row pair just consumed
+  }
+}
+
+void GeneralBilinearFastDownsampler_c (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, // bilinear downsampling by an arbitrary ratio in 32-bit fixed point
+ const int32_t kiDstHeight,
+ uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
+ const uint32_t kuiScaleBitWidth = 16, kuiScaleBitHeight = 15; // fractional bits of the x and y source positions
+ const uint32_t kuiScaleWidth = (1 << kuiScaleBitWidth), kuiScaleHeight = (1 << kuiScaleBitHeight);
+ int32_t fScalex = (int32_t) ((float_t)kiSrcWidth / (float_t)kiDstWidth * kuiScaleWidth); // source step per output column, fixed point (an integer despite the f prefix)
+ int32_t fScaley = (int32_t) ((float_t)kiSrcHeight / (float_t)kiDstHeight * kuiScaleHeight); // source step per output row, fixed point
+ uint32_t x;
+ int32_t iYInverse, iXInverse; // running source positions in fixed point
+
+ uint8_t* pByDst = pDst;
+ uint8_t* pByLineDst = pDst;
+
+ iYInverse = 1 << (kuiScaleBitHeight - 1); // start at 0.5 in fixed point
+ for (int32_t i = 0; i < kiDstHeight - 1; i++) { // every output row except the last, which is handled after the loop
+ int32_t iYy = iYInverse >> kuiScaleBitHeight; // integer source row
+ int32_t fv = iYInverse & (kuiScaleHeight - 1); // fractional part: weight of the row below
+
+ uint8_t* pBySrc = pSrc + iYy * kiSrcStride;
+
+ pByDst = pByLineDst;
+ iXInverse = 1 << (kuiScaleBitWidth - 1); // start at 0.5 in fixed point
+ for (int32_t j = 0; j < kiDstWidth - 1; j++) { // every output column except the last
+ int32_t iXx = iXInverse >> kuiScaleBitWidth; // integer source column
+ int32_t iFu = iXInverse & (kuiScaleWidth - 1); // fractional part: weight of the column to the right
+
+ uint8_t* pByCurrent = pBySrc + iXx;
+ uint8_t a, b, c, d; // 2x2 source neighbourhood: a b on the upper row, c d on the row below
+
+ a = *pByCurrent;
+ b = * (pByCurrent + 1);
+ c = * (pByCurrent + kiSrcStride);
+ d = * (pByCurrent + kiSrcStride + 1);
+
+ x = (((uint32_t) (kuiScaleWidth - 1 - iFu)) * (kuiScaleHeight - 1 - fv) >> kuiScaleBitWidth) * a; // each weight product is shifted down by 16 bits before it multiplies its sample
+ x += (((uint32_t) (iFu)) * (kuiScaleHeight - 1 - fv) >> kuiScaleBitWidth) * b;
+ x += (((uint32_t) (kuiScaleWidth - 1 - iFu)) * (fv) >> kuiScaleBitWidth) * c;
+ x += (((uint32_t) (iFu)) * (fv) >> kuiScaleBitWidth) * d;
+ x >>= (kuiScaleBitHeight - 1); // drop all but one of the remaining fractional bits ...
+ x += 1; // ... add one and shift again: round to nearest
+ x >>= 1;
+ //x = (((__int64)(SCALE_BIG - 1 - iFu))*(SCALE_BIG - 1 - fv)*a + ((__int64)iFu)*(SCALE_BIG - 1 -fv)*b + ((__int64)(SCALE_BIG - 1 -iFu))*fv*c +
+ // ((__int64)iFu)*fv*d + (1 << (2*SCALE_BIT_BIG-1)) ) >> (2*SCALE_BIT_BIG);
+ x = WELS_CLAMP (x, 0, 255);
+ *pByDst++ = (uint8_t)x;
+
+ iXInverse += fScalex;
+ }
+ *pByDst = * (pBySrc + (iXInverse >> kuiScaleBitWidth)); // last column: plain copy of one source sample, no interpolation
+ pByLineDst += kiDstStride;
+ iYInverse += fScaley;
+ }
+
+ // last row: copied sample by sample from one source row, without interpolation
+ {
+ int32_t iYy = iYInverse >> kuiScaleBitHeight;
+ uint8_t* pBySrc = pSrc + iYy * kiSrcStride;
+
+ pByDst = pByLineDst;
+ iXInverse = 1 << (kuiScaleBitWidth - 1);
+ for (int32_t j = 0; j < kiDstWidth; j++) {
+ int32_t iXx = iXInverse >> kuiScaleBitWidth;
+ *pByDst++ = * (pBySrc + iXx);
+
+ iXInverse += fScalex;
+ }
+ }
+}
+
+void GeneralBilinearAccurateDownsampler_c (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth, // bilinear downsampling by an arbitrary ratio with 64-bit accumulation
+ const int32_t kiDstHeight,
+ uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
+ const int32_t kiScaleBit = 15; // fractional bits of the x and y source positions
+ const int32_t kiScale = (1 << kiScaleBit);
+ int32_t iScalex = (int32_t) ((float_t)kiSrcWidth / (float_t)kiDstWidth * kiScale); // source step per output column, fixed point
+ int32_t iScaley = (int32_t) ((float_t)kiSrcHeight / (float_t)kiDstHeight * kiScale); // source step per output row, fixed point
+ int64_t x;
+ int32_t iYInverse, iXInverse; // running source positions in fixed point
+
+ uint8_t* pByDst = pDst;
+ uint8_t* pByLineDst = pDst;
+
+ iYInverse = 1 << (kiScaleBit - 1); // start at 0.5 in fixed point
+ for (int32_t i = 0; i < kiDstHeight - 1; i++) { // every output row except the last, which is handled after the loop
+ int32_t iYy = iYInverse >> kiScaleBit; // integer source row
+ int32_t iFv = iYInverse & (kiScale - 1); // fractional part: weight of the row below
+
+ uint8_t* pBySrc = pSrc + iYy * kiSrcStride;
+
+ pByDst = pByLineDst;
+ iXInverse = 1 << (kiScaleBit - 1); // start at 0.5 in fixed point
+ for (int32_t j = 0; j < kiDstWidth - 1; j++) { // every output column except the last
+ int32_t iXx = iXInverse >> kiScaleBit; // integer source column
+ int32_t iFu = iXInverse & (kiScale - 1); // fractional part: weight of the column to the right
+
+ uint8_t* pByCurrent = pBySrc + iXx;
+ uint8_t a, b, c, d; // 2x2 source neighbourhood: a b on the upper row, c d on the row below
+
+ a = *pByCurrent;
+ b = * (pByCurrent + 1);
+ c = * (pByCurrent + kiSrcStride);
+ d = * (pByCurrent + kiSrcStride + 1);
+
+ x = (((int64_t) (kiScale - 1 - iFu)) * (kiScale - 1 - iFv) * a + ((int64_t)iFu) * (kiScale - 1 - iFv) * b + ((int64_t) ( // full-precision weighted sum of the four samples
+ kiScale - 1 - iFu)) * iFv * c +
+ ((int64_t)iFu) * iFv * d + (int64_t) (1 << (2 * kiScaleBit - 1))) >> (2 * kiScaleBit); // add half of the divisor, then drop the 2 * kiScaleBit fractional bits
+ x = WELS_CLAMP (x, 0, 255);
+ *pByDst++ = (uint8_t)x;
+
+ iXInverse += iScalex;
+ }
+ *pByDst = * (pBySrc + (iXInverse >> kiScaleBit)); // last column: plain copy of one source sample, no interpolation
+ pByLineDst += kiDstStride;
+ iYInverse += iScaley;
+ }
+
+ // last row: copied sample by sample from one source row, without interpolation
+ {
+ int32_t iYy = iYInverse >> kiScaleBit;
+ uint8_t* pBySrc = pSrc + iYy * kiSrcStride;
+
+ pByDst = pByLineDst;
+ iXInverse = 1 << (kiScaleBit - 1);
+ for (int32_t j = 0; j < kiDstWidth; j++) {
+ int32_t iXx = iXInverse >> kiScaleBit;
+ *pByDst++ = * (pBySrc + iXx);
+
+ iXInverse += iScalex;
+ }
+ }
+}
+
+
+#ifdef X86_ASM
+//void GeneralBilinearFastDownsamplerWrap_sse2 (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
+// const int32_t kiDstHeight,
+// uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
+// const int32_t kiScaleBitWidth = 16, kiScaleBitHeight = 15;
+// const uint32_t kuiScaleWidth = (1 << kiScaleBitWidth), kuiScaleHeight = (1 << kiScaleBitHeight);
+//
+// uint32_t uiScalex = (uint32_t) ((float_t)kiSrcWidth / (float_t)kiDstWidth * kuiScaleWidth);
+// uint32_t uiScaley = (uint32_t) ((float_t)kiSrcHeight / (float_t)kiDstHeight * kuiScaleHeight);
+//
+// GeneralBilinearFastDownsampler_sse2 (pDst, kiDstStride, kiDstWidth, kiDstHeight,
+// pSrc, kiSrcStride, kiSrcWidth, kiSrcHeight, uiScalex, uiScaley);
+//}
+//
+//void GeneralBilinearAccurateDownsamplerWrap_sse2 (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
+// const int32_t kiDstHeight,
+// uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
+// const int32_t kiScaleBit = 15;
+// const uint32_t kuiScale = (1 << kiScaleBit);
+//
+// uint32_t uiScalex = (uint32_t) ((float_t)kiSrcWidth / (float_t)kiDstWidth * kuiScale);
+// uint32_t uiScaley = (uint32_t) ((float_t)kiSrcHeight / (float_t)kiDstHeight * kuiScale);
+//
+// GeneralBilinearAccurateDownsampler_sse2 (pDst, kiDstStride, kiDstWidth, kiDstHeight,
+// pSrc, kiSrcStride, kiSrcWidth, kiSrcHeight, uiScalex, uiScaley);
+//}
+#endif //X86_ASM
+
+WELSVP_NAMESPACE_END
--- /dev/null
+++ b/codec/processing/src/imagerotate/imagerotate.cpp
@@ -1,0 +1,93 @@
+/*!
+ * \copy
+ * Copyright (c) 2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "imagerotate.h"
+#include "../common/cpu.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+CImageRotating::CImageRotating (int32_t iCpuFlag) {
+ m_iCPUFlag = iCpuFlag; // remembered and handed to InitImageRotateFuncs below
+ m_eMethod = METHOD_IMAGE_ROTATE; // not declared in CImageRotating itself -- presumably inherited from IStrategy
+ WelsMemset (&m_pfRotateImage, 0, sizeof (m_pfRotateImage)); // clear the function table before it is populated
+ InitImageRotateFuncs (m_pfRotateImage, m_iCPUFlag);
+}
+
+CImageRotating::~CImageRotating() { // empty: nothing is released here
+}
+
+void CImageRotating::InitImageRotateFuncs (SImageRotateFuncs& sImageRotateFuncs, int32_t iCpuFlag) { // registers the C rotate routines; iCpuFlag is not consulted
+ sImageRotateFuncs.pfImageRotate90D = ImageRotate90D_c;
+ sImageRotateFuncs.pfImageRotate180D = ImageRotate180D_c;
+ sImageRotateFuncs.pfImageRotate270D = ImageRotate270D_c;
+}
+EResult CImageRotating::ProcessImageRotate (int32_t iType, uint8_t* pSrc, uint32_t uiBytesPerPixel, uint32_t iWidth,
+    uint32_t iHeight, uint8_t* pDst) {
+  // Runs the rotate routine registered for iType (90, 180 or 270) on one plane.
+  // Any other iType calls nothing and reports RET_NOTSUPPORTED.
+  switch (iType) {
+  case 90:  m_pfRotateImage.pfImageRotate90D (pSrc, uiBytesPerPixel, iWidth, iHeight, pDst); break;
+  case 180: m_pfRotateImage.pfImageRotate180D (pSrc, uiBytesPerPixel, iWidth, iHeight, pDst); break;
+  case 270: m_pfRotateImage.pfImageRotate270D (pSrc, uiBytesPerPixel, iWidth, iHeight, pDst); break;
+  default:
+    return RET_NOTSUPPORTED;
+  }
+  return RET_SUCCESS;
+}
+
+EResult CImageRotating::Process (int32_t iType, SPixMap* pSrc, SPixMap* pDst) {
+  // Rotates pSrc into pDst by iType (90, 180 or 270). RGBA/BGRA/ABGR/ARGB input is
+  // rotated from plane 0 alone; I420 input plane by plane, planes 1 and 2 at half
+  // the rectangle size. Other formats and other angles give RET_NOTSUPPORTED.
+  // ProcessImageRotate takes BYTES per pixel, so the bit size is divided by 8.
+  // NOTE(review): assumes iSizeInBits is the sample size in bits, as named -- confirm.
+  const uint32_t kuiBytesPerPixel = pSrc->iSizeInBits / 8;
+  EResult eReturn = RET_NOTSUPPORTED;
+  if ((pSrc->eFormat == VIDEO_FORMAT_RGBA) || (pSrc->eFormat == VIDEO_FORMAT_BGRA) ||
+      (pSrc->eFormat == VIDEO_FORMAT_ABGR) || (pSrc->eFormat == VIDEO_FORMAT_ARGB)) {
+    eReturn = ProcessImageRotate (iType, (uint8_t*)pSrc->pPixel[0], kuiBytesPerPixel, pSrc->sRect.iRectWidth,
+                                  pSrc->sRect.iRectHeight, (uint8_t*)pDst->pPixel[0]);
+  } else if (pSrc->eFormat == VIDEO_FORMAT_I420) {
+    ProcessImageRotate (iType, (uint8_t*)pSrc->pPixel[0], kuiBytesPerPixel, pSrc->sRect.iRectWidth,
+                        pSrc->sRect.iRectHeight, (uint8_t*)pDst->pPixel[0]);
+    ProcessImageRotate (iType, (uint8_t*)pSrc->pPixel[1], kuiBytesPerPixel, (pSrc->sRect.iRectWidth >> 1),
+                        (pSrc->sRect.iRectHeight >> 1), (uint8_t*)pDst->pPixel[1]);
+    eReturn = ProcessImageRotate (iType, (uint8_t*)pSrc->pPixel[2], kuiBytesPerPixel, (pSrc->sRect.iRectWidth >> 1),
+                                  (pSrc->sRect.iRectHeight >> 1), (uint8_t*)pDst->pPixel[2]);
+  }
+  return eReturn;
+}
+
+
+WELSVP_NAMESPACE_END
--- /dev/null
+++ b/codec/processing/src/imagerotate/imagerotate.h
@@ -1,0 +1,85 @@
+/*!
+ * \copy
+ * Copyright (c) 2011-2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file : imagerotate.h
+ *
+ * \brief : image rotate class of wels video processor class
+ *
+ * \date : 2011/04/06
+ *
+ * \description :
+ *
+ *************************************************************************************
+ */
+
+#ifndef WELSVP_IMAGEROTATE_H
+#define WELSVP_IMAGEROTATE_H
+
+#include "../common/util.h"
+#include "../common/WelsFrameWork.h"
+#include "../../interface/IWelsVP.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+typedef void (ImageRotateFunc) (uint8_t* pSrc, uint32_t uiBytesPerPixel, uint32_t iWidth, uint32_t iHeight, // shared signature of the rotate routines
+ uint8_t* pDst);
+
+typedef ImageRotateFunc* ImageRotateFuncPtr; // pointer form, stored in SImageRotateFuncs
+
+ImageRotateFunc ImageRotate90D_c; // C routines, one per supported angle
+ImageRotateFunc ImageRotate180D_c;
+ImageRotateFunc ImageRotate270D_c;
+
+typedef struct { // routine table, one entry per supported angle
+ ImageRotateFuncPtr pfImageRotate90D;
+ ImageRotateFuncPtr pfImageRotate180D;
+ ImageRotateFuncPtr pfImageRotate270D;
+} SImageRotateFuncs;
+
+class CImageRotating : public IStrategy { // image rotation strategy built on the SImageRotateFuncs table
+ public:
+ CImageRotating (int32_t iCpuFlag); // stores iCpuFlag and populates the routine table
+ ~CImageRotating();
+
+ EResult Process (int32_t iType, SPixMap* pSrc, SPixMap* pDst); // iType is the angle: 90, 180 or 270
+
+ private:
+ void InitImageRotateFuncs (SImageRotateFuncs& pf, int32_t iCpuFlag); // registers the rotate routines in pf
+ EResult ProcessImageRotate (int32_t iType, uint8_t* pSrc, uint32_t uiBytesPerPixel, uint32_t iWidth, uint32_t iHeight, // rotates one plane; RET_NOTSUPPORTED for other angles
+ uint8_t* pDst);
+
+ private:
+ SImageRotateFuncs m_pfRotateImage; // routines called by ProcessImageRotate
+ int32_t m_iCPUFlag; // value passed to the constructor
+};
+
+WELSVP_NAMESPACE_END
+
+#endif
--- /dev/null
+++ b/codec/processing/src/imagerotate/imagerotatefuncs.cpp
@@ -1,0 +1,66 @@
+/*!
+ * \copy
+ * Copyright (c) 2011-2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * imagerotatefuncs.cpp
+ *
+ * Created on 11-2-21.
+ *
+ */
+
+#include "imagerotate.h"
+#include "../common/cpu.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+void ImageRotate90D_c (uint8_t* pSrc, uint32_t uiBytesPerPixel, uint32_t iWidth, uint32_t iHeight, uint8_t* pDst) {
+  // Copies source pixel (uiRow, uiCol) of the iWidth x iHeight input, byte by byte, to position
+  // (uiCol, iHeight - 1 - uiRow) of the output, whose rows are iHeight pixels long.
+  for (uint32_t uiRow = 0; uiRow < iHeight; ++uiRow)
+    for (uint32_t uiCol = 0; uiCol < iWidth; ++uiCol)
+      for (uint32_t uiByte = 0; uiByte < uiBytesPerPixel; ++uiByte)
+        pDst[ ((uiCol + 1) * iHeight - 1 - uiRow) * uiBytesPerPixel + uiByte] = pSrc[ (uiRow * iWidth + uiCol) * uiBytesPerPixel + uiByte];
+}
+void ImageRotate180D_c (uint8_t* pSrc, uint32_t uiBytesPerPixel, uint32_t iWidth, uint32_t iHeight, uint8_t* pDst) {
+  // Copies source pixel (uiRow, uiCol) of the iWidth x iHeight input, byte by byte, to position
+  // (iHeight - 1 - uiRow, iWidth - 1 - uiCol) of the output, whose rows are iWidth pixels long.
+  for (uint32_t uiRow = 0; uiRow < iHeight; ++uiRow)
+    for (uint32_t uiCol = 0; uiCol < iWidth; ++uiCol)
+      for (uint32_t uiByte = 0; uiByte < uiBytesPerPixel; ++uiByte)
+        pDst[ ((iHeight - uiRow) * iWidth - 1 - uiCol) * uiBytesPerPixel + uiByte] = pSrc[ (uiRow * iWidth + uiCol) * uiBytesPerPixel + uiByte];
+}
+void ImageRotate270D_c (uint8_t* pSrc, uint32_t uiBytesPerPixel, uint32_t iWidth, uint32_t iHeight, uint8_t* pDst) {
+  // Copies source pixel (uiRow, uiCol) of the iWidth x iHeight input, byte by byte, to position
+  // (iWidth - 1 - uiCol, uiRow) of the output, whose rows are iHeight pixels long; columns are the outer loop.
+  for (uint32_t uiCol = 0; uiCol < iWidth; ++uiCol)
+    for (uint32_t uiRow = 0; uiRow < iHeight; ++uiRow)
+      for (uint32_t uiByte = 0; uiByte < uiBytesPerPixel; ++uiByte)
+        pDst[ ((iWidth - 1 - uiCol) * iHeight + uiRow) * uiBytesPerPixel + uiByte] = pSrc[ (uiRow * iWidth + uiCol) * uiBytesPerPixel + uiByte];
+}
+WELSVP_NAMESPACE_END
--- /dev/null
+++ b/codec/processing/src/scenechangedetection/SceneChangeDetection.cpp
@@ -1,0 +1,136 @@
+/*!
+ * \copy
+ * Copyright (c) 2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "SceneChangeDetection.h"
+#include "../common/cpu.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+#define HIGH_MOTION_BLOCK_THRESHOLD 320
+#define SCENE_CHANGE_MOTION_RATIO 0.85f
+
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+CSceneChangeDetection::CSceneChangeDetection (int32_t iCpuFlag)
+  : m_pfSad (NULL), m_iCpuFlag (iCpuFlag) {
+  // Start from a zeroed result, tag the strategy with its method id, then select the SAD kernel.
+  WelsMemset (&m_sSceneChangeParam, 0, sizeof (m_sSceneChangeParam));
+  m_eMethod = METHOD_SCENE_CHANGE_DETECTION;
+  InitSadFuncs (m_pfSad, iCpuFlag);
+}
+
+CSceneChangeDetection::~CSceneChangeDetection() { // intentionally empty: the constructor acquires nothing that needs releasing
+}
+
+EResult CSceneChangeDetection::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pRefPixMap) {
+  // Counts the 8x8 blocks of plane 0 of pSrcPixMap whose SAD against the co-located block of pRefPixMap
+  // exceeds HIGH_MOTION_BLOCK_THRESHOLD, and raises bSceneChangeFlag when that count reaches the threshold
+  // derived from SCENE_CHANGE_MOTION_RATIO. The verdict is read back through Get(); iType is not used here.
+  // Returns RET_INVALIDPARAM when a pixel map or its first plane is missing, RET_SUCCESS otherwise.
+
+  // Clear the previous verdict up front so a rejected call cannot leave a stale flag behind.
+  m_sSceneChangeParam.bSceneChangeFlag = 0;
+
+  if (pSrcPixMap == NULL || pRefPixMap == NULL) {
+    return RET_INVALIDPARAM;
+  }
+
+  uint8_t* pRefY = (uint8_t*)pRefPixMap->pPixel[0];
+  uint8_t* pCurY = (uint8_t*)pSrcPixMap->pPixel[0];
+  if (pRefY == NULL || pCurY == NULL) {
+    return RET_INVALIDPARAM;
+  }
+
+  const int32_t iWidth = pSrcPixMap->sRect.iRectWidth;
+  const int32_t iHeight = pSrcPixMap->sRect.iRectHeight;
+  const int32_t iBlock8x8Width = iWidth >> 3;
+  const int32_t iBlock8x8Height = iHeight >> 3;
+  const int32_t iBlock8x8Num = iBlock8x8Width * iBlock8x8Height;
+  const int32_t iSceneChangeThreshold = WelsStaticCast (int32_t, SCENE_CHANGE_MOTION_RATIO * iBlock8x8Num + 0.5f + PESN);
+
+  const int32_t iRefStride = pRefPixMap->iStride[0];
+  const int32_t iCurStride = pSrcPixMap->iStride[0];
+  const int32_t iRefRowStride = iRefStride << 3; // eight picture rows, i.e. one row of 8x8 blocks
+  const int32_t iCurRowStride = iCurStride << 3;
+
+  int32_t iMotionBlockNum = 0;
+
+  for (int32_t j = 0; j < iBlock8x8Height; j ++) {
+    uint8_t* pRefTmp = pRefY;
+    uint8_t* pCurTmp = pCurY;
+    for (int32_t i = 0; i < iBlock8x8Width; i++) {
+      const int32_t iBlockSad = m_pfSad (pRefTmp, iRefStride, pCurTmp, iCurStride);
+
+      // The comparison yields 0 or 1, so this counts the blocks whose SAD is above the threshold.
+      iMotionBlockNum += (iBlockSad > HIGH_MOTION_BLOCK_THRESHOLD);
+      pRefTmp += 8;
+      pCurTmp += 8;
+    }
+
+    pRefY += iRefRowStride;
+    pCurY += iCurRowStride;
+  }
+
+  if (iMotionBlockNum >= iSceneChangeThreshold) {
+    m_sSceneChangeParam.bSceneChangeFlag = 1;
+  }
+
+  return RET_SUCCESS;
+}
+
+
+EResult CSceneChangeDetection::Get (int32_t iType, void* pParam) {
+  // Hands the stored scene change result to the caller; pParam is read as an SSceneChangeResult.
+  SSceneChangeResult* pResult = static_cast<SSceneChangeResult*> (pParam);
+  if (NULL == pResult) {
+    return RET_INVALIDPARAM;
+  }
+  *pResult = m_sSceneChangeParam;
+  return RET_SUCCESS;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////
+
+void CSceneChangeDetection::InitSadFuncs (SadFuncPtr& pfSad, int32_t iCpuFlag) {
+  // Default to the C kernel; switch to the SSE2 one only in X86_ASM builds whose CPU flag reports SSE2.
+  SadFuncPtr pfChosen = WelsSampleSad8x8_c;
+#ifdef X86_ASM
+  if ((iCpuFlag & WELS_CPU_SSE2) != 0)
+    pfChosen = WelsSampleSad8x8_sse21;
+#endif
+  pfSad = pfChosen;
+}
+
+
+WELSVP_NAMESPACE_END
--- /dev/null
+++ b/codec/processing/src/scenechangedetection/SceneChangeDetection.h
@@ -1,0 +1,72 @@
+/*!
+ * \copy
+ * Copyright (c) 2011-2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+* \file : SceneChangeDetection.h
+*
+* \brief : scene change detection class of wels video processor class
+*
+* \date : 2011/03/14
+*
+* \description : 1. rewrite the package code of scene change detection class
+*
+*************************************************************************************
+*/
+
+#ifndef WELSVP_SCENECHANGEDETECTION_H
+#define WELSVP_SCENECHANGEDETECTION_H
+
+#include "../common/util.h"
+#include "../common/memory.h"
+#include "../common/WelsFrameWork.h"
+#include "../../interface/IWelsVP.h"
+#include "SceneChangeDetectionCommon.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+class CSceneChangeDetection : public IStrategy { // IStrategy implementation that flags scene changes from 8x8 block SADs
+ public:
+ CSceneChangeDetection (int32_t iCpuFlag); // stores iCpuFlag, zeroes the result and selects the SAD kernel
+ ~CSceneChangeDetection();
+
+ EResult Process (int32_t iType, SPixMap* pSrc, SPixMap* pRef); // compares pSrc with pRef and updates the stored result
+ EResult Get (int32_t iType, void* pParam); // copies the stored result into *pParam, which is read as SSceneChangeResult
+
+ private:
+ void InitSadFuncs (SadFuncPtr& pfSadFunc, int32_t iCpuFlag); // sets the C kernel, or the SSE2 one when X86_ASM is defined and the flag has WELS_CPU_SSE2
+
+ private:
+ SadFuncPtr m_pfSad; // 8x8 SAD kernel chosen by InitSadFuncs()
+ int32_t m_iCpuFlag; // CPU flag value passed to the constructor
+ SSceneChangeResult m_sSceneChangeParam; // last result: Process() writes bSceneChangeFlag, Get() returns a copy
+};
+
+WELSVP_NAMESPACE_END
+
+#endif
--- /dev/null
+++ b/codec/processing/src/scenechangedetection/SceneChangeDetectionCommon.cpp
@@ -1,0 +1,60 @@
+/*!
+ * \copy
+ * Copyright (c) 2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "SceneChangeDetectionCommon.h"
+#include "../common/cpu.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+
+int32_t WelsSampleSad8x8_c (uint8_t* pSrcY, int32_t iSrcStrideY, uint8_t* pRefY, int32_t iRefStrideY) {
+  // Sum of absolute differences between two 8x8 byte blocks.
+  //   pSrcY, pRefY             first byte of each block
+  //   iSrcStrideY, iRefStrideY step, in bytes, from one block row to the next
+  // Returns the 64 absolute differences added together.
+  const uint8_t* pRowA = pSrcY;
+  const uint8_t* pRowB = pRefY;
+  int32_t iTotal = 0;
+
+  for (int32_t iRow = 0; iRow < 8; ++iRow) {
+    for (int32_t iCol = 0; iCol < 8; ++iCol) {
+      iTotal += WELS_ABS ((pRowA[iCol] - pRowB[iCol]));
+    }
+
+    pRowA += iSrcStrideY;
+    pRowB += iRefStrideY;
+  }
+
+  return iTotal;
+}
+
+WELSVP_NAMESPACE_END
--- /dev/null
+++ b/codec/processing/src/scenechangedetection/SceneChangeDetectionCommon.h
@@ -1,0 +1,65 @@
+/*!
+ * \copy
+ * Copyright (c) 2011-2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file : SceneChangeDetectionCommon.h
+ *
+ * \brief : SAD function type and kernel declarations used by scene change detection
+ *
+ * \date : 2011/03/14
+ *
+ * \description : 1. rewrite the package code of scene change detection class
+ *
+ */
+
+#ifndef WELSVP_SCENECHANGEDETECTIONCOMMON_H
+#define WELSVP_SCENECHANGEDETECTIONCOMMON_H
+
+#include "../common/util.h"
+#include "../common/memory.h"
+#include "../common/WelsFrameWork.h"
+#include "../../interface/IWelsVP.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+typedef int32_t (SadFunc) (uint8_t* pSrcY, int32_t iSrcStrideY, uint8_t* pRefY, int32_t iRefStrideY); // signature shared by the SAD kernels below
+
+typedef SadFunc* SadFuncPtr; // pointer to a SAD kernel
+
+SadFunc WelsSampleSad8x8_c; // always declared, regardless of X86_ASM
+
+#ifdef X86_ASM
+WELSVP_EXTERN_C_BEGIN
+SadFunc WelsSampleSad8x8_sse21; // only declared in X86_ASM builds; NOTE(review): presumably implemented in assembly - confirm
+WELSVP_EXTERN_C_END
+#endif
+
+WELSVP_NAMESPACE_END
+
+#endif
--- /dev/null
+++ b/codec/processing/src/vaacalc/vaacalcfuncs.cpp
@@ -1,0 +1,595 @@
+/*!
+ * \copy
+ * Copyright (c) 2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "../common/typedef.h"
+#include "../common/util.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+void VAACalcSadSsd_c (uint8_t* pCurData, uint8_t* pRefData, int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride,
+                      int32_t* pFrameSad, int32_t* pSad8x8, int32_t* pSum16x16, int32_t* psqsum16x16, int32_t* psqdiff16x16) {
+  // Walks every whole 16x16 macroblock of two pictures that share the row pitch iPicStride and writes:
+  // *pFrameSad (total SAD), pSad8x8[4 * mb + q] (SAD of each 8x8 quadrant q), pSum16x16[mb] and psqsum16x16[mb]
+  // (sum and sum of squares of the current pixels) and psqdiff16x16[mb] (sum of squared differences).
+  uint8_t* tmp_ref = pRefData;
+  uint8_t* tmp_cur = pCurData;
+  int32_t iMbWidth = (iPicWidth >> 4);
+  int32_t mb_height = (iPicHeight >> 4);
+  int32_t mb_index = 0;
+  int32_t pic_stride_x8 = iPicStride << 3;
+  int32_t step = (iPicStride << 4) - (iMbWidth << 4); // rewind by the bytes actually walked so widths that are not a multiple of 16 do not drift
+  *pFrameSad = 0;
+  for (int32_t i = 0; i < mb_height; i ++) {
+    for (int32_t j = 0; j < iMbWidth; j ++) {
+      int32_t k, l;
+      int32_t l_sad, l_sqdiff, l_sum, l_sqsum;
+      uint8_t* tmp_cur_row;
+      uint8_t* tmp_ref_row;
+      pSum16x16[mb_index] = 0;
+      psqsum16x16[mb_index] = 0;
+      psqdiff16x16[mb_index] = 0;
+      // quadrant 0: columns 0-7, rows 0-7 of the macroblock
+      l_sad = l_sqdiff = l_sum = l_sqsum = 0;
+      tmp_cur_row = tmp_cur;
+      tmp_ref_row = tmp_ref;
+      for (k = 0; k < 8; k ++) {
+        for (l = 0; l < 8; l ++) {
+          int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
+          l_sad += diff;
+          l_sqdiff += diff * diff;
+          l_sum += tmp_cur_row[l];
+          l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
+        }
+        tmp_cur_row += iPicStride;
+        tmp_ref_row += iPicStride;
+      }
+      *pFrameSad += l_sad;
+      pSad8x8[ (mb_index << 2) + 0] = l_sad;
+      pSum16x16[mb_index] += l_sum;
+      psqsum16x16[mb_index] += l_sqsum;
+      psqdiff16x16[mb_index] += l_sqdiff;
+      // quadrant 1: columns 8-15, rows 0-7
+      l_sad = l_sqdiff = l_sum = l_sqsum = 0;
+      tmp_cur_row = tmp_cur + 8;
+      tmp_ref_row = tmp_ref + 8;
+      for (k = 0; k < 8; k ++) {
+        for (l = 0; l < 8; l ++) {
+          int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
+          l_sad += diff;
+          l_sqdiff += diff * diff;
+          l_sum += tmp_cur_row[l];
+          l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
+        }
+        tmp_cur_row += iPicStride;
+        tmp_ref_row += iPicStride;
+      }
+      *pFrameSad += l_sad;
+      pSad8x8[ (mb_index << 2) + 1] = l_sad;
+      pSum16x16[mb_index] += l_sum;
+      psqsum16x16[mb_index] += l_sqsum;
+      psqdiff16x16[mb_index] += l_sqdiff;
+      // quadrant 2: columns 0-7, rows 8-15
+      l_sad = l_sqdiff = l_sum = l_sqsum = 0;
+      tmp_cur_row = tmp_cur + pic_stride_x8;
+      tmp_ref_row = tmp_ref + pic_stride_x8;
+      for (k = 0; k < 8; k ++) {
+        for (l = 0; l < 8; l ++) {
+          int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
+          l_sad += diff;
+          l_sqdiff += diff * diff;
+          l_sum += tmp_cur_row[l];
+          l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
+        }
+        tmp_cur_row += iPicStride;
+        tmp_ref_row += iPicStride;
+      }
+      *pFrameSad += l_sad;
+      pSad8x8[ (mb_index << 2) + 2] = l_sad;
+      pSum16x16[mb_index] += l_sum;
+      psqsum16x16[mb_index] += l_sqsum;
+      psqdiff16x16[mb_index] += l_sqdiff;
+      // quadrant 3: columns 8-15, rows 8-15
+      l_sad = l_sqdiff = l_sum = l_sqsum = 0;
+      tmp_cur_row = tmp_cur + pic_stride_x8 + 8;
+      tmp_ref_row = tmp_ref + pic_stride_x8 + 8;
+      for (k = 0; k < 8; k ++) {
+        for (l = 0; l < 8; l ++) {
+          int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
+          l_sad += diff;
+          l_sqdiff += diff * diff;
+          l_sum += tmp_cur_row[l];
+          l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
+        }
+        tmp_cur_row += iPicStride;
+        tmp_ref_row += iPicStride;
+      }
+      *pFrameSad += l_sad;
+      pSad8x8[ (mb_index << 2) + 3] = l_sad;
+      pSum16x16[mb_index] += l_sum;
+      psqsum16x16[mb_index] += l_sqsum;
+      psqdiff16x16[mb_index] += l_sqdiff;
+
+      tmp_ref += 16;
+      tmp_cur += 16;
+      ++mb_index;
+    }
+    tmp_ref += step;
+    tmp_cur += step;
+  }
+}
+void VAACalcSadVar_c (uint8_t* pCurData, uint8_t* pRefData, int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride,
+                      int32_t* pFrameSad, int32_t* pSad8x8, int32_t* pSum16x16, int32_t* psqsum16x16) {
+  // Walks every whole 16x16 macroblock of two pictures that share the row pitch iPicStride and writes:
+  // *pFrameSad (total SAD), pSad8x8[4 * mb + q] (SAD of each 8x8 quadrant q), and pSum16x16[mb] /
+  // psqsum16x16[mb] (sum and sum of squares of the current picture's pixels in the macroblock).
+  uint8_t* tmp_ref = pRefData;
+  uint8_t* tmp_cur = pCurData;
+  int32_t iMbWidth = (iPicWidth >> 4);
+  int32_t mb_height = (iPicHeight >> 4);
+  int32_t mb_index = 0;
+  int32_t pic_stride_x8 = iPicStride << 3;
+  int32_t step = (iPicStride << 4) - (iMbWidth << 4); // rewind by the bytes actually walked so widths that are not a multiple of 16 do not drift
+  *pFrameSad = 0;
+  for (int32_t i = 0; i < mb_height; i ++) {
+    for (int32_t j = 0; j < iMbWidth; j ++) {
+      int32_t k, l;
+      int32_t l_sad, l_sum, l_sqsum;
+      uint8_t* tmp_cur_row;
+      uint8_t* tmp_ref_row;
+      pSum16x16[mb_index] = 0;
+      psqsum16x16[mb_index] = 0;
+      // quadrant 0: columns 0-7, rows 0-7 of the macroblock
+      l_sad = l_sum = l_sqsum = 0;
+      tmp_cur_row = tmp_cur;
+      tmp_ref_row = tmp_ref;
+      for (k = 0; k < 8; k ++) {
+        for (l = 0; l < 8; l ++) {
+          int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
+          l_sad += diff;
+          l_sum += tmp_cur_row[l];
+          l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
+        }
+        tmp_cur_row += iPicStride;
+        tmp_ref_row += iPicStride;
+      }
+      *pFrameSad += l_sad;
+      pSad8x8[ (mb_index << 2) + 0] = l_sad;
+      pSum16x16[mb_index] += l_sum;
+      psqsum16x16[mb_index] += l_sqsum;
+      // quadrant 1: columns 8-15, rows 0-7
+      l_sad = l_sum = l_sqsum = 0;
+      tmp_cur_row = tmp_cur + 8;
+      tmp_ref_row = tmp_ref + 8;
+      for (k = 0; k < 8; k ++) {
+        for (l = 0; l < 8; l ++) {
+          int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
+          l_sad += diff;
+          l_sum += tmp_cur_row[l];
+          l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
+        }
+        tmp_cur_row += iPicStride;
+        tmp_ref_row += iPicStride;
+      }
+      *pFrameSad += l_sad;
+      pSad8x8[ (mb_index << 2) + 1] = l_sad;
+      pSum16x16[mb_index] += l_sum;
+      psqsum16x16[mb_index] += l_sqsum;
+      // quadrant 2: columns 0-7, rows 8-15
+      l_sad = l_sum = l_sqsum = 0;
+      tmp_cur_row = tmp_cur + pic_stride_x8;
+      tmp_ref_row = tmp_ref + pic_stride_x8;
+      for (k = 0; k < 8; k ++) {
+        for (l = 0; l < 8; l ++) {
+          int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
+          l_sad += diff;
+          l_sum += tmp_cur_row[l];
+          l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
+        }
+        tmp_cur_row += iPicStride;
+        tmp_ref_row += iPicStride;
+      }
+      *pFrameSad += l_sad;
+      pSad8x8[ (mb_index << 2) + 2] = l_sad;
+      pSum16x16[mb_index] += l_sum;
+      psqsum16x16[mb_index] += l_sqsum;
+      // quadrant 3: columns 8-15, rows 8-15
+      l_sad = l_sum = l_sqsum = 0;
+      tmp_cur_row = tmp_cur + pic_stride_x8 + 8;
+      tmp_ref_row = tmp_ref + pic_stride_x8 + 8;
+      for (k = 0; k < 8; k ++) {
+        for (l = 0; l < 8; l ++) {
+          int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
+          l_sad += diff;
+          l_sum += tmp_cur_row[l];
+          l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
+        }
+        tmp_cur_row += iPicStride;
+        tmp_ref_row += iPicStride;
+      }
+      *pFrameSad += l_sad;
+      pSad8x8[ (mb_index << 2) + 3] = l_sad;
+      pSum16x16[mb_index] += l_sum;
+      psqsum16x16[mb_index] += l_sqsum;
+
+      tmp_ref += 16;
+      tmp_cur += 16;
+      ++mb_index;
+    }
+    tmp_ref += step;
+    tmp_cur += step;
+  }
+}
+
+
+void VAACalcSad_c (uint8_t* pCurData, uint8_t* pRefData, int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride,
+                   int32_t* pFrameSad, int32_t* pSad8x8) {
+  // Walks every whole 16x16 macroblock of two pictures that share the row pitch iPicStride and writes
+  // *pFrameSad (total SAD) and pSad8x8[4 * mb + q] (SAD of each 8x8 quadrant q of macroblock mb).
+  uint8_t* tmp_ref = pRefData;
+  uint8_t* tmp_cur = pCurData;
+  int32_t iMbWidth = (iPicWidth >> 4);
+  int32_t mb_height = (iPicHeight >> 4);
+  int32_t mb_index = 0;
+  int32_t pic_stride_x8 = iPicStride << 3;
+  int32_t step = (iPicStride << 4) - (iMbWidth << 4); // rewind by the bytes actually walked so widths that are not a multiple of 16 do not drift
+  *pFrameSad = 0;
+  for (int32_t i = 0; i < mb_height; i ++) {
+    for (int32_t j = 0; j < iMbWidth; j ++) {
+      int32_t k, l;
+      int32_t l_sad;
+      uint8_t* tmp_cur_row;
+      uint8_t* tmp_ref_row;
+      // quadrant 0: columns 0-7, rows 0-7 of the macroblock
+      l_sad = 0;
+      tmp_cur_row = tmp_cur;
+      tmp_ref_row = tmp_ref;
+      for (k = 0; k < 8; k ++) {
+        for (l = 0; l < 8; l ++) {
+          int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
+          l_sad += diff;
+        }
+        tmp_cur_row += iPicStride;
+        tmp_ref_row += iPicStride;
+      }
+      *pFrameSad += l_sad;
+      pSad8x8[ (mb_index << 2) + 0] = l_sad;
+      // quadrant 1: columns 8-15, rows 0-7
+      l_sad = 0;
+      tmp_cur_row = tmp_cur + 8;
+      tmp_ref_row = tmp_ref + 8;
+      for (k = 0; k < 8; k ++) {
+        for (l = 0; l < 8; l ++) {
+          int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
+          l_sad += diff;
+        }
+        tmp_cur_row += iPicStride;
+        tmp_ref_row += iPicStride;
+      }
+      *pFrameSad += l_sad;
+      pSad8x8[ (mb_index << 2) + 1] = l_sad;
+      // quadrant 2: columns 0-7, rows 8-15
+      l_sad = 0;
+      tmp_cur_row = tmp_cur + pic_stride_x8;
+      tmp_ref_row = tmp_ref + pic_stride_x8;
+      for (k = 0; k < 8; k ++) {
+        for (l = 0; l < 8; l ++) {
+          int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
+          l_sad += diff;
+        }
+        tmp_cur_row += iPicStride;
+        tmp_ref_row += iPicStride;
+      }
+      *pFrameSad += l_sad;
+      pSad8x8[ (mb_index << 2) + 2] = l_sad;
+      // quadrant 3: columns 8-15, rows 8-15
+      l_sad = 0;
+      tmp_cur_row = tmp_cur + pic_stride_x8 + 8;
+      tmp_ref_row = tmp_ref + pic_stride_x8 + 8;
+      for (k = 0; k < 8; k ++) {
+        for (l = 0; l < 8; l ++) {
+          int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
+          l_sad += diff;
+        }
+        tmp_cur_row += iPicStride;
+        tmp_ref_row += iPicStride;
+      }
+      *pFrameSad += l_sad;
+      pSad8x8[ (mb_index << 2) + 3] = l_sad;
+      tmp_ref += 16;
+      tmp_cur += 16;
+      ++mb_index;
+    }
+    tmp_ref += step;
+    tmp_cur += step;
+  }
+}
+
+void VAACalcSadSsdBgd_c (uint8_t* pCurData, uint8_t* pRefData, int32_t iPicWidth, int32_t iPicHeight,
+                         int32_t iPicStride,
+                         int32_t* pFrameSad, int32_t* pSad8x8, int32_t* pSum16x16, int32_t* psqsum16x16, int32_t* psqdiff16x16, int32_t* pSd8x8,
+                         uint8_t* pMad8x8) {
+  // Walks every whole 16x16 macroblock of two pictures that share the row pitch iPicStride and writes:
+  //   *pFrameSad                      total SAD over all visited blocks
+  //   pSad8x8 / pSd8x8 / pMad8x8      per 8x8 quadrant, at index 4 * mb + q: sum of |cur - ref|, signed sum of
+  //                                   (cur - ref), and largest |cur - ref|
+  //   pSum16x16 / psqsum16x16 / psqdiff16x16   per macroblock: sum of cur, sum of cur^2, sum of (cur - ref)^2
+  uint8_t* tmp_ref = pRefData;
+  uint8_t* tmp_cur = pCurData;
+  int32_t iMbWidth = (iPicWidth >> 4);
+  int32_t mb_height = (iPicHeight >> 4);
+  int32_t mb_index = 0;
+  int32_t pic_stride_x8 = iPicStride << 3;
+  int32_t step = (iPicStride << 4) - (iMbWidth << 4); // rewind by the bytes actually walked so widths that are not a multiple of 16 do not drift
+  *pFrameSad = 0;
+  for (int32_t i = 0; i < mb_height; i ++) {
+    for (int32_t j = 0; j < iMbWidth; j ++) {
+      int32_t k, l;
+      int32_t l_sad, l_sqdiff, l_sum, l_sqsum, l_sd, l_mad;
+      uint8_t* tmp_cur_row;
+      uint8_t* tmp_ref_row;
+      pSum16x16[mb_index] = 0;
+      psqsum16x16[mb_index] = 0;
+      psqdiff16x16[mb_index] = 0;
+      // quadrant 0: columns 0-7, rows 0-7 of the macroblock
+      l_sd = l_mad = l_sad = l_sqdiff = l_sum = l_sqsum = 0;
+      tmp_cur_row = tmp_cur;
+      tmp_ref_row = tmp_ref;
+      for (k = 0; k < 8; k ++) {
+        for (l = 0; l < 8; l ++) {
+          int32_t diff = tmp_cur_row[l] - tmp_ref_row[l];
+          int32_t abs_diff = WELS_ABS (diff);
+
+          l_sd += diff;
+          if (abs_diff > l_mad) {
+            l_mad = abs_diff;
+          }
+          l_sad += abs_diff;
+          l_sqdiff += abs_diff * abs_diff;
+          l_sum += tmp_cur_row[l];
+          l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
+        }
+        tmp_cur_row += iPicStride;
+        tmp_ref_row += iPicStride;
+      }
+      *pFrameSad += l_sad;
+      pSad8x8[ (mb_index << 2) + 0] = l_sad;
+      pSum16x16[mb_index] += l_sum;
+      psqsum16x16[mb_index] += l_sqsum;
+      psqdiff16x16[mb_index] += l_sqdiff;
+      pSd8x8[ (mb_index << 2) + 0] = l_sd;
+      pMad8x8[ (mb_index << 2) + 0] = l_mad;
+      // quadrant 1: columns 8-15, rows 0-7
+      l_sd = l_mad = l_sad = l_sqdiff = l_sum = l_sqsum = 0;
+      tmp_cur_row = tmp_cur + 8;
+      tmp_ref_row = tmp_ref + 8;
+      for (k = 0; k < 8; k ++) {
+        for (l = 0; l < 8; l ++) {
+          int32_t diff = tmp_cur_row[l] - tmp_ref_row[l];
+          int32_t abs_diff = WELS_ABS (diff);
+
+          l_sd += diff;
+          if (abs_diff > l_mad) {
+            l_mad = abs_diff;
+          }
+          l_sad += abs_diff;
+          l_sqdiff += abs_diff * abs_diff;
+          l_sum += tmp_cur_row[l];
+          l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
+        }
+        tmp_cur_row += iPicStride;
+        tmp_ref_row += iPicStride;
+      }
+      *pFrameSad += l_sad;
+      pSad8x8[ (mb_index << 2) + 1] = l_sad;
+      pSum16x16[mb_index] += l_sum;
+      psqsum16x16[mb_index] += l_sqsum;
+      psqdiff16x16[mb_index] += l_sqdiff;
+      pSd8x8[ (mb_index << 2) + 1] = l_sd;
+      pMad8x8[ (mb_index << 2) + 1] = l_mad;
+      // quadrant 2: columns 0-7, rows 8-15
+      l_sd = l_mad = l_sad = l_sqdiff = l_sum = l_sqsum = 0;
+      tmp_cur_row = tmp_cur + pic_stride_x8;
+      tmp_ref_row = tmp_ref + pic_stride_x8;
+      for (k = 0; k < 8; k ++) {
+        for (l = 0; l < 8; l ++) {
+          int32_t diff = tmp_cur_row[l] - tmp_ref_row[l];
+          int32_t abs_diff = WELS_ABS (diff);
+
+          l_sd += diff;
+          if (abs_diff > l_mad) {
+            l_mad = abs_diff;
+          }
+          l_sad += abs_diff;
+          l_sqdiff += abs_diff * abs_diff;
+          l_sum += tmp_cur_row[l];
+          l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
+        }
+        tmp_cur_row += iPicStride;
+        tmp_ref_row += iPicStride;
+      }
+      *pFrameSad += l_sad;
+      pSad8x8[ (mb_index << 2) + 2] = l_sad;
+      pSum16x16[mb_index] += l_sum;
+      psqsum16x16[mb_index] += l_sqsum;
+      psqdiff16x16[mb_index] += l_sqdiff;
+      pSd8x8[ (mb_index << 2) + 2] = l_sd;
+      pMad8x8[ (mb_index << 2) + 2] = l_mad;
+      // quadrant 3: columns 8-15, rows 8-15
+      l_sd = l_mad = l_sad = l_sqdiff = l_sum = l_sqsum = 0;
+      tmp_cur_row = tmp_cur + pic_stride_x8 + 8;
+      tmp_ref_row = tmp_ref + pic_stride_x8 + 8;
+      for (k = 0; k < 8; k ++) {
+        for (l = 0; l < 8; l ++) {
+          int32_t diff = tmp_cur_row[l] - tmp_ref_row[l];
+          int32_t abs_diff = WELS_ABS (diff);
+
+          l_sd += diff;
+          if (abs_diff > l_mad) {
+            l_mad = abs_diff;
+          }
+          l_sad += abs_diff;
+          l_sqdiff += abs_diff * abs_diff;
+          l_sum += tmp_cur_row[l];
+          l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
+        }
+        tmp_cur_row += iPicStride;
+        tmp_ref_row += iPicStride;
+      }
+      *pFrameSad += l_sad;
+      pSad8x8[ (mb_index << 2) + 3] = l_sad;
+      pSum16x16[mb_index] += l_sum;
+      psqsum16x16[mb_index] += l_sqsum;
+      psqdiff16x16[mb_index] += l_sqdiff;
+      pSd8x8[ (mb_index << 2) + 3] = l_sd;
+      pMad8x8[ (mb_index << 2) + 3] = l_mad;
+
+      tmp_ref += 16;
+      tmp_cur += 16;
+      ++mb_index;
+    }
+    tmp_ref += step;
+    tmp_cur += step;
+  }
+}
+
+void VAACalcSadBgd_c (uint8_t* pCurData, uint8_t* pRefData, int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride,
+                      int32_t* pFrameSad, int32_t* pSad8x8, int32_t* pSd8x8, uint8_t* pMad8x8) {
+  // Walks every whole 16x16 macroblock of two pictures that share the row pitch iPicStride and writes *pFrameSad
+  // (total SAD) plus, per 8x8 quadrant at index 4 * mb + q: pSad8x8 (SAD), pSd8x8 (signed sum of cur - ref), pMad8x8 (largest |cur - ref|).
+  uint8_t* tmp_ref = pRefData;
+  uint8_t* tmp_cur = pCurData;
+  int32_t iMbWidth = (iPicWidth >> 4);
+  int32_t mb_height = (iPicHeight >> 4);
+  int32_t mb_index = 0;
+  int32_t pic_stride_x8 = iPicStride << 3;
+  int32_t step = (iPicStride << 4) - (iMbWidth << 4); // rewind by the bytes actually walked so widths that are not a multiple of 16 do not drift
+  *pFrameSad = 0;
+  for (int32_t i = 0; i < mb_height; i ++) {
+    for (int32_t j = 0; j < iMbWidth; j ++) {
+      int32_t k, l;
+      int32_t l_sad, l_sd, l_mad;
+      uint8_t* tmp_cur_row;
+      uint8_t* tmp_ref_row;
+      // quadrant 0: columns 0-7, rows 0-7 of the macroblock
+      l_mad = l_sd = l_sad = 0;
+      tmp_cur_row = tmp_cur;
+      tmp_ref_row = tmp_ref;
+      for (k = 0; k < 8; k ++) {
+        for (l = 0; l < 8; l ++) {
+          int32_t diff = tmp_cur_row[l] - tmp_ref_row[l];
+          int32_t abs_diff = WELS_ABS (diff);
+          l_sd += diff;
+          l_sad += abs_diff;
+          if (abs_diff > l_mad) {
+            l_mad = abs_diff;
+          }
+        }
+        tmp_cur_row += iPicStride;
+        tmp_ref_row += iPicStride;
+      }
+      *pFrameSad += l_sad;
+      pSad8x8[ (mb_index << 2) + 0] = l_sad;
+      pSd8x8[ (mb_index << 2) + 0] = l_sd;
+      pMad8x8[ (mb_index << 2) + 0] = l_mad;
+      // quadrant 1: columns 8-15, rows 0-7
+      l_mad = l_sd = l_sad = 0;
+      tmp_cur_row = tmp_cur + 8;
+      tmp_ref_row = tmp_ref + 8;
+      for (k = 0; k < 8; k ++) {
+        for (l = 0; l < 8; l ++) {
+          int32_t diff = tmp_cur_row[l] - tmp_ref_row[l];
+          int32_t abs_diff = WELS_ABS (diff);
+          l_sd += diff;
+          l_sad += abs_diff;
+          if (abs_diff > l_mad) {
+            l_mad = abs_diff;
+          }
+        }
+        tmp_cur_row += iPicStride;
+        tmp_ref_row += iPicStride;
+      }
+      *pFrameSad += l_sad;
+      pSad8x8[ (mb_index << 2) + 1] = l_sad;
+      pSd8x8[ (mb_index << 2) + 1] = l_sd;
+      pMad8x8[ (mb_index << 2) + 1] = l_mad;
+      // quadrant 2: columns 0-7, rows 8-15
+      l_mad = l_sd = l_sad = 0;
+      tmp_cur_row = tmp_cur + pic_stride_x8;
+      tmp_ref_row = tmp_ref + pic_stride_x8;
+      for (k = 0; k < 8; k ++) {
+        for (l = 0; l < 8; l ++) {
+          int32_t diff = tmp_cur_row[l] - tmp_ref_row[l];
+          int32_t abs_diff = WELS_ABS (diff);
+          l_sd += diff;
+          l_sad += abs_diff;
+          if (abs_diff > l_mad) {
+            l_mad = abs_diff;
+          }
+        }
+        tmp_cur_row += iPicStride;
+        tmp_ref_row += iPicStride;
+      }
+      *pFrameSad += l_sad;
+      pSad8x8[ (mb_index << 2) + 2] = l_sad;
+      pSd8x8[ (mb_index << 2) + 2] = l_sd;
+      pMad8x8[ (mb_index << 2) + 2] = l_mad;
+      // quadrant 3: columns 8-15, rows 8-15
+      l_mad = l_sd = l_sad = 0;
+      tmp_cur_row = tmp_cur + pic_stride_x8 + 8;
+      tmp_ref_row = tmp_ref + pic_stride_x8 + 8;
+      for (k = 0; k < 8; k ++) {
+        for (l = 0; l < 8; l ++) {
+          int32_t diff = tmp_cur_row[l] - tmp_ref_row[l];
+          int32_t abs_diff = WELS_ABS (diff);
+          l_sd += diff;
+          l_sad += abs_diff;
+          if (abs_diff > l_mad) {
+            l_mad = abs_diff;
+          }
+        }
+        tmp_cur_row += iPicStride;
+        tmp_ref_row += iPicStride;
+      }
+      *pFrameSad += l_sad;
+      pSad8x8[ (mb_index << 2) + 3] = l_sad;
+      pSd8x8[ (mb_index << 2) + 3] = l_sd;
+      pMad8x8[ (mb_index << 2) + 3] = l_mad;
+      tmp_ref += 16;
+      tmp_cur += 16;
+      ++mb_index;
+    }
+    tmp_ref += step;
+    tmp_cur += step;
+  }
+}
+
+WELSVP_NAMESPACE_END
--- /dev/null
+++ b/codec/processing/src/vaacalc/vaacalculation.cpp
@@ -1,0 +1,123 @@
+/*!
+ * \copy
+ * Copyright (c) 2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "vaacalculation.h"
+#include "../common/cpu.h"
+
+WELSVP_NAMESPACE_BEGIN
+
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+CVAACalculation::CVAACalculation (int32_t iCpuFlag) {
+  // Zero the kernel table and the parameter block before anything is installed.
+  WelsMemset (&m_sVaaFuncs, 0, sizeof (m_sVaaFuncs));
+  WelsMemset (&m_sCalcParam, 0, sizeof (m_sCalcParam));
+  m_eMethod = METHOD_VAA_STATISTICS;
+  m_iCPUFlag = iCpuFlag;
+  InitVaaFuncs (m_sVaaFuncs, iCpuFlag);  // same value as m_iCPUFlag, set just above
+}
+// Destructor: the body is empty, nothing is released here.
+CVAACalculation::~CVAACalculation() {
+}
+// Fill sVaaFuncs with the kernel implementations to use for the given CPU flags.
+void CVAACalculation::InitVaaFuncs (SVaaFuncs& sVaaFuncs, int32_t iCpuFlag) {
+  sVaaFuncs.pfVAACalcSad = VAACalcSad_c;  // the C kernels are installed first as the default
+  sVaaFuncs.pfVAACalcSadBgd = VAACalcSadBgd_c;
+  sVaaFuncs.pfVAACalcSadSsd = VAACalcSadSsd_c;
+  sVaaFuncs.pfVAACalcSadSsdBgd = VAACalcSadSsdBgd_c;
+  sVaaFuncs.pfVAACalcSadVar = VAACalcSadVar_c;
+#ifdef X86_ASM
+  if ((iCpuFlag & WELS_CPU_SSE2) == WELS_CPU_SSE2) {  // SSE2 overrides below are commented out, so the C kernels stay selected
+    /* sVaaFuncs.pfVAACalcSad = VAACalcSad_sse2;
+    sVaaFuncs.pfVAACalcSadBgd = VAACalcSadBgd_sse2;
+    sVaaFuncs.pfVAACalcSadSsd = VAACalcSadSsd_sse2;
+    sVaaFuncs.pfVAACalcSadSsdBgd = VAACalcSadSsdBgd_sse2;
+    sVaaFuncs.pfVAACalcSadVar = VAACalcSadVar_sse2;*/
+  }
+#endif//X86_ASM
+}
+// Run the kernel selected by m_sCalcParam on the two pictures; statistics go to m_sCalcParam.pCalcResult. iType is not read.
+EResult CVAACalculation::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pRefPixMap) {
+  uint8_t* pCurData = (uint8_t*)pSrcPixMap->pPixel[0];
+  uint8_t* pRefData = (uint8_t*)pRefPixMap->pPixel[0];
+  int32_t iPicWidth = pSrcPixMap->sRect.iRectWidth;
+  int32_t iPicHeight = pSrcPixMap->sRect.iRectHeight;
+  int32_t iPicStride = pSrcPixMap->iStride[0];  // the source stride is handed to the kernel for both pictures
+  // Result block registered through Set(); it is still NULL when Set() has not succeeded yet.
+  SVAACalcResult* pResult = m_sCalcParam.pCalcResult;
+  // Return RET_INVALIDPARAM rather than dereference a NULL plane or a NULL result block.
+  if (pCurData == NULL || pRefData == NULL || pResult == NULL) {
+    return RET_INVALIDPARAM;
+  }
+  // Record the two planes, then dispatch on the iCalcBgd / iCalcSsd / iCalcVar switches.
+  pResult->pCurY = pCurData;
+  pResult->pRefY = pRefData;
+  if (m_sCalcParam.iCalcBgd) {
+    if (m_sCalcParam.iCalcSsd) {
+      m_sVaaFuncs.pfVAACalcSadSsdBgd (pCurData, pRefData, iPicWidth, iPicHeight, iPicStride, &pResult->iFrameSad,
+                                      (int32_t*)pResult->pSad8x8, pResult->pSum16x16, pResult->pSumOfSquare16x16, pResult->pSsd16x16,
+                                      (int32_t*)pResult->pSumOfDiff8x8, (uint8_t*)pResult->pMad8x8);
+    } else {
+      m_sVaaFuncs.pfVAACalcSadBgd (pCurData, pRefData, iPicWidth, iPicHeight, iPicStride, &pResult->iFrameSad,
+                                   (int32_t*) (pResult->pSad8x8), (int32_t*) (pResult->pSumOfDiff8x8), (uint8_t*)pResult->pMad8x8);
+    }
+  } else {
+    if (m_sCalcParam.iCalcSsd) {
+      m_sVaaFuncs.pfVAACalcSadSsd (pCurData, pRefData, iPicWidth, iPicHeight, iPicStride, &pResult->iFrameSad,
+                                   (int32_t*)pResult->pSad8x8, pResult->pSum16x16, pResult->pSumOfSquare16x16, pResult->pSsd16x16);
+    } else {
+      if (m_sCalcParam.iCalcVar) {
+        m_sVaaFuncs.pfVAACalcSadVar (pCurData, pRefData, iPicWidth, iPicHeight, iPicStride, &pResult->iFrameSad,
+                                     (int32_t*)pResult->pSad8x8, pResult->pSum16x16, pResult->pSumOfSquare16x16);
+      } else {
+        m_sVaaFuncs.pfVAACalcSad (pCurData, pRefData, iPicWidth, iPicHeight, iPicStride, &pResult->iFrameSad,
+                                  (int32_t*)pResult->pSad8x8);
+      }
+    }
+  }
+
+  return RET_SUCCESS;
+}
+// Store a copy of the caller's SVAACalcParam; RET_INVALIDPARAM when pParam or its pCalcResult is NULL. iType is not read.
+EResult CVAACalculation::Set (int32_t iType, void* pParam) {
+  SVAACalcParam* pCalcParam = (SVAACalcParam*)pParam;
+  if (pCalcParam == NULL || pCalcParam->pCalcResult == NULL) {
+    return RET_INVALIDPARAM;
+  }
+  m_sCalcParam = *pCalcParam;  // struct copy; Process() reads its settings from this member
+
+  return RET_SUCCESS;
+}
+
+
+WELSVP_NAMESPACE_END
--- /dev/null
+++ b/codec/processing/src/vaacalc/vaacalculation.h
@@ -1,0 +1,125 @@
+/*!
+ * \copy
+ * Copyright (c) 2011-2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * \file : vaacalculation.h
+ *
+ * \brief : pVaa calculation class of wels video processor class
+ *
+ * \date : 2011/03/18
+ *
+ * \description : 1. rewrite the package code of pVaa calculation class
+ *
+ *************************************************************************************
+ */
+
+#ifndef WELSVP_VAACALCULATION_H
+#define WELSVP_VAACALCULATION_H
+
+#include "../common/util.h"
+#include "../common/memory.h"
+#include "../common/WelsFrameWork.h"
+#include "../../interface/IWelsVP.h"
+
+WELSVP_NAMESPACE_BEGIN
+// SAD + background-detection kernel type: frame SAD, per-8x8 SAD, plus pSd8x8 / pMad8x8 buffers (Process passes pSumOfDiff8x8 / pMad8x8).
+typedef void (VAACalcSadBgdFunc) (uint8_t* pCurData, uint8_t* pRefData, int32_t iPicWidth, int32_t iPicHeight,
+                                  int32_t iPicStride,
+                                  int32_t* pFrameSad, int32_t* pSad8x8, int32_t* pSd8x8, uint8_t* pMad8x8);
+// SAD + SSD + background-detection kernel type: the buffers above plus pSum16x16, pSumSquare16x16 and pSsd16x16.
+typedef void (VAACalcSadSsdBgdFunc) (uint8_t* pCurData, uint8_t* pRefData, int32_t iPicWidth, int32_t iPicHeight,
+                                     int32_t iPicStride,
+                                     int32_t* pFrameSad, int32_t* pSad8x8, int32_t* pSum16x16, int32_t* pSumSquare16x16,
+                                     int32_t* pSsd16x16, int32_t* pSd8x8, uint8_t* pMad8x8);
+// Plain SAD kernel type: only the frame SAD and the per-8x8 SAD buffer.
+typedef void (VAACalcSadFunc) (uint8_t* pCurData, uint8_t* pRefData, int32_t iPicWidth, int32_t iPicHeight,
+                               int32_t iPicStride,
+                               int32_t* pFrameSad, int32_t* pSad8x8);
+// SAD + variance kernel type: SAD buffers plus pSum16x16 and pSumSquare16x16.
+typedef void (VAACalcSadVarFunc) (uint8_t* pCurData, uint8_t* pRefData, int32_t iPicWidth, int32_t iPicHeight,
+                                  int32_t iPicStride,
+                                  int32_t* pFrameSad, int32_t* pSad8x8, int32_t* pSum16x16, int32_t* pSumSquare16x16);
+// SAD + SSD kernel type: the SAD + variance buffers plus pSsd16x16.
+typedef void (VAACalcSadSsdFunc) (uint8_t* pCurData, uint8_t* pRefData, int32_t iPicWidth, int32_t iPicHeight,
+                                  int32_t iPicStride,
+                                  int32_t* pFrameSad, int32_t* pSad8x8, int32_t* pSum16x16, int32_t* pSumSquare16x16, int32_t* pSsd16x16);
+
+// Pointer-to-function aliases, one per kernel function type.
+typedef VAACalcSadBgdFunc* PVAACalcSadBgdFunc;
+typedef VAACalcSadSsdBgdFunc* PVAACalcSadSsdBgdFunc;
+typedef VAACalcSadFunc* PVAACalcSadFunc;
+typedef VAACalcSadVarFunc* PVAACalcSadVarFunc;
+typedef VAACalcSadSsdFunc* PVAACalcSadSsdFunc;
+// Kernel dispatch table: one function pointer per kernel type.
+typedef struct TagVaaFuncs {
+  PVAACalcSadBgdFunc pfVAACalcSadBgd;
+  PVAACalcSadSsdBgdFunc pfVAACalcSadSsdBgd;
+  PVAACalcSadFunc pfVAACalcSad;
+  PVAACalcSadVarFunc pfVAACalcSadVar;
+  PVAACalcSadSsdFunc pfVAACalcSadSsd;
+} SVaaFuncs;
+
+// C implementations, one per kernel type (declared through the function typedefs).
+VAACalcSadBgdFunc VAACalcSadBgd_c;
+VAACalcSadSsdBgdFunc VAACalcSadSsdBgd_c;
+VAACalcSadFunc VAACalcSad_c;
+VAACalcSadVarFunc VAACalcSadVar_c;
+VAACalcSadSsdFunc VAACalcSadSsd_c;
+
+// SSE2 variants, declared only when X86_ASM is defined (WELSVP_EXTERN_C_BEGIN/END presumably give C linkage - confirm).
+#ifdef X86_ASM
+WELSVP_EXTERN_C_BEGIN
+VAACalcSadBgdFunc VAACalcSadBgd_sse2;
+VAACalcSadSsdBgdFunc VAACalcSadSsdBgd_sse2;
+VAACalcSadFunc VAACalcSad_sse2;
+VAACalcSadVarFunc VAACalcSadVar_sse2;
+VAACalcSadSsdFunc VAACalcSadSsd_sse2;
+WELSVP_EXTERN_C_END
+#endif
+// IStrategy implementation that runs one of the VAA kernels on a source/reference picture pair.
+class CVAACalculation : public IStrategy {
+ public:
+  CVAACalculation (int32_t iCpuFlag);  // iCpuFlag is kept and forwarded to InitVaaFuncs
+  ~CVAACalculation();
+
+  EResult Process (int32_t iType, SPixMap* pCurPixMap, SPixMap* pRefPixMap);  // runs the kernel chosen by m_sCalcParam
+  EResult Set (int32_t iType, void* pParam);  // pParam is read as SVAACalcParam*
+
+ private:
+  void InitVaaFuncs (SVaaFuncs& sVaaFunc, int32_t iCpuFlag);  // fills the kernel table
+
+ private:
+  SVaaFuncs m_sVaaFuncs;  // kernel table called by Process
+  int32_t m_iCPUFlag;  // flags given to the constructor
+  SVAACalcParam m_sCalcParam;  // copy of the parameters last accepted by Set
+};
+
+WELSVP_NAMESPACE_END
+
+#endif
--- /dev/null
+++ b/codec/processing/targets.mk
@@ -1,0 +1,114 @@
+PROCESSING_PREFIX=PROCESSING
+PROCESSING_SRCDIR=codec/processing
+PROCESSING_CPP_SRCS=\
+	$(PROCESSING_SRCDIR)/./src/adaptivequantization/AdaptiveQuantization.cpp\
+	$(PROCESSING_SRCDIR)/./src/backgounddetection/BackgroundDetection.cpp\
+	$(PROCESSING_SRCDIR)/./src/common/cpu.cpp\
+	$(PROCESSING_SRCDIR)/./src/common/memory.cpp\
+	$(PROCESSING_SRCDIR)/./src/common/thread.cpp\
+	$(PROCESSING_SRCDIR)/./src/common/util.cpp\
+	$(PROCESSING_SRCDIR)/./src/common/WelsFrameWork.cpp\
+	$(PROCESSING_SRCDIR)/./src/common/WelsFrameWorkEx.cpp\
+	$(PROCESSING_SRCDIR)/./src/complexityanalysis/ComplexityAnalysis.cpp\
+	$(PROCESSING_SRCDIR)/./src/denoise/denoise.cpp\
+	$(PROCESSING_SRCDIR)/./src/denoise/denoise_filter.cpp\
+	$(PROCESSING_SRCDIR)/./src/downsample/downsample.cpp\
+	$(PROCESSING_SRCDIR)/./src/downsample/downsamplefuncs.cpp\
+	$(PROCESSING_SRCDIR)/./src/imagerotate/imagerotate.cpp\
+	$(PROCESSING_SRCDIR)/./src/imagerotate/imagerotatefuncs.cpp\
+	$(PROCESSING_SRCDIR)/./src/scenechangedetection/SceneChangeDetection.cpp\
+	$(PROCESSING_SRCDIR)/./src/scenechangedetection/SceneChangeDetectionCommon.cpp\
+	$(PROCESSING_SRCDIR)/./src/vaacalc/vaacalcfuncs.cpp\
+	$(PROCESSING_SRCDIR)/./src/vaacalc/vaacalculation.cpp\
+
+PROCESSING_OBJS += $(PROCESSING_CPP_SRCS:.cpp=.o)
+ifeq ($(USE_ASM), Yes) # assembly sources and their objects are added only when USE_ASM is Yes
+PROCESSING_ASM_SRCS=\
+	$(PROCESSING_SRCDIR)/./src/asm/denoisefilter.asm\
+	$(PROCESSING_SRCDIR)/./src/asm/downsample_bilinear.asm\
+	$(PROCESSING_SRCDIR)/./src/asm/intra_pred.asm\
+	$(PROCESSING_SRCDIR)/./src/asm/sad.asm\
+	$(PROCESSING_SRCDIR)/./src/asm/vaa.asm\
+
+PROCESSING_OBJS += $(PROCESSING_ASM_SRCS:.asm=.o)
+endif # USE_ASM
+# Register the objects globally, then one explicit rule per object: $@ is the object file, $< its single source.
+OBJS += $(PROCESSING_OBJS)
+$(PROCESSING_SRCDIR)/./src/adaptivequantization/AdaptiveQuantization.o: $(PROCESSING_SRCDIR)/./src/adaptivequantization/AdaptiveQuantization.cpp
+	$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $@ $<
+
+$(PROCESSING_SRCDIR)/./src/backgounddetection/BackgroundDetection.o: $(PROCESSING_SRCDIR)/./src/backgounddetection/BackgroundDetection.cpp
+	$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $@ $<
+
+$(PROCESSING_SRCDIR)/./src/common/cpu.o: $(PROCESSING_SRCDIR)/./src/common/cpu.cpp
+	$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $@ $<
+
+$(PROCESSING_SRCDIR)/./src/common/memory.o: $(PROCESSING_SRCDIR)/./src/common/memory.cpp
+	$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $@ $<
+
+$(PROCESSING_SRCDIR)/./src/common/thread.o: $(PROCESSING_SRCDIR)/./src/common/thread.cpp
+	$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $@ $<
+
+$(PROCESSING_SRCDIR)/./src/common/util.o: $(PROCESSING_SRCDIR)/./src/common/util.cpp
+	$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $@ $<
+
+$(PROCESSING_SRCDIR)/./src/common/WelsFrameWork.o: $(PROCESSING_SRCDIR)/./src/common/WelsFrameWork.cpp
+	$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $@ $<
+
+$(PROCESSING_SRCDIR)/./src/common/WelsFrameWorkEx.o: $(PROCESSING_SRCDIR)/./src/common/WelsFrameWorkEx.cpp
+	$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $@ $<
+
+$(PROCESSING_SRCDIR)/./src/complexityanalysis/ComplexityAnalysis.o: $(PROCESSING_SRCDIR)/./src/complexityanalysis/ComplexityAnalysis.cpp
+	$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $@ $<
+
+$(PROCESSING_SRCDIR)/./src/denoise/denoise.o: $(PROCESSING_SRCDIR)/./src/denoise/denoise.cpp
+	$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $@ $<
+
+$(PROCESSING_SRCDIR)/./src/denoise/denoise_filter.o: $(PROCESSING_SRCDIR)/./src/denoise/denoise_filter.cpp
+	$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $@ $<
+
+$(PROCESSING_SRCDIR)/./src/downsample/downsample.o: $(PROCESSING_SRCDIR)/./src/downsample/downsample.cpp
+	$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $@ $<
+
+$(PROCESSING_SRCDIR)/./src/downsample/downsamplefuncs.o: $(PROCESSING_SRCDIR)/./src/downsample/downsamplefuncs.cpp
+	$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $@ $<
+
+$(PROCESSING_SRCDIR)/./src/imagerotate/imagerotate.o: $(PROCESSING_SRCDIR)/./src/imagerotate/imagerotate.cpp
+	$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $@ $<
+
+$(PROCESSING_SRCDIR)/./src/imagerotate/imagerotatefuncs.o: $(PROCESSING_SRCDIR)/./src/imagerotate/imagerotatefuncs.cpp
+	$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $@ $<
+
+$(PROCESSING_SRCDIR)/./src/scenechangedetection/SceneChangeDetection.o: $(PROCESSING_SRCDIR)/./src/scenechangedetection/SceneChangeDetection.cpp
+	$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $@ $<
+
+$(PROCESSING_SRCDIR)/./src/scenechangedetection/SceneChangeDetectionCommon.o: $(PROCESSING_SRCDIR)/./src/scenechangedetection/SceneChangeDetectionCommon.cpp
+	$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $@ $<
+
+$(PROCESSING_SRCDIR)/./src/vaacalc/vaacalcfuncs.o: $(PROCESSING_SRCDIR)/./src/vaacalc/vaacalcfuncs.cpp
+	$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $@ $<
+
+$(PROCESSING_SRCDIR)/./src/vaacalc/vaacalculation.o: $(PROCESSING_SRCDIR)/./src/vaacalc/vaacalculation.cpp
+	$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $@ $<
+# Assembly objects: rules are declared unconditionally; the objects join PROCESSING_OBJS only when USE_ASM is Yes.
+$(PROCESSING_SRCDIR)/./src/asm/denoisefilter.o: $(PROCESSING_SRCDIR)/./src/asm/denoisefilter.asm
+	$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(PROCESSING_ASMFLAGS) $(PROCESSING_ASM_INCLUDES) -o $@ $<
+
+$(PROCESSING_SRCDIR)/./src/asm/downsample_bilinear.o: $(PROCESSING_SRCDIR)/./src/asm/downsample_bilinear.asm
+	$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(PROCESSING_ASMFLAGS) $(PROCESSING_ASM_INCLUDES) -o $@ $<
+
+$(PROCESSING_SRCDIR)/./src/asm/intra_pred.o: $(PROCESSING_SRCDIR)/./src/asm/intra_pred.asm
+	$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(PROCESSING_ASMFLAGS) $(PROCESSING_ASM_INCLUDES) -o $@ $<
+
+$(PROCESSING_SRCDIR)/./src/asm/sad.o: $(PROCESSING_SRCDIR)/./src/asm/sad.asm
+	$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(PROCESSING_ASMFLAGS) $(PROCESSING_ASM_INCLUDES) -o $@ $<
+
+$(PROCESSING_SRCDIR)/./src/asm/vaa.o: $(PROCESSING_SRCDIR)/./src/asm/vaa.asm
+	$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(PROCESSING_ASMFLAGS) $(PROCESSING_ASM_INCLUDES) -o $@ $<
+# Static library: delete any existing archive, then archive every processing object into it.
+$(LIBPREFIX)processing.$(LIBSUFFIX): $(PROCESSING_OBJS)
+	rm -f $@
+	$(AR) cr $@ $(PROCESSING_OBJS)
+
+libraries: $(LIBPREFIX)processing.$(LIBSUFFIX)
+LIBRARIES += $(LIBPREFIX)processing.$(LIBSUFFIX)
--- a/processing/build/linux/makefile
+++ /dev/null
@@ -1,94 +1,0 @@
-NASM = 1
-NAME = libwelsvp
-
-OUTDIR = ../../../bin/linux
-BINDIR = ../../bin
-OBJDIR = ../../obj
-SRCDIRS = ../../src/asm \
- ../../src/common \
- ../../src/adaptivequantization \
- ../../src/backgounddetection \
- ../../src/denoise \
- ../../src/downsample \
- ../../src/scenechangedetection \
- ../../src/vaacalc \
- ../../src/complexityanalysis
-SRCDIRS += ../../src/imagerotate
-
-
-TARGETLIB = $(BINDIR)/$(NAME).so
-
-CC = $(shell which gcc)
-AS = $(shell which nasm)
-GCC = gcc -m32
-
-CPPFLAGS = -Wall -g -O3
-ifeq ($(NASM), 1)
-CPPFLAGS += -DX86_ASM
-endif
-ASMFLAGS = -f elf -DNOPREFIX -I ../../src/asm/
-LDFLAGS = -lstdc++ -ldl
-
-SRCEXTS = .cpp
-ifeq ($(NASM), 1)
-SRCEXTS += .asm
-endif
-HDREXTS = .h
-SOURCES = $(foreach d,$(SRCDIRS),$(wildcard $(addprefix $(d)/*,$(SRCEXTS))))
-HEADERS = $(foreach d,$(SRCDIRS),$(wildcard $(addprefix $(d)/*,$(HDREXTS))))
-SRC_CPP = $(filter %.cpp,$(SOURCES))
-SRC_ASM = $(filter %.asm,$(SOURCES))
-OBJS = $(addsuffix .o, $(basename $(SOURCES)))
-DEPS = $(OBJS:.o=.d)
-
-DEP_OPT = $(shell if `$(CC) --version | grep "GCC" >/dev/null`; then \
- echo "-MM -MP"; else echo "-M"; fi )
-DEPEND_cpp.d = $(subst -g ,,$(CC) $(DEP_OPT) $(CPPFLAGS))
-DEPEND_asm.d = $(subst -g ,,$(AS) $(DEP_OPT) $(ASMFLAGS))
-COMPILE.cpp = $(GCC) $(CPPFLAGS) -c
-COMPILE.asm = $(AS) $(ASMFLAGS)
-LINK = $(GCC) $(LDFLAGS)
-
-.PHONY: all objs tags ctags clean distclean
-
-.SUFFIXES:
-
-all: $(TARGETLIB)
-
-%.d:%.cpp
- @echo -n $(dir $<) > $@
- @$(DEPEND_cpp.d) $< >> $@
-
-%.d:%.asm
- @echo -n $(dir $<) > $@
- @$(DEPEND_asm.d) $< >> $@
-
-objs:$(OBJS)
-
-%.o:%.cpp
- $(COMPILE.cpp) $< -o $@
-
-%.o:%.asm
- $(COMPILE.asm) $< -o $@
-
-tags: $(HEADERS) $(SOURCES)
- etags $(HEADERS) $(SOURCES)
-
-ctags: $(HEADERS) $(SOURCES)
- ctags $(HEADERS) $(SOURCES)
-
-$(TARGETLIB):$(OBJS)
- @if test ! -d $(BINDIR) ; then mkdir -p $(BINDIR) ; fi
- $(LINK) $(OBJS) -shared -Wl,-Bsymbolic -o $@
- @echo produce the lib to $(TARGETLIB).
- @if test ! -d $(OUTDIR) ; then mkdir -p $(OUTDIR) ; fi
- @cp -f $(TARGETLIB) $(OUTDIR)
- @cp -f $(TARGETLIB) ../../../testbin
- @echo copy the lib to $(OUTDIR).
-
-clean:
- rm -f $(OBJS) $(TARGETLIB)
-
-distclean: clean
- rm -f $(DEPS) TAGS
-
--- a/processing/build/win32/WelsVP_2008.sln
+++ /dev/null
@@ -1,20 +1,0 @@
-
-Microsoft Visual Studio Solution File, Format Version 10.00
-# Visual Studio 2008
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "WelsVP", "WelsVP_2008.vcproj", "{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}"
-EndProject
-Global
- GlobalSection(SolutionConfigurationPlatforms) = preSolution
- Debug|Win32 = Debug|Win32
- Release|Win32 = Release|Win32
- EndGlobalSection
- GlobalSection(ProjectConfigurationPlatforms) = postSolution
- {E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Debug|Win32.ActiveCfg = Debug|Win32
- {E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Debug|Win32.Build.0 = Debug|Win32
- {E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Release|Win32.ActiveCfg = Release|Win32
- {E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Release|Win32.Build.0 = Release|Win32
- EndGlobalSection
- GlobalSection(SolutionProperties) = preSolution
- HideSolutionNode = FALSE
- EndGlobalSection
-EndGlobal
binary files a/processing/build/win32/WelsVP_2008.suo /dev/null differ
--- a/processing/build/win32/WelsVP_2008.vcproj
+++ /dev/null
@@ -1,900 +1,0 @@
-<?xml version="1.0" encoding="gb2312"?>
-<VisualStudioProject
- ProjectType="Visual C++"
- Version="9.00"
- Name="WelsVP"
- ProjectGUID="{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}"
- RootNamespace="WelsVP"
- Keyword="Win32Proj"
- TargetFrameworkVersion="196613"
- >
- <Platforms>
- <Platform
- Name="Win32"
- />
- <Platform
- Name="x64"
- />
- </Platforms>
- <ToolFiles>
- <DefaultToolFile
- FileName="masm.rules"
- />
- </ToolFiles>
- <Configurations>
- <Configuration
- Name="Debug|Win32"
- OutputDirectory=".\..\..\..\bin\win32\Debug"
- IntermediateDirectory=".\..\..\..\obj\vp\Debug"
- ConfigurationType="2"
- CharacterSet="1"
- WholeProgramOptimization="0"
- >
- <Tool
- Name="VCPreBuildEventTool"
- />
- <Tool
- Name="VCCustomBuildTool"
- CommandLine=""
- />
- <Tool
- Name="MASM"
- />
- <Tool
- Name="VCXMLDataGeneratorTool"
- />
- <Tool
- Name="VCWebServiceProxyGeneratorTool"
- />
- <Tool
- Name="VCMIDLTool"
- />
- <Tool
- Name="VCCLCompilerTool"
- Optimization="0"
- AdditionalIncludeDirectories=""
- PreprocessorDefinitions="WIN32;_DEBUG;_WINDOWS;_USRDLL;WELSVP_EXPORTS;X86_ASM"
- MinimalRebuild="true"
- BasicRuntimeChecks="3"
- RuntimeLibrary="1"
- UsePrecompiledHeader="0"
- AssemblerListingLocation=""
- WarningLevel="3"
- DebugInformationFormat="4"
- />
- <Tool
- Name="VCManagedResourceCompilerTool"
- />
- <Tool
- Name="VCResourceCompilerTool"
- />
- <Tool
- Name="VCPreLinkEventTool"
- />
- <Tool
- Name="VCLinkerTool"
- LinkLibraryDependencies="true"
- OutputFile="$(OutDir)\welsvp.dll"
- LinkIncremental="2"
- ModuleDefinitionFile="../../src/common/WelsVP.def"
- GenerateDebugInformation="true"
- GenerateMapFile="true"
- MapFileName="$(OutDir)\welsvp.map"
- SubSystem="2"
- TargetMachine="1"
- />
- <Tool
- Name="VCALinkTool"
- />
- <Tool
- Name="VCManifestTool"
- />
- <Tool
- Name="VCXDCMakeTool"
- />
- <Tool
- Name="VCBscMakeTool"
- />
- <Tool
- Name="VCFxCopTool"
- />
- <Tool
- Name="VCAppVerifierTool"
- />
- <Tool
- Name="VCPostBuildEventTool"
- CommandLine=""
- />
- </Configuration>
- <Configuration
- Name="Release|Win32"
- OutputDirectory=".\..\..\..\bin\win32\Release"
- IntermediateDirectory=".\..\..\..\obj\vp\Release"
- ConfigurationType="2"
- CharacterSet="1"
- WholeProgramOptimization="1"
- >
- <Tool
- Name="VCPreBuildEventTool"
- CommandLine=""
- />
- <Tool
- Name="VCCustomBuildTool"
- CommandLine=""
- />
- <Tool
- Name="MASM"
- />
- <Tool
- Name="VCXMLDataGeneratorTool"
- />
- <Tool
- Name="VCWebServiceProxyGeneratorTool"
- />
- <Tool
- Name="VCMIDLTool"
- />
- <Tool
- Name="VCCLCompilerTool"
- Optimization="3"
- EnableIntrinsicFunctions="false"
- FavorSizeOrSpeed="1"
- PreprocessorDefinitions="WIN32;NDEBUG;_WINDOWS;_USRDLL;WELSVP_EXPORTS;X86_ASM"
- RuntimeLibrary="0"
- EnableFunctionLevelLinking="false"
- UsePrecompiledHeader="0"
- WarningLevel="3"
- DebugInformationFormat="0"
- />
- <Tool
- Name="VCManagedResourceCompilerTool"
- />
- <Tool
- Name="VCResourceCompilerTool"
- />
- <Tool
- Name="VCPreLinkEventTool"
- />
- <Tool
- Name="VCLinkerTool"
- OutputFile="$(OutDir)\welsvp.dll"
- LinkIncremental="1"
- GenerateManifest="false"
- EnableUAC="false"
- ModuleDefinitionFile="../../src/common/WelsVP.def"
- GenerateDebugInformation="false"
- GenerateMapFile="false"
- MapFileName=""
- MapExports="false"
- SubSystem="2"
- OptimizeReferences="2"
- EnableCOMDATFolding="2"
- TargetMachine="1"
- />
- <Tool
- Name="VCALinkTool"
- />
- <Tool
- Name="VCManifestTool"
- />
- <Tool
- Name="VCXDCMakeTool"
- />
- <Tool
- Name="VCBscMakeTool"
- />
- <Tool
- Name="VCFxCopTool"
- />
- <Tool
- Name="VCAppVerifierTool"
- />
- <Tool
- Name="VCPostBuildEventTool"
- CommandLine=""
- />
- </Configuration>
- <Configuration
- Name="Debug|x64"
- OutputDirectory=".\..\..\..\bin\win32\Debug"
- IntermediateDirectory=".\..\..\..\obj\vp\Debug"
- ConfigurationType="2"
- CharacterSet="1"
- WholeProgramOptimization="0"
- >
- <Tool
- Name="VCPreBuildEventTool"
- />
- <Tool
- Name="VCCustomBuildTool"
- CommandLine=""
- />
- <Tool
- Name="MASM"
- />
- <Tool
- Name="VCXMLDataGeneratorTool"
- />
- <Tool
- Name="VCWebServiceProxyGeneratorTool"
- />
- <Tool
- Name="VCMIDLTool"
- TargetEnvironment="3"
- />
- <Tool
- Name="VCCLCompilerTool"
- Optimization="0"
- AdditionalIncludeDirectories=""
- PreprocessorDefinitions="WIN64;_DEBUG;_WINDOWS;_USRDLL;WELSVP_EXPORTS"
- MinimalRebuild="true"
- BasicRuntimeChecks="3"
- RuntimeLibrary="1"
- UsePrecompiledHeader="0"
- AssemblerListingLocation=""
- WarningLevel="3"
- DebugInformationFormat="3"
- />
- <Tool
- Name="VCManagedResourceCompilerTool"
- />
- <Tool
- Name="VCResourceCompilerTool"
- />
- <Tool
- Name="VCPreLinkEventTool"
- />
- <Tool
- Name="VCLinkerTool"
- LinkLibraryDependencies="true"
- OutputFile="$(OutDir)\welsvp.dll"
- LinkIncremental="2"
- ModuleDefinitionFile="../../src/common/WelsVP.def"
- GenerateDebugInformation="true"
- GenerateMapFile="true"
- MapFileName="$(OutDir)\welsvp.map"
- SubSystem="2"
- TargetMachine="17"
- />
- <Tool
- Name="VCALinkTool"
- />
- <Tool
- Name="VCManifestTool"
- />
- <Tool
- Name="VCXDCMakeTool"
- />
- <Tool
- Name="VCBscMakeTool"
- />
- <Tool
- Name="VCFxCopTool"
- />
- <Tool
- Name="VCAppVerifierTool"
- />
- <Tool
- Name="VCPostBuildEventTool"
- CommandLine=""
- />
- </Configuration>
- <Configuration
- Name="Release|x64"
- OutputDirectory=".\..\..\..\bin\win64\Release"
- IntermediateDirectory=".\..\..\..\obj\vp\Release"
- ConfigurationType="2"
- CharacterSet="1"
- WholeProgramOptimization="1"
- >
- <Tool
- Name="VCPreBuildEventTool"
- CommandLine=""
- />
- <Tool
- Name="VCCustomBuildTool"
- CommandLine=""
- />
- <Tool
- Name="MASM"
- />
- <Tool
- Name="VCXMLDataGeneratorTool"
- />
- <Tool
- Name="VCWebServiceProxyGeneratorTool"
- />
- <Tool
- Name="VCMIDLTool"
- TargetEnvironment="3"
- />
- <Tool
- Name="VCCLCompilerTool"
- Optimization="3"
- EnableIntrinsicFunctions="false"
- FavorSizeOrSpeed="1"
- PreprocessorDefinitions="WIN64;NDEBUG;_WINDOWS;_USRDLL;WELSVP_EXPORTS"
- RuntimeLibrary="0"
- EnableFunctionLevelLinking="false"
- UsePrecompiledHeader="0"
- WarningLevel="3"
- DebugInformationFormat="0"
- />
- <Tool
- Name="VCManagedResourceCompilerTool"
- />
- <Tool
- Name="VCResourceCompilerTool"
- />
- <Tool
- Name="VCPreLinkEventTool"
- />
- <Tool
- Name="VCLinkerTool"
- OutputFile="$(OutDir)\welsvp.dll"
- LinkIncremental="1"
- GenerateManifest="false"
- EnableUAC="false"
- ModuleDefinitionFile="../../src/common/WelsVP.def"
- GenerateDebugInformation="false"
- GenerateMapFile="false"
- MapFileName=""
- MapExports="false"
- SubSystem="2"
- OptimizeReferences="2"
- EnableCOMDATFolding="2"
- TargetMachine="17"
- />
- <Tool
- Name="VCALinkTool"
- />
- <Tool
- Name="VCManifestTool"
- />
- <Tool
- Name="VCXDCMakeTool"
- />
- <Tool
- Name="VCBscMakeTool"
- />
- <Tool
- Name="VCFxCopTool"
- />
- <Tool
- Name="VCAppVerifierTool"
- />
- <Tool
- Name="VCPostBuildEventTool"
- CommandLine=""
- />
- </Configuration>
- </Configurations>
- <References>
- </References>
- <Files>
- <Filter
- Name="Source Files"
- Filter="cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx"
- UniqueIdentifier="{4FC737F1-C7A5-4376-A066-2A32D752A2FF}"
- >
- <File
- RelativePath="..\..\src\common\cpu.cpp"
- >
- </File>
- <File
- RelativePath="..\..\src\common\memory.cpp"
- >
- </File>
- <File
- RelativePath="..\..\src\common\thread.cpp"
- >
- </File>
- <File
- RelativePath="..\..\src\common\util.cpp"
- >
- </File>
- <File
- RelativePath="..\..\src\common\WelsFrameWork.cpp"
- >
- </File>
- <File
- RelativePath="..\..\src\common\WelsFrameWorkEx.cpp"
- >
- </File>
- </Filter>
- <Filter
- Name="Interface"
- Filter="h;hpp;hxx;hm;inl;inc;xsd"
- UniqueIdentifier="{93995380-89BD-4b04-88EB-625FBE52EBFB}"
- >
- <File
- RelativePath="..\..\interface\IWelsVP.h"
- >
- <FileConfiguration
- Name="Release|Win32"
- >
- <Tool
- Name="VCCLCompilerTool"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Release|x64"
- >
- <Tool
- Name="VCCLCompilerTool"
- />
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\..\src\common\resource.h"
- >
- </File>
- </Filter>
- <Filter
- Name="Resource Files"
- Filter="rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav"
- UniqueIdentifier="{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}"
- >
- <File
- RelativePath="..\..\src\common\WelsVP.def"
- >
- </File>
- <File
- RelativePath="..\..\src\common\WelsVP.rc"
- >
- </File>
- </Filter>
- <Filter
- Name="Header Files"
- >
- <File
- RelativePath="..\..\src\common\cpu.h"
- >
- </File>
- <File
- RelativePath="..\..\src\common\memory.h"
- >
- </File>
- <File
- RelativePath="..\..\src\common\thread.h"
- >
- </File>
- <File
- RelativePath="..\..\src\common\typedef.h"
- >
- </File>
- <File
- RelativePath="..\..\src\common\util.h"
- >
- </File>
- <File
- RelativePath="..\..\src\common\version.h"
- >
- </File>
- <File
- RelativePath="..\..\src\common\WelsFrameWork.h"
- >
- </File>
- </Filter>
- <Filter
- Name="ASM"
- >
- <File
- RelativePath="..\..\src\asm\asm_inc.asm"
- >
- <FileConfiguration
- Name="Debug|Win32"
- >
- <Tool
- Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
- Outputs="$(IntDir)\$(InputName).obj"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Release|Win32"
- >
- <Tool
- Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
- Outputs="$(IntDir)\$(InputName).obj"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Debug|x64"
- ExcludedFromBuild="true"
- >
- <Tool
- Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
- Outputs="$(IntDir)\$(InputName).obj"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Release|x64"
- ExcludedFromBuild="true"
- >
- <Tool
- Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
- Outputs="$(IntDir)\$(InputName).obj"
- />
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\..\src\asm\cpuid.asm"
- >
- <FileConfiguration
- Name="Debug|Win32"
- >
- <Tool
- Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
- Outputs="$(IntDir)\$(InputName).obj"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Release|Win32"
- >
- <Tool
- Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
- Outputs="$(IntDir)\$(InputName).obj"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Debug|x64"
- ExcludedFromBuild="true"
- >
- <Tool
- Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
- Outputs="$(IntDir)\$(InputName).obj"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Release|x64"
- ExcludedFromBuild="true"
- >
- <Tool
- Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
- Outputs="$(IntDir)\$(InputName).obj"
- />
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\..\src\asm\denoisefilter.asm"
- >
- <FileConfiguration
- Name="Debug|Win32"
- >
- <Tool
- Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
- Outputs="$(IntDir)\$(InputName).obj"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Release|Win32"
- >
- <Tool
- Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
- Outputs="$(IntDir)\$(InputName).obj"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Debug|x64"
- ExcludedFromBuild="true"
- >
- <Tool
- Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
- Outputs="$(IntDir)\$(InputName).obj"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Release|x64"
- ExcludedFromBuild="true"
- >
- <Tool
- Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
- Outputs="$(IntDir)\$(InputName).obj"
- />
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\..\src\asm\downsample_bilinear.asm"
- >
- <FileConfiguration
- Name="Debug|Win32"
- >
- <Tool
- Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
- Outputs="$(IntDir)\$(InputName).obj"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Release|Win32"
- >
- <Tool
- Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
- Outputs="$(IntDir)\$(InputName).obj"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Debug|x64"
- ExcludedFromBuild="true"
- >
- <Tool
- Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
- Outputs="$(IntDir)\$(InputName).obj"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Release|x64"
- ExcludedFromBuild="true"
- >
- <Tool
- Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
- Outputs="$(IntDir)\$(InputName).obj"
- />
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\..\src\asm\intra_pred.asm"
- >
- <FileConfiguration
- Name="Debug|Win32"
- >
- <Tool
- Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
- Outputs="$(IntDir)\$(InputName).obj"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Release|Win32"
- >
- <Tool
- Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
- Outputs="$(IntDir)\$(InputName).obj"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Debug|x64"
- ExcludedFromBuild="true"
- >
- <Tool
- Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
- Outputs="$(IntDir)\$(InputName).obj"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Release|x64"
- ExcludedFromBuild="true"
- >
- <Tool
- Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
- Outputs="$(IntDir)\$(InputName).obj"
- />
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\..\src\asm\sad.asm"
- >
- <FileConfiguration
- Name="Debug|Win32"
- >
- <Tool
- Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
- Outputs="$(IntDir)\$(InputName).obj"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Release|Win32"
- >
- <Tool
- Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
- Outputs="$(IntDir)\$(InputName).obj"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Debug|x64"
- ExcludedFromBuild="true"
- >
- <Tool
- Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
- Outputs="$(IntDir)\$(InputName).obj"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Release|x64"
- ExcludedFromBuild="true"
- >
- <Tool
- Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
- Outputs="$(IntDir)\$(InputName).obj"
- />
- </FileConfiguration>
- </File>
- <File
- RelativePath="..\..\src\asm\vaa.asm"
- >
- <FileConfiguration
- Name="Debug|Win32"
- >
- <Tool
- Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
- Outputs="$(IntDir)\$(InputName).obj"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Release|Win32"
- >
- <Tool
- Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
- Outputs="$(IntDir)\$(InputName).obj"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Debug|x64"
- ExcludedFromBuild="true"
- >
- <Tool
- Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
- Outputs="$(IntDir)\$(InputName).obj"
- />
- </FileConfiguration>
- <FileConfiguration
- Name="Release|x64"
- ExcludedFromBuild="true"
- >
- <Tool
- Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -f win32 -DPREFIX -o $(IntDir)\$(InputName).obj $(InputPath)
"
- Outputs="$(IntDir)\$(InputName).obj"
- />
- </FileConfiguration>
- </File>
- </Filter>
- <Filter
- Name="SceneChangeDetection"
- >
- <File
- RelativePath="..\..\src\scenechangedetection\SceneChangeDetection.cpp"
- >
- </File>
- <File
- RelativePath="..\..\src\scenechangedetection\SceneChangeDetection.h"
- >
- </File>
- <File
- RelativePath="..\..\src\scenechangedetection\SceneChangeDetectionCommon.cpp"
- >
- </File>
- <File
- RelativePath="..\..\src\scenechangedetection\SceneChangeDetectionCommon.h"
- >
- </File>
- </Filter>
- <Filter
- Name="Denoise"
- >
- <File
- RelativePath="..\..\src\denoise\denoise.cpp"
- >
- </File>
- <File
- RelativePath="..\..\src\denoise\denoise.h"
- >
- </File>
- <File
- RelativePath="..\..\src\denoise\denoise_filter.cpp"
- >
- </File>
- </Filter>
- <Filter
- Name="VAACalc"
- >
- <File
- RelativePath="..\..\src\vaacalc\vaacalcfuncs.cpp"
- >
- </File>
- <File
- RelativePath="..\..\src\vaacalc\vaacalculation.cpp"
- >
- </File>
- <File
- RelativePath="..\..\src\vaacalc\vaacalculation.h"
- >
- </File>
- </Filter>
- <Filter
- Name="BackgroundDetection"
- >
- <File
- RelativePath="..\..\src\backgounddetection\BackgroundDetection.cpp"
- >
- </File>
- <File
- RelativePath="..\..\src\backgounddetection\BackgroundDetection.h"
- >
- </File>
- </Filter>
- <Filter
- Name="AdaptiveQuantization"
- >
- <File
- RelativePath="..\..\src\adaptivequantization\AdaptiveQuantization.cpp"
- >
- </File>
- <File
- RelativePath="..\..\src\adaptivequantization\AdaptiveQuantization.h"
- >
- </File>
- </Filter>
- <Filter
- Name="Downsample"
- >
- <File
- RelativePath="..\..\src\downsample\downsample.cpp"
- >
- </File>
- <File
- RelativePath="..\..\src\downsample\downsample.h"
- >
- </File>
- <File
- RelativePath="..\..\src\downsample\downsamplefuncs.cpp"
- >
- </File>
- </Filter>
- <Filter
- Name="ComplexityAnalysis"
- >
- <File
- RelativePath="..\..\src\complexityanalysis\ComplexityAnalysis.cpp"
- >
- </File>
- <File
- RelativePath="..\..\src\complexityanalysis\ComplexityAnalysis.h"
- >
- </File>
- </Filter>
- <Filter
- Name="ImageRotate"
- >
- <File
- RelativePath="..\..\src\imagerotate\imagerotate.cpp"
- >
- </File>
- <File
- RelativePath="..\..\src\imagerotate\imagerotate.h"
- >
- </File>
- <File
- RelativePath="..\..\src\imagerotate\imagerotatefuncs.cpp"
- >
- </File>
- </Filter>
- </Files>
- <Globals>
- </Globals>
-</VisualStudioProject>
--- a/processing/build/win32/WelsVP_2010.sln
+++ /dev/null
@@ -1,20 +1,0 @@
-
-Microsoft Visual Studio Solution File, Format Version 11.00
-# Visual Studio 2010
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "WelsVP_2010", "WelsVP_2010.vcxproj", "{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}"
-EndProject
-Global
- GlobalSection(SolutionConfigurationPlatforms) = preSolution
- Debug|Win32 = Debug|Win32
- Release|Win32 = Release|Win32
- EndGlobalSection
- GlobalSection(ProjectConfigurationPlatforms) = postSolution
- {E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Debug|Win32.ActiveCfg = Debug|Win32
- {E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Debug|Win32.Build.0 = Debug|Win32
- {E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Release|Win32.ActiveCfg = Release|Win32
- {E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Release|Win32.Build.0 = Release|Win32
- EndGlobalSection
- GlobalSection(SolutionProperties) = preSolution
- HideSolutionNode = FALSE
- EndGlobalSection
-EndGlobal
binary files a/processing/build/win32/WelsVP_2010.suo /dev/null differ
--- a/processing/build/win32/WelsVP_2010.vcxproj
+++ /dev/null
@@ -1,435 +1,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
- <ItemGroup Label="ProjectConfigurations">
- <ProjectConfiguration Include="Debug|Win32">
- <Configuration>Debug</Configuration>
- <Platform>Win32</Platform>
- </ProjectConfiguration>
- <ProjectConfiguration Include="Debug|x64">
- <Configuration>Debug</Configuration>
- <Platform>x64</Platform>
- </ProjectConfiguration>
- <ProjectConfiguration Include="Release|Win32">
- <Configuration>Release</Configuration>
- <Platform>Win32</Platform>
- </ProjectConfiguration>
- <ProjectConfiguration Include="Release|x64">
- <Configuration>Release</Configuration>
- <Platform>x64</Platform>
- </ProjectConfiguration>
- </ItemGroup>
- <PropertyGroup Label="Globals">
- <ProjectGuid>{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}</ProjectGuid>
- <RootNamespace>WelsVP</RootNamespace>
- <Keyword>Win32Proj</Keyword>
- </PropertyGroup>
- <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
- <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
- <ConfigurationType>DynamicLibrary</ConfigurationType>
- <CharacterSet>Unicode</CharacterSet>
- <WholeProgramOptimization>true</WholeProgramOptimization>
- </PropertyGroup>
- <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
- <ConfigurationType>DynamicLibrary</ConfigurationType>
- <CharacterSet>Unicode</CharacterSet>
- <WholeProgramOptimization>true</WholeProgramOptimization>
- </PropertyGroup>
- <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
- <ConfigurationType>DynamicLibrary</ConfigurationType>
- <CharacterSet>Unicode</CharacterSet>
- <WholeProgramOptimization>false</WholeProgramOptimization>
- </PropertyGroup>
- <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
- <ConfigurationType>DynamicLibrary</ConfigurationType>
- <CharacterSet>Unicode</CharacterSet>
- <WholeProgramOptimization>false</WholeProgramOptimization>
- </PropertyGroup>
- <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
- <ImportGroup Label="ExtensionSettings">
- <Import Project="$(VCTargetsPath)\BuildCustomizations\masm.props" />
- </ImportGroup>
- <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="PropertySheets">
- <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
- </ImportGroup>
- <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
- <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
- </ImportGroup>
- <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="PropertySheets">
- <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
- </ImportGroup>
- <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
- <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
- </ImportGroup>
- <PropertyGroup Label="UserMacros" />
- <PropertyGroup>
- <_ProjectFileVersion>10.0.40219.1</_ProjectFileVersion>
- <OutDir Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">.\..\..\..\bin\win32\Debug\</OutDir>
- <OutDir Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">.\..\..\..\bin\win64\Debug\</OutDir>
- <IntDir Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">.\..\..\..\obj\vp\Debug\</IntDir>
- <IntDir Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">.\..\..\..\obj\vp\Debug\</IntDir>
- <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</LinkIncremental>
- <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</LinkIncremental>
- <OutDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">.\..\..\..\bin\win32\Release\</OutDir>
- <OutDir Condition="'$(Configuration)|$(Platform)'=='Release|x64'">.\..\..\..\bin\win64\Release\</OutDir>
- <IntDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">.\..\..\..\obj\vp\Release\</IntDir>
- <IntDir Condition="'$(Configuration)|$(Platform)'=='Release|x64'">.\..\..\..\obj\vp\Release\</IntDir>
- <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">false</LinkIncremental>
- <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</LinkIncremental>
- <GenerateManifest Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">false</GenerateManifest>
- <GenerateManifest Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</GenerateManifest>
- <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">AllRules.ruleset</CodeAnalysisRuleSet>
- <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">AllRules.ruleset</CodeAnalysisRuleSet>
- <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" />
- <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" />
- <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" />
- <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" />
- <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">AllRules.ruleset</CodeAnalysisRuleSet>
- <CodeAnalysisRuleSet Condition="'$(Configuration)|$(Platform)'=='Release|x64'">AllRules.ruleset</CodeAnalysisRuleSet>
- <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" />
- <CodeAnalysisRules Condition="'$(Configuration)|$(Platform)'=='Release|x64'" />
- <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" />
- <CodeAnalysisRuleAssemblies Condition="'$(Configuration)|$(Platform)'=='Release|x64'" />
- <TargetName Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">welsvp</TargetName>
- <TargetName Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">welsvp</TargetName>
- <TargetName Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">welsvp</TargetName>
- <TargetName Condition="'$(Configuration)|$(Platform)'=='Release|x64'">welsvp</TargetName>
- </PropertyGroup>
- <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
- <CustomBuildStep>
- <Command>
- </Command>
- </CustomBuildStep>
- <ClCompile>
- <Optimization>Disabled</Optimization>
- <AdditionalIncludeDirectories>%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
- <PreprocessorDefinitions>WIN32;_DEBUG;_WINDOWS;_USRDLL;WELSVP_EXPORTS;X86_ASM;%(PreprocessorDefinitions)</PreprocessorDefinitions>
- <MinimalRebuild>true</MinimalRebuild>
- <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
- <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
- <PrecompiledHeader>
- </PrecompiledHeader>
- <AssemblerListingLocation>
- </AssemblerListingLocation>
- <WarningLevel>Level3</WarningLevel>
- <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
- </ClCompile>
- <ProjectReference>
- <LinkLibraryDependencies>true</LinkLibraryDependencies>
- </ProjectReference>
- <Link>
- <OutputFile>$(OutDir)welsvp.dll</OutputFile>
- <ModuleDefinitionFile>../../src/common/WelsVP.def</ModuleDefinitionFile>
- <GenerateDebugInformation>true</GenerateDebugInformation>
- <GenerateMapFile>true</GenerateMapFile>
- <MapFileName>$(OutDir)\welsvp.map</MapFileName>
- <SubSystem>Windows</SubSystem>
- <TargetMachine>MachineX86</TargetMachine>
- <ProgramDatabaseFile>$(OutDir)\welsvp.pdb</ProgramDatabaseFile>
- <ImportLibrary>$(OutDir)\welsvp.lib</ImportLibrary>
- <ProfileGuidedDatabase>$(OutDir)\welsvp.pgd</ProfileGuidedDatabase>
- </Link>
- <PostBuildEvent>
- <Command>
- </Command>
- </PostBuildEvent>
- <Bscmake>
- <OutputFile>$(OutDir)\welsvp.bsc</OutputFile>
- </Bscmake>
- </ItemDefinitionGroup>
- <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
- <CustomBuildStep>
- <Command>
- </Command>
- </CustomBuildStep>
- <ClCompile>
- <Optimization>Disabled</Optimization>
- <AdditionalIncludeDirectories>%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
- <PreprocessorDefinitions>WIN64;_DEBUG;_WINDOWS;_USRDLL;WELSVP_EXPORTS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
- <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
- <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
- <PrecompiledHeader>
- </PrecompiledHeader>
- <AssemblerListingLocation>
- </AssemblerListingLocation>
- <WarningLevel>Level3</WarningLevel>
- <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
- </ClCompile>
- <ProjectReference>
- <LinkLibraryDependencies>true</LinkLibraryDependencies>
- </ProjectReference>
- <Link>
- <OutputFile>$(OutDir)welsvp.dll</OutputFile>
- <ModuleDefinitionFile>../../src/common/WelsVP.def</ModuleDefinitionFile>
- <GenerateDebugInformation>true</GenerateDebugInformation>
- <GenerateMapFile>true</GenerateMapFile>
- <MapFileName>$(OutDir)\welsvp.map</MapFileName>
- <SubSystem>Windows</SubSystem>
- <ProgramDatabaseFile>$(OutDir)\welsvp.pdb</ProgramDatabaseFile>
- <ImportLibrary>$(OutDir)\welsvp.lib</ImportLibrary>
- <ProfileGuidedDatabase>$(OutDir)\welsvp.pgd</ProfileGuidedDatabase>
- </Link>
- <PostBuildEvent>
- <Command>
- </Command>
- </PostBuildEvent>
- <Bscmake>
- <OutputFile>$(OutDir)\welsvp.bsc</OutputFile>
- </Bscmake>
- </ItemDefinitionGroup>
- <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
- <PreBuildEvent>
- <Command>
- </Command>
- </PreBuildEvent>
- <CustomBuildStep>
- <Command>
- </Command>
- </CustomBuildStep>
- <ClCompile>
- <Optimization>Full</Optimization>
- <IntrinsicFunctions>false</IntrinsicFunctions>
- <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
- <PreprocessorDefinitions>WIN32;NDEBUG;_WINDOWS;_USRDLL;WELSVP_EXPORTS;X86_ASM;%(PreprocessorDefinitions)</PreprocessorDefinitions>
- <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
- <FunctionLevelLinking>false</FunctionLevelLinking>
- <PrecompiledHeader>
- </PrecompiledHeader>
- <WarningLevel>Level3</WarningLevel>
- <DebugInformationFormat>
- </DebugInformationFormat>
- </ClCompile>
- <Link>
- <OutputFile>$(OutDir)welsvp.dll</OutputFile>
- <EnableUAC>false</EnableUAC>
- <ModuleDefinitionFile>../../src/common/WelsVP.def</ModuleDefinitionFile>
- <GenerateDebugInformation>false</GenerateDebugInformation>
- <GenerateMapFile>true</GenerateMapFile>
- <MapFileName>$(OutDir)\welsvp.map</MapFileName>
- <MapExports>true</MapExports>
- <SubSystem>Windows</SubSystem>
- <OptimizeReferences>true</OptimizeReferences>
- <EnableCOMDATFolding>true</EnableCOMDATFolding>
- <TargetMachine>MachineX86</TargetMachine>
- <ImportLibrary>$(OutDir)\welsvp.lib</ImportLibrary>
- <ProgramDatabaseFile>$(OutDir)\welsvp.pdb</ProgramDatabaseFile>
- <ProfileGuidedDatabase>$(OutDir)\welsvp.pgd</ProfileGuidedDatabase>
- </Link>
- <PostBuildEvent>
- <Command>
- </Command>
- </PostBuildEvent>
- <Bscmake>
- <OutputFile>$(OutDir)\welsvp.bsc</OutputFile>
- </Bscmake>
- </ItemDefinitionGroup>
- <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
- <PreBuildEvent>
- <Command>
- </Command>
- </PreBuildEvent>
- <CustomBuildStep>
- <Command>
- </Command>
- </CustomBuildStep>
- <ClCompile>
- <Optimization>Full</Optimization>
- <IntrinsicFunctions>false</IntrinsicFunctions>
- <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
- <PreprocessorDefinitions>WIN64;NDEBUG;_WINDOWS;_USRDLL;WELSVP_EXPORTS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
- <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
- <FunctionLevelLinking>false</FunctionLevelLinking>
- <PrecompiledHeader>
- </PrecompiledHeader>
- <WarningLevel>Level3</WarningLevel>
- <DebugInformationFormat>
- </DebugInformationFormat>
- </ClCompile>
- <Link>
- <OutputFile>$(OutDir)welsvp.dll</OutputFile>
- <EnableUAC>false</EnableUAC>
- <ModuleDefinitionFile>../../src/common/WelsVP.def</ModuleDefinitionFile>
- <GenerateDebugInformation>false</GenerateDebugInformation>
- <GenerateMapFile>true</GenerateMapFile>
- <MapFileName>$(OutDir)\welsvp.map</MapFileName>
- <MapExports>true</MapExports>
- <SubSystem>Windows</SubSystem>
- <OptimizeReferences>true</OptimizeReferences>
- <EnableCOMDATFolding>true</EnableCOMDATFolding>
- <ImportLibrary>$(OutDir)\welsvp.lib</ImportLibrary>
- <ProgramDatabaseFile>$(OutDir)\welsvp.pdb</ProgramDatabaseFile>
- <ProfileGuidedDatabase>$(OutDir)\welsvp.pgd</ProfileGuidedDatabase>
- </Link>
- <PostBuildEvent>
- <Command>
- </Command>
- </PostBuildEvent>
- <Bscmake>
- <OutputFile>$(OutDir)\welsvp.bsc</OutputFile>
- </Bscmake>
- </ItemDefinitionGroup>
- <ItemGroup>
- <ClCompile Include="..\..\src\common\cpu.cpp" />
- <ClCompile Include="..\..\src\common\memory.cpp" />
- <ClCompile Include="..\..\src\common\thread.cpp" />
- <ClCompile Include="..\..\src\common\util.cpp" />
- <ClCompile Include="..\..\src\common\WelsFrameWork.cpp" />
- <ClCompile Include="..\..\src\common\WelsFrameWorkEx.cpp" />
- <ClCompile Include="..\..\src\scenechangedetection\SceneChangeDetection.cpp" />
- <ClCompile Include="..\..\src\scenechangedetection\SceneChangeDetectionCommon.cpp" />
- <ClCompile Include="..\..\src\denoise\denoise.cpp" />
- <ClCompile Include="..\..\src\denoise\denoise_filter.cpp" />
- <ClCompile Include="..\..\src\vaacalc\vaacalcfuncs.cpp" />
- <ClCompile Include="..\..\src\vaacalc\vaacalculation.cpp" />
- <ClCompile Include="..\..\src\backgounddetection\BackgroundDetection.cpp" />
- <ClCompile Include="..\..\src\adaptivequantization\AdaptiveQuantization.cpp" />
- <ClCompile Include="..\..\src\downsample\downsample.cpp" />
- <ClCompile Include="..\..\src\downsample\downsamplefuncs.cpp" />
- <ClCompile Include="..\..\src\complexityanalysis\ComplexityAnalysis.cpp" />
- <ClCompile Include="..\..\src\imagerotate\imagerotate.cpp" />
- <ClCompile Include="..\..\src\imagerotate\imagerotatefuncs.cpp" />
- </ItemGroup>
- <ItemGroup>
- <ClCompile Include="..\..\interface\IWelsVP.h" />
- <ClInclude Include="..\..\src\common\resource.h" />
- <ClInclude Include="..\..\src\common\cpu.h" />
- <ClInclude Include="..\..\src\common\memory.h" />
- <ClInclude Include="..\..\src\common\thread.h" />
- <ClInclude Include="..\..\src\common\typedef.h" />
- <ClInclude Include="..\..\src\common\util.h" />
- <ClInclude Include="..\..\src\common\version.h" />
- <ClInclude Include="..\..\src\common\WelsFrameWork.h" />
- <ClInclude Include="..\..\src\scenechangedetection\SceneChangeDetection.h" />
- <ClInclude Include="..\..\src\scenechangedetection\SceneChangeDetectionCommon.h" />
- <ClInclude Include="..\..\src\denoise\denoise.h" />
- <ClInclude Include="..\..\src\vaacalc\vaacalculation.h" />
- <ClInclude Include="..\..\src\backgounddetection\BackgroundDetection.h" />
- <ClInclude Include="..\..\src\adaptivequantization\AdaptiveQuantization.h" />
- <ClInclude Include="..\..\src\downsample\downsample.h" />
- <ClInclude Include="..\..\src\complexityanalysis\ComplexityAnalysis.h" />
- <ClInclude Include="..\..\src\imagerotate\imagerotate.h" />
- </ItemGroup>
- <ItemGroup>
- <None Include="..\..\src\common\WelsVP.def" />
- </ItemGroup>
- <ItemGroup>
- <ResourceCompile Include="..\..\src\common\WelsVP.rc" />
- </ItemGroup>
- <ItemGroup>
- <CustomBuild Include="..\..\src\asm\asm_inc.asm">
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
- </CustomBuild>
- <CustomBuild Include="..\..\src\asm\cpuid.asm">
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
- </CustomBuild>
- <CustomBuild Include="..\..\src\asm\denoisefilter.asm">
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
- </CustomBuild>
- <CustomBuild Include="..\..\src\asm\downsample_bilinear.asm">
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
- </CustomBuild>
- <CustomBuild Include="..\..\src\asm\intra_pred.asm">
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
- </CustomBuild>
- <CustomBuild Include="..\..\src\asm\sad.asm">
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
- </CustomBuild>
- <CustomBuild Include="..\..\src\asm\vaa.asm">
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
- </CustomBuild>
- </ItemGroup>
- <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
- <ImportGroup Label="ExtensionTargets">
- <Import Project="$(VCTargetsPath)\BuildCustomizations\masm.targets" />
- </ImportGroup>
-</Project>
\ No newline at end of file
--- a/processing/build/win32/WelsVP_2010.vcxproj.filters
+++ /dev/null
@@ -1,165 +1,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
- <ItemGroup>
- <ClCompile Include="..\..\interface\IWelsVP.h">
- <Filter>headers</Filter>
- </ClCompile>
- <ClCompile Include="..\..\src\adaptivequantization\AdaptiveQuantization.cpp">
- <Filter>sources</Filter>
- </ClCompile>
- <ClCompile Include="..\..\src\common\util.cpp">
- <Filter>sources</Filter>
- </ClCompile>
- <ClCompile Include="..\..\src\backgounddetection\BackgroundDetection.cpp">
- <Filter>sources</Filter>
- </ClCompile>
- <ClCompile Include="..\..\src\complexityanalysis\ComplexityAnalysis.cpp">
- <Filter>sources</Filter>
- </ClCompile>
- <ClCompile Include="..\..\src\common\cpu.cpp">
- <Filter>sources</Filter>
- </ClCompile>
- <ClCompile Include="..\..\src\denoise\denoise.cpp">
- <Filter>sources</Filter>
- </ClCompile>
- <ClCompile Include="..\..\src\denoise\denoise_filter.cpp">
- <Filter>sources</Filter>
- </ClCompile>
- <ClCompile Include="..\..\src\downsample\downsample.cpp">
- <Filter>sources</Filter>
- </ClCompile>
- <ClCompile Include="..\..\src\downsample\downsamplefuncs.cpp">
- <Filter>sources</Filter>
- </ClCompile>
- <ClCompile Include="..\..\src\imagerotate\imagerotate.cpp">
- <Filter>sources</Filter>
- </ClCompile>
- <ClCompile Include="..\..\src\imagerotate\imagerotatefuncs.cpp">
- <Filter>sources</Filter>
- </ClCompile>
- <ClCompile Include="..\..\src\common\memory.cpp">
- <Filter>sources</Filter>
- </ClCompile>
- <ClCompile Include="..\..\src\scenechangedetection\SceneChangeDetection.cpp">
- <Filter>sources</Filter>
- </ClCompile>
- <ClCompile Include="..\..\src\scenechangedetection\SceneChangeDetectionCommon.cpp">
- <Filter>sources</Filter>
- </ClCompile>
- <ClCompile Include="..\..\src\common\thread.cpp">
- <Filter>sources</Filter>
- </ClCompile>
- <ClCompile Include="..\..\src\common\WelsFrameWorkEx.cpp">
- <Filter>sources</Filter>
- </ClCompile>
- <ClCompile Include="..\..\src\vaacalc\vaacalcfuncs.cpp">
- <Filter>sources</Filter>
- </ClCompile>
- <ClCompile Include="..\..\src\vaacalc\vaacalculation.cpp">
- <Filter>sources</Filter>
- </ClCompile>
- <ClCompile Include="..\..\src\common\WelsFrameWork.cpp">
- <Filter>sources</Filter>
- </ClCompile>
- </ItemGroup>
- <ItemGroup>
- <ClInclude Include="..\..\src\adaptivequantization\AdaptiveQuantization.h">
- <Filter>headers</Filter>
- </ClInclude>
- <ClInclude Include="..\..\src\backgounddetection\BackgroundDetection.h">
- <Filter>headers</Filter>
- </ClInclude>
- <ClInclude Include="..\..\src\complexityanalysis\ComplexityAnalysis.h">
- <Filter>headers</Filter>
- </ClInclude>
- <ClInclude Include="..\..\src\common\cpu.h">
- <Filter>headers</Filter>
- </ClInclude>
- <ClInclude Include="..\..\src\denoise\denoise.h">
- <Filter>headers</Filter>
- </ClInclude>
- <ClInclude Include="..\..\src\downsample\downsample.h">
- <Filter>headers</Filter>
- </ClInclude>
- <ClInclude Include="..\..\src\imagerotate\imagerotate.h">
- <Filter>headers</Filter>
- </ClInclude>
- <ClInclude Include="..\..\src\common\memory.h">
- <Filter>headers</Filter>
- </ClInclude>
- <ClInclude Include="..\..\src\common\resource.h">
- <Filter>headers</Filter>
- </ClInclude>
- <ClInclude Include="..\..\src\scenechangedetection\SceneChangeDetection.h">
- <Filter>headers</Filter>
- </ClInclude>
- <ClInclude Include="..\..\src\scenechangedetection\SceneChangeDetectionCommon.h">
- <Filter>headers</Filter>
- </ClInclude>
- <ClInclude Include="..\..\src\common\thread.h">
- <Filter>headers</Filter>
- </ClInclude>
- <ClInclude Include="..\..\src\common\typedef.h">
- <Filter>headers</Filter>
- </ClInclude>
- <ClInclude Include="..\..\src\common\util.h">
- <Filter>headers</Filter>
- </ClInclude>
- <ClInclude Include="..\..\src\vaacalc\vaacalculation.h">
- <Filter>headers</Filter>
- </ClInclude>
- <ClInclude Include="..\..\src\common\version.h">
- <Filter>headers</Filter>
- </ClInclude>
- <ClInclude Include="..\..\src\common\WelsFrameWork.h">
- <Filter>headers</Filter>
- </ClInclude>
- </ItemGroup>
- <ItemGroup>
- <CustomBuild Include="..\..\src\asm\asm_inc.asm">
- <Filter>ASM</Filter>
- </CustomBuild>
- <CustomBuild Include="..\..\src\asm\cpuid.asm">
- <Filter>ASM</Filter>
- </CustomBuild>
- <CustomBuild Include="..\..\src\asm\denoisefilter.asm">
- <Filter>ASM</Filter>
- </CustomBuild>
- <CustomBuild Include="..\..\src\asm\downsample_bilinear.asm">
- <Filter>ASM</Filter>
- </CustomBuild>
- <CustomBuild Include="..\..\src\asm\intra_pred.asm">
- <Filter>ASM</Filter>
- </CustomBuild>
- <CustomBuild Include="..\..\src\asm\sad.asm">
- <Filter>ASM</Filter>
- </CustomBuild>
- <CustomBuild Include="..\..\src\asm\vaa.asm">
- <Filter>ASM</Filter>
- </CustomBuild>
- </ItemGroup>
- <ItemGroup>
- <Filter Include="ASM">
- <UniqueIdentifier>{ecef07b7-65e1-45c4-9afc-39f7b07992a2}</UniqueIdentifier>
- </Filter>
- <Filter Include="headers">
- <UniqueIdentifier>{be24742a-75fa-49a4-b77e-a69d626d46c8}</UniqueIdentifier>
- </Filter>
- <Filter Include="sources">
- <UniqueIdentifier>{9f4c2bd3-e8d2-4276-adc6-273c0031971a}</UniqueIdentifier>
- </Filter>
- <Filter Include="resources">
- <UniqueIdentifier>{322f1cbe-435f-402b-8d86-71d023d5d407}</UniqueIdentifier>
- </Filter>
- </ItemGroup>
- <ItemGroup>
- <None Include="..\..\src\common\WelsVP.def">
- <Filter>resources</Filter>
- </None>
- </ItemGroup>
- <ItemGroup>
- <ResourceCompile Include="..\..\src\common\WelsVP.rc">
- <Filter>resources</Filter>
- </ResourceCompile>
- </ItemGroup>
-</Project>
\ No newline at end of file
--- a/processing/build/win32/WelsVP_2012.sln
+++ /dev/null
@@ -1,20 +1,0 @@
-
-Microsoft Visual Studio Solution File, Format Version 12.00
-# Visual Studio 2012
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "WelsVP_2012", "WelsVP_2012.vcxproj", "{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}"
-EndProject
-Global
- GlobalSection(SolutionConfigurationPlatforms) = preSolution
- Debug|Win32 = Debug|Win32
- Release|Win32 = Release|Win32
- EndGlobalSection
- GlobalSection(ProjectConfigurationPlatforms) = postSolution
- {E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Debug|Win32.ActiveCfg = Debug|Win32
- {E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Debug|Win32.Build.0 = Debug|Win32
- {E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Release|Win32.ActiveCfg = Release|Win32
- {E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Release|Win32.Build.0 = Release|Win32
- EndGlobalSection
- GlobalSection(SolutionProperties) = preSolution
- HideSolutionNode = FALSE
- EndGlobalSection
-EndGlobal
binary files a/processing/build/win32/WelsVP_2012.v11.suo /dev/null differ
--- a/processing/build/win32/WelsVP_2012.vcxproj
+++ /dev/null
@@ -1,427 +1,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
- <ItemGroup Label="ProjectConfigurations">
- <ProjectConfiguration Include="Debug|Win32">
- <Configuration>Debug</Configuration>
- <Platform>Win32</Platform>
- </ProjectConfiguration>
- <ProjectConfiguration Include="Debug|x64">
- <Configuration>Debug</Configuration>
- <Platform>x64</Platform>
- </ProjectConfiguration>
- <ProjectConfiguration Include="Release|Win32">
- <Configuration>Release</Configuration>
- <Platform>Win32</Platform>
- </ProjectConfiguration>
- <ProjectConfiguration Include="Release|x64">
- <Configuration>Release</Configuration>
- <Platform>x64</Platform>
- </ProjectConfiguration>
- </ItemGroup>
- <PropertyGroup Label="Globals">
- <ProjectGuid>{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}</ProjectGuid>
- <RootNamespace>WelsVP</RootNamespace>
- <Keyword>Win32Proj</Keyword>
- </PropertyGroup>
- <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
- <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
- <ConfigurationType>DynamicLibrary</ConfigurationType>
- <PlatformToolset>v110</PlatformToolset>
- <CharacterSet>Unicode</CharacterSet>
- <WholeProgramOptimization>true</WholeProgramOptimization>
- </PropertyGroup>
- <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
- <ConfigurationType>DynamicLibrary</ConfigurationType>
- <PlatformToolset>v110</PlatformToolset>
- <CharacterSet>Unicode</CharacterSet>
- <WholeProgramOptimization>true</WholeProgramOptimization>
- </PropertyGroup>
- <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
- <ConfigurationType>DynamicLibrary</ConfigurationType>
- <PlatformToolset>v110</PlatformToolset>
- <CharacterSet>Unicode</CharacterSet>
- <WholeProgramOptimization>false</WholeProgramOptimization>
- </PropertyGroup>
- <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
- <ConfigurationType>DynamicLibrary</ConfigurationType>
- <PlatformToolset>v110</PlatformToolset>
- <CharacterSet>Unicode</CharacterSet>
- <WholeProgramOptimization>false</WholeProgramOptimization>
- </PropertyGroup>
- <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
- <ImportGroup Label="ExtensionSettings">
- <Import Project="$(VCTargetsPath)\BuildCustomizations\masm.props" />
- </ImportGroup>
- <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="PropertySheets">
- <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
- </ImportGroup>
- <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
- <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
- </ImportGroup>
- <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="PropertySheets">
- <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
- </ImportGroup>
- <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
- <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
- </ImportGroup>
- <PropertyGroup Label="UserMacros" />
- <PropertyGroup>
- <_ProjectFileVersion>11.0.61030.0</_ProjectFileVersion>
- </PropertyGroup>
- <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
- <OutDir>.\..\..\..\bin\win32\Debug\</OutDir>
- <IntDir>.\..\..\..\obj\vp\Debug\</IntDir>
- <LinkIncremental>true</LinkIncremental>
- <TargetName>welsvp</TargetName>
- </PropertyGroup>
- <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
- <LinkIncremental>true</LinkIncremental>
- <TargetName>welsvp</TargetName>
- <OutDir>.\..\..\..\bin\win64\Debug\</OutDir>
- <IntDir>.\..\..\..\obj\vp\Debug\</IntDir>
- </PropertyGroup>
- <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
- <OutDir>.\..\..\..\bin\win32\Release\</OutDir>
- <IntDir>.\..\..\..\obj\vp\Release\</IntDir>
- <LinkIncremental>false</LinkIncremental>
- <GenerateManifest>false</GenerateManifest>
- <TargetName>welsvp</TargetName>
- </PropertyGroup>
- <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
- <LinkIncremental>false</LinkIncremental>
- <GenerateManifest>false</GenerateManifest>
- <TargetName>welsvp</TargetName>
- <OutDir>.\..\..\..\bin\win64\Release\</OutDir>
- <IntDir>.\..\..\..\obj\vp\Release\</IntDir>
- </PropertyGroup>
- <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
- <CustomBuildStep>
- <Command />
- </CustomBuildStep>
- <ClCompile>
- <Optimization>Disabled</Optimization>
- <PreprocessorDefinitions>WIN32;_DEBUG;_WINDOWS;_USRDLL;WELSVP_EXPORTS;X86_ASM;%(PreprocessorDefinitions)</PreprocessorDefinitions>
- <MinimalRebuild>true</MinimalRebuild>
- <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
- <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
- <PrecompiledHeader />
- <AssemblerListingLocation />
- <WarningLevel>Level3</WarningLevel>
- <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
- </ClCompile>
- <ProjectReference>
- <LinkLibraryDependencies>true</LinkLibraryDependencies>
- </ProjectReference>
- <Link>
- <OutputFile>$(OutDir)\welsvp.dll</OutputFile>
- <ModuleDefinitionFile>../../src/common/WelsVP.def</ModuleDefinitionFile>
- <GenerateDebugInformation>true</GenerateDebugInformation>
- <GenerateMapFile>true</GenerateMapFile>
- <MapFileName>$(OutDir)\welsvp.map</MapFileName>
- <SubSystem>Windows</SubSystem>
- <TargetMachine>MachineX86</TargetMachine>
- <ProgramDatabaseFile>$(OutDir)\welsvp.pdb</ProgramDatabaseFile>
- <ImportLibrary>$(OutDir)\welsvp.lib</ImportLibrary>
- <ProfileGuidedDatabase>$(OutDir)\welsvp.pgd</ProfileGuidedDatabase>
- </Link>
- <PostBuildEvent>
- <Command>
- </Command>
- </PostBuildEvent>
- <Bscmake>
- <OutputFile>$(OutDir)\welsvp.bsc</OutputFile>
- </Bscmake>
- </ItemDefinitionGroup>
- <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
- <CustomBuildStep>
- <Command>
- </Command>
- </CustomBuildStep>
- <ClCompile>
- <Optimization>Disabled</Optimization>
- <PreprocessorDefinitions>WIN64;_DEBUG;_WINDOWS;_USRDLL;WELSVP_EXPORTS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
- <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
- <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
- <PrecompiledHeader>
- </PrecompiledHeader>
- <AssemblerListingLocation>
- </AssemblerListingLocation>
- <WarningLevel>Level3</WarningLevel>
- <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
- </ClCompile>
- <ProjectReference>
- <LinkLibraryDependencies>true</LinkLibraryDependencies>
- </ProjectReference>
- <Link>
- <OutputFile>$(OutDir)\welsvp.dll</OutputFile>
- <ModuleDefinitionFile>../../src/common/WelsVP.def</ModuleDefinitionFile>
- <GenerateDebugInformation>true</GenerateDebugInformation>
- <GenerateMapFile>true</GenerateMapFile>
- <MapFileName>$(OutDir)\welsvp.map</MapFileName>
- <SubSystem>Windows</SubSystem>
- <ProgramDatabaseFile>$(OutDir)\welsvp.pdb</ProgramDatabaseFile>
- <ImportLibrary>$(OutDir)\welsvp.lib</ImportLibrary>
- <ProfileGuidedDatabase>$(OutDir)\welsvp.pgd</ProfileGuidedDatabase>
- </Link>
- <PostBuildEvent>
- <Command>
- </Command>
- </PostBuildEvent>
- <Bscmake>
- <OutputFile>$(OutDir)\welsvp.bsc</OutputFile>
- </Bscmake>
- </ItemDefinitionGroup>
- <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
- <PreBuildEvent>
- <Command>
- </Command>
- </PreBuildEvent>
- <CustomBuildStep>
- <Command />
- </CustomBuildStep>
- <ClCompile>
- <Optimization>Full</Optimization>
- <IntrinsicFunctions>false</IntrinsicFunctions>
- <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
- <PreprocessorDefinitions>WIN32;NDEBUG;_WINDOWS;_USRDLL;WELSVP_EXPORTS;X86_ASM;%(PreprocessorDefinitions)</PreprocessorDefinitions>
- <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
- <FunctionLevelLinking>false</FunctionLevelLinking>
- <PrecompiledHeader />
- <WarningLevel>Level3</WarningLevel>
- <DebugInformationFormat />
- </ClCompile>
- <Link>
- <OutputFile>$(OutDir)\welsvp.dll</OutputFile>
- <EnableUAC>false</EnableUAC>
- <ModuleDefinitionFile>../../src/common/WelsVP.def</ModuleDefinitionFile>
- <GenerateDebugInformation>false</GenerateDebugInformation>
- <GenerateMapFile>true</GenerateMapFile>
- <MapFileName>$(OutDir)\welsvp.map</MapFileName>
- <MapExports>true</MapExports>
- <SubSystem>Windows</SubSystem>
- <OptimizeReferences>true</OptimizeReferences>
- <EnableCOMDATFolding>true</EnableCOMDATFolding>
- <TargetMachine>MachineX86</TargetMachine>
- <ProgramDatabaseFile>$(OutDir)\welsvp.pdb</ProgramDatabaseFile>
- <ImportLibrary>$(OutDir)\welsvp.lib</ImportLibrary>
- <ProfileGuidedDatabase>$(OutDir)\welsvp.pgd</ProfileGuidedDatabase>
- </Link>
- <PostBuildEvent>
- <Command>
- </Command>
- </PostBuildEvent>
- <Bscmake>
- <OutputFile>$(OutDir)\welsvp.bsc</OutputFile>
- </Bscmake>
- </ItemDefinitionGroup>
- <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
- <PreBuildEvent>
- <Command>
- </Command>
- </PreBuildEvent>
- <CustomBuildStep>
- <Command>
- </Command>
- </CustomBuildStep>
- <ClCompile>
- <Optimization>Full</Optimization>
- <IntrinsicFunctions>false</IntrinsicFunctions>
- <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
- <PreprocessorDefinitions>WIN64;NDEBUG;_WINDOWS;_USRDLL;WELSVP_EXPORTS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
- <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
- <FunctionLevelLinking>false</FunctionLevelLinking>
- <PrecompiledHeader>
- </PrecompiledHeader>
- <WarningLevel>Level3</WarningLevel>
- <DebugInformationFormat>
- </DebugInformationFormat>
- </ClCompile>
- <Link>
- <OutputFile>$(OutDir)\welsvp.dll</OutputFile>
- <EnableUAC>false</EnableUAC>
- <ModuleDefinitionFile>../../src/common/WelsVP.def</ModuleDefinitionFile>
- <GenerateDebugInformation>false</GenerateDebugInformation>
- <GenerateMapFile>true</GenerateMapFile>
- <MapFileName>$(OutDir)\welsvp.map</MapFileName>
- <MapExports>true</MapExports>
- <SubSystem>Windows</SubSystem>
- <OptimizeReferences>true</OptimizeReferences>
- <EnableCOMDATFolding>true</EnableCOMDATFolding>
- <ProgramDatabaseFile>$(OutDir)\welsvp.pdb</ProgramDatabaseFile>
- <ImportLibrary>$(OutDir)\welsvp.lib</ImportLibrary>
- <ProfileGuidedDatabase>$(OutDir)\welsvp.pgd</ProfileGuidedDatabase>
- </Link>
- <PostBuildEvent>
- <Command>
- </Command>
- </PostBuildEvent>
- <Bscmake>
- <OutputFile>$(OutDir)\welsvp.bsc</OutputFile>
- </Bscmake>
- </ItemDefinitionGroup>
- <ItemGroup>
- <ClCompile Include="..\..\src\common\cpu.cpp" />
- <ClCompile Include="..\..\src\common\memory.cpp" />
- <ClCompile Include="..\..\src\common\thread.cpp" />
- <ClCompile Include="..\..\src\common\util.cpp" />
- <ClCompile Include="..\..\src\common\WelsFrameWork.cpp" />
- <ClCompile Include="..\..\src\common\WelsFrameWorkEx.cpp" />
- <ClCompile Include="..\..\src\scenechangedetection\SceneChangeDetection.cpp" />
- <ClCompile Include="..\..\src\scenechangedetection\SceneChangeDetectionCommon.cpp" />
- <ClCompile Include="..\..\src\denoise\denoise.cpp" />
- <ClCompile Include="..\..\src\denoise\denoise_filter.cpp" />
- <ClCompile Include="..\..\src\vaacalc\vaacalcfuncs.cpp" />
- <ClCompile Include="..\..\src\vaacalc\vaacalculation.cpp" />
- <ClCompile Include="..\..\src\backgounddetection\BackgroundDetection.cpp" />
- <ClCompile Include="..\..\src\adaptivequantization\AdaptiveQuantization.cpp" />
- <ClCompile Include="..\..\src\downsample\downsample.cpp" />
- <ClCompile Include="..\..\src\downsample\downsamplefuncs.cpp" />
- <ClCompile Include="..\..\src\complexityanalysis\ComplexityAnalysis.cpp" />
- <ClCompile Include="..\..\src\imagerotate\imagerotate.cpp" />
- <ClCompile Include="..\..\src\imagerotate\imagerotatefuncs.cpp" />
- </ItemGroup>
- <ItemGroup>
- <ClCompile Include="..\..\interface\IWelsVP.h" />
- <ClInclude Include="..\..\src\common\resource.h" />
- <ClInclude Include="..\..\src\common\cpu.h" />
- <ClInclude Include="..\..\src\common\memory.h" />
- <ClInclude Include="..\..\src\common\thread.h" />
- <ClInclude Include="..\..\src\common\typedef.h" />
- <ClInclude Include="..\..\src\common\util.h" />
- <ClInclude Include="..\..\src\common\version.h" />
- <ClInclude Include="..\..\src\common\WelsFrameWork.h" />
- <ClInclude Include="..\..\src\scenechangedetection\SceneChangeDetection.h" />
- <ClInclude Include="..\..\src\scenechangedetection\SceneChangeDetectionCommon.h" />
- <ClInclude Include="..\..\src\denoise\denoise.h" />
- <ClInclude Include="..\..\src\vaacalc\vaacalculation.h" />
- <ClInclude Include="..\..\src\backgounddetection\BackgroundDetection.h" />
- <ClInclude Include="..\..\src\adaptivequantization\AdaptiveQuantization.h" />
- <ClInclude Include="..\..\src\downsample\downsample.h" />
- <ClInclude Include="..\..\src\complexityanalysis\ComplexityAnalysis.h" />
- <ClInclude Include="..\..\src\imagerotate\imagerotate.h" />
- </ItemGroup>
- <ItemGroup>
- <None Include="..\..\src\common\WelsVP.def" />
- </ItemGroup>
- <ItemGroup>
- <ResourceCompile Include="..\..\src\common\WelsVP.rc" />
- </ItemGroup>
- <ItemGroup>
- <CustomBuild Include="..\..\src\asm\asm_inc.asm">
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
- </CustomBuild>
- <CustomBuild Include="..\..\src\asm\cpuid.asm">
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
- </CustomBuild>
- <CustomBuild Include="..\..\src\asm\denoisefilter.asm">
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
- </CustomBuild>
- <CustomBuild Include="..\..\src\asm\downsample_bilinear.asm">
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
- </CustomBuild>
- <CustomBuild Include="..\..\src\asm\intra_pred.asm">
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
- </CustomBuild>
- <CustomBuild Include="..\..\src\asm\sad.asm">
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
- </CustomBuild>
- <CustomBuild Include="..\..\src\asm\vaa.asm">
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">nasm -I%(RootDir)%(Directory) -f win32 -DPREFIX -o $(IntDir)%(Filename).obj %(FullPath)
-</Command>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj;%(Outputs)</Outputs>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
- <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
- </CustomBuild>
- </ItemGroup>
- <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
- <ImportGroup Label="ExtensionTargets">
- <Import Project="$(VCTargetsPath)\BuildCustomizations\masm.targets" />
- </ImportGroup>
-</Project>
\ No newline at end of file
--- a/processing/build/win32/WelsVP_2012.vcxproj.filters
+++ /dev/null
@@ -1,165 +1,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
- <ItemGroup>
- <ClCompile Include="..\..\interface\IWelsVP.h">
- <Filter>headers</Filter>
- </ClCompile>
- <ClCompile Include="..\..\src\adaptivequantization\AdaptiveQuantization.cpp">
- <Filter>sources</Filter>
- </ClCompile>
- <ClCompile Include="..\..\src\backgounddetection\BackgroundDetection.cpp">
- <Filter>sources</Filter>
- </ClCompile>
- <ClCompile Include="..\..\src\complexityanalysis\ComplexityAnalysis.cpp">
- <Filter>sources</Filter>
- </ClCompile>
- <ClCompile Include="..\..\src\common\cpu.cpp">
- <Filter>sources</Filter>
- </ClCompile>
- <ClCompile Include="..\..\src\denoise\denoise.cpp">
- <Filter>sources</Filter>
- </ClCompile>
- <ClCompile Include="..\..\src\denoise\denoise_filter.cpp">
- <Filter>sources</Filter>
- </ClCompile>
- <ClCompile Include="..\..\src\downsample\downsample.cpp">
- <Filter>sources</Filter>
- </ClCompile>
- <ClCompile Include="..\..\src\downsample\downsamplefuncs.cpp">
- <Filter>sources</Filter>
- </ClCompile>
- <ClCompile Include="..\..\src\imagerotate\imagerotate.cpp">
- <Filter>sources</Filter>
- </ClCompile>
- <ClCompile Include="..\..\src\imagerotate\imagerotatefuncs.cpp">
- <Filter>sources</Filter>
- </ClCompile>
- <ClCompile Include="..\..\src\common\memory.cpp">
- <Filter>sources</Filter>
- </ClCompile>
- <ClCompile Include="..\..\src\scenechangedetection\SceneChangeDetection.cpp">
- <Filter>sources</Filter>
- </ClCompile>
- <ClCompile Include="..\..\src\scenechangedetection\SceneChangeDetectionCommon.cpp">
- <Filter>sources</Filter>
- </ClCompile>
- <ClCompile Include="..\..\src\common\thread.cpp">
- <Filter>sources</Filter>
- </ClCompile>
- <ClCompile Include="..\..\src\common\util.cpp">
- <Filter>sources</Filter>
- </ClCompile>
- <ClCompile Include="..\..\src\vaacalc\vaacalcfuncs.cpp">
- <Filter>sources</Filter>
- </ClCompile>
- <ClCompile Include="..\..\src\vaacalc\vaacalculation.cpp">
- <Filter>sources</Filter>
- </ClCompile>
- <ClCompile Include="..\..\src\common\WelsFrameWork.cpp">
- <Filter>sources</Filter>
- </ClCompile>
- <ClCompile Include="..\..\src\common\WelsFrameWorkEx.cpp">
- <Filter>sources</Filter>
- </ClCompile>
- </ItemGroup>
- <ItemGroup>
- <ClInclude Include="..\..\src\adaptivequantization\AdaptiveQuantization.h">
- <Filter>headers</Filter>
- </ClInclude>
- <ClInclude Include="..\..\src\backgounddetection\BackgroundDetection.h">
- <Filter>headers</Filter>
- </ClInclude>
- <ClInclude Include="..\..\src\complexityanalysis\ComplexityAnalysis.h">
- <Filter>headers</Filter>
- </ClInclude>
- <ClInclude Include="..\..\src\common\cpu.h">
- <Filter>headers</Filter>
- </ClInclude>
- <ClInclude Include="..\..\src\denoise\denoise.h">
- <Filter>headers</Filter>
- </ClInclude>
- <ClInclude Include="..\..\src\downsample\downsample.h">
- <Filter>headers</Filter>
- </ClInclude>
- <ClInclude Include="..\..\src\imagerotate\imagerotate.h">
- <Filter>headers</Filter>
- </ClInclude>
- <ClInclude Include="..\..\src\common\memory.h">
- <Filter>headers</Filter>
- </ClInclude>
- <ClInclude Include="..\..\src\common\resource.h">
- <Filter>headers</Filter>
- </ClInclude>
- <ClInclude Include="..\..\src\scenechangedetection\SceneChangeDetection.h">
- <Filter>headers</Filter>
- </ClInclude>
- <ClInclude Include="..\..\src\scenechangedetection\SceneChangeDetectionCommon.h">
- <Filter>headers</Filter>
- </ClInclude>
- <ClInclude Include="..\..\src\common\thread.h">
- <Filter>headers</Filter>
- </ClInclude>
- <ClInclude Include="..\..\src\common\typedef.h">
- <Filter>headers</Filter>
- </ClInclude>
- <ClInclude Include="..\..\src\common\util.h">
- <Filter>headers</Filter>
- </ClInclude>
- <ClInclude Include="..\..\src\vaacalc\vaacalculation.h">
- <Filter>headers</Filter>
- </ClInclude>
- <ClInclude Include="..\..\src\common\version.h">
- <Filter>headers</Filter>
- </ClInclude>
- <ClInclude Include="..\..\src\common\WelsFrameWork.h">
- <Filter>headers</Filter>
- </ClInclude>
- </ItemGroup>
- <ItemGroup>
- <CustomBuild Include="..\..\src\asm\asm_inc.asm">
- <Filter>ASM</Filter>
- </CustomBuild>
- <CustomBuild Include="..\..\src\asm\cpuid.asm">
- <Filter>ASM</Filter>
- </CustomBuild>
- <CustomBuild Include="..\..\src\asm\denoisefilter.asm">
- <Filter>ASM</Filter>
- </CustomBuild>
- <CustomBuild Include="..\..\src\asm\downsample_bilinear.asm">
- <Filter>ASM</Filter>
- </CustomBuild>
- <CustomBuild Include="..\..\src\asm\intra_pred.asm">
- <Filter>ASM</Filter>
- </CustomBuild>
- <CustomBuild Include="..\..\src\asm\sad.asm">
- <Filter>ASM</Filter>
- </CustomBuild>
- <CustomBuild Include="..\..\src\asm\vaa.asm">
- <Filter>ASM</Filter>
- </CustomBuild>
- </ItemGroup>
- <ItemGroup>
- <Filter Include="ASM">
- <UniqueIdentifier>{18a2a593-cf54-452e-bf69-5eaf9aac6518}</UniqueIdentifier>
- </Filter>
- <Filter Include="headers">
- <UniqueIdentifier>{5a921557-4f54-4838-80de-8c517b1d099b}</UniqueIdentifier>
- </Filter>
- <Filter Include="sources">
- <UniqueIdentifier>{0b628696-109b-4a2e-b11f-5e9e006b76ae}</UniqueIdentifier>
- </Filter>
- <Filter Include="resources">
- <UniqueIdentifier>{94dba5f3-1b39-4ccd-891b-6a70cb59f210}</UniqueIdentifier>
- </Filter>
- </ItemGroup>
- <ItemGroup>
- <ResourceCompile Include="..\..\src\common\WelsVP.rc">
- <Filter>resources</Filter>
- </ResourceCompile>
- </ItemGroup>
- <ItemGroup>
- <None Include="..\..\src\common\WelsVP.def">
- <Filter>resources</Filter>
- </None>
- </ItemGroup>
-</Project>
\ No newline at end of file
--- a/processing/build/win32/WelsVideoProcessor.sln
+++ /dev/null
@@ -1,29 +1,0 @@
-
-Microsoft Visual Studio Solution File, Format Version 10.00
-# Visual Studio 2008
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "WelsVideoProcessor", "WelsVideoProcessor.vcproj", "{C57D1D0E-A09A-45FD-87F9-CC6911F601FA}"
- ProjectSection(ProjectDependencies) = postProject
- {E8DFAFA1-8DAC-4127-8D27-FBD5819EE562} = {E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}
- EndProjectSection
-EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "WelsVP", "WelsVP.vcproj", "{E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}"
-EndProject
-Global
- GlobalSection(SolutionConfigurationPlatforms) = preSolution
- Debug|Win32 = Debug|Win32
- Release|Win32 = Release|Win32
- EndGlobalSection
- GlobalSection(ProjectConfigurationPlatforms) = postSolution
- {C57D1D0E-A09A-45FD-87F9-CC6911F601FA}.Debug|Win32.ActiveCfg = Debug|Win32
- {C57D1D0E-A09A-45FD-87F9-CC6911F601FA}.Debug|Win32.Build.0 = Debug|Win32
- {C57D1D0E-A09A-45FD-87F9-CC6911F601FA}.Release|Win32.ActiveCfg = Release|Win32
- {C57D1D0E-A09A-45FD-87F9-CC6911F601FA}.Release|Win32.Build.0 = Release|Win32
- {E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Debug|Win32.ActiveCfg = Debug|Win32
- {E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Debug|Win32.Build.0 = Debug|Win32
- {E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Release|Win32.ActiveCfg = Release|Win32
- {E8DFAFA1-8DAC-4127-8D27-FBD5819EE562}.Release|Win32.Build.0 = Release|Win32
- EndGlobalSection
- GlobalSection(SolutionProperties) = preSolution
- HideSolutionNode = FALSE
- EndGlobalSection
-EndGlobal
--- a/processing/build/win32/WelsVideoProcessor.vcproj
+++ /dev/null
@@ -1,213 +1,0 @@
-<?xml version="1.0" encoding="gb2312"?>
-<VisualStudioProject
- ProjectType="Visual C++"
- Version="9.00"
- Name="WelsVideoProcessor"
- ProjectGUID="{C57D1D0E-A09A-45FD-87F9-CC6911F601FA}"
- RootNamespace="WelsVideoProcessor"
- Keyword="Win32Proj"
- TargetFrameworkVersion="196613"
- >
- <Platforms>
- <Platform
- Name="Win32"
- />
- </Platforms>
- <ToolFiles>
- </ToolFiles>
- <Configurations>
- <Configuration
- Name="Debug|Win32"
- OutputDirectory="$(SolutionDir)..\..\bin\$(ConfigurationName)"
- IntermediateDirectory="$(SolutionDir)..\..\obj\$(ConfigurationName)\$(ProjectName)"
- ConfigurationType="1"
- CharacterSet="1"
- >
- <Tool
- Name="VCPreBuildEventTool"
- />
- <Tool
- Name="VCCustomBuildTool"
- />
- <Tool
- Name="VCXMLDataGeneratorTool"
- />
- <Tool
- Name="VCWebServiceProxyGeneratorTool"
- />
- <Tool
- Name="VCMIDLTool"
- />
- <Tool
- Name="VCCLCompilerTool"
- Optimization="0"
- PreprocessorDefinitions="WIN32;_DEBUG;_CONSOLE"
- MinimalRebuild="true"
- BasicRuntimeChecks="3"
- RuntimeLibrary="1"
- UsePrecompiledHeader="0"
- WarningLevel="3"
- DebugInformationFormat="4"
- />
- <Tool
- Name="VCManagedResourceCompilerTool"
- />
- <Tool
- Name="VCResourceCompilerTool"
- />
- <Tool
- Name="VCPreLinkEventTool"
- />
- <Tool
- Name="VCLinkerTool"
- LinkIncremental="2"
- GenerateDebugInformation="true"
- SubSystem="1"
- TargetMachine="1"
- />
- <Tool
- Name="VCALinkTool"
- />
- <Tool
- Name="VCManifestTool"
- />
- <Tool
- Name="VCXDCMakeTool"
- />
- <Tool
- Name="VCBscMakeTool"
- />
- <Tool
- Name="VCFxCopTool"
- />
- <Tool
- Name="VCAppVerifierTool"
- />
- <Tool
- Name="VCPostBuildEventTool"
- />
- </Configuration>
- <Configuration
- Name="Release|Win32"
- OutputDirectory="$(SolutionDir)..\..\bin\$(ConfigurationName)"
- IntermediateDirectory="$(SolutionDir)..\..\obj\$(ConfigurationName)\$(ProjectName)"
- ConfigurationType="1"
- CharacterSet="1"
- WholeProgramOptimization="1"
- >
- <Tool
- Name="VCPreBuildEventTool"
- />
- <Tool
- Name="VCCustomBuildTool"
- />
- <Tool
- Name="VCXMLDataGeneratorTool"
- />
- <Tool
- Name="VCWebServiceProxyGeneratorTool"
- />
- <Tool
- Name="VCMIDLTool"
- />
- <Tool
- Name="VCCLCompilerTool"
- Optimization="2"
- EnableIntrinsicFunctions="true"
- PreprocessorDefinitions="WIN32;NDEBUG;_CONSOLE"
- RuntimeLibrary="0"
- EnableFunctionLevelLinking="true"
- UsePrecompiledHeader="0"
- WarningLevel="3"
- DebugInformationFormat="3"
- />
- <Tool
- Name="VCManagedResourceCompilerTool"
- />
- <Tool
- Name="VCResourceCompilerTool"
- />
- <Tool
- Name="VCPreLinkEventTool"
- />
- <Tool
- Name="VCLinkerTool"
- LinkIncremental="1"
- GenerateDebugInformation="true"
- SubSystem="1"
- OptimizeReferences="2"
- EnableCOMDATFolding="2"
- TargetMachine="1"
- />
- <Tool
- Name="VCALinkTool"
- />
- <Tool
- Name="VCManifestTool"
- />
- <Tool
- Name="VCXDCMakeTool"
- />
- <Tool
- Name="VCBscMakeTool"
- />
- <Tool
- Name="VCFxCopTool"
- />
- <Tool
- Name="VCAppVerifierTool"
- />
- <Tool
- Name="VCPostBuildEventTool"
- />
- </Configuration>
- </Configurations>
- <References>
- </References>
- <Files>
- <Filter
- Name="Source Files"
- Filter="cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx"
- UniqueIdentifier="{4FC737F1-C7A5-4376-A066-2A32D752A2FF}"
- >
- <File
- RelativePath="..\..\src\testbed\stdafx.cpp"
- >
- </File>
- <File
- RelativePath="..\..\src\testbed\wels_process.cpp"
- >
- </File>
- <File
- RelativePath="..\..\src\testbed\WelsVideoProcessor.cpp"
- >
- </File>
- </Filter>
- <Filter
- Name="Header Files"
- Filter="h;hpp;hxx;hm;inl;inc;xsd"
- UniqueIdentifier="{93995380-89BD-4b04-88EB-625FBE52EBFB}"
- >
- <File
- RelativePath="..\..\src\testbed\stdafx.h"
- >
- </File>
- <File
- RelativePath="..\..\src\testbed\targetver.h"
- >
- </File>
- <File
- RelativePath="..\..\src\testbed\wels_process.h"
- >
- </File>
- </Filter>
- <Filter
- Name="Resource Files"
- Filter="rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav"
- UniqueIdentifier="{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}"
- >
- </Filter>
- </Files>
- <Globals>
- </Globals>
-</VisualStudioProject>
--- a/processing/interface/IWelsVP.h
+++ /dev/null
@@ -1,286 +1,0 @@
-/*!
- * \copy
- * Copyright (c) 2004-2013, Cisco Systems
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- *
- * \file : IWelsVP.h
- *
- * \brief : Interface of wels video processor class
- *
- * \date : 2011/01/04
- *
- * \description : 1. should support both C/C++ style interface
- * 2. should concern with the feature extension requirement
- * 3. should care the usage of "char"==>
- * 1) value char : signed char/unsigned char
- * 2) string char : char
- *
- *************************************************************************************
- */
-
-#ifndef IWELSVP_H_
-#define IWELSVP_H_
-
-#ifdef _WIN32
-#define WELSAPI __stdcall
-#else
-#define WELSAPI
-#endif
-
-#define WELSVP_MAJOR_VERSION 1
-#define WELSVP_MINOR_VERSION 1
-#define WELSVP_VERSION ((WELSVP_MAJOR_VERSION << 8) + WELSVP_MINOR_VERSION)
-
-typedef enum {
- RET_SUCCESS = 0,
- RET_FAILED = -1,
- RET_INVALIDPARAM = -2,
- RET_OUTOFMEMORY = -3,
- RET_NOTSUPPORTED = -4,
- RET_UNEXPECTED = -5,
- RET_NEEDREINIT = -6
-} EResult;
-
-typedef enum {
- VIDEO_FORMAT_NULL = 0, /* invalid format */
- /*rgb color formats*/
- VIDEO_FORMAT_RGB = 1, /* rgb 24bits */
- VIDEO_FORMAT_RGBA = 2, /* rgba */
- VIDEO_FORMAT_RGB555 = 3, /* rgb555 */
- VIDEO_FORMAT_RGB565 = 4, /* rgb565 */
- VIDEO_FORMAT_BGR = 5, /* bgr 24bits */
- VIDEO_FORMAT_BGRA = 6, /* bgr 32bits */
- VIDEO_FORMAT_ABGR = 7, /* abgr */
- VIDEO_FORMAT_ARGB = 8, /* argb */
-
- /*yuv color formats*/
- VIDEO_FORMAT_YUY2 = 20, /* yuy2 */
- VIDEO_FORMAT_YVYU = 21, /* yvyu */
- VIDEO_FORMAT_UYVY = 22, /* uyvy */
- VIDEO_FORMAT_I420 = 23, /* yuv 4:2:0 planar */
- VIDEO_FORMAT_YV12 = 24, /* yuv 4:2:0 planar */
- VIDEO_FORMAT_INTERNAL = 25, /* Only Used for SVC decoder testbed */
- VIDEO_FORMAT_NV12 = 26, /* y planar + uv packed */
- VIDEO_FORMAT_I422 = 27, /* yuv 4:2:2 planar */
- VIDEO_FORMAT_I444 = 28, /* yuv 4:4:4 planar */
- VIDEO_FORMAT_YUYV = 20, /* yuv 4:2:2 packed */
-
- VIDEO_FORMAT_RGB24 = 1,
- VIDEO_FORMAT_RGB32 = 2,
- VIDEO_FORMAT_RGB24_INV = 5,
- VIDEO_FORMAT_RGB32_INV = 6,
- VIDEO_FORMAT_RGB555_INV = 7,
- VIDEO_FORMAT_RGB565_INV = 8,
- VIDEO_FORMAT_YUV2 = 21,
- VIDEO_FORMAT_420 = 23,
-
- VIDEO_FORMAT_VFlip = 0x80000000
-} EVideoFormat;
-
-typedef enum {
- BUFFER_HOSTMEM = 0,
- BUFFER_SURFACE
-} EPixMapBufferProperty;
-
-typedef struct {
- int iRectTop;
- int iRectLeft;
- int iRectWidth;
- int iRectHeight;
-} SRect;
-
-typedef struct {
- void* pPixel[3];
- int iSizeInBits;
- int iStride[3];
- SRect sRect;
- EVideoFormat eFormat;
- EPixMapBufferProperty eProperty;//not use? to remove? but how about the size of SPixMap?
-} SPixMap;
-
-typedef enum {
- METHOD_NULL = 0,
- METHOD_COLORSPACE_CONVERT ,//not support yet
- METHOD_DENOISE ,
- METHOD_SCENE_CHANGE_DETECTION ,
- METHOD_DOWNSAMPLE ,
- METHOD_VAA_STATISTICS ,
- METHOD_BACKGROUND_DETECTION ,
- METHOD_ADAPTIVE_QUANT ,
- METHOD_COMPLEXITY_ANALYSIS ,
- METHOD_IMAGE_ROTATE ,
- METHOD_MASK
-} EMethods;
-
-//-----------------------------------------------------------------//
-// Algorithm parameters define
-//-----------------------------------------------------------------//
-
-typedef struct {
- int bSceneChangeFlag; // 0:false ; 1:true
-} SSceneChangeResult;
-
-typedef enum {
- SIMILAR_SCENE, //similar scene
- MEDIUM_CHANGED_SCENE, //medium changed scene
- LARGE_CHANGED_SCENE, //large changed scene
-} ESceneChangeIdc;
-
-typedef struct {
- unsigned char* pCurY; // Y data of current frame
- unsigned char* pRefY; // Y data of pRef frame for diff calc
- int (*pSad8x8)[4]; // sad of 8x8, every 4 in the same 16x16 get together
- int* pSsd16x16; // sum of square difference of 16x16
- int* pSum16x16; // sum of 16x16
- int* pSumOfSquare16x16; // sum of square of 16x16
- int (*pSumOfDiff8x8)[4];
- unsigned char (*pMad8x8)[4];
- int iFrameSad; // sad of frame
-} SVAACalcResult;
-
-typedef struct {
- int iCalcVar;
- int iCalcBgd;
- int iCalcSsd;
- int iReserved;
- SVAACalcResult* pCalcResult;
-} SVAACalcParam;
-
-typedef struct {
- signed char* pBackgroundMbFlag;
- SVAACalcResult* pCalcRes;
-} SBGDInterface;
-
-typedef enum {
- AQ_QUALITY_MODE, //Quality mode
- AQ_BITRATE_MODE, //Bitrate mode
-} EAQModes;
-
-typedef struct {
- unsigned short uiMotionIndex;
- unsigned short uiTextureIndex;
-} SMotionTextureUnit;
-
-typedef struct {
- int iAdaptiveQuantMode; // 0:quality mode, 1:bitrates mode
- SVAACalcResult* pCalcResult;
- SMotionTextureUnit* pMotionTextureUnit;
-
- signed char* pMotionTextureIndexToDeltaQp;
- double dAverMotionTextureIndexToDeltaQp;
-} SAdaptiveQuantizationParam;
-
-typedef enum {
- FRAME_SAD = 0,
- GOM_SAD = -1,
- GOM_VAR = -2
-} EComplexityAnalysisMode;
-
-typedef struct {
- int iComplexityAnalysisMode;
- int iCalcBgd;
- int iMbNumInGom;
- int iFrameComplexity;
- int* pGomComplexity;
- int* pGomForegroundBlockNum;
- signed char* pBackgroundMbFlag;
- unsigned int* uiRefMbType;
- SVAACalcResult* pCalcResult;
-} SComplexityAnalysisParam;
-
-/////////////////////////////////////////////////////////////////////////////////////////////
-
-typedef struct {
- void* pCtx;
- EResult (*Init) (void* pCtx, int iType, void* pCfg);
- EResult (*Uninit) (void* pCtx, int iType);
- EResult (*Flush) (void* pCtx, int iType);
- EResult (*Process) (void* pCtx, int iType, SPixMap* pSrc, SPixMap* dst);
- EResult (*Get) (void* pCtx, int iType, void* pParam);
- EResult (*Set) (void* pCtx, int iType, void* pParam);
- EResult (*SpecialFeature) (void* pCtx, int iType, void* pIn, void* pOut);
-} IWelsVPc;
-
-#if defined(__cplusplus) && !defined(CINTERFACE) /* C++ style interface */
-
-class IWelsVP {
- public:
- virtual ~IWelsVP() {}
-
- public:
- virtual EResult Init (int iType, void* pCfg) = 0;
- virtual EResult Uninit (int iType) = 0;
- virtual EResult Flush (int iType) = 0;
- virtual EResult Process (int iType, SPixMap* pSrc, SPixMap* dst) = 0;
- virtual EResult Get (int iType, void* pParam) = 0;
- virtual EResult Set (int iType, void* pParam) = 0;
- virtual EResult SpecialFeature (int iType, void* pIn, void* pOut) = 0;
-};
-
-/* Recommend to invoke the interface via the micro for convenient */
-#define IWelsVPFunc_Init(p, a, b) (p)->Init(a, b)
-#define IWelsVPFunc_Uninit(p, a) (p)->Uninit(a)
-#define IWelsVPFunc_Flush(p, a) (p)->Flush(a)
-#define IWelsVPFunc_Process(p, a, b, c) (p)->Process(a, b, c)
-#define IWelsVPFunc_Get(p, a, b) (p)->Get(a, b)
-#define IWelsVPFunc_Set(p, a, b) (p)->Set(a, b)
-#define IWelsVPFunc_SpecialFeature(p, a, b, c) (p)->SpecialFeature(a, b, c)
-
-/* C++ interface version */
-#define WELSVP_INTERFACE_VERION (0x8000 + (WELSVP_VERSION & 0x7fff))
-#define WELSVP_EXTERNC_BEGIN extern "C" {
-#define WELSVP_EXTERNC_END }
-
-#else /* C style interface */
-
-/* Recommend to invoke the interface via the micro for convenient */
-#define IWelsVPFunc_Init(p, a, b) (p)->Init(p->h, a, b)
-#define IWelsVPFunc_Uninit(p, a) (p)->Uninit(p->h, a)
-#define IWelsVPFunc_Flush(p, a) (p)->Flush(p->h, a)
-#define IWelsVPFunc_Process(p, a, b, c) (p)->Process(p->h, a, b, c)
-#define IWelsVPFunc_Get(p, a, b) (p)->Get(p->h, a, b)
-#define IWelsVPFunc_Set(p, a, b) (p)->Set(p->h, a, b)
-#define IWelsVPFunc_SpecialFeature(p, a, b, c) (p)->SpecialFeature(p->h, a, b, c)
-
-/* C interface version */
-#define WELSVP_INTERFACE_VERION (0x0001 + (WELSVP_VERSION & 0x7fff))
-#define WELSVP_EXTERNC_BEGIN
-#define WELSVP_EXTERNC_END
-
-#endif
-
-WELSVP_EXTERNC_BEGIN
-EResult WELSAPI CreateVpInterface (void** ppCtx, int iVersion /*= WELSVP_INTERFACE_VERION*/);
-EResult WELSAPI DestroyVpInterface (void* pCtx , int iVersion /*= WELSVP_INTERFACE_VERION*/);
-WELSVP_EXTERNC_END
-
-//////////////////////////////////////////////////////////////////////////////////////////////
-#endif // IWELSVP_H_
-
-
--- a/processing/src/adaptivequantization/AdaptiveQuantization.cpp
+++ /dev/null
@@ -1,256 +1,0 @@
-/*!
- * \copy
- * Copyright (c) 2013, Cisco Systems
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- */
-#include "AdaptiveQuantization.h"
-#include "../common/cpu.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-
-
-#define AVERAGE_TIME_MOTION (0.3) //0.3046875 // 1/4 + 1/16 - 1/128 ~ 0.3
-#define AVERAGE_TIME_TEXTURE_QUALITYMODE (1.0) //0.5 // 1/2
-#define AVERAGE_TIME_TEXTURE_BITRATEMODE (0.875) //0.5 // 1/2
-#define MODEL_ALPHA (0.9910) //1.5 //1.1102
-#define MODEL_TIME (5.8185) //9.0 //5.9842
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-CAdaptiveQuantization::CAdaptiveQuantization (int32_t iCpuFlag) {
- m_CPUFlag = iCpuFlag;
- m_eMethod = METHOD_ADAPTIVE_QUANT;
- m_pfVar = NULL;
- WelsMemset (&m_sAdaptiveQuantParam, 0, sizeof (m_sAdaptiveQuantParam));
- WelsInitVarFunc (m_pfVar, m_CPUFlag);
-}
-
-CAdaptiveQuantization::~CAdaptiveQuantization() {
-}
-
-EResult CAdaptiveQuantization::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pRefPixMap) {
- EResult eReturn = RET_INVALIDPARAM;
-
- int32_t iWidth = pSrcPixMap->sRect.iRectWidth;
- int32_t iHeight = pSrcPixMap->sRect.iRectHeight;
- int32_t iMbWidth = iWidth >> 4;
- int32_t iMbHeight = iHeight >> 4;
- int32_t iMbTotalNum = iMbWidth * iMbHeight;
-
- SMotionTextureUnit* pMotionTexture = NULL;
- SVAACalcResult* pVaaCalcResults = NULL;
- int8_t iMotionTextureIndexToDeltaQp = 0;
- int32_t iAverMotionTextureIndexToDeltaQp = 0; // double to uint32
- double_t dAverageMotionIndex = 0.0; // double to float
- double_t dAverageTextureIndex = 0.0;
-
- double_t dQStep = 0.0;
- double_t dLumaMotionDeltaQp = 0;
- double_t dLumaTextureDeltaQp = 0;
-
- uint8_t* pRefFrameY = NULL, *pCurFrameY = NULL;
- int32_t iRefStride = 0, iCurStride = 0;
-
- uint8_t* pRefFrameTmp = NULL, *pCurFrameTmp = NULL;
- int32_t i = 0, j = 0;
-
- pRefFrameY = (uint8_t*)pRefPixMap->pPixel[0];
- pCurFrameY = (uint8_t*)pSrcPixMap->pPixel[0];
-
- iRefStride = pRefPixMap->iStride[0];
- iCurStride = pSrcPixMap->iStride[0];
-
- /////////////////////////////////////// motion //////////////////////////////////
- // motion MB residual variance
- dAverageMotionIndex = 0.0;
- dAverageTextureIndex = 0.0;
- pMotionTexture = m_sAdaptiveQuantParam.pMotionTextureUnit;
- pVaaCalcResults = m_sAdaptiveQuantParam.pCalcResult;
-
- if (pVaaCalcResults->pRefY == pRefFrameY && pVaaCalcResults->pCurY == pCurFrameY) {
- int32_t iMbIndex = 0;
- int32_t iSumDiff, iSQDiff, uiSum, iSQSum;
- for (j = 0; j < iMbHeight; j ++) {
- pRefFrameTmp = pRefFrameY;
- pCurFrameTmp = pCurFrameY;
- for (i = 0; i < iMbWidth; i++) {
- iSumDiff = pVaaCalcResults->pSad8x8[iMbIndex][0];
- iSumDiff += pVaaCalcResults->pSad8x8[iMbIndex][1];
- iSumDiff += pVaaCalcResults->pSad8x8[iMbIndex][2];
- iSumDiff += pVaaCalcResults->pSad8x8[iMbIndex][3];
-
- iSQDiff = pVaaCalcResults->pSsd16x16[iMbIndex];
- uiSum = pVaaCalcResults->pSum16x16[iMbIndex];
- iSQSum = pVaaCalcResults->pSumOfSquare16x16[iMbIndex];
-
- iSumDiff = iSumDiff >> 8;
- pMotionTexture->uiMotionIndex = (iSQDiff >> 8) - (iSumDiff * iSumDiff);
-
- uiSum = uiSum >> 8;
- pMotionTexture->uiTextureIndex = (iSQSum >> 8) - (uiSum * uiSum);
-
- dAverageMotionIndex += pMotionTexture->uiMotionIndex;
- dAverageTextureIndex += pMotionTexture->uiTextureIndex;
- pMotionTexture++;
- ++iMbIndex;
- pRefFrameTmp += MB_WIDTH_LUMA;
- pCurFrameTmp += MB_WIDTH_LUMA;
- }
- pRefFrameY += (iRefStride) << 4;
- pCurFrameY += (iCurStride) << 4;
- }
- } else {
- for (j = 0; j < iMbHeight; j ++) {
- pRefFrameTmp = pRefFrameY;
- pCurFrameTmp = pCurFrameY;
- for (i = 0; i < iMbWidth; i++) {
- m_pfVar (pRefFrameTmp, iRefStride, pCurFrameTmp, iCurStride, pMotionTexture);
- dAverageMotionIndex += pMotionTexture->uiMotionIndex;
- dAverageTextureIndex += pMotionTexture->uiTextureIndex;
- pMotionTexture++;
- pRefFrameTmp += MB_WIDTH_LUMA;
- pCurFrameTmp += MB_WIDTH_LUMA;
-
- }
- pRefFrameY += (iRefStride) << 4;
- pCurFrameY += (iCurStride) << 4;
- }
- }
- dAverageMotionIndex = dAverageMotionIndex / iMbTotalNum;
- dAverageTextureIndex = dAverageTextureIndex / iMbTotalNum;
- if ((dAverageMotionIndex <= PESN) && (dAverageMotionIndex >= -PESN)) {
- dAverageMotionIndex = 1.0;
- }
- if ((dAverageTextureIndex <= PESN) && (dAverageTextureIndex >= -PESN)) {
- dAverageTextureIndex = 1.0;
- }
- // motion mb residual map to QP
- // texture mb original map to QP
- iAverMotionTextureIndexToDeltaQp = 0;
- dAverageMotionIndex = AVERAGE_TIME_MOTION * dAverageMotionIndex;
-
- if (m_sAdaptiveQuantParam.iAdaptiveQuantMode == AQ_QUALITY_MODE) {
- dAverageTextureIndex = AVERAGE_TIME_TEXTURE_QUALITYMODE * dAverageTextureIndex;
- } else {
- dAverageTextureIndex = AVERAGE_TIME_TEXTURE_BITRATEMODE * dAverageTextureIndex;
- }
-
- pMotionTexture = m_sAdaptiveQuantParam.pMotionTextureUnit;
- for (j = 0; j < iMbHeight; j ++) {
- for (i = 0; i < iMbWidth; i++) {
- double_t a = pMotionTexture->uiTextureIndex / dAverageTextureIndex;
- dQStep = (a - 1) / (a + MODEL_ALPHA);
- dLumaTextureDeltaQp = MODEL_TIME * dQStep;// range +- 6
-
- iMotionTextureIndexToDeltaQp = (int8_t)dLumaTextureDeltaQp;
-
- a = pMotionTexture->uiMotionIndex / dAverageMotionIndex;
- dQStep = (a - 1) / (a + MODEL_ALPHA);
- dLumaMotionDeltaQp = MODEL_TIME * dQStep;// range +- 6
-
- if ((m_sAdaptiveQuantParam.iAdaptiveQuantMode == AQ_QUALITY_MODE && dLumaMotionDeltaQp < -PESN)
- || (m_sAdaptiveQuantParam.iAdaptiveQuantMode == AQ_BITRATE_MODE)) {
- iMotionTextureIndexToDeltaQp += (int8_t)dLumaMotionDeltaQp;
- }
-
- m_sAdaptiveQuantParam.pMotionTextureIndexToDeltaQp[j * iMbWidth + i] = iMotionTextureIndexToDeltaQp;
- iAverMotionTextureIndexToDeltaQp += iMotionTextureIndexToDeltaQp;
- pMotionTexture++;
- }
- }
- m_sAdaptiveQuantParam.dAverMotionTextureIndexToDeltaQp = (1.0 * iAverMotionTextureIndexToDeltaQp) / iMbTotalNum;
-
- eReturn = RET_SUCCESS;
-
- return eReturn;
-}
-
-
-
-EResult CAdaptiveQuantization::Set (int32_t iType, void* pParam) {
- if (pParam == NULL) {
- return RET_INVALIDPARAM;
- }
-
- m_sAdaptiveQuantParam = * (SAdaptiveQuantizationParam*)pParam;
-
- return RET_SUCCESS;
-}
-
-EResult CAdaptiveQuantization::Get (int32_t iType, void* pParam) {
- if (pParam == NULL) {
- return RET_INVALIDPARAM;
- }
-
- SAdaptiveQuantizationParam* sAdaptiveQuantParam = (SAdaptiveQuantizationParam*)pParam;
-
- sAdaptiveQuantParam->dAverMotionTextureIndexToDeltaQp = m_sAdaptiveQuantParam.dAverMotionTextureIndexToDeltaQp;
-
- return RET_SUCCESS;
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////
-
-void CAdaptiveQuantization::WelsInitVarFunc (PVarFunc& pfVar, int32_t iCpuFlag) {
- pfVar = SampleVariance16x16_c;
-
-#ifdef X86_ASM
- if (iCpuFlag & WELS_CPU_SSE2) {
- pfVar = SampleVariance16x16_sse2;
- }
-#endif
-}
-
-void SampleVariance16x16_c (uint8_t* pRefY, int32_t iRefStride, uint8_t* pSrcY, int32_t iSrcStride,
- SMotionTextureUnit* pMotionTexture) {
- uint32_t uiCurSquare = 0, uiSquare = 0;
- uint16_t uiCurSum = 0, uiSum = 0;
-
- for (int32_t y = 0; y < MB_WIDTH_LUMA; y++) {
- for (int32_t x = 0; x < MB_WIDTH_LUMA; x++) {
- uint32_t uiDiff = WELS_ABS (pRefY[x] - pSrcY[x]);
- uiSum += uiDiff;
- uiSquare += uiDiff * uiDiff;
-
- uiCurSum += pSrcY[x];
- uiCurSquare += pSrcY[x] * pSrcY[x];
- }
- pRefY += iRefStride;
- pSrcY += iSrcStride;
- }
-
- uiSum = uiSum >> 8;
- pMotionTexture->uiMotionIndex = (uiSquare >> 8) - (uiSum * uiSum);
-
- uiCurSum = uiCurSum >> 8;
- pMotionTexture->uiTextureIndex = (uiCurSquare >> 8) - (uiCurSum * uiCurSum);
-}
-
-WELSVP_NAMESPACE_END
--- a/processing/src/adaptivequantization/AdaptiveQuantization.h
+++ /dev/null
@@ -1,85 +1,0 @@
-/*!
- * \copy
- * Copyright (c) 2011-2013, Cisco Systems
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- * \file : AdaptiveQuantization.h
- *
- * \brief : adaptive quantization class of wels video processor class
- *
- * \date : 2011/03/21
- *
- * \description : 1. rewrite the package code of scene change detection class
- *
- */
-
-#ifndef WELSVP_ADAPTIVEQUANTIZATION_H
-#define WELSVP_ADAPTIVEQUANTIZATION_H
-
-#include "../common/util.h"
-#include "../common/memory.h"
-#include "../common/WelsFrameWork.h"
-#include "../../interface/IWelsVP.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-typedef void (VarFunc) (uint8_t* pRefY, int32_t iRefStrideY, uint8_t* pSrc, int32_t iSrcStrideY,
- SMotionTextureUnit* pMotionTexture);
-
-typedef VarFunc* PVarFunc;
-
-VarFunc SampleVariance16x16_c;
-
-#ifdef X86_ASM
-WELSVP_EXTERN_C_BEGIN
-VarFunc SampleVariance16x16_sse2;
-WELSVP_EXTERN_C_END
-#endif
-
-
-class CAdaptiveQuantization : public IStrategy {
- public:
- CAdaptiveQuantization (int32_t iCpuFlag);
- ~CAdaptiveQuantization();
-
- EResult Process (int32_t iType, SPixMap* pSrc, SPixMap* pRef);
- EResult Set (int32_t iType, void* pParam);
- EResult Get (int32_t iType, void* pParam);
-
- private:
- void WelsInitVarFunc (PVarFunc& pfVar, int32_t iCpuFlag);
-
- private:
- PVarFunc m_pfVar;
- int32_t m_CPUFlag;
- SAdaptiveQuantizationParam m_sAdaptiveQuantParam;
-};
-
-WELSVP_NAMESPACE_END
-
-#endif
--- a/processing/src/asm/asm_inc.asm
+++ /dev/null
@@ -1,235 +1,0 @@
-;*!
-;* \copy
-;* Copyright (c) 2009-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* sse2inc.asm
-;*
-;* Abstract
-;* macro and constant
-;*
-;* History
-;* 8/5/2009 Created
-;*
-;*
-;*************************************************************************/
-;***********************************************************************
-; Options, for DEBUG
-;***********************************************************************
-
-%if 1
- %define MOVDQ movdqa
-%else
- %define MOVDQ movdqu
-%endif
-
-%if 1
- %define WELSEMMS emms
-%else
- %define WELSEMMS
-%endif
-
-BITS 32
-
-;***********************************************************************
-; Macros
-;***********************************************************************
-
-%macro WELS_EXTERN 1
- %ifdef PREFIX
- global _%1
- %define %1 _%1
- %else
- global %1
- %endif
-%endmacro
-
-%macro WELS_AbsW 2
- pxor %2, %2
- psubw %2, %1
- pmaxsw %1, %2
-%endmacro
-
-%macro MMX_XSwap 4
- movq %4, %2
- punpckh%1 %4, %3
- punpckl%1 %2, %3
-%endmacro
-
-; pOut mm1, mm4, mm5, mm3
-%macro MMX_Trans4x4W 5
- MMX_XSwap wd, %1, %2, %5
- MMX_XSwap wd, %3, %4, %2
- MMX_XSwap dq, %1, %3, %4
- MMX_XSwap dq, %5, %2, %3
-%endmacro
-
-;for TRANSPOSE
-%macro SSE2_XSawp 4
- movdqa %4, %2
- punpckl%1 %2, %3
- punpckh%1 %4, %3
-%endmacro
-
-; in: xmm1, xmm2, xmm3, xmm4 pOut: xmm1, xmm4, xmm5, mm3
-%macro SSE2_Trans4x4D 5
- SSE2_XSawp dq, %1, %2, %5
- SSE2_XSawp dq, %3, %4, %2
- SSE2_XSawp qdq, %1, %3, %4
- SSE2_XSawp qdq, %5, %2, %3
-%endmacro
-
-;in: xmm0, xmm1, xmm2, xmm3 pOut: xmm0, xmm1, xmm3, xmm4
-%macro SSE2_TransTwo4x4W 5
- SSE2_XSawp wd, %1, %2, %5
- SSE2_XSawp wd, %3, %4, %2
- SSE2_XSawp dq, %1, %3, %4
- SSE2_XSawp dq, %5, %2, %3
- SSE2_XSawp qdq, %1, %5, %2
- SSE2_XSawp qdq, %4, %3, %5
-%endmacro
-
-;in: m1, m2, m3, m4, m5, m6, m7, m8
-;pOut: m5, m3, m4, m8, m6, m2, m7, m1
-%macro SSE2_TransTwo8x8B 9
- movdqa %9, %8
- SSE2_XSawp bw, %1, %2, %8
- SSE2_XSawp bw, %3, %4, %2
- SSE2_XSawp bw, %5, %6, %4
- movdqa %6, %9
- movdqa %9, %4
- SSE2_XSawp bw, %7, %6, %4
-
- SSE2_XSawp wd, %1, %3, %6
- SSE2_XSawp wd, %8, %2, %3
- SSE2_XSawp wd, %5, %7, %2
- movdqa %7, %9
- movdqa %9, %3
- SSE2_XSawp wd, %7, %4, %3
-
- SSE2_XSawp dq, %1, %5, %4
- SSE2_XSawp dq, %6, %2, %5
- SSE2_XSawp dq, %8, %7, %2
- movdqa %7, %9
- movdqa %9, %5
- SSE2_XSawp dq, %7, %3, %5
-
- SSE2_XSawp qdq, %1, %8, %3
- SSE2_XSawp qdq, %4, %2, %8
- SSE2_XSawp qdq, %6, %7, %2
- movdqa %7, %9
- movdqa %9, %1
- SSE2_XSawp qdq, %7, %5, %1
- movdqa %5, %9
-%endmacro
-
-;xmm0, xmm6, xmm7, [eax], [ecx]
-;xmm7 = 0, eax = pix1, ecx = pix2, xmm0 save the result
-%macro SSE2_LoadDiff8P 5
- movq %1, %4
- punpcklbw %1, %3
- movq %2, %5
- punpcklbw %2, %3
- psubw %1, %2
-%endmacro
-
-; m2 = m1 + m2, m1 = m1 - m2
-%macro SSE2_SumSub 3
- movdqa %3, %2
- paddw %2, %1
- psubw %1, %3
-%endmacro
-
-
-%macro butterfly_1to16_sse 3 ; xmm? for dst, xmm? for tmp, one byte for pSrc [generic register name: a/b/c/d]
- mov %3h, %3l
- movd %1, e%3x ; i.e, 1% = eax (=b0)
- pshuflw %2, %1, 00h ; ..., b0 b0 b0 b0 b0 b0 b0 b0
- pshufd %1, %2, 00h ; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0
-%endmacro
-
-;copy a dw into a xmm for 8 times
-%macro SSE2_Copy8Times 2
- movd %1, %2
- punpcklwd %1, %1
- pshufd %1, %1, 0
-%endmacro
-
-;copy a db into a xmm for 16 times
-%macro SSE2_Copy16Times 2
- movd %1, %2
- pshuflw %1, %1, 0
- punpcklqdq %1, %1
- packuswb %1, %1
-%endmacro
-
-
-
-;***********************************************************************
-;preprocessor constants
-;***********************************************************************
-;dw 32,32,32,32,32,32,32,32 for xmm
-;dw 32,32,32,32 for mm
-%macro WELS_DW32 1
- pcmpeqw %1,%1
- psrlw %1,15
- psllw %1,5
-%endmacro
-
-;dw 1, 1, 1, 1, 1, 1, 1, 1 for xmm
-;dw 1, 1, 1, 1 for mm
-%macro WELS_DW1 1
- pcmpeqw %1,%1
- psrlw %1,15
-%endmacro
-
-;all 0 for xmm and mm
-%macro WELS_Zero 1
- pxor %1, %1
-%endmacro
-
-;dd 1, 1, 1, 1 for xmm
-;dd 1, 1 for mm
-%macro WELS_DD1 1
- pcmpeqw %1,%1
- psrld %1,31
-%endmacro
-
-;dB 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
-%macro WELS_DB1 1
- pcmpeqw %1,%1
- psrlw %1,15
- packuswb %1,%1
-%endmacro
-
-
-
-
-
-
--- a/processing/src/asm/cpuid.asm
+++ /dev/null
@@ -1,169 +1,0 @@
-;*!
-;* \copy
-;* Copyright (c) 2009-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* cpu_mmx.asm
-;*
-;* Abstract
-;* verify cpuid feature support and cpuid detection
-;*
-;* History
-;* 04/29/2009 Created
-;*
-;*************************************************************************/
-
-bits 32
-
-;******************************************************************************************
-; Macros
-;******************************************************************************************
-
-%macro WELS_EXTERN 1
- %ifdef PREFIX
- global _%1
- %define %1 _%1
- %else
- global %1
- %endif
-%endmacro
-
-;******************************************************************************************
-; Code
-;******************************************************************************************
-
-SECTION .text
-
-; refer to "The IA-32 Intel(R) Architecture Software Developers Manual, Volume 2A A-M"
-; section CPUID - CPU Identification
-
-WELS_EXTERN WelsCPUIdVerify
-ALIGN 16
-;******************************************************************************************
-; int32_t WelsCPUIdVerify()
-;******************************************************************************************
-WelsCPUIdVerify:
- pushfd ; decrease the SP by 4 and load EFLAGS register onto stack, pushfd 32 bit and pushf for 16 bit
- pushfd ; need push 2 EFLAGS, one for processing and the another one for storing purpose
- pop ecx ; get EFLAGS to bit manipulation
- mov eax, ecx ; store into ecx followed
- xor eax, 00200000h ; get ID flag (bit 21) of EFLAGS to directly indicate cpuid support or not
- xor eax, ecx ; get the ID flag bitwise, eax - 0: not support; otherwise: support
- popfd ; store back EFLAGS and keep unchanged for system
- ret
-
-WELS_EXTERN WelsCPUId
-ALIGN 16
-;****************************************************************************************************
-; void WelsCPUId( int32_t uiIndex, int32_t *pFeatureA, int32_t *pFeatureB, int32_t *pFeatureC, int32_t *pFeatureD )
-;****************************************************************************************************
-WelsCPUId:
- push ebx
- push edi
-
- mov eax, [esp+12] ; operating index
- cpuid ; cpuid
-
- ; processing various information return
- mov edi, [esp+16]
- mov [edi], eax
- mov edi, [esp+20]
- mov [edi], ebx
- mov edi, [esp+24]
- mov [edi], ecx
- mov edi, [esp+28]
- mov [edi], edx
-
- pop edi
- pop ebx
- ret
-
-WELS_EXTERN WelsCPUSupportAVX
-; need call after cpuid=1 and eax, ecx flag got then
-ALIGN 16
-;****************************************************************************************************
-; int32_t WelsCPUSupportAVX( uint32_t eax, uint32_t ecx )
-;****************************************************************************************************
-WelsCPUSupportAVX:
- mov eax, [esp+4]
- mov ecx, [esp+8]
-
- ; refer to detection of AVX addressed in INTEL AVX manual document
- and ecx, 018000000H
- cmp ecx, 018000000H ; check both OSXSAVE and AVX feature flags
- jne avx_not_supported
- ; processor supports AVX instructions and XGETBV is enabled by OS
- mov ecx, 0 ; specify 0 for XFEATURE_ENABLED_MASK register
- XGETBV ; result in EDX:EAX
- and eax, 06H
- cmp eax, 06H ; check OS has enabled both XMM and YMM state support
- jne avx_not_supported
- mov eax, 1
- ret
-avx_not_supported:
- mov eax, 0
- ret
-
-WELS_EXTERN WelsCPUSupportFMA
-; need call after cpuid=1 and eax, ecx flag got then
-ALIGN 16
-;****************************************************************************************************
-; int32_t WelsCPUSupportFMA( uint32_t eax, uint32_t ecx )
-;****************************************************************************************************
-WelsCPUSupportFMA:
- mov eax, [esp+4]
- mov ecx, [esp+8]
-
- ; refer to detection of FMA addressed in INTEL AVX manual document
- and ecx, 018001000H
- cmp ecx, 018001000H ; check OSXSAVE, AVX, FMA feature flags
- jne fma_not_supported
- ; processor supports AVX,FMA instructions and XGETBV is enabled by OS
- mov ecx, 0 ; specify 0 for XFEATURE_ENABLED_MASK register
- XGETBV ; result in EDX:EAX
- and eax, 06H
- cmp eax, 06H ; check OS has enabled both XMM and YMM state support
- jne fma_not_supported
- mov eax, 1
- ret
-fma_not_supported:
- mov eax, 0
- ret
-
-WELS_EXTERN WelsEmms
-ALIGN 16
-;******************************************************************************************
-; void WelsEmms()
-;******************************************************************************************
-WelsEmms:
- emms ; empty mmx technology states
- ret
-
-
-
--- a/processing/src/asm/denoisefilter.asm
+++ /dev/null
@@ -1,263 +1,0 @@
-;*!
-;* \copy
-;* Copyright (c) 2010-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* predenoise.asm
-;*
-;* Abstract
-;* denoise for SVC2.1
-;* History
-;* 4/13/2010 Created
-;* 7/30/2010 Modified
-;*
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-
-;***********************************************************************
-; Constant
-;***********************************************************************
-SECTION .rodata align=16
-
-sse2_32 times 8 dw 32
-sse2_20 times 8 dw 20
-
-
-BITS 32
-;***********************************************************************
-; Code
-;***********************************************************************
-SECTION .text
-
-%macro WEIGHT_LINE 9
- movq %2, %9
- punpcklbw %2, %7
- movdqa %8, %2
-
- movdqa %1, %6
- psubusb %1, %8
- psubusb %8, %6
- por %8, %1 ; ABS(curPixel - centerPixel);
-
- movdqa %1, %3
- psubusb %1, %8
-
- pmullw %1, %1
- psrlw %1, 5
- pmullw %2, %1
- paddusw %4, %1
- paddusw %5, %2
-%endmacro
-
-%macro WEIGHT_LINE1_UV 4
- movdqa %2, %1
- punpcklbw %2, %4
- paddw %3, %2
-
- movdqa %2, %1
- psrldq %2, 1
- punpcklbw %2, %4
- paddw %3, %2
-
- movdqa %2, %1
- psrldq %2, 2
- punpcklbw %2, %4
- psllw %2, 1
- paddw %3, %2
-
- movdqa %2, %1
- psrldq %2, 3
- punpcklbw %2, %4
- paddw %3, %2
-
- movdqa %2, %1
- psrldq %2, 4
- punpcklbw %2, %4
- paddw %3, %2
-%endmacro
-
-%macro WEIGHT_LINE2_UV 4
- movdqa %2, %1
- punpcklbw %2, %4
- paddw %3, %2
-
- movdqa %2, %1
- psrldq %2, 1
- punpcklbw %2, %4
- psllw %2, 1
- paddw %3, %2
-
- movdqa %2, %1
- psrldq %2, 2
- punpcklbw %2, %4
- psllw %2, 2
- paddw %3, %2
-
- movdqa %2, %1
- psrldq %2, 3
- punpcklbw %2, %4
- psllw %2, 1
- paddw %3, %2
-
- movdqa %2, %1
- psrldq %2, 4
- punpcklbw %2, %4
- paddw %3, %2
-%endmacro
-
-%macro WEIGHT_LINE3_UV 4
- movdqa %2, %1
- punpcklbw %2, %4
- psllw %2, 1
- paddw %3, %2
-
- movdqa %2, %1
- psrldq %2, 1
- punpcklbw %2, %4
- psllw %2, 2
- paddw %3, %2
-
- movdqa %2, %1
- psrldq %2, 2
- punpcklbw %2, %4
- pmullw %2, [sse2_20]
- paddw %3, %2
-
- movdqa %2, %1
- psrldq %2, 3
- punpcklbw %2, %4
- psllw %2, 2
- paddw %3, %2
-
- movdqa %2, %1
- psrldq %2, 4
- punpcklbw %2, %4
- psllw %2, 1
- paddw %3, %2
-%endmacro
-
-ALIGN 16
-WELS_EXTERN BilateralLumaFilter8_sse2
-;***********************************************************************
-; BilateralLumaFilter8_sse2(uint8_t *pixels, int stride);
-;***********************************************************************
-; 1 2 3
-; 4 0 5
-; 6 7 8
-; 0: the center point
-%define pushsize 4
-%define pixel esp + pushsize + 4
-%define stride esp + pushsize + 8
-BilateralLumaFilter8_sse2:
- push ebx
-
- pxor xmm7, xmm7
- mov eax, [pixel]
- mov ebx, eax
- movq xmm6, [eax]
- punpcklbw xmm6, xmm7
- movdqa xmm3, [sse2_32]
- pxor xmm4, xmm4 ; nTotWeight
- pxor xmm5, xmm5 ; nSum
-
- dec eax
- mov ecx, [stride]
-
- WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [eax] ; pixel 4
- WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [eax + 2] ; pixel 5
-
- sub eax, ecx
- WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [eax] ; pixel 1
- WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [eax + 1] ; pixel 2
- WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [eax + 2] ; pixel 3
-
- lea eax, [eax + ecx * 2]
- WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [eax] ; pixel 6
- WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [eax + 1] ; pixel 7
- WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [eax + 2] ; pixel 8
-
- pcmpeqw xmm0, xmm0
- psrlw xmm0, 15
- psllw xmm0, 8
- psubusw xmm0, xmm4
- pmullw xmm0, xmm6
- paddusw xmm5, xmm0
- psrlw xmm5, 8
- packuswb xmm5, xmm5
- movq [ebx], xmm5
-
- pop ebx
- ret
-
-WELS_EXTERN WaverageChromaFilter8_sse2
-;***********************************************************************
-; void WaverageChromaFilter8_sse2(uint8_t *pixels, int stride);
-;***********************************************************************
-;5x5 filter:
-;1 1 2 1 1
-;1 2 4 2 1
-;2 4 20 4 2
-;1 2 4 2 1
-;1 1 2 1 1
-
-ALIGN 16
-WaverageChromaFilter8_sse2:
- mov edx, [esp + 4] ; pixels
- mov ecx, [esp + 8] ; stride
-
- mov eax, ecx
- add eax, eax
- sub edx, eax ; pixels - 2 * stride
- sub edx, 2
-
- pxor xmm0, xmm0
- pxor xmm3, xmm3
-
- movdqu xmm1, [edx]
- WEIGHT_LINE1_UV xmm1, xmm2, xmm3, xmm0
-
- movdqu xmm1, [edx + ecx]
- WEIGHT_LINE2_UV xmm1, xmm2, xmm3, xmm0
-
- add edx, eax
- movdqu xmm1, [edx]
- WEIGHT_LINE3_UV xmm1, xmm2, xmm3, xmm0
-
- movdqu xmm1, [edx + ecx]
- WEIGHT_LINE2_UV xmm1, xmm2, xmm3, xmm0
-
- movdqu xmm1, [edx + ecx * 2]
- WEIGHT_LINE1_UV xmm1, xmm2, xmm3, xmm0
-
- psrlw xmm3, 6
- packuswb xmm3, xmm3
- movq [edx + 2], xmm3
-
- ret
\ No newline at end of file
--- a/processing/src/asm/downsample_bilinear.asm
+++ /dev/null
@@ -1,1225 +1,0 @@
-;*!
-;* \copy
-;* Copyright (c) 2009-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* upsampling.asm
-;*
-;* Abstract
-;* SIMD for pixel domain down sampling
-;*
-;* History
-;* 10/22/2009 Created
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-BITS 32
-
-;***********************************************************************
-; Macros and other preprocessor constants
-;***********************************************************************
-
-
-;***********************************************************************
-; Some constants
-;***********************************************************************
-
-;***********************************************************************
-; Local Data (Read Only)
-;***********************************************************************
-
-SECTION .rodata align=16
-
-;***********************************************************************
-; Various memory constants (trigonometric values or rounding values)
-;***********************************************************************
-
-ALIGN 16
-shufb_mask_low:
- db 00h, 80h, 02h, 80h, 04h, 80h, 06h, 80h, 08h, 80h, 0ah, 80h, 0ch, 80h, 0eh, 80h
-shufb_mask_high:
- db 01h, 80h, 03h, 80h, 05h, 80h, 07h, 80h, 09h, 80h, 0bh, 80h, 0dh, 80h, 0fh, 80h
-
-
-ALIGN 16
-
-;***********************************************************************
-; Code
-;***********************************************************************
-
-SECTION .text
-
-WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse
-;***********************************************************************
-; void DyadicBilinearDownsamplerWidthx32_sse( unsigned char* pDst, const int iDstStride,
-; unsigned char* pSrc, const int iSrcStride,
-; const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-ALIGN 16
-DyadicBilinearDownsamplerWidthx32_sse:
- push ebx
- push edx
- push esi
- push edi
- push ebp
-
- mov edi, [esp+24] ; pDst
- mov edx, [esp+28] ; iDstStride
- mov esi, [esp+32] ; pSrc
- mov ecx, [esp+36] ; iSrcStride
- mov ebp, [esp+44] ; iSrcHeight
-
- sar ebp, $1 ; iSrcHeight >> 1
-
-.yloops:
- mov eax, [esp+40] ; iSrcWidth
- sar eax, $1 ; iSrcWidth >> 1
- mov ebx, eax ; iDstWidth restored at ebx
- sar eax, $4 ; (iSrcWidth >> 1) / 16 ; loop count = num_of_mb
- neg ebx ; - (iSrcWidth >> 1)
- ; each loop = source bandwidth: 32 bytes
-.xloops:
- ; 1st part horizonal loop: x16 bytes
- ; mem hi<- ->lo
- ;1st Line Src: mm0: d D c C b B a A mm1: h H g G f F e E
- ;2nd Line Src: mm2: l L k K j J i I mm3: p P o O n N m M
- ;=> target:
- ;: H G F E D C B A, P O N M L K J I
- ;: h g f e d c b a, p o n m l k j i
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- movq mm0, [esi] ; 1st pSrc line
- movq mm1, [esi+8] ; 1st pSrc line + 8
- movq mm2, [esi+ecx] ; 2nd pSrc line
- movq mm3, [esi+ecx+8] ; 2nd pSrc line + 8
-
- ; to handle mm0, mm1, mm2, mm3
- pshufw mm4, mm0, 0d8h ; d D b B c C a A ; 11011000 B
- pshufw mm5, mm4, 04eh ; c C a A d D b B ; 01001110 B
- punpcklbw mm4, mm5 ; d c D C b a B A
- pshufw mm4, mm4, 0d8h ; d c b a D C B A ; 11011000 B: mm4
-
- pshufw mm5, mm1, 0d8h ; h H f F g G e E ; 11011000 B
- pshufw mm6, mm5, 04eh ; g G e E h H f F ; 01001110 B
- punpcklbw mm5, mm6 ; h g H G f e F E
- pshufw mm5, mm5, 0d8h ; h g f e H G F E ; 11011000 B: mm5
-
- pshufw mm6, mm2, 0d8h ; l L j J k K i I ; 11011000 B
- pshufw mm7, mm6, 04eh ; k K i I l L j J ; 01001110 B
- punpcklbw mm6, mm7 ; l k L K j i J I
- pshufw mm6, mm6, 0d8h ; l k j i L K J I ; 11011000 B: mm6
-
- pshufw mm7, mm3, 0d8h ; p P n N o O m M ; 11011000 B
- pshufw mm0, mm7, 04eh ; o O m M p P n N ; 01001110 B
- punpcklbw mm7, mm0 ; p o P O n m N M
- pshufw mm7, mm7, 0d8h ; p o n m P O N M ; 11011000 B: mm7
-
- ; to handle mm4, mm5, mm6, mm7
- movq mm0, mm4 ;
- punpckldq mm0, mm5 ; H G F E D C B A
- punpckhdq mm4, mm5 ; h g f e d c b a
-
- movq mm1, mm6
- punpckldq mm1, mm7 ; P O N M L K J I
- punpckhdq mm6, mm7 ; p o n m l k j i
-
- ; avg within MB horizon width (16 x 2 lines)
- pavgb mm0, mm4 ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
- pavgb mm1, mm6 ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
- pavgb mm0, mm1 ; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once
-
- ; 2nd part horizonal loop: x16 bytes
- ; mem hi<- ->lo
- ;1st Line Src: mm0: d D c C b B a A mm1: h H g G f F e E
- ;2nd Line Src: mm2: l L k K j J i I mm3: p P o O n N m M
- ;=> target:
- ;: H G F E D C B A, P O N M L K J I
- ;: h g f e d c b a, p o n m l k j i
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- movq mm1, [esi+16] ; 1st pSrc line + 16
- movq mm2, [esi+24] ; 1st pSrc line + 24
- movq mm3, [esi+ecx+16] ; 2nd pSrc line + 16
- movq mm4, [esi+ecx+24] ; 2nd pSrc line + 24
-
- ; to handle mm1, mm2, mm3, mm4
- pshufw mm5, mm1, 0d8h ; d D b B c C a A ; 11011000 B
- pshufw mm6, mm5, 04eh ; c C a A d D b B ; 01001110 B
- punpcklbw mm5, mm6 ; d c D C b a B A
- pshufw mm5, mm5, 0d8h ; d c b a D C B A ; 11011000 B: mm5
-
- pshufw mm6, mm2, 0d8h ; h H f F g G e E ; 11011000 B
- pshufw mm7, mm6, 04eh ; g G e E h H f F ; 01001110 B
- punpcklbw mm6, mm7 ; h g H G f e F E
- pshufw mm6, mm6, 0d8h ; h g f e H G F E ; 11011000 B: mm6
-
- pshufw mm7, mm3, 0d8h ; l L j J k K i I ; 11011000 B
- pshufw mm1, mm7, 04eh ; k K i I l L j J ; 01001110 B
- punpcklbw mm7, mm1 ; l k L K j i J I
- pshufw mm7, mm7, 0d8h ; l k j i L K J I ; 11011000 B: mm7
-
- pshufw mm1, mm4, 0d8h ; p P n N o O m M ; 11011000 B
- pshufw mm2, mm1, 04eh ; o O m M p P n N ; 01001110 B
- punpcklbw mm1, mm2 ; p o P O n m N M
- pshufw mm1, mm1, 0d8h ; p o n m P O N M ; 11011000 B: mm1
-
- ; to handle mm5, mm6, mm7, mm1
- movq mm2, mm5
- punpckldq mm2, mm6 ; H G F E D C B A
- punpckhdq mm5, mm6 ; h g f e d c b a
-
- movq mm3, mm7
- punpckldq mm3, mm1 ; P O N M L K J I
- punpckhdq mm7, mm1 ; p o n m l k j i
-
- ; avg within MB horizon width (16 x 2 lines)
- pavgb mm2, mm5 ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
- pavgb mm3, mm7 ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
- pavgb mm2, mm3 ; (temp_row1+temp_row2+1)>>1, done in another 2nd horizonal part
-
- movq [edi ], mm0
- movq [edi+8], mm2
-
- ; next SMB
- lea esi, [esi+32]
- lea edi, [edi+16]
-
- dec eax
- jg near .xloops
-
- ; next line
- lea esi, [esi+2*ecx] ; next end of lines
- lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
- lea edi, [edi+edx]
- lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
-
- dec ebp
- jg near .yloops
-
- WELSEMMS
- pop ebp
- pop edi
- pop esi
- pop edx
- pop ebx
- ret
-
-WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse
-;***********************************************************************
-; void DyadicBilinearDownsamplerWidthx16_sse( unsigned char* pDst, const int iDstStride,
-; unsigned char* pSrc, const int iSrcStride,
-; const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-ALIGN 16
-DyadicBilinearDownsamplerWidthx16_sse:
- push ebx
- push edx
- push esi
- push edi
- push ebp
-
- mov edi, [esp+24] ; pDst
- mov edx, [esp+28] ; iDstStride
- mov esi, [esp+32] ; pSrc
- mov ecx, [esp+36] ; iSrcStride
- mov ebp, [esp+44] ; iSrcHeight
-
- sar ebp, $1 ; iSrcHeight >> 1
-
-.yloops:
- mov eax, [esp+40] ; iSrcWidth
- sar eax, $1 ; iSrcWidth >> 1
- mov ebx, eax ; iDstWidth restored at ebx
- sar eax, $3 ; (iSrcWidth >> 1) / 8 ; loop count = num_of_mb
- neg ebx ; - (iSrcWidth >> 1)
- ; each loop = source bandwidth: 16 bytes
-.xloops:
- ; 1st part horizonal loop: x16 bytes
- ; mem hi<- ->lo
- ;1st Line Src: mm0: d D c C b B a A mm1: h H g G f F e E
- ;2nd Line Src: mm2: l L k K j J i I mm3: p P o O n N m M
- ;=> target:
- ;: H G F E D C B A, P O N M L K J I
- ;: h g f e d c b a, p o n m l k j i
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- movq mm0, [esi] ; 1st pSrc line
- movq mm1, [esi+8] ; 1st pSrc line + 8
- movq mm2, [esi+ecx] ; 2nd pSrc line
- movq mm3, [esi+ecx+8] ; 2nd pSrc line + 8
-
- ; to handle mm0, mm1, mm2, mm3
- pshufw mm4, mm0, 0d8h ; d D b B c C a A ; 11011000 B
- pshufw mm5, mm4, 04eh ; c C a A d D b B ; 01001110 B
- punpcklbw mm4, mm5 ; d c D C b a B A
- pshufw mm4, mm4, 0d8h ; d c b a D C B A ; 11011000 B: mm4
-
- pshufw mm5, mm1, 0d8h ; h H f F g G e E ; 11011000 B
- pshufw mm6, mm5, 04eh ; g G e E h H f F ; 01001110 B
- punpcklbw mm5, mm6 ; h g H G f e F E
- pshufw mm5, mm5, 0d8h ; h g f e H G F E ; 11011000 B: mm5
-
- pshufw mm6, mm2, 0d8h ; l L j J k K i I ; 11011000 B
- pshufw mm7, mm6, 04eh ; k K i I l L j J ; 01001110 B
- punpcklbw mm6, mm7 ; l k L K j i J I
- pshufw mm6, mm6, 0d8h ; l k j i L K J I ; 11011000 B: mm6
-
- pshufw mm7, mm3, 0d8h ; p P n N o O m M ; 11011000 B
- pshufw mm0, mm7, 04eh ; o O m M p P n N ; 01001110 B
- punpcklbw mm7, mm0 ; p o P O n m N M
- pshufw mm7, mm7, 0d8h ; p o n m P O N M ; 11011000 B: mm7
-
- ; to handle mm4, mm5, mm6, mm7
- movq mm0, mm4 ;
- punpckldq mm0, mm5 ; H G F E D C B A
- punpckhdq mm4, mm5 ; h g f e d c b a
-
- movq mm1, mm6
- punpckldq mm1, mm7 ; P O N M L K J I
- punpckhdq mm6, mm7 ; p o n m l k j i
-
- ; avg within MB horizon width (16 x 2 lines)
- pavgb mm0, mm4 ; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
- pavgb mm1, mm6 ; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
- pavgb mm0, mm1 ; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once
-
- movq [edi ], mm0
-
- ; next SMB
- lea esi, [esi+16]
- lea edi, [edi+8]
-
- dec eax
- jg near .xloops
-
- ; next line
- lea esi, [esi+2*ecx] ; next end of lines
- lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
- lea edi, [edi+edx]
- lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
-
- dec ebp
- jg near .yloops
-
- WELSEMMS
- pop ebp
- pop edi
- pop esi
- pop edx
- pop ebx
- ret
-
-WELS_EXTERN DyadicBilinearDownsamplerWidthx8_sse
-;***********************************************************************
-; void DyadicBilinearDownsamplerWidthx8_sse( unsigned char* pDst, const int iDstStride,
-; unsigned char* pSrc, const int iSrcStride,
-; const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-ALIGN 16
-DyadicBilinearDownsamplerWidthx8_sse:
- push ebx
- push edx
- push esi
- push edi
- push ebp
-
- mov edi, [esp+24] ; pDst
- mov edx, [esp+28] ; iDstStride
- mov esi, [esp+32] ; pSrc
- mov ecx, [esp+36] ; iSrcStride
- mov ebp, [esp+44] ; iSrcHeight
-
- sar ebp, $1 ; iSrcHeight >> 1
-
-.yloops:
- mov eax, [esp+40] ; iSrcWidth
- sar eax, $1 ; iSrcWidth >> 1
- mov ebx, eax ; iDstWidth restored at ebx
- sar eax, $2 ; (iSrcWidth >> 1) / 4 ; loop count = num_of_mb
- neg ebx ; - (iSrcWidth >> 1)
- ; each loop = source bandwidth: 8 bytes
-.xloops:
- ; 1st part horizonal loop: x8 bytes
- ; mem hi<- ->lo
- ;1st Line Src: mm0: d D c C b B a A
- ;2nd Line Src: mm1: h H g G f F e E
- ;=> target:
- ;: H G F E D C B A
- ;: h g f e d c b a
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- movq mm0, [esi] ; 1st pSrc line
- movq mm1, [esi+ecx] ; 2nd pSrc line
-
- ; to handle mm0, mm1, mm2, mm3
- pshufw mm2, mm0, 0d8h ; d D b B c C a A ; 11011000 B
- pshufw mm3, mm2, 04eh ; c C a A d D b B ; 01001110 B
- punpcklbw mm2, mm3 ; d c D C b a B A
- pshufw mm2, mm2, 0d8h ; d c b a D C B A ; 11011000 B: mm4
-
- pshufw mm4, mm1, 0d8h ; h H f F g G e E ; 11011000 B
- pshufw mm5, mm4, 04eh ; g G e E h H f F ; 01001110 B
- punpcklbw mm4, mm5 ; h g H G f e F E
- pshufw mm4, mm4, 0d8h ; h g f e H G F E ; 11011000 B: mm5
-
- ; to handle mm2, mm4
- movq mm0, mm2 ;
- punpckldq mm0, mm4 ; H G F E D C B A
- punpckhdq mm2, mm4 ; h g f e d c b a
-
- ; avg within MB horizon width (16 x 2 lines)
- pavgb mm0, mm2 ; (H+h+1)>>1, .., (A+a+1)>>1, temp_row1, 2
- pshufw mm1, mm0, 04eh ; 01001110 B
- pavgb mm0, mm1 ; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once
-
- movd [edi], mm0
-
- ; next unit
- lea esi, [esi+8]
- lea edi, [edi+4]
-
- dec eax
- jg near .xloops
-
- ; next line
- lea esi, [esi+2*ecx] ; next end of lines
- lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
- lea edi, [edi+edx]
- lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
-
- dec ebp
- jg near .yloops
-
- WELSEMMS
- pop ebp
- pop edi
- pop esi
- pop edx
- pop ebx
- ret
-
-
-
-; got about 50% improvement over DyadicBilinearDownsamplerWidthx32_sse
-WELS_EXTERN DyadicBilinearDownsamplerWidthx32_ssse3
-;***********************************************************************
-; void DyadicBilinearDownsamplerWidthx32_ssse3( unsigned char* pDst, const int iDstStride,
-; unsigned char* pSrc, const int iSrcStride,
-; const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-ALIGN 16
-DyadicBilinearDownsamplerWidthx32_ssse3:
- push ebx
- push edx
- push esi
- push edi
- push ebp
-
- mov edi, [esp+24] ; pDst
- mov edx, [esp+28] ; iDstStride
- mov esi, [esp+32] ; pSrc
- mov ecx, [esp+36] ; iSrcStride
- mov ebp, [esp+44] ; iSrcHeight
-
- sar ebp, $1 ; iSrcHeight >> 1
-
- movdqa xmm7, [shufb_mask_low] ; mask low
- movdqa xmm6, [shufb_mask_high] ; mask high
-
-.yloops:
- mov eax, [esp+40] ; iSrcWidth
- sar eax, $1 ; iSrcWidth >> 1
- mov ebx, eax ; iDstWidth restored at ebx
- sar eax, $4 ; (iSrcWidth >> 1) / 16 ; loop count = num_of_mb
- neg ebx ; - (iSrcWidth >> 1)
- ; each loop = source bandwidth: 32 bytes
-.xloops:
- ; 1st part horizonal loop: x16 bytes
- ; mem hi<- ->lo
- ;1st Line Src: xmm0: h H g G f F e E d D c C b B a A
- ; xmm1: p P o O n N m M l L k K j J i I
- ;2nd Line Src: xmm2: h H g G f F e E d D c C b B a A
- ; xmm3: p P o O n N m M l L k K j J i I
- ;=> target:
- ;: P O N M L K J I H G F E D C B A
- ;: p o n m l k j i h g f e d c b a
- ;: P .. A
- ;: p .. a
-
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- movdqa xmm0, [esi] ; 1st_src_line
- movdqa xmm1, [esi+16] ; 1st_src_line + 16
- movdqa xmm2, [esi+ecx] ; 2nd_src_line
- movdqa xmm3, [esi+ecx+16] ; 2nd_src_line + 16
-
- ; packing & avg
- movdqa xmm4, xmm0 ; h H g G f F e E d D c C b B a A
- pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
- pshufb xmm4, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
- ; another implementation for xmm4 high bits
-; psubb xmm4, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-; psrlw xmm4, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
- pavgb xmm0, xmm4
-
- movdqa xmm5, xmm1
- pshufb xmm1, xmm7
- pshufb xmm5, xmm6
-; psubb xmm5, xmm1
-; psrlw xmm5, 8
- pavgb xmm1, xmm5
-
- movdqa xmm4, xmm2
- pshufb xmm2, xmm7
- pshufb xmm4, xmm6
-; psubb xmm4, xmm2
-; psrlw xmm4, 8
- pavgb xmm2, xmm4
-
- movdqa xmm5, xmm3
- pshufb xmm3, xmm7
- pshufb xmm5, xmm6
-; psubb xmm5, xmm3
-; psrlw xmm5, 8
- pavgb xmm3, xmm5
-
- packuswb xmm0, xmm1
- packuswb xmm2, xmm3
- pavgb xmm0, xmm2
-
- ; write pDst
- movdqa [edi], xmm0
-
- ; next SMB
- lea esi, [esi+32]
- lea edi, [edi+16]
-
- dec eax
- jg near .xloops
-
- ; next line
- lea esi, [esi+2*ecx] ; next end of lines
- lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
- lea edi, [edi+edx]
- lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
-
- dec ebp
- jg near .yloops
-
- pop ebp
- pop edi
- pop esi
- pop edx
- pop ebx
- ret
-
-WELS_EXTERN DyadicBilinearDownsamplerWidthx16_ssse3
-;***********************************************************************
-; void DyadicBilinearDownsamplerWidthx16_ssse3( unsigned char* pDst, const int iDstStride,
-; unsigned char* pSrc, const int iSrcStride,
-; const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-ALIGN 16
-DyadicBilinearDownsamplerWidthx16_ssse3:
- push ebx
- push edx
- push esi
- push edi
- push ebp
-
- mov edi, [esp+24] ; pDst
- mov edx, [esp+28] ; iDstStride
- mov esi, [esp+32] ; pSrc
- mov ecx, [esp+36] ; iSrcStride
- mov ebp, [esp+44] ; iSrcHeight
-
- sar ebp, $1 ; iSrcHeight >> 1
- movdqa xmm7, [shufb_mask_low] ; mask low
- movdqa xmm6, [shufb_mask_high] ; mask high
-
-.yloops:
- mov eax, [esp+40] ; iSrcWidth
- sar eax, $1 ; iSrcWidth >> 1
- mov ebx, eax ; iDstWidth restored at ebx
- sar eax, $3 ; (iSrcWidth >> 1) / 8 ; loop count = num_of_mb
- neg ebx ; - (iSrcWidth >> 1)
- ; each loop = source bandwidth: 16 bytes
-.xloops:
- ; horizonal loop: x16 bytes by source
- ; mem hi<- ->lo
- ;1st line pSrc: xmm0: h H g G f F e E d D c C b B a A
- ;2nd line pSrc: xmm1: p P o O n N m M l L k K j J i I
- ;=> target:
- ;: H G F E D C B A, P O N M L K J I
- ;: h g f e d c b a, p o n m l k j i
-
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- movdqa xmm0, [esi] ; 1st_src_line
- movdqa xmm1, [esi+ecx] ; 2nd_src_line
-
- ; packing & avg
- movdqa xmm2, xmm0 ; h H g G f F e E d D c C b B a A
- pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
- pshufb xmm2, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
- ; another implementation for xmm2 high bits
-; psubb xmm2, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-; psrlw xmm2, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
- pavgb xmm0, xmm2
-
- movdqa xmm3, xmm1
- pshufb xmm1, xmm7
- pshufb xmm3, xmm6
-; psubb xmm3, xmm1
-; psrlw xmm3, 8
- pavgb xmm1, xmm3
-
- pavgb xmm0, xmm1
- packuswb xmm0, xmm1
-
- ; write pDst
- movq [edi], xmm0
-
- ; next SMB
- lea esi, [esi+16]
- lea edi, [edi+8]
-
- dec eax
- jg near .xloops
-
- ; next line
- lea esi, [esi+2*ecx] ; next end of lines
- lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
- lea edi, [edi+edx]
- lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
-
- dec ebp
- jg near .yloops
-
- pop ebp
- pop edi
- pop esi
- pop edx
- pop ebx
- ret
-
-; got about 65% improvement over DyadicBilinearDownsamplerWidthx32_sse
-WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse4
-;***********************************************************************
-; void DyadicBilinearDownsamplerWidthx32_sse4( unsigned char* pDst, const int iDstStride,
-; unsigned char* pSrc, const int iSrcStride,
-; const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-ALIGN 16
-DyadicBilinearDownsamplerWidthx32_sse4:
- push ebx
- push edx
- push esi
- push edi
- push ebp
-
- mov edi, [esp+24] ; pDst
- mov edx, [esp+28] ; iDstStride
- mov esi, [esp+32] ; pSrc
- mov ecx, [esp+36] ; iSrcStride
- mov ebp, [esp+44] ; iSrcHeight
-
- sar ebp, $1 ; iSrcHeight >> 1
-
- movdqa xmm7, [shufb_mask_low] ; mask low
- movdqa xmm6, [shufb_mask_high] ; mask high
-
-.yloops:
- mov eax, [esp+40] ; iSrcWidth
- sar eax, $1 ; iSrcWidth >> 1
- mov ebx, eax ; iDstWidth restored at ebx
- sar eax, $4 ; (iSrcWidth >> 1) / 16 ; loop count = num_of_mb
- neg ebx ; - (iSrcWidth >> 1)
- ; each loop = source bandwidth: 32 bytes
-.xloops:
- ; 1st part horizonal loop: x16 bytes
- ; mem hi<- ->lo
- ;1st Line Src: xmm0: h H g G f F e E d D c C b B a A
- ; xmm1: p P o O n N m M l L k K j J i I
- ;2nd Line Src: xmm2: h H g G f F e E d D c C b B a A
- ; xmm3: p P o O n N m M l L k K j J i I
- ;=> target:
- ;: P O N M L K J I H G F E D C B A
- ;: p o n m l k j i h g f e d c b a
- ;: P .. A
- ;: p .. a
-
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- movntdqa xmm0, [esi] ; 1st_src_line
- movntdqa xmm1, [esi+16] ; 1st_src_line + 16
- movntdqa xmm2, [esi+ecx] ; 2nd_src_line
- movntdqa xmm3, [esi+ecx+16] ; 2nd_src_line + 16
-
- ; packing & avg
- movdqa xmm4, xmm0 ; h H g G f F e E d D c C b B a A
- pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
- pshufb xmm4, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-; psubb xmm4, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-; psrlw xmm4, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
- pavgb xmm0, xmm4
-
- movdqa xmm5, xmm1
- pshufb xmm1, xmm7
- pshufb xmm5, xmm6
-; psubb xmm5, xmm1
-; psrlw xmm5, 8
- pavgb xmm1, xmm5
-
- movdqa xmm4, xmm2
- pshufb xmm2, xmm7
- pshufb xmm4, xmm6
-; psubb xmm4, xmm2
-; psrlw xmm4, 8
- pavgb xmm2, xmm4
-
- movdqa xmm5, xmm3
- pshufb xmm3, xmm7
- pshufb xmm5, xmm6
-; psubb xmm5, xmm3
-; psrlw xmm5, 8
- pavgb xmm3, xmm5
-
- packuswb xmm0, xmm1
- packuswb xmm2, xmm3
- pavgb xmm0, xmm2
-
- ; write pDst
- movdqa [edi], xmm0
-
- ; next SMB
- lea esi, [esi+32]
- lea edi, [edi+16]
-
- dec eax
- jg near .xloops
-
- ; next line
- lea esi, [esi+2*ecx] ; next end of lines
- lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
- lea edi, [edi+edx]
- lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
-
- dec ebp
- jg near .yloops
-
- pop ebp
- pop edi
- pop esi
- pop edx
- pop ebx
- ret
-
-WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse4
-;***********************************************************************
-; void DyadicBilinearDownsamplerWidthx16_sse4( unsigned char* pDst, const int iDstStride,
-; unsigned char* pSrc, const int iSrcStride,
-; const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-ALIGN 16
-DyadicBilinearDownsamplerWidthx16_sse4:
- push ebx
- push edx
- push esi
- push edi
- push ebp
-
- mov edi, [esp+24] ; pDst
- mov edx, [esp+28] ; iDstStride
- mov esi, [esp+32] ; pSrc
- mov ecx, [esp+36] ; iSrcStride
- mov ebp, [esp+44] ; iSrcHeight
-
- sar ebp, $1 ; iSrcHeight >> 1
- movdqa xmm7, [shufb_mask_low] ; mask low
- movdqa xmm6, [shufb_mask_high] ; mask high
-
-.yloops:
- mov eax, [esp+40] ; iSrcWidth
- sar eax, $1 ; iSrcWidth >> 1
- mov ebx, eax ; iDstWidth restored at ebx
- sar eax, $3 ; (iSrcWidth >> 1) / 8 ; loop count = num_of_mb
- neg ebx ; - (iSrcWidth >> 1)
- ; each loop = source bandwidth: 16 bytes
-.xloops:
- ; horizonal loop: x16 bytes by source
- ; mem hi<- ->lo
- ;1st line pSrc: xmm0: h H g G f F e E d D c C b B a A
- ;2nd line pSrc: xmm1: p P o O n N m M l L k K j J i I
- ;=> target:
- ;: H G F E D C B A, P O N M L K J I
- ;: h g f e d c b a, p o n m l k j i
-
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- movntdqa xmm0, [esi] ; 1st_src_line
- movntdqa xmm1, [esi+ecx] ; 2nd_src_line
-
- ; packing & avg
- movdqa xmm2, xmm0 ; h H g G f F e E d D c C b B a A
- pshufb xmm0, xmm7 ; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
- pshufb xmm2, xmm6 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-; psubb xmm2, xmm0 ; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-; psrlw xmm2, 8 ; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
- pavgb xmm0, xmm2
-
- movdqa xmm3, xmm1
- pshufb xmm1, xmm7
- pshufb xmm3, xmm6
-; psubb xmm3, xmm1
-; psrlw xmm3, 8
- pavgb xmm1, xmm3
-
- pavgb xmm0, xmm1
- packuswb xmm0, xmm1
-
- ; write pDst
- movq [edi], xmm0
-
- ; next SMB
- lea esi, [esi+16]
- lea edi, [edi+8]
-
- dec eax
- jg near .xloops
-
- ; next line
- lea esi, [esi+2*ecx] ; next end of lines
- lea esi, [esi+2*ebx] ; reset to base 0 [- 2 * iDstWidth]
- lea edi, [edi+edx]
- lea edi, [edi+ebx] ; reset to base 0 [- iDstWidth]
-
- dec ebp
- jg near .yloops
-
- pop ebp
- pop edi
- pop esi
- pop edx
- pop ebx
- ret
-
-
-
-
-
-WELS_EXTERN GeneralBilinearAccurateDownsampler_sse2
-;**************************************************************************************************************
-;int GeneralBilinearAccurateDownsampler_sse2( unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
-; unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,
-; unsigned int uiScaleX, unsigned int uiScaleY );
-;{
-;**************************************************************************************************************
-
-ALIGN 16
-GeneralBilinearAccurateDownsampler_sse2:
- push ebp
- push esi
- push edi
- push ebx
-%define pushsize 16
-%define localsize 28
-%define pDstData esp + pushsize + localsize + 4
-%define dwDstStride esp + pushsize + localsize + 8
-%define dwDstWidth esp + pushsize + localsize + 12
-%define dwDstHeight esp + pushsize + localsize + 16
-%define pSrcData esp + pushsize + localsize + 20
-%define dwSrcStride esp + pushsize + localsize + 24
-%define dwSrcWidth esp + pushsize + localsize + 28
-%define dwSrcHeight esp + pushsize + localsize + 32
-%define scale esp + 0
-%define uiScaleX esp + pushsize + localsize + 36
-%define uiScaleY esp + pushsize + localsize + 40
-%define tmpHeight esp + 12
-%define yInverse esp + 16
-%define xInverse esp + 20
-%define dstStep esp + 24
- sub esp, localsize
-
- pxor xmm0, xmm0
- mov edx, 32767
- mov eax, [uiScaleX]
- and eax, 32767
- mov ebx, eax
- neg ebx
- and ebx, 32767
- movd xmm1, eax ; uinc(uiScaleX mod 32767)
- movd xmm2, ebx ; -uinc
- psllq xmm1, 32
- por xmm1, xmm2 ; 0 0 uinc -uinc (dword)
- pshufd xmm7, xmm1, 01000100b ; xmm7: uinc -uinc uinc -uinc
-
- mov eax, [uiScaleY]
- and eax, 32767
- mov ebx, eax
- neg ebx
- and ebx, 32767
- movd xmm6, eax ; vinc(uiScaleY mod 32767)
- movd xmm2, ebx ; -vinc
- psllq xmm6, 32
- por xmm6, xmm2 ; 0 0 vinc -vinc (dword)
- pshufd xmm6, xmm6, 01010000b ; xmm6: vinc vinc -vinc -vinc
-
- mov edx, 40003fffh
- movd xmm5, edx
- punpcklwd xmm5, xmm0 ; 16384 16383
- pshufd xmm5, xmm5, 01000100b ; xmm5: 16384 16383 16384 16383
-
-
-DOWNSAMPLE:
-
- mov eax, [dwDstHeight]
- mov edi, [pDstData]
- mov edx, [dwDstStride]
- mov ecx, [dwDstWidth]
- sub edx, ecx
- mov [dstStep], edx ; stride - width
- dec eax
- mov [tmpHeight], eax
- mov eax, 16384
- mov [yInverse], eax
-
- pshufd xmm4, xmm5, 01010000b ; initial v to 16384 16384 16383 16383
-
-HEIGHT:
- mov eax, [yInverse]
- mov esi, [pSrcData]
- shr eax, 15
- mul dword [dwSrcStride]
- add esi, eax ; get current row address
- mov ebp, esi
- add ebp, [dwSrcStride]
-
- mov eax, 16384
- mov [xInverse], eax
- mov ecx, [dwDstWidth]
- dec ecx
-
- movdqa xmm3, xmm5 ; initial u to 16384 16383 16384 16383
-
-WIDTH:
- mov eax, [xInverse]
- shr eax, 15
-
- movd xmm1, [esi+eax] ; xxxxxxba
- movd xmm2, [ebp+eax] ; xxxxxxdc
- pxor xmm0, xmm0
- punpcklwd xmm1, xmm2 ; xxxxdcba
- punpcklbw xmm1, xmm0 ; 0d0c0b0a
- punpcklwd xmm1, xmm0 ; 000d000c000b000a
-
- movdqa xmm2, xmm4 ; xmm2: vv(1-v)(1-v) tmpv
- pmaddwd xmm2, xmm3 ; mul u(1-u)u(1-u) on xmm2
- movdqa xmm0, xmm2
- pmuludq xmm2, xmm1
- psrlq xmm0, 32
- psrlq xmm1, 32
- pmuludq xmm0, xmm1
- paddq xmm2, xmm0
- pshufd xmm1, xmm2, 00001110b
- paddq xmm2, xmm1
- psrlq xmm2, 29
-
- movd eax, xmm2
- inc eax
- shr eax, 1
- mov [edi], al
- inc edi
-
- mov eax, [uiScaleX]
- add [xInverse], eax
-
- paddw xmm3, xmm7 ; inc u
- psllw xmm3, 1
- psrlw xmm3, 1
-
- loop WIDTH
-
-WIDTH_END:
- mov eax, [xInverse]
- shr eax, 15
- mov cl, [esi+eax]
- mov [edi], cl
- inc edi
-
- mov eax, [uiScaleY]
- add [yInverse], eax
- add edi, [dstStep]
-
- paddw xmm4, xmm6 ; inc v
- psllw xmm4, 1
- psrlw xmm4, 1
-
- dec dword [tmpHeight]
- jg HEIGHT
-
-
-LAST_ROW:
- mov eax, [yInverse]
- mov esi, [pSrcData]
- shr eax, 15
- mul dword [dwSrcStride]
- add esi, eax ; get current row address
-
- mov eax, 16384
- mov [xInverse], eax
- mov ecx, [dwDstWidth]
-
-LAST_ROW_WIDTH:
- mov eax, [xInverse]
- shr eax, 15
-
- mov al, [esi+eax]
- mov [edi], al
- inc edi
-
- mov eax, [uiScaleX]
- add [xInverse], eax
-
- loop LAST_ROW_WIDTH
-
-LAST_ROW_END:
-
- add esp, localsize
- pop ebx
- pop edi
- pop esi
- pop ebp
-%undef pushsize
-%undef localsize
-%undef pSrcData
-%undef dwSrcWidth
-%undef dwSrcHeight
-%undef dwSrcStride
-%undef pDstData
-%undef dwDstWidth
-%undef dwDstHeight
-%undef dwDstStride
-%undef scale
-%undef uiScaleX
-%undef uiScaleY
-%undef tmpHeight
-%undef yInverse
-%undef xInverse
-%undef dstStep
- ret
-
-
-
-
-WELS_EXTERN GeneralBilinearFastDownsampler_sse2
-;**************************************************************************************************************
-;int GeneralBilinearFastDownsampler_sse2( unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
-; unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,
-; unsigned int uiScaleX, unsigned int uiScaleY );
-;{
-;**************************************************************************************************************
-
-ALIGN 16
-GeneralBilinearFastDownsampler_sse2:
- push ebp
- push esi
- push edi
- push ebx
-%define pushsize 16
-%define localsize 28
-%define pDstData esp + pushsize + localsize + 4
-%define dwDstStride esp + pushsize + localsize + 8
-%define dwDstWidth esp + pushsize + localsize + 12
-%define dwDstHeight esp + pushsize + localsize + 16
-%define pSrcData esp + pushsize + localsize + 20
-%define dwSrcStride esp + pushsize + localsize + 24
-%define dwSrcWidth esp + pushsize + localsize + 28
-%define dwSrcHeight esp + pushsize + localsize + 32
-%define scale esp + 0
-%define uiScaleX esp + pushsize + localsize + 36
-%define uiScaleY esp + pushsize + localsize + 40
-%define tmpHeight esp + 12
-%define yInverse esp + 16
-%define xInverse esp + 20
-%define dstStep esp + 24
- sub esp, localsize
-
- pxor xmm0, xmm0
- mov edx, 65535
- mov eax, [uiScaleX]
- and eax, edx
- mov ebx, eax
- neg ebx
- and ebx, 65535
- movd xmm1, eax ; uinc(uiScaleX mod 65536)
- movd xmm2, ebx ; -uinc
- psllq xmm1, 32
- por xmm1, xmm2 ; 0 uinc 0 -uinc
- pshuflw xmm7, xmm1, 10001000b ; xmm7: uinc -uinc uinc -uinc
-
- mov eax, [uiScaleY]
- and eax, 32767
- mov ebx, eax
- neg ebx
- and ebx, 32767
- movd xmm6, eax ; vinc(uiScaleY mod 32767)
- movd xmm2, ebx ; -vinc
- psllq xmm6, 32
- por xmm6, xmm2 ; 0 vinc 0 -vinc
- pshuflw xmm6, xmm6, 10100000b ; xmm6: vinc vinc -vinc -vinc
-
- mov edx, 80007fffh ; 32768 32767
- movd xmm5, edx
- pshuflw xmm5, xmm5, 01000100b ; 32768 32767 32768 32767
- mov ebx, 16384
-
-
-FAST_DOWNSAMPLE:
-
- mov eax, [dwDstHeight]
- mov edi, [pDstData]
- mov edx, [dwDstStride]
- mov ecx, [dwDstWidth]
- sub edx, ecx
- mov [dstStep], edx ; stride - width
- dec eax
- mov [tmpHeight], eax
- mov eax, 16384
- mov [yInverse], eax
-
- pshuflw xmm4, xmm5, 01010000b
- psrlw xmm4, 1 ; initial v to 16384 16384 16383 16383
-
-FAST_HEIGHT:
- mov eax, [yInverse]
- mov esi, [pSrcData]
- shr eax, 15
- mul dword [dwSrcStride]
- add esi, eax ; get current row address
- mov ebp, esi
- add ebp, [dwSrcStride]
-
- mov eax, 32768
- mov [xInverse], eax
- mov ecx, [dwDstWidth]
- dec ecx
-
- movdqa xmm3, xmm5 ; initial u to 32768 32767 32768 32767
-
-FAST_WIDTH:
- mov eax, [xInverse]
- shr eax, 16
-
- movd xmm1, [esi+eax] ; xxxxxxba
- movd xmm2, [ebp+eax] ; xxxxxxdc
- punpcklwd xmm1, xmm2 ; xxxxdcba
- punpcklbw xmm1, xmm0 ; 0d0c0b0a
-
- movdqa xmm2, xmm4 ; xmm2: vv(1-v)(1-v) tmpv
- pmulhuw xmm2, xmm3 ; mul u(1-u)u(1-u) on xmm2
- pmaddwd xmm2, xmm1
- pshufd xmm1, xmm2, 00000001b
- paddd xmm2, xmm1
- movd xmm1, ebx
- paddd xmm2, xmm1
- psrld xmm2, 15
-
- packuswb xmm2, xmm0
- movd eax, xmm2
- mov [edi], al
- inc edi
-
- mov eax, [uiScaleX]
- add [xInverse], eax
-
- paddw xmm3, xmm7 ; inc u
-
- loop FAST_WIDTH
-
-FAST_WIDTH_END:
- mov eax, [xInverse]
- shr eax, 16
- mov cl, [esi+eax]
- mov [edi], cl
- inc edi
-
- mov eax, [uiScaleY]
- add [yInverse], eax
- add edi, [dstStep]
-
- paddw xmm4, xmm6 ; inc v
- psllw xmm4, 1
- psrlw xmm4, 1
-
- dec dword [tmpHeight]
- jg FAST_HEIGHT
-
-
-FAST_LAST_ROW:
- mov eax, [yInverse]
- mov esi, [pSrcData]
- shr eax, 15
- mul dword [dwSrcStride]
- add esi, eax ; get current row address
-
- mov eax, 32768
- mov [xInverse], eax
- mov ecx, [dwDstWidth]
-
-FAST_LAST_ROW_WIDTH:
- mov eax, [xInverse]
- shr eax, 16
-
- mov al, [esi+eax]
- mov [edi], al
- inc edi
-
- mov eax, [uiScaleX]
- add [xInverse], eax
-
- loop FAST_LAST_ROW_WIDTH
-
-FAST_LAST_ROW_END:
-
- add esp, localsize
- pop ebx
- pop edi
- pop esi
- pop ebp
-%undef pushsize
-%undef localsize
-%undef pSrcData
-%undef dwSrcWidth
-%undef dwSrcHeight
-%undef dwSrcStride
-%undef pDstData
-%undef dwDstWidth
-%undef dwDstHeight
-%undef dwDstStride
-%undef scale
-%undef uiScaleX
-%undef uiScaleY
-%undef tmpHeight
-%undef yInverse
-%undef xInverse
-%undef dstStep
- ret
\ No newline at end of file
--- a/processing/src/asm/intra_pred.asm
+++ /dev/null
@@ -1,145 +1,0 @@
-;*!
-;* \copy
-;* Copyright (c) 2009-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* intra_pred.asm
-;*
-;* Abstract
-;* sse2 function for intra predict operations
-;*
-;* History
-;* 18/09/2009 Created
-;*
-;*
-;*************************************************************************/
-%include "../../src/asm/asm_inc.asm"
-
-BITS 32
-;***********************************************************************
-; Local Data (Read Only)
-;***********************************************************************
-
-%ifdef FORMAT_COFF
-SECTION .rodata data
-%else
-SECTION .rodata align=16
-%endif
-
-
-align 16
-mmx_01bytes: times 16 db 1
-
-;***********************************************************************
-; macros
-;***********************************************************************
-%macro COPY_16_TIMES 2
- movdqa %2, [%1-16]
- psrldq %2, 15
- pmuludq %2, [mmx_01bytes]
- pshufd %2, %2, 0
-%endmacro
-
-%macro COPY_16_TIMESS 3
- movdqa %2, [%1+%3-16]
- psrldq %2, 15
- pmuludq %2, [mmx_01bytes]
- pshufd %2, %2, 0
-%endmacro
-
-;***********************************************************************
-; Code
-;***********************************************************************
-
-SECTION .text
-
-;***********************************************************************
-; void WelsI16x16LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
-;***********************************************************************
-
-%macro SSE2_PRED_H_16X16_TWO_LINE 1
- lea eax, [eax+ecx*2]
-
- COPY_16_TIMES eax, xmm0
- movdqa [edx+%1], xmm0
- COPY_16_TIMESS eax, xmm0, ecx
- movdqa [edx+%1+0x10], xmm0
-%endmacro
-
-WELS_EXTERN WelsI16x16LumaPredH_sse2
-WelsI16x16LumaPredH_sse2:
- mov edx, [esp+4] ; pred
- mov eax, [esp+8] ; pRef
- mov ecx, [esp+12] ; stride
-
- COPY_16_TIMES eax, xmm0
- movdqa [edx], xmm0
- COPY_16_TIMESS eax, xmm0, ecx
- movdqa [edx+0x10], xmm0
-
- SSE2_PRED_H_16X16_TWO_LINE 0x20
- SSE2_PRED_H_16X16_TWO_LINE 0x40
- SSE2_PRED_H_16X16_TWO_LINE 0x60
- SSE2_PRED_H_16X16_TWO_LINE 0x80
- SSE2_PRED_H_16X16_TWO_LINE 0xa0
- SSE2_PRED_H_16X16_TWO_LINE 0xc0
- SSE2_PRED_H_16X16_TWO_LINE 0xe0
-
- ret
-
-;***********************************************************************
-; void WelsI16x16LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
-;***********************************************************************
-WELS_EXTERN WelsI16x16LumaPredV_sse2
-WelsI16x16LumaPredV_sse2:
- mov edx, [esp+4] ; pred
- mov eax, [esp+8] ; pRef
- mov ecx, [esp+12] ; stride
-
- sub eax, ecx
- movdqa xmm0, [eax]
-
- movdqa [edx], xmm0
- movdqa [edx+10h], xmm0
- movdqa [edx+20h], xmm0
- movdqa [edx+30h], xmm0
- movdqa [edx+40h], xmm0
- movdqa [edx+50h], xmm0
- movdqa [edx+60h], xmm0
- movdqa [edx+70h], xmm0
- movdqa [edx+80h], xmm0
- movdqa [edx+90h], xmm0
- movdqa [edx+160], xmm0
- movdqa [edx+176], xmm0
- movdqa [edx+192], xmm0
- movdqa [edx+208], xmm0
- movdqa [edx+224], xmm0
- movdqa [edx+240], xmm0
-
- ret
\ No newline at end of file
--- a/processing/src/asm/sad.asm
+++ /dev/null
@@ -1,216 +1,0 @@
-;*!
-;* \copy
-;* Copyright (c) 2009-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* pixel_sse2.asm
-;*
-;* Abstract
-;* WelsSampleSad8x8_sse21
-;*
-;* History
-;* 8/5/2009 Created
-;*
-;*
-;*************************************************************************/
-
-%include "asm_inc.asm"
-
-BITS 32
-
-;***********************************************************************
-; Macros and other preprocessor constants
-;***********************************************************************
-
-%macro SAD_8x4 0
- movq xmm0, [eax]
- movq xmm1, [eax+ebx]
- lea eax, [eax+2*ebx]
- movhps xmm0, [eax]
- movhps xmm1, [eax+ebx]
-
- movq xmm2, [ecx]
- movq xmm3, [ecx+edx]
- lea ecx, [ecx+2*edx]
- movhps xmm2, [ecx]
- movhps xmm3, [ecx+edx]
- psadbw xmm0, xmm2
- psadbw xmm1, xmm3
- paddw xmm6, xmm0
- paddw xmm6, xmm1
-%endmacro
-
-
-
-%macro CACHE_SPLIT_CHECK 3 ; address, width, cacheline
-and %1, 0x1f|(%3>>1)
-cmp %1, (32-%2)|(%3>>1)
-%endmacro
-
-
-%macro SSE2_GetSad8x4 0
- movq xmm0, [eax]
- movq xmm1, [eax+ebx]
- lea eax, [eax+2*ebx]
- movhps xmm0, [eax]
- movhps xmm1, [eax+ebx]
-
- movq xmm2, [ecx]
- movq xmm3, [ecx+edx]
- lea ecx, [ecx+2*edx]
- movhps xmm2, [ecx]
- movhps xmm3, [ecx+edx]
- psadbw xmm0, xmm2
- psadbw xmm1, xmm3
- paddw xmm6, xmm0
- paddw xmm6, xmm1
-%endmacro
-
-
-;***********************************************************************
-; Code
-;***********************************************************************
-SECTION .text
-
-WELS_EXTERN WelsSampleSad8x8_sse21
-WelsSampleSad8x8_sse21:
- mov ecx, [esp+12]
- mov edx, ecx
- CACHE_SPLIT_CHECK edx, 8, 64
- jle near .pixel_sad_8x8_nsplit
- push ebx
- push edi
- mov eax, [esp+12]
- mov ebx, [esp+16]
-
- pxor xmm7, xmm7
-
- mov edi, ecx
- and edi, 0x07
- sub ecx, edi
- mov edx, 8
- sub edx, edi
-
- shl edi, 3
- shl edx, 3
- movd xmm5, edi
- movd xmm6, edx
- mov edi, 8
- add edi, ecx
- mov edx, [esp+24]
-
- movq xmm0, [eax]
- movhps xmm0, [eax+ebx]
-
- movq xmm1, [ecx]
- movq xmm2, [edi]
- movhps xmm1, [ecx+edx]
- movhps xmm2, [edi+edx]
- psrlq xmm1, xmm5
- psllq xmm2, xmm6
- por xmm1, xmm2
-
- psadbw xmm0, xmm1
- paddw xmm7, xmm0
-
- lea eax, [eax+2*ebx]
- lea ecx, [ecx+2*edx]
- lea edi, [edi+2*edx]
-
- movq xmm0, [eax]
- movhps xmm0, [eax+ebx]
-
- movq xmm1, [ecx]
- movq xmm2, [edi]
- movhps xmm1, [ecx+edx]
- movhps xmm2, [edi+edx]
- psrlq xmm1, xmm5
- psllq xmm2, xmm6
- por xmm1, xmm2
-
- psadbw xmm0, xmm1
- paddw xmm7, xmm0
-
- lea eax, [eax+2*ebx]
- lea ecx, [ecx+2*edx]
- lea edi, [edi+2*edx]
-
- movq xmm0, [eax]
- movhps xmm0, [eax+ebx]
-
- movq xmm1, [ecx]
- movq xmm2, [edi]
- movhps xmm1, [ecx+edx]
- movhps xmm2, [edi+edx]
- psrlq xmm1, xmm5
- psllq xmm2, xmm6
- por xmm1, xmm2
-
- psadbw xmm0, xmm1
- paddw xmm7, xmm0
-
- lea eax, [eax+2*ebx]
- lea ecx, [ecx+2*edx]
- lea edi, [edi+2*edx]
-
- movq xmm0, [eax]
- movhps xmm0, [eax+ebx]
-
- movq xmm1, [ecx]
- movq xmm2, [edi]
- movhps xmm1, [ecx+edx]
- movhps xmm2, [edi+edx]
- psrlq xmm1, xmm5
- psllq xmm2, xmm6
- por xmm1, xmm2
-
- psadbw xmm0, xmm1
- paddw xmm7, xmm0
-
- movhlps xmm0, xmm7
- paddw xmm0, xmm7
- movd eax, xmm0
- pop edi
- jmp .return
-.pixel_sad_8x8_nsplit:
- push ebx
- mov eax, [esp+8]
- mov ebx, [esp+12]
- mov edx, [esp+20]
- pxor xmm6, xmm6
- SSE2_GetSad8x4
- lea eax, [eax+2*ebx]
- lea ecx, [ecx+2*edx]
- SSE2_GetSad8x4
- movhlps xmm0, xmm6
- paddw xmm0, xmm6
- movd eax, xmm0
-.return:
- pop ebx
- ret
\ No newline at end of file
--- a/processing/src/asm/vaa.asm
+++ /dev/null
@@ -1,1589 +1,0 @@
-;*!
-;* \copy
-;* Copyright (c) 2010-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;* vaa.asm
-;*
-;* Abstract
-;* sse2 for pVaa routines
-;*
-;* History
-;* 04/14/2010 Created
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-BITS 32
-
-;***********************************************************************
-; Macros and other preprocessor constants
-;***********************************************************************
-
-;%macro SUM_SSE2 4 ; dst, pSrc, zero, pack1_8x2
-; movdqa %1, %2
-; punpcklbw %1, %3
-; punpckhbw %2, %3
-; paddw %1, %2
-; pmaddwd %1, %4
-; pshufd %2, %1, 04Eh ; 01001110 B
-; paddd %1, %2
-; pshufd %2, %1, 0B1h ; 10110001 B
-; paddd %1, %2
-;%endmacro ; END OF SUM_SSE2
-
-; Horizontal sum of the 8 words in %1; per the original author's comparison this outperforms the phaddw (SSSE3) sequence
-%macro SUM_WORD_8x2_SSE2 2 ; dst(pSrc), tmp -- on exit the low word of dst holds the sum of all 8 input words
- ; @sum_8x2 begin
- pshufd %2, %1, 04Eh ; 01001110 B: swap the two qwords
- paddw %1, %2 ; word i = w[i] + w[i+4]
- pshuflw %2, %1, 04Eh ; 01001110 B: swap the two dwords of the low qword
- paddw %1, %2
- pshuflw %2, %1, 0B1h ; 10110001 B: swap adjacent words of the low qword
- paddw %1, %2 ; low word now holds the full sum
- ; end of @sum_8x2
-%endmacro ; END of SUM_WORD_8x2_SSE2
-
-%macro SUM_SQR_SSE2 3 ; dst, pSrc, zero -- dst = sum of squares of the 16 bytes in pSrc (pSrc is clobbered; zero must be all-zero)
- movdqa %1, %2
- punpcklbw %1, %3 ; low 8 bytes widened to words
- punpckhbw %2, %3 ; high 8 bytes widened to words
- pmaddwd %1, %1 ; square each word and add adjacent pairs into dwords
- pmaddwd %2, %2
- paddd %1, %2
- pshufd %2, %1, 04Eh ; 01001110 B: swap the two qwords
- paddd %1, %2
- pshufd %2, %1, 0B1h ; 10110001 B: swap adjacent dwords
- paddd %1, %2 ; every dword of dst now holds the total
-%endmacro ; END OF SUM_SQR_SSE2
-
-%macro VAA_AVG_BLOCK_SSE2 6 ; dst, t0, t1, t2, t3, t4 -- low 4 words of dst = (sum of each 4-column x 4-row byte block) >> 4; needs xmm7 = 0
- movdqa %1, [esi ] ; line 0
- movdqa %2, [esi+ecx] ; line 1
- movdqa %3, %1
- punpcklbw %1, xmm7 ; line 0, columns 0-7 as words
- punpckhbw %3, xmm7 ; line 0, columns 8-15 as words
- movdqa %4, %2
- punpcklbw %4, xmm7 ; line 1, columns 0-7
- punpckhbw %2, xmm7 ; line 1, columns 8-15
- paddw %1, %4 ; columns 0-7: line 0 + line 1
- paddw %2, %3 ; columns 8-15: line 0 + line 1
- movdqa %3, [esi+ebx] ; line 2
- movdqa %4, [esi+edx] ; line 3
- movdqa %5, %3
- punpcklbw %3, xmm7
- punpckhbw %5, xmm7
- movdqa %6, %4
- punpcklbw %6, xmm7
- punpckhbw %4, xmm7
- paddw %3, %6 ; columns 0-7: line 2 + line 3
- paddw %4, %5 ; columns 8-15: line 2 + line 3
- paddw %1, %3 ; block 0, 1
- paddw %2, %4 ; block 2, 3
- pshufd %3, %1, 0B1h ; swap adjacent dwords so each word can be paired with the one two places away
- pshufd %4, %2, 0B1h
- paddw %1, %3
- paddw %2, %4
- movdqa %3, %1
- movdqa %4, %2
- pshuflw %5, %1, 0B1h ; swap adjacent words, low qword
- pshufhw %6, %3, 0B1h ; swap adjacent words, high qword
- paddw %1, %5 ; low qword words = block 0 total
- paddw %3, %6 ; high qword words = block 1 total
- pshuflw %5, %2, 0B1h
- pshufhw %6, %4, 0B1h
- paddw %2, %5 ; low qword words = block 2 total
- paddw %4, %6 ; high qword words = block 3 total
- punpcklwd %1, %2 ; interleave block 0 / block 2 totals
- punpckhwd %3, %4 ; interleave block 1 / block 3 totals
- punpcklwd %1, %3 ; low 4 words = block 0, 1, 2, 3
- psraw %1, $4 ; divide each total by 16
-%endmacro
-
-%macro VAA_AVG_BLOCK_SSSE3 6 ; dst, t0, t1, t2, t3, t4 -- same result as VAA_AVG_BLOCK_SSE2 but reduced with phaddw; needs xmm7 = 0 (t3, t4 are not used here)
- movdqa %1, [esi ] ; line 0
- movdqa %2, [esi+ecx] ; line 1
- movdqa %3, %1
- punpcklbw %1, xmm7 ; line 0, columns 0-7 as words
- punpckhbw %3, xmm7 ; line 0, columns 8-15 as words
- movdqa %4, %2
- punpcklbw %4, xmm7 ; line 1, columns 0-7
- punpckhbw %2, xmm7 ; line 1, columns 8-15
- paddw %1, %4 ; columns 0-7: line 0 + line 1
- paddw %2, %3 ; columns 8-15: line 0 + line 1
- movdqa %3, [esi+ebx] ; line 2
- movdqa %4, [esi+edx] ; line 3
- movdqa %5, %3
- punpcklbw %3, xmm7
- punpckhbw %5, xmm7
- movdqa %6, %4
- punpcklbw %6, xmm7
- punpckhbw %4, xmm7
- paddw %3, %6 ; columns 0-7: line 2 + line 3
- paddw %4, %5 ; columns 8-15: line 2 + line 3
- paddw %1, %3 ; block 0, 1
- paddw %2, %4 ; block 2, 3
- phaddw %1, %2 ; block[0]: 0-15, 16-31; block[1]: 32-47, 48-63; ..
- phaddw %1, xmm7 ; block[0]: 0-15; block[1]: 16-31; block[2]: 32-47; block[3]: 48-63; ....
- psraw %1, $4 ; divide each total by 16
-%endmacro
-
-%macro WELS_SAD_16x2_SSE2 0 ; adds the SAD of two 16-byte rows ([esi] vs [edi], stride ebx) to xmm6, then advances both pointers two rows
- movdqa xmm1, [esi] ; row 0 (aligned loads)
- movdqa xmm2, [edi]
- movdqa xmm3, [esi+ebx] ; row 1
- movdqa xmm4, [edi+ebx]
- psadbw xmm1, xmm2 ; dword 0 = SAD of bytes 0-7, dword 2 = SAD of bytes 8-15
- psadbw xmm3, xmm4
- paddd xmm6, xmm1
- paddd xmm6, xmm3
- lea esi, [esi+ebx*2]
- lea edi, [edi+ebx*2]
-%endmacro
-
-%macro WELS_SAD_SUM_SQSUM_16x1_SSE2 0 ; one 16-byte row: xmm6 += SAD([esi],[edi]), xmm5 += byte sum of [esi], xmm4 += squares of [esi]; needs xmm0 = 0
- movdqa xmm1, [esi]
- movdqa xmm2, [edi]
- movdqa xmm3, xmm1
- psadbw xmm3, xmm2
- paddd xmm6, xmm3 ; sad
-
- movdqa xmm3, xmm1
- psadbw xmm3, xmm0 ; SAD against zero = plain byte sum
- paddd xmm5, xmm3 ; sum
-
- movdqa xmm2, xmm1
- punpcklbw xmm1, xmm0
- punpckhbw xmm2, xmm0
- pmaddwd xmm1, xmm1 ; squares, pairwise added into dwords
- pmaddwd xmm2, xmm2
- paddd xmm4, xmm1
- paddd xmm4, xmm2 ; sqsum (four dword partials)
-
- add esi, ebx ; next row
- add edi, ebx
-%endmacro
-
-%macro WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 0 ; one 16-byte row of [esi] vs [edi]: updates xmm7 (sad), xmm6 (sum), xmm5 (sqsum), xmm4 (sqdiff); needs xmm0 = 0
- movdqa xmm1, [esi]
- movdqa xmm2, [edi]
- movdqa xmm3, xmm1
- psadbw xmm3, xmm2
- paddd xmm7, xmm3 ; sad
-
- movdqa xmm3, xmm1
- pmaxub xmm3, xmm2 ; max(a, b)
- pminub xmm2, xmm1 ; min(a, b)
- psubb xmm3, xmm2 ; diff = |a - b| per byte
-
- movdqa xmm2, xmm1
- psadbw xmm2, xmm0 ; SAD against zero = byte sum of [esi]
- paddd xmm6, xmm2 ; sum
-
- movdqa xmm2, xmm1
- punpcklbw xmm1, xmm0
- punpckhbw xmm2, xmm0
- pmaddwd xmm1, xmm1
- pmaddwd xmm2, xmm2
- paddd xmm5, xmm1
- paddd xmm5, xmm2 ; sqsum
-
- movdqa xmm1, xmm3
- punpcklbw xmm1, xmm0
- punpckhbw xmm3, xmm0
- pmaddwd xmm1, xmm1
- pmaddwd xmm3, xmm3
- paddd xmm4, xmm1
- paddd xmm4, xmm3 ; sqdiff
-
- add esi, ebx ; next row
- add edi, ebx
-%endmacro
-
-%macro WELS_SAD_SD_MAD_16x1_SSE2 4 ; sad, sum_cur, sum_ref, mad accumulators; processes one 16-byte row of [esi] vs [edi]; needs xmm0 = 0
-%define sad_reg %1
-%define sum_cur_reg %2
-%define sum_ref_reg %3
-%define mad_reg %4
- movdqa xmm1, [esi]
- movdqa xmm2, [edi]
- movdqa xmm3, xmm1
- psadbw xmm3, xmm0 ; SAD against zero = byte sum of [esi]
- paddd sum_cur_reg, xmm3 ; sum_cur
- movdqa xmm3, xmm2
- psadbw xmm3, xmm0 ; byte sum of [edi]
- paddd sum_ref_reg, xmm3 ; sum_ref
-
- movdqa xmm3, xmm1
- pmaxub xmm3, xmm2
- pminub xmm2, xmm1
- psubb xmm3, xmm2 ; abs diff
- pmaxub mad_reg, xmm3 ; max abs diff
-
- psadbw xmm3, xmm0 ; summing the abs diffs gives the SAD
- paddd sad_reg, xmm3 ; sad
-
- add esi, ebx ; next row
- add edi, ebx
-%endmacro ; note: the sad_reg/sum_cur_reg/sum_ref_reg/mad_reg aliases are not %undef'd here
-
-
-%macro WELS_MAX_REG_SSE2 1 ; in-place byte max reduction: byte 0 = max of bytes 0-7, byte 8 = max of bytes 8-15; clobbers xmm1 only
-%define max_reg %1
- movdqa xmm1, max_reg
- psrldq xmm1, 4 ; compare each byte with the one 4 positions higher
- pmaxub max_reg, xmm1
- movdqa xmm1, max_reg
- psrldq xmm1, 2 ; then 2 positions higher
- pmaxub max_reg, xmm1
- movdqa xmm1, max_reg
- psrldq xmm1, 1 ; then the neighbouring byte
- pmaxub max_reg, xmm1 ; only bytes 0 and 8 hold complete per-half maxima
-%endmacro
-
-%macro WELS_SAD_BGD_SQDIFF_16x1_SSE2 4 ; sad(+sqsum), sum, mad, sqdiff accumulators; processes one 16-byte row of [esi] vs [edi]; needs xmm0 = 0
-%define sad_reg %1
-%define sum_reg %2
-%define mad_reg %3
-%define sqdiff_reg %4
- movdqa xmm1, [esi]
- movdqa xmm2, xmm1
- movdqa xmm3, xmm1
- punpcklbw xmm2, xmm0
- punpckhbw xmm3, xmm0
- pmaddwd xmm2, xmm2 ; squares of [esi], pairwise added into dwords
- pmaddwd xmm3, xmm3
- paddd xmm2, xmm3
- movdqa xmm3, xmm2
- psllq xmm2, 32 ; move each qword's low dword into its high dword
- psrlq xmm3, 32 ; clear each qword's low dword ...
- psllq xmm3, 32 ; ... keeping the high dword in place
- paddd xmm2, xmm3 ; dwords 1 and 3 = partial square sums, dwords 0 and 2 = 0
- paddd sad_reg, xmm2 ; sqsum
-
- movdqa xmm2, [edi]
- movdqa xmm3, xmm1
- psadbw xmm3, xmm0 ; byte sum of [esi] in dwords 0 and 2
- paddd sum_reg, xmm3 ; sum_cur
- movdqa xmm3, xmm2
- psadbw xmm3, xmm0 ; byte sum of [edi]
- pslldq xmm3, 4 ; shifted into dwords 1 and 3 so both sums share one register
- paddd sum_reg, xmm3 ; sum_ref
-
- movdqa xmm3, xmm1
- pmaxub xmm3, xmm2
- pminub xmm2, xmm1
- psubb xmm3, xmm2 ; abs diff
- pmaxub mad_reg, xmm3 ; max abs diff
-
- movdqa xmm1, xmm3
- psadbw xmm3, xmm0 ; summing the abs diffs gives the SAD (dwords 0 and 2)
- paddd sad_reg, xmm3 ; sad
-
- movdqa xmm3, xmm1
- punpcklbw xmm1, xmm0
- punpckhbw xmm3, xmm0
- pmaddwd xmm1, xmm1
- pmaddwd xmm3, xmm3
- paddd sqdiff_reg, xmm1
- paddd sqdiff_reg, xmm3 ; sqdiff
-
- add esi, ebx ; next row
- add edi, ebx
-%endmacro
-
-
-;***********************************************************************
-; Local Data (Read Only)
-;***********************************************************************
-
-;SECTION .rodata align=16
-
-;ALIGN 16
-;pack1_8x2:
-; dw 1, 1, 1, 1, 1, 1, 1, 1
-
-;***********************************************************************
-; Code
-;***********************************************************************
-
-SECTION .text
-
-WELS_EXTERN rc_sad_frame_sse2
-;***********************************************************************
-; uint32_t rc_sad_frame_sse2( uint8_t *ref_orig, uint8_t *cur_orig, const int mb_width, const int iPicHeight, const int iPicStride );
-; Returns the sum of absolute differences over iPicHeight rows of mb_width 16-byte groups (aligned loads).
-ALIGN 16
-rc_sad_frame_sse2:
- push esi
- push edi
- push ebp
- push ebx
- push edx
-
- mov esi, [esp+24] ; ref_orig (5 pushes + return address = 24 bytes)
- mov edi, [esp+28] ; cur_orig
- mov ebx, [esp+32] ; mb_width
- mov ecx, [esp+36] ; iPicHeight, used as the row counter
- mov edx, [esp+40] ; iPicStride
- pxor xmm0, xmm0 ; running total lives in the low dword of xmm0
-.hloop:
- mov eax, ebx ; eax = 16-byte groups left in this row
- mov ebp, $0 ; ebp = byte offset inside the row
-.wloop:
- movdqa xmm1, [esi+ebp]
- movdqa xmm2, [edi+ebp]
- psadbw xmm1, xmm2 ; dword 0 = SAD of bytes 0-7, dword 2 = SAD of bytes 8-15
- pshufd xmm2, xmm1, 0f6h ; 11110110 B ; movhlps for float -- puts dword 2 in dword 0
- paddd xmm1, xmm2 ; low dword = SAD of all 16 bytes
- paddd xmm0, xmm1
- add ebp, 010h
- dec eax
- jnz near .wloop
- lea esi, [esi+edx] ; next row
- lea edi, [edi+edx]
- dec ecx
- jnz near .hloop
-
- movd eax, xmm0 ; return value
- pop edx
- pop ebx
- pop ebp
- pop edi
- pop esi
- ret
-
-
-WELS_EXTERN SampleVariance16x16_sse2
-;***********************************************************************
-; void SampleVariance16x16_sse2( uint8_t * y_ref, int32_t y_ref_stride, uint8_t * y_src, int32_t y_src_stride,SMotionTextureUnit* pMotionTexture );
-; Writes two 16-bit values to pMotionTexture: (SQR>>8)-(SUM>>8)^2 at +0 and (SQR_CUR>>8)-(SUM_CUR>>8)^2 at +2, over a 16x16 block.
-ALIGN 16
-SampleVariance16x16_sse2:
- push esi
- push edi
- push ebx
-
- sub esp, 16 ; room for four dword accumulators
- %define SUM [esp]
- %define SUM_CUR [esp+4]
- %define SQR [esp+8]
- %define SQR_CUR [esp+12]
- %define PUSH_SIZE 28 ; 12 + 16
-
- mov edi, [esp+PUSH_SIZE+4] ; y_ref
- mov edx, [esp+PUSH_SIZE+8] ; y_ref_stride
- mov esi, [esp+PUSH_SIZE+12] ; y_src
- mov eax, [esp+PUSH_SIZE+16] ; y_src_stride
- mov ecx, 010h ; height = 16
-
- pxor xmm7, xmm7
- movdqu SUM, xmm7 ; one 16-byte store zeroes all four accumulators
-
-.hloops:
- movdqa xmm0, [edi] ; y_ref
- movdqa xmm1, [esi] ; y_src
- movdqa xmm2, xmm0 ; store first for future process
- movdqa xmm3, xmm1
- ; sum += diff;
- movdqa xmm4, xmm0
- psadbw xmm4, xmm1 ; 2 parts, [0,..,15], [64,..,79]
- ; to be continued for sum
- pshufd xmm5, xmm4, 0C6h ; 11000110 B: swap dwords 0 and 2
- paddw xmm4, xmm5 ; low dword = SAD of the whole row
- movd ebx, xmm4
- add SUM, ebx
-
- ; sqr += diff * diff;
- pmaxub xmm0, xmm1 ; max(ref, src)
- pminub xmm1, xmm2 ; min(src, ref)
- psubb xmm0, xmm1 ; diff
- SUM_SQR_SSE2 xmm1, xmm0, xmm7 ; dst, pSrc, zero
- movd ebx, xmm1
- add SQR, ebx
-
- ; sum_cur += y_src[x];
- movdqa xmm0, xmm3 ; cur_orig
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm7
- punpckhbw xmm1, xmm7
- paddw xmm0, xmm1 ; 8x2
- SUM_WORD_8x2_SSE2 xmm0, xmm1
- movd ebx, xmm0
- and ebx, 0ffffh ; only the low word is the reduced sum
- add SUM_CUR, ebx
-
- ; sqr_cur += y_src[x] * y_src[x];
- SUM_SQR_SSE2 xmm0, xmm3, xmm7 ; dst, pSrc, zero
- movd ebx, xmm0
- add SQR_CUR, ebx
-
- lea edi, [edi+edx] ; next row of y_ref
- lea esi, [esi+eax] ; next row of y_src
- dec ecx
- jnz near .hloops
-
- mov ebx, 0
- mov bx, word SUM ; only the low 16 bits of SUM are used
- sar ebx, 8 ; SUM / 256
- imul ebx, ebx
- mov ecx, SQR
- sar ecx, 8 ; SQR / 256
- sub ecx, ebx
- mov edi, [esp+PUSH_SIZE+20] ; pMotionTexture
- mov [edi], cx ; to store uiMotionIndex
- mov ebx, 0
- mov bx, word SUM_CUR ; only the low 16 bits of SUM_CUR are used
- sar ebx, 8
- imul ebx, ebx
- mov ecx, SQR_CUR
- sar ecx, 8
- sub ecx, ebx
- mov [edi+2], cx ; to store uiTextureIndex
-
- %undef SUM
- %undef SUM_CUR
- %undef SQR
- %undef SQR_CUR
- %undef PUSH_SIZE
-
- add esp, 16
- pop ebx
- pop edi
- pop esi
-
- ret
-
-; , 6/7/2010
-
-%ifndef NO_DYNAMIC_VP
-WELS_EXTERN AnalysisVaaInfoIntra_sse2
-;***********************************************************************
-; int32_t AnalysisVaaInfoIntra_sse2( uint8_t *pDataY, const int32_t linesize );
-; Returns sum(avg^2) - (sum(avg)^2 >> 4) over the sixteen 4x4 block averages of a 16x16 area (aligned loads from pDataY).
-ALIGN 16
-AnalysisVaaInfoIntra_sse2:
- push ebx
- push edx
- push esi
- push edi
- push ebp
-
- mov ebp, esp
- and ebp, 0fh ; ebp = misalignment of esp
- sub esp, ebp ; make esp 16-byte aligned ...
- sub esp, 32 ; ... and reserve 32 bytes for the 16 block averages
- %define PUSH_SIZE 52 ; 20 + 32
-
- mov esi, [esp+ebp+PUSH_SIZE+4] ; data_y
- mov ecx, [esp+ebp+PUSH_SIZE+8] ; linesize
-
- mov ebx, ecx
- sal ebx, $1 ; linesize x 2 [ebx]
- mov edx, ebx
- add edx, ecx ; linesize x 3 [edx]
- mov eax, ebx
- sal eax, $1 ; linesize x 4 [eax]
-
- pxor xmm7, xmm7 ; zero register required by VAA_AVG_BLOCK_SSE2
-
- ; loops
- VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
- movq [esp], xmm0 ; averages of blocks 0-3
-
- lea esi, [esi+eax] ; down four lines
- VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
- movq [esp+8], xmm0 ; blocks 4-7
-
- lea esi, [esi+eax]
- VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
- movq [esp+16], xmm0 ; blocks 8-11
-
- lea esi, [esi+eax]
- VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
- movq [esp+24], xmm0 ; blocks 12-15
-
- movdqa xmm0, [esp] ; block 0~7
- movdqa xmm1, [esp+16] ; block 8~15
- movdqa xmm2, xmm0
- paddw xmm0, xmm1
- SUM_WORD_8x2_SSE2 xmm0, xmm3 ; low word of xmm0 = sum of the 16 averages
-
- pmullw xmm1, xmm1 ; square each average
- pmullw xmm2, xmm2
- movdqa xmm3, xmm1
- movdqa xmm4, xmm2
- punpcklwd xmm1, xmm7 ; widen the squares to dwords
- punpckhwd xmm3, xmm7
- punpcklwd xmm2, xmm7
- punpckhwd xmm4, xmm7
- paddd xmm1, xmm2
- paddd xmm3, xmm4
- paddd xmm1, xmm3
- pshufd xmm2, xmm1, 01Bh ; reverse the dword order
- paddd xmm1, xmm2
- pshufd xmm2, xmm1, 0B1h ; swap adjacent dwords
- paddd xmm1, xmm2 ; low dword = sum of the 16 squares
-
- movd ebx, xmm0
- and ebx, 0ffffh ; effective low word truncated
- mov ecx, ebx
- imul ebx, ecx ; sum^2
- sar ebx, $4 ; sum^2 / 16
- movd eax, xmm1
- sub eax, ebx ; return value
-
- %undef PUSH_SIZE
- add esp, 32
- add esp, ebp ; undo the alignment adjustment
- pop ebp
- pop edi
- pop esi
- pop edx
- pop ebx
- ret
-
-WELS_EXTERN AnalysisVaaInfoIntra_ssse3
-;***********************************************************************
-; int32_t AnalysisVaaInfoIntra_ssse3( uint8_t *pDataY, const int32_t linesize );
-; Same computation as AnalysisVaaInfoIntra_sse2, using VAA_AVG_BLOCK_SSSE3 for the 4x4 block averages.
-ALIGN 16
-AnalysisVaaInfoIntra_ssse3:
- push ebx
- push edx
- push esi
- push edi
- push ebp
-
- mov ebp, esp
- and ebp, 0fh ; ebp = misalignment of esp
- sub esp, ebp ; make esp 16-byte aligned ...
- sub esp, 32 ; ... and reserve 32 bytes for the 16 block averages
- %define PUSH_SIZE 52 ; 20 + 32
-
- mov esi, [esp+ebp+PUSH_SIZE+4] ; data_y
- mov ecx, [esp+ebp+PUSH_SIZE+8] ; linesize
-
- mov ebx, ecx
- sal ebx, $1 ; linesize x 2 [ebx]
- mov edx, ebx
- add edx, ecx ; linesize x 3 [edx]
- mov eax, ebx
- sal eax, $1 ; linesize x 4 [eax]
-
- pxor xmm7, xmm7 ; zero register required by VAA_AVG_BLOCK_SSSE3
-
- ; loops
- VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
- movq [esp], xmm0 ; averages of blocks 0-3
-
- lea esi, [esi+eax] ; down four lines
- VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
- movq [esp+8], xmm1 ; blocks 4-7
-
- lea esi, [esi+eax]
- VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
- movq [esp+16], xmm0 ; blocks 8-11
-
- lea esi, [esi+eax]
- VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
- movq [esp+24], xmm1 ; blocks 12-15
-
- movdqa xmm0, [esp] ; block 0~7
- movdqa xmm1, [esp+16] ; block 8~15
- movdqa xmm2, xmm0
- paddw xmm0, xmm1
- SUM_WORD_8x2_SSE2 xmm0, xmm3 ; better performance than that of phaddw sets
-
- pmullw xmm1, xmm1 ; square each average
- pmullw xmm2, xmm2
- movdqa xmm3, xmm1
- movdqa xmm4, xmm2
- punpcklwd xmm1, xmm7 ; widen the squares to dwords
- punpckhwd xmm3, xmm7
- punpcklwd xmm2, xmm7
- punpckhwd xmm4, xmm7
- paddd xmm1, xmm2
- paddd xmm3, xmm4
- paddd xmm1, xmm3
- pshufd xmm2, xmm1, 01Bh ; reverse the dword order
- paddd xmm1, xmm2
- pshufd xmm2, xmm1, 0B1h ; swap adjacent dwords
- paddd xmm1, xmm2 ; low dword = sum of the 16 squares
-
- movd ebx, xmm0
- and ebx, 0ffffh ; effective low word truncated
- mov ecx, ebx
- imul ebx, ecx ; sum^2
- sar ebx, $4 ; sum^2 / 16
- movd eax, xmm1
- sub eax, ebx ; return value
-
- %undef PUSH_SIZE
- add esp, 32
- add esp, ebp ; undo the alignment adjustment
- pop ebp
- pop edi
- pop esi
- pop edx
- pop ebx
- ret
-%endif
-
-
-
-WELS_EXTERN abs_difference_mbrow_sse2
-;*************************************************************************************************************
-;void abs_difference_mbrow_sse2( uint8_t *ref_orig, uint8_t *cur_orig, int32_t iPicStride,
-; int32_t gom_pixel_num, int32_t *pSum)
-;
-; Adds to *pSum the sum of absolute differences between ref_orig and cur_orig over 16 rows
-; (rows iPicStride bytes apart). Each row is processed in 16-byte steps until gom_pixel_num
-; bytes are covered; at least one step runs per row. Loads use movdqa, so both pointers
-; must be 16-byte aligned at every step.
-;*************************************************************************************************************
-ALIGN 16
-abs_difference_mbrow_sse2:
-%define ref_orig esp + pushsize + 4
-%define cur_orig esp + pushsize + 8
-%define iPicStride esp + pushsize + 12
-%define gom_pixel_num esp + pushsize + 16
-%define pSum esp + pushsize + 20
-%define pushsize 12
- push esi
- push edi
- push ebx
- mov esi, [ref_orig]
- mov edi, [cur_orig]
- mov ebx, [iPicStride]
- mov eax, [gom_pixel_num]
- mov ecx, 16 ;MB_WIDTH_LUMA
- pxor xmm0, xmm0 ; SAD accumulator (dwords 0 and 2)
-mb_width_loop_p:
- mov edx, esi
- add edx, eax ; end address
-gom_row_loop_p:
- movdqa xmm1, [esi]
- movdqa xmm2, [edi]
- psadbw xmm1, xmm2
- paddd xmm0, xmm1
- add esi, 16
- add edi, 16
- cmp esi, edx
- jb gom_row_loop_p ; unsigned compare: esi and edx are addresses and may lie above 0x7FFFFFFF
-
- sub esi, eax ; back to the start of the row ...
- sub edi, eax
- add esi, ebx ; ... then down one row
- add edi, ebx
- loop mb_width_loop_p ; ecx counts the 16 rows
-
- movdqa xmm1, xmm0
- psrldq xmm1, 8
- paddd xmm1, xmm0 ; low dword = low-half SAD + high-half SAD
- movd eax, xmm1
- mov edx, [pSum] ; pSum
- add [edx], eax ; accumulate into *pSum (not overwrite)
-
-%undef ref_orig
-%undef cur_orig
-%undef iPicStride
-%undef gom_pixel_num
-%undef pSum
-%undef pushsize
- pop ebx
- pop edi
- pop esi
- ret
-
-
-
-
-WELS_EXTERN sum_sqrsum_mbrow_sse2
-;*************************************************************************************************************
-;void sum_sqrsum_mbrow_sse2( uint8_t *cur_orig, int32_t iPicStride,
-; int32_t gom_pixel_num, int32_t *pSum, int32_t *pSqrSum)
-;
-; Over 16 rows (iPicStride bytes apart) of cur_orig, processed in 16-byte steps until
-; gom_pixel_num bytes are covered, adds the byte sum to *pSum and the sum of squared
-; bytes to *pSqrSum. Loads use movdqa, so cur_orig must be 16-byte aligned at every step.
-;*************************************************************************************************************
-ALIGN 16
-sum_sqrsum_mbrow_sse2:
-%define cur_orig esp + pushsize + 4
-%define iPicStride esp + pushsize + 8
-%define gom_pixel_num esp + pushsize + 12
-%define pSum esp + pushsize + 16
-%define pSqrSum esp + pushsize + 20
-%define pushsize 8
- push esi
- push ebx
- mov esi, [cur_orig]
- mov eax, [gom_pixel_num]
- mov ebx, [iPicStride]
- mov ecx, 16 ;MB_WIDTH_LUMA
- pxor xmm0, xmm0 ; zero
- pxor xmm1, xmm1 ; sum
- pxor xmm2, xmm2 ; sqr sum
-mb_width_loop_i:
- mov edx, esi
- add edx, eax ; end address
-gom_row_loop_i:
- movdqa xmm3, [esi]
- movdqa xmm4, xmm3
- psadbw xmm4, xmm0 ; SAD against zero = byte sum
- paddd xmm1, xmm4
- movdqa xmm4, xmm3
- punpcklbw xmm4, xmm0
- punpckhbw xmm3, xmm0
- pmaddwd xmm4, xmm4 ; squares, pairwise added into dwords
- pmaddwd xmm3, xmm3
- paddd xmm2, xmm3
- paddd xmm2, xmm4
- add esi, 16
- cmp esi, edx
- jb gom_row_loop_i ; unsigned compare: esi and edx are addresses and may lie above 0x7FFFFFFF
-
- sub esi, eax ; back to the start of the row ...
- add esi, ebx ; ... then down one row
- loop mb_width_loop_i ; ecx counts the 16 rows
-
- movdqa xmm3, xmm1
- psrldq xmm3, 8
- paddd xmm1, xmm3 ; low dword = total byte sum
- movd eax, xmm1
- mov edx, [pSum]
- add [edx], eax ; accumulate into *pSum
-
- movdqa xmm3, xmm2
- psrldq xmm3, 8
- paddd xmm2, xmm3
- movdqa xmm3, xmm2
- psrldq xmm3, 4
- paddd xmm2, xmm3 ; low dword = total of the four square partials
- movd eax, xmm2
- mov edx, [pSqrSum]
- add [edx], eax ; accumulate into *pSqrSum
-
-
-%undef cur_orig
-%undef iPicStride
-%undef gom_pixel_num
-%undef pSum
-%undef pSqrSum
-%undef pushsize
- pop ebx
- pop esi
- ret
-
-
-
-WELS_EXTERN VAACalcSad_sse2
-;*************************************************************************************************************
-;void VAACalcSad_sse2( uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight
-; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8)
-; Writes four 8x8 SADs per 16x16 macroblock to psad8x8 and the whole-frame SAD to *psadframe.
-
-
-ALIGN 16
-VAACalcSad_sse2:
-%define cur_data esp + pushsize + 4
-%define ref_data esp + pushsize + 8
-%define iPicWidth esp + pushsize + 12
-%define iPicHeight esp + pushsize + 16
-%define iPicStride esp + pushsize + 20
-%define psadframe esp + pushsize + 24
-%define psad8x8 esp + pushsize + 28
-%define pushsize 12
- push esi
- push edi
- push ebx
- mov esi, [cur_data]
- mov edi, [ref_data]
- mov ebx, [iPicStride]
- mov edx, [psad8x8]
- mov eax, ebx
-
- shr dword [iPicWidth], 4 ; iPicWidth/16
- shr dword [iPicHeight], 4 ; iPicHeight/16
- shl eax, 4 ; iPicStride*16
- pxor xmm0, xmm0 ; zeroed here but not referenced by WELS_SAD_16x2_SSE2
- pxor xmm7, xmm7 ; iFrameSad
-height_loop:
- mov ecx, dword [iPicWidth] ; macroblocks left in this row
- push esi ; remember the row start (shifts the esp-relative argument macros by 8 until popped)
- push edi
-width_loop:
- pxor xmm6, xmm6 ; SAD of the upper 8 rows: dword 0 = left 8 columns, dword 2 = right 8 columns
- WELS_SAD_16x2_SSE2
- WELS_SAD_16x2_SSE2
- WELS_SAD_16x2_SSE2
- WELS_SAD_16x2_SSE2
- paddd xmm7, xmm6
- movd [edx], xmm6 ; upper-left 8x8
- psrldq xmm6, 8
- movd [edx+4], xmm6 ; upper-right 8x8
-
- pxor xmm6, xmm6 ; lower 8 rows
- WELS_SAD_16x2_SSE2
- WELS_SAD_16x2_SSE2
- WELS_SAD_16x2_SSE2
- WELS_SAD_16x2_SSE2
- paddd xmm7, xmm6
- movd [edx+8], xmm6 ; lower-left 8x8
- psrldq xmm6, 8
- movd [edx+12], xmm6 ; lower-right 8x8
-
- add edx, 16 ; next group of four outputs
- sub esi, eax ; back up 16 rows ...
- sub edi, eax
- add esi, 16 ; ... and step right one macroblock
- add edi, 16
-
- dec ecx
- jnz width_loop
-
- pop edi ; restore the row start
- pop esi
- add esi, eax ; down 16 rows
- add edi, eax
-
- dec dword [iPicHeight] ; the stack argument itself is the row counter
- jnz height_loop
-
- mov edx, [psadframe]
- movdqa xmm5, xmm7
- psrldq xmm7, 8
- paddd xmm7, xmm5 ; low dword = left-column total + right-column total
- movd [edx], xmm7
-
-%undef cur_data
-%undef ref_data
-%undef iPicWidth
-%undef iPicHeight
-%undef iPicStride
-%undef psadframe
-%undef psad8x8
-%undef pushsize
- pop ebx
- pop edi
- pop esi
- ret
-
-
-WELS_EXTERN VAACalcSadVar_sse2
-;*************************************************************************************************************
-;void VAACalcSadVar_sse2( uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight
-; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16)
-; Per 16x16 macroblock: four 8x8 SADs, the byte sum and the sum of squares of cur_data; plus the whole-frame SAD.
-
-
-ALIGN 16
-VAACalcSadVar_sse2:
-%define localsize 8
-%define cur_data esp + pushsize + localsize + 4
-%define ref_data esp + pushsize + localsize + 8
-%define iPicWidth esp + pushsize + localsize + 12
-%define iPicHeight esp + pushsize + localsize + 16
-%define iPicStride esp + pushsize + localsize + 20
-%define psadframe esp + pushsize + localsize + 24
-%define psad8x8 esp + pushsize + localsize + 28
-%define psum16x16 esp + pushsize + localsize + 32
-%define psqsum16x16 esp + pushsize + localsize + 36
-%define tmp_esi esp + 0
-%define tmp_edi esp + 4
-%define pushsize 16
- push ebp
- push esi
- push edi
- push ebx
- sub esp, localsize
- mov esi, [cur_data]
- mov edi, [ref_data]
- mov ebx, [iPicStride]
- mov edx, [psad8x8]
- mov eax, ebx
-
- shr dword [iPicWidth], 4 ; iPicWidth/16
- shr dword [iPicHeight], 4 ; iPicHeight/16
- shl eax, 4 ; iPicStride*16
- pxor xmm0, xmm0 ; zero register required by WELS_SAD_SUM_SQSUM_16x1_SSE2
- pxor xmm7, xmm7 ; iFrameSad
-var_height_loop:
- mov ecx, dword [iPicWidth] ; macroblocks left in this row
- mov [tmp_esi], esi ; remember the row start
- mov [tmp_edi], edi
-var_width_loop:
- pxor xmm6, xmm6 ; hiQuad_loQuad pSad8x8
- pxor xmm5, xmm5 ; pSum16x16
- pxor xmm4, xmm4 ; sqsum_16x16
- WELS_SAD_SUM_SQSUM_16x1_SSE2
- WELS_SAD_SUM_SQSUM_16x1_SSE2
- WELS_SAD_SUM_SQSUM_16x1_SSE2
- WELS_SAD_SUM_SQSUM_16x1_SSE2
- WELS_SAD_SUM_SQSUM_16x1_SSE2
- WELS_SAD_SUM_SQSUM_16x1_SSE2
- WELS_SAD_SUM_SQSUM_16x1_SSE2
- WELS_SAD_SUM_SQSUM_16x1_SSE2
- paddd xmm7, xmm6
- movd [edx], xmm6 ; upper-left 8x8 SAD
- psrldq xmm6, 8
- movd [edx+4], xmm6 ; upper-right 8x8 SAD
-
- pxor xmm6, xmm6 ; lower 8 rows (xmm5/xmm4 keep accumulating over all 16 rows)
- WELS_SAD_SUM_SQSUM_16x1_SSE2
- WELS_SAD_SUM_SQSUM_16x1_SSE2
- WELS_SAD_SUM_SQSUM_16x1_SSE2
- WELS_SAD_SUM_SQSUM_16x1_SSE2
- WELS_SAD_SUM_SQSUM_16x1_SSE2
- WELS_SAD_SUM_SQSUM_16x1_SSE2
- WELS_SAD_SUM_SQSUM_16x1_SSE2
- WELS_SAD_SUM_SQSUM_16x1_SSE2
- paddd xmm7, xmm6
- movd [edx+8], xmm6 ; lower-left 8x8 SAD
- psrldq xmm6, 8
- movd [edx+12], xmm6 ; lower-right 8x8 SAD
-
- mov ebp, [psum16x16] ; ebp is a scratch pointer here
- movdqa xmm1, xmm5
- psrldq xmm1, 8
- paddd xmm5, xmm1 ; low dword = 16x16 byte sum
- movd [ebp], xmm5
- add dword [psum16x16], 4 ; advance the output pointer stored in the argument slot
-
- movdqa xmm5, xmm4
- psrldq xmm5, 8
- paddd xmm4, xmm5
- movdqa xmm3, xmm4
- psrldq xmm3, 4
- paddd xmm4, xmm3 ; low dword = total of the four square partials
-
- mov ebp, [psqsum16x16]
- movd [ebp], xmm4
- add dword [psqsum16x16], 4 ; advance the output pointer stored in the argument slot
-
- add edx, 16 ; next group of four SAD outputs
- sub esi, eax ; back up 16 rows ...
- sub edi, eax
- add esi, 16 ; ... and step right one macroblock
- add edi, 16
-
- dec ecx
- jnz var_width_loop
-
- mov esi, [tmp_esi] ; restore the row start
- mov edi, [tmp_edi]
- add esi, eax ; down 16 rows
- add edi, eax
-
- dec dword [iPicHeight] ; the stack argument itself is the row counter
- jnz var_height_loop
-
- mov edx, [psadframe]
- movdqa xmm5, xmm7
- psrldq xmm7, 8
- paddd xmm7, xmm5 ; low dword = whole-frame SAD
- movd [edx], xmm7
-
- add esp, localsize
- pop ebx
- pop edi
- pop esi
- pop ebp
-%undef cur_data
-%undef ref_data
-%undef iPicWidth
-%undef iPicHeight
-%undef iPicStride
-%undef psadframe
-%undef psad8x8
-%undef psum16x16
-%undef psqsum16x16
-%undef tmp_esi
-%undef tmp_edi
-%undef pushsize
-%undef localsize
- ret
-
-
-
-WELS_EXTERN VAACalcSadSsd_sse2
-;*************************************************************************************************************
-;void VAACalcSadSsd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
-; int32_t iPicStride,int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, int32_t *psqdiff16x16)
-;
-; Per 16x16 macroblock: four 8x8 SADs (psad8x8), the byte sum and sum of squares of cur_data
-; (psum16x16, psqsum16x16) and the sum of squared differences (psqdiff16x16).
-; The whole-frame SAD is written to *psadframe. iPicWidth and iPicHeight are divided by 16
-; in place in their stack slots; the three 16x16 output pointers are advanced in place too.
-;*************************************************************************************************************
-
-
-ALIGN 16
-VAACalcSadSsd_sse2:
-%define localsize 12
-%define cur_data esp + pushsize + localsize + 4
-%define ref_data esp + pushsize + localsize + 8
-%define iPicWidth esp + pushsize + localsize + 12
-%define iPicHeight esp + pushsize + localsize + 16
-%define iPicStride esp + pushsize + localsize + 20
-%define psadframe esp + pushsize + localsize + 24
-%define psad8x8 esp + pushsize + localsize + 28
-%define psum16x16 esp + pushsize + localsize + 32
-%define psqsum16x16 esp + pushsize + localsize + 36
-%define psqdiff16x16 esp + pushsize + localsize + 40
-%define tmp_esi esp + 0
-%define tmp_edi esp + 4
-%define tmp_sadframe esp + 8
-%define pushsize 16
- push ebp
- push esi
- push edi
- push ebx
- sub esp, localsize
- mov esi, [cur_data]
- mov edi, [ref_data]
- mov ebx, [iPicStride]
- mov edx, [psad8x8]
- mov eax, ebx
-
- shr dword [iPicWidth], 4 ; iPicWidth/16
- shr dword [iPicHeight], 4 ; iPicHeight/16
- shl eax, 4 ; iPicStride*16
- pxor xmm0, xmm0 ; zero register required by the per-row macro
- movd [tmp_sadframe], xmm0 ; frame SAD accumulator lives on the stack
-sqdiff_height_loop:
- mov ecx, dword [iPicWidth] ; macroblocks left in this row
- mov [tmp_esi], esi ; remember the row start
- mov [tmp_edi], edi
-sqdiff_width_loop:
- pxor xmm7, xmm7 ; hiQuad_loQuad pSad8x8
- pxor xmm6, xmm6 ; pSum16x16
- pxor xmm5, xmm5 ; sqsum_16x16 four dword
- pxor xmm4, xmm4 ; sqdiff_16x16 four Dword
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
- movdqa xmm1, xmm7
- movd [edx], xmm7 ; upper-left 8x8 SAD
- psrldq xmm7, 8
- paddd xmm1, xmm7 ; low dword = left + right
- movd [edx+4], xmm7 ; upper-right 8x8 SAD
- movd ebp, xmm1
- add [tmp_sadframe], ebp
-
- pxor xmm7, xmm7 ; lower 8 rows (xmm6/xmm5/xmm4 keep accumulating over all 16 rows)
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
- movdqa xmm1, xmm7
- movd [edx+8], xmm7 ; lower-left 8x8 SAD
- psrldq xmm7, 8
- paddd xmm1, xmm7
- movd [edx+12], xmm7 ; lower-right 8x8 SAD
- movd ebp, xmm1
- add [tmp_sadframe], ebp
-
- mov ebp, [psum16x16] ; ebp is a scratch pointer from here on
- movdqa xmm1, xmm6
- psrldq xmm1, 8
- paddd xmm6, xmm1 ; low dword = 16x16 byte sum
- movd [ebp], xmm6
- add dword [psum16x16], 4
-
- mov ebp, [psqsum16x16]
- pshufd xmm6, xmm5, 14 ;00001110
- paddd xmm6, xmm5
- pshufd xmm5, xmm6, 1 ;00000001
- paddd xmm5, xmm6 ; low dword = total of the four square partials
- movd [ebp], xmm5
- add dword [psqsum16x16], 4
-
- mov ebp, [psqdiff16x16]
- pshufd xmm5, xmm4, 14 ; 00001110
- paddd xmm5, xmm4
- pshufd xmm4, xmm5, 1 ; 00000001
- paddd xmm4, xmm5 ; low dword = total of the four squared-difference partials
- movd [ebp], xmm4
- add dword [psqdiff16x16], 4
-
- add edx, 16 ; next group of four SAD outputs
- sub esi, eax ; back up 16 rows ...
- sub edi, eax
- add esi, 16 ; ... and step right one macroblock
- add edi, 16
-
- dec ecx
- jnz sqdiff_width_loop
-
- mov esi, [tmp_esi] ; restore the row start
- mov edi, [tmp_edi]
- add esi, eax ; down 16 rows
- add edi, eax
-
- dec dword [iPicHeight] ; the stack argument itself is the row counter
- jnz sqdiff_height_loop
-
- mov ebx, [tmp_sadframe]
- mov eax, [psadframe]
- mov [eax], ebx ; store the whole-frame SAD
-
- add esp, localsize
- pop ebx
- pop edi
- pop esi
- pop ebp
-%undef cur_data
-%undef ref_data
-%undef iPicWidth
-%undef iPicHeight
-%undef iPicStride
-%undef psadframe
-%undef psad8x8
-%undef psum16x16
-%undef psqsum16x16
-%undef psqdiff16x16
-%undef tmp_esi
-%undef tmp_edi
-%undef tmp_sadframe
-%undef pushsize
-%undef localsize
- ret
-
-
-
-
-
-WELS_EXTERN VAACalcSadBgd_sse2
-;*************************************************************************************************************
-;void VAACalcSadBgd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
-; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *p_sd8x8, uint8_t *p_mad8x8)
-;
-; Per 16x16 macroblock, for each of its four 8x8 blocks: the SAD (psad8x8), the difference
-; sum(cur) - sum(ref) (p_sd8x8) and the maximum absolute difference (p_mad8x8, one byte each).
-; The whole-frame SAD is written to *psadframe. psad8x8 and p_sd8x8 are written with movdqa,
-; so both must be 16-byte aligned. The three output pointers are advanced in their stack slots.
-;*************************************************************************************************************
-
-
-ALIGN 16
-VAACalcSadBgd_sse2:
-%define localsize 12
-%define cur_data esp + pushsize + localsize + 4
-%define ref_data esp + pushsize + localsize + 8
-%define iPicWidth esp + pushsize + localsize + 12
-%define iPicHeight esp + pushsize + localsize + 16
-%define iPicStride esp + pushsize + localsize + 20
-%define psadframe esp + pushsize + localsize + 24
-%define psad8x8 esp + pushsize + localsize + 28
-%define p_sd8x8 esp + pushsize + localsize + 32
-%define p_mad8x8 esp + pushsize + localsize + 36
-%define tmp_esi esp + 0
-%define tmp_edi esp + 4
-%define tmp_ecx esp + 8
-%define pushsize 16
- push ebp
- push esi
- push edi
- push ebx
- sub esp, localsize
- mov esi, [cur_data]
- mov edi, [ref_data]
- mov ebx, [iPicStride]
- mov eax, ebx
-
- shr dword [iPicWidth], 4 ; iPicWidth/16
- shr dword [iPicHeight], 4 ; iPicHeight/16
- shl eax, 4 ; iPicStride*16
- xor ebp, ebp ; ebp = whole-frame SAD accumulator
- pxor xmm0, xmm0 ; zero register required by WELS_SAD_SD_MAD_16x1_SSE2
-bgd_height_loop:
- mov ecx, dword [iPicWidth] ; macroblocks left in this row
- mov [tmp_esi], esi ; remember the row start
- mov [tmp_edi], edi
-bgd_width_loop:
- pxor xmm7, xmm7 ; pSad8x8
- pxor xmm6, xmm6 ; sum_cur_8x8
- pxor xmm5, xmm5 ; sum_ref_8x8
- pxor xmm4, xmm4 ; pMad8x8
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
-
-
- mov edx, [p_mad8x8]
- WELS_MAX_REG_SSE2 xmm4 ; byte 0 = max of left 8 columns, byte 8 = max of right 8 columns
-
- ;movdqa xmm1, xmm4
- ;punpcklbw xmm1, xmm0
- ;punpcklwd xmm1, xmm0
- ;movd [edx], xmm1
- ;punpckhbw xmm4, xmm0
- ;punpcklwd xmm4, xmm0
- ;movd [edx+4], xmm4
- ;add edx, 8
- ;mov [p_mad8x8], edx
- mov [tmp_ecx], ecx ; ecx is needed as a byte scratch register below; restored before the loop test
- movhlps xmm1, xmm4
- movd ecx, xmm4
- mov [edx], cl ; MAD of the upper-left 8x8
- movd ecx, xmm1
- mov [edx+1],cl ; MAD of the upper-right 8x8
- add edx, 2
- mov [p_mad8x8], edx
-
-
- pslldq xmm7, 4 ; move the upper-half results to dwords 1 and 3 so the lower half lands in dwords 0 and 2
- pslldq xmm6, 4
- pslldq xmm5, 4
-
-
- pxor xmm4, xmm4 ; pMad8x8
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
-
- mov edx, [p_mad8x8]
- WELS_MAX_REG_SSE2 xmm4
-
- ;movdqa xmm1, xmm4
- ;punpcklbw xmm1, xmm0
- ;punpcklwd xmm1, xmm0
- ;movd [edx], xmm1
- ;punpckhbw xmm4, xmm0
- ;punpcklwd xmm4, xmm0
- ;movd [edx+4], xmm4
- ;add edx, 8
- ;mov [p_mad8x8], edx
- movhlps xmm1, xmm4
- movd ecx, xmm4
- mov [edx], cl ; MAD of the lower-left 8x8
- movd ecx, xmm1
- mov [edx+1],cl ; MAD of the lower-right 8x8
- add edx, 2
- mov [p_mad8x8], edx
-
- ; data in xmm7, xmm6, xmm5: D1 D3 D0 D2
-
- mov edx, [psad8x8]
- pshufd xmm1, xmm7, 10001101b ; D3 D2 D1 D0
- movdqa [edx], xmm1
- add edx, 16
- mov [psad8x8], edx ; sad8x8
-
- paddd xmm1, xmm7 ; D1+3 D3+2 D0+1 D2+0
- pshufd xmm2, xmm1, 00000011b ; bring the top dword (D1+D3) down to dword 0
- paddd xmm1, xmm2 ; low dword = D0+D1+D2+D3
- movd edx, xmm1
- add ebp, edx ; sad frame
-
- mov edx, [p_sd8x8]
- psubd xmm6, xmm5 ; sum_cur - sum_ref for each 8x8 block
- pshufd xmm1, xmm6, 10001101b ; reorder to D3 D2 D1 D0
- movdqa [edx], xmm1
- add edx, 16
- mov [p_sd8x8], edx
-
-
- sub esi, eax ; back up 16 rows ...
- sub edi, eax
- add esi, 16 ; ... and step right one macroblock
- add edi, 16
-
- mov ecx, [tmp_ecx] ; restore the macroblock counter
- dec ecx
- jnz bgd_width_loop
-
- mov esi, [tmp_esi] ; restore the row start
- mov edi, [tmp_edi]
- add esi, eax ; down 16 rows
- add edi, eax
-
- dec dword [iPicHeight] ; the stack argument itself is the row counter
- jnz bgd_height_loop
-
- mov edx, [psadframe]
- mov [edx], ebp ; store the whole-frame SAD
-
- add esp, localsize
- pop ebx
- pop edi
- pop esi
- pop ebp
-%undef cur_data
-%undef ref_data
-%undef iPicWidth
-%undef iPicHeight
-%undef iPicStride
-%undef psadframe
-%undef psad8x8
-%undef p_sd8x8
-%undef p_mad8x8
-%undef tmp_esi
-%undef tmp_edi
-%undef tmp_ecx
-%undef pushsize
-%undef localsize
- ret
-
-
-
-WELS_EXTERN VAACalcSadSsdBgd_sse2
-;*************************************************************************************************************
-;void VAACalcSadSsdBgd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
-; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16,
-; int32_t *psqdiff16x16, int32_t *p_sd8x8, uint8_t *p_mad8x8)
-;*************************************************************************************************************
-
-
-ALIGN 16
-VAACalcSadSsdBgd_sse2:
-%define localsize 16
-%define cur_data esp + pushsize + localsize + 4
-%define ref_data esp + pushsize + localsize + 8
-%define iPicWidth esp + pushsize + localsize + 12
-%define iPicHeight esp + pushsize + localsize + 16
-%define iPicStride esp + pushsize + localsize + 20
-%define psadframe esp + pushsize + localsize + 24
-%define psad8x8 esp + pushsize + localsize + 28
-%define psum16x16 esp + pushsize + localsize + 32
-%define psqsum16x16 esp + pushsize + localsize + 36
-%define psqdiff16x16 esp + pushsize + localsize + 40
-%define p_sd8x8 esp + pushsize + localsize + 44
-%define p_mad8x8 esp + pushsize + localsize + 48
-%define tmp_esi esp + 0
-%define tmp_edi esp + 4
-%define tmp_sadframe esp + 8
-%define tmp_ecx esp + 12
-%define pushsize 16
- push ebp
- push esi
- push edi
- push ebx
- sub esp, localsize
- mov esi, [cur_data]
- mov edi, [ref_data]
- mov ebx, [iPicStride]
- mov eax, ebx
-
- shr dword [iPicWidth], 4 ; iPicWidth/16
- shr dword [iPicHeight], 4 ; iPicHeight/16
- shl eax, 4 ; iPicStride*16
- pxor xmm0, xmm0
- movd [tmp_sadframe], xmm0
-sqdiff_bgd_height_loop:
- mov ecx, dword [iPicWidth]
- mov [tmp_esi], esi
- mov [tmp_edi], edi
-sqdiff_bgd_width_loop:
- pxor xmm7, xmm7 ; pSad8x8 interleaves sqsum16x16: sqsum1 sad1 sqsum0 sad0
- pxor xmm6, xmm6 ; sum_8x8 interleaves cur and pRef in Dword, Sref1 Scur1 Sref0 Scur0
- pxor xmm5, xmm5 ; pMad8x8
- pxor xmm4, xmm4 ; sqdiff_16x16 four Dword
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
-
- mov edx, [psad8x8]
- movdqa xmm2, xmm7
- pshufd xmm1, xmm2, 00001110b
- movd [edx], xmm2
- movd [edx+4], xmm1
- add edx, 8
- mov [psad8x8], edx ; sad8x8
-
- paddd xmm1, xmm2
- movd edx, xmm1
- add [tmp_sadframe], edx ; iFrameSad
-
- mov edx, [psum16x16]
- movdqa xmm1, xmm6
- pshufd xmm2, xmm1, 00001110b
- paddd xmm1, xmm2
- movd [edx], xmm1 ; sum
-
- mov edx, [p_sd8x8]
- pshufd xmm1, xmm6, 11110101b ; Sref1 Sref1 Sref0 Sref0
- psubd xmm6, xmm1 ; 00 diff1 00 diff0
- pshufd xmm1, xmm6, 00001000b ; xx xx diff1 diff0
- movq [edx], xmm1
- add edx, 8
- mov [p_sd8x8], edx
-
- mov edx, [p_mad8x8]
- WELS_MAX_REG_SSE2 xmm5
- ;movdqa xmm1, xmm5
- ;punpcklbw xmm1, xmm0
- ;punpcklwd xmm1, xmm0
- ;movd [edx], xmm1
- ;punpckhbw xmm5, xmm0
- ;punpcklwd xmm5, xmm0
- ;movd [edx+4], xmm5
- ;add edx, 8
- ;mov [p_mad8x8], edx
- mov [tmp_ecx], ecx
- movhlps xmm1, xmm5
- movd ecx, xmm5
- mov [edx], cl
- movd ecx, xmm1
- mov [edx+1],cl
- add edx, 2
- mov [p_mad8x8], edx
-
- psrlq xmm7, 32
- psllq xmm7, 32 ; clear sad
- pxor xmm6, xmm6 ; sum_8x8 interleaves cur and pRef in Dword, Sref1 Scur1 Sref0 Scur0
- pxor xmm5, xmm5 ; pMad8x8
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
-
- mov edx, [psad8x8]
- movdqa xmm2, xmm7
- pshufd xmm1, xmm2, 00001110b
- movd [edx], xmm2
- movd [edx+4], xmm1
- add edx, 8
- mov [psad8x8], edx ; sad8x8
-
- paddd xmm1, xmm2
- movd edx, xmm1
- add [tmp_sadframe], edx ; iFrameSad
-
- mov edx, [psum16x16]
- movdqa xmm1, xmm6
- pshufd xmm2, xmm1, 00001110b
- paddd xmm1, xmm2
- movd ebp, xmm1 ; sum
- add [edx], ebp
- add edx, 4
- mov [psum16x16], edx
-
- mov edx, [psqsum16x16]
- psrlq xmm7, 32
- pshufd xmm2, xmm7, 00001110b
- paddd xmm2, xmm7
- movd [edx], xmm2 ; sqsum
- add edx, 4
- mov [psqsum16x16], edx
-
- mov edx, [p_sd8x8]
- pshufd xmm1, xmm6, 11110101b ; Sref1 Sref1 Sref0 Sref0
- psubd xmm6, xmm1 ; 00 diff1 00 diff0
- pshufd xmm1, xmm6, 00001000b ; xx xx diff1 diff0
- movq [edx], xmm1
- add edx, 8
- mov [p_sd8x8], edx
-
- mov edx, [p_mad8x8]
- WELS_MAX_REG_SSE2 xmm5
- ;movdqa xmm1, xmm5
- ;punpcklbw xmm1, xmm0
- ;punpcklwd xmm1, xmm0
- ;movd [edx], xmm1
- ;punpckhbw xmm5, xmm0
- ;punpcklwd xmm5, xmm0
- ;movd [edx+4], xmm5
- ;add edx, 8
- ;mov [p_mad8x8], edx
- movhlps xmm1, xmm5
- movd ecx, xmm5
- mov [edx], cl
- movd ecx, xmm1
- mov [edx+1],cl
- add edx, 2
- mov [p_mad8x8], edx
-
- mov edx, [psqdiff16x16]
- pshufd xmm1, xmm4, 00001110b
- paddd xmm4, xmm1
- pshufd xmm1, xmm4, 00000001b
- paddd xmm4, xmm1
- movd [edx], xmm4
- add edx, 4
- mov [psqdiff16x16], edx
-
- add edx, 16
- sub esi, eax
- sub edi, eax
- add esi, 16
- add edi, 16
-
- mov ecx, [tmp_ecx]
- dec ecx
- jnz sqdiff_bgd_width_loop
-
- mov esi, [tmp_esi]
- mov edi, [tmp_edi]
- add esi, eax
- add edi, eax
-
- dec dword [iPicHeight]
- jnz sqdiff_bgd_height_loop
-
- mov edx, [psadframe]
- mov ebp, [tmp_sadframe]
- mov [edx], ebp
-
- add esp, localsize
- pop ebx
- pop edi
- pop esi
- pop ebp
-%undef cur_data
-%undef ref_data
-%undef iPicWidth
-%undef iPicHeight
-%undef iPicStride
-%undef psadframe
-%undef psad8x8
-%undef psum16x16
-%undef psqsum16x16
-%undef psqdiff16x16
-%undef p_sd8x8
-%undef p_mad8x8
-%undef tmp_esi
-%undef tmp_edi
-%undef pushsize
-%undef localsize
- ret
--- a/processing/src/backgounddetection/BackgroundDetection.cpp
+++ /dev/null
@@ -1,389 +1,0 @@
-/*!
- * \copy
- * Copyright (c) 2013, Cisco Systems
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include "BackgroundDetection.h"
-#include "../common/cpu.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-#define LOG2_BGD_OU_SIZE (4)
-#define LOG2_BGD_OU_SIZE_UV (LOG2_BGD_OU_SIZE-1)
-#define BGD_OU_SIZE (1<<LOG2_BGD_OU_SIZE)
-#define BGD_OU_SIZE_UV (BGD_OU_SIZE>>1)
-#define BGD_THD_SAD (2*BGD_OU_SIZE*BGD_OU_SIZE)
-#define BGD_THD_ASD_UV (4*BGD_OU_SIZE_UV)
-#define LOG2_MB_SIZE (4)
-#define OU_SIZE_IN_MB (BGD_OU_SIZE >> 4)
-#define Q_FACTOR (8)
-#define BGD_DELTA_QP_THD (3)
-
-#define OU_LEFT (0x01)
-#define OU_RIGHT (0x02)
-#define OU_TOP (0x04)
-#define OU_BOTTOM (0x08)
-
-CBackgroundDetection::CBackgroundDetection (int32_t iCpuFlag) {
- m_eMethod = METHOD_BACKGROUND_DETECTION;
- WelsMemset (&m_BgdParam, 0, sizeof (m_BgdParam));
- m_iLargestFrameSize = 0;
-}
-
-CBackgroundDetection::~CBackgroundDetection() {
- FreeOUArrayMemory();
-}
-
-EResult CBackgroundDetection::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pRefPixMap) {
- EResult eReturn = RET_INVALIDPARAM;
-
- if (pSrcPixMap == NULL || pRefPixMap == NULL)
- return eReturn;
-
- m_BgdParam.pCur[0] = (uint8_t*)pSrcPixMap->pPixel[0];
- m_BgdParam.pCur[1] = (uint8_t*)pSrcPixMap->pPixel[1];
- m_BgdParam.pCur[2] = (uint8_t*)pSrcPixMap->pPixel[2];
- m_BgdParam.pRef[0] = (uint8_t*)pRefPixMap->pPixel[0];
- m_BgdParam.pRef[1] = (uint8_t*)pRefPixMap->pPixel[1];
- m_BgdParam.pRef[2] = (uint8_t*)pRefPixMap->pPixel[2];
- m_BgdParam.iBgdWidth = pSrcPixMap->sRect.iRectWidth;
- m_BgdParam.iBgdHeight = pSrcPixMap->sRect.iRectHeight;
- m_BgdParam.iStride[0] = pSrcPixMap->iStride[0];
- m_BgdParam.iStride[1] = pSrcPixMap->iStride[1];
- m_BgdParam.iStride[2] = pSrcPixMap->iStride[2];
-
- int32_t iCurFrameSize = m_BgdParam.iBgdWidth * m_BgdParam.iBgdHeight;
- if (m_BgdParam.pOU_array == NULL || iCurFrameSize > m_iLargestFrameSize) {
- FreeOUArrayMemory();
- m_BgdParam.pOU_array = AllocateOUArrayMemory (m_BgdParam.iBgdWidth, m_BgdParam.iBgdHeight);
- m_iLargestFrameSize = iCurFrameSize;
- }
-
- if (m_BgdParam.pOU_array == NULL)
- return eReturn;
-
- BackgroundDetection (&m_BgdParam);
-
- return RET_SUCCESS;
-}
-
-EResult CBackgroundDetection::Set (int32_t iType, void* pParam) {
- if (pParam == NULL) {
- return RET_INVALIDPARAM;
- }
-
- SBGDInterface* pInterface = (SBGDInterface*)pParam;
-
- m_BgdParam.pBackgroundMbFlag = (int8_t*)pInterface->pBackgroundMbFlag;
- m_BgdParam.pCalcRes = pInterface->pCalcRes;
-
- return RET_SUCCESS;
-}
-
-inline SBackgroundOU* CBackgroundDetection::AllocateOUArrayMemory (int32_t iWidth, int32_t iHeight) {
- int32_t iMaxOUWidth = (BGD_OU_SIZE - 1 + iWidth) >> LOG2_BGD_OU_SIZE;
- int32_t iMaxOUHeight = (BGD_OU_SIZE - 1 + iHeight) >> LOG2_BGD_OU_SIZE;
- return (SBackgroundOU*)WelsMalloc (iMaxOUWidth * iMaxOUHeight * sizeof (SBackgroundOU));
-}
-
-inline void CBackgroundDetection::FreeOUArrayMemory() {
- _SafeFree (m_BgdParam.pOU_array);
-}
-
-void CBackgroundDetection::GetOUParameters (SVAACalcResult* sVaaCalcInfo, int32_t iMbIndex, int32_t iMbWidth,
- SBackgroundOU* pBgdOU) {
- int32_t iSubSD[4];
- uint8_t iSubMAD[4];
- int32_t iSubSAD[4];
-
- uint8_t (*pMad8x8)[4];
- int32_t (*pSad8x8)[4];
- int32_t (*pSd8x8)[4];
-
- pSad8x8 = sVaaCalcInfo->pSad8x8;
- pMad8x8 = sVaaCalcInfo->pMad8x8;
- pSd8x8 = sVaaCalcInfo->pSumOfDiff8x8;
-
- iSubSAD[0] = pSad8x8[iMbIndex][0];
- iSubSAD[1] = pSad8x8[iMbIndex][1];
- iSubSAD[2] = pSad8x8[iMbIndex][2];
- iSubSAD[3] = pSad8x8[iMbIndex][3];
-
- iSubSD[0] = pSd8x8[iMbIndex][0];
- iSubSD[1] = pSd8x8[iMbIndex][1];
- iSubSD[2] = pSd8x8[iMbIndex][2];
- iSubSD[3] = pSd8x8[iMbIndex][3];
-
- iSubMAD[0] = pMad8x8[iMbIndex][0];
- iSubMAD[1] = pMad8x8[iMbIndex][1];
- iSubMAD[2] = pMad8x8[iMbIndex][2];
- iSubMAD[3] = pMad8x8[iMbIndex][3];
-
- pBgdOU->iSD = iSubSD[0] + iSubSD[1] + iSubSD[2] + iSubSD[3];
- pBgdOU->iSAD = iSubSAD[0] + iSubSAD[1] + iSubSAD[2] + iSubSAD[3];
- pBgdOU->iSD = WELS_ABS (pBgdOU->iSD);
-
- // get the max absolute difference (MAD) of OU and min value of the MAD of sub-blocks of OU
- pBgdOU->iMAD = WELS_MAX (WELS_MAX (iSubMAD[0], iSubMAD[1]), WELS_MAX (iSubMAD[2], iSubMAD[3]));
- pBgdOU->iMinSubMad = WELS_MIN (WELS_MIN (iSubMAD[0], iSubMAD[1]), WELS_MIN (iSubMAD[2], iSubMAD[3]));
-
- // get difference between the max and min SD of the SDs of sub-blocks of OU
- pBgdOU->iMaxDiffSubSd = WELS_MAX (WELS_MAX (iSubSD[0], iSubSD[1]), WELS_MAX (iSubSD[2], iSubSD[3])) -
- WELS_MIN (WELS_MIN (iSubSD[0], iSubSD[1]), WELS_MIN (iSubSD[2], iSubSD[3]));
-}
-
-void CBackgroundDetection::ForegroundBackgroundDivision (vBGDParam* pBgdParam) {
- int32_t iPicWidthInOU = pBgdParam->iBgdWidth >> LOG2_BGD_OU_SIZE;
- int32_t iPicHeightInOU = pBgdParam->iBgdHeight >> LOG2_BGD_OU_SIZE;
- int32_t iPicWidthInMb = (15 + pBgdParam->iBgdWidth) >> 4;
-
- SBackgroundOU* pBackgroundOU = pBgdParam->pOU_array;
-
- for (int32_t j = 0; j < iPicHeightInOU; j ++) {
- for (int32_t i = 0; i < iPicWidthInOU; i++) {
- GetOUParameters (pBgdParam->pCalcRes, (j * iPicWidthInMb + i) << (LOG2_BGD_OU_SIZE - LOG2_MB_SIZE), iPicWidthInMb,
- pBackgroundOU);
-
- pBackgroundOU->iBackgroundFlag = 0;
- if (pBackgroundOU->iMAD > 63) {
- pBackgroundOU++;
- continue;
- }
- if ((pBackgroundOU->iMaxDiffSubSd <= pBackgroundOU->iSAD >> 3
- || pBackgroundOU->iMaxDiffSubSd <= (BGD_OU_SIZE * Q_FACTOR))
- && pBackgroundOU->iSAD < (BGD_THD_SAD << 1)) { //BGD_OU_SIZE*BGD_OU_SIZE>>2
- if (pBackgroundOU->iSAD <= BGD_OU_SIZE * Q_FACTOR) {
- pBackgroundOU->iBackgroundFlag = 1;
- } else {
- pBackgroundOU->iBackgroundFlag = pBackgroundOU->iSAD < BGD_THD_SAD ?
- (pBackgroundOU->iSD < (pBackgroundOU->iSAD * 3) >> 2) :
- (pBackgroundOU->iSD << 1 < pBackgroundOU->iSAD);
- }
- }
- pBackgroundOU++;
- }
- }
-}
-inline int32_t CBackgroundDetection::CalculateAsdChromaEdge (uint8_t* pOriRef, uint8_t* pOriCur, int32_t iStride) {
- int32_t ASD = 0;
- int32_t idx;
- for (idx = 0; idx < BGD_OU_SIZE_UV; idx++) {
- ASD += *pOriCur - *pOriRef;
- pOriRef += iStride;
- pOriCur += iStride;
- }
- return WELS_ABS (ASD);
-}
-
-inline bool_t CBackgroundDetection::ForegroundDilation23Luma (SBackgroundOU* pBackgroundOU,
- SBackgroundOU* pOUNeighbours[]) {
- SBackgroundOU* pOU_L = pOUNeighbours[0];
- SBackgroundOU* pOU_R = pOUNeighbours[1];
- SBackgroundOU* pOU_U = pOUNeighbours[2];
- SBackgroundOU* pOU_D = pOUNeighbours[3];
-
- if (pBackgroundOU->iMAD > pBackgroundOU->iMinSubMad << 1) {
- int32_t iMaxNbrForegroundMad;
- int32_t iMaxNbrBackgroundMad;
- int32_t aBackgroundMad[4];
- int32_t aForegroundMad[4];
-
- aForegroundMad[0] = (pOU_L->iBackgroundFlag - 1) & pOU_L->iMAD;
- aForegroundMad[1] = (pOU_R->iBackgroundFlag - 1) & pOU_R->iMAD;
- aForegroundMad[2] = (pOU_U->iBackgroundFlag - 1) & pOU_U->iMAD;
- aForegroundMad[3] = (pOU_D->iBackgroundFlag - 1) & pOU_D->iMAD;
- iMaxNbrForegroundMad = WELS_MAX (WELS_MAX (aForegroundMad[0], aForegroundMad[1]), WELS_MAX (aForegroundMad[2],
- aForegroundMad[3]));
-
- aBackgroundMad[0] = ((!pOU_L->iBackgroundFlag) - 1) & pOU_L->iMAD;
- aBackgroundMad[1] = ((!pOU_R->iBackgroundFlag) - 1) & pOU_R->iMAD;
- aBackgroundMad[2] = ((!pOU_U->iBackgroundFlag) - 1) & pOU_U->iMAD;
- aBackgroundMad[3] = ((!pOU_D->iBackgroundFlag) - 1) & pOU_D->iMAD;
- iMaxNbrBackgroundMad = WELS_MAX (WELS_MAX (aBackgroundMad[0], aBackgroundMad[1]), WELS_MAX (aBackgroundMad[2],
- aBackgroundMad[3]));
-
- return ((iMaxNbrForegroundMad > pBackgroundOU->iMinSubMad << 2) || (pBackgroundOU->iMAD > iMaxNbrBackgroundMad << 1
- && pBackgroundOU->iMAD <= (iMaxNbrForegroundMad * 3) >> 1));
- }
- return 0;
-}
-
-inline bool_t CBackgroundDetection::ForegroundDilation23Chroma (int8_t iNeighbourForegroundFlags,
- int32_t iStartSamplePos, int32_t iPicStrideUV, vBGDParam* pBgdParam) {
- static const int8_t kaOUPos[4] = {OU_LEFT, OU_RIGHT, OU_TOP, OU_BOTTOM};
- int32_t aEdgeOffset[4] = {0, BGD_OU_SIZE_UV - 1, 0, iPicStrideUV* (BGD_OU_SIZE_UV - 1)};
- int32_t iStride[4] = {iPicStrideUV, iPicStrideUV, 1, 1};
-
- // V component first, high probability because V stands for red color and human skin colors have more weight on this component
- for (int32_t i = 0; i < 4; i++) {
- if (iNeighbourForegroundFlags & kaOUPos[i]) {
- uint8_t* pRefC = pBgdParam->pRef[2] + iStartSamplePos + aEdgeOffset[i];
- uint8_t* pCurC = pBgdParam->pCur[2] + iStartSamplePos + aEdgeOffset[i];
- if (CalculateAsdChromaEdge (pRefC, pCurC, iStride[i]) > BGD_THD_ASD_UV) {
- return 1;
- }
- }
- }
- // U component, which stands for blue color, low probability
- for (int32_t i = 0; i < 4; i++) {
- if (iNeighbourForegroundFlags & kaOUPos[i]) {
- uint8_t* pRefC = pBgdParam->pRef[1] + iStartSamplePos + aEdgeOffset[i];
- uint8_t* pCurC = pBgdParam->pCur[1] + iStartSamplePos + aEdgeOffset[i];
- if (CalculateAsdChromaEdge (pRefC, pCurC, iStride[i]) > BGD_THD_ASD_UV) {
- return 1;
- }
- }
- }
-
- return 0;
-}
-
-inline void CBackgroundDetection::ForegroundDilation (SBackgroundOU* pBackgroundOU, SBackgroundOU* pOUNeighbours[],
- vBGDParam* pBgdParam, int32_t iChromaSampleStartPos) {
- int32_t iPicStrideUV = pBgdParam->iStride[1];
- int32_t iSumNeighBackgroundFlags = pOUNeighbours[0]->iBackgroundFlag + pOUNeighbours[1]->iBackgroundFlag +
- pOUNeighbours[2]->iBackgroundFlag + pOUNeighbours[3]->iBackgroundFlag;
-
- if (pBackgroundOU->iSAD > BGD_OU_SIZE * Q_FACTOR) {
- switch (iSumNeighBackgroundFlags) {
- case 0:
- case 1:
- pBackgroundOU->iBackgroundFlag = 0;
- break;
- case 2:
- case 3:
- pBackgroundOU->iBackgroundFlag = !ForegroundDilation23Luma (pBackgroundOU, pOUNeighbours);
-
- // chroma component check
- if (pBackgroundOU->iBackgroundFlag == 1) {
- int8_t iNeighbourForegroundFlags = !pOUNeighbours[0]->iBackgroundFlag | ((!pOUNeighbours[1]->iBackgroundFlag) << 1)
- | ((!pOUNeighbours[2]->iBackgroundFlag) << 2) | ((!pOUNeighbours[3]->iBackgroundFlag) << 3);
- pBackgroundOU->iBackgroundFlag = !ForegroundDilation23Chroma (iNeighbourForegroundFlags, iChromaSampleStartPos,
- iPicStrideUV, pBgdParam);
- }
- break;
- default:
- break;
- }
- }
-}
-inline void CBackgroundDetection::BackgroundErosion (SBackgroundOU* pBackgroundOU, SBackgroundOU* pOUNeighbours[]) {
- if (pBackgroundOU->iMaxDiffSubSd <= (BGD_OU_SIZE * Q_FACTOR)) { //BGD_OU_SIZE*BGD_OU_SIZE>>2
- int32_t iSumNeighBackgroundFlags = pOUNeighbours[0]->iBackgroundFlag + pOUNeighbours[1]->iBackgroundFlag +
- pOUNeighbours[2]->iBackgroundFlag + pOUNeighbours[3]->iBackgroundFlag;
- int32_t sumNbrBGsad = (pOUNeighbours[0]->iSAD & (-pOUNeighbours[0]->iBackgroundFlag)) + (pOUNeighbours[2]->iSAD &
- (-pOUNeighbours[2]->iBackgroundFlag))
- + (pOUNeighbours[1]->iSAD & (-pOUNeighbours[1]->iBackgroundFlag)) + (pOUNeighbours[3]->iSAD &
- (-pOUNeighbours[3]->iBackgroundFlag));
- if (pBackgroundOU->iSAD * iSumNeighBackgroundFlags <= (3 * sumNbrBGsad) >> 1) {
- if (iSumNeighBackgroundFlags == 4) {
- pBackgroundOU->iBackgroundFlag = 1;
- } else {
- if ((pOUNeighbours[0]->iBackgroundFlag & pOUNeighbours[1]->iBackgroundFlag)
- || (pOUNeighbours[2]->iBackgroundFlag & pOUNeighbours[3]->iBackgroundFlag)) {
- pBackgroundOU->iBackgroundFlag = !ForegroundDilation23Luma (pBackgroundOU, pOUNeighbours);
- }
- }
- }
- }
-}
-
-inline void CBackgroundDetection::SetBackgroundMbFlag (int8_t* pBackgroundMbFlag, int32_t iPicWidthInMb,
- int32_t iBackgroundMbFlag) {
- *pBackgroundMbFlag = iBackgroundMbFlag;
-}
-
-inline void CBackgroundDetection::UpperOUForegroundCheck (SBackgroundOU* pCurOU, int8_t* pBackgroundMbFlag,
- int32_t iPicWidthInOU, int32_t iPicWidthInMb) {
- if (pCurOU->iSAD > BGD_OU_SIZE * Q_FACTOR) {
- SBackgroundOU* pOU_L = pCurOU - 1;
- SBackgroundOU* pOU_R = pCurOU + 1;
- SBackgroundOU* pOU_U = pCurOU - iPicWidthInOU;
- SBackgroundOU* pOU_D = pCurOU + iPicWidthInOU;
- if (pOU_L->iBackgroundFlag + pOU_R->iBackgroundFlag + pOU_U->iBackgroundFlag + pOU_D->iBackgroundFlag <= 1) {
- SetBackgroundMbFlag (pBackgroundMbFlag, iPicWidthInMb, 0);
- pCurOU->iBackgroundFlag = 0;
- }
- }
-}
-
-void CBackgroundDetection::ForegroundDilationAndBackgroundErosion (vBGDParam* pBgdParam) {
- int32_t iPicStrideUV = pBgdParam->iStride[1];
- int32_t iPicWidthInOU = pBgdParam->iBgdWidth >> LOG2_BGD_OU_SIZE;
- int32_t iPicHeightInOU = pBgdParam->iBgdHeight >> LOG2_BGD_OU_SIZE;
- int32_t iOUStrideUV = iPicStrideUV << (LOG2_BGD_OU_SIZE - 1);
- int32_t iPicWidthInMb = (15 + pBgdParam->iBgdWidth) >> 4;
-
- SBackgroundOU* pBackgroundOU = pBgdParam->pOU_array;
- int8_t* pVaaBackgroundMbFlag = (int8_t*)pBgdParam->pBackgroundMbFlag;
- SBackgroundOU* pOUNeighbours[4];//0: left; 1: right; 2: top; 3: bottom
-
- pBackgroundOU = pBgdParam->pOU_array;
- pOUNeighbours[2] = pBackgroundOU;//top OU
- for (int32_t j = 0; j < iPicHeightInOU; j ++) {
- int8_t* pRowSkipFlag = pVaaBackgroundMbFlag;
- pOUNeighbours[0] = pBackgroundOU;//left OU
- pOUNeighbours[3] = pBackgroundOU + (iPicWidthInOU & ((j == iPicHeightInOU - 1) - 1)); //bottom OU
- for (int32_t i = 0; i < iPicWidthInOU; i++) {
- pOUNeighbours[1] = pBackgroundOU + (i < iPicWidthInOU - 1); //right OU
-
- if (pBackgroundOU->iBackgroundFlag)
- ForegroundDilation (pBackgroundOU, pOUNeighbours, pBgdParam, j * iOUStrideUV + (i << LOG2_BGD_OU_SIZE_UV));
- else
- BackgroundErosion (pBackgroundOU, pOUNeighbours);
-
- // check the up OU
- if (j > 1 && i > 0 && i < iPicWidthInOU - 1 && pOUNeighbours[2]->iBackgroundFlag == 1) {
- UpperOUForegroundCheck (pOUNeighbours[2], pRowSkipFlag - OU_SIZE_IN_MB * iPicWidthInMb, iPicWidthInOU, iPicWidthInMb);
- }
-
- SetBackgroundMbFlag (pRowSkipFlag, iPicWidthInMb, pBackgroundOU->iBackgroundFlag);
-
- // preparation for the next OU
- pRowSkipFlag += OU_SIZE_IN_MB;
- pOUNeighbours[0] = pBackgroundOU;
- pOUNeighbours[2]++;
- pOUNeighbours[3]++;
- pBackgroundOU++;
- }
- pOUNeighbours[2] = pBackgroundOU - iPicWidthInOU;
- pVaaBackgroundMbFlag += OU_SIZE_IN_MB * iPicWidthInMb;
- }
-}
-
-void CBackgroundDetection::BackgroundDetection (vBGDParam* pBgdParam) {
- // 1st step: foreground/background coarse division
- ForegroundBackgroundDivision (pBgdParam);
-
- // 2nd step: foreground dilation and background erosion
- ForegroundDilationAndBackgroundErosion (pBgdParam);
-}
-
-WELSVP_NAMESPACE_END
--- a/processing/src/backgounddetection/BackgroundDetection.h
+++ /dev/null
@@ -1,106 +1,0 @@
-/*!
- * \copy
- * Copyright (c) 2011-2013, Cisco Systems
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- * \file : BackgroundDetection.h
- *
- * \brief : background detection class of wels video processor class
- *
- * \date : 2011/03/17
- *
- * \description : 1. rewrite the package code of background detection class
- *
- */
-
-#ifndef WELSVP_BACKGROUNDDETECTION_H
-#define WELSVP_BACKGROUNDDETECTION_H
-
-#include "../common/util.h"
-#include "../common/memory.h"
-#include "../common/WelsFrameWork.h"
-#include "../../interface/IWelsVP.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-typedef struct {
- int32_t iBackgroundFlag;
- int32_t iSAD;
- int32_t iSD;
- int32_t iMAD;
- int32_t iMinSubMad;
- int32_t iMaxDiffSubSd;
-} SBackgroundOU;
-
-class CBackgroundDetection : public IStrategy {
- public:
- CBackgroundDetection (int32_t iCpuFlag);
- ~CBackgroundDetection();
-
- EResult Process (int32_t iType, SPixMap* pSrc, SPixMap* pRef);
- EResult Set (int32_t iType, void* pParam);
-
- private:
- struct vBGDParam {
- uint8_t* pCur[3];
- uint8_t* pRef[3];
- int32_t iBgdWidth;
- int32_t iBgdHeight;
- int32_t iStride[3];
- SBackgroundOU* pOU_array;
- int8_t* pBackgroundMbFlag;
- SVAACalcResult* pCalcRes;
- } m_BgdParam;
-
- int32_t m_iLargestFrameSize;
-
- private:
- inline SBackgroundOU* AllocateOUArrayMemory (int32_t iWidth, int32_t iHeight);
- inline void FreeOUArrayMemory();
- inline int32_t CalculateAsdChromaEdge (uint8_t* pOriRef, uint8_t* pOriCur, int32_t iStride);
- inline bool_t ForegroundDilation23Luma (SBackgroundOU* pBackgroundOU,
- SBackgroundOU* pOUNeighbours[]); //Foreground_Dilation_2_3_Luma
- inline bool_t ForegroundDilation23Chroma (int8_t iNeighbourForegroundFlags, int32_t iStartSamplePos,
- int32_t iPicStrideUV, vBGDParam* pBgdParam);//Foreground_Dilation_2_3_Chroma
- inline void ForegroundDilation (SBackgroundOU* pBackgroundOU, SBackgroundOU* pOUNeighbours[], vBGDParam* pBgdParam,
- int32_t iChromaSampleStartPos);
- inline void BackgroundErosion (SBackgroundOU* pBackgroundOU, SBackgroundOU* pOUNeighbours[]);
- inline void SetBackgroundMbFlag (int8_t* pBackgroundMbFlag, int32_t iPicWidthInMb, int32_t iBackgroundMbFlag);
- inline void UpperOUForegroundCheck (SBackgroundOU* pCurOU, int8_t* pBackgroundMbFlag, int32_t iPicWidthInOU,
- int32_t iPicWidthInMb);
-
- void GetOUParameters (SVAACalcResult* sVaaCalcInfo, int32_t iMbIndex, int32_t iMbWidth,
- SBackgroundOU* pBackgroundOU);
- void ForegroundBackgroundDivision (vBGDParam* pBgdParam);
- void ForegroundDilationAndBackgroundErosion (vBGDParam* pBgdParam);
- void BackgroundDetection (vBGDParam* pBgdParam);
-};
-
-WELSVP_NAMESPACE_END
-
-#endif
--- a/processing/src/common/WelsFrameWork.cpp
+++ /dev/null
@@ -1,301 +1,0 @@
-/*!
- * \copy
- * Copyright (c) 2013, Cisco Systems
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include "WelsFrameWork.h"
-#include "cpu.h"
-#include "../denoise/denoise.h"
-#include "../downsample/downsample.h"
-#include "../scenechangedetection/SceneChangeDetection.h"
-#include "../vaacalc/vaacalculation.h"
-#include "../backgounddetection/BackgroundDetection.h"
-#include "../adaptivequantization/AdaptiveQuantization.h"
-#include "../complexityanalysis/ComplexityAnalysis.h"
-#include "../imagerotate/imagerotate.h"
-
-
-/* interface API implement */
-
-EResult WELSAPI CreateVpInterface (void** ppCtx, int iVersion) {
- if (iVersion & 0x8000)
- return nsWelsVP::CreateSpecificVpInterface ((IWelsVP**)ppCtx);
- else if (iVersion & 0x7fff)
- return nsWelsVP::CreateSpecificVpInterface ((IWelsVPc**)ppCtx);
- else
- return RET_INVALIDPARAM;
-}
-
-EResult WELSAPI DestroyVpInterface (void* pCtx, int iVersion) {
- if (iVersion & 0x8000)
- return nsWelsVP::DestroySpecificVpInterface ((IWelsVP*)pCtx);
- else if (iVersion & 0x7fff)
- return nsWelsVP::DestroySpecificVpInterface ((IWelsVPc*)pCtx);
- else
- return RET_INVALIDPARAM;
-}
-
-WELSVP_NAMESPACE_BEGIN
-
-///////////////////////////////////////////////////////////////////////
-
-EResult CreateSpecificVpInterface (IWelsVP** ppCtx) {
- EResult eReturn = RET_FAILED;
-
- CVpFrameWork* pFr = new CVpFrameWork (1, eReturn);
- if (pFr) {
- *ppCtx = (IWelsVP*)pFr;
- eReturn = RET_SUCCESS;
- }
-
- return eReturn;
-}
-
-EResult DestroySpecificVpInterface (IWelsVP* pCtx) {
- _SafeDelete (pCtx);
-
- return RET_SUCCESS;
-}
-
-///////////////////////////////////////////////////////////////////////////////
-
-CVpFrameWork::CVpFrameWork (uint32_t uiThreadsNum, EResult& eReturn) {
- int32_t iCoreNum = 1;
-#ifndef X86_ASM
- uint32_t uiCPUFlag = 0;
-#else
- uint32_t uiCPUFlag = WelsCPUFeatureDetect (&iCoreNum);
-#endif
-
- for (int32_t i = 0; i < MAX_STRATEGY_NUM; i++) {
- IStrategy* pStrategy = m_pStgChain[i];
- pStrategy = CreateStrategy (WelsStaticCast (EMethods, i + 1), uiCPUFlag);
- m_pStgChain[i] = pStrategy;
- }
-
- WelsMutexInit (&m_mutes);
-
- eReturn = RET_SUCCESS;
-}
-
-CVpFrameWork::~CVpFrameWork() {
- for (int32_t i = 0; i < MAX_STRATEGY_NUM; i++) {
- if (m_pStgChain[i]) {
- Uninit (m_pStgChain[i]->m_eMethod);
- _SafeDelete (m_pStgChain[i]);
- }
- }
-
- WelsMutexDestroy (&m_mutes);
-}
-
-EResult CVpFrameWork::Init (int32_t iType, void* pCfg) {
- EResult eReturn = RET_SUCCESS;
- int32_t iCurIdx = WelsStaticCast (int32_t, WelsVpGetValidMethod (iType)) - 1;
-
- Uninit (iType);
-
- WelsMutexLock (&m_mutes);
-
- IStrategy* pStrategy = m_pStgChain[iCurIdx];
- if (pStrategy)
- eReturn = pStrategy->Init (0, pCfg);
-
- WelsMutexUnlock (&m_mutes);
-
- return eReturn;
-}
-
-EResult CVpFrameWork::Uninit (int32_t iType) {
- EResult eReturn = RET_SUCCESS;
- int32_t iCurIdx = WelsStaticCast (int32_t, WelsVpGetValidMethod (iType)) - 1;
-
- WelsMutexLock (&m_mutes);
-
- IStrategy* pStrategy = m_pStgChain[iCurIdx];
- if (pStrategy)
- eReturn = pStrategy->Uninit (0);
-
- WelsMutexUnlock (&m_mutes);
-
- return eReturn;
-}
-
-EResult CVpFrameWork::Flush (int32_t iType) {
- EResult eReturn = RET_SUCCESS;
-
- return eReturn;
-}
-
-EResult CVpFrameWork::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pDstPixMap) {
- EResult eReturn = RET_NOTSUPPORTED;
- EMethods eMethod = WelsVpGetValidMethod (iType);
- int32_t iCurIdx = WelsStaticCast (int32_t, eMethod) - 1;
- SPixMap sSrcPic;
- SPixMap sDstPic;
- memset (&sSrcPic, 0, sizeof (sSrcPic)); // confirmed_safe_unsafe_usage
- memset (&sDstPic, 0, sizeof (sDstPic)); // confirmed_safe_unsafe_usage
-
- if (pSrcPixMap) sSrcPic = *pSrcPixMap;
- if (pDstPixMap) sDstPic = *pDstPixMap;
- if (!CheckValid (eMethod, sSrcPic, sDstPic))
- return RET_INVALIDPARAM;
-
- WelsMutexLock (&m_mutes);
-
- IStrategy* pStrategy = m_pStgChain[iCurIdx];
- if (pStrategy)
- eReturn = pStrategy->Process (0, &sSrcPic, &sDstPic);
-
- WelsMutexUnlock (&m_mutes);
-
- return eReturn;
-}
-
-EResult CVpFrameWork::Get (int32_t iType, void* pParam) {
- EResult eReturn = RET_SUCCESS;
- int32_t iCurIdx = WelsStaticCast (int32_t, WelsVpGetValidMethod (iType)) - 1;
-
- if (!pParam)
- return RET_INVALIDPARAM;
-
- WelsMutexLock (&m_mutes);
-
- IStrategy* pStrategy = m_pStgChain[iCurIdx];
- if (pStrategy)
- eReturn = pStrategy->Get (0, pParam);
-
- WelsMutexUnlock (&m_mutes);
-
- return eReturn;
-}
-
-EResult CVpFrameWork::Set (int32_t iType, void* pParam) {
- EResult eReturn = RET_SUCCESS;
- int32_t iCurIdx = WelsStaticCast (int32_t, WelsVpGetValidMethod (iType)) - 1;
-
- if (!pParam)
- return RET_INVALIDPARAM;
-
- WelsMutexLock (&m_mutes);
-
- IStrategy* pStrategy = m_pStgChain[iCurIdx];
- if (pStrategy)
- eReturn = pStrategy->Set (0, pParam);
-
- WelsMutexUnlock (&m_mutes);
-
- return eReturn;
-}
-
-EResult CVpFrameWork::SpecialFeature (int32_t iType, void* pIn, void* pOut) {
- EResult eReturn = RET_SUCCESS;
-
- return eReturn;
-}
-
-bool_t CVpFrameWork::CheckValid (EMethods eMethod, SPixMap& pSrcPixMap, SPixMap& pDstPixMap) {
- bool_t eReturn = FALSE;
-
- if (eMethod == METHOD_NULL)
- goto exit;
-
- if (eMethod != METHOD_COLORSPACE_CONVERT) {
- if (pSrcPixMap.pPixel[0]) {
- if (pSrcPixMap.eFormat != VIDEO_FORMAT_I420 && pSrcPixMap.eFormat != VIDEO_FORMAT_YV12)
- goto exit;
- }
- if (pSrcPixMap.pPixel[0] && pDstPixMap.pPixel[0]) {
- if (pDstPixMap.eFormat != pSrcPixMap.eFormat)
- goto exit;
- }
- }
-
- if (pSrcPixMap.pPixel[0]) {
- if (pSrcPixMap.sRect.iRectWidth <= 0 || pSrcPixMap.sRect.iRectWidth > MAX_WIDTH || pSrcPixMap.sRect.iRectHeight <= 0
- || pSrcPixMap.sRect.iRectHeight > MAX_HEIGHT)
- goto exit;
- if (pSrcPixMap.sRect.iRectTop >= pSrcPixMap.sRect.iRectHeight
- || pSrcPixMap.sRect.iRectLeft >= pSrcPixMap.sRect.iRectWidth || pSrcPixMap.sRect.iRectWidth > pSrcPixMap.iStride[0])
- goto exit;
- }
- if (pDstPixMap.pPixel[0]) {
- if (pDstPixMap.sRect.iRectWidth <= 0 || pDstPixMap.sRect.iRectWidth > MAX_WIDTH || pDstPixMap.sRect.iRectHeight <= 0
- || pDstPixMap.sRect.iRectHeight > MAX_HEIGHT)
- goto exit;
- if (pDstPixMap.sRect.iRectTop >= pDstPixMap.sRect.iRectHeight
- || pDstPixMap.sRect.iRectLeft >= pDstPixMap.sRect.iRectWidth || pDstPixMap.sRect.iRectWidth > pDstPixMap.iStride[0])
- goto exit;
- }
- eReturn = TRUE;
-
-exit:
- return eReturn;
-}
-
-IStrategy* CVpFrameWork::CreateStrategy (EMethods m_eMethod, int32_t iCpuFlag) {
- IStrategy* pStrategy = NULL;
-
- switch (m_eMethod) {
- case METHOD_COLORSPACE_CONVERT:
- //not support yet
- break;
- case METHOD_DENOISE:
- pStrategy = WelsDynamicCast (IStrategy*, new CDenoiser (iCpuFlag));
- break;
- case METHOD_SCENE_CHANGE_DETECTION:
- pStrategy = WelsDynamicCast (IStrategy*, new CSceneChangeDetection (iCpuFlag));
- break;
- case METHOD_DOWNSAMPLE:
- pStrategy = WelsDynamicCast (IStrategy*, new CDownsampling (iCpuFlag));
- break;
- case METHOD_VAA_STATISTICS:
- pStrategy = WelsDynamicCast (IStrategy*, new CVAACalculation (iCpuFlag));
- break;
- case METHOD_BACKGROUND_DETECTION:
- pStrategy = WelsDynamicCast (IStrategy*, new CBackgroundDetection (iCpuFlag));
- break;
- case METHOD_ADAPTIVE_QUANT:
- pStrategy = WelsDynamicCast (IStrategy*, new CAdaptiveQuantization (iCpuFlag));
- break;
- case METHOD_COMPLEXITY_ANALYSIS:
- pStrategy = WelsDynamicCast (IStrategy*, new CComplexityAnalysis (iCpuFlag));
- break;
- case METHOD_IMAGE_ROTATE:
- pStrategy = WelsDynamicCast (IStrategy*, new CImageRotating (iCpuFlag));
- break;
- default:
- break;
- }
-
- return pStrategy;
-}
-
-WELSVP_NAMESPACE_END
--- a/processing/src/common/WelsFrameWork.h
+++ /dev/null
@@ -1,130 +1,0 @@
-/*!
- * \copy
- * Copyright (c) 2011-2013, Cisco Systems
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- * \file : WelsFrameWork.h
- *
- * \brief : framework of wels video processor class
- *
- * \date : 2011/01/04
- *
- * \description :
- *
- *************************************************************************************
- */
-
-#ifndef WELSVP_WELSFRAMEWORK_H
-#define WELSVP_WELSFRAMEWORK_H
-
-#include "../../interface/IWelsVP.h"
-#include "util.h"
-#include "thread.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-EResult CreateSpecificVpInterface (IWelsVP** ppCtx);
-EResult DestroySpecificVpInterface (IWelsVP* pCtx);
-
-EResult CreateSpecificVpInterface (IWelsVPc** ppCtx);
-EResult DestroySpecificVpInterface (IWelsVPc* pCtx);
-
-#define MAX_STRATEGY_NUM (METHOD_MASK - 1)
-
-class IStrategy : public IWelsVP {
- public:
- IStrategy() {
- m_eMethod = METHOD_NULL;
- m_eFormat = VIDEO_FORMAT_I420;
- m_iIndex = 0;
- m_bInit = FALSE;
- };
-
- virtual ~IStrategy() {}
-
- public:
- virtual EResult Init (int32_t iType, void* pCfg) {
- return RET_SUCCESS;
- }
- virtual EResult Uninit (int32_t iType) {
- return RET_SUCCESS;
- }
- virtual EResult Flush (int32_t iType) {
- return RET_SUCCESS;
- }
- virtual EResult Get (int32_t iType, void* pParam) {
- return RET_SUCCESS;
- }
- virtual EResult Set (int32_t iType, void* pParam) {
- return RET_SUCCESS;
- }
- virtual EResult SpecialFeature (int32_t iType, void* pIn, void* pOut) {
- return RET_SUCCESS;
- }
- virtual EResult Process (int32_t iType, SPixMap* pSrc, SPixMap* pDst) = 0;
-
- public:
- EMethods m_eMethod;
- EVideoFormat m_eFormat;
- int32_t m_iIndex;
- bool_t m_bInit;
-};
-
-class CVpFrameWork : public IWelsVP {
- public:
- CVpFrameWork (uint32_t uiThreadsNum, EResult& ret);
- ~CVpFrameWork();
-
- public:
- EResult Init (int32_t iType, void* pCfg);
-
- EResult Uninit (int32_t iType);
-
- EResult Flush (int32_t iType);
-
- EResult Process (int32_t iType, SPixMap* pSrc, SPixMap* pDst);
-
- EResult Get (int32_t iType, void* pParam);
-
- EResult Set (int32_t iType, void* pParam);
-
- EResult SpecialFeature (int32_t iType, void* pIn, void* pOut);
-
- private:
- bool_t CheckValid (EMethods eMethod, SPixMap& sSrc, SPixMap& sDst);
- IStrategy* CreateStrategy (EMethods eMethod, int32_t iCpuFlag);
-
- private:
- IStrategy* m_pStgChain[MAX_STRATEGY_NUM];
-
- WELS_MUTEX m_mutes;
-};
-
-WELSVP_NAMESPACE_END
-
-#endif
--- a/processing/src/common/WelsFrameWorkEx.cpp
+++ /dev/null
@@ -1,96 +1,0 @@
-/*!
- * \copy
- * Copyright (c) 2013, Cisco Systems
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include "WelsFrameWork.h"
-
-///////////////////////////////////////////////////////////////////////
-
-WELSVP_NAMESPACE_BEGIN
-
-EResult Init (void* pCtx, int32_t iType, void* pCfg) {
- return pCtx ? WelsStaticCast (IWelsVP*, pCtx)->Init (iType, pCfg) : RET_INVALIDPARAM;
-}
-EResult Uninit (void* pCtx, int32_t iType) {
- return pCtx ? WelsStaticCast (IWelsVP*, pCtx)->Uninit (iType) : RET_INVALIDPARAM;
-}
-EResult Flush (void* pCtx, int32_t iType) {
- return pCtx ? WelsStaticCast (IWelsVP*, pCtx)->Flush (iType) : RET_INVALIDPARAM;
-}
-EResult Process (void* pCtx, int32_t iType, SPixMap* pSrc, SPixMap* dst) {
- return pCtx ? WelsStaticCast (IWelsVP*, pCtx)->Process (iType, pSrc, dst) : RET_INVALIDPARAM;
-}
-EResult Get (void* pCtx, int32_t iType, void* pParam) {
- return pCtx ? WelsStaticCast (IWelsVP*, pCtx)->Get (iType, pParam) : RET_INVALIDPARAM;
-}
-EResult Set (void* pCtx, int32_t iType, void* pParam) {
- return pCtx ? WelsStaticCast (IWelsVP*, pCtx)->Set (iType, pParam) : RET_INVALIDPARAM;
-}
-EResult SpecialFeature (void* pCtx, int32_t iType, void* pIn, void* pOut) {
- return pCtx ? WelsStaticCast (IWelsVP*, pCtx)->SpecialFeature (iType, pIn, pOut) : RET_INVALIDPARAM;
-}
-
-///////////////////////////////////////////////////////////////////////////////
-
-EResult CreateSpecificVpInterface (IWelsVPc** pCtx) {
- EResult ret = RET_FAILED;
- IWelsVP* pWelsVP = NULL;
-
- ret = CreateSpecificVpInterface (&pWelsVP);
- if (ret == RET_SUCCESS) {
- IWelsVPc* pVPc = new IWelsVPc;
- if (pVPc) {
- pVPc->Init = Init;
- pVPc->Uninit = Uninit;
- pVPc->Flush = Flush;
- pVPc->Process = Process;
- pVPc->Get = Get;
- pVPc->Set = Set;
- pVPc->SpecialFeature = SpecialFeature;
- pVPc->pCtx = WelsStaticCast (void*, pWelsVP);
- *pCtx = pVPc;
- } else
- ret = RET_OUTOFMEMORY;
- }
-
- return ret;
-}
-
-EResult DestroySpecificVpInterface (IWelsVPc* pCtx) {
- if (pCtx) {
- DestroySpecificVpInterface (WelsStaticCast (IWelsVP*, pCtx->pCtx));
- _SafeDelete (pCtx);
- }
-
- return RET_SUCCESS;
-}
-
-WELSVP_NAMESPACE_END
binary files a/processing/src/common/WelsVP.aps /dev/null differ
--- a/processing/src/common/WelsVP.def
+++ /dev/null
@@ -1,36 +1,0 @@
-;*!
-;* \copy
-;* Copyright (c) 2011-2013, Cisco Systems
-;* All rights reserved.
-;*
-;* Redistribution and use in source and binary forms, with or without
-;* modification, are permitted provided that the following conditions
-;* are met:
-;*
-;* * Redistributions of source code must retain the above copyright
-;* notice, this list of conditions and the following disclaimer.
-;*
-;* * Redistributions in binary form must reproduce the above copyright
-;* notice, this list of conditions and the following disclaimer in
-;* the documentation and/or other materials provided with the
-;* distribution.
-;*
-;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;* POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-
-LIBRARY welsvp.dll
-EXPORTS
- CreateVpInterface PRIVATE
- DestroyVpInterface PRIVATE
\ No newline at end of file
--- a/processing/src/common/WelsVP.rc
+++ /dev/null
@@ -1,115 +1,0 @@
-// Microsoft Visual C++ generated resource script.
-//
-#include "resource.h"
-
-#define APSTUDIO_READONLY_SYMBOLS
-/////////////////////////////////////////////////////////////////////////////
-//
-// Generated from the TEXTINCLUDE 2 resource.
-//
-#include "windows.h"
-
-/////////////////////////////////////////////////////////////////////////////
-#undef APSTUDIO_READONLY_SYMBOLS
-
-/////////////////////////////////////////////////////////////////////////////
-// Chinese (P.R.C.) resources
-
-#if !defined(AFX_RESOURCE_DLL) || defined(AFX_TARG_CHS)
-#ifdef _WIN32
-LANGUAGE LANG_CHINESE, SUBLANG_CHINESE_SIMPLIFIED
-#pragma code_page(936)
-#endif //_WIN32
-
-#ifdef APSTUDIO_INVOKED
-/////////////////////////////////////////////////////////////////////////////
-//
-// TEXTINCLUDE
-//
-
-1 TEXTINCLUDE
-BEGIN
- "resource.h\0"
-END
-
-2 TEXTINCLUDE
-BEGIN
- "#include ""windows.h""\r\n"
- "\0"
-END
-
-3 TEXTINCLUDE
-BEGIN
- "\r\n"
- "\0"
-END
-
-#endif // APSTUDIO_INVOKED
-
-#endif // Chinese (P.R.C.) resources
-/////////////////////////////////////////////////////////////////////////////
-
-
-/////////////////////////////////////////////////////////////////////////////
-// English (U.S.) resources
-
-#if !defined(AFX_RESOURCE_DLL) || defined(AFX_TARG_ENU)
-#ifdef _WIN32
-LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_US
-#pragma code_page(1252)
-#endif //_WIN32
-
-/////////////////////////////////////////////////////////////////////////////
-//
-// Version
-//
-
-VS_VERSION_INFO VERSIONINFO
- FILEVERSION 0,0,0,0
- PRODUCTVERSION 0,0,0,0
- FILEFLAGSMASK 0x3fL
-#ifdef _DEBUG
- FILEFLAGS 0x1L
-#else
- FILEFLAGS 0x0L
-#endif
- FILEOS 0x40004L
- FILETYPE 0x2L
- FILESUBTYPE 0x0L
-BEGIN
- BLOCK "StringFileInfo"
- BEGIN
- BLOCK "040904b0"
- BEGIN
- VALUE "Comments", "Cisco OpenH264 video preprocessing"
- VALUE "CompanyName", "Cisco Systems"
- VALUE "FileDescription", "Cisco OpenH264 video preprocessing"
- VALUE "FileVersion", "0, 0, 0, 0"
- VALUE "InternalName", "welsvp.dll"
- VALUE "LegalCopyright", "� 2011-2015 Cisco and/or its affiliates. All rights reserved."
- VALUE "OriginalFilename", "welsvp.dll"
- VALUE "ProductName", "Cisco OpenH264 video preprocessing"
- VALUE "ProductVersion", "0, 0, 0, 0"
- END
- END
- BLOCK "VarFileInfo"
- BEGIN
- VALUE "Translation", 0x409, 1200
- END
-END
-
-#endif // English (U.S.) resources
-/////////////////////////////////////////////////////////////////////////////
-
-
-
-#ifndef APSTUDIO_INVOKED
-/////////////////////////////////////////////////////////////////////////////
-//
-// Generated from the TEXTINCLUDE 3 resource.
-//
-
-
-/////////////////////////////////////////////////////////////////////////////
-#endif // not APSTUDIO_INVOKED
-
--- a/processing/src/common/cpu.cpp
+++ /dev/null
@@ -1,196 +1,0 @@
-/*!
- * \copy
- * Copyright (c) 2009-2013, Cisco Systems
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- * \file cpu.c
- *
- * \brief CPU compatibility detection
- *
- * \date 04/29/2009 Created
- *
- *************************************************************************************
- */
-
-#include "util.h"
-#include "cpu.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-#define CPU_Vender_AMD "AuthenticAMD"
-#define CPU_Vender_INTEL "GenuineIntel"
-#define CPU_Vender_CYRIX "CyrixInstead"
-
-
-#if defined(X86_ASM)
-
-uint32_t WelsCPUFeatureDetect (int32_t* pNumberOfLogicProcessors) {
- uint32_t uiCPU = 0;
- uint32_t uiFeatureA = 0, uiFeatureB = 0, uiFeatureC = 0, uiFeatureD = 0;
- int32_t CacheLineSize = 0;
- int8_t chVenderName[16] = { 0 };
-
- if (!WelsCPUIdVerify()) {
- /* cpuid is not supported in cpu */
- return 0;
- }
-
- WelsCPUId (0, &uiFeatureA, (uint32_t*)&chVenderName[0], (uint32_t*)&chVenderName[8], (uint32_t*)&chVenderName[4]);
- if (uiFeatureA == 0) {
- /* maximum input value for basic cpuid information */
- return 0;
- }
-
- WelsCPUId (1, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
- if ((uiFeatureD & 0x00800000) == 0) {
- /* Basic MMX technology is not support in cpu, mean nothing for us so return here */
- return 0;
- }
-
- uiCPU = WELS_CPU_MMX;
- if (uiFeatureD & 0x02000000) {
- /* SSE technology is identical to AMD MMX extensions */
- uiCPU |= WELS_CPU_MMXEXT | WELS_CPU_SSE;
- }
- if (uiFeatureD & 0x04000000) {
- /* SSE2 support here */
- uiCPU |= WELS_CPU_SSE2;
- }
- if (uiFeatureD & 0x00000001) {
- /* x87 FPU on-chip checking */
- uiCPU |= WELS_CPU_FPU;
- }
- if (uiFeatureD & 0x00008000) {
- /* CMOV instruction checking */
- uiCPU |= WELS_CPU_CMOV;
- }
- if (!strcmp ((const str_t*)chVenderName, CPU_Vender_INTEL)) { // confirmed_safe_unsafe_usage
- if (uiFeatureD & 0x10000000) {
- /* Multi-Threading checking: contains of multiple logic processors */
- uiCPU |= WELS_CPU_HTT;
- }
- }
-
- if (uiFeatureC & 0x00000001) {
- /* SSE3 support here */
- uiCPU |= WELS_CPU_SSE3;
- }
- if (uiFeatureC & 0x00000200) {
- /* SSSE3 support here */
- uiCPU |= WELS_CPU_SSSE3;
- }
- if (uiFeatureC & 0x00080000) {
- /* SSE4.1 support here, 45nm Penryn processor */
- uiCPU |= WELS_CPU_SSE41;
- }
- if (uiFeatureC & 0x00100000) {
- /* SSE4.2 support here, next generation Nehalem processor */
- uiCPU |= WELS_CPU_SSE42;
- }
- if (WelsCPUSupportAVX (uiFeatureA, uiFeatureC)) { //
- /* AVX supported */
- uiCPU |= WELS_CPU_AVX;
- }
- if (WelsCPUSupportFMA (uiFeatureA, uiFeatureC)) { //
- /* AVX FMA supported */
- uiCPU |= WELS_CPU_FMA;
- }
- if (uiFeatureC & 0x02000000) {
- /* AES checking */
- uiCPU |= WELS_CPU_AES;
- }
- if (uiFeatureC & 0x00400000) {
- /* MOVBE checking */
- uiCPU |= WELS_CPU_MOVBE;
- }
-
- if (pNumberOfLogicProcessors != NULL) {
- // HTT enabled on chip
- *pNumberOfLogicProcessors = (uiFeatureB & 0x00ff0000) >> 16; // feature bits: 23-16 on returned EBX
- }
-
- WelsCPUId (0x80000000, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
-
- if ((!strcmp ((const str_t*)chVenderName, CPU_Vender_AMD))
- && (uiFeatureA >= 0x80000001)) { // confirmed_safe_unsafe_usage
- WelsCPUId (0x80000001, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
- if (uiFeatureD & 0x00400000) {
- uiCPU |= WELS_CPU_MMXEXT;
- }
- if (uiFeatureD & 0x80000000) {
- uiCPU |= WELS_CPU_3DNOW;
- }
- }
-
- if (!strcmp ((const str_t*)chVenderName, CPU_Vender_INTEL)) { // confirmed_safe_unsafe_usage
- int32_t family, model;
-
- WelsCPUId (1, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
- family = ((uiFeatureA >> 8) & 0xf) + ((uiFeatureA >> 20) & 0xff);
- model = ((uiFeatureA >> 4) & 0xf) + ((uiFeatureA >> 12) & 0xf0);
-
- if ((family == 6) && (model == 9 || model == 13 || model == 14)) {
- uiCPU &= ~ (WELS_CPU_SSE2 | WELS_CPU_SSE3);
- }
- }
-
- // get cache line size
- if ((!strcmp ((const str_t*)chVenderName, CPU_Vender_INTEL))
- || ! (strcmp ((const str_t*)chVenderName, CPU_Vender_CYRIX))) { // confirmed_safe_unsafe_usage
- WelsCPUId (1, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
-
- CacheLineSize = (uiFeatureB & 0xff00) >>
- 5; // ((clflush_line_size >> 8) << 3), CLFLUSH_line_size * 8 = CacheLineSize_in_byte
-
- if (CacheLineSize == 128) {
- uiCPU |= WELS_CPU_CACHELINE_128;
- } else if (CacheLineSize == 64) {
- uiCPU |= WELS_CPU_CACHELINE_64;
- } else if (CacheLineSize == 32) {
- uiCPU |= WELS_CPU_CACHELINE_32;
- } else if (CacheLineSize == 16) {
- uiCPU |= WELS_CPU_CACHELINE_16;
- }
- }
-
- return uiCPU;
-}
-
-
-void WelsCPURestore (const uint32_t kuiCPU) {
- if (kuiCPU & (WELS_CPU_MMX | WELS_CPU_MMXEXT | WELS_CPU_3DNOW | WELS_CPU_3DNOWEXT)) {
- WelsEmms();
- }
-}
-
-#endif
-
-
-WELSVP_NAMESPACE_END
-
-
--- a/processing/src/common/cpu.h
+++ /dev/null
@@ -1,102 +1,0 @@
-/*!
- * \copy
- * Copyright (c) 2009-2013, Cisco Systems
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- * \file cpu.h
- *
- * \brief CPU feature compatibility detection
- *
- * \date 04/29/2009 Created
- *
- *************************************************************************************
- */
-
-#ifndef WELSVP_CPU_H
-#define WELSVP_CPU_H
-
-#include "typedef.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-/*
- * WELS CPU feature flags
- */
-#define WELS_CPU_MMX 0x00000001 /* mmx */
-#define WELS_CPU_MMXEXT 0x00000002 /* mmx-ext*/
-#define WELS_CPU_SSE 0x00000004 /* sse */
-#define WELS_CPU_SSE2 0x00000008 /* sse 2 */
-#define WELS_CPU_SSE3 0x00000010 /* sse 3 */
-#define WELS_CPU_SSE41 0x00000020 /* sse 4.1 */
-#define WELS_CPU_3DNOW 0x00000040 /* 3dnow! */
-#define WELS_CPU_3DNOWEXT 0x00000080 /* 3dnow! ext */
-#define WELS_CPU_ALTIVEC 0x00000100 /* altivec */
-#define WELS_CPU_SSSE3 0x00000200 /* ssse3 */
-#define WELS_CPU_SSE42 0x00000400 /* sse 4.2 */
-
-/* CPU features application extensive */
-#define WELS_CPU_AVX 0x00000800 /* Advanced Vector eXtentions */
-#define WELS_CPU_FPU 0x00001000 /* x87-FPU on chip */
-#define WELS_CPU_HTT 0x00002000 /* Hyper-Threading Technology (HTT), Multi-threading enabled feature:
- physical processor package is capable of supporting more than one logic processor
- */
-#define WELS_CPU_CMOV 0x00004000 /* Conditional Move Instructions,
- also if x87-FPU is present at indicated by the CPUID.FPU feature bit, then FCOMI and FCMOV are supported
- */
-#define WELS_CPU_MOVBE 0x00008000 /* MOVBE instruction */
-#define WELS_CPU_AES 0x00010000 /* AES instruction extensions */
-#define WELS_CPU_FMA 0x00020000 /* AVX VEX FMA instruction sets */
-
-#define WELS_CPU_CACHELINE_16 0x10000000 /* CacheLine Size 16 */
-#define WELS_CPU_CACHELINE_32 0x20000000 /* CacheLine Size 32 */
-#define WELS_CPU_CACHELINE_64 0x40000000 /* CacheLine Size 64 */
-#define WELS_CPU_CACHELINE_128 0x80000000 /* CacheLine Size 128 */
-
-/*
- * Interfaces for CPU core feature detection as below
- */
-
-#ifdef X86_ASM
-WELSVP_EXTERN_C_BEGIN
-
-int32_t WelsCPUIdVerify();
-
-void WelsCPUId (uint32_t uiIndex, uint32_t* pFeatureA, uint32_t* pFeatureB, uint32_t* pFeatureC, uint32_t* pFeatureD);
-int32_t WelsCPUSupportAVX (uint32_t eax, uint32_t ecx);
-int32_t WelsCPUSupportFMA (uint32_t eax, uint32_t ecx);
-
-void WelsEmms();
-
-WELSVP_EXTERN_C_END
-#endif
-
-uint32_t WelsCPUFeatureDetect (int32_t* pNumberOfLogicProcessors);
-
-WELSVP_NAMESPACE_END
-
-#endif
--- a/processing/src/common/memory.cpp
+++ /dev/null
@@ -1,117 +1,0 @@
-/*!
- * \copy
- * Copyright (c) 2013, Cisco Systems
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include "memory.h"
-
-WELSVP_NAMESPACE_BEGIN
-/////////////////////////////////////////////////////////////////////////////////
-
-void* WelsMalloc (const uint32_t kuiSize, str_t* pTag) {
- const int32_t kiSizeVoidPointer = sizeof (void**);
- const int32_t kiSizeInt32 = sizeof (int32_t);
- const int32_t kiAlignedBytes = ALIGNBYTES - 1;
-
- uint8_t* pBuf = (uint8_t*) ::malloc (kuiSize + kiAlignedBytes + kiSizeVoidPointer + kiSizeInt32);
- uint8_t* pAlignedBuf = NULL;
-
- if (NULL == pBuf)
- return NULL;
-
- // to fill zero values
- WelsMemset (pBuf, 0, kuiSize + kiAlignedBytes + kiSizeVoidPointer + kiSizeInt32);
-
- pAlignedBuf = pBuf + kiAlignedBytes + kiSizeVoidPointer + kiSizeInt32;
- pAlignedBuf -= WelsCastFromPointer (pAlignedBuf) & kiAlignedBytes;
- * ((void**) (pAlignedBuf - kiSizeVoidPointer)) = pBuf;
- * ((int32_t*) (pAlignedBuf - (kiSizeVoidPointer + kiSizeInt32))) = kuiSize;
-
- return (pAlignedBuf);
-}
-
-/////////////////////////////////////////////////////////////////////////////
-
-void WelsFree (void* pPointer, str_t* pTag) {
- if (pPointer) {
- ::free (* (((void**) pPointer) - 1));
- }
-}
-
-/////////////////////////////////////////////////////////////////////////////
-
-void* InternalReallocate (void* pPointer, const uint32_t kuiSize, str_t* pTag) {
- uint32_t iOldSize = 0;
- uint8_t* pNew = NULL;
- if (pPointer != NULL)
- iOldSize = * ((int32_t*) ((uint8_t*) pPointer - sizeof (void**) - sizeof (int32_t)));
- else
- return WelsMalloc (kuiSize, pTag);
-
- pNew = (uint8_t*)WelsMalloc (kuiSize, pTag);
- if (0 == pNew) {
- if (iOldSize > 0 && kuiSize > 0 && iOldSize >= kuiSize)
- return (pPointer);
- return 0;
- } else if (iOldSize > 0 && kuiSize > 0)
- memcpy (pNew, pPointer, (iOldSize < kuiSize) ? iOldSize : kuiSize);
- else
- return 0;
-
- WelsFree (pPointer, pTag);
- return (pNew);
-}
-
-/////////////////////////////////////////////////////////////////////////////
-
-void* WelsRealloc (void* pPointer, uint32_t* pRealSize, const uint32_t kuiSize, str_t* pTag) {
- const uint32_t kuiOldSize = *pRealSize;
- uint32_t kuiNewSize = 0;
- void* pLocalPointer = NULL;
- if (kuiOldSize >= kuiSize) // large enough of original block, so do nothing
- return (pPointer);
-
- // new request
- kuiNewSize = kuiSize + 15;
- kuiNewSize -= (kuiNewSize & 15);
- kuiNewSize += 32;
-
- pLocalPointer = InternalReallocate (pPointer, kuiNewSize, pTag);
- if (NULL != pLocalPointer) {
- *pRealSize = kuiNewSize;
- return (pLocalPointer);
- } else {
- return NULL;
- }
-
- return NULL; // something wrong
-}
-
-WELSVP_NAMESPACE_END
--- a/processing/src/common/memory.h
+++ /dev/null
@@ -1,110 +1,0 @@
-/*!
- * \copy
- * Copyright (c) 2011-2013, Cisco Systems
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- * \file : memory.h
- *
- * \brief : memory definition for wels video processor class
- *
- * \date : 2011/02/22
- *
- * \description :
- *
- *************************************************************************************
- */
-
-#ifndef WELSVP_MEMORY_H
-#define WELSVP_MEMORY_H
-
-#include "util.h"
-#include "typedef.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-inline_t void* WelsMemset (void* pPointer, int32_t iValue, uint32_t uiSize) {
- return ::memset (pPointer, iValue, uiSize);
-}
-
-inline_t void* WelsMemcpy (void* pDst, const void* kpSrc, uint32_t uiSize) {
- return ::memcpy (pDst, kpSrc, uiSize);
-}
-
-inline_t int32_t WelsMemcmp (const void* kpBuf1, const void* kpBuf2, uint32_t uiSize) {
- return ::memcmp (kpBuf1, kpBuf2, uiSize);
-}
-
-/*!
-*************************************************************************************
-* \brief malloc with zero filled utilization in Wels
-*
-* \param i_size uiSize of memory block required
-*
-* \return allocated memory pointer exactly, failed in case of NULL return
-*
-* \note N/A
-*************************************************************************************
-*/
-void* WelsMalloc (const uint32_t kuiSize, str_t* pTag = NULL);
-
-/*!
-*************************************************************************************
-* \brief free utilization in Wels
-*
-* \param p data pointer to be free.
-* i.e, uint8_t *p = actual data to be free, argv = &p.
-*
-* \return NONE
-*
-* \note N/A
-*************************************************************************************
-*/
-void WelsFree (void* pPointer, str_t* pTag = NULL);
-
-/*!
-*************************************************************************************
-* \brief reallocation in Wels. Do nothing and continue using old block
-* in case the block is large enough currently
-*
-* \param p memory block required in old time
-* \param i_size new uiSize of memory block requested
-* \param sz_real pointer to the old uiSize of memory block
-*
-* \return reallocated memory pointer exactly, failed in case of NULL return
-*
-* \note N/A
-*************************************************************************************
-*/
-void* WelsRealloc (void* pPointer, uint32_t* pRealSize, const uint32_t kuiSize, str_t* pTag = NULL);
-
-//////////////////////////////////////////////////////////////////////////////////////
-WELSVP_NAMESPACE_END
-
-#endif
-
-
--- a/processing/src/common/resource.h
+++ /dev/null
@@ -1,15 +1,0 @@
-//{{NO_DEPENDENCIES}}
-// Microsoft Visual C++ generated include file.
-// Used by WelsVP.rc
-//
-
-// Next default values for new objects
-//
-#ifdef APSTUDIO_INVOKED
-#ifndef APSTUDIO_READONLY_SYMBOLS
-#define _APS_NEXT_RESOURCE_VALUE 101
-#define _APS_NEXT_COMMAND_VALUE 40001
-#define _APS_NEXT_CONTROL_VALUE 1000
-#define _APS_NEXT_SYMED_VALUE 101
-#endif
-#endif
--- a/processing/src/common/thread.cpp
+++ /dev/null
@@ -1,93 +1,0 @@
-/*!
- * \copy
- * Copyright (c) 2009-2013, Cisco Systems
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- * \file thread.cpp
- *
- * \brief Interfaces introduced in thread programming
- *
- * \date 11/17/2009 Created
- *
- *************************************************************************************
- */
-
-#include "thread.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-#if defined(_WIN32)
-
-WELS_THREAD_ERROR_CODE WelsMutexInit (WELS_MUTEX* mutex) {
- InitializeCriticalSection (mutex);
-
- return WELS_THREAD_ERROR_OK;
-}
-
-WELS_THREAD_ERROR_CODE WelsMutexLock (WELS_MUTEX* mutex) {
- EnterCriticalSection (mutex);
-
- return WELS_THREAD_ERROR_OK;
-}
-
-WELS_THREAD_ERROR_CODE WelsMutexUnlock (WELS_MUTEX* mutex) {
- LeaveCriticalSection (mutex);
-
- return WELS_THREAD_ERROR_OK;
-}
-
-WELS_THREAD_ERROR_CODE WelsMutexDestroy (WELS_MUTEX* mutex) {
- DeleteCriticalSection (mutex);
-
- return WELS_THREAD_ERROR_OK;
-}
-
-#elif defined(__GNUC__)
-
-WELS_THREAD_ERROR_CODE WelsMutexInit (WELS_MUTEX* mutex) {
- return pthread_mutex_init (mutex, NULL);
-}
-
-WELS_THREAD_ERROR_CODE WelsMutexLock (WELS_MUTEX* mutex) {
- return pthread_mutex_lock (mutex);
-}
-
-WELS_THREAD_ERROR_CODE WelsMutexUnlock (WELS_MUTEX* mutex) {
- return pthread_mutex_unlock (mutex);
-}
-
-WELS_THREAD_ERROR_CODE WelsMutexDestroy (WELS_MUTEX* mutex) {
- return pthread_mutex_destroy (mutex);
-}
-
-#endif
-
-WELSVP_NAMESPACE_END
-
-
-
--- a/processing/src/common/thread.h
+++ /dev/null
@@ -1,89 +1,0 @@
-/*!
- * \copy
- * Copyright (c) 2009-2013, Cisco Systems
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- * \file thread.h
- *
- * \brief Interfaces introduced in thread programming
- *
- * \date 11/17/2009 Created
- *
- *************************************************************************************
- */
-
-#ifndef WELSVP_THREAD_H
-#define WELSVP_THREAD_H
-
-#include "typedef.h"
-
-#if defined(_WIN32)
-
-#include <windows.h>
-
-#elif defined(__GNUC__)
-
-#include <stdlib.h>
-#include <unistd.h>
-#include <pthread.h>
-#include <semaphore.h>
-#include <signal.h>
-#include <errno.h>
-
-#endif//WIN32
-
-WELSVP_NAMESPACE_BEGIN
-
-#if defined(_WIN32)
-
-typedef HANDLE WELS_THREAD_HANDLE;
-typedef CRITICAL_SECTION WELS_MUTEX;
-
-#elif defined(__GNUC__)
-
-typedef pthread_t WELS_THREAD_HANDLE;
-typedef pthread_mutex_t WELS_MUTEX;
-
-#endif
-
-typedef long_t WELS_THREAD_ERROR_CODE;
-
-#define WELS_THREAD_ERROR_OK 0
-#define WELS_THREAD_ERROR_GENERIAL ((unsigned long)(-1))
-#define WELS_THREAD_ERROR_WAIT_OBJECT_0 0
-#define WELS_THREAD_ERROR_WAIT_TIMEOUT ((unsigned long)0x00000102L)
-#define WELS_THREAD_ERROR_WAIT_FAILED WELS_THREAD_ERROR_GENERIAL
-
-WELS_THREAD_ERROR_CODE WelsMutexInit (WELS_MUTEX* mutex);
-WELS_THREAD_ERROR_CODE WelsMutexLock (WELS_MUTEX* mutex);
-WELS_THREAD_ERROR_CODE WelsMutexUnlock (WELS_MUTEX* mutex);
-WELS_THREAD_ERROR_CODE WelsMutexDestroy (WELS_MUTEX* mutex);
-
-WELSVP_NAMESPACE_END
-
-#endif
--- a/processing/src/common/typedef.h
+++ /dev/null
@@ -1,102 +1,0 @@
-/*!
- * \copy
- * Copyright (c) 2011-2013, Cisco Systems
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- * \file : typedef.h
- *
- * \brief : basic type definition
- *
- * \date : 2011/01/04
- *
- * \description : 1. Define basic type with platform-independent;
- * 2. Define specific namespace to avoid name pollution;
- * 3. C++ ONLY;
- *
- *************************************************************************************
- */
-
-#ifndef WELSVP_TYPEDEF_H
-#define WELSVP_TYPEDEF_H
-
-#define WELSVP_EXTERN_C_BEGIN extern "C" {
-#define WELSVP_EXTERN_C_END }
-
-#define WELSVP_NAMESPACE_BEGIN namespace nsWelsVP {
-#define WELSVP_NAMESPACE_END }
-
-WELSVP_NAMESPACE_BEGIN
-
-#if ( defined(_WIN32) || defined(_WIN32) ) && defined(_MSC_VER)
-
-typedef char int8_t ;
-typedef unsigned char uint8_t ;
-typedef short int16_t ;
-typedef unsigned short uint16_t ;
-typedef int int32_t ;
-typedef unsigned int uint32_t ;
-typedef __int64 int64_t ;
-typedef unsigned __int64 uint64_t ;
-#define inline_t _inline
-
-#else // GCC
-
-typedef signed char int8_t
-; // [comment]: some compilers may identify the type "char" as "unsigned char" as default, so declare it explicit
-typedef unsigned char uint8_t ;
-typedef signed short int16_t ;
-typedef unsigned short uint16_t ;
-typedef signed int int32_t ;
-typedef unsigned int uint32_t ;
-typedef long long int64_t ;
-typedef unsigned long long uint64_t ;
-#define inline_t inline
-
-#endif
-
-typedef char str_t ; // [comment]: specific use plain char only for character parameters
-typedef long long_t ;
-typedef int32_t bool_t ;
-
-#if defined(_WIN32) || defined(_MACH_PLATFORM) || defined(__GNUC__)
-typedef float float_t ;
-typedef double double_t ;
-#endif
-
-#ifndef NULL
-#define NULL 0
-#endif
-
-enum {
- FALSE = 0,
- TRUE = !FALSE
-};
-
-WELSVP_NAMESPACE_END
-
-#endif
--- a/processing/src/common/util.cpp
+++ /dev/null
@@ -1,45 +1,0 @@
-/*!
- * \copy
- * Copyright (c) 2013, Cisco Systems
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include "util.h"
-
-WELSVP_NAMESPACE_BEGIN
-/////////////////////////////////////////////////////////////////////////////////
-
-
-int32_t WelsStrCmp (const str_t* kpStr1, const str_t* kpStr2) {
- return ::strcmp (kpStr1, kpStr2);
-}
-
-
-/////////////////////////////////////////////////////////////////////////////////
-WELSVP_NAMESPACE_END
--- a/processing/src/common/util.h
+++ /dev/null
@@ -1,107 +1,0 @@
-/*!
- * \copy
- * Copyright (c) 2011-2013, Cisco Systems
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- * \file : util.h
- *
- * \brief : utils for wels video processor class
- *
- * \date : 2011/01/04
- *
- * \description :
- *
- *************************************************************************************
- */
-
-#ifndef WELSVP_UTIL_H
-#define WELSVP_UTIL_H
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <stdarg.h>
-#include <assert.h>
-
-#include "typedef.h"
-#include "memory.h"
-#include "../../interface/IWelsVP.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-#define MAX_WIDTH (4096)
-#define MAX_HEIGHT (2304)//MAX_FS_LEVEL51 (36864); MAX_FS_LEVEL51*256/4096 = 2304
-#define MB_WIDTH_LUMA (16)
-#define PESN (1e-6) // desired float precision
-
-#define MB_TYPE_INTRA4x4 0x00000001
-#define MB_TYPE_INTRA16x16 0x00000002
-#define MB_TYPE_INTRA_PCM 0x00000004
-#define MB_TYPE_INTRA (MB_TYPE_INTRA4x4 | MB_TYPE_INTRA16x16 | MB_TYPE_INTRA_PCM)
-#define IS_INTRA(type) ((type)&MB_TYPE_INTRA)
-
-#define WELS_MAX(x, y) ((x) > (y) ? (x) : (y))
-#define WELS_MIN(x, y) ((x) < (y) ? (x) : (y))
-#define WELS_SIGN(a) ((long_t)(a) >> 31)
-#define WELS_ABS(a) ((WELS_SIGN(a) ^ (long_t)(a)) - WELS_SIGN(a))
-#define WELS_CLAMP(x, minv, maxv) WELS_MIN(WELS_MAX(x, minv), maxv)
-
-#define ALIGNBYTES (16) /* Worst case is requiring alignment to an 16 byte boundary */
-#define WELS_ALIGN(iInput) ((iInput+(ALIGNMENT-1)) & ~(ALIGNMENT-1))
-#define WELS_ALIGN2(iInput) ((iInput+1) & ~1)
-#define WELS_ALIGN4(iInput) ((iInput+3) & ~3)
-#define WELS_ALIGN8(iInput) ((iInput+7) & ~7)
-
-#define WelsCastFromPointer(p) (reinterpret_cast<long_t>(p))
-#define WelsStaticCast(type, p) (static_cast<type>(p))
-#define WelsDynamicCast(type, p) (dynamic_cast<type>(p))
-
-#define GET_METHOD(x) ((x) & 0xff) // mask method as the lowest 8bits
-#define GET_SPECIAL(x) (((x) >> 8) & 0xff) // mask special flag as 8bits
-
-inline_t EMethods WelsVpGetValidMethod (int32_t a) {
- int32_t iMethod = GET_METHOD (a);
- return WelsStaticCast (EMethods, WELS_CLAMP (iMethod, METHOD_NULL + 1, METHOD_MASK - 1));
-}
-
-
-#define _SafeFree(p) if (p) { WelsFree(p); (p) = NULL; }
-#define _SafeDelete(p) if (p) { delete (p); (p) = NULL; }
-
-
-//////////////////////////////////////////////////////////////////////////////////////
-
-int32_t WelsStrCmp (const str_t* kpStr1, const str_t* kpStr2);
-
-
-//////////////////////////////////////////////////////////////////////////////////////
-WELSVP_NAMESPACE_END
-
-#endif
-
-
--- a/processing/src/complexityanalysis/ComplexityAnalysis.cpp
+++ /dev/null
@@ -1,304 +1,0 @@
-/*!
- * \copy
- * Copyright (c) 2013, Cisco Systems
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include "ComplexityAnalysis.h"
-#include "../common/cpu.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-CComplexityAnalysis::CComplexityAnalysis (int32_t iCpuFlag) {
- m_eMethod = METHOD_COMPLEXITY_ANALYSIS;
- m_pfGomSad = NULL;
- WelsMemset (&m_sComplexityAnalysisParam, 0, sizeof (m_sComplexityAnalysisParam));
-}
-
-CComplexityAnalysis::~CComplexityAnalysis() {
-}
-
-EResult CComplexityAnalysis::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pRefPixMap) {
- EResult eReturn = RET_SUCCESS;
-
- switch (m_sComplexityAnalysisParam.iComplexityAnalysisMode) {
- case FRAME_SAD:
- AnalyzeFrameComplexityViaSad (pSrcPixMap, pRefPixMap);
- break;
- case GOM_SAD:
- AnalyzeGomComplexityViaSad (pSrcPixMap, pRefPixMap);
- break;
- case GOM_VAR:
- AnalyzeGomComplexityViaVar (pSrcPixMap, pRefPixMap);
- break;
- default:
- eReturn = RET_INVALIDPARAM;
- break;
- }
-
- return eReturn;
-}
-
-
-EResult CComplexityAnalysis::Set (int32_t iType, void* pParam) {
- if (pParam == NULL) {
- return RET_INVALIDPARAM;
- }
-
- m_sComplexityAnalysisParam = * (SComplexityAnalysisParam*)pParam;
-
- return RET_SUCCESS;
-}
-
-EResult CComplexityAnalysis::Get (int32_t iType, void* pParam) {
- if (pParam == NULL) {
- return RET_INVALIDPARAM;
- }
-
- SComplexityAnalysisParam* sComplexityAnalysisParam = (SComplexityAnalysisParam*)pParam;
-
- sComplexityAnalysisParam->iFrameComplexity = m_sComplexityAnalysisParam.iFrameComplexity;
-
- return RET_SUCCESS;
-}
-
-
-///////////////////////////////////////////////////////////////////////////////////////////////
-void CComplexityAnalysis::AnalyzeFrameComplexityViaSad (SPixMap* pSrcPixMap, SPixMap* pRefPixMap) {
- SVAACalcResult* pVaaCalcResults = NULL;
- pVaaCalcResults = m_sComplexityAnalysisParam.pCalcResult;
-
- m_sComplexityAnalysisParam.iFrameComplexity = pVaaCalcResults->iFrameSad;
-
- if (m_sComplexityAnalysisParam.iCalcBgd) { //BGD control
- m_sComplexityAnalysisParam.iFrameComplexity = (int32_t)GetFrameSadExcludeBackground (pSrcPixMap, pRefPixMap);
- }
-}
-
-int32_t CComplexityAnalysis::GetFrameSadExcludeBackground (SPixMap* pSrcPixMap, SPixMap* pRefPixMap) {
- int32_t iWidth = pSrcPixMap->sRect.iRectWidth;
- int32_t iHeight = pSrcPixMap->sRect.iRectHeight;
- int32_t iMbWidth = iWidth >> 4;
- int32_t iMbHeight = iHeight >> 4;
- int32_t iMbNum = iMbWidth * iMbHeight;
-
- int32_t iMbNumInGom = m_sComplexityAnalysisParam.iMbNumInGom;
- int32_t iGomMbNum = (iMbNum + iMbNumInGom - 1) / iMbNumInGom;
- int32_t iGomMbStartIndex = 0, iGomMbEndIndex = 0;
-
- uint8_t* pBackgroundMbFlag = (uint8_t*)m_sComplexityAnalysisParam.pBackgroundMbFlag;
- uint32_t* uiRefMbType = (uint32_t*)m_sComplexityAnalysisParam.uiRefMbType;
- SVAACalcResult* pVaaCalcResults = m_sComplexityAnalysisParam.pCalcResult;
- int32_t* pGomForegroundBlockNum = m_sComplexityAnalysisParam.pGomForegroundBlockNum;
-
- uint32_t uiFrameSad = 0;
- for (int32_t j = 0; j < iGomMbNum; j ++) {
- iGomMbStartIndex = j * iMbNumInGom;
- iGomMbEndIndex = WELS_MIN ((j + 1) * iMbNumInGom, iMbNum);
-
- for (int32_t i = iGomMbStartIndex; i < iGomMbEndIndex; i ++) {
- if (pBackgroundMbFlag[i] == 0 || IS_INTRA (uiRefMbType[i])) {
- pGomForegroundBlockNum[j]++;
- uiFrameSad += pVaaCalcResults->pSad8x8[i][0];
- uiFrameSad += pVaaCalcResults->pSad8x8[i][1];
- uiFrameSad += pVaaCalcResults->pSad8x8[i][2];
- uiFrameSad += pVaaCalcResults->pSad8x8[i][3];
- }
- }
- }
-
- return (uiFrameSad);
-}
-
-
-void InitGomSadFunc (PGOMSadFunc& pfGomSad, uint8_t iCalcBgd) {
- pfGomSad = GomSampleSad;
-
- if (iCalcBgd) {
- pfGomSad = GomSampleSadExceptBackground;
- }
-}
-
-void GomSampleSad (uint32_t* pGomSad, int32_t* pGomForegroundBlockNum, int32_t* pSad8x8, uint8_t pBackgroundMbFlag) {
- (*pGomForegroundBlockNum) ++;
- *pGomSad += pSad8x8[0];
- *pGomSad += pSad8x8[1];
- *pGomSad += pSad8x8[2];
- *pGomSad += pSad8x8[3];
-}
-
-void GomSampleSadExceptBackground (uint32_t* pGomSad, int32_t* pGomForegroundBlockNum, int32_t* pSad8x8,
- uint8_t pBackgroundMbFlag) {
- if (pBackgroundMbFlag == 0) {
- (*pGomForegroundBlockNum) ++;
- *pGomSad += pSad8x8[0];
- *pGomSad += pSad8x8[1];
- *pGomSad += pSad8x8[2];
- *pGomSad += pSad8x8[3];
- }
-}
-
-void CComplexityAnalysis::AnalyzeGomComplexityViaSad (SPixMap* pSrcPixMap, SPixMap* pRefPixMap) {
- int32_t iWidth = pSrcPixMap->sRect.iRectWidth;
- int32_t iHeight = pSrcPixMap->sRect.iRectHeight;
- int32_t iMbWidth = iWidth >> 4;
- int32_t iMbHeight = iHeight >> 4;
- int32_t iMbNum = iMbWidth * iMbHeight;
-
- int32_t iMbNumInGom = m_sComplexityAnalysisParam.iMbNumInGom;
- int32_t iGomMbNum = (iMbNum + iMbNumInGom - 1) / iMbNumInGom;
-
- int32_t iGomMbStartIndex = 0, iGomMbEndIndex = 0, iGomMbRowNum = 0;
- int32_t iMbStartIndex = 0, iMbEndIndex = 0;
- int32_t iStartSampleIndex = 0;
-
- uint8_t* pBackgroundMbFlag = (uint8_t*)m_sComplexityAnalysisParam.pBackgroundMbFlag;
- uint32_t* uiRefMbType = (uint32_t*)m_sComplexityAnalysisParam.uiRefMbType;
- SVAACalcResult* pVaaCalcResults = m_sComplexityAnalysisParam.pCalcResult;
- int32_t* pGomForegroundBlockNum = (int32_t*)m_sComplexityAnalysisParam.pGomForegroundBlockNum;
- int32_t* pGomComplexity = (int32_t*)m_sComplexityAnalysisParam.pGomComplexity;
-
- uint8_t* pRefY = NULL, *pSrcY = NULL;
- int32_t iRefStride = 0, iCurStride = 0;
-
- uint8_t* pRefTmp = NULL, *pCurTmp = NULL;
- uint32_t uiGomSad = 0, uiFrameSad = 0;
-
- pRefY = (uint8_t*)pRefPixMap->pPixel[0];
- pSrcY = (uint8_t*)pSrcPixMap->pPixel[0];
-
- iRefStride = pRefPixMap->iStride[0];
- iCurStride = pSrcPixMap->iStride[0];
-
- InitGomSadFunc (m_pfGomSad, m_sComplexityAnalysisParam.iCalcBgd);
-
- for (int32_t j = 0; j < iGomMbNum; j ++) {
- uiGomSad = 0;
-
- iGomMbStartIndex = j * iMbNumInGom;
- iGomMbEndIndex = WELS_MIN ((j + 1) * iMbNumInGom, iMbNum);
- iGomMbRowNum = (iGomMbEndIndex + iMbWidth - 1) / iMbWidth - iGomMbStartIndex / iMbWidth;
-
- iMbStartIndex = iGomMbStartIndex;
- iMbEndIndex = WELS_MIN ((iMbStartIndex / iMbWidth + 1) * iMbWidth, iGomMbEndIndex);
-
- iStartSampleIndex = (iMbStartIndex / iMbWidth) * MB_WIDTH_LUMA * iRefStride + (iMbStartIndex % iMbWidth) *
- MB_WIDTH_LUMA;
-
- do {
- pRefTmp = pRefY + iStartSampleIndex;
- pCurTmp = pSrcY + iStartSampleIndex;
-
- for (int32_t i = iMbStartIndex; i < iMbEndIndex; i ++) {
- m_pfGomSad (&uiGomSad, pGomForegroundBlockNum + j, pVaaCalcResults->pSad8x8[i], pBackgroundMbFlag[i]
- && !IS_INTRA (uiRefMbType[i]));
- }
-
- iMbStartIndex = iMbEndIndex;
- iMbEndIndex = WELS_MIN (iMbEndIndex + iMbWidth , iGomMbEndIndex);
-
- iStartSampleIndex = (iMbStartIndex / iMbWidth) * MB_WIDTH_LUMA * iRefStride + (iMbStartIndex % iMbWidth) *
- MB_WIDTH_LUMA;
-
- } while (--iGomMbRowNum);
-
- pGomComplexity[j] = uiGomSad;
- uiFrameSad += pGomComplexity[j];
- }
-
- m_sComplexityAnalysisParam.iFrameComplexity = uiFrameSad;
-}
-
-
-void CComplexityAnalysis::AnalyzeGomComplexityViaVar (SPixMap* pSrcPixMap, SPixMap* pRefPixMap) {
- int32_t iWidth = pSrcPixMap->sRect.iRectWidth;
- int32_t iHeight = pSrcPixMap->sRect.iRectHeight;
- int32_t iMbWidth = iWidth >> 4;
- int32_t iMbHeight = iHeight >> 4;
- int32_t iMbNum = iMbWidth * iMbHeight;
-
- int32_t iMbNumInGom = m_sComplexityAnalysisParam.iMbNumInGom;
- int32_t iGomMbNum = (iMbNum + iMbNumInGom - 1) / iMbNumInGom;
- int32_t iGomSampleNum = 0;
-
- int32_t iGomMbStartIndex = 0, iGomMbEndIndex = 0, iGomMbRowNum = 0;
- int32_t iMbStartIndex = 0, iMbEndIndex = 0;
- int32_t iStartSampleIndex = 0;
-
- SVAACalcResult* pVaaCalcResults = m_sComplexityAnalysisParam.pCalcResult;
- int32_t* pGomComplexity = (int32_t*)m_sComplexityAnalysisParam.pGomComplexity;
-
- uint8_t* pSrcY = NULL;
- int32_t iCurStride = 0;
-
- uint8_t* pCurTmp = NULL;
- uint32_t uiSampleSum = 0, uiSquareSum = 0;
-
- pSrcY = (uint8_t*)pSrcPixMap->pPixel[0];
- iCurStride = pSrcPixMap->iStride[0];
-
- for (int32_t j = 0; j < iGomMbNum; j ++) {
- uiSampleSum = 0;
- uiSquareSum = 0;
-
- iGomMbStartIndex = j * iMbNumInGom;
- iGomMbEndIndex = WELS_MIN ((j + 1) * iMbNumInGom, iMbNum);
- iGomMbRowNum = (iGomMbEndIndex + iMbWidth - 1) / iMbWidth - iGomMbStartIndex / iMbWidth;
-
- iMbStartIndex = iGomMbStartIndex;
- iMbEndIndex = WELS_MIN ((iMbStartIndex / iMbWidth + 1) * iMbWidth, iGomMbEndIndex);
-
- iStartSampleIndex = (iMbStartIndex / iMbWidth) * MB_WIDTH_LUMA * iCurStride + (iMbStartIndex % iMbWidth) *
- MB_WIDTH_LUMA;
- iGomSampleNum = (iMbEndIndex - iMbStartIndex) * MB_WIDTH_LUMA * MB_WIDTH_LUMA;
-
- do {
- pCurTmp = pSrcY + iStartSampleIndex;
-
- for (int32_t i = iMbStartIndex; i < iMbEndIndex; i ++) {
- uiSampleSum += pVaaCalcResults->pSum16x16[i];
- uiSquareSum += pVaaCalcResults->pSumOfSquare16x16[i];
- }
-
- iMbStartIndex = iMbEndIndex;
- iMbEndIndex = WELS_MIN (iMbEndIndex + iMbWidth, iGomMbEndIndex);
-
- iStartSampleIndex = (iMbStartIndex / iMbWidth) * MB_WIDTH_LUMA * iCurStride + (iMbStartIndex % iMbWidth) *
- MB_WIDTH_LUMA;
- } while (--iGomMbRowNum);
-
- pGomComplexity[j] = uiSquareSum - (uiSampleSum * uiSampleSum / iGomSampleNum);
- }
-}
-
-
-WELSVP_NAMESPACE_END
--- a/processing/src/complexityanalysis/ComplexityAnalysis.h
+++ /dev/null
@@ -1,83 +1,0 @@
-/*!
- * \copy
- * Copyright (c) 2011-2013, Cisco Systems
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
-* \file : ComplexityAnalysis.h
-*
-* \brief : complexity analysis class of wels video processor class
-*
-* \date : 2011/03/28
-*
-* \description : 1. rewrite the package code of complexity analysis class
-*
-*************************************************************************************
-*/
-
-#ifndef WELSVP_COMPLEXITYANALYSIS_H
-#define WELSVP_COMPLEXITYANALYSIS_H
-
-#include "../common/util.h"
-#include "../common/memory.h"
-#include "../common/WelsFrameWork.h"
-#include "../../interface/IWelsVP.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-typedef void (GOMSadFunc) (uint32_t* pGomSad, int32_t* pGomForegroundBlockNum, int32_t* pSad8x8,
- uint8_t pBackgroundMbFlag);
-
-typedef GOMSadFunc* PGOMSadFunc;
-
-GOMSadFunc GomSampleSad;
-GOMSadFunc GomSampleSadExceptBackground;
-
-class CComplexityAnalysis : public IStrategy {
- public:
- CComplexityAnalysis (int32_t iCpuFlag);
- ~CComplexityAnalysis();
-
- EResult Process (int32_t iType, SPixMap* pSrc, SPixMap* pRef);
- EResult Set (int32_t iType, void* pParam);
- EResult Get (int32_t iType, void* pParam);
-
- private:
- void AnalyzeFrameComplexityViaSad (SPixMap* pSrc, SPixMap* pRef);
- int32_t GetFrameSadExcludeBackground (SPixMap* pSrc, SPixMap* pRef);
-
- void AnalyzeGomComplexityViaSad (SPixMap* pSrc, SPixMap* pRef);
- void AnalyzeGomComplexityViaVar (SPixMap* pSrc, SPixMap* pRef);
-
- private:
- PGOMSadFunc m_pfGomSad;
- SComplexityAnalysisParam m_sComplexityAnalysisParam;
-};
-
-WELSVP_NAMESPACE_END
-
-#endif
--- a/processing/src/denoise/denoise.cpp
+++ /dev/null
@@ -1,124 +1,0 @@
-/*!
- * \copy
- * Copyright (c) 2013, Cisco Systems
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include "denoise.h"
-#include "../common/cpu.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-#define CALC_BI_STRIDE(iWidth, iBitcount) ((((iWidth) * (iBitcount) + 31) & ~31) >> 3)
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-CDenoiser::CDenoiser (int32_t iCpuFlag) {
- m_CPUFlag = iCpuFlag;
- m_eMethod = METHOD_DENOISE;
- WelsMemset (&m_pfDenoise, 0, sizeof (m_pfDenoise));
-
- m_uiSpaceRadius = DENOISE_GRAY_RADIUS;
- m_fSigmaGrey = DENOISE_GRAY_SIGMA;
- m_uiType = DENOISE_ALL_COMPONENT;
- InitDenoiseFunc (m_pfDenoise, m_CPUFlag);
-}
-
-CDenoiser::~CDenoiser() {
-}
-
-void CDenoiser::InitDenoiseFunc (SDenoiseFuncs& denoiser, int32_t iCpuFlag) {
- denoiser.pfBilateralLumaFilter8 = BilateralLumaFilter8_c;
- denoiser.pfWaverageChromaFilter8 = WaverageChromaFilter8_c;
-#if defined(X86_ASM)
- if (iCpuFlag & WELS_CPU_SSE2) {
- denoiser.pfBilateralLumaFilter8 = BilateralLumaFilter8_sse2;
- denoiser.pfWaverageChromaFilter8 = WaverageChromaFilter8_sse2;
- }
-#endif
-}
-
-EResult CDenoiser::Process (int32_t iType, SPixMap* pSrc, SPixMap* dst) {
- uint8_t* pSrcY = (uint8_t*)pSrc->pPixel[0];
- uint8_t* pSrcU = (uint8_t*)pSrc->pPixel[1];
- uint8_t* pSrcV = (uint8_t*)pSrc->pPixel[2];
- if (pSrcY == NULL || pSrcU == NULL || pSrcV == NULL) {
- return RET_INVALIDPARAM;
- }
-
- int32_t iWidthY = pSrc->sRect.iRectWidth;
- int32_t iHeightY = pSrc->sRect.iRectHeight;
- int32_t iWidthUV = iWidthY >> 1;
- int32_t iHeightUV = iHeightY >> 1;
-
- if (m_uiType & DENOISE_Y_COMPONENT)
- BilateralDenoiseLuma (pSrcY, iWidthY, iHeightY, pSrc->iStride[0]);
-
- if (m_uiType & DENOISE_U_COMPONENT)
- WaverageDenoiseChroma (pSrcU, iWidthUV, iHeightUV, pSrc->iStride[1]);
-
- if (m_uiType & DENOISE_V_COMPONENT)
- WaverageDenoiseChroma (pSrcV, iWidthUV, iHeightUV, pSrc->iStride[2]);
-
- return RET_SUCCESS;
-}
-
-void CDenoiser::BilateralDenoiseLuma (uint8_t* pSrcY, int32_t iWidth, int32_t iHeight, int32_t iStride) {
- int32_t w;
-
- pSrcY = pSrcY + m_uiSpaceRadius * iStride;
- for (int32_t h = m_uiSpaceRadius; h < iHeight - m_uiSpaceRadius; h++) {
- for (w = m_uiSpaceRadius; w < iWidth - m_uiSpaceRadius - TAIL_OF_LINE8; w += 8) {
- m_pfDenoise.pfBilateralLumaFilter8 (pSrcY + w, iStride);
- }
- for (w = w + TAIL_OF_LINE8; w < iWidth - m_uiSpaceRadius; w++) {
- Gauss3x3Filter (pSrcY + w, iStride);
- }
- pSrcY += iStride;
- }
-}
-
-void CDenoiser::WaverageDenoiseChroma (uint8_t* pSrcUV, int32_t iWidth, int32_t iHeight, int32_t iStride) {
- int32_t w;
-
- pSrcUV = pSrcUV + UV_WINDOWS_RADIUS * iStride;
- for (int32_t h = UV_WINDOWS_RADIUS; h < iHeight - UV_WINDOWS_RADIUS; h++) {
- for (w = UV_WINDOWS_RADIUS; w < iWidth - UV_WINDOWS_RADIUS - TAIL_OF_LINE8; w += 8) {
- m_pfDenoise.pfWaverageChromaFilter8 (pSrcUV + w, iStride);
- }
-
- for (w = w + TAIL_OF_LINE8; w < iWidth - UV_WINDOWS_RADIUS; w++) {
- Gauss3x3Filter (pSrcUV + w, iStride);
- }
- pSrcUV += iStride;
- }
-}
-
-
-WELSVP_NAMESPACE_END
--- a/processing/src/denoise/denoise.h
+++ /dev/null
@@ -1,111 +1,0 @@
-/*!
- * \copy
- * Copyright (c) 2011-2013, Cisco Systems
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- * \file : denoise.h
- *
- * \brief : denoise class of wels video processor class
- *
- * \date : 2011/03/15
- *
- * \description : 1. rewrite the package code of denoise class
- *
- *************************************************************************************
- */
-
-#ifndef WELSVP_DENOISE_H
-#define WELSVP_DENOISE_H
-
-#include "../common/util.h"
-#include "../common/memory.h"
-#include "../common/WelsFrameWork.h"
-#include "../../interface/IWelsVP.h"
-
-
-#define DENOISE_GRAY_RADIUS (1)
-#define DENOISE_GRAY_SIGMA (2)
-
-#define UV_WINDOWS_RADIUS (2)
-#define TAIL_OF_LINE8 (7)
-
-#define DENOISE_Y_COMPONENT (1)
-#define DENOISE_U_COMPONENT (2)
-#define DENOISE_V_COMPONENT (4)
-#define DENOISE_ALL_COMPONENT (7)
-
-
-WELSVP_NAMESPACE_BEGIN
-
-void Gauss3x3Filter (uint8_t* pixels, int32_t stride);
-
-typedef void (DenoiseFilterFunc) (uint8_t* pixels, int32_t stride);
-
-typedef DenoiseFilterFunc* DenoiseFilterFuncPtr;
-
-DenoiseFilterFunc BilateralLumaFilter8_c;
-DenoiseFilterFunc WaverageChromaFilter8_c;
-
-#ifdef X86_ASM
-WELSVP_EXTERN_C_BEGIN
-DenoiseFilterFunc BilateralLumaFilter8_sse2 ;
-DenoiseFilterFunc WaverageChromaFilter8_sse2 ;
-WELSVP_EXTERN_C_END
-#endif
-
-typedef struct TagDenoiseFuncs {
- DenoiseFilterFuncPtr pfBilateralLumaFilter8;//on 8 samples
- DenoiseFilterFuncPtr pfWaverageChromaFilter8;//on 8 samples
-} SDenoiseFuncs;
-
-class CDenoiser : public IStrategy {
- public:
- CDenoiser (int32_t iCpuFlag);
- ~CDenoiser();
-
- EResult Process (int32_t iType, SPixMap* pSrc, SPixMap* dst);
-
- private:
- void InitDenoiseFunc (SDenoiseFuncs& pf, int32_t cpu);
- void BilateralDenoiseLuma (uint8_t* p_y_data, int32_t width, int32_t height, int32_t stride);
- void WaverageDenoiseChroma (uint8_t* pSrcUV, int32_t width, int32_t height, int32_t stride);
-
- private:
- float_t m_fSigmaGrey; //sigma for grey scale similarity, suggestion 2.5-3
- uint32_t m_uiFilterWindow; //filter window diameter
- uint16_t m_uiSpaceRadius; //filter windows radius: 1-3x3, 2-5x5,3-7x7. Larger size, slower speed
- uint16_t m_uiType; //do denoising on which component 1-Y, 2-U, 4-V; 7-YUV, 3-YU, 5-YV, 6-UV
- uint32_t* m_pGreyWeightTable; //weight table for grey scale
-
- SDenoiseFuncs m_pfDenoise;
- int32_t m_CPUFlag;
-};
-
-WELSVP_NAMESPACE_END
-
-#endif
--- a/processing/src/denoise/denoise_filter.cpp
+++ /dev/null
@@ -1,127 +1,0 @@
-/*!
- * \copy
- * Copyright (c) 2010-2013, Cisco Systems
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- * \file svc_preprocess.h
- *
- * \brief svc denoising
- *
- * \date 4/1/2010 Created
- *
- */
-
-#include "denoise.h"
-#include "../common/typedef.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-void BilateralLumaFilter8_c (uint8_t* pSample, int32_t iStride) {
- int32_t nSum = 0, nTotWeight = 0;
- int32_t iCenterSample = *pSample;
- uint8_t* pCurLine = pSample - iStride - DENOISE_GRAY_RADIUS;
- int32_t x, y;
- int32_t iCurSample, iCurWeight, iGreyDiff;
- uint8_t aSample[8];
-
- for (int32_t i = 0; i < 8; i++) {
- nSum = 0;
- nTotWeight = 0;
- iCenterSample = *pSample;
- pCurLine = pSample - iStride - DENOISE_GRAY_RADIUS;
- for (y = 0; y < 3; y++) {
- for (x = 0; x < 3; x++) {
- if (x == 1 && y == 1) continue; // except center point
- iCurSample = pCurLine[x];
- iCurWeight = WELS_ABS (iCurSample - iCenterSample);
- iGreyDiff = 32 - iCurWeight;
- if (iGreyDiff < 0) continue;
- else iCurWeight = (iGreyDiff * iGreyDiff) >> 5;
- nSum += iCurSample * iCurWeight;
- nTotWeight += iCurWeight;
- }
- pCurLine += iStride;
- }
- nTotWeight = 256 - nTotWeight;
- nSum += iCenterSample * nTotWeight;
- aSample[i] = nSum >> 8;
- pSample++;
- }
- WelsMemcpy (pSample - 8, aSample, 8);
-}
-
-
-/***************************************************************************
-5x5 filter:
-1 1 2 1 1
-1 2 4 2 1
-2 4 20 4 2
-1 2 4 2 1
-1 1 2 1 1
-***************************************************************************/
-#define SUM_LINE1(pSample) (pSample[0] +(pSample[1]) +(pSample[2]<<1) + pSample[3] + pSample[4])
-#define SUM_LINE2(pSample) (pSample[0] +(pSample[1]<<1) +(pSample[2]<<2) +(pSample[3]<<1) +pSample[4])
-#define SUM_LINE3(pSample) ((pSample[0]<<1) +(pSample[1]<<2) +(pSample[2]*20) +(pSample[3]<<2) +(pSample[4]<<1))
-void WaverageChromaFilter8_c (uint8_t* pSample, int32_t iStride) {
- int32_t sum;
- uint8_t* pStartPixels = pSample - UV_WINDOWS_RADIUS * iStride - UV_WINDOWS_RADIUS;
- uint8_t* pCurLine1 = pStartPixels;
- uint8_t* pCurLine2 = pCurLine1 + iStride;
- uint8_t* pCurLine3 = pCurLine2 + iStride;
- uint8_t* pCurLine4 = pCurLine3 + iStride;
- uint8_t* pCurLine5 = pCurLine4 + iStride;
- uint8_t aSample[8];
-
- for (int32_t i = 0; i < 8; i++) {
- sum = SUM_LINE1 ((pCurLine1 + i)) + SUM_LINE2 ((pCurLine2 + i)) + SUM_LINE3 ((pCurLine3 + i))
- + SUM_LINE2 ((pCurLine4 + i)) + SUM_LINE1 ((pCurLine5 + i));
- aSample[i] = (sum >> 6);
- pSample++;
- }
- WelsMemcpy (pSample - 8, aSample, 8);
-}
-
-/***************************************************************************
-edge of y/uv use a 3x3 Gauss filter, radius = 1:
-1 2 1
-2 4 2
-1 2 1
-***************************************************************************/
-void Gauss3x3Filter (uint8_t* pSrc, int32_t iStride) {
- int32_t nSum = 0;
- uint8_t* pCurLine1 = pSrc - iStride - 1;
- uint8_t* pCurLine2 = pCurLine1 + iStride;
- uint8_t* pCurLine3 = pCurLine2 + iStride;
-
- nSum = pCurLine1[0] + (pCurLine1[1] << 1) + pCurLine1[2] +
- (pCurLine2[0] << 1) + (pCurLine2[1] << 2) + (pCurLine2[2] << 1) +
- pCurLine3[0] + (pCurLine3[1] << 1) + pCurLine3[2];
- *pSrc = nSum >> 4;
-}
-
-WELSVP_NAMESPACE_END
--- a/processing/src/downsample/downsample.cpp
+++ /dev/null
@@ -1,135 +1,0 @@
-/*!
- * \copy
- * Copyright (c) 2013, Cisco Systems
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include "downsample.h"
-#include "../common/cpu.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-CDownsampling::CDownsampling (int32_t iCpuFlag) {
- m_iCPUFlag = iCpuFlag;
- m_eMethod = METHOD_DOWNSAMPLE;
- WelsMemset (&m_pfDownsample, 0, sizeof (m_pfDownsample));
- InitDownsampleFuncs (m_pfDownsample, m_iCPUFlag);
-}
-
-CDownsampling::~CDownsampling() {
-}
-
-void CDownsampling::InitDownsampleFuncs (SDownsampleFuncs& sDownsampleFunc, int32_t iCpuFlag) {
- sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsampler_c;
- sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsampler_c;
- sDownsampleFunc.pfHalfAverage[2] = DyadicBilinearDownsampler_c;
- sDownsampleFunc.pfHalfAverage[3] = DyadicBilinearDownsampler_c;
- sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsampler_c;
- sDownsampleFunc.pfGeneralRatioLuma = GeneralBilinearFastDownsampler_c;
-#if defined(X86_ASM)
- if (iCpuFlag & WELS_CPU_SSE) {
- sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsamplerWidthx32_sse;
- sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsamplerWidthx16_sse;
- sDownsampleFunc.pfHalfAverage[2] = DyadicBilinearDownsamplerWidthx8_sse;
- }
- if (iCpuFlag & WELS_CPU_SSE2) {
- sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_sse2;
- sDownsampleFunc.pfGeneralRatioLuma = GeneralBilinearFastDownsamplerWrap_sse2;
- }
- if (iCpuFlag & WELS_CPU_SSSE3) {
- sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsamplerWidthx32_ssse3;
- sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsamplerWidthx16_ssse3;
- }
- if (iCpuFlag & WELS_CPU_SSE41) {
- sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsamplerWidthx32_sse4;
- sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsamplerWidthx16_sse4;
- }
-#endif//X86_ASM
-
-}
-
-EResult CDownsampling::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pDstPixMap) {
- int32_t iSrcWidthY = pSrcPixMap->sRect.iRectWidth;
- int32_t iSrcHeightY = pSrcPixMap->sRect.iRectHeight;
- int32_t iDstWidthY = pDstPixMap->sRect.iRectWidth;
- int32_t iDstHeightY = pDstPixMap->sRect.iRectHeight;
-
- int32_t iSrcWidthUV = iSrcWidthY >> 1;
- int32_t iSrcHeightUV = iSrcHeightY >> 1;
- int32_t iDstWidthUV = iDstWidthY >> 1;
- int32_t iDstHeightUV = iDstHeightY >> 1;
-
- if (iSrcWidthY <= iDstWidthY || iSrcHeightY <= iDstHeightY) {
- return RET_INVALIDPARAM;
- }
-
- if ((iSrcWidthY >> 1) == iDstWidthY && (iSrcHeightY >> 1) == iDstHeightY) {
- // use half average functions
- uint8_t iAlignIndex = 3;
-
- iAlignIndex = GetAlignedIndex (iSrcWidthY);
- m_pfDownsample.pfHalfAverage[iAlignIndex] ((uint8_t*)pDstPixMap->pPixel[0], pDstPixMap->iStride[0],
- (uint8_t*)pSrcPixMap->pPixel[0], pSrcPixMap->iStride[0], iSrcWidthY, iSrcHeightY);
-
- iAlignIndex = GetAlignedIndex (iSrcWidthUV);
- m_pfDownsample.pfHalfAverage[iAlignIndex] ((uint8_t*)pDstPixMap->pPixel[1], pDstPixMap->iStride[1],
- (uint8_t*)pSrcPixMap->pPixel[1], pSrcPixMap->iStride[1], iSrcWidthUV, iSrcHeightUV);
- m_pfDownsample.pfHalfAverage[iAlignIndex] ((uint8_t*)pDstPixMap->pPixel[2], pDstPixMap->iStride[2],
- (uint8_t*)pSrcPixMap->pPixel[2], pSrcPixMap->iStride[2], iSrcWidthUV, iSrcHeightUV);
- } else {
- m_pfDownsample.pfGeneralRatioLuma ((uint8_t*)pDstPixMap->pPixel[0], pDstPixMap->iStride[0], iDstWidthY, iDstHeightY,
- (uint8_t*)pSrcPixMap->pPixel[0], pSrcPixMap->iStride[0], iSrcWidthY, iSrcHeightY);
-
- m_pfDownsample.pfGeneralRatioChroma ((uint8_t*)pDstPixMap->pPixel[1], pDstPixMap->iStride[1], iDstWidthUV, iDstHeightUV,
- (uint8_t*)pSrcPixMap->pPixel[1], pSrcPixMap->iStride[1], iSrcWidthUV, iSrcHeightUV);
-
- m_pfDownsample.pfGeneralRatioChroma ((uint8_t*)pDstPixMap->pPixel[2], pDstPixMap->iStride[2], iDstWidthUV, iDstHeightUV,
- (uint8_t*)pSrcPixMap->pPixel[2], pSrcPixMap->iStride[2], iSrcWidthUV, iSrcHeightUV);
- }
- return RET_SUCCESS;
-}
-
-int32_t CDownsampling::GetAlignedIndex (const int32_t kiSrcWidth) {
- int32_t iAlignIndex = 3;
- if ((kiSrcWidth & 0x1f) == 0) // x32
- iAlignIndex = 0;
- else if ((kiSrcWidth & 0x0f) == 0) // x16
- iAlignIndex = 1;
- else if ((kiSrcWidth & 0x07) == 0) // x8
- iAlignIndex = 2;
- else
- iAlignIndex = 3;
- return iAlignIndex;
-}
-
-
-WELSVP_NAMESPACE_END
--- a/processing/src/downsample/downsample.h
+++ /dev/null
@@ -1,128 +1,0 @@
-/*!
- * \copy
- * Copyright (c) 2011-2013, Cisco Systems
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- * \file : downsample.h
- *
- * \brief : downsample class of wels video processor class
- *
- * \date : 2011/03/33
- *
- * \description : 1. rewrite the package code of downsample class
- *
- *************************************************************************************
- */
-
-#ifndef WELSVP_DOWNSAMPLE_H
-#define WELSVP_DOWNSAMPLE_H
-
-#include "../common/util.h"
-#include "../common/WelsFrameWork.h"
-#include "../../interface/IWelsVP.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-
-typedef void (HalveDownsampleFunc) (uint8_t* pDst, const int32_t kiDstStride,
- uint8_t* pSrc, const int32_t kiSrcStride,
- const int32_t kiSrcWidth, const int32_t kiSrcHeight);
-
-typedef void (GeneralDownsampleFunc) (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
- const int32_t kiDstHeight,
- uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight);
-
-typedef HalveDownsampleFunc* PHalveDownsampleFunc;
-typedef GeneralDownsampleFunc* PGeneralDownsampleFunc;
-
-HalveDownsampleFunc DyadicBilinearDownsampler_c;
-GeneralDownsampleFunc GeneralBilinearFastDownsampler_c;
-GeneralDownsampleFunc GeneralBilinearAccurateDownsampler_c;
-
-typedef struct {
- // align_index: 0 = x32; 1 = x16; 2 = x8; 3 = common case left;
- PHalveDownsampleFunc pfHalfAverage[4];
- PGeneralDownsampleFunc pfGeneralRatioLuma;
- PGeneralDownsampleFunc pfGeneralRatioChroma;
-} SDownsampleFuncs;
-
-
-#ifdef X86_ASM
-WELSVP_EXTERN_C_BEGIN
-// used for scr width is multipler of 8 pixels
-HalveDownsampleFunc DyadicBilinearDownsamplerWidthx8_sse;
-// iSrcWidth= x16 pixels
-HalveDownsampleFunc DyadicBilinearDownsamplerWidthx16_sse;
-// iSrcWidth= x32 pixels
-HalveDownsampleFunc DyadicBilinearDownsamplerWidthx32_sse;
-// used for scr width is multipler of 16 pixels
-HalveDownsampleFunc DyadicBilinearDownsamplerWidthx16_ssse3;
-// iSrcWidth= x32 pixels
-HalveDownsampleFunc DyadicBilinearDownsamplerWidthx32_ssse3;
-// iSrcWidth= x16 pixels
-HalveDownsampleFunc DyadicBilinearDownsamplerWidthx16_sse4;
-// iSrcWidth= x32 pixels
-HalveDownsampleFunc DyadicBilinearDownsamplerWidthx32_sse4;
-
-GeneralDownsampleFunc GeneralBilinearFastDownsamplerWrap_sse2;
-GeneralDownsampleFunc GeneralBilinearAccurateDownsamplerWrap_sse2;
-
-void GeneralBilinearFastDownsampler_sse2 (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
- const int32_t kiDstHeight,
- uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight,
- const uint32_t kuiScaleX, const uint32_t kuiScaleY);
-void GeneralBilinearAccurateDownsampler_sse2 (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
- const int32_t kiDstHeight,
- uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight,
- const uint32_t kuiScaleX, const uint32_t kuiScaleY);
-WELSVP_EXTERN_C_END
-#endif
-
-
-
-
-class CDownsampling : public IStrategy {
- public:
- CDownsampling (int32_t iCpuFlag);
- ~CDownsampling();
-
- EResult Process (int32_t iType, SPixMap* pSrc, SPixMap* pDst);
-
- private:
- void InitDownsampleFuncs (SDownsampleFuncs& sDownsampleFunc, int32_t iCpuFlag);
-
- int32_t GetAlignedIndex (const int32_t kiSrcWidth);
-
- private:
- SDownsampleFuncs m_pfDownsample;
- int32_t m_iCPUFlag;
-};
-
-WELSVP_NAMESPACE_END
-
-#endif
--- a/processing/src/downsample/downsamplefuncs.cpp
+++ /dev/null
@@ -1,234 +1,0 @@
-/*!
- * \copy
- * Copyright (c) 2008-2013, Cisco Systems
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- * downsample_yuv.c
- *
- * Abstract
- * Implementation for source yuv data downsampling used before spatial encoding.
- *
- * History
- * 10/24/2008 Created
- *
- *****************************************************************************/
-
-#include "../common/typedef.h"
-#include "../common/util.h"
-#include "downsample.h"
-
-
-WELSVP_NAMESPACE_BEGIN
-
-
-void DyadicBilinearDownsampler_c (uint8_t* pDst, const int32_t kiDstStride,
- uint8_t* pSrc, const int32_t kiSrcStride,
- const int32_t kiSrcWidth, const int32_t kiSrcHeight)
-
-{
- uint8_t* pDstLine = pDst;
- uint8_t* pSrcLine = pSrc;
- const int32_t kiSrcStridex2 = kiSrcStride << 1;
- const int32_t kiDstWidth = kiSrcWidth >> 1;
- const int32_t kiDstHeight = kiSrcHeight >> 1;
-
- for (int32_t j = 0; j < kiDstHeight; j ++) {
- for (int32_t i = 0; i < kiDstWidth; i ++) {
- const int32_t kiSrcX = i << 1;
- const int32_t kiTempRow1 = (pSrcLine[kiSrcX] + pSrcLine[kiSrcX + 1] + 1) >> 1;
- const int32_t kiTempRow2 = (pSrcLine[kiSrcX + kiSrcStride] + pSrcLine[kiSrcX + kiSrcStride + 1] + 1) >> 1;
-
- pDstLine[i] = (uint8_t) ((kiTempRow1 + kiTempRow2 + 1) >> 1);
- }
- pDstLine += kiDstStride;
- pSrcLine += kiSrcStridex2;
- }
-}
-
-void GeneralBilinearFastDownsampler_c (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
- const int32_t kiDstHeight,
- uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
- const uint32_t kuiScaleBitWidth = 16, kuiScaleBitHeight = 15;
- const uint32_t kuiScaleWidth = (1 << kuiScaleBitWidth), kuiScaleHeight = (1 << kuiScaleBitHeight);
- int32_t fScalex = (int32_t) ((float_t)kiSrcWidth / (float_t)kiDstWidth * kuiScaleWidth);
- int32_t fScaley = (int32_t) ((float_t)kiSrcHeight / (float_t)kiDstHeight * kuiScaleHeight);
- uint32_t x;
- int32_t iYInverse, iXInverse;
-
- uint8_t* pByDst = pDst;
- uint8_t* pByLineDst = pDst;
-
- iYInverse = 1 << (kuiScaleBitHeight - 1);
- for (int32_t i = 0; i < kiDstHeight - 1; i++) {
- int32_t iYy = iYInverse >> kuiScaleBitHeight;
- int32_t fv = iYInverse & (kuiScaleHeight - 1);
-
- uint8_t* pBySrc = pSrc + iYy * kiSrcStride;
-
- pByDst = pByLineDst;
- iXInverse = 1 << (kuiScaleBitWidth - 1);
- for (int32_t j = 0; j < kiDstWidth - 1; j++) {
- int32_t iXx = iXInverse >> kuiScaleBitWidth;
- int32_t iFu = iXInverse & (kuiScaleWidth - 1);
-
- uint8_t* pByCurrent = pBySrc + iXx;
- uint8_t a, b, c, d;
-
- a = *pByCurrent;
- b = * (pByCurrent + 1);
- c = * (pByCurrent + kiSrcStride);
- d = * (pByCurrent + kiSrcStride + 1);
-
- x = (((uint32_t) (kuiScaleWidth - 1 - iFu)) * (kuiScaleHeight - 1 - fv) >> kuiScaleBitWidth) * a;
- x += (((uint32_t) (iFu)) * (kuiScaleHeight - 1 - fv) >> kuiScaleBitWidth) * b;
- x += (((uint32_t) (kuiScaleWidth - 1 - iFu)) * (fv) >> kuiScaleBitWidth) * c;
- x += (((uint32_t) (iFu)) * (fv) >> kuiScaleBitWidth) * d;
- x >>= (kuiScaleBitHeight - 1);
- x += 1;
- x >>= 1;
- //x = (((__int64)(SCALE_BIG - 1 - iFu))*(SCALE_BIG - 1 - fv)*a + ((__int64)iFu)*(SCALE_BIG - 1 -fv)*b + ((__int64)(SCALE_BIG - 1 -iFu))*fv*c +
- // ((__int64)iFu)*fv*d + (1 << (2*SCALE_BIT_BIG-1)) ) >> (2*SCALE_BIT_BIG);
- x = WELS_CLAMP (x, 0, 255);
- *pByDst++ = (uint8_t)x;
-
- iXInverse += fScalex;
- }
- *pByDst = * (pBySrc + (iXInverse >> kuiScaleBitWidth));
- pByLineDst += kiDstStride;
- iYInverse += fScaley;
- }
-
- // last row special
- {
- int32_t iYy = iYInverse >> kuiScaleBitHeight;
- uint8_t* pBySrc = pSrc + iYy * kiSrcStride;
-
- pByDst = pByLineDst;
- iXInverse = 1 << (kuiScaleBitWidth - 1);
- for (int32_t j = 0; j < kiDstWidth; j++) {
- int32_t iXx = iXInverse >> kuiScaleBitWidth;
- *pByDst++ = * (pBySrc + iXx);
-
- iXInverse += fScalex;
- }
- }
-}
-
-void GeneralBilinearAccurateDownsampler_c (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
- const int32_t kiDstHeight,
- uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
- const int32_t kiScaleBit = 15;
- const int32_t kiScale = (1 << kiScaleBit);
- int32_t iScalex = (int32_t) ((float_t)kiSrcWidth / (float_t)kiDstWidth * kiScale);
- int32_t iScaley = (int32_t) ((float_t)kiSrcHeight / (float_t)kiDstHeight * kiScale);
- int64_t x;
- int32_t iYInverse, iXInverse;
-
- uint8_t* pByDst = pDst;
- uint8_t* pByLineDst = pDst;
-
- iYInverse = 1 << (kiScaleBit - 1);
- for (int32_t i = 0; i < kiDstHeight - 1; i++) {
- int32_t iYy = iYInverse >> kiScaleBit;
- int32_t iFv = iYInverse & (kiScale - 1);
-
- uint8_t* pBySrc = pSrc + iYy * kiSrcStride;
-
- pByDst = pByLineDst;
- iXInverse = 1 << (kiScaleBit - 1);
- for (int32_t j = 0; j < kiDstWidth - 1; j++) {
- int32_t iXx = iXInverse >> kiScaleBit;
- int32_t iFu = iXInverse & (kiScale - 1);
-
- uint8_t* pByCurrent = pBySrc + iXx;
- uint8_t a, b, c, d;
-
- a = *pByCurrent;
- b = * (pByCurrent + 1);
- c = * (pByCurrent + kiSrcStride);
- d = * (pByCurrent + kiSrcStride + 1);
-
- x = (((int64_t) (kiScale - 1 - iFu)) * (kiScale - 1 - iFv) * a + ((int64_t)iFu) * (kiScale - 1 - iFv) * b + ((int64_t) (
- kiScale - 1 - iFu)) * iFv * c +
- ((int64_t)iFu) * iFv * d + (int64_t) (1 << (2 * kiScaleBit - 1))) >> (2 * kiScaleBit);
- x = WELS_CLAMP (x, 0, 255);
- *pByDst++ = (uint8_t)x;
-
- iXInverse += iScalex;
- }
- *pByDst = * (pBySrc + (iXInverse >> kiScaleBit));
- pByLineDst += kiDstStride;
- iYInverse += iScaley;
- }
-
- // last row special
- {
- int32_t iYy = iYInverse >> kiScaleBit;
- uint8_t* pBySrc = pSrc + iYy * kiSrcStride;
-
- pByDst = pByLineDst;
- iXInverse = 1 << (kiScaleBit - 1);
- for (int32_t j = 0; j < kiDstWidth; j++) {
- int32_t iXx = iXInverse >> kiScaleBit;
- *pByDst++ = * (pBySrc + iXx);
-
- iXInverse += iScalex;
- }
- }
-}
-
-
-#ifdef X86_ASM
-void GeneralBilinearFastDownsamplerWrap_sse2 (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
- const int32_t kiDstHeight,
- uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
- const int32_t kiScaleBitWidth = 16, kiScaleBitHeight = 15;
- const uint32_t kuiScaleWidth = (1 << kiScaleBitWidth), kuiScaleHeight = (1 << kiScaleBitHeight);
-
- uint32_t uiScalex = (uint32_t) ((float_t)kiSrcWidth / (float_t)kiDstWidth * kuiScaleWidth);
- uint32_t uiScaley = (uint32_t) ((float_t)kiSrcHeight / (float_t)kiDstHeight * kuiScaleHeight);
-
- GeneralBilinearFastDownsampler_sse2 (pDst, kiDstStride, kiDstWidth, kiDstHeight,
- pSrc, kiSrcStride, kiSrcWidth, kiSrcHeight, uiScalex, uiScaley);
-}
-
-void GeneralBilinearAccurateDownsamplerWrap_sse2 (uint8_t* pDst, const int32_t kiDstStride, const int32_t kiDstWidth,
- const int32_t kiDstHeight,
- uint8_t* pSrc, const int32_t kiSrcStride, const int32_t kiSrcWidth, const int32_t kiSrcHeight) {
- const int32_t kiScaleBit = 15;
- const uint32_t kuiScale = (1 << kiScaleBit);
-
- uint32_t uiScalex = (uint32_t) ((float_t)kiSrcWidth / (float_t)kiDstWidth * kuiScale);
- uint32_t uiScaley = (uint32_t) ((float_t)kiSrcHeight / (float_t)kiDstHeight * kuiScale);
-
- GeneralBilinearAccurateDownsampler_sse2 (pDst, kiDstStride, kiDstWidth, kiDstHeight,
- pSrc, kiSrcStride, kiSrcWidth, kiSrcHeight, uiScalex, uiScaley);
-}
-#endif //X86_ASM
-
-WELSVP_NAMESPACE_END
--- a/processing/src/imagerotate/imagerotate.cpp
+++ /dev/null
@@ -1,93 +1,0 @@
-/*!
- * \copy
- * Copyright (c) 2013, Cisco Systems
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include "imagerotate.h"
-#include "../common/cpu.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-CImageRotating::CImageRotating (int32_t iCpuFlag) {
- m_iCPUFlag = iCpuFlag;
- m_eMethod = METHOD_IMAGE_ROTATE;
- WelsMemset (&m_pfRotateImage, 0, sizeof (m_pfRotateImage));
- InitImageRotateFuncs (m_pfRotateImage, m_iCPUFlag);
-}
-
-CImageRotating::~CImageRotating() {
-}
-
-void CImageRotating::InitImageRotateFuncs (SImageRotateFuncs& sImageRotateFuncs, int32_t iCpuFlag) {
- sImageRotateFuncs.pfImageRotate90D = ImageRotate90D_c;
- sImageRotateFuncs.pfImageRotate180D = ImageRotate180D_c;
- sImageRotateFuncs.pfImageRotate270D = ImageRotate270D_c;
-}
-EResult CImageRotating::ProcessImageRotate (int32_t iType, uint8_t* pSrc, uint32_t uiBytesPerPixel, uint32_t iWidth,
- uint32_t iHeight, uint8_t* pDst) {
- if (iType == 90) {
- m_pfRotateImage.pfImageRotate90D (pSrc, uiBytesPerPixel, iWidth, iHeight, pDst);
- } else if (iType == 180) {
- m_pfRotateImage.pfImageRotate180D (pSrc, uiBytesPerPixel, iWidth, iHeight, pDst);
- } else if (iType == 270) {
- m_pfRotateImage.pfImageRotate270D (pSrc, uiBytesPerPixel, iWidth, iHeight, pDst);
- } else {
- return RET_NOTSUPPORTED;
- }
- return RET_SUCCESS;
-}
-
-EResult CImageRotating::Process (int32_t iType, SPixMap* pSrc, SPixMap* pDst) {
- EResult eReturn = RET_INVALIDPARAM;
-
- if ((pSrc->eFormat == VIDEO_FORMAT_RGBA) ||
- (pSrc->eFormat == VIDEO_FORMAT_BGRA) ||
- (pSrc->eFormat == VIDEO_FORMAT_ABGR) ||
- (pSrc->eFormat == VIDEO_FORMAT_ARGB)) {
- eReturn = ProcessImageRotate (iType, (uint8_t*)pSrc->pPixel[0], pSrc->iSizeInBits * 8, pSrc->sRect.iRectWidth,
- pSrc->sRect.iRectHeight, (uint8_t*)pDst->pPixel[0]);
- } else if (pSrc->eFormat == VIDEO_FORMAT_I420) {
- ProcessImageRotate (iType, (uint8_t*)pSrc->pPixel[0], pSrc->iSizeInBits * 8, pSrc->sRect.iRectWidth,
- pSrc->sRect.iRectHeight, (uint8_t*)pDst->pPixel[0]);
- ProcessImageRotate (iType, (uint8_t*)pSrc->pPixel[1], pSrc->iSizeInBits * 8, (pSrc->sRect.iRectWidth >> 1),
- (pSrc->sRect.iRectHeight >> 1), (uint8_t*)pDst->pPixel[1]);
- eReturn = ProcessImageRotate (iType, (uint8_t*)pSrc->pPixel[2], pSrc->iSizeInBits * 8, (pSrc->sRect.iRectWidth >> 1),
- (pSrc->sRect.iRectHeight >> 1), (uint8_t*)pDst->pPixel[2]);
- } else {
- eReturn = RET_NOTSUPPORTED;
- }
-
- return eReturn;
-}
-
-
-WELSVP_NAMESPACE_END
--- a/processing/src/imagerotate/imagerotate.h
+++ /dev/null
@@ -1,85 +1,0 @@
-/*!
- * \copy
- * Copyright (c) 2011-2013, Cisco Systems
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- * \file : downsample.h
- *
- * \brief : image rotate class of wels video processor class
- *
- * \date : 2011/04/06
- *
- * \description :
- *
- *************************************************************************************
- */
-
-#ifndef WELSVP_IMAGEROTATE_H
-#define WELSVP_IMAGEROTATE_H
-
-#include "../common/util.h"
-#include "../common/WelsFrameWork.h"
-#include "../../interface/IWelsVP.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-typedef void (ImageRotateFunc) (uint8_t* pSrc, uint32_t uiBytesPerPixel, uint32_t iWidth, uint32_t iHeight,
- uint8_t* pDst);
-
-typedef ImageRotateFunc* ImageRotateFuncPtr;
-
-ImageRotateFunc ImageRotate90D_c;
-ImageRotateFunc ImageRotate180D_c;
-ImageRotateFunc ImageRotate270D_c;
-
-typedef struct {
- ImageRotateFuncPtr pfImageRotate90D;
- ImageRotateFuncPtr pfImageRotate180D;
- ImageRotateFuncPtr pfImageRotate270D;
-} SImageRotateFuncs;
-
-class CImageRotating : public IStrategy {
- public:
- CImageRotating (int32_t iCpuFlag);
- ~CImageRotating();
-
- EResult Process (int32_t iType, SPixMap* pSrc, SPixMap* pDst);
-
- private:
- void InitImageRotateFuncs (SImageRotateFuncs& pf, int32_t iCpuFlag);
- EResult ProcessImageRotate (int32_t iType, uint8_t* pSrc, uint32_t uiBytesPerPixel, uint32_t iWidth, uint32_t iHeight,
- uint8_t* pDst);
-
- private:
- SImageRotateFuncs m_pfRotateImage;
- int32_t m_iCPUFlag;
-};
-
-WELSVP_NAMESPACE_END
-
-#endif
--- a/processing/src/imagerotate/imagerotatefuncs.cpp
+++ /dev/null
@@ -1,66 +1,0 @@
-/*!
- * \copy
- * Copyright (c) 2011-2013, Cisco Systems
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- * image_rotate.c
- *
- * Created on 11-2-21.
- *
- */
-
-#include "imagerotate.h"
-#include "../common/cpu.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-void ImageRotate90D_c (uint8_t* pSrc, uint32_t uiBytesPerPixel, uint32_t iWidth, uint32_t iHeight, uint8_t* pDst) {
- for (uint32_t j = 0; j < iHeight; j++) {
- for (uint32_t i = 0; i < iWidth; i++) {
- for (uint32_t n = 0; n < uiBytesPerPixel; n++)
- pDst[ (i * iHeight + iHeight - 1 - j)*uiBytesPerPixel + n] = pSrc[ (iWidth * j + i) * uiBytesPerPixel + n];
- }
- }
-}
-void ImageRotate180D_c (uint8_t* pSrc, uint32_t uiBytesPerPixel, uint32_t iWidth, uint32_t iHeight, uint8_t* pDst) {
- for (uint32_t j = 0; j < iHeight; j++) {
- for (uint32_t i = 0; i < iWidth; i++) {
- for (uint32_t n = 0; n < uiBytesPerPixel; n++)
- pDst[ ((iHeight - 1 - j)*iWidth + iWidth - 1 - i)*uiBytesPerPixel + n] = pSrc[ (iWidth * j + i) * uiBytesPerPixel + n];
- }
- }
-}
-void ImageRotate270D_c (uint8_t* pSrc, uint32_t uiBytesPerPixel, uint32_t iWidth, uint32_t iHeight, uint8_t* pDst) {
- for (uint32_t j = 0; j < iWidth; j++) {
- for (uint32_t i = 0; i < iHeight; i++) {
- for (uint32_t n = 0; n < uiBytesPerPixel; n++)
- pDst[ ((iWidth - 1 - j)*iHeight + i)*uiBytesPerPixel + n] = pSrc[ (iWidth * i + j) * uiBytesPerPixel + n];
- }
- }
-}
-WELSVP_NAMESPACE_END
--- a/processing/src/scenechangedetection/SceneChangeDetection.cpp
+++ /dev/null
@@ -1,136 +1,0 @@
-/*!
- * \copy
- * Copyright (c) 2013, Cisco Systems
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include "SceneChangeDetection.h"
-#include "../common/cpu.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-#define HIGH_MOTION_BLOCK_THRESHOLD 320
-#define SCENE_CHANGE_MOTION_RATIO 0.85f
-
-
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-CSceneChangeDetection::CSceneChangeDetection (int32_t iCpuFlag) {
- m_iCpuFlag = iCpuFlag;
- m_eMethod = METHOD_SCENE_CHANGE_DETECTION;
- m_pfSad = NULL;
- WelsMemset (&m_sSceneChangeParam, 0, sizeof (m_sSceneChangeParam));
- InitSadFuncs (m_pfSad, m_iCpuFlag);
-}
-
-CSceneChangeDetection::~CSceneChangeDetection() {
-}
-
-EResult CSceneChangeDetection::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pRefPixMap) {
- EResult eReturn = RET_INVALIDPARAM;
-
- int32_t iWidth = pSrcPixMap->sRect.iRectWidth;
- int32_t iHeight = pSrcPixMap->sRect.iRectHeight;
- int32_t iBlock8x8Width = iWidth >> 3;
- int32_t iBlock8x8Height = iHeight >> 3;
- int32_t iBlock8x8Num = iBlock8x8Width * iBlock8x8Height;
- int32_t iSceneChangeThreshold = WelsStaticCast (int32_t, SCENE_CHANGE_MOTION_RATIO * iBlock8x8Num + 0.5f + PESN);
-
- int32_t iBlockSad = 0;
- int32_t iMotionBlockNum = 0;
-
- uint8_t* pRefY = NULL, *pCurY = NULL;
- int32_t iRefStride = 0, iCurStride = 0;
- int32_t iRefRowStride = 0, iCurRowStride = 0;
-
- uint8_t* pRefTmp = NULL, *pCurTmp = NULL;
-
- pRefY = (uint8_t*)pRefPixMap->pPixel[0];
- pCurY = (uint8_t*)pSrcPixMap->pPixel[0];
-
- iRefStride = pRefPixMap->iStride[0];
- iCurStride = pSrcPixMap->iStride[0];
-
- iRefRowStride = pRefPixMap->iStride[0] << 3;
- iCurRowStride = pSrcPixMap->iStride[0] << 3;
-
- m_sSceneChangeParam.bSceneChangeFlag = 0;
-
- for (int32_t j = 0; j < iBlock8x8Height; j ++) {
- pRefTmp = pRefY;
- pCurTmp = pCurY;
-
- for (int32_t i = 0; i < iBlock8x8Width; i++) {
- iBlockSad = m_pfSad (pRefTmp, iRefStride, pCurTmp, iCurStride);
-
- iMotionBlockNum += (iBlockSad > HIGH_MOTION_BLOCK_THRESHOLD);
-
- pRefTmp += 8;
- pCurTmp += 8;
- }
-
- pRefY += iRefRowStride;
- pCurY += iCurRowStride;
- }
-
- if (iMotionBlockNum >= iSceneChangeThreshold) {
- m_sSceneChangeParam.bSceneChangeFlag = 1;
- }
-
- eReturn = RET_SUCCESS;
-
- return eReturn;
-}
-
-
-EResult CSceneChangeDetection::Get (int32_t iType, void* pParam) {
- if (pParam == NULL) {
- return RET_INVALIDPARAM;
- }
-
- * (SSceneChangeResult*)pParam = m_sSceneChangeParam;
-
- return RET_SUCCESS;
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////
-
-void CSceneChangeDetection::InitSadFuncs (SadFuncPtr& pfSad, int32_t iCpuFlag) {
- pfSad = WelsSampleSad8x8_c;
-
-#ifdef X86_ASM
- if (iCpuFlag & WELS_CPU_SSE2) {
- pfSad = WelsSampleSad8x8_sse21;
- }
-#endif
-}
-
-
-WELSVP_NAMESPACE_END
--- a/processing/src/scenechangedetection/SceneChangeDetection.h
+++ /dev/null
@@ -1,72 +1,0 @@
-/*!
- * \copy
- * Copyright (c) 2011-2013, Cisco Systems
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
-* \file : SceneChangeDetection.h
-*
-* \brief : scene change detection class of wels video processor class
-*
-* \date : 2011/03/14
-*
-* \description : 1. rewrite the package code of scene change detection class
-*
-*************************************************************************************
-*/
-
-#ifndef WELSVP_SCENECHANGEDETECTION_H
-#define WELSVP_SCENECHANGEDETECTION_H
-
-#include "../common/util.h"
-#include "../common/memory.h"
-#include "../common/WelsFrameWork.h"
-#include "../../interface/IWelsVP.h"
-#include "SceneChangeDetectionCommon.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-class CSceneChangeDetection : public IStrategy {
- public:
- CSceneChangeDetection (int32_t iCpuFlag);
- ~CSceneChangeDetection();
-
- EResult Process (int32_t iType, SPixMap* pSrc, SPixMap* pRef);
- EResult Get (int32_t iType, void* pParam);
-
- private:
- void InitSadFuncs (SadFuncPtr& pfSadFunc, int32_t iCpuFlag);
-
- private:
- SadFuncPtr m_pfSad;
- int32_t m_iCpuFlag;
- SSceneChangeResult m_sSceneChangeParam;
-};
-
-WELSVP_NAMESPACE_END
-
-#endif
--- a/processing/src/scenechangedetection/SceneChangeDetectionCommon.cpp
+++ /dev/null
@@ -1,60 +1,0 @@
-/*!
- * \copy
- * Copyright (c) 2013, Cisco Systems
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include "SceneChangeDetectionCommon.h"
-#include "../common/cpu.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-
-int32_t WelsSampleSad8x8_c (uint8_t* pSrcY, int32_t iSrcStrideY, uint8_t* pRefY, int32_t iRefStrideY) {
- int32_t iSadSum = 0;
- uint8_t* pSrcA = pSrcY;
- uint8_t* pSrcB = pRefY;
- for (int32_t i = 0; i < 8; i++) {
- iSadSum += WELS_ABS ((pSrcA[0] - pSrcB[0]));
- iSadSum += WELS_ABS ((pSrcA[1] - pSrcB[1]));
- iSadSum += WELS_ABS ((pSrcA[2] - pSrcB[2]));
- iSadSum += WELS_ABS ((pSrcA[3] - pSrcB[3]));
- iSadSum += WELS_ABS ((pSrcA[4] - pSrcB[4]));
- iSadSum += WELS_ABS ((pSrcA[5] - pSrcB[5]));
- iSadSum += WELS_ABS ((pSrcA[6] - pSrcB[6]));
- iSadSum += WELS_ABS ((pSrcA[7] - pSrcB[7]));
-
- pSrcA += iSrcStrideY;
- pSrcB += iRefStrideY;
- }
-
- return iSadSum;
-}
-
-WELSVP_NAMESPACE_END
--- a/processing/src/scenechangedetection/SceneChangeDetectionCommon.h
+++ /dev/null
@@ -1,65 +1,0 @@
-/*!
- * \copy
- * Copyright (c) 2011-2013, Cisco Systems
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- * \file : SceneChangeDetectionCommon.h
- *
- * \brief : scene change detection class of wels video processor class
- *
- * \date : 2011/03/14
- *
- * \description : 1. rewrite the package code of scene change detection class
- *
- */
-
-#ifndef WELSVP_SCENECHANGEDETECTIONCOMMON_H
-#define WELSVP_SCENECHANGEDETECTIONCOMMON_H
-
-#include "../common/util.h"
-#include "../common/memory.h"
-#include "../common/WelsFrameWork.h"
-#include "../../interface/IWelsVP.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-typedef int32_t (SadFunc) (uint8_t* pSrcY, int32_t iSrcStrideY, uint8_t* pRefY, int32_t iRefStrideY);
-
-typedef SadFunc* SadFuncPtr;
-
-SadFunc WelsSampleSad8x8_c;
-
-#ifdef X86_ASM
-WELSVP_EXTERN_C_BEGIN
-SadFunc WelsSampleSad8x8_sse21;
-WELSVP_EXTERN_C_END
-#endif
-
-WELSVP_NAMESPACE_END
-
-#endif
--- a/processing/src/vaacalc/vaacalcfuncs.cpp
+++ /dev/null
@@ -1,595 +1,0 @@
-/*!
- * \copy
- * Copyright (c) 2013, Cisco Systems
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include "../common/typedef.h"
-#include "../common/util.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-void VAACalcSadSsd_c (uint8_t* pCurData, uint8_t* pRefData, int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride,
- int32_t* pFrameSad, int32_t* pSad8x8, int32_t* pSum16x16, int32_t* psqsum16x16, int32_t* psqdiff16x16) {
- uint8_t* tmp_ref = pRefData;
- uint8_t* tmp_cur = pCurData;
- int32_t iMbWidth = (iPicWidth >> 4);
- int32_t mb_heigth = (iPicHeight >> 4);
- int32_t mb_index = 0;
- int32_t pic_stride_x8 = iPicStride << 3;
- int32_t step = (iPicStride << 4) - iPicWidth;
-
- *pFrameSad = 0;
- for (int32_t i = 0; i < mb_heigth; i ++) {
- for (int32_t j = 0; j < iMbWidth; j ++) {
- int32_t k, l;
- int32_t l_sad, l_sqdiff, l_sum, l_sqsum;
- uint8_t* tmp_cur_row;
- uint8_t* tmp_ref_row;
-
- pSum16x16[mb_index] = 0;
- psqsum16x16[mb_index] = 0;
- psqdiff16x16[mb_index] = 0;
-
- l_sad = l_sqdiff = l_sum = l_sqsum = 0;
- tmp_cur_row = tmp_cur;
- tmp_ref_row = tmp_ref;
- for (k = 0; k < 8; k ++) {
- for (l = 0; l < 8; l ++) {
- int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
- l_sad += diff;
- l_sqdiff += diff * diff;
- l_sum += tmp_cur_row[l];
- l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
- }
- tmp_cur_row += iPicStride;
- tmp_ref_row += iPicStride;
- }
- *pFrameSad += l_sad;
- pSad8x8[ (mb_index << 2) + 0] = l_sad;
- pSum16x16[mb_index] += l_sum;
- psqsum16x16[mb_index] += l_sqsum;
- psqdiff16x16[mb_index] += l_sqdiff;
-
- l_sad = l_sqdiff = l_sum = l_sqsum = 0;
- tmp_cur_row = tmp_cur + 8;
- tmp_ref_row = tmp_ref + 8;
- for (k = 0; k < 8; k ++) {
- for (l = 0; l < 8; l ++) {
- int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
- l_sad += diff;
- l_sqdiff += diff * diff;
- l_sum += tmp_cur_row[l];
- l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
- }
- tmp_cur_row += iPicStride;
- tmp_ref_row += iPicStride;
- }
- *pFrameSad += l_sad;
- pSad8x8[ (mb_index << 2) + 1] = l_sad;
- pSum16x16[mb_index] += l_sum;
- psqsum16x16[mb_index] += l_sqsum;
- psqdiff16x16[mb_index] += l_sqdiff;
-
- l_sad = l_sqdiff = l_sum = l_sqsum = 0;
- tmp_cur_row = tmp_cur + pic_stride_x8;
- tmp_ref_row = tmp_ref + pic_stride_x8;
- for (k = 0; k < 8; k ++) {
- for (l = 0; l < 8; l ++) {
- int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
- l_sad += diff;
- l_sqdiff += diff * diff;
- l_sum += tmp_cur_row[l];
- l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
- }
- tmp_cur_row += iPicStride;
- tmp_ref_row += iPicStride;
- }
- *pFrameSad += l_sad;
- pSad8x8[ (mb_index << 2) + 2] = l_sad;
- pSum16x16[mb_index] += l_sum;
- psqsum16x16[mb_index] += l_sqsum;
- psqdiff16x16[mb_index] += l_sqdiff;
-
- l_sad = l_sqdiff = l_sum = l_sqsum = 0;
- tmp_cur_row = tmp_cur + pic_stride_x8 + 8;
- tmp_ref_row = tmp_ref + pic_stride_x8 + 8;
- for (k = 0; k < 8; k ++) {
- for (l = 0; l < 8; l ++) {
- int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
- l_sad += diff;
- l_sqdiff += diff * diff;
- l_sum += tmp_cur_row[l];
- l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
- }
- tmp_cur_row += iPicStride;
- tmp_ref_row += iPicStride;
- }
- *pFrameSad += l_sad;
- pSad8x8[ (mb_index << 2) + 3] = l_sad;
- pSum16x16[mb_index] += l_sum;
- psqsum16x16[mb_index] += l_sqsum;
- psqdiff16x16[mb_index] += l_sqdiff;
-
-
- tmp_ref += 16;
- tmp_cur += 16;
- ++mb_index;
- }
- tmp_ref += step;
- tmp_cur += step;
- }
-}
-void VAACalcSadVar_c (uint8_t* pCurData, uint8_t* pRefData, int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride,
- int32_t* pFrameSad, int32_t* pSad8x8, int32_t* pSum16x16, int32_t* psqsum16x16) {
- uint8_t* tmp_ref = pRefData;
- uint8_t* tmp_cur = pCurData;
- int32_t iMbWidth = (iPicWidth >> 4);
- int32_t mb_heigth = (iPicHeight >> 4);
- int32_t mb_index = 0;
- int32_t pic_stride_x8 = iPicStride << 3;
- int32_t step = (iPicStride << 4) - iPicWidth;
-
- *pFrameSad = 0;
- for (int32_t i = 0; i < mb_heigth; i ++) {
- for (int32_t j = 0; j < iMbWidth; j ++) {
- int32_t k, l;
- int32_t l_sad, l_sum, l_sqsum;
- uint8_t* tmp_cur_row;
- uint8_t* tmp_ref_row;
-
- pSum16x16[mb_index] = 0;
- psqsum16x16[mb_index] = 0;
-
- l_sad = l_sum = l_sqsum = 0;
- tmp_cur_row = tmp_cur;
- tmp_ref_row = tmp_ref;
- for (k = 0; k < 8; k ++) {
- for (l = 0; l < 8; l ++) {
- int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
- l_sad += diff;
- l_sum += tmp_cur_row[l];
- l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
- }
- tmp_cur_row += iPicStride;
- tmp_ref_row += iPicStride;
- }
- *pFrameSad += l_sad;
- pSad8x8[ (mb_index << 2) + 0] = l_sad;
- pSum16x16[mb_index] += l_sum;
- psqsum16x16[mb_index] += l_sqsum;
-
- l_sad = l_sum = l_sqsum = 0;
- tmp_cur_row = tmp_cur + 8;
- tmp_ref_row = tmp_ref + 8;
- for (k = 0; k < 8; k ++) {
- for (l = 0; l < 8; l ++) {
- int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
- l_sad += diff;
- l_sum += tmp_cur_row[l];
- l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
- }
- tmp_cur_row += iPicStride;
- tmp_ref_row += iPicStride;
- }
- *pFrameSad += l_sad;
- pSad8x8[ (mb_index << 2) + 1] = l_sad;
- pSum16x16[mb_index] += l_sum;
- psqsum16x16[mb_index] += l_sqsum;
-
- l_sad = l_sum = l_sqsum = 0;
- tmp_cur_row = tmp_cur + pic_stride_x8;
- tmp_ref_row = tmp_ref + pic_stride_x8;
- for (k = 0; k < 8; k ++) {
- for (l = 0; l < 8; l ++) {
- int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
- l_sad += diff;
- l_sum += tmp_cur_row[l];
- l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
- }
- tmp_cur_row += iPicStride;
- tmp_ref_row += iPicStride;
- }
- *pFrameSad += l_sad;
- pSad8x8[ (mb_index << 2) + 2] = l_sad;
- pSum16x16[mb_index] += l_sum;
- psqsum16x16[mb_index] += l_sqsum;
-
- l_sad = l_sum = l_sqsum = 0;
- tmp_cur_row = tmp_cur + pic_stride_x8 + 8;
- tmp_ref_row = tmp_ref + pic_stride_x8 + 8;
- for (k = 0; k < 8; k ++) {
- for (l = 0; l < 8; l ++) {
- int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
- l_sad += diff;
- l_sum += tmp_cur_row[l];
- l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
- }
- tmp_cur_row += iPicStride;
- tmp_ref_row += iPicStride;
- }
- *pFrameSad += l_sad;
- pSad8x8[ (mb_index << 2) + 3] = l_sad;
- pSum16x16[mb_index] += l_sum;
- psqsum16x16[mb_index] += l_sqsum;
-
-
- tmp_ref += 16;
- tmp_cur += 16;
- ++mb_index;
- }
- tmp_ref += step;
- tmp_cur += step;
- }
-}
-
-
-void VAACalcSad_c (uint8_t* pCurData, uint8_t* pRefData, int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride,
- int32_t* pFrameSad, int32_t* pSad8x8) {
- uint8_t* tmp_ref = pRefData;
- uint8_t* tmp_cur = pCurData;
- int32_t iMbWidth = (iPicWidth >> 4);
- int32_t mb_heigth = (iPicHeight >> 4);
- int32_t mb_index = 0;
- int32_t pic_stride_x8 = iPicStride << 3;
- int32_t step = (iPicStride << 4) - iPicWidth;
-
- *pFrameSad = 0;
- for (int32_t i = 0; i < mb_heigth; i ++) {
- for (int32_t j = 0; j < iMbWidth; j ++) {
- int32_t k, l;
- int32_t l_sad;
- uint8_t* tmp_cur_row;
- uint8_t* tmp_ref_row;
-
- l_sad = 0;
- tmp_cur_row = tmp_cur;
- tmp_ref_row = tmp_ref;
- for (k = 0; k < 8; k ++) {
- for (l = 0; l < 8; l ++) {
- int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
- l_sad += diff;
- }
- tmp_cur_row += iPicStride;
- tmp_ref_row += iPicStride;
- }
- *pFrameSad += l_sad;
- pSad8x8[ (mb_index << 2) + 0] = l_sad;
-
- l_sad = 0;
- tmp_cur_row = tmp_cur + 8;
- tmp_ref_row = tmp_ref + 8;
- for (k = 0; k < 8; k ++) {
- for (l = 0; l < 8; l ++) {
- int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
- l_sad += diff;
- }
- tmp_cur_row += iPicStride;
- tmp_ref_row += iPicStride;
- }
- *pFrameSad += l_sad;
- pSad8x8[ (mb_index << 2) + 1] = l_sad;
-
- l_sad = 0;
- tmp_cur_row = tmp_cur + pic_stride_x8;
- tmp_ref_row = tmp_ref + pic_stride_x8;
- for (k = 0; k < 8; k ++) {
- for (l = 0; l < 8; l ++) {
- int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
- l_sad += diff;
- }
- tmp_cur_row += iPicStride;
- tmp_ref_row += iPicStride;
- }
- *pFrameSad += l_sad;
- pSad8x8[ (mb_index << 2) + 2] = l_sad;
-
- l_sad = 0;
- tmp_cur_row = tmp_cur + pic_stride_x8 + 8;
- tmp_ref_row = tmp_ref + pic_stride_x8 + 8;
- for (k = 0; k < 8; k ++) {
- for (l = 0; l < 8; l ++) {
- int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
- l_sad += diff;
- }
- tmp_cur_row += iPicStride;
- tmp_ref_row += iPicStride;
- }
- *pFrameSad += l_sad;
- pSad8x8[ (mb_index << 2) + 3] = l_sad;
-
- tmp_ref += 16;
- tmp_cur += 16;
- ++mb_index;
- }
- tmp_ref += step;
- tmp_cur += step;
- }
-}
-
-void VAACalcSadSsdBgd_c (uint8_t* pCurData, uint8_t* pRefData, int32_t iPicWidth, int32_t iPicHeight,
- int32_t iPicStride,
- int32_t* pFrameSad, int32_t* pSad8x8, int32_t* pSum16x16, int32_t* psqsum16x16, int32_t* psqdiff16x16, int32_t* pSd8x8,
- uint8_t* pMad8x8)
-
-{
- uint8_t* tmp_ref = pRefData;
- uint8_t* tmp_cur = pCurData;
- int32_t iMbWidth = (iPicWidth >> 4);
- int32_t mb_heigth = (iPicHeight >> 4);
- int32_t mb_index = 0;
- int32_t pic_stride_x8 = iPicStride << 3;
- int32_t step = (iPicStride << 4) - iPicWidth;
-
- *pFrameSad = 0;
- for (int32_t i = 0; i < mb_heigth; i ++) {
- for (int32_t j = 0; j < iMbWidth; j ++) {
- int32_t k, l;
- int32_t l_sad, l_sqdiff, l_sum, l_sqsum, l_sd, l_mad;
- uint8_t* tmp_cur_row;
- uint8_t* tmp_ref_row;
-
- pSum16x16[mb_index] = 0;
- psqsum16x16[mb_index] = 0;
- psqdiff16x16[mb_index] = 0;
-
- l_sd = l_mad = l_sad = l_sqdiff = l_sum = l_sqsum = 0;
- tmp_cur_row = tmp_cur;
- tmp_ref_row = tmp_ref;
- for (k = 0; k < 8; k ++) {
- for (l = 0; l < 8; l ++) {
- int32_t diff = tmp_cur_row[l] - tmp_ref_row[l];
- int32_t abs_diff = WELS_ABS (diff);
-
- l_sd += diff;
- if (abs_diff > l_mad) {
- l_mad = abs_diff;
- }
- l_sad += abs_diff;
- l_sqdiff += abs_diff * abs_diff;
- l_sum += tmp_cur_row[l];
- l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
- }
- tmp_cur_row += iPicStride;
- tmp_ref_row += iPicStride;
- }
- *pFrameSad += l_sad;
- pSad8x8[ (mb_index << 2) + 0] = l_sad;
- pSum16x16[mb_index] += l_sum;
- psqsum16x16[mb_index] += l_sqsum;
- psqdiff16x16[mb_index] += l_sqdiff;
- pSd8x8[ (mb_index << 2) + 0] = l_sd;
- pMad8x8[ (mb_index << 2) + 0] = l_mad;
-
-
- l_sd = l_mad = l_sad = l_sqdiff = l_sum = l_sqsum = 0;
- tmp_cur_row = tmp_cur + 8;
- tmp_ref_row = tmp_ref + 8;
- for (k = 0; k < 8; k ++) {
- for (l = 0; l < 8; l ++) {
- int32_t diff = tmp_cur_row[l] - tmp_ref_row[l];
- int32_t abs_diff = WELS_ABS (diff);
-
- l_sd += diff;
- if (abs_diff > l_mad) {
- l_mad = abs_diff;
- }
- l_sad += abs_diff;
- l_sqdiff += abs_diff * abs_diff;
- l_sum += tmp_cur_row[l];
- l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
- }
- tmp_cur_row += iPicStride;
- tmp_ref_row += iPicStride;
- }
- *pFrameSad += l_sad;
- pSad8x8[ (mb_index << 2) + 1] = l_sad;
- pSum16x16[mb_index] += l_sum;
- psqsum16x16[mb_index] += l_sqsum;
- psqdiff16x16[mb_index] += l_sqdiff;
- pSd8x8[ (mb_index << 2) + 1] = l_sd;
- pMad8x8[ (mb_index << 2) + 1] = l_mad;
-
- l_sd = l_mad = l_sad = l_sqdiff = l_sum = l_sqsum = 0;
- tmp_cur_row = tmp_cur + pic_stride_x8;
- tmp_ref_row = tmp_ref + pic_stride_x8;
- for (k = 0; k < 8; k ++) {
- for (l = 0; l < 8; l ++) {
- int32_t diff = tmp_cur_row[l] - tmp_ref_row[l];
- int32_t abs_diff = WELS_ABS (diff);
-
- l_sd += diff;
- if (abs_diff > l_mad) {
- l_mad = abs_diff;
- }
- l_sad += abs_diff;
- l_sqdiff += abs_diff * abs_diff;
- l_sum += tmp_cur_row[l];
- l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
- }
- tmp_cur_row += iPicStride;
- tmp_ref_row += iPicStride;
- }
- *pFrameSad += l_sad;
- pSad8x8[ (mb_index << 2) + 2] = l_sad;
- pSum16x16[mb_index] += l_sum;
- psqsum16x16[mb_index] += l_sqsum;
- psqdiff16x16[mb_index] += l_sqdiff;
- pSd8x8[ (mb_index << 2) + 2] = l_sd;
- pMad8x8[ (mb_index << 2) + 2] = l_mad;
-
- l_sd = l_mad = l_sad = l_sqdiff = l_sum = l_sqsum = 0;
- tmp_cur_row = tmp_cur + pic_stride_x8 + 8;
- tmp_ref_row = tmp_ref + pic_stride_x8 + 8;
- for (k = 0; k < 8; k ++) {
- for (l = 0; l < 8; l ++) {
- int32_t diff = tmp_cur_row[l] - tmp_ref_row[l];
- int32_t abs_diff = WELS_ABS (diff);
-
- l_sd += diff;
- if (abs_diff > l_mad) {
- l_mad = abs_diff;
- }
- l_sad += abs_diff;
- l_sqdiff += abs_diff * abs_diff;
- l_sum += tmp_cur_row[l];
- l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
- }
- tmp_cur_row += iPicStride;
- tmp_ref_row += iPicStride;
- }
- *pFrameSad += l_sad;
- pSad8x8[ (mb_index << 2) + 3] = l_sad;
- pSum16x16[mb_index] += l_sum;
- psqsum16x16[mb_index] += l_sqsum;
- psqdiff16x16[mb_index] += l_sqdiff;
- pSd8x8[ (mb_index << 2) + 3] = l_sd;
- pMad8x8[ (mb_index << 2) + 3] = l_mad;
-
- tmp_ref += 16;
- tmp_cur += 16;
- ++mb_index;
- }
- tmp_ref += step;
- tmp_cur += step;
- }
-}
-
-void VAACalcSadBgd_c (uint8_t* pCurData, uint8_t* pRefData, int32_t iPicWidth, int32_t iPicHeight, int32_t iPicStride,
- int32_t* pFrameSad, int32_t* pSad8x8, int32_t* pSd8x8, uint8_t* pMad8x8) {
- uint8_t* tmp_ref = pRefData;
- uint8_t* tmp_cur = pCurData;
- int32_t iMbWidth = (iPicWidth >> 4);
- int32_t mb_heigth = (iPicHeight >> 4);
- int32_t mb_index = 0;
- int32_t pic_stride_x8 = iPicStride << 3;
- int32_t step = (iPicStride << 4) - iPicWidth;
-
- *pFrameSad = 0;
- for (int32_t i = 0; i < mb_heigth; i ++) {
- for (int32_t j = 0; j < iMbWidth; j ++) {
- int32_t k, l;
- int32_t l_sad, l_sd, l_mad;
- uint8_t* tmp_cur_row;
- uint8_t* tmp_ref_row;
-
- l_mad = l_sd = l_sad = 0;
- tmp_cur_row = tmp_cur;
- tmp_ref_row = tmp_ref;
- for (k = 0; k < 8; k ++) {
- for (l = 0; l < 8; l ++) {
- int32_t diff = tmp_cur_row[l] - tmp_ref_row[l];
- int32_t abs_diff = WELS_ABS (diff);
- l_sd += diff;
- l_sad += abs_diff;
- if (abs_diff > l_mad) {
- l_mad = abs_diff;
- }
- }
- tmp_cur_row += iPicStride;
- tmp_ref_row += iPicStride;
- }
- *pFrameSad += l_sad;
- pSad8x8[ (mb_index << 2) + 0] = l_sad;
- pSd8x8[ (mb_index << 2) + 0] = l_sd;
- pMad8x8[ (mb_index << 2) + 0] = l_mad;
-
- l_mad = l_sd = l_sad = 0;
- tmp_cur_row = tmp_cur + 8;
- tmp_ref_row = tmp_ref + 8;
- for (k = 0; k < 8; k ++) {
- for (l = 0; l < 8; l ++) {
- int32_t diff = tmp_cur_row[l] - tmp_ref_row[l];
- int32_t abs_diff = WELS_ABS (diff);
- l_sd += diff;
- l_sad += abs_diff;
- if (abs_diff > l_mad) {
- l_mad = abs_diff;
- }
- }
- tmp_cur_row += iPicStride;
- tmp_ref_row += iPicStride;
- }
- *pFrameSad += l_sad;
- pSad8x8[ (mb_index << 2) + 1] = l_sad;
- pSd8x8[ (mb_index << 2) + 1] = l_sd;
- pMad8x8[ (mb_index << 2) + 1] = l_mad;
-
- l_mad = l_sd = l_sad = 0;
- tmp_cur_row = tmp_cur + pic_stride_x8;
- tmp_ref_row = tmp_ref + pic_stride_x8;
- for (k = 0; k < 8; k ++) {
- for (l = 0; l < 8; l ++) {
- int32_t diff = tmp_cur_row[l] - tmp_ref_row[l];
- int32_t abs_diff = WELS_ABS (diff);
- l_sd += diff;
- l_sad += abs_diff;
- if (abs_diff > l_mad) {
- l_mad = abs_diff;
- }
- }
- tmp_cur_row += iPicStride;
- tmp_ref_row += iPicStride;
- }
- *pFrameSad += l_sad;
- pSad8x8[ (mb_index << 2) + 2] = l_sad;
- pSd8x8[ (mb_index << 2) + 2] = l_sd;
- pMad8x8[ (mb_index << 2) + 2] = l_mad;
-
- l_mad = l_sd = l_sad = 0;
- tmp_cur_row = tmp_cur + pic_stride_x8 + 8;
- tmp_ref_row = tmp_ref + pic_stride_x8 + 8;
- for (k = 0; k < 8; k ++) {
- for (l = 0; l < 8; l ++) {
- int32_t diff = tmp_cur_row[l] - tmp_ref_row[l];
- int32_t abs_diff = WELS_ABS (diff);
- l_sd += diff;
- l_sad += abs_diff;
- if (abs_diff > l_mad) {
- l_mad = abs_diff;
- }
- }
- tmp_cur_row += iPicStride;
- tmp_ref_row += iPicStride;
- }
- *pFrameSad += l_sad;
- pSad8x8[ (mb_index << 2) + 3] = l_sad;
- pSd8x8[ (mb_index << 2) + 3] = l_sd;
- pMad8x8[ (mb_index << 2) + 3] = l_mad;
-
- tmp_ref += 16;
- tmp_cur += 16;
- ++mb_index;
- }
- tmp_ref += step;
- tmp_cur += step;
- }
-}
-
-WELSVP_NAMESPACE_END
--- a/processing/src/vaacalc/vaacalculation.cpp
+++ /dev/null
@@ -1,123 +1,0 @@
-/*!
- * \copy
- * Copyright (c) 2013, Cisco Systems
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- */
-
-#include "vaacalculation.h"
-#include "../common/cpu.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-CVAACalculation::CVAACalculation (int32_t iCpuFlag) {
- m_iCPUFlag = iCpuFlag;
- m_eMethod = METHOD_VAA_STATISTICS;
-
- WelsMemset (&m_sCalcParam, 0, sizeof (m_sCalcParam));
- WelsMemset (&m_sVaaFuncs, 0, sizeof (m_sVaaFuncs));
- InitVaaFuncs (m_sVaaFuncs, m_iCPUFlag);
-}
-
-CVAACalculation::~CVAACalculation() {
-}
-
-void CVAACalculation::InitVaaFuncs (SVaaFuncs& sVaaFuncs, int32_t iCpuFlag) {
- sVaaFuncs.pfVAACalcSad = VAACalcSad_c;
- sVaaFuncs.pfVAACalcSadBgd = VAACalcSadBgd_c;
- sVaaFuncs.pfVAACalcSadSsd = VAACalcSadSsd_c;
- sVaaFuncs.pfVAACalcSadSsdBgd = VAACalcSadSsdBgd_c;
- sVaaFuncs.pfVAACalcSadVar = VAACalcSadVar_c;
-#ifdef X86_ASM
- if ((iCpuFlag & WELS_CPU_SSE2) == WELS_CPU_SSE2) {
- sVaaFuncs.pfVAACalcSad = VAACalcSad_sse2;
- sVaaFuncs.pfVAACalcSadBgd = VAACalcSadBgd_sse2;
- sVaaFuncs.pfVAACalcSadSsd = VAACalcSadSsd_sse2;
- sVaaFuncs.pfVAACalcSadSsdBgd = VAACalcSadSsdBgd_sse2;
- sVaaFuncs.pfVAACalcSadVar = VAACalcSadVar_sse2;
- }
-#endif//X86_ASM
-}
-
-EResult CVAACalculation::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pRefPixMap) {
- uint8_t* pCurData = (uint8_t*)pSrcPixMap->pPixel[0];
- uint8_t* pRefData = (uint8_t*)pRefPixMap->pPixel[0];
- int32_t iPicWidth = pSrcPixMap->sRect.iRectWidth;
- int32_t iPicHeight = pSrcPixMap->sRect.iRectHeight;
- int32_t iPicStride = pSrcPixMap->iStride[0];
-
- SVAACalcResult* pResult = m_sCalcParam.pCalcResult;
-
- if (pCurData == NULL || pRefData == NULL) {
- return RET_INVALIDPARAM;
- }
-
- pResult->pCurY = pCurData;
- pResult->pRefY = pRefData;
- if (m_sCalcParam.iCalcBgd) {
- if (m_sCalcParam.iCalcSsd) {
- m_sVaaFuncs.pfVAACalcSadSsdBgd (pCurData, pRefData, iPicWidth, iPicHeight, iPicStride, &pResult->iFrameSad,
- (int32_t*)pResult->pSad8x8, pResult->pSum16x16, pResult->pSumOfSquare16x16, pResult->pSsd16x16,
- (int32_t*)pResult->pSumOfDiff8x8, (uint8_t*)pResult->pMad8x8);
- } else {
- m_sVaaFuncs.pfVAACalcSadBgd (pCurData, pRefData, iPicWidth, iPicHeight, iPicStride, &pResult->iFrameSad,
- (int32_t*) (pResult->pSad8x8), (int32_t*) (pResult->pSumOfDiff8x8), (uint8_t*)pResult->pMad8x8);
- }
- } else {
- if (m_sCalcParam.iCalcSsd) {
- m_sVaaFuncs.pfVAACalcSadSsd (pCurData, pRefData, iPicWidth, iPicHeight, iPicStride, &pResult->iFrameSad,
- (int32_t*)pResult->pSad8x8, pResult->pSum16x16, pResult->pSumOfSquare16x16, pResult->pSsd16x16);
- } else {
- if (m_sCalcParam.iCalcVar) {
- m_sVaaFuncs.pfVAACalcSadVar (pCurData, pRefData, iPicWidth, iPicHeight, iPicStride, &pResult->iFrameSad,
- (int32_t*)pResult->pSad8x8, pResult->pSum16x16, pResult->pSumOfSquare16x16);
- } else {
- m_sVaaFuncs.pfVAACalcSad (pCurData, pRefData, iPicWidth, iPicHeight, iPicStride, &pResult->iFrameSad,
- (int32_t*)pResult->pSad8x8);
- }
- }
- }
-
- return RET_SUCCESS;
-}
-
-EResult CVAACalculation::Set (int32_t iType, void* pParam) {
- if (pParam == NULL || ((SVAACalcParam*)pParam)->pCalcResult == NULL) {
- return RET_INVALIDPARAM;
- }
-
- m_sCalcParam = * (SVAACalcParam*)pParam;
-
- return RET_SUCCESS;
-}
-
-
-WELSVP_NAMESPACE_END
--- a/processing/src/vaacalc/vaacalculation.h
+++ /dev/null
@@ -1,125 +1,0 @@
-/*!
- * \copy
- * Copyright (c) 2011-2013, Cisco Systems
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- * \file : vaacalculation.h
- *
- * \brief : pVaa calculation class of wels video processor class
- *
- * \date : 2011/03/18
- *
- * \description : 1. rewrite the package code of pVaa calculation class
- *
- *************************************************************************************
- */
-
-#ifndef WELSVP_VAACALCULATION_H
-#define WELSVP_VAACALCULATION_H
-
-#include "../common/util.h"
-#include "../common/memory.h"
-#include "../common/WelsFrameWork.h"
-#include "../../interface/IWelsVP.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-typedef void (VAACalcSadBgdFunc) (uint8_t* pCurData, uint8_t* pRefData, int32_t iPicWidth, int32_t iPicHeight,
- int32_t iPicStride,
- int32_t* pFrameSad, int32_t* pSad8x8, int32_t* pSd8x8, uint8_t* pMad8x8);
-
-typedef void (VAACalcSadSsdBgdFunc) (uint8_t* pCurData, uint8_t* pRefData, int32_t iPicWidth, int32_t iPicHeight,
- int32_t iPicStride,
- int32_t* pFrameSad, int32_t* pSad8x8, int32_t* pSum16x16, int32_t* pSumSquare16x16,
- int32_t* pSsd16x16, int32_t* pSd8x8, uint8_t* pMad8x8);
-
-typedef void (VAACalcSadFunc) (uint8_t* pCurData, uint8_t* pRefData, int32_t iPicWidth, int32_t iPicHeight,
- int32_t iPicStride,
- int32_t* pFrameSad, int32_t* pSad8x8);
-
-typedef void (VAACalcSadVarFunc) (uint8_t* pCurData, uint8_t* pRefData, int32_t iPicWidth, int32_t iPicHeight,
- int32_t iPicStride,
- int32_t* pFrameSad, int32_t* pSad8x8, int32_t* pSum16x16, int32_t* pSumSquare16x16);
-
-typedef void (VAACalcSadSsdFunc) (uint8_t* pCurData, uint8_t* pRefData, int32_t iPicWidth, int32_t iPicHeight,
- int32_t iPicStride,
- int32_t* pFrameSad, int32_t* pSad8x8, int32_t* pSum16x16, int32_t* pSumSquare16x16, int32_t* pSsd16x16);
-
-
-typedef VAACalcSadBgdFunc* PVAACalcSadBgdFunc;
-typedef VAACalcSadSsdBgdFunc* PVAACalcSadSsdBgdFunc;
-typedef VAACalcSadFunc* PVAACalcSadFunc;
-typedef VAACalcSadVarFunc* PVAACalcSadVarFunc;
-typedef VAACalcSadSsdFunc* PVAACalcSadSsdFunc;
-
-typedef struct TagVaaFuncs {
- PVAACalcSadBgdFunc pfVAACalcSadBgd;
- PVAACalcSadSsdBgdFunc pfVAACalcSadSsdBgd;
- PVAACalcSadFunc pfVAACalcSad;
- PVAACalcSadVarFunc pfVAACalcSadVar;
- PVAACalcSadSsdFunc pfVAACalcSadSsd;
-} SVaaFuncs;
-
-
-VAACalcSadBgdFunc VAACalcSadBgd_c;
-VAACalcSadSsdBgdFunc VAACalcSadSsdBgd_c;
-VAACalcSadFunc VAACalcSad_c;
-VAACalcSadVarFunc VAACalcSadVar_c;
-VAACalcSadSsdFunc VAACalcSadSsd_c;
-
-
-#ifdef X86_ASM
-WELSVP_EXTERN_C_BEGIN
-VAACalcSadBgdFunc VAACalcSadBgd_sse2;
-VAACalcSadSsdBgdFunc VAACalcSadSsdBgd_sse2;
-VAACalcSadFunc VAACalcSad_sse2;
-VAACalcSadVarFunc VAACalcSadVar_sse2;
-VAACalcSadSsdFunc VAACalcSadSsd_sse2;
-WELSVP_EXTERN_C_END
-#endif
-
-class CVAACalculation : public IStrategy {
- public:
- CVAACalculation (int32_t iCpuFlag);
- ~CVAACalculation();
-
- EResult Process (int32_t iType, SPixMap* pCurPixMap, SPixMap* pRefPixMap);
- EResult Set (int32_t iType, void* pParam);
-
- private:
- void InitVaaFuncs (SVaaFuncs& sVaaFunc, int32_t iCpuFlag);
-
- private:
- SVaaFuncs m_sVaaFuncs;
- int32_t m_iCPUFlag;
- SVAACalcParam m_sCalcParam;
-};
-
-WELSVP_NAMESPACE_END
-
-#endif
--- a/processing/targets.mk
+++ /dev/null
@@ -1,122 +1,0 @@
-PROCESSING_PREFIX=PROCESSING
-PROCESSING_SRCDIR=processing
-PROCESSING_CPP_SRCS=\
- $(PROCESSING_SRCDIR)/./src/adaptivequantization/AdaptiveQuantization.cpp\
- $(PROCESSING_SRCDIR)/./src/backgounddetection/BackgroundDetection.cpp\
- $(PROCESSING_SRCDIR)/./src/common/cpu.cpp\
- $(PROCESSING_SRCDIR)/./src/common/memory.cpp\
- $(PROCESSING_SRCDIR)/./src/common/thread.cpp\
- $(PROCESSING_SRCDIR)/./src/common/util.cpp\
- $(PROCESSING_SRCDIR)/./src/common/WelsFrameWork.cpp\
- $(PROCESSING_SRCDIR)/./src/common/WelsFrameWorkEx.cpp\
- $(PROCESSING_SRCDIR)/./src/complexityanalysis/ComplexityAnalysis.cpp\
- $(PROCESSING_SRCDIR)/./src/denoise/denoise.cpp\
- $(PROCESSING_SRCDIR)/./src/denoise/denoise_filter.cpp\
- $(PROCESSING_SRCDIR)/./src/downsample/downsample.cpp\
- $(PROCESSING_SRCDIR)/./src/downsample/downsamplefuncs.cpp\
- $(PROCESSING_SRCDIR)/./src/imagerotate/imagerotate.cpp\
- $(PROCESSING_SRCDIR)/./src/imagerotate/imagerotatefuncs.cpp\
- $(PROCESSING_SRCDIR)/./src/scenechangedetection/SceneChangeDetection.cpp\
- $(PROCESSING_SRCDIR)/./src/scenechangedetection/SceneChangeDetectionCommon.cpp\
- $(PROCESSING_SRCDIR)/./src/vaacalc/vaacalcfuncs.cpp\
- $(PROCESSING_SRCDIR)/./src/vaacalc/vaacalculation.cpp\
-
-PROCESSING_OBJS += $(PROCESSING_CPP_SRCS:.cpp=.o)
-ifeq ($(USE_ASM), Yes)
-PROCESSING_ASM_SRCS=\
- $(PROCESSING_SRCDIR)/./src/asm/asm_inc.asm\
- $(PROCESSING_SRCDIR)/./src/asm/cpuid.asm\
- $(PROCESSING_SRCDIR)/./src/asm/denoisefilter.asm\
- $(PROCESSING_SRCDIR)/./src/asm/downsample_bilinear.asm\
- $(PROCESSING_SRCDIR)/./src/asm/intra_pred.asm\
- $(PROCESSING_SRCDIR)/./src/asm/sad.asm\
- $(PROCESSING_SRCDIR)/./src/asm/vaa.asm\
-
-PROCESSING_OBJS += $(PROCESSING_ASM_SRCS:.asm=.o)
-endif
-
-OBJS += $(PROCESSING_OBJS)
-$(PROCESSING_SRCDIR)/./src/adaptivequantization/AdaptiveQuantization.o: $(PROCESSING_SRCDIR)/./src/adaptivequantization/AdaptiveQuantization.cpp
- $(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $(PROCESSING_SRCDIR)/./src/adaptivequantization/AdaptiveQuantization.o $(PROCESSING_SRCDIR)/./src/adaptivequantization/AdaptiveQuantization.cpp
-
-$(PROCESSING_SRCDIR)/./src/backgounddetection/BackgroundDetection.o: $(PROCESSING_SRCDIR)/./src/backgounddetection/BackgroundDetection.cpp
- $(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $(PROCESSING_SRCDIR)/./src/backgounddetection/BackgroundDetection.o $(PROCESSING_SRCDIR)/./src/backgounddetection/BackgroundDetection.cpp
-
-$(PROCESSING_SRCDIR)/./src/common/cpu.o: $(PROCESSING_SRCDIR)/./src/common/cpu.cpp
- $(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $(PROCESSING_SRCDIR)/./src/common/cpu.o $(PROCESSING_SRCDIR)/./src/common/cpu.cpp
-
-$(PROCESSING_SRCDIR)/./src/common/memory.o: $(PROCESSING_SRCDIR)/./src/common/memory.cpp
- $(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $(PROCESSING_SRCDIR)/./src/common/memory.o $(PROCESSING_SRCDIR)/./src/common/memory.cpp
-
-$(PROCESSING_SRCDIR)/./src/common/thread.o: $(PROCESSING_SRCDIR)/./src/common/thread.cpp
- $(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $(PROCESSING_SRCDIR)/./src/common/thread.o $(PROCESSING_SRCDIR)/./src/common/thread.cpp
-
-$(PROCESSING_SRCDIR)/./src/common/util.o: $(PROCESSING_SRCDIR)/./src/common/util.cpp
- $(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $(PROCESSING_SRCDIR)/./src/common/util.o $(PROCESSING_SRCDIR)/./src/common/util.cpp
-
-$(PROCESSING_SRCDIR)/./src/common/WelsFrameWork.o: $(PROCESSING_SRCDIR)/./src/common/WelsFrameWork.cpp
- $(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $(PROCESSING_SRCDIR)/./src/common/WelsFrameWork.o $(PROCESSING_SRCDIR)/./src/common/WelsFrameWork.cpp
-
-$(PROCESSING_SRCDIR)/./src/common/WelsFrameWorkEx.o: $(PROCESSING_SRCDIR)/./src/common/WelsFrameWorkEx.cpp
- $(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $(PROCESSING_SRCDIR)/./src/common/WelsFrameWorkEx.o $(PROCESSING_SRCDIR)/./src/common/WelsFrameWorkEx.cpp
-
-$(PROCESSING_SRCDIR)/./src/complexityanalysis/ComplexityAnalysis.o: $(PROCESSING_SRCDIR)/./src/complexityanalysis/ComplexityAnalysis.cpp
- $(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $(PROCESSING_SRCDIR)/./src/complexityanalysis/ComplexityAnalysis.o $(PROCESSING_SRCDIR)/./src/complexityanalysis/ComplexityAnalysis.cpp
-
-$(PROCESSING_SRCDIR)/./src/denoise/denoise.o: $(PROCESSING_SRCDIR)/./src/denoise/denoise.cpp
- $(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $(PROCESSING_SRCDIR)/./src/denoise/denoise.o $(PROCESSING_SRCDIR)/./src/denoise/denoise.cpp
-
-$(PROCESSING_SRCDIR)/./src/denoise/denoise_filter.o: $(PROCESSING_SRCDIR)/./src/denoise/denoise_filter.cpp
- $(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $(PROCESSING_SRCDIR)/./src/denoise/denoise_filter.o $(PROCESSING_SRCDIR)/./src/denoise/denoise_filter.cpp
-
-$(PROCESSING_SRCDIR)/./src/downsample/downsample.o: $(PROCESSING_SRCDIR)/./src/downsample/downsample.cpp
- $(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $(PROCESSING_SRCDIR)/./src/downsample/downsample.o $(PROCESSING_SRCDIR)/./src/downsample/downsample.cpp
-
-$(PROCESSING_SRCDIR)/./src/downsample/downsamplefuncs.o: $(PROCESSING_SRCDIR)/./src/downsample/downsamplefuncs.cpp
- $(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $(PROCESSING_SRCDIR)/./src/downsample/downsamplefuncs.o $(PROCESSING_SRCDIR)/./src/downsample/downsamplefuncs.cpp
-
-$(PROCESSING_SRCDIR)/./src/imagerotate/imagerotate.o: $(PROCESSING_SRCDIR)/./src/imagerotate/imagerotate.cpp
- $(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $(PROCESSING_SRCDIR)/./src/imagerotate/imagerotate.o $(PROCESSING_SRCDIR)/./src/imagerotate/imagerotate.cpp
-
-$(PROCESSING_SRCDIR)/./src/imagerotate/imagerotatefuncs.o: $(PROCESSING_SRCDIR)/./src/imagerotate/imagerotatefuncs.cpp
- $(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $(PROCESSING_SRCDIR)/./src/imagerotate/imagerotatefuncs.o $(PROCESSING_SRCDIR)/./src/imagerotate/imagerotatefuncs.cpp
-
-$(PROCESSING_SRCDIR)/./src/scenechangedetection/SceneChangeDetection.o: $(PROCESSING_SRCDIR)/./src/scenechangedetection/SceneChangeDetection.cpp
- $(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $(PROCESSING_SRCDIR)/./src/scenechangedetection/SceneChangeDetection.o $(PROCESSING_SRCDIR)/./src/scenechangedetection/SceneChangeDetection.cpp
-
-$(PROCESSING_SRCDIR)/./src/scenechangedetection/SceneChangeDetectionCommon.o: $(PROCESSING_SRCDIR)/./src/scenechangedetection/SceneChangeDetectionCommon.cpp
- $(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $(PROCESSING_SRCDIR)/./src/scenechangedetection/SceneChangeDetectionCommon.o $(PROCESSING_SRCDIR)/./src/scenechangedetection/SceneChangeDetectionCommon.cpp
-
-$(PROCESSING_SRCDIR)/./src/vaacalc/vaacalcfuncs.o: $(PROCESSING_SRCDIR)/./src/vaacalc/vaacalcfuncs.cpp
- $(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $(PROCESSING_SRCDIR)/./src/vaacalc/vaacalcfuncs.o $(PROCESSING_SRCDIR)/./src/vaacalc/vaacalcfuncs.cpp
-
-$(PROCESSING_SRCDIR)/./src/vaacalc/vaacalculation.o: $(PROCESSING_SRCDIR)/./src/vaacalc/vaacalculation.cpp
- $(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(PROCESSING_CFLAGS) $(PROCESSING_INCLUDES) -c -o $(PROCESSING_SRCDIR)/./src/vaacalc/vaacalculation.o $(PROCESSING_SRCDIR)/./src/vaacalc/vaacalculation.cpp
-
-$(PROCESSING_SRCDIR)/./src/asm/asm_inc.o: $(PROCESSING_SRCDIR)/./src/asm/asm_inc.asm
- $(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(PROCESSING_ASMFLAGS) $(PROCESSING_ASM_INCLUDES) -o $(PROCESSING_SRCDIR)/./src/asm/asm_inc.o $(PROCESSING_SRCDIR)/./src/asm/asm_inc.asm
-
-$(PROCESSING_SRCDIR)/./src/asm/cpuid.o: $(PROCESSING_SRCDIR)/./src/asm/cpuid.asm
- $(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(PROCESSING_ASMFLAGS) $(PROCESSING_ASM_INCLUDES) -o $(PROCESSING_SRCDIR)/./src/asm/cpuid.o $(PROCESSING_SRCDIR)/./src/asm/cpuid.asm
-
-$(PROCESSING_SRCDIR)/./src/asm/denoisefilter.o: $(PROCESSING_SRCDIR)/./src/asm/denoisefilter.asm
- $(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(PROCESSING_ASMFLAGS) $(PROCESSING_ASM_INCLUDES) -o $(PROCESSING_SRCDIR)/./src/asm/denoisefilter.o $(PROCESSING_SRCDIR)/./src/asm/denoisefilter.asm
-
-$(PROCESSING_SRCDIR)/./src/asm/downsample_bilinear.o: $(PROCESSING_SRCDIR)/./src/asm/downsample_bilinear.asm
- $(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(PROCESSING_ASMFLAGS) $(PROCESSING_ASM_INCLUDES) -o $(PROCESSING_SRCDIR)/./src/asm/downsample_bilinear.o $(PROCESSING_SRCDIR)/./src/asm/downsample_bilinear.asm
-
-$(PROCESSING_SRCDIR)/./src/asm/intra_pred.o: $(PROCESSING_SRCDIR)/./src/asm/intra_pred.asm
- $(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(PROCESSING_ASMFLAGS) $(PROCESSING_ASM_INCLUDES) -o $(PROCESSING_SRCDIR)/./src/asm/intra_pred.o $(PROCESSING_SRCDIR)/./src/asm/intra_pred.asm
-
-$(PROCESSING_SRCDIR)/./src/asm/sad.o: $(PROCESSING_SRCDIR)/./src/asm/sad.asm
- $(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(PROCESSING_ASMFLAGS) $(PROCESSING_ASM_INCLUDES) -o $(PROCESSING_SRCDIR)/./src/asm/sad.o $(PROCESSING_SRCDIR)/./src/asm/sad.asm
-
-$(PROCESSING_SRCDIR)/./src/asm/vaa.o: $(PROCESSING_SRCDIR)/./src/asm/vaa.asm
- $(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(PROCESSING_ASMFLAGS) $(PROCESSING_ASM_INCLUDES) -o $(PROCESSING_SRCDIR)/./src/asm/vaa.o $(PROCESSING_SRCDIR)/./src/asm/vaa.asm
-
-$(LIBPREFIX)processing.$(LIBSUFFIX): $(PROCESSING_OBJS)
- rm -f $(LIBPREFIX)processing.$(LIBSUFFIX)
- $(AR) cr $@ $(PROCESSING_OBJS)
-
-libraries: $(LIBPREFIX)processing.$(LIBSUFFIX)
-LIBRARIES += $(LIBPREFIX)processing.$(LIBSUFFIX)