ref: 5c9f447c0ea19870c48b27c2d1d43af2d42eb579
parent: ae027b83d8f90c0517a37927d28e3483e69747c4
author: volvet <[email protected]>
date: Tue Jan 21 06:16:48 EST 2014
fix win64 float issue (callee-saved xmm registers clobbered by SSE2 assembly), enable AQ assembly
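
The Microsoft x64 calling convention treats xmm6-xmm15 as non-volatile (callee-saved), but the hand-written NASM routines use them without saving them, so floating-point state in C++ callers can be corrupted on Win64. This change adds WelsXmmRegStore/WelsXmmRegLoad (cpuid.asm) and the XMMREG_PROTECT_* macros (cpu.h) to save and restore xmm6-xmm15 around calls into assembly, wires them into CWelsH264SVCEncoder and CAdaptiveQuantization, re-enables SampleVariance16x16_sse2 for adaptive quantization, and updates the VS projects (X86_ASM defines, include paths, Win32/x64 configuration fixes). The snippet below is a minimal usage sketch of the new macros; CSomeCaller and Run() are hypothetical names for illustration and are not part of the patch. On non-WIN64 builds every macro expands to nothing.

    // Hypothetical caller showing the XMMREG_PROTECT_* pattern; the real call
    // sites in this patch are CWelsH264SVCEncoder (welsEncoderExt.*) and
    // CAdaptiveQuantization (AdaptiveQuantization.*).
    #include "cpu.h"

    class CSomeCaller {
      XMMREG_PROTECT_DECLARE(CSomeCaller);     // load/store function pointers + 160-byte spill buffer
     public:
      CSomeCaller()  { XMMREG_PROTECT_INIT(CSomeCaller); }    // picks SSE2 save/restore or the empty op
      ~CSomeCaller() { XMMREG_PROTECT_UNINIT(CSomeCaller); }  // currently expands to nothing

      void Run() {
        XMMREG_PROTECT_STORE(CSomeCaller);     // save xmm6-xmm15 before entering NASM code
        // ... call into hand-written SSE2 assembly here (e.g. WelsEncoderEncodeExt) ...
        XMMREG_PROTECT_LOAD(CSomeCaller);      // restore xmm6-xmm15 for the C++ caller
      }
    };
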
--- a/codec/build/win32/enc/WelsEncPlus.vcproj
+++ b/codec/build/win32/enc/WelsEncPlus.vcproj
@@ -53,7 +53,7 @@
Name="VCCLCompilerTool"
Optimization="0"
AdditionalIncludeDirectories="..\..\..\encoder\plus\inc;..\..\..\encoder\core\inc;..\..\..\api\svc;..\..\..\common;..\..\..\WelsThreadLib\api"
- PreprocessorDefinitions="WIN32;_DEBUG;_WINDOWS;_USRDLL;WELSENCPLUS_EXPORTS;MT_ENABLED;"
+ PreprocessorDefinitions="WIN32;_DEBUG;_WINDOWS;_USRDLL;WELSENCPLUS_EXPORTS;MT_ENABLED;X86_ASM"
MinimalRebuild="true"
BasicRuntimeChecks="3"
RuntimeLibrary="3"
@@ -118,9 +118,9 @@
/>
</Configuration>
<Configuration
- Name="Release|Win32"
- OutputDirectory=".\..\..\..\..\bin\win32\Release"
- IntermediateDirectory=".\..\..\..\obj\encoder\plus\Release"
+ Name="Debug|x64"
+ OutputDirectory=".\..\..\..\..\bin\win64\Debug"
+ IntermediateDirectory=".\..\..\..\obj\encoder\plus\Debug"
ConfigurationType="2"
InheritedPropertySheets="$(VCInstallDir)VCProjectDefaults\UpgradeFromVC60.vsprops"
UseOfMFC="0"
@@ -141,29 +141,25 @@
/>
<Tool
Name="VCMIDLTool"
- PreprocessorDefinitions="NDEBUG"
+ PreprocessorDefinitions="_DEBUG"
MkTypLibCompatible="true"
SuppressStartupBanner="true"
- TargetEnvironment="1"
- TypeLibraryName=".\..\..\..\..\..\bin\Release/WelsEncPlus.tlb"
+ TargetEnvironment="3"
+ TypeLibraryName=".\..\..\..\..\..\bin\Debug/WelsEncPlus.tlb"
HeaderFileName=""
/>
<Tool
Name="VCCLCompilerTool"
- Optimization="3"
- InlineFunctionExpansion="2"
- FavorSizeOrSpeed="1"
- EnableFiberSafeOptimizations="true"
- WholeProgramOptimization="true"
+ Optimization="0"
AdditionalIncludeDirectories="..\..\..\encoder\plus\inc;..\..\..\encoder\core\inc;..\..\..\api\svc;..\..\..\common;..\..\..\WelsThreadLib\api"
- PreprocessorDefinitions="WIN32;NDEBUG;_WINDOWS;_USRDLL;WELSENCPLUS_EXPORTS;MT_ENABLED;"
- StringPooling="true"
- RuntimeLibrary="2"
- EnableFunctionLevelLinking="true"
- PrecompiledHeaderFile=".\..\..\..\obj\encoder\plus\Release/WelsEncPlus.pch"
- AssemblerListingLocation=".\..\..\..\obj\encoder\plus\Release/"
- ObjectFile=".\..\..\..\obj\encoder\plus\Release/"
- ProgramDataBaseFileName=".\..\..\..\obj\encoder\plus\Release/"
+ PreprocessorDefinitions="WIN64;_DEBUG;_WINDOWS;_USRDLL;WELSENCPLUS_EXPORTS;MT_ENABLED;X86_ASM"
+ MinimalRebuild="true"
+ BasicRuntimeChecks="3"
+ RuntimeLibrary="3"
+ PrecompiledHeaderFile=".\..\..\..\obj\encoder\plus\Debug/WelsEncPlus.pch"
+ AssemblerListingLocation=".\..\..\..\obj\encoder\plus\Debug/"
+ ObjectFile=".\..\..\..\obj\encoder\plus\Debug/"
+ ProgramDataBaseFileName=".\..\..\..\obj\encoder\plus\Debug/"
WarningLevel="3"
SuppressStartupBanner="true"
DebugInformationFormat="3"
@@ -173,7 +169,7 @@
/>
<Tool
Name="VCResourceCompilerTool"
- PreprocessorDefinitions="NDEBUG"
+ PreprocessorDefinitions="_DEBUG"
Culture="1033"
/>
<Tool
@@ -181,22 +177,20 @@
/>
<Tool
Name="VCLinkerTool"
- AdditionalOptions="/MAPINFO:exports /LTCG"
AdditionalDependencies="$(OutDir)\welsecore.lib"
OutputFile="$(OutDir)\welsenc.dll"
- LinkIncremental="1"
+ LinkIncremental="2"
SuppressStartupBanner="true"
AdditionalLibraryDirectories="..\..\..\..\libs"
ModuleDefinitionFile="..\..\..\encoder\plus\src\wels_enc_export.def"
GenerateDebugInformation="true"
ProgramDatabaseFile="$(OutDir)\welsenc.pdb"
- GenerateMapFile="false"
- MapFileName=""
- MapExports="false"
+ GenerateMapFile="true"
+ MapFileName="$(OutDir)\welsenc.map"
RandomizedBaseAddress="1"
DataExecutionPrevention="2"
ImportLibrary="$(OutDir)\welsenc.lib"
- TargetMachine="1"
+ TargetMachine="17"
/>
<Tool
Name="VCALinkTool"
@@ -223,9 +217,9 @@
/>
</Configuration>
<Configuration
- Name="Debug|x64"
- OutputDirectory=".\..\..\..\..\bin\win64\Debug"
- IntermediateDirectory=".\..\..\..\obj\encoder\plus\Debug"
+ Name="Release|Win32"
+ OutputDirectory=".\..\..\..\..\bin\win32\Release"
+ IntermediateDirectory=".\..\..\..\obj\encoder\plus\Release"
ConfigurationType="2"
InheritedPropertySheets="$(VCInstallDir)VCProjectDefaults\UpgradeFromVC60.vsprops"
UseOfMFC="0"
@@ -246,25 +240,29 @@
/>
<Tool
Name="VCMIDLTool"
- PreprocessorDefinitions="_DEBUG"
+ PreprocessorDefinitions="NDEBUG"
MkTypLibCompatible="true"
SuppressStartupBanner="true"
- TargetEnvironment="3"
- TypeLibraryName=".\..\..\..\..\..\bin\Debug/WelsEncPlus.tlb"
+ TargetEnvironment="1"
+ TypeLibraryName=".\..\..\..\..\..\bin\Release/WelsEncPlus.tlb"
HeaderFileName=""
/>
<Tool
Name="VCCLCompilerTool"
- Optimization="0"
+ Optimization="3"
+ InlineFunctionExpansion="2"
+ FavorSizeOrSpeed="1"
+ EnableFiberSafeOptimizations="true"
+ WholeProgramOptimization="true"
AdditionalIncludeDirectories="..\..\..\encoder\plus\inc;..\..\..\encoder\core\inc;..\..\..\api\svc;..\..\..\common;..\..\..\WelsThreadLib\api"
- PreprocessorDefinitions="WIN64;_DEBUG;_WINDOWS;_USRDLL;WELSENCPLUS_EXPORTS;MT_ENABLED"
- MinimalRebuild="true"
- BasicRuntimeChecks="3"
- RuntimeLibrary="3"
- PrecompiledHeaderFile=".\..\..\..\obj\encoder\plus\Debug/WelsEncPlus.pch"
- AssemblerListingLocation=".\..\..\..\obj\encoder\plus\Debug/"
- ObjectFile=".\..\..\..\obj\encoder\plus\Debug/"
- ProgramDataBaseFileName=".\..\..\..\obj\encoder\plus\Debug/"
+ PreprocessorDefinitions="WIN32;NDEBUG;_WINDOWS;_USRDLL;WELSENCPLUS_EXPORTS;MT_ENABLED;X86_ASM"
+ StringPooling="true"
+ RuntimeLibrary="2"
+ EnableFunctionLevelLinking="true"
+ PrecompiledHeaderFile=".\..\..\..\obj\encoder\plus\Release/WelsEncPlus.pch"
+ AssemblerListingLocation=".\..\..\..\obj\encoder\plus\Release/"
+ ObjectFile=".\..\..\..\obj\encoder\plus\Release/"
+ ProgramDataBaseFileName=".\..\..\..\obj\encoder\plus\Release/"
WarningLevel="3"
SuppressStartupBanner="true"
DebugInformationFormat="3"
@@ -274,7 +272,7 @@
/>
<Tool
Name="VCResourceCompilerTool"
- PreprocessorDefinitions="_DEBUG"
+ PreprocessorDefinitions="NDEBUG"
Culture="1033"
/>
<Tool
@@ -282,20 +280,22 @@
/>
<Tool
Name="VCLinkerTool"
+ AdditionalOptions="/MAPINFO:exports /LTCG"
AdditionalDependencies="$(OutDir)\welsecore.lib"
OutputFile="$(OutDir)\welsenc.dll"
- LinkIncremental="2"
+ LinkIncremental="1"
SuppressStartupBanner="true"
AdditionalLibraryDirectories="..\..\..\..\libs"
ModuleDefinitionFile="..\..\..\encoder\plus\src\wels_enc_export.def"
GenerateDebugInformation="true"
ProgramDatabaseFile="$(OutDir)\welsenc.pdb"
- GenerateMapFile="true"
- MapFileName="$(OutDir)\welsenc.map"
+ GenerateMapFile="false"
+ MapFileName=""
+ MapExports="false"
RandomizedBaseAddress="1"
DataExecutionPrevention="2"
ImportLibrary="$(OutDir)\welsenc.lib"
- TargetMachine="17"
+ TargetMachine="1"
/>
<Tool
Name="VCALinkTool"
@@ -360,7 +360,7 @@
EnableFiberSafeOptimizations="true"
WholeProgramOptimization="true"
AdditionalIncludeDirectories="..\..\..\encoder\plus\inc;..\..\..\encoder\core\inc;..\..\..\api\svc;..\..\..\common;..\..\..\WelsThreadLib\api"
- PreprocessorDefinitions="WIN64;NDEBUG;_WINDOWS;_USRDLL;WELSENCPLUS_EXPORTS;MT_ENABLED;"
+ PreprocessorDefinitions="WIN64;NDEBUG;_WINDOWS;_USRDLL;WELSENCPLUS_EXPORTS;MT_ENABLED;X86_ASM"
StringPooling="true"
RuntimeLibrary="2"
EnableFunctionLevelLinking="true"
@@ -447,7 +447,7 @@
/>
</FileConfiguration>
<FileConfiguration
- Name="Release|Win32"
+ Name="Debug|x64"
>
<Tool
Name="VCCLCompilerTool"
@@ -456,7 +456,7 @@
/>
</FileConfiguration>
<FileConfiguration
- Name="Debug|x64"
+ Name="Release|Win32"
>
<Tool
Name="VCCLCompilerTool"
@@ -491,7 +491,7 @@
/>
</FileConfiguration>
<FileConfiguration
- Name="Release|Win32"
+ Name="Debug|x64"
>
<Tool
Name="VCCLCompilerTool"
@@ -500,7 +500,7 @@
/>
</FileConfiguration>
<FileConfiguration
- Name="Debug|x64"
+ Name="Release|Win32"
>
<Tool
Name="VCCLCompilerTool"
@@ -531,7 +531,7 @@
/>
</FileConfiguration>
<FileConfiguration
- Name="Release|Win32"
+ Name="Debug|x64"
>
<Tool
Name="VCCLCompilerTool"
@@ -540,7 +540,7 @@
/>
</FileConfiguration>
<FileConfiguration
- Name="Debug|x64"
+ Name="Release|Win32"
>
<Tool
Name="VCCLCompilerTool"
@@ -589,7 +589,7 @@
/>
</FileConfiguration>
<FileConfiguration
- Name="Release|Win32"
+ Name="Debug|x64"
>
<Tool
Name="VCResourceCompilerTool"
@@ -598,7 +598,7 @@
/>
</FileConfiguration>
<FileConfiguration
- Name="Debug|x64"
+ Name="Release|Win32"
>
<Tool
Name="VCResourceCompilerTool"
--- a/codec/common/cpu.cpp
+++ b/codec/common/cpu.cpp
@@ -204,6 +204,9 @@
}
}
+void WelsXmmRegEmptyOp(void * pSrc) {
+}
+
#endif
--- a/codec/common/cpu.h
+++ b/codec/common/cpu.h
@@ -41,9 +41,9 @@
#define WELS_CPU_DETECTION_H__
#include "typedefs.h"
+#include "cpu_core.h"
-
#if defined(__cplusplus)
extern "C" {
#endif//__cplusplus
@@ -69,12 +69,56 @@
*/
void WelsCPURestore (const uint32_t kuiCPU);
+#ifdef WIN64
+void WelsXmmRegStore(void * src);
+void WelsXmmRegLoad(void * src);
#endif
+#endif
+
+void WelsXmmRegEmptyOp(void * pSrc);
+
#if defined(__cplusplus)
}
#endif//__cplusplus
+typedef void (*WelsXmmRegProtectFunc)(void * pSrc);
+
+#ifdef WIN64
+#define XMMREG_PROTECT_DECLARE(name) \
+ WelsXmmRegProtectFunc name##load;\
+ WelsXmmRegProtectFunc name##store;\
+ uint8_t name##Buffer[160];
+
+#define XMMREG_PROTECT_INIT(name) \
+ { \
+ uint32_t uiCpuFlag = WelsCPUFeatureDetect(NULL);\
+ if( uiCpuFlag & WELS_CPU_SSE2 ){\
+ name##load = WelsXmmRegLoad;\
+ name##store = WelsXmmRegStore; \
+ } else { \
+ name##load = WelsXmmRegEmptyOp; \
+ name##store = WelsXmmRegEmptyOp; \
+ } \
+ }
+
+#define XMMREG_PROTECT_UNINIT(name) \
+
+#define XMMREG_PROTECT_STORE(name) \
+ name##store(name##Buffer);
+
+#define XMMREG_PROTECT_LOAD(name) \
+ name##load(name##Buffer);
+
+#else
+
+#define XMMREG_PROTECT_DECLARE(name)
+#define XMMREG_PROTECT_INIT(name)
+#define XMMREG_PROTECT_UNINIT(name)
+#define XMMREG_PROTECT_STORE(name)
+#define XMMREG_PROTECT_LOAD(name)
+
+#endif
#endif//WELS_CPU_DETECTION_H__
--- a/codec/common/cpuid.asm
+++ b/codec/common/cpuid.asm
@@ -221,4 +221,43 @@
ret
+%ifdef WIN64
+
+WELS_EXTERN WelsXmmRegStore
+ALIGN 16
+;******************************************************************************************
+; void WelsXmmRegStore(void *src)
+;******************************************************************************************
+WelsXmmRegStore:
+ movdqu [rcx], xmm6
+ movdqu [rcx+16], xmm7
+ movdqu [rcx+32], xmm8
+ movdqu [rcx+48], xmm9
+ movdqu [rcx+64], xmm10
+ movdqu [rcx+80], xmm11
+ movdqu [rcx+96], xmm12
+ movdqu [rcx+112], xmm13
+ movdqu [rcx+128], xmm14
+ movdqu [rcx+144], xmm15
+ ret
+
+WELS_EXTERN WelsXmmRegLoad
+ALIGN 16
+;******************************************************************************************
+; void WelsXmmRegLoad(void *src)
+;******************************************************************************************
+WelsXmmRegLoad:
+ movdqu xmm6, [rcx]
+ movdqu xmm7, [rcx+16]
+ movdqu xmm8, [rcx+32]
+ movdqu xmm9, [rcx+48]
+ movdqu xmm10, [rcx+64]
+ movdqu xmm11, [rcx+80]
+ movdqu xmm12, [rcx+96]
+ movdqu xmm13, [rcx+112]
+ movdqu xmm14, [rcx+128]
+ movdqu xmm15, [rcx+144]
+ ret
+%endif
+
--- a/codec/encoder/plus/inc/welsEncoderExt.h
+++ b/codec/encoder/plus/inc/welsEncoderExt.h
@@ -49,6 +49,7 @@
#include "encoder_context.h"
#include "param_svc.h"
#include "extern.h"
+#include "cpu.h"
//#define OUTPUT_BIT_STREAM
//#define DUMP_SRC_PICTURE
@@ -129,6 +130,8 @@
void InitEncoder (void);
int32_t RawData2SrcPic (const uint8_t* pSrc);
void DumpSrcPicture (const uint8_t* pSrc);
+
+ XMMREG_PROTECT_DECLARE(CWelsH264SVCEncoder);
};
}
#endif // !defined(AFX_WELSH264ENCODER_H__D9FAA1D1_5403_47E1_8E27_78F11EE65F02__INCLUDED_)
--- a/codec/encoder/plus/src/welsEncoderExt.cpp
+++ b/codec/encoder/plus/src/welsEncoderExt.cpp
@@ -218,6 +218,7 @@
#endif//OUTPUT_BIT_STREAM
InitEncoder();
+ XMMREG_PROTECT_INIT(CWelsH264SVCEncoder);
}
CWelsH264SVCEncoder::~CWelsH264SVCEncoder() {
@@ -253,6 +254,7 @@
#endif//OUTPUT_BIT_STREAM
Uninitialize();
+ XMMREG_PROTECT_UNINIT(CWelsH264SVCEncoder);
}
void CWelsH264SVCEncoder::InitEncoder (void) {
@@ -628,7 +630,9 @@
int32_t iFrameType = videoFrameTypeInvalid;
if (nSrcPicNum > 0) {
+ XMMREG_PROTECT_STORE(CWelsH264SVCEncoder);
iFrameTypeReturned = WelsEncoderEncodeExt (m_pEncContext, pBsInfo, pSrcPicList, nSrcPicNum);
+ XMMREG_PROTECT_LOAD(CWelsH264SVCEncoder);
} else {
assert (0);
return videoFrameTypeInvalid;
Binary files a/codec/processing/build/win32/WelsVP_2008.suo and b/codec/processing/build/win32/WelsVP_2008.suo differ
--- a/codec/processing/build/win32/WelsVP_2008.vcproj
+++ b/codec/processing/build/win32/WelsVP_2008.vcproj
@@ -137,7 +137,7 @@
<Tool
Name="VCCLCompilerTool"
Optimization="0"
- AdditionalIncludeDirectories=""
+ AdditionalIncludeDirectories="../../../common/"
PreprocessorDefinitions="WIN64;_DEBUG;_WINDOWS;_USRDLL;WELSVP_EXPORTS;X86_ASM"
MinimalRebuild="true"
BasicRuntimeChecks="3"
@@ -313,6 +313,7 @@
Optimization="3"
EnableIntrinsicFunctions="false"
FavorSizeOrSpeed="1"
+ AdditionalIncludeDirectories="../../../common/"
PreprocessorDefinitions="WIN64;NDEBUG;_WINDOWS;_USRDLL;WELSVP_EXPORTS;X86_ASM"
RuntimeLibrary="0"
EnableFunctionLevelLinking="false"
@@ -378,7 +379,7 @@
UniqueIdentifier="{4FC737F1-C7A5-4376-A066-2A32D752A2FF}"
>
<File
- RelativePath="..\..\src\common\cpu.cpp"
+ RelativePath="..\..\..\common\cpu.cpp"
>
</File>
<File
@@ -496,7 +497,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -514,7 +515,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -545,7 +546,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -576,7 +577,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -594,7 +595,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -607,7 +608,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -616,7 +617,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -625,7 +626,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -634,7 +635,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -647,7 +648,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -656,7 +657,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -665,7 +666,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win32 -DPREFIX -DX86_32 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -674,7 +675,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -696,7 +697,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
@@ -714,7 +715,7 @@
>
<Tool
Name="VCCustomBuildTool"
- CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)"
+ CommandLine="nasm -I$(InputDir) -I$(InputDir)/../../../common/ -f win64 -O3 -DWIN64 -o $(IntDir)\$(InputName).obj $(InputPath)
"
Outputs="$(IntDir)\$(InputName).obj"
/>
</FileConfiguration>
--- a/codec/processing/src/adaptivequantization/AdaptiveQuantization.cpp
+++ b/codec/processing/src/adaptivequantization/AdaptiveQuantization.cpp
@@ -50,9 +50,11 @@
m_pfVar = NULL;
WelsMemset (&m_sAdaptiveQuantParam, 0, sizeof (m_sAdaptiveQuantParam));
WelsInitVarFunc (m_pfVar, m_CPUFlag);
+ XMMREG_PROTECT_INIT(AdaptiveQuantization);
}
CAdaptiveQuantization::~CAdaptiveQuantization() {
+ XMMREG_PROTECT_UNINIT(AdaptiveQuantization);
}
EResult CAdaptiveQuantization::Process (int32_t iType, SPixMap* pSrcPixMap, SPixMap* pRefPixMap) {
@@ -101,6 +103,7 @@
pRefFrameTmp = pRefFrameY;
pCurFrameTmp = pCurFrameY;
for (i = 0; i < iMbWidth; i++) {
+ XMMREG_PROTECT_STORE(AdaptiveQuantization);
iSumDiff = pVaaCalcResults->pSad8x8[iMbIndex][0];
iSumDiff += pVaaCalcResults->pSad8x8[iMbIndex][1];
iSumDiff += pVaaCalcResults->pSad8x8[iMbIndex][2];
@@ -109,6 +112,7 @@
iSQDiff = pVaaCalcResults->pSsd16x16[iMbIndex];
uiSum = pVaaCalcResults->pSum16x16[iMbIndex];
iSQSum = pVaaCalcResults->pSumOfSquare16x16[iMbIndex];
+ XMMREG_PROTECT_LOAD(AdaptiveQuantization);
iSumDiff = iSumDiff >> 8;
pMotionTexture->uiMotionIndex = (iSQDiff >> 8) - (iSumDiff * iSumDiff);
@@ -131,7 +135,9 @@
pRefFrameTmp = pRefFrameY;
pCurFrameTmp = pCurFrameY;
for (i = 0; i < iMbWidth; i++) {
+ XMMREG_PROTECT_STORE(AdaptiveQuantization);
m_pfVar (pRefFrameTmp, iRefStride, pCurFrameTmp, iCurStride, pMotionTexture);
+ XMMREG_PROTECT_LOAD(AdaptiveQuantization);
dAverageMotionIndex += pMotionTexture->uiMotionIndex;
dAverageTextureIndex += pMotionTexture->uiTextureIndex;
pMotionTexture++;
@@ -223,7 +229,7 @@
#ifdef X86_ASM
if (iCpuFlag & WELS_CPU_SSE2) {
- // pfVar = SampleVariance16x16_sse2;
+ pfVar = SampleVariance16x16_sse2;
}
#endif
}
--- a/codec/processing/src/adaptivequantization/AdaptiveQuantization.h
+++ b/codec/processing/src/adaptivequantization/AdaptiveQuantization.h
@@ -45,6 +45,7 @@
#include "../common/memory.h"
#include "../common/WelsFrameWork.h"
#include "../../interface/IWelsVP.h"
+#include "cpu.h"
WELSVP_NAMESPACE_BEGIN
@@ -78,6 +79,7 @@
PVarFunc m_pfVar;
int32_t m_CPUFlag;
SAdaptiveQuantizationParam m_sAdaptiveQuantParam;
+ XMMREG_PROTECT_DECLARE(AdaptiveQuantization);
};
WELSVP_NAMESPACE_END
--- a/codec/processing/src/asm/vaa.asm
+++ b/codec/processing/src/asm/vaa.asm
@@ -29,291 +29,211 @@
;* POSSIBILITY OF SUCH DAMAGE.
;*
;*
-;* vaa.asm
+;* vaa.asm
;*
-;* Abstract
+;* Abstract
;* sse2 for pVaa routines
;*
;* History
-;* 04/14/2010 Created
+;* 04/14/2010 Created
+;* 06/07/2010 Added AnalysisVaaInfoIntra_sse2(ssse3)
+;* 06/10/2010 Tune rc_sad_frame_sse2 and got about 40% improvement
+;* 08/11/2010 Added abs_difference_mbrow_sse2 & sum_sqrsum_mbrow_sse2
;*
;*************************************************************************/
%include "asm_inc.asm"
-%ifdef X86_32
+
+
;***********************************************************************
; Macros and other preprocessor constants
;***********************************************************************
+%macro SUM_SQR_SSE2 3 ; dst, pSrc, zero
+ movdqa %1, %2
+ punpcklbw %1, %3
+ punpckhbw %2, %3
+ pmaddwd %1, %1
+ pmaddwd %2, %2
+ paddd %1, %2
+ pshufd %2, %1, 04Eh ; 01001110 B
+ paddd %1, %2
+ pshufd %2, %1, 0B1h ; 10110001 B
+ paddd %1, %2
+%endmacro ; END OF SUM_SQR_SSE2
-;%macro SUM_SSE2 4 ; dst, pSrc, zero, pack1_8x2
-; movdqa %1, %2
-; punpcklbw %1, %3
-; punpckhbw %2, %3
-; paddw %1, %2
-; pmaddwd %1, %4
-; pshufd %2, %1, 04Eh ; 01001110 B
-; paddd %1, %2
-; pshufd %2, %1, 0B1h ; 10110001 B
-; paddd %1, %2
-;%endmacro ; END OF SUM_SSE2
+%macro WELS_SAD_16x2_SSE2 3 ;esi :%1 edi:%2 ebx:%3
+ movdqa xmm1, [%1]
+ movdqa xmm2, [%2]
+ movdqa xmm3, [%1+%3]
+ movdqa xmm4, [%2+%3]
+ psadbw xmm1, xmm2
+ psadbw xmm3, xmm4
+ paddd xmm6, xmm1
+ paddd xmm6, xmm3
+ lea %1, [%1+%3*2]
+ lea %2, [%2+%3*2]
+%endmacro
; by comparing it outperforms than phaddw(SSSE3) sets
-%macro SUM_WORD_8x2_SSE2 2 ; dst(pSrc), tmp
- ; @sum_8x2 begin
- pshufd %2, %1, 04Eh ; 01001110 B
- paddw %1, %2
- pshuflw %2, %1, 04Eh ; 01001110 B
- paddw %1, %2
- pshuflw %2, %1, 0B1h ; 10110001 B
- paddw %1, %2
- ; end of @sum_8x2
-%endmacro ; END of SUM_WORD_8x2_SSE2
+%macro SUM_WORD_8x2_SSE2 2 ; dst(pSrc), tmp
+ ; @sum_8x2 begin
+ pshufd %2, %1, 04Eh ; 01001110 B
+ paddw %1, %2
+ pshuflw %2, %1, 04Eh ; 01001110 B
+ paddw %1, %2
+ pshuflw %2, %1, 0B1h ; 10110001 B
+ paddw %1, %2
+ ; end of @sum_8x2
+%endmacro ; END of SUM_WORD_8x2_SSE2
-%macro SUM_SQR_SSE2 3 ; dst, pSrc, zero
- movdqa %1, %2
- punpcklbw %1, %3
- punpckhbw %2, %3
- pmaddwd %1, %1
- pmaddwd %2, %2
- paddd %1, %2
- pshufd %2, %1, 04Eh ; 01001110 B
- paddd %1, %2
- pshufd %2, %1, 0B1h ; 10110001 B
- paddd %1, %2
-%endmacro ; END OF SUM_SQR_SSE2
+%macro WELS_SAD_SUM_SQSUM_16x1_SSE2 3 ;esi:%1,edi:%2,ebx:%3
+ movdqa xmm1, [%1]
+ movdqa xmm2, [%2]
+ movdqa xmm3, xmm1
+ psadbw xmm3, xmm2
+ paddd xmm6, xmm3
-%macro VAA_AVG_BLOCK_SSE2 6 ; dst, t0, t1, t2, t3, t4
- movdqa %1, [esi ] ; line 0
- movdqa %2, [esi+ecx] ; line 1
- movdqa %3, %1
- punpcklbw %1, xmm7
- punpckhbw %3, xmm7
- movdqa %4, %2
- punpcklbw %4, xmm7
- punpckhbw %2, xmm7
- paddw %1, %4
- paddw %2, %3
- movdqa %3, [esi+ebx] ; line 2
- movdqa %4, [esi+edx] ; line 3
- movdqa %5, %3
- punpcklbw %3, xmm7
- punpckhbw %5, xmm7
- movdqa %6, %4
- punpcklbw %6, xmm7
- punpckhbw %4, xmm7
- paddw %3, %6
- paddw %4, %5
- paddw %1, %3 ; block 0, 1
- paddw %2, %4 ; block 2, 3
- pshufd %3, %1, 0B1h
- pshufd %4, %2, 0B1h
- paddw %1, %3
- paddw %2, %4
- movdqa %3, %1
- movdqa %4, %2
- pshuflw %5, %1, 0B1h
- pshufhw %6, %3, 0B1h
- paddw %1, %5
- paddw %3, %6
- pshuflw %5, %2, 0B1h
- pshufhw %6, %4, 0B1h
- paddw %2, %5
- paddw %4, %6
- punpcklwd %1, %2
- punpckhwd %3, %4
- punpcklwd %1, %3
- psraw %1, $04
-%endmacro
+ movdqa xmm3, xmm1
+ psadbw xmm3, xmm0
+ paddd xmm5, xmm3
-%macro VAA_AVG_BLOCK_SSSE3 6 ; dst, t0, t1, t2, t3, t4
- movdqa %1, [esi ] ; line 0
- movdqa %2, [esi+ecx] ; line 1
- movdqa %3, %1
- punpcklbw %1, xmm7
- punpckhbw %3, xmm7
- movdqa %4, %2
- punpcklbw %4, xmm7
- punpckhbw %2, xmm7
- paddw %1, %4
- paddw %2, %3
- movdqa %3, [esi+ebx] ; line 2
- movdqa %4, [esi+edx] ; line 3
- movdqa %5, %3
- punpcklbw %3, xmm7
- punpckhbw %5, xmm7
- movdqa %6, %4
- punpcklbw %6, xmm7
- punpckhbw %4, xmm7
- paddw %3, %6
- paddw %4, %5
- paddw %1, %3 ; block 0, 1
- paddw %2, %4 ; block 2, 3
- phaddw %1, %2 ; block[0]: 0-15, 16-31; block[1]: 32-47, 48-63; ..
- phaddw %1, xmm7 ; block[0]: 0-15; block[1]: 16-31; block[2]: 32-47; block[3]: 48-63; ....
- psraw %1, $04
-%endmacro
+ movdqa xmm2, xmm1
+ punpcklbw xmm1, xmm0
+ punpckhbw xmm2, xmm0
+ pmaddwd xmm1, xmm1
+ pmaddwd xmm2, xmm2
+ paddd xmm4, xmm1
+ paddd xmm4, xmm2
-%macro WELS_SAD_16x2_SSE2 0
- movdqa xmm1, [esi]
- movdqa xmm2, [edi]
- movdqa xmm3, [esi+ebx]
- movdqa xmm4, [edi+ebx]
- psadbw xmm1, xmm2
- psadbw xmm3, xmm4
- paddd xmm6, xmm1
- paddd xmm6, xmm3
- lea esi, [esi+ebx*2]
- lea edi, [edi+ebx*2]
+ add %1, %3
+ add %2, %3
%endmacro
-%macro WELS_SAD_SUM_SQSUM_16x1_SSE2 0
- movdqa xmm1, [esi]
- movdqa xmm2, [edi]
- movdqa xmm3, xmm1
- psadbw xmm3, xmm2
- paddd xmm6, xmm3
+%macro WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 3 ;esi:%1 edi:%2 ebx:%3
+ movdqa xmm1, [%1]
+ movdqa xmm2, [%2]
+ movdqa xmm3, xmm1
+ psadbw xmm3, xmm2
+ paddd xmm7, xmm3 ; sad
- movdqa xmm3, xmm1
- psadbw xmm3, xmm0
- paddd xmm5, xmm3
+ movdqa xmm3, xmm1
+ pmaxub xmm3, xmm2
+ pminub xmm2, xmm1
+ psubb xmm3, xmm2 ; diff
- movdqa xmm2, xmm1
- punpcklbw xmm1, xmm0
- punpckhbw xmm2, xmm0
- pmaddwd xmm1, xmm1
- pmaddwd xmm2, xmm2
- paddd xmm4, xmm1
- paddd xmm4, xmm2
+ movdqa xmm2, xmm1
+ psadbw xmm2, xmm0
+ paddd xmm6, xmm2 ; sum
- add esi, ebx
- add edi, ebx
-%endmacro
+ movdqa xmm2, xmm1
+ punpcklbw xmm1, xmm0
+ punpckhbw xmm2, xmm0
+ pmaddwd xmm1, xmm1
+ pmaddwd xmm2, xmm2
+ paddd xmm5, xmm1
+ paddd xmm5, xmm2 ; sqsum
-%macro WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 0
- movdqa xmm1, [esi]
- movdqa xmm2, [edi]
- movdqa xmm3, xmm1
- psadbw xmm3, xmm2
- paddd xmm7, xmm3 ; sad
+ movdqa xmm1, xmm3
+ punpcklbw xmm1, xmm0
+ punpckhbw xmm3, xmm0
+ pmaddwd xmm1, xmm1
+ pmaddwd xmm3, xmm3
+ paddd xmm4, xmm1
+ paddd xmm4, xmm3 ; sqdiff
- movdqa xmm3, xmm1
- pmaxub xmm3, xmm2
- pminub xmm2, xmm1
- psubb xmm3, xmm2 ; diff
-
- movdqa xmm2, xmm1
- psadbw xmm2, xmm0
- paddd xmm6, xmm2 ; sum
-
- movdqa xmm2, xmm1
- punpcklbw xmm1, xmm0
- punpckhbw xmm2, xmm0
- pmaddwd xmm1, xmm1
- pmaddwd xmm2, xmm2
- paddd xmm5, xmm1
- paddd xmm5, xmm2 ; sqsum
-
- movdqa xmm1, xmm3
- punpcklbw xmm1, xmm0
- punpckhbw xmm3, xmm0
- pmaddwd xmm1, xmm1
- pmaddwd xmm3, xmm3
- paddd xmm4, xmm1
- paddd xmm4, xmm3 ; sqdiff
-
- add esi, ebx
- add edi, ebx
+ add %1, %3
+ add %2, %3
%endmacro
-%macro WELS_SAD_SD_MAD_16x1_SSE2 4
-%define sad_reg %1
-%define sum_cur_reg %2
-%define sum_ref_reg %3
-%define mad_reg %4
- movdqa xmm1, [esi]
- movdqa xmm2, [edi]
- movdqa xmm3, xmm1
- psadbw xmm3, xmm0
- paddd sum_cur_reg, xmm3 ; sum_cur
- movdqa xmm3, xmm2
- psadbw xmm3, xmm0
- paddd sum_ref_reg, xmm3 ; sum_ref
+%macro WELS_SAD_SD_MAD_16x1_SSE2 7 ;esi:%5 edi:%6 ebx:%7
+%define sad_reg %1
+%define sum_cur_reg %2
+%define sum_ref_reg %3
+%define mad_reg %4
+ movdqa xmm1, [%5]
+ movdqa xmm2, [%6]
+ movdqa xmm3, xmm1
+ psadbw xmm3, xmm0
+ paddd sum_cur_reg, xmm3 ; sum_cur
+ movdqa xmm3, xmm2
+ psadbw xmm3, xmm0
+ paddd sum_ref_reg, xmm3 ; sum_ref
- movdqa xmm3, xmm1
- pmaxub xmm3, xmm2
- pminub xmm2, xmm1
- psubb xmm3, xmm2 ; abs diff
- pmaxub mad_reg, xmm3 ; max abs diff
+ movdqa xmm3, xmm1
+ pmaxub xmm3, xmm2
+ pminub xmm2, xmm1
+ psubb xmm3, xmm2 ; abs diff
+ pmaxub mad_reg, xmm3 ; max abs diff
- psadbw xmm3, xmm0
- paddd sad_reg, xmm3 ; sad
+ psadbw xmm3, xmm0
+ paddd sad_reg, xmm3 ; sad
- add esi, ebx
- add edi, ebx
+ add %5, %7
+ add %6, %7
%endmacro
-%macro WELS_MAX_REG_SSE2 1 ; xmm1, xmm2, xmm3 can be used
+%macro WELS_MAX_REG_SSE2 1 ; xmm1, xmm2, xmm3 can be used
%define max_reg %1
- movdqa xmm1, max_reg
- psrldq xmm1, 4
- pmaxub max_reg, xmm1
- movdqa xmm1, max_reg
- psrldq xmm1, 2
- pmaxub max_reg, xmm1
- movdqa xmm1, max_reg
- psrldq xmm1, 1
- pmaxub max_reg, xmm1
+ movdqa xmm1, max_reg
+ psrldq xmm1, 4
+ pmaxub max_reg, xmm1
+ movdqa xmm1, max_reg
+ psrldq xmm1, 2
+ pmaxub max_reg, xmm1
+ movdqa xmm1, max_reg
+ psrldq xmm1, 1
+ pmaxub max_reg, xmm1
%endmacro
-%macro WELS_SAD_BGD_SQDIFF_16x1_SSE2 4
-%define sad_reg %1
-%define sum_reg %2
-%define mad_reg %3
-%define sqdiff_reg %4
- movdqa xmm1, [esi]
- movdqa xmm2, xmm1
- movdqa xmm3, xmm1
- punpcklbw xmm2, xmm0
- punpckhbw xmm3, xmm0
- pmaddwd xmm2, xmm2
- pmaddwd xmm3, xmm3
- paddd xmm2, xmm3
- movdqa xmm3, xmm2
- psllq xmm2, 32
- psrlq xmm3, 32
- psllq xmm3, 32
- paddd xmm2, xmm3
- paddd sad_reg, xmm2 ; sqsum
+%macro WELS_SAD_BGD_SQDIFF_16x1_SSE2 7 ;esi:%5 edi:%6 ebx:%7
+%define sad_reg %1
+%define sum_reg %2
+%define mad_reg %3
+%define sqdiff_reg %4
+ movdqa xmm1, [%5]
+ movdqa xmm2, xmm1
+ movdqa xmm3, xmm1
+ punpcklbw xmm2, xmm0
+ punpckhbw xmm3, xmm0
+ pmaddwd xmm2, xmm2
+ pmaddwd xmm3, xmm3
+ paddd xmm2, xmm3
+ movdqa xmm3, xmm2
+ psllq xmm2, 32
+ psrlq xmm3, 32
+ psllq xmm3, 32
+ paddd xmm2, xmm3
+ paddd sad_reg, xmm2 ; sqsum
- movdqa xmm2, [edi]
- movdqa xmm3, xmm1
- psadbw xmm3, xmm0
- paddd sum_reg, xmm3 ; sum_cur
- movdqa xmm3, xmm2
- psadbw xmm3, xmm0
- pslldq xmm3, 4
- paddd sum_reg, xmm3 ; sum_ref
+ movdqa xmm2, [%6]
+ movdqa xmm3, xmm1
+ psadbw xmm3, xmm0
+ paddd sum_reg, xmm3 ; sum_cur
+ movdqa xmm3, xmm2
+ psadbw xmm3, xmm0
+ pslldq xmm3, 4
+ paddd sum_reg, xmm3 ; sum_ref
- movdqa xmm3, xmm1
- pmaxub xmm3, xmm2
- pminub xmm2, xmm1
- psubb xmm3, xmm2 ; abs diff
- pmaxub mad_reg, xmm3 ; max abs diff
+ movdqa xmm3, xmm1
+ pmaxub xmm3, xmm2
+ pminub xmm2, xmm1
+ psubb xmm3, xmm2 ; abs diff
+ pmaxub mad_reg, xmm3 ; max abs diff
- movdqa xmm1, xmm3
- psadbw xmm3, xmm0
- paddd sad_reg, xmm3 ; sad
+ movdqa xmm1, xmm3
+ psadbw xmm3, xmm0
+ paddd sad_reg, xmm3 ; sad
- movdqa xmm3, xmm1
- punpcklbw xmm1, xmm0
- punpckhbw xmm3, xmm0
- pmaddwd xmm1, xmm1
- pmaddwd xmm3, xmm3
- paddd sqdiff_reg, xmm1
- paddd sqdiff_reg, xmm3 ; sqdiff
+ movdqa xmm3, xmm1
+ punpcklbw xmm1, xmm0
+ punpckhbw xmm3, xmm0
+ pmaddwd xmm1, xmm1
+ pmaddwd xmm3, xmm3
+ paddd sqdiff_reg, xmm1
+ paddd sqdiff_reg, xmm3 ; sqdiff
- add esi, ebx
- add edi, ebx
+ add %5, %7
+ add %6, %7
%endmacro
@@ -325,7 +245,7 @@
;ALIGN 16
;pack1_8x2:
-; dw 1, 1, 1, 1, 1, 1, 1, 1
+; dw 1, 1, 1, 1, 1, 1, 1, 1
;***********************************************************************
; Code
@@ -333,1082 +253,1805 @@
SECTION .text
-WELS_EXTERN rc_sad_frame_sse2
-;***********************************************************************
-; uint32_t rc_sad_frame_sse2( uint8_t *ref_orig, uint8_t *cur_orig, const int mb_width, const int iPicHeight, const int iPicStride );
-;***********************************************************************
-ALIGN 16
-rc_sad_frame_sse2:
- push esi
- push edi
- push ebp
- push ebx
- push edx
+%ifdef X86_32
- mov esi, [esp+24]
- mov edi, [esp+28]
- mov ebx, [esp+32]
- mov ecx, [esp+36]
- mov edx, [esp+40]
- pxor xmm0, xmm0
-.hloop:
- mov eax, ebx
- mov ebp, $00
-.wloop:
- movdqa xmm1, [esi+ebp]
- movdqa xmm2, [edi+ebp]
- psadbw xmm1, xmm2
- pshufd xmm2, xmm1, 0f6h ; 11110110 B ; movhlps for float
- paddd xmm1, xmm2
- paddd xmm0, xmm1
- add ebp, 010h
- dec eax
- jnz near .wloop
- lea esi, [esi+edx]
- lea edi, [edi+edx]
- dec ecx
- jnz near .hloop
-
- movd eax, xmm0
- pop edx
- pop ebx
- pop ebp
- pop edi
- pop esi
- ret
-
-
WELS_EXTERN SampleVariance16x16_sse2
;***********************************************************************
-; void SampleVariance16x16_sse2( uint8_t * y_ref, int32_t y_ref_stride, uint8_t * y_src, int32_t y_src_stride,SMotionTextureUnit* pMotionTexture );
+; void SampleVariance16x16_sse2( uint8_t * y_ref, int32_t y_ref_stride, uint8_t * y_src, int32_t y_src_stride,SMotionTextureUnit* pMotionTexture );
;***********************************************************************
ALIGN 16
SampleVariance16x16_sse2:
- push esi
- push edi
- push ebx
+ push esi
+ push edi
+ push ebx
- sub esp, 16
- %define SUM [esp]
- %define SUM_CUR [esp+4]
- %define SQR [esp+8]
- %define SQR_CUR [esp+12]
- %define PUSH_SIZE 28 ; 12 + 16
+ sub esp, 16
+ %define SUM [esp]
+ %define SUM_CUR [esp+4]
+ %define SQR [esp+8]
+ %define SQR_CUR [esp+12]
+ %define PUSH_SIZE 28 ; 12 + 16
- mov edi, [esp+PUSH_SIZE+4] ; y_ref
- mov edx, [esp+PUSH_SIZE+8] ; y_ref_stride
- mov esi, [esp+PUSH_SIZE+12] ; y_src
- mov eax, [esp+PUSH_SIZE+16] ; y_src_stride
- mov ecx, 010h ; height = 16
+ mov edi, [esp+PUSH_SIZE+4] ; y_ref
+ mov edx, [esp+PUSH_SIZE+8] ; y_ref_stride
+ mov esi, [esp+PUSH_SIZE+12] ; y_src
+ mov eax, [esp+PUSH_SIZE+16] ; y_src_stride
+ mov ecx, 010h ; height = 16
- pxor xmm7, xmm7
- movdqu SUM, xmm7
+ pxor xmm7, xmm7
+ movdqu SUM, xmm7
.hloops:
- movdqa xmm0, [edi] ; y_ref
- movdqa xmm1, [esi] ; y_src
- movdqa xmm2, xmm0 ; store first for future process
- movdqa xmm3, xmm1
- ; sum += diff;
- movdqa xmm4, xmm0
- psadbw xmm4, xmm1 ; 2 parts, [0,..,15], [64,..,79]
- ; to be continued for sum
- pshufd xmm5, xmm4, 0C6h ; 11000110 B
- paddw xmm4, xmm5
- movd ebx, xmm4
- add SUM, ebx
+ movdqa xmm0, [edi] ; y_ref
+ movdqa xmm1, [esi] ; y_src
+ movdqa xmm2, xmm0 ; store first for future process
+ movdqa xmm3, xmm1
+ ; sum += diff;
+ movdqa xmm4, xmm0
+ psadbw xmm4, xmm1 ; 2 parts, [0,..,15], [64,..,79]
+ ; to be continued for sum
+ pshufd xmm5, xmm4, 0C6h ; 11000110 B
+ paddw xmm4, xmm5
+ movd ebx, xmm4
+ add SUM, ebx
- ; sqr += diff * diff;
- pmaxub xmm0, xmm1
- pminub xmm1, xmm2
- psubb xmm0, xmm1 ; diff
- SUM_SQR_SSE2 xmm1, xmm0, xmm7 ; dst, pSrc, zero
- movd ebx, xmm1
- add SQR, ebx
+ ; sqr += diff * diff;
+ pmaxub xmm0, xmm1
+ pminub xmm1, xmm2
+ psubb xmm0, xmm1 ; diff
+ SUM_SQR_SSE2 xmm1, xmm0, xmm7 ; dst, pSrc, zero
+ movd ebx, xmm1
+ add SQR, ebx
- ; sum_cur += y_src[x];
- movdqa xmm0, xmm3 ; cur_orig
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm7
- punpckhbw xmm1, xmm7
- paddw xmm0, xmm1 ; 8x2
- SUM_WORD_8x2_SSE2 xmm0, xmm1
- movd ebx, xmm0
- and ebx, 0ffffh
- add SUM_CUR, ebx
+ ; sum_cur += y_src[x];
+ movdqa xmm0, xmm3 ; cur_orig
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm7
+ punpckhbw xmm1, xmm7
+ paddw xmm0, xmm1 ; 8x2
+ SUM_WORD_8x2_SSE2 xmm0, xmm1
+ movd ebx, xmm0
+ and ebx, 0ffffh
+ add SUM_CUR, ebx
- ; sqr_cur += y_src[x] * y_src[x];
- SUM_SQR_SSE2 xmm0, xmm3, xmm7 ; dst, pSrc, zero
- movd ebx, xmm0
- add SQR_CUR, ebx
+ ; sqr_cur += y_src[x] * y_src[x];
+ SUM_SQR_SSE2 xmm0, xmm3, xmm7 ; dst, pSrc, zero
+ movd ebx, xmm0
+ add SQR_CUR, ebx
- lea edi, [edi+edx]
- lea esi, [esi+eax]
- dec ecx
- jnz near .hloops
+ lea edi, [edi+edx]
+ lea esi, [esi+eax]
+ dec ecx
+ jnz near .hloops
- mov ebx, 0
- mov bx, word SUM
- sar ebx, 8
- imul ebx, ebx
- mov ecx, SQR
- sar ecx, 8
- sub ecx, ebx
- mov edi, [esp+PUSH_SIZE+20] ; pMotionTexture
- mov [edi], cx ; to store uiMotionIndex
- mov ebx, 0
- mov bx, word SUM_CUR
- sar ebx, 8
- imul ebx, ebx
- mov ecx, SQR_CUR
- sar ecx, 8
- sub ecx, ebx
- mov [edi+2], cx ; to store uiTextureIndex
+ mov ebx, 0
+ mov bx, word SUM
+ sar ebx, 8
+ imul ebx, ebx
+ mov ecx, SQR
+ sar ecx, 8
+ sub ecx, ebx
+ mov edi, [esp+PUSH_SIZE+20] ; pMotionTexture
+ mov [edi], cx ; to store uiMotionIndex
+ mov ebx, 0
+ mov bx, word SUM_CUR
+ sar ebx, 8
+ imul ebx, ebx
+ mov ecx, SQR_CUR
+ sar ecx, 8
+ sub ecx, ebx
+ mov [edi+2], cx ; to store uiTextureIndex
- %undef SUM
- %undef SUM_CUR
- %undef SQR
- %undef SQR_CUR
- %undef PUSH_SIZE
+ %undef SUM
+ %undef SUM_CUR
+ %undef SQR
+ %undef SQR_CUR
+ %undef PUSH_SIZE
- add esp, 16
- pop ebx
- pop edi
- pop esi
+ add esp, 16
+ pop ebx
+ pop edi
+ pop esi
- ret
+ ret
-; , 6/7/2010
-WELS_EXTERN abs_difference_mbrow_sse2
+WELS_EXTERN VAACalcSad_sse2
;*************************************************************************************************************
-;void abs_difference_mbrow_sse2( uint8_t *ref_orig, uint8_t *cur_orig, int32_t iPicStride,
-; int32_t gom_pixel_num, int32_t *pSum)
+;void VAACalcSad_sse2( uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight
+; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8)
;*************************************************************************************************************
+
+
ALIGN 16
-abs_difference_mbrow_sse2:
-%define ref_orig esp + pushsize + 4
-%define cur_orig esp + pushsize + 8
-%define iPicStride esp + pushsize + 12
-%define gom_pixel_num esp + pushsize + 16
-%define pSum esp + pushsize + 20
-%define pushsize 12
- push esi
- push edi
- push ebx
- mov esi, [ref_orig]
- mov edi, [cur_orig]
- mov ebx, [iPicStride]
- mov eax, [gom_pixel_num]
- mov ecx, 16 ;MB_WIDTH_LUMA
- pxor xmm0, xmm0
-mb_width_loop_p:
- mov edx, esi
- add edx, eax ; end address
-gom_row_loop_p:
- movdqa xmm1, [esi]
- movdqa xmm2, [edi]
- psadbw xmm1, xmm2
- paddd xmm0, xmm1
- add esi, 16
- add edi, 16
- cmp esi, edx
- jl gom_row_loop_p
+VAACalcSad_sse2:
+%define cur_data esp + pushsize + 4
+%define ref_data esp + pushsize + 8
+%define iPicWidth esp + pushsize + 12
+%define iPicHeight esp + pushsize + 16
+%define iPicStride esp + pushsize + 20
+%define psadframe esp + pushsize + 24
+%define psad8x8 esp + pushsize + 28
+%define pushsize 12
+ push esi
+ push edi
+ push ebx
+ mov esi, [cur_data]
+ mov edi, [ref_data]
+ mov ebx, [iPicStride]
+ mov edx, [psad8x8]
+ mov eax, ebx
- sub esi, eax
- sub edi, eax
- add esi, ebx
- add edi, ebx
- loop mb_width_loop_p
+ shr dword [iPicWidth], 4 ; iPicWidth/16
+ shr dword [iPicHeight], 4 ; iPicHeight/16
+ shl eax, 4 ; iPicStride*16
+ pxor xmm0, xmm0
+ pxor xmm7, xmm7 ; iFrameSad
+height_loop:
+ mov ecx, dword [iPicWidth]
+ push esi
+ push edi
+width_loop:
+ pxor xmm6, xmm6 ;
+ WELS_SAD_16x2_SSE2 esi,edi,ebx
+ WELS_SAD_16x2_SSE2 esi,edi,ebx
+ WELS_SAD_16x2_SSE2 esi,edi,ebx
+ WELS_SAD_16x2_SSE2 esi,edi,ebx
+ paddd xmm7, xmm6
+ movd [edx], xmm6
+ psrldq xmm6, 8
+ movd [edx+4], xmm6
- movdqa xmm1, xmm0
- psrldq xmm1, 8
- paddd xmm1, xmm0
- movd eax, xmm1
- mov edx, [pSum] ; pSum
- add [edx], eax
+ pxor xmm6, xmm6
+ WELS_SAD_16x2_SSE2 esi,edi,ebx
+ WELS_SAD_16x2_SSE2 esi,edi,ebx
+ WELS_SAD_16x2_SSE2 esi,edi,ebx
+ WELS_SAD_16x2_SSE2 esi,edi,ebx
+ paddd xmm7, xmm6
+ movd [edx+8], xmm6
+ psrldq xmm6, 8
+ movd [edx+12], xmm6
-%undef ref_orig
-%undef cur_orig
-%undef iPicStride
-%undef gom_pixel_num
-%undef pSum
-%undef pushsize
- pop ebx
- pop edi
- pop esi
- ret
+ add edx, 16
+ sub esi, eax
+ sub edi, eax
+ add esi, 16
+ add edi, 16
+ dec ecx
+ jnz width_loop
+ pop edi
+ pop esi
+ add esi, eax
+ add edi, eax
+ dec dword [iPicHeight]
+ jnz height_loop
-WELS_EXTERN sum_sqrsum_mbrow_sse2
-;*************************************************************************************************************
-;void sum_sqrsum_mbrow_sse2( uint8_t *cur_orig, int32_t iPicStride,
-; int32_t gom_pixel_num, int32_t *pSum, int32_t *pSqrSum)
-;*************************************************************************************************************
+ mov edx, [psadframe]
+ movdqa xmm5, xmm7
+ psrldq xmm7, 8
+ paddd xmm7, xmm5
+ movd [edx], xmm7
+
+%undef cur_data
+%undef ref_data
+%undef iPicWidth
+%undef iPicHeight
+%undef iPicStride
+%undef psadframe
+%undef psad8x8
+%undef pushsize
+ pop ebx
+ pop edi
+ pop esi
+ ret
+
+%else ;64-bit
+
+WELS_EXTERN SampleVariance16x16_sse2
+;***********************************************************************
+; void SampleVariance16x16_sse2( uint8_t * y_ref, int32_t y_ref_stride, uint8_t * y_src, int32_t y_src_stride,SMotionTextureUnit* pMotionTexture );
+;***********************************************************************
ALIGN 16
-sum_sqrsum_mbrow_sse2:
-%define cur_orig esp + pushsize + 4
-%define iPicStride esp + pushsize + 8
-%define gom_pixel_num esp + pushsize + 12
-%define pSum esp + pushsize + 16
-%define pSqrSum esp + pushsize + 20
-%define pushsize 8
- push esi
- push ebx
- mov esi, [cur_orig]
- mov eax, [gom_pixel_num]
- mov ebx, [iPicStride]
- mov ecx, 16 ;MB_WIDTH_LUMA
- pxor xmm0, xmm0 ; zero
- pxor xmm1, xmm1 ; sum
- pxor xmm2, xmm2 ; sqr sum
-mb_width_loop_i:
- mov edx, esi
- add edx, eax ; end address
-gom_row_loop_i:
- movdqa xmm3, [esi]
- movdqa xmm4, xmm3
- psadbw xmm4, xmm0
- paddd xmm1, xmm4
- movdqa xmm4, xmm3
- punpcklbw xmm4, xmm0
- punpckhbw xmm3, xmm0
- pmaddwd xmm4, xmm4
- pmaddwd xmm3, xmm3
- paddd xmm2, xmm3
- paddd xmm2, xmm4
- add esi, 16
- cmp esi, edx
- jl gom_row_loop_i
+SampleVariance16x16_sse2:
+ %define SUM r10;[esp]
+ %define SUM_CUR r11;[esp+4]
+ %define SQR r13;[esp+8]
+ %define SQR_CUR r15;[esp+12]
- sub esi, eax
- add esi, ebx
- loop mb_width_loop_i
+ push r12
+ push r13
+ push r14
+ push r15
+ %assign push_num 4
+ LOAD_5_PARA
+ SIGN_EXTENTION r1,r1d
+ SIGN_EXTENTION r3,r3d
- movdqa xmm3, xmm1
- psrldq xmm3, 8
- paddd xmm1, xmm3
- movd eax, xmm1
- mov edx, [pSum]
- add [edx], eax
+ mov r12,010h
+ pxor xmm7, xmm7
+ movq SUM, xmm7
+ movq SUM_CUR,xmm7
+ movq SQR,xmm7
+ movq SQR_CUR,xmm7
- movdqa xmm3, xmm2
- psrldq xmm3, 8
- paddd xmm2, xmm3
- movdqa xmm3, xmm2
- psrldq xmm3, 4
- paddd xmm2, xmm3
- movd eax, xmm2
- mov edx, [pSqrSum]
- add [edx], eax
+.hloops:
+ mov r14,0
+ movdqa xmm0, [r0] ; y_ref
+ movdqa xmm1, [r2] ; y_src
+ movdqa xmm2, xmm0 ; store first for future process
+ movdqa xmm3, xmm1
+ ; sum += diff;
+ movdqa xmm4, xmm0
+ psadbw xmm4, xmm1 ; 2 parts, [0,..,15], [64,..,79]
+ ; to be continued for sum
+ pshufd xmm5, xmm4, 0C6h ; 11000110 B
+ paddw xmm4, xmm5
+ movd r14d, xmm4
+ add SUM, r14
+ ; sqr += diff * diff;
+ pmaxub xmm0, xmm1
+ pminub xmm1, xmm2
+ psubb xmm0, xmm1 ; diff
+ SUM_SQR_SSE2 xmm1, xmm0, xmm7 ; dst, pSrc, zero
+ movd r14d, xmm1
+ add SQR, r14
-%undef cur_orig
-%undef iPicStride
-%undef gom_pixel_num
-%undef pSum
-%undef pSqrSum
-%undef pushsize
- pop ebx
- pop esi
- ret
+ ; sum_cur += y_src[x];
+ movdqa xmm0, xmm3 ; cur_orig
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm7
+ punpckhbw xmm1, xmm7
+ paddw xmm0, xmm1 ; 8x2
+ SUM_WORD_8x2_SSE2 xmm0, xmm1
+ movd r14d, xmm0
+ and r14, 0ffffh
+ add SUM_CUR, r14
+ ; sqr_cur += y_src[x] * y_src[x];
+ SUM_SQR_SSE2 xmm0, xmm3, xmm7 ; dst, pSrc, zero
+ movd r14d, xmm0
+ add SQR_CUR, r14
+ lea r0, [r0+r1]
+ lea r2, [r2+r3]
+ dec r12
+ jnz near .hloops
+ mov r0, SUM
+ sar r0, 8
+ imul r0, r0
+ mov r1, SQR
+ sar r1, 8
+ sub r1, r0
+ mov [r4], r1w ; to store uiMotionIndex
+ mov r0, SUM_CUR
+ sar r0, 8
+ imul r0, r0
+ mov r1, SQR_CUR
+ sar r1, 8
+ sub r1, r0
+ mov [r4+2], r1w ; to store uiTextureIndex
+
+ LOAD_5_PARA_POP
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+
+
+ %assign push_num 0
+
+ ret
+
+
WELS_EXTERN VAACalcSad_sse2
;*************************************************************************************************************
;void VAACalcSad_sse2( uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight
-; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8)
+; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8)
;*************************************************************************************************************
ALIGN 16
VAACalcSad_sse2:
-%define cur_data esp + pushsize + 4
-%define ref_data esp + pushsize + 8
-%define iPicWidth esp + pushsize + 12
-%define iPicHeight esp + pushsize + 16
-%define iPicStride esp + pushsize + 20
-%define psadframe esp + pushsize + 24
-%define psad8x8 esp + pushsize + 28
-%define pushsize 12
- push esi
- push edi
- push ebx
- mov esi, [cur_data]
- mov edi, [ref_data]
- mov ebx, [iPicStride]
- mov edx, [psad8x8]
- mov eax, ebx
+%define cur_data r0
+%define ref_data r1
+%define iPicWidth r2
+%define iPicHeight r3
+%define iPicStride r4
+%define psadframe r5
+%define psad8x8 r6
- shr dword [iPicWidth], 4 ; iPicWidth/16
- shr dword [iPicHeight], 4 ; iPicHeight/16
- shl eax, 4 ; iPicStride*16
- pxor xmm0, xmm0
- pxor xmm7, xmm7 ; iFrameSad
+ push r12
+ push r13
+ %assign push_num 2
+ LOAD_7_PARA
+ SIGN_EXTENTION r2,r2d
+ SIGN_EXTENTION r3,r3d
+ SIGN_EXTENTION r4,r4d
+
+ mov r12,r4
+ shr r2, 4 ; iPicWidth/16
+ shr r3, 4 ; iPicHeight/16
+
+ shl r12, 4 ; iPicStride*16
+ pxor xmm0, xmm0
+ pxor xmm7, xmm7 ; iFrameSad
height_loop:
- mov ecx, dword [iPicWidth]
- push esi
- push edi
+ mov r13, r2
+ push r0
+ push r1
width_loop:
- pxor xmm6, xmm6 ;
- WELS_SAD_16x2_SSE2
- WELS_SAD_16x2_SSE2
- WELS_SAD_16x2_SSE2
- WELS_SAD_16x2_SSE2
- paddd xmm7, xmm6
- movd [edx], xmm6
- psrldq xmm6, 8
- movd [edx+4], xmm6
+ pxor xmm6, xmm6
+ WELS_SAD_16x2_SSE2 r0,r1,r4
+ WELS_SAD_16x2_SSE2 r0,r1,r4
+ WELS_SAD_16x2_SSE2 r0,r1,r4
+ WELS_SAD_16x2_SSE2 r0,r1,r4
+ paddd xmm7, xmm6
+ movd [r6], xmm6
+ psrldq xmm6, 8
+ movd [r6+4], xmm6
- pxor xmm6, xmm6
- WELS_SAD_16x2_SSE2
- WELS_SAD_16x2_SSE2
- WELS_SAD_16x2_SSE2
- WELS_SAD_16x2_SSE2
- paddd xmm7, xmm6
- movd [edx+8], xmm6
- psrldq xmm6, 8
- movd [edx+12], xmm6
+ pxor xmm6, xmm6
+ WELS_SAD_16x2_SSE2 r0,r1,r4
+ WELS_SAD_16x2_SSE2 r0,r1,r4
+ WELS_SAD_16x2_SSE2 r0,r1,r4
+ WELS_SAD_16x2_SSE2 r0,r1,r4
+ paddd xmm7, xmm6
+ movd [r6+8], xmm6
+ psrldq xmm6, 8
+ movd [r6+12], xmm6
- add edx, 16
- sub esi, eax
- sub edi, eax
- add esi, 16
- add edi, 16
+ add r6, 16
+ sub r0, r12
+ sub r1, r12
+ add r0, 16
+ add r1, 16
- dec ecx
- jnz width_loop
+ dec r13
+ jnz width_loop
- pop edi
- pop esi
- add esi, eax
- add edi, eax
+ pop r1
+ pop r0
+ add r0, r12
+ add r1, r12
- dec dword [iPicHeight]
- jnz height_loop
+ dec r3
+ jnz height_loop
- mov edx, [psadframe]
- movdqa xmm5, xmm7
- psrldq xmm7, 8
- paddd xmm7, xmm5
- movd [edx], xmm7
+ ;mov r13, [psadframe]
+ movdqa xmm5, xmm7
+ psrldq xmm7, 8
+ paddd xmm7, xmm5
+ movd [psadframe], xmm7
-%undef cur_data
-%undef ref_data
-%undef iPicWidth
-%undef iPicHeight
-%undef iPicStride
-%undef psadframe
-%undef psad8x8
-%undef pushsize
- pop ebx
- pop edi
- pop esi
- ret
+%undef cur_data
+%undef ref_data
+%undef iPicWidth
+%undef iPicHeight
+%undef iPicStride
+%undef psadframe
+%undef psad8x8
+%undef pushsize
+ LOAD_7_PARA_POP
+ pop r13
+ pop r12
+ %assign push_num 0
+ ret
+%endif
+
+%ifdef X86_32
WELS_EXTERN VAACalcSadVar_sse2
;*************************************************************************************************************
;void VAACalcSadVar_sse2( uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight
-; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16)
+; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16)
;*************************************************************************************************************
ALIGN 16
VAACalcSadVar_sse2:
-%define localsize 8
-%define cur_data esp + pushsize + localsize + 4
-%define ref_data esp + pushsize + localsize + 8
-%define iPicWidth esp + pushsize + localsize + 12
-%define iPicHeight esp + pushsize + localsize + 16
-%define iPicStride esp + pushsize + localsize + 20
-%define psadframe esp + pushsize + localsize + 24
-%define psad8x8 esp + pushsize + localsize + 28
-%define psum16x16 esp + pushsize + localsize + 32
-%define psqsum16x16 esp + pushsize + localsize + 36
-%define tmp_esi esp + 0
-%define tmp_edi esp + 4
-%define pushsize 16
- push ebp
- push esi
- push edi
- push ebx
- sub esp, localsize
- mov esi, [cur_data]
- mov edi, [ref_data]
- mov ebx, [iPicStride]
- mov edx, [psad8x8]
- mov eax, ebx
+%define localsize 8
+%define cur_data esp + pushsize + localsize + 4
+%define ref_data esp + pushsize + localsize + 8
+%define iPicWidth esp + pushsize + localsize + 12
+%define iPicHeight esp + pushsize + localsize + 16
+%define iPicStride esp + pushsize + localsize + 20
+%define psadframe esp + pushsize + localsize + 24
+%define psad8x8 esp + pushsize + localsize + 28
+%define psum16x16 esp + pushsize + localsize + 32
+%define psqsum16x16 esp + pushsize + localsize + 36
+%define tmp_esi esp + 0
+%define tmp_edi esp + 4
+%define pushsize 16
+ push ebp
+ push esi
+ push edi
+ push ebx
+ sub esp, localsize
+ mov esi, [cur_data]
+ mov edi, [ref_data]
+ mov ebx, [iPicStride]
+ mov edx, [psad8x8]
+ mov eax, ebx
- shr dword [iPicWidth], 4 ; iPicWidth/16
- shr dword [iPicHeight], 4 ; iPicHeight/16
- shl eax, 4 ; iPicStride*16
- pxor xmm0, xmm0
- pxor xmm7, xmm7 ; iFrameSad
+ shr dword [iPicWidth], 4 ; iPicWidth/16
+ shr dword [iPicHeight], 4 ; iPicHeight/16
+ shl eax, 4 ; iPicStride*16
+ pxor xmm0, xmm0
+ pxor xmm7, xmm7 ; iFrameSad
var_height_loop:
- mov ecx, dword [iPicWidth]
- mov [tmp_esi], esi
- mov [tmp_edi], edi
+ mov ecx, dword [iPicWidth]
+ mov [tmp_esi], esi
+ mov [tmp_edi], edi
var_width_loop:
- pxor xmm6, xmm6 ; hiQuad_loQuad pSad8x8
- pxor xmm5, xmm5 ; pSum16x16
- pxor xmm4, xmm4 ; sqsum_16x16
- WELS_SAD_SUM_SQSUM_16x1_SSE2
- WELS_SAD_SUM_SQSUM_16x1_SSE2
- WELS_SAD_SUM_SQSUM_16x1_SSE2
- WELS_SAD_SUM_SQSUM_16x1_SSE2
- WELS_SAD_SUM_SQSUM_16x1_SSE2
- WELS_SAD_SUM_SQSUM_16x1_SSE2
- WELS_SAD_SUM_SQSUM_16x1_SSE2
- WELS_SAD_SUM_SQSUM_16x1_SSE2
- paddd xmm7, xmm6
- movd [edx], xmm6
- psrldq xmm6, 8
- movd [edx+4], xmm6
+ pxor xmm6, xmm6 ; hiQuad_loQuad pSad8x8
+ pxor xmm5, xmm5 ; pSum16x16
+ pxor xmm4, xmm4 ; sqsum_16x16
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+ paddd xmm7, xmm6
+ movd [edx], xmm6
+ psrldq xmm6, 8
+ movd [edx+4], xmm6
- pxor xmm6, xmm6
- WELS_SAD_SUM_SQSUM_16x1_SSE2
- WELS_SAD_SUM_SQSUM_16x1_SSE2
- WELS_SAD_SUM_SQSUM_16x1_SSE2
- WELS_SAD_SUM_SQSUM_16x1_SSE2
- WELS_SAD_SUM_SQSUM_16x1_SSE2
- WELS_SAD_SUM_SQSUM_16x1_SSE2
- WELS_SAD_SUM_SQSUM_16x1_SSE2
- WELS_SAD_SUM_SQSUM_16x1_SSE2
- paddd xmm7, xmm6
- movd [edx+8], xmm6
- psrldq xmm6, 8
- movd [edx+12], xmm6
+ pxor xmm6, xmm6
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 esi,edi,ebx
+ paddd xmm7, xmm6
+ movd [edx+8], xmm6
+ psrldq xmm6, 8
+ movd [edx+12], xmm6
- mov ebp, [psum16x16]
- movdqa xmm1, xmm5
- psrldq xmm1, 8
- paddd xmm5, xmm1
- movd [ebp], xmm5
- add dword [psum16x16], 4
+ mov ebp, [psum16x16]
+ movdqa xmm1, xmm5
+ psrldq xmm1, 8
+ paddd xmm5, xmm1
+ movd [ebp], xmm5
+ add dword [psum16x16], 4
- movdqa xmm5, xmm4
- psrldq xmm5, 8
- paddd xmm4, xmm5
- movdqa xmm3, xmm4
- psrldq xmm3, 4
- paddd xmm4, xmm3
+ movdqa xmm5, xmm4
+ psrldq xmm5, 8
+ paddd xmm4, xmm5
+ movdqa xmm3, xmm4
+ psrldq xmm3, 4
+ paddd xmm4, xmm3
+
+ mov ebp, [psqsum16x16]
+ movd [ebp], xmm4
+ add dword [psqsum16x16], 4
+
+ add edx, 16
+ sub esi, eax
+ sub edi, eax
+ add esi, 16
+ add edi, 16
+
+ dec ecx
+ jnz var_width_loop
+
+ mov esi, [tmp_esi]
+ mov edi, [tmp_edi]
+ add esi, eax
+ add edi, eax
+
+ dec dword [iPicHeight]
+ jnz var_height_loop
+
+ mov edx, [psadframe]
+ movdqa xmm5, xmm7
+ psrldq xmm7, 8
+ paddd xmm7, xmm5
+ movd [edx], xmm7
+
+ add esp, localsize
+ pop ebx
+ pop edi
+ pop esi
+ pop ebp
+%undef cur_data
+%undef ref_data
+%undef iPicWidth
+%undef iPicHeight
+%undef iPicStride
+%undef psadframe
+%undef psad8x8
+%undef psum16x16
+%undef psqsum16x16
+%undef tmp_esi
+%undef tmp_edi
+%undef pushsize
+%undef localsize
+ ret
+
+%else ;64-bit
+
+WELS_EXTERN VAACalcSadVar_sse2
+;*************************************************************************************************************
+;void VAACalcSadVar_sse2( uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight
+; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16)
+;*************************************************************************************************************
+
+
+ALIGN 16
+VAACalcSadVar_sse2:
+%define cur_data arg1 ;r0
+%define ref_data arg2 ;r1
+%define iPicWidth arg3 ;r2
+%define iPicHeight arg4 ;r3
+%define iPicStride arg5
+%define psadframe arg6
+%define psad8x8 arg7
+%define psum16x16 arg8
+%define psqsum16x16 arg9
+
+ push r12
+ push r13
+ push r14
+ push r15
+ %assign push_num 4
+
+%ifdef WIN64
+ mov r4, arg5 ;iPicStride
+ mov r5, arg6 ;psad8x8
+%endif
+ mov r14,arg7
+ SIGN_EXTENTION r2,r2d
+ SIGN_EXTENTION r3,r3d
+ SIGN_EXTENTION r4,r4d
+
+ mov r13,r4
+ shr r2,4
+ shr r3,4
+
+ shl r13,4 ; iPicStride*16
+ pxor xmm0, xmm0
+ pxor xmm7, xmm7 ; iFrameSad
+var_height_loop:
+ push r2
+ %assign push_num push_num+1
+ mov r11, r0
+ mov r12, r1
+var_width_loop:
+ pxor xmm6, xmm6 ; hiQuad_loQuad pSad8x8
+ pxor xmm5, xmm5 ; pSum16x16
+ pxor xmm4, xmm4 ; sqsum_16x16
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+ paddd xmm7, xmm6
+ movd [r14], xmm6
+ psrldq xmm6, 8
+ movd [r14+4], xmm6
+
+ pxor xmm6, xmm6
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_16x1_SSE2 r0,r1,r4
+ paddd xmm7, xmm6
+ movd [r14+8], xmm6
+ psrldq xmm6, 8
+ movd [r14+12], xmm6
+
+ mov r15, psum16x16
+ movdqa xmm1, xmm5
+ psrldq xmm1, 8
+ paddd xmm5, xmm1
+ movd [r15], xmm5
+ add dword psum16x16, 4
+
+ movdqa xmm5, xmm4
+ psrldq xmm5, 8
+ paddd xmm4, xmm5
+ movdqa xmm3, xmm4
+ psrldq xmm3, 4
+ paddd xmm4, xmm3
+
+ mov r15, psqsum16x16
+ movd [r15], xmm4
+ add dword psqsum16x16, 4
+
+ add r14,16
+ sub r0, r13
+ sub r1, r13
+ add r0, 16
+ add r1, 16
+
+ dec r2
+ jnz var_width_loop
+
+ pop r2
+ %assign push_num push_num-1
+ mov r0, r11
+ mov r1, r12
+ add r0, r13
+ add r1, r13
+ dec r3
+ jnz var_height_loop
+
+ mov r15, psadframe
+ movdqa xmm5, xmm7
+ psrldq xmm7, 8
+ paddd xmm7, xmm5
+ movd [r15], xmm7
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+%assign push_num 0
+%undef cur_data
+%undef ref_data
+%undef iPicWidth
+%undef iPicHeight
+%undef iPicStride
+%undef psadframe
+%undef psad8x8
+%undef psum16x16
+%undef psqsum16x16
+%undef tmp_esi
+%undef tmp_edi
+%undef pushsize
+%undef localsize
+ ret
- mov ebp, [psqsum16x16]
- movd [ebp], xmm4
- add dword [psqsum16x16], 4
+%endif
- add edx, 16
- sub esi, eax
- sub edi, eax
- add esi, 16
- add edi, 16
+%ifdef X86_32
- dec ecx
- jnz var_width_loop
+WELS_EXTERN VAACalcSadSsd_sse2
+;*************************************************************************************************************
+;void VAACalcSadSsd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+; int32_t iPicStride,int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, int32_t *psqdiff16x16)
+;*************************************************************************************************************
- mov esi, [tmp_esi]
- mov edi, [tmp_edi]
- add esi, eax
- add edi, eax
- dec dword [iPicHeight]
- jnz var_height_loop
+ALIGN 16
+VAACalcSadSsd_sse2:
+%define localsize 12
+%define cur_data esp + pushsize + localsize + 4
+%define ref_data esp + pushsize + localsize + 8
+%define iPicWidth esp + pushsize + localsize + 12
+%define iPicHeight esp + pushsize + localsize + 16
+%define iPicStride esp + pushsize + localsize + 20
+%define psadframe esp + pushsize + localsize + 24
+%define psad8x8 esp + pushsize + localsize + 28
+%define psum16x16 esp + pushsize + localsize + 32
+%define psqsum16x16 esp + pushsize + localsize + 36
+%define psqdiff16x16 esp + pushsize + localsize + 40
+%define tmp_esi esp + 0
+%define tmp_edi esp + 4
+%define tmp_sadframe esp + 8
+%define pushsize 16
+ push ebp
+ push esi
+ push edi
+ push ebx
+ sub esp, localsize
- mov edx, [psadframe]
- movdqa xmm5, xmm7
- psrldq xmm7, 8
- paddd xmm7, xmm5
- movd [edx], xmm7
+ mov ecx, [iPicWidth]
+ mov ecx, [iPicHeight]
+ mov esi, [cur_data]
+ mov edi, [ref_data]
+ mov ebx, [iPicStride]
+ mov edx, [psad8x8]
+ mov eax, ebx
- add esp, localsize
- pop ebx
- pop edi
- pop esi
- pop ebp
-%undef cur_data
-%undef ref_data
-%undef iPicWidth
-%undef iPicHeight
-%undef iPicStride
-%undef psadframe
-%undef psad8x8
-%undef psum16x16
-%undef psqsum16x16
-%undef tmp_esi
-%undef tmp_edi
-%undef pushsize
-%undef localsize
- ret
+ shr dword [iPicWidth], 4 ; iPicWidth/16
+ shr dword [iPicHeight], 4 ; iPicHeight/16
+ shl eax, 4 ; iPicStride*16
+ mov ecx, [iPicWidth]
+ mov ecx, [iPicHeight]
+ pxor xmm0, xmm0
+ movd [tmp_sadframe], xmm0
+sqdiff_height_loop:
+ mov ecx, dword [iPicWidth]
+ mov [tmp_esi], esi
+ mov [tmp_edi], edi
+sqdiff_width_loop:
+ pxor xmm7, xmm7 ; hiQuad_loQuad pSad8x8
+ pxor xmm6, xmm6 ; pSum16x16
+ pxor xmm5, xmm5 ; sqsum_16x16 four dword
+ pxor xmm4, xmm4 ; sqdiff_16x16 four Dword
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+ movdqa xmm1, xmm7
+ movd [edx], xmm7
+ psrldq xmm7, 8
+ paddd xmm1, xmm7
+ movd [edx+4], xmm7
+ movd ebp, xmm1
+ add [tmp_sadframe], ebp
+ pxor xmm7, xmm7
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 esi,edi,ebx
+ movdqa xmm1, xmm7
+ movd [edx+8], xmm7
+ psrldq xmm7, 8
+ paddd xmm1, xmm7
+ movd [edx+12], xmm7
+ movd ebp, xmm1
+ add [tmp_sadframe], ebp
+ mov ebp, [psum16x16]
+ movdqa xmm1, xmm6
+ psrldq xmm1, 8
+ paddd xmm6, xmm1
+ movd [ebp], xmm6
+ add dword [psum16x16], 4
+ mov ebp, [psqsum16x16]
+ pshufd xmm6, xmm5, 14 ;00001110
+ paddd xmm6, xmm5
+ pshufd xmm5, xmm6, 1 ;00000001
+ paddd xmm5, xmm6
+ movd [ebp], xmm5
+ add dword [psqsum16x16], 4
+
+ mov ebp, [psqdiff16x16]
+ pshufd xmm5, xmm4, 14 ; 00001110
+ paddd xmm5, xmm4
+ pshufd xmm4, xmm5, 1 ; 00000001
+ paddd xmm4, xmm5
+ movd [ebp], xmm4
+ add dword [psqdiff16x16], 4
+
+ add edx, 16
+ sub esi, eax
+ sub edi, eax
+ add esi, 16
+ add edi, 16
+
+ dec ecx
+ jnz sqdiff_width_loop
+
+ mov esi, [tmp_esi]
+ mov edi, [tmp_edi]
+ add esi, eax
+ add edi, eax
+
+ dec dword [iPicHeight]
+ jnz sqdiff_height_loop
+
+ mov ebx, [tmp_sadframe]
+ mov eax, [psadframe]
+ mov [eax], ebx
+
+ add esp, localsize
+ pop ebx
+ pop edi
+ pop esi
+ pop ebp
+%undef cur_data
+%undef ref_data
+%undef iPicWidth
+%undef iPicHeight
+%undef iPicStride
+%undef psadframe
+%undef psad8x8
+%undef psum16x16
+%undef psqsum16x16
+%undef psqdiff16x16
+%undef tmp_esi
+%undef tmp_edi
+%undef tmp_sadframe
+%undef pushsize
+%undef localsize
+ ret
+
+%else
+
+
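+; 64-bit path: arguments are reached through the arg1..argN / r0..r15 aliases (presumably
+; supplied by the shared asm include), the loop counters stay in r2/r3 instead of being
+; re-read from the stack, and the frame SAD is accumulated in xmm8, a register that only
+; exists in 64-bit mode.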
WELS_EXTERN VAACalcSadSsd_sse2
;*************************************************************************************************************
;void VAACalcSadSsd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
-; int32_t iPicStride,int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, int32_t *psqdiff16x16)
+; int32_t iPicStride,int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, int32_t *psqdiff16x16)
;*************************************************************************************************************
ALIGN 16
VAACalcSadSsd_sse2:
-%define localsize 12
-%define cur_data esp + pushsize + localsize + 4
-%define ref_data esp + pushsize + localsize + 8
-%define iPicWidth esp + pushsize + localsize + 12
-%define iPicHeight esp + pushsize + localsize + 16
-%define iPicStride esp + pushsize + localsize + 20
-%define psadframe esp + pushsize + localsize + 24
-%define psad8x8 esp + pushsize + localsize + 28
-%define psum16x16 esp + pushsize + localsize + 32
-%define psqsum16x16 esp + pushsize + localsize + 36
-%define psqdiff16x16 esp + pushsize + localsize + 40
-%define tmp_esi esp + 0
-%define tmp_edi esp + 4
-%define tmp_sadframe esp + 8
-%define pushsize 16
- push ebp
- push esi
- push edi
- push ebx
- sub esp, localsize
- mov ecx, [iPicWidth]
- mov ecx, [iPicHeight]
- mov esi, [cur_data]
- mov edi, [ref_data]
- mov ebx, [iPicStride]
- mov edx, [psad8x8]
- mov eax, ebx
+%define localsize 12
+%define cur_data arg1;r0
+%define ref_data arg2;r1
+%define iPicWidth arg3;r2
+%define iPicHeight arg4;r3
+%define iPicStride arg5;
+%define psadframe arg6;
+%define psad8x8 arg7;
+%define psum16x16 arg8;
+%define psqsum16x16 arg9;
+%define psqdiff16x16 arg10
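+; Only the first four arguments arrive in registers on Win64 (the first six on SysV), so
+; arg7 onward (psad8x8, psum16x16, psqsum16x16, psqdiff16x16) are stack slots on both
+; platforms and are read and updated through memory in the loop below.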
- shr dword [iPicWidth], 4 ; iPicWidth/16
- shr dword [iPicHeight], 4 ; iPicHeight/16
- shl eax, 4 ; iPicStride*16
- mov ecx, [iPicWidth]
- mov ecx, [iPicHeight]
- pxor xmm0, xmm0
- movd [tmp_sadframe], xmm0
+ push r12
+ push r13
+ push r14
+ push r15
+ %assign push_num 4
+
+%ifdef WIN64
+ mov r4,arg5
+%endif
+ mov r14,arg7
+ SIGN_EXTENTION r2,r2d
+ SIGN_EXTENTION r3,r3d
+ SIGN_EXTENTION r4,r4d
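+; iPicWidth/iPicHeight/iPicStride are int32_t, so only the low 32 bits of their registers
+; are defined; sign-extend before using them as 64-bit loop counts and address offsets.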
+
+ mov r13,r4
+ shr r2,4 ; iPicWidth/16
+ shr r3,4 ; iPicHeight/16
+ shl r13,4 ; iPicStride*16
+ pxor xmm0, xmm0
+ pxor xmm8, xmm8 ;framesad
+ pxor xmm9, xmm9
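+; xmm8 accumulates the whole-frame SAD and xmm9 is scratch for moving each block SAD from
+; a GPR into it, replacing the [tmp_sadframe] memory accumulator of the 32-bit path.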
sqdiff_height_loop:
- mov ecx, dword [iPicWidth]
- mov [tmp_esi], esi
- mov [tmp_edi], edi
+ ;mov ecx, dword [iPicWidth]
+ ;mov r14,r2
+ push r2
+ %assign push_num push_num +1
+ mov r10, r0
+ mov r11, r1
sqdiff_width_loop:
- pxor xmm7, xmm7 ; hiQuad_loQuad pSad8x8
- pxor xmm6, xmm6 ; pSum16x16
- pxor xmm5, xmm5 ; sqsum_16x16 four dword
- pxor xmm4, xmm4 ; sqdiff_16x16 four Dword
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
- movdqa xmm1, xmm7
- movd [edx], xmm7
- psrldq xmm7, 8
- paddd xmm1, xmm7
- movd [edx+4], xmm7
- movd ebp, xmm1
- add [tmp_sadframe], ebp
+ pxor xmm7, xmm7 ; hiQuad_loQuad pSad8x8
+ pxor xmm6, xmm6 ; pSum16x16
+ pxor xmm5, xmm5 ; sqsum_16x16 four dword
+ pxor xmm4, xmm4 ; sqdiff_16x16 four Dword
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+ movdqa xmm1, xmm7
+ movd [r14], xmm7
+ psrldq xmm7, 8
+ paddd xmm1, xmm7
+ movd [r14+4], xmm7
+ movd r15d, xmm1
+ movd xmm9, r15d
+ paddd xmm8,xmm9
- pxor xmm7, xmm7
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
- WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
- movdqa xmm1, xmm7
- movd [edx+8], xmm7
- psrldq xmm7, 8
- paddd xmm1, xmm7
- movd [edx+12], xmm7
- movd ebp, xmm1
- add [tmp_sadframe], ebp
- mov ebp, [psum16x16]
- movdqa xmm1, xmm6
- psrldq xmm1, 8
- paddd xmm6, xmm1
- movd [ebp], xmm6
- add dword [psum16x16], 4
+ pxor xmm7, xmm7
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+ WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 r0,r1,r4
+ movdqa xmm1, xmm7
+ movd [r14+8], xmm7
+ psrldq xmm7, 8
+ paddd xmm1, xmm7
+ movd [r14+12], xmm7
+ movd r15d, xmm1
+ movd xmm9, r15d
+ paddd xmm8,xmm9
- mov ebp, [psqsum16x16]
- pshufd xmm6, xmm5, 14 ;00001110
- paddd xmm6, xmm5
- pshufd xmm5, xmm6, 1 ;00000001
- paddd xmm5, xmm6
- movd [ebp], xmm5
- add dword [psqsum16x16], 4
+ mov r15, psum16x16
+ movdqa xmm1, xmm6
+ psrldq xmm1, 8
+ paddd xmm6, xmm1
+ movd [r15], xmm6
+ add dword psum16x16, 4
- mov ebp, [psqdiff16x16]
- pshufd xmm5, xmm4, 14 ; 00001110
- paddd xmm5, xmm4
- pshufd xmm4, xmm5, 1 ; 00000001
- paddd xmm4, xmm5
- movd [ebp], xmm4
- add dword [psqdiff16x16], 4
+ mov r15, psqsum16x16
+ pshufd xmm6, xmm5, 14 ;00001110
+ paddd xmm6, xmm5
+ pshufd xmm5, xmm6, 1 ;00000001
+ paddd xmm5, xmm6
+ movd [r15], xmm5
+ add dword psqsum16x16, 4
- add edx, 16
- sub esi, eax
- sub edi, eax
- add esi, 16
- add edi, 16
+ mov r15, psqdiff16x16
+ pshufd xmm5, xmm4, 14 ; 00001110
+ paddd xmm5, xmm4
+ pshufd xmm4, xmm5, 1 ; 00000001
+ paddd xmm4, xmm5
+ movd [r15], xmm4
+ add dword psqdiff16x16, 4
- dec ecx
- jnz sqdiff_width_loop
+ add r14,16
+ sub r0, r13
+ sub r1, r13
+ add r0, 16
+ add r1, 16
+
+ dec r2
+ jnz sqdiff_width_loop
+
+ pop r2
+ %assign push_num push_num -1
+
+ mov r0, r10
+ mov r1, r11
+ add r0, r13
+ add r1, r13
+
+ dec r3
+ jnz sqdiff_height_loop
+
+ mov r13, psadframe
+ movd [r13], xmm8
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ %assign push_num 0
- mov esi, [tmp_esi]
- mov edi, [tmp_edi]
- add esi, eax
- add edi, eax
+%undef cur_data
+%undef ref_data
+%undef iPicWidth
+%undef iPicHeight
+%undef iPicStride
+%undef psadframe
+%undef psad8x8
+%undef psum16x16
+%undef psqsum16x16
+%undef psqdiff16x16
+%undef tmp_esi
+%undef tmp_edi
+%undef tmp_sadframe
+%undef pushsize
+%undef localsize
+ ret
- dec dword [iPicHeight]
- jnz sqdiff_height_loop
- mov ebx, [tmp_sadframe]
- mov eax, [psadframe]
- mov [eax], ebx
- add esp, localsize
- pop ebx
- pop edi
- pop esi
- pop ebp
-%undef cur_data
-%undef ref_data
-%undef iPicWidth
-%undef iPicHeight
-%undef iPicStride
-%undef psadframe
-%undef psad8x8
-%undef psum16x16
-%undef psqsum16x16
-%undef psqdiff16x16
-%undef tmp_esi
-%undef tmp_edi
-%undef tmp_sadframe
-%undef pushsize
-%undef localsize
- ret
+%endif
+%ifdef X86_32
+WELS_EXTERN VAACalcSadBgd_sse2
+;*************************************************************************************************************
+;void VAACalcSadBgd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *p_sd8x8, uint8_t *p_mad8x8)
+;*************************************************************************************************************
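+; Per 8x8 block this is expected to produce (as in the _c reference) the SAD in psad8x8,
+; the signed difference between current and reference pixel sums in p_sd8x8 and the largest
+; absolute pixel difference in p_mad8x8; psadframe again collects the frame SAD.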
+ALIGN 16
+VAACalcSadBgd_sse2:
+%define localsize 12
+%define cur_data esp + pushsize + localsize + 4
+%define ref_data esp + pushsize + localsize + 8
+%define iPicWidth esp + pushsize + localsize + 12
+%define iPicHeight esp + pushsize + localsize + 16
+%define iPicStride esp + pushsize + localsize + 20
+%define psadframe esp + pushsize + localsize + 24
+%define psad8x8 esp + pushsize + localsize + 28
+%define p_sd8x8 esp + pushsize + localsize + 32
+%define p_mad8x8 esp + pushsize + localsize + 36
+%define tmp_esi esp + 0
+%define tmp_edi esp + 4
+%define tmp_ecx esp + 8
+%define pushsize 16
+ push ebp
+ push esi
+ push edi
+ push ebx
+ sub esp, localsize
+ mov esi, [cur_data]
+ mov edi, [ref_data]
+ mov ebx, [iPicStride]
+ mov eax, ebx
+ shr dword [iPicWidth], 4 ; iPicWidth/16
+ shr dword [iPicHeight], 4 ; iPicHeight/16
+ shl eax, 4 ; iPicStride*16
+ xor ebp, ebp
+ pxor xmm0, xmm0
+bgd_height_loop:
+ mov ecx, dword [iPicWidth]
+ mov [tmp_esi], esi
+ mov [tmp_edi], edi
+bgd_width_loop:
+ pxor xmm7, xmm7 ; pSad8x8
+ pxor xmm6, xmm6 ; sum_cur_8x8
+ pxor xmm5, xmm5 ; sum_ref_8x8
+ pxor xmm4, xmm4 ; pMad8x8
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
+
+ mov edx, [p_mad8x8]
+ WELS_MAX_REG_SSE2 xmm4
+
+ ;movdqa xmm1, xmm4
+ ;punpcklbw xmm1, xmm0
+ ;punpcklwd xmm1, xmm0
+ ;movd [edx], xmm1
+ ;punpckhbw xmm4, xmm0
+ ;punpcklwd xmm4, xmm0
+ ;movd [edx+4], xmm4
+ ;add edx, 8
+ ;mov [p_mad8x8], edx
+ mov [tmp_ecx], ecx
+ movhlps xmm1, xmm4
+ movd ecx, xmm4
+ mov [edx], cl
+ movd ecx, xmm1
+ mov [edx+1],cl
+ add edx, 2
+ mov [p_mad8x8], edx
+
+
+ pslldq xmm7, 4
+ pslldq xmm6, 4
+ pslldq xmm5, 4
+
+
+ pxor xmm4, xmm4 ; pMad8x8
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,esi ,edi, ebx
+
+ mov edx, [p_mad8x8]
+ WELS_MAX_REG_SSE2 xmm4
+
+ ;movdqa xmm1, xmm4
+ ;punpcklbw xmm1, xmm0
+ ;punpcklwd xmm1, xmm0
+ ;movd [edx], xmm1
+ ;punpckhbw xmm4, xmm0
+ ;punpcklwd xmm4, xmm0
+ ;movd [edx+4], xmm4
+ ;add edx, 8
+ ;mov [p_mad8x8], edx
+ movhlps xmm1, xmm4
+ movd ecx, xmm4
+ mov [edx], cl
+ movd ecx, xmm1
+ mov [edx+1],cl
+ add edx, 2
+ mov [p_mad8x8], edx
+
+ ; data in xmm7, xmm6, xmm5: D1 D3 D0 D2
+
+ mov edx, [psad8x8]
+ pshufd xmm1, xmm7, 10001101b ; D3 D2 D1 D0
+ movdqa [edx], xmm1
+ add edx, 16
+ mov [psad8x8], edx ; sad8x8
+
+ paddd xmm1, xmm7 ; D1+3 D3+2 D0+1 D2+0
+ pshufd xmm2, xmm1, 00000011b
+ paddd xmm1, xmm2
+ movd edx, xmm1
+ add ebp, edx ; sad frame
+
+ mov edx, [p_sd8x8]
+ psubd xmm6, xmm5
+ pshufd xmm1, xmm6, 10001101b
+ movdqa [edx], xmm1
+ add edx, 16
+ mov [p_sd8x8], edx
+
+
+ add edx, 16
+ sub esi, eax
+ sub edi, eax
+ add esi, 16
+ add edi, 16
+
+ mov ecx, [tmp_ecx]
+ dec ecx
+ jnz bgd_width_loop
+
+ mov esi, [tmp_esi]
+ mov edi, [tmp_edi]
+ add esi, eax
+ add edi, eax
+
+ dec dword [iPicHeight]
+ jnz bgd_height_loop
+
+ mov edx, [psadframe]
+ mov [edx], ebp
+
+ add esp, localsize
+ pop ebx
+ pop edi
+ pop esi
+ pop ebp
+%undef cur_data
+%undef ref_data
+%undef iPicWidth
+%undef iPicHeight
+%undef iPicStride
+%undef psadframe
+%undef psad8x8
+%undef p_sd8x8
+%undef p_mad8x8
+%undef tmp_esi
+%undef tmp_edi
+%undef pushsize
+%undef localsize
+ ret
+
+
+
+WELS_EXTERN VAACalcSadSsdBgd_sse2
+;*************************************************************************************************************
+;void VAACalcSadSsdBgd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16,
+; int32_t *psqdiff16x16, int32_t *p_sd8x8, uint8_t *p_mad8x8)
+;*************************************************************************************************************
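+; Combines the two routines above: per 8x8 block it fills psad8x8, p_sd8x8 and p_mad8x8,
+; per 16x16 block psum16x16, psqsum16x16 and psqdiff16x16, and psadframe holds the frame
+; total.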
+
+
+ALIGN 16
+VAACalcSadSsdBgd_sse2:
+%define localsize 16
+%define cur_data esp + pushsize + localsize + 4
+%define ref_data esp + pushsize + localsize + 8
+%define iPicWidth esp + pushsize + localsize + 12
+%define iPicHeight esp + pushsize + localsize + 16
+%define iPicStride esp + pushsize + localsize + 20
+%define psadframe esp + pushsize + localsize + 24
+%define psad8x8 esp + pushsize + localsize + 28
+%define psum16x16 esp + pushsize + localsize + 32
+%define psqsum16x16 esp + pushsize + localsize + 36
+%define psqdiff16x16 esp + pushsize + localsize + 40
+%define p_sd8x8 esp + pushsize + localsize + 44
+%define p_mad8x8 esp + pushsize + localsize + 48
+%define tmp_esi esp + 0
+%define tmp_edi esp + 4
+%define tmp_sadframe esp + 8
+%define tmp_ecx esp + 12
+%define pushsize 16
+ push ebp
+ push esi
+ push edi
+ push ebx
+ sub esp, localsize
+ mov esi, [cur_data]
+ mov edi, [ref_data]
+ mov ebx, [iPicStride]
+ mov eax, ebx
+
+ shr dword [iPicWidth], 4 ; iPicWidth/16
+ shr dword [iPicHeight], 4 ; iPicHeight/16
+ shl eax, 4 ; iPicStride*16
+ pxor xmm0, xmm0
+ movd [tmp_sadframe], xmm0
+sqdiff_bgd_height_loop:
+ mov ecx, dword [iPicWidth]
+ mov [tmp_esi], esi
+ mov [tmp_edi], edi
+sqdiff_bgd_width_loop:
+ pxor xmm7, xmm7 ; pSad8x8 interleaves sqsum16x16: sqsum1 sad1 sqsum0 sad0
+ pxor xmm6, xmm6 ; sum_8x8 interleaves cur and pRef in Dword, Sref1 Scur1 Sref0 Scur0
+ pxor xmm5, xmm5 ; pMad8x8
+ pxor xmm4, xmm4 ; sqdiff_16x16 four Dword
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
+
+ mov edx, [psad8x8]
+ movdqa xmm2, xmm7
+ pshufd xmm1, xmm2, 00001110b
+ movd [edx], xmm2
+ movd [edx+4], xmm1
+ add edx, 8
+ mov [psad8x8], edx ; sad8x8
+
+ paddd xmm1, xmm2
+ movd edx, xmm1
+ add [tmp_sadframe], edx ; iFrameSad
+
+ mov edx, [psum16x16]
+ movdqa xmm1, xmm6
+ pshufd xmm2, xmm1, 00001110b
+ paddd xmm1, xmm2
+ movd [edx], xmm1 ; sum
+
+ mov edx, [p_sd8x8]
+ pshufd xmm1, xmm6, 11110101b ; Sref1 Sref1 Sref0 Sref0
+ psubd xmm6, xmm1 ; 00 diff1 00 diff0
+ pshufd xmm1, xmm6, 00001000b ; xx xx diff1 diff0
+ movq [edx], xmm1
+ add edx, 8
+ mov [p_sd8x8], edx
+
+ mov edx, [p_mad8x8]
+ WELS_MAX_REG_SSE2 xmm5
+ ;movdqa xmm1, xmm5
+ ;punpcklbw xmm1, xmm0
+ ;punpcklwd xmm1, xmm0
+ ;movd [edx], xmm1
+ ;punpckhbw xmm5, xmm0
+ ;punpcklwd xmm5, xmm0
+ ;movd [edx+4], xmm5
+ ;add edx, 8
+ ;mov [p_mad8x8], edx
+ mov [tmp_ecx], ecx
+ movhlps xmm1, xmm5
+ movd ecx, xmm5
+ mov [edx], cl
+ movd ecx, xmm1
+ mov [edx+1],cl
+ add edx, 2
+ mov [p_mad8x8], edx
+
+ psrlq xmm7, 32
+ psllq xmm7, 32 ; clear sad
+ pxor xmm6, xmm6 ; sum_8x8 interleaves cur and pRef in Dword, Sref1 Scur1 Sref0 Scur0
+ pxor xmm5, xmm5 ; pMad8x8
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, esi , edi , ebx
+
+ mov edx, [psad8x8]
+ movdqa xmm2, xmm7
+ pshufd xmm1, xmm2, 00001110b
+ movd [edx], xmm2
+ movd [edx+4], xmm1
+ add edx, 8
+ mov [psad8x8], edx ; sad8x8
+
+ paddd xmm1, xmm2
+ movd edx, xmm1
+ add [tmp_sadframe], edx ; iFrameSad
+
+ mov edx, [psum16x16]
+ movdqa xmm1, xmm6
+ pshufd xmm2, xmm1, 00001110b
+ paddd xmm1, xmm2
+ movd ebp, xmm1 ; sum
+ add [edx], ebp
+ add edx, 4
+ mov [psum16x16], edx
+
+ mov edx, [psqsum16x16]
+ psrlq xmm7, 32
+ pshufd xmm2, xmm7, 00001110b
+ paddd xmm2, xmm7
+ movd [edx], xmm2 ; sqsum
+ add edx, 4
+ mov [psqsum16x16], edx
+
+ mov edx, [p_sd8x8]
+ pshufd xmm1, xmm6, 11110101b ; Sref1 Sref1 Sref0 Sref0
+ psubd xmm6, xmm1 ; 00 diff1 00 diff0
+ pshufd xmm1, xmm6, 00001000b ; xx xx diff1 diff0
+ movq [edx], xmm1
+ add edx, 8
+ mov [p_sd8x8], edx
+
+ mov edx, [p_mad8x8]
+ WELS_MAX_REG_SSE2 xmm5
+ ;movdqa xmm1, xmm5
+ ;punpcklbw xmm1, xmm0
+ ;punpcklwd xmm1, xmm0
+ ;movd [edx], xmm1
+ ;punpckhbw xmm5, xmm0
+ ;punpcklwd xmm5, xmm0
+ ;movd [edx+4], xmm5
+ ;add edx, 8
+ ;mov [p_mad8x8], edx
+ movhlps xmm1, xmm5
+ movd ecx, xmm5
+ mov [edx], cl
+ movd ecx, xmm1
+ mov [edx+1],cl
+ add edx, 2
+ mov [p_mad8x8], edx
+
+ mov edx, [psqdiff16x16]
+ pshufd xmm1, xmm4, 00001110b
+ paddd xmm4, xmm1
+ pshufd xmm1, xmm4, 00000001b
+ paddd xmm4, xmm1
+ movd [edx], xmm4
+ add edx, 4
+ mov [psqdiff16x16], edx
+
+ add edx, 16
+ sub esi, eax
+ sub edi, eax
+ add esi, 16
+ add edi, 16
+
+ mov ecx, [tmp_ecx]
+ dec ecx
+ jnz sqdiff_bgd_width_loop
+
+ mov esi, [tmp_esi]
+ mov edi, [tmp_edi]
+ add esi, eax
+ add edi, eax
+
+ dec dword [iPicHeight]
+ jnz sqdiff_bgd_height_loop
+
+ mov edx, [psadframe]
+ mov ebp, [tmp_sadframe]
+ mov [edx], ebp
+
+ add esp, localsize
+ pop ebx
+ pop edi
+ pop esi
+ pop ebp
+%undef cur_data
+%undef ref_data
+%undef iPicWidth
+%undef iPicHeight
+%undef iPicStride
+%undef psadframe
+%undef psad8x8
+%undef psum16x16
+%undef psqsum16x16
+%undef psqdiff16x16
+%undef p_sd8x8
+%undef p_mad8x8
+%undef tmp_esi
+%undef tmp_edi
+%undef pushsize
+%undef localsize
+ ret
+%else
+
WELS_EXTERN VAACalcSadBgd_sse2
;*************************************************************************************************************
;void VAACalcSadBgd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
-; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *p_sd8x8, uint8_t *p_mad8x8)
+; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *p_sd8x8, uint8_t *p_mad8x8)
;*************************************************************************************************************
ALIGN 16
VAACalcSadBgd_sse2:
-%define localsize 12
-%define cur_data esp + pushsize + localsize + 4
-%define ref_data esp + pushsize + localsize + 8
-%define iPicWidth esp + pushsize + localsize + 12
-%define iPicHeight esp + pushsize + localsize + 16
-%define iPicStride esp + pushsize + localsize + 20
-%define psadframe esp + pushsize + localsize + 24
-%define psad8x8 esp + pushsize + localsize + 28
-%define p_sd8x8 esp + pushsize + localsize + 32
-%define p_mad8x8 esp + pushsize + localsize + 36
-%define tmp_esi esp + 0
-%define tmp_edi esp + 4
-%define tmp_ecx esp + 8
-%define pushsize 16
- push ebp
- push esi
- push edi
- push ebx
- sub esp, localsize
- mov esi, [cur_data]
- mov edi, [ref_data]
- mov ebx, [iPicStride]
- mov eax, ebx
+%define cur_data arg1;
+%define ref_data arg2;
+%define iPicWidth arg3;
+%define iPicHeight arg4;
+%define iPicStride arg5;
+%define psadframe arg6;
+%define psad8x8 arg7;
+%define p_sd8x8 arg8;
+%define p_mad8x8 arg9;
- shr dword [iPicWidth], 4 ; iPicWidth/16
- shr dword [iPicHeight], 4 ; iPicHeight/16
- shl eax, 4 ; iPicStride*16
- xor ebp, ebp
- pxor xmm0, xmm0
+ push r12
+ push r13
+ push r14
+ push r15
+%assign push_num 4
+%ifdef WIN64
+ mov r4,arg5
+ ; mov r5,arg6
+%endif
+ mov r14,arg7
+ SIGN_EXTENTION r2,r2d
+ SIGN_EXTENTION r3,r3d
+ SIGN_EXTENTION r4,r4d
+
+
+ mov r13,r4
+ mov r15,r0
+ shr r2,4
+ shr r3,4
+ shl r13,4
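+; r2 = iPicWidth/16, r3 = iPicHeight/16 (macroblock counts); r13 = iPicStride*16, the byte
+; offset of one macroblock row, matching the scaling in the routines above.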
+ pxor xmm0, xmm0
+ pxor xmm8, xmm8
+ pxor xmm9, xmm9
bgd_height_loop:
- mov ecx, dword [iPicWidth]
- mov [tmp_esi], esi
- mov [tmp_edi], edi
+ ;mov ecx, dword [iPicWidth]
+ push r2
+ %assign push_num push_num+1
+ mov r10, r15
+ mov r11, r1
bgd_width_loop:
- pxor xmm7, xmm7 ; pSad8x8
- pxor xmm6, xmm6 ; sum_cur_8x8
- pxor xmm5, xmm5 ; sum_ref_8x8
- pxor xmm4, xmm4 ; pMad8x8
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ pxor xmm7, xmm7 ; pSad8x8
+ pxor xmm6, xmm6 ; sum_cur_8x8
+ pxor xmm5, xmm5 ; sum_ref_8x8
+ pxor xmm4, xmm4 ; pMad8x8
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
- mov edx, [p_mad8x8]
- WELS_MAX_REG_SSE2 xmm4
+ mov r14, p_mad8x8
+ WELS_MAX_REG_SSE2 xmm4
- ;movdqa xmm1, xmm4
- ;punpcklbw xmm1, xmm0
- ;punpcklwd xmm1, xmm0
- ;movd [edx], xmm1
- ;punpckhbw xmm4, xmm0
- ;punpcklwd xmm4, xmm0
- ;movd [edx+4], xmm4
- ;add edx, 8
- ;mov [p_mad8x8], edx
- mov [tmp_ecx], ecx
- movhlps xmm1, xmm4
- movd ecx, xmm4
- mov [edx], cl
- movd ecx, xmm1
- mov [edx+1],cl
- add edx, 2
- mov [p_mad8x8], edx
+ ;mov [tmp_ecx], ecx
+ movhlps xmm1, xmm4
+ movd r0d, xmm4
- pslldq xmm7, 4
- pslldq xmm6, 4
- pslldq xmm5, 4
+ mov [r14], r0b
+ movd r0d, xmm1
+ mov [r14+1],r0b
+ add r14, 2
+ ;mov p_mad8x8, r14
- pxor xmm4, xmm4 ; pMad8x8
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ pslldq xmm7, 4
+ pslldq xmm6, 4
+ pslldq xmm5, 4
- mov edx, [p_mad8x8]
- WELS_MAX_REG_SSE2 xmm4
- ;movdqa xmm1, xmm4
- ;punpcklbw xmm1, xmm0
- ;punpcklwd xmm1, xmm0
- ;movd [edx], xmm1
- ;punpckhbw xmm4, xmm0
- ;punpcklwd xmm4, xmm0
- ;movd [edx+4], xmm4
- ;add edx, 8
- ;mov [p_mad8x8], edx
- movhlps xmm1, xmm4
- movd ecx, xmm4
- mov [edx], cl
- movd ecx, xmm1
- mov [edx+1],cl
- add edx, 2
- mov [p_mad8x8], edx
+ pxor xmm4, xmm4 ; pMad8x8
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
+ WELS_SAD_SD_MAD_16x1_SSE2 xmm7, xmm6, xmm5, xmm4 ,r15 ,r1, r4
- ; data in xmm7, xmm6, xmm5: D1 D3 D0 D2
+ ;mov r14, [p_mad8x8]
+ WELS_MAX_REG_SSE2 xmm4
- mov edx, [psad8x8]
- pshufd xmm1, xmm7, 10001101b ; D3 D2 D1 D0
- movdqa [edx], xmm1
- add edx, 16
- mov [psad8x8], edx ; sad8x8
+ movhlps xmm1, xmm4
+ movd r0d, xmm4
+ mov [r14], r0b
+ movd r0d, xmm1
+ mov [r14+1],r0b
+ add r14, 2
+ mov p_mad8x8, r14
- paddd xmm1, xmm7 ; D1+3 D3+2 D0+1 D2+0
- pshufd xmm2, xmm1, 00000011b
- paddd xmm1, xmm2
- movd edx, xmm1
- add ebp, edx ; sad frame
+ ; data in xmm7, xmm6, xmm5: D1 D3 D0 D2
- mov edx, [p_sd8x8]
- psubd xmm6, xmm5
- pshufd xmm1, xmm6, 10001101b
- movdqa [edx], xmm1
- add edx, 16
- mov [p_sd8x8], edx
+ mov r14, psad8x8
+ pshufd xmm1, xmm7, 10001101b ; D3 D2 D1 D0
+ movdqa [r14], xmm1
+ add r14, 16
+ mov psad8x8, r14 ; sad8x8
+ paddd xmm1, xmm7 ; D1+3 D3+2 D0+1 D2+0
+ pshufd xmm2, xmm1, 00000011b
+ paddd xmm1, xmm2
+ movd r14d, xmm1
+ movd xmm9, r14d
+ paddd xmm8, xmm9 ; sad frame
- add edx, 16
- sub esi, eax
- sub edi, eax
- add esi, 16
- add edi, 16
+ mov r14, p_sd8x8
+ psubd xmm6, xmm5
+ pshufd xmm1, xmm6, 10001101b
+ movdqa [r14], xmm1
+ add r14, 16
+ mov p_sd8x8, r14
- mov ecx, [tmp_ecx]
- dec ecx
- jnz bgd_width_loop
- mov esi, [tmp_esi]
- mov edi, [tmp_edi]
- add esi, eax
- add edi, eax
+ ;add edx, 16
+ sub r15, r13
+ sub r1, r13
+ add r15, 16
+ add r1, 16
- dec dword [iPicHeight]
- jnz bgd_height_loop
- mov edx, [psadframe]
- mov [edx], ebp
+ dec r2
+ jnz bgd_width_loop
+ pop r2
+%assign push_num push_num-1
+ mov r15, r10
+ mov r1, r11
+ add r15, r13
+ add r1, r13
- add esp, localsize
- pop ebx
- pop edi
- pop esi
- pop ebp
-%undef cur_data
-%undef ref_data
-%undef iPicWidth
-%undef iPicHeight
-%undef iPicStride
-%undef psadframe
-%undef psad8x8
-%undef p_sd8x8
-%undef p_mad8x8
-%undef tmp_esi
-%undef tmp_edi
-%undef pushsize
-%undef localsize
- ret
+ dec r3
+ jnz bgd_height_loop
+ mov r13, psadframe
+ movd [r13], xmm8
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+%assign push_num 0
+%undef cur_data
+%undef ref_data
+%undef iPicWidth
+%undef iPicHeight
+%undef iPicStride
+%undef psadframe
+%undef psad8x8
+%undef p_sd8x8
+%undef p_mad8x8
+%undef tmp_esi
+%undef tmp_edi
+%undef pushsize
+%undef localsize
+ ret
+
+
WELS_EXTERN VAACalcSadSsdBgd_sse2
;*************************************************************************************************************
;void VAACalcSadSsdBgd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
-; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16,
-; int32_t *psqdiff16x16, int32_t *p_sd8x8, uint8_t *p_mad8x8)
+; int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16,
+; int32_t *psqdiff16x16, int32_t *p_sd8x8, uint8_t *p_mad8x8)
;*************************************************************************************************************
ALIGN 16
VAACalcSadSsdBgd_sse2:
-%define localsize 16
-%define cur_data esp + pushsize + localsize + 4
-%define ref_data esp + pushsize + localsize + 8
-%define iPicWidth esp + pushsize + localsize + 12
-%define iPicHeight esp + pushsize + localsize + 16
-%define iPicStride esp + pushsize + localsize + 20
-%define psadframe esp + pushsize + localsize + 24
-%define psad8x8 esp + pushsize + localsize + 28
-%define psum16x16 esp + pushsize + localsize + 32
-%define psqsum16x16 esp + pushsize + localsize + 36
-%define psqdiff16x16 esp + pushsize + localsize + 40
-%define p_sd8x8 esp + pushsize + localsize + 44
-%define p_mad8x8 esp + pushsize + localsize + 48
-%define tmp_esi esp + 0
-%define tmp_edi esp + 4
-%define tmp_sadframe esp + 8
-%define tmp_ecx esp + 12
-%define pushsize 16
- push ebp
- push esi
- push edi
- push ebx
- sub esp, localsize
- mov esi, [cur_data]
- mov edi, [ref_data]
- mov ebx, [iPicStride]
- mov eax, ebx
+%define cur_data arg1;
+%define ref_data arg2;
+%define iPicWidth arg3;
+%define iPicHeight arg4;
+%define iPicStride arg5;
+%define psadframe arg6;
+%define psad8x8 arg7;
+%define psum16x16 arg8;
+%define psqsum16x16 arg9;
+%define psqdiff16x16 arg10;
+%ifdef WIN64
+%define p_sd8x8 [rsp + push_num*8 + 88];
+%define p_mad8x8 [rsp + push_num*8 + 96];
+%else ;linux
+%define p_sd8x8 [rsp + push_num*8 + 40];
+%define p_mad8x8 [rsp + push_num*8 + 48];
+%endif
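+; p_sd8x8/p_mad8x8 are the 11th and 12th arguments and always arrive on the stack: Win64
+; puts them above the return address, the 32-byte shadow space and six stack arguments
+; (8 + 32 + 6*8 = 88 and 96), SysV above the return address and four stack arguments
+; (8 + 4*8 = 40 and 48); the push_num*8 term accounts for the registers pushed in the
+; prologue.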
- shr dword [iPicWidth], 4 ; iPicWidth/16
- shr dword [iPicHeight], 4 ; iPicHeight/16
- shl eax, 4 ; iPicStride*16
- pxor xmm0, xmm0
- movd [tmp_sadframe], xmm0
+ push r12
+ push r13
+ push r14
+ push r15
+%assign push_num 4
+%ifdef WIN64
+ mov r4,arg5
+ ;mov r5,arg6
+%endif
+ SIGN_EXTENTION r2,r2d
+ SIGN_EXTENTION r3,r3d
+ SIGN_EXTENTION r4,r4d
+
+ mov r13,r4
+ shr r2, 4 ; iPicWidth/16
+ shr r3, 4 ; iPicHeight/16
+ shl r13, 4 ; iPicStride*16
+ pxor xmm0, xmm0
+ pxor xmm8, xmm8
+ pxor xmm9, xmm9
+
+
sqdiff_bgd_height_loop:
- mov ecx, dword [iPicWidth]
- mov [tmp_esi], esi
- mov [tmp_edi], edi
+ mov r10, r0
+ mov r11, r1
+ push r2
+%assign push_num push_num+1
sqdiff_bgd_width_loop:
- pxor xmm7, xmm7 ; pSad8x8 interleaves sqsum16x16: sqsum1 sad1 sqsum0 sad0
- pxor xmm6, xmm6 ; sum_8x8 interleaves cur and pRef in Dword, Sref1 Scur1 Sref0 Scur0
- pxor xmm5, xmm5 ; pMad8x8
- pxor xmm4, xmm4 ; sqdiff_16x16 four Dword
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- mov edx, [psad8x8]
- movdqa xmm2, xmm7
- pshufd xmm1, xmm2, 00001110b
- movd [edx], xmm2
- movd [edx+4], xmm1
- add edx, 8
- mov [psad8x8], edx ; sad8x8
+ pxor xmm7, xmm7 ; pSad8x8 interleaves sqsum16x16: sqsum1 sad1 sqsum0 sad0
+ pxor xmm6, xmm6 ; sum_8x8 interleaves cur and pRef in Dword, Sref1 Scur1 Sref0 Scur0
+ pxor xmm5, xmm5 ; pMad8x8
+ pxor xmm4, xmm4 ; sqdiff_16x16 four Dword
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
- paddd xmm1, xmm2
- movd edx, xmm1
- add [tmp_sadframe], edx ; iFrameSad
+ mov r14, psad8x8
+ movdqa xmm2, xmm7
+ pshufd xmm1, xmm2, 00001110b
+ movd [r14], xmm2
+ movd [r14+4], xmm1
+ add r14, 8
+ mov psad8x8, r14 ; sad8x8
- mov edx, [psum16x16]
- movdqa xmm1, xmm6
- pshufd xmm2, xmm1, 00001110b
- paddd xmm1, xmm2
- movd [edx], xmm1 ; sum
+ paddd xmm1, xmm2
+ movd r14d, xmm1
+ movd xmm9,r14d
+ paddd xmm8, xmm9 ; iFrameSad
- mov edx, [p_sd8x8]
- pshufd xmm1, xmm6, 11110101b ; Sref1 Sref1 Sref0 Sref0
- psubd xmm6, xmm1 ; 00 diff1 00 diff0
- pshufd xmm1, xmm6, 00001000b ; xx xx diff1 diff0
- movq [edx], xmm1
- add edx, 8
- mov [p_sd8x8], edx
+ mov r14, psum16x16
+ movdqa xmm1, xmm6
+ pshufd xmm2, xmm1, 00001110b
+ paddd xmm1, xmm2
+ movd [r14], xmm1 ; sum
- mov edx, [p_mad8x8]
- WELS_MAX_REG_SSE2 xmm5
- ;movdqa xmm1, xmm5
- ;punpcklbw xmm1, xmm0
- ;punpcklwd xmm1, xmm0
- ;movd [edx], xmm1
- ;punpckhbw xmm5, xmm0
- ;punpcklwd xmm5, xmm0
- ;movd [edx+4], xmm5
- ;add edx, 8
- ;mov [p_mad8x8], edx
- mov [tmp_ecx], ecx
- movhlps xmm1, xmm5
- movd ecx, xmm5
- mov [edx], cl
- movd ecx, xmm1
- mov [edx+1],cl
- add edx, 2
- mov [p_mad8x8], edx
+ mov r14, p_sd8x8
+ pshufd xmm1, xmm6, 11110101b ; Sref1 Sref1 Sref0 Sref0
+ psubd xmm6, xmm1 ; 00 diff1 00 diff0
+ pshufd xmm1, xmm6, 00001000b ; xx xx diff1 diff0
+ movq [r14], xmm1
+ add r14, 8
+ mov p_sd8x8, r14
- psrlq xmm7, 32
- psllq xmm7, 32 ; clear sad
- pxor xmm6, xmm6 ; sum_8x8 interleaves cur and pRef in Dword, Sref1 Scur1 Sref0 Scur0
- pxor xmm5, xmm5 ; pMad8x8
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
- WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4
+ mov r14, p_mad8x8
+ WELS_MAX_REG_SSE2 xmm5
- mov edx, [psad8x8]
- movdqa xmm2, xmm7
- pshufd xmm1, xmm2, 00001110b
- movd [edx], xmm2
- movd [edx+4], xmm1
- add edx, 8
- mov [psad8x8], edx ; sad8x8
+ movhlps xmm1, xmm5
+ push r0
+ movd r0d, xmm5
+ mov [r14], r0b
+ movd r0d, xmm1
+ mov [r14+1],r0b
+ pop r0
+ add r14, 2
+ mov p_mad8x8, r14
- paddd xmm1, xmm2
- movd edx, xmm1
- add [tmp_sadframe], edx ; iFrameSad
+ psrlq xmm7, 32
+ psllq xmm7, 32 ; clear sad
+ pxor xmm6, xmm6 ; sum_8x8 interleaves cur and pRef in Dword, Sref1 Scur1 Sref0 Scur0
+ pxor xmm5, xmm5 ; pMad8x8
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
+ WELS_SAD_BGD_SQDIFF_16x1_SSE2 xmm7, xmm6, xmm5, xmm4, r0 , r1 , r4
- mov edx, [psum16x16]
- movdqa xmm1, xmm6
- pshufd xmm2, xmm1, 00001110b
- paddd xmm1, xmm2
- movd ebp, xmm1 ; sum
- add [edx], ebp
- add edx, 4
- mov [psum16x16], edx
+ mov r14, psad8x8
+ movdqa xmm2, xmm7
+ pshufd xmm1, xmm2, 00001110b
+ movd [r14], xmm2
+ movd [r14+4], xmm1
+ add r14, 8
+ mov psad8x8, r14 ; sad8x8
- mov edx, [psqsum16x16]
- psrlq xmm7, 32
- pshufd xmm2, xmm7, 00001110b
- paddd xmm2, xmm7
- movd [edx], xmm2 ; sqsum
- add edx, 4
- mov [psqsum16x16], edx
+ paddd xmm1, xmm2
+ movd r14d, xmm1
+ movd xmm9, r14d
+ paddd xmm8, xmm9 ; iFrameSad
- mov edx, [p_sd8x8]
- pshufd xmm1, xmm6, 11110101b ; Sref1 Sref1 Sref0 Sref0
- psubd xmm6, xmm1 ; 00 diff1 00 diff0
- pshufd xmm1, xmm6, 00001000b ; xx xx diff1 diff0
- movq [edx], xmm1
- add edx, 8
- mov [p_sd8x8], edx
+ mov r14, psum16x16
+ movdqa xmm1, xmm6
+ pshufd xmm2, xmm1, 00001110b
+ paddd xmm1, xmm2
+ movd r15d, xmm1 ; sum
+ add [r14], r15d
+ add r14, 4
+ mov psum16x16, r14
- mov edx, [p_mad8x8]
- WELS_MAX_REG_SSE2 xmm5
- ;movdqa xmm1, xmm5
- ;punpcklbw xmm1, xmm0
- ;punpcklwd xmm1, xmm0
- ;movd [edx], xmm1
- ;punpckhbw xmm5, xmm0
- ;punpcklwd xmm5, xmm0
- ;movd [edx+4], xmm5
- ;add edx, 8
- ;mov [p_mad8x8], edx
- movhlps xmm1, xmm5
- movd ecx, xmm5
- mov [edx], cl
- movd ecx, xmm1
- mov [edx+1],cl
- add edx, 2
- mov [p_mad8x8], edx
+ mov r14, psqsum16x16
+ psrlq xmm7, 32
+ pshufd xmm2, xmm7, 00001110b
+ paddd xmm2, xmm7
+ movd [r14], xmm2 ; sqsum
+ add r14, 4
+ mov psqsum16x16, r14
- mov edx, [psqdiff16x16]
- pshufd xmm1, xmm4, 00001110b
- paddd xmm4, xmm1
- pshufd xmm1, xmm4, 00000001b
- paddd xmm4, xmm1
- movd [edx], xmm4
- add edx, 4
- mov [psqdiff16x16], edx
+ mov r14, p_sd8x8
+ pshufd xmm1, xmm6, 11110101b ; Sref1 Sref1 Sref0 Sref0
+ psubd xmm6, xmm1 ; 00 diff1 00 diff0
+ pshufd xmm1, xmm6, 00001000b ; xx xx diff1 diff0
+ movq [r14], xmm1
+ add r14, 8
+ mov p_sd8x8, r14
- add edx, 16
- sub esi, eax
- sub edi, eax
- add esi, 16
- add edi, 16
+ mov r14, p_mad8x8
+ WELS_MAX_REG_SSE2 xmm5
- mov ecx, [tmp_ecx]
- dec ecx
- jnz sqdiff_bgd_width_loop
- mov esi, [tmp_esi]
- mov edi, [tmp_edi]
- add esi, eax
- add edi, eax
+ movhlps xmm1, xmm5
+ push r0
+ movd r0d, xmm5
+ mov [r14], r0b
+ movd r0d, xmm1
+ mov [r14+1],r0b
+ pop r0
+ add r14, 2
+ mov p_mad8x8, r14
- dec dword [iPicHeight]
- jnz sqdiff_bgd_height_loop
+ mov r14, psqdiff16x16
+ pshufd xmm1, xmm4, 00001110b
+ paddd xmm4, xmm1
+ pshufd xmm1, xmm4, 00000001b
+ paddd xmm4, xmm1
+ movd [r14], xmm4
+ add r14, 4
+ mov psqdiff16x16, r14
- mov edx, [psadframe]
- mov ebp, [tmp_sadframe]
- mov [edx], ebp
+ add r14, 16
+ sub r0, r13
+ sub r1, r13
+ add r0, 16
+ add r1, 16
- add esp, localsize
- pop ebx
- pop edi
- pop esi
- pop ebp
-%undef cur_data
-%undef ref_data
-%undef iPicWidth
-%undef iPicHeight
-%undef iPicStride
-%undef psadframe
-%undef psad8x8
-%undef psum16x16
-%undef psqsum16x16
-%undef psqdiff16x16
-%undef p_sd8x8
-%undef p_mad8x8
-%undef tmp_esi
-%undef tmp_edi
-%undef pushsize
-%undef localsize
- ret
+ dec r2
+ jnz sqdiff_bgd_width_loop
+ pop r2
+ %assign push_num push_num-1
+ mov r0, r10
+ mov r1, r11
+ add r0, r13
+ add r1, r13
+
+ dec r3
+ jnz sqdiff_bgd_height_loop
+
+ mov r14, psadframe
+ movd [r14], xmm8
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+%assign push_num 0
+%undef cur_data
+%undef ref_data
+%undef iPicWidth
+%undef iPicHeight
+%undef iPicStride
+%undef psadframe
+%undef psad8x8
+%undef psum16x16
+%undef psqsum16x16
+%undef psqdiff16x16
+%undef p_sd8x8
+%undef p_mad8x8
+%undef tmp_esi
+%undef tmp_edi
+%undef pushsize
+%undef localsize
+ ret
%endif
--- a/codec/processing/src/common/cpu.cpp
+++ /dev/null
@@ -1,196 +1,0 @@
-/*!
- * \copy
- * Copyright (c) 2009-2013, Cisco Systems
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- * \file cpu.c
- *
- * \brief CPU compatibility detection
- *
- * \date 04/29/2009 Created
- *
- *************************************************************************************
- */
-
-#include "util.h"
-#include "cpu.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-#define CPU_Vender_AMD "AuthenticAMD"
-#define CPU_Vender_INTEL "GenuineIntel"
-#define CPU_Vender_CYRIX "CyrixInstead"
-
-
-#if defined(X86_ASM)
-
-uint32_t WelsCPUFeatureDetect (int32_t* pNumberOfLogicProcessors) {
- uint32_t uiCPU = 0;
- uint32_t uiFeatureA = 0, uiFeatureB = 0, uiFeatureC = 0, uiFeatureD = 0;
- int32_t CacheLineSize = 0;
- int8_t chVenderName[16] = { 0 };
-
- if (!WelsCPUIdVerify()) {
- /* cpuid is not supported in cpu */
- return 0;
- }
-
- WelsCPUId (0, &uiFeatureA, (uint32_t*)&chVenderName[0], (uint32_t*)&chVenderName[8], (uint32_t*)&chVenderName[4]);
- if (uiFeatureA == 0) {
- /* maximum input value for basic cpuid information */
- return 0;
- }
-
- WelsCPUId (1, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
- if ((uiFeatureD & 0x00800000) == 0) {
- /* Basic MMX technology is not support in cpu, mean nothing for us so return here */
- return 0;
- }
-
- uiCPU = WELS_CPU_MMX;
- if (uiFeatureD & 0x02000000) {
- /* SSE technology is identical to AMD MMX extensions */
- uiCPU |= WELS_CPU_MMXEXT | WELS_CPU_SSE;
- }
- if (uiFeatureD & 0x04000000) {
- /* SSE2 support here */
- uiCPU |= WELS_CPU_SSE2;
- }
- if (uiFeatureD & 0x00000001) {
- /* x87 FPU on-chip checking */
- uiCPU |= WELS_CPU_FPU;
- }
- if (uiFeatureD & 0x00008000) {
- /* CMOV instruction checking */
- uiCPU |= WELS_CPU_CMOV;
- }
- if (!strcmp ((const str_t*)chVenderName, CPU_Vender_INTEL)) { // confirmed_safe_unsafe_usage
- if (uiFeatureD & 0x10000000) {
- /* Multi-Threading checking: contains of multiple logic processors */
- uiCPU |= WELS_CPU_HTT;
- }
- }
-
- if (uiFeatureC & 0x00000001) {
- /* SSE3 support here */
- uiCPU |= WELS_CPU_SSE3;
- }
- if (uiFeatureC & 0x00000200) {
- /* SSSE3 support here */
- uiCPU |= WELS_CPU_SSSE3;
- }
- if (uiFeatureC & 0x00080000) {
- /* SSE4.1 support here, 45nm Penryn processor */
- uiCPU |= WELS_CPU_SSE41;
- }
- if (uiFeatureC & 0x00100000) {
- /* SSE4.2 support here, next generation Nehalem processor */
- uiCPU |= WELS_CPU_SSE42;
- }
- if (WelsCPUSupportAVX (uiFeatureA, uiFeatureC)) { //
- /* AVX supported */
- uiCPU |= WELS_CPU_AVX;
- }
- if (WelsCPUSupportFMA (uiFeatureA, uiFeatureC)) { //
- /* AVX FMA supported */
- uiCPU |= WELS_CPU_FMA;
- }
- if (uiFeatureC & 0x02000000) {
- /* AES checking */
- uiCPU |= WELS_CPU_AES;
- }
- if (uiFeatureC & 0x00400000) {
- /* MOVBE checking */
- uiCPU |= WELS_CPU_MOVBE;
- }
-
- if (pNumberOfLogicProcessors != NULL) {
- // HTT enabled on chip
- *pNumberOfLogicProcessors = (uiFeatureB & 0x00ff0000) >> 16; // feature bits: 23-16 on returned EBX
- }
-
- WelsCPUId (0x80000000, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
-
- if ((!strcmp ((const str_t*)chVenderName, CPU_Vender_AMD))
- && (uiFeatureA >= 0x80000001)) { // confirmed_safe_unsafe_usage
- WelsCPUId (0x80000001, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
- if (uiFeatureD & 0x00400000) {
- uiCPU |= WELS_CPU_MMXEXT;
- }
- if (uiFeatureD & 0x80000000) {
- uiCPU |= WELS_CPU_3DNOW;
- }
- }
-
- if (!strcmp ((const str_t*)chVenderName, CPU_Vender_INTEL)) { // confirmed_safe_unsafe_usage
- int32_t family, model;
-
- WelsCPUId (1, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
- family = ((uiFeatureA >> 8) & 0xf) + ((uiFeatureA >> 20) & 0xff);
- model = ((uiFeatureA >> 4) & 0xf) + ((uiFeatureA >> 12) & 0xf0);
-
- if ((family == 6) && (model == 9 || model == 13 || model == 14)) {
- uiCPU &= ~ (WELS_CPU_SSE2 | WELS_CPU_SSE3);
- }
- }
-
- // get cache line size
- if ((!strcmp ((const str_t*)chVenderName, CPU_Vender_INTEL))
- || ! (strcmp ((const str_t*)chVenderName, CPU_Vender_CYRIX))) { // confirmed_safe_unsafe_usage
- WelsCPUId (1, &uiFeatureA, &uiFeatureB, &uiFeatureC, &uiFeatureD);
-
- CacheLineSize = (uiFeatureB & 0xff00) >>
- 5; // ((clflush_line_size >> 8) << 3), CLFLUSH_line_size * 8 = CacheLineSize_in_byte
-
- if (CacheLineSize == 128) {
- uiCPU |= WELS_CPU_CACHELINE_128;
- } else if (CacheLineSize == 64) {
- uiCPU |= WELS_CPU_CACHELINE_64;
- } else if (CacheLineSize == 32) {
- uiCPU |= WELS_CPU_CACHELINE_32;
- } else if (CacheLineSize == 16) {
- uiCPU |= WELS_CPU_CACHELINE_16;
- }
- }
-
- return uiCPU;
-}
-
-
-void WelsCPURestore (const uint32_t kuiCPU) {
- if (kuiCPU & (WELS_CPU_MMX | WELS_CPU_MMXEXT | WELS_CPU_3DNOW | WELS_CPU_3DNOWEXT)) {
- WelsEmms();
- }
-}
-
-#endif
-
-
-WELSVP_NAMESPACE_END
-
-
--- a/codec/processing/src/common/cpu.h
+++ /dev/null
@@ -1,102 +1,0 @@
-/*!
- * \copy
- * Copyright (c) 2009-2013, Cisco Systems
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- * \file cpu.h
- *
- * \brief CPU feature compatibility detection
- *
- * \date 04/29/2009 Created
- *
- *************************************************************************************
- */
-
-#ifndef WELSVP_CPU_H
-#define WELSVP_CPU_H
-
-#include "typedef.h"
-
-WELSVP_NAMESPACE_BEGIN
-
-/*
- * WELS CPU feature flags
- */
-#define WELS_CPU_MMX 0x00000001 /* mmx */
-#define WELS_CPU_MMXEXT 0x00000002 /* mmx-ext*/
-#define WELS_CPU_SSE 0x00000004 /* sse */
-#define WELS_CPU_SSE2 0x00000008 /* sse 2 */
-#define WELS_CPU_SSE3 0x00000010 /* sse 3 */
-#define WELS_CPU_SSE41 0x00000020 /* sse 4.1 */
-#define WELS_CPU_3DNOW 0x00000040 /* 3dnow! */
-#define WELS_CPU_3DNOWEXT 0x00000080 /* 3dnow! ext */
-#define WELS_CPU_ALTIVEC 0x00000100 /* altivec */
-#define WELS_CPU_SSSE3 0x00000200 /* ssse3 */
-#define WELS_CPU_SSE42 0x00000400 /* sse 4.2 */
-
-/* CPU features application extensive */
-#define WELS_CPU_AVX 0x00000800 /* Advanced Vector eXtentions */
-#define WELS_CPU_FPU 0x00001000 /* x87-FPU on chip */
-#define WELS_CPU_HTT 0x00002000 /* Hyper-Threading Technology (HTT), Multi-threading enabled feature:
- physical processor package is capable of supporting more than one logic processor
- */
-#define WELS_CPU_CMOV 0x00004000 /* Conditional Move Instructions,
- also if x87-FPU is present at indicated by the CPUID.FPU feature bit, then FCOMI and FCMOV are supported
- */
-#define WELS_CPU_MOVBE 0x00008000 /* MOVBE instruction */
-#define WELS_CPU_AES 0x00010000 /* AES instruction extensions */
-#define WELS_CPU_FMA 0x00020000 /* AVX VEX FMA instruction sets */
-
-#define WELS_CPU_CACHELINE_16 0x10000000 /* CacheLine Size 16 */
-#define WELS_CPU_CACHELINE_32 0x20000000 /* CacheLine Size 32 */
-#define WELS_CPU_CACHELINE_64 0x40000000 /* CacheLine Size 64 */
-#define WELS_CPU_CACHELINE_128 0x80000000 /* CacheLine Size 128 */
-
-/*
- * Interfaces for CPU core feature detection as below
- */
-
-#ifdef X86_ASM
-WELSVP_EXTERN_C_BEGIN
-
-int32_t WelsCPUIdVerify();
-
-void WelsCPUId (uint32_t uiIndex, uint32_t* pFeatureA, uint32_t* pFeatureB, uint32_t* pFeatureC, uint32_t* pFeatureD);
-int32_t WelsCPUSupportAVX (uint32_t eax, uint32_t ecx);
-int32_t WelsCPUSupportFMA (uint32_t eax, uint32_t ecx);
-
-void WelsEmms();
-
-WELSVP_EXTERN_C_END
-#endif
-
-uint32_t WelsCPUFeatureDetect (int32_t* pNumberOfLogicProcessors);
-
-WELSVP_NAMESPACE_END
-
-#endif
--- a/codec/processing/src/vaacalc/vaacalculation.cpp
+++ b/codec/processing/src/vaacalc/vaacalculation.cpp
@@ -58,11 +58,11 @@
sVaaFuncs.pfVAACalcSadVar = VAACalcSadVar_c;
#ifdef X86_ASM
if ((iCpuFlag & WELS_CPU_SSE2) == WELS_CPU_SSE2) {
- /* sVaaFuncs.pfVAACalcSad = VAACalcSad_sse2;
- sVaaFuncs.pfVAACalcSadBgd = VAACalcSadBgd_sse2;
- sVaaFuncs.pfVAACalcSadSsd = VAACalcSadSsd_sse2;
- sVaaFuncs.pfVAACalcSadSsdBgd = VAACalcSadSsdBgd_sse2;
- sVaaFuncs.pfVAACalcSadVar = VAACalcSadVar_sse2;*/
+ sVaaFuncs.pfVAACalcSad = VAACalcSad_sse2;
+ sVaaFuncs.pfVAACalcSadBgd = VAACalcSadBgd_sse2;
+ sVaaFuncs.pfVAACalcSadSsd = VAACalcSadSsd_sse2;
+ sVaaFuncs.pfVAACalcSadSsdBgd = VAACalcSadSsdBgd_sse2;
+ sVaaFuncs.pfVAACalcSadVar = VAACalcSadVar_sse2;
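+    // SSE2 VAA kernels re-enabled now that the assembly above builds for both 32- and 64-bit targets.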
}
#endif//X86_ASM
}
--- a/codec/processing/targets.mk
+++ b/codec/processing/targets.mk
@@ -3,7 +3,6 @@
PROCESSING_CPP_SRCS=\
$(PROCESSING_SRCDIR)/./src/adaptivequantization/AdaptiveQuantization.cpp\
$(PROCESSING_SRCDIR)/./src/backgounddetection/BackgroundDetection.cpp\
- $(PROCESSING_SRCDIR)/./src/common/cpu.cpp\
$(PROCESSING_SRCDIR)/./src/common/memory.cpp\
$(PROCESSING_SRCDIR)/./src/common/thread.cpp\
$(PROCESSING_SRCDIR)/./src/common/util.cpp\