ref: 5c60e8f868eacd4747239dca5518e7761f2e570f
parent: 94cabe10d54021c8269d51ba3fa5d88c4a0607fe
author: Licai Guo <[email protected]>
date: Fri Mar 28 06:22:11 EDT 2014
Add ASM related functions for ME cross search. Add asm level functions. Add asm code for ME. Modify format. Add unit test for asm code. Modify function name and format. Remove unused comment. Modify targets file. Add macro protection for SSE41 function test. Modify according to review request.
--- a/codec/build/iOS/enc/welsenc/welsenc.xcodeproj/project.pbxproj
+++ b/codec/build/iOS/enc/welsenc/welsenc.xcodeproj/project.pbxproj
@@ -81,6 +81,7 @@
4C34066918C57D0400DFA14A /* memory_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = memory_neon.S; sourceTree = "<group>"; };
4C34066A18C57D0400DFA14A /* pixel_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = pixel_neon.S; sourceTree = "<group>"; };
4C34066B18C57D0400DFA14A /* reconstruct_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = reconstruct_neon.S; sourceTree = "<group>"; };
+ 4CDBFB9D18E5068D0025A767 /* wels_transpose_matrix.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = wels_transpose_matrix.h; sourceTree = "<group>"; };
4CE4431118B6FFA00017DF25 /* libwelsenc.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libwelsenc.a; sourceTree = BUILT_PRODUCTS_DIR; };
4CE4431418B6FFA00017DF25 /* Foundation.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Foundation.framework; path = System/Library/Frameworks/Foundation.framework; sourceTree = SDKROOT; };
4CE4432118B6FFA00017DF25 /* welsencTests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = welsencTests.xctest; sourceTree = BUILT_PRODUCTS_DIR; };
@@ -281,6 +282,7 @@
4CE446A918BC605C0017DF25 /* inc */ = {
isa = PBXGroup;
children = (
+ 4CDBFB9D18E5068D0025A767 /* wels_transpose_matrix.h */,
4CE446AA18BC605C0017DF25 /* as264_common.h */,
4CE446AB18BC605C0017DF25 /* au_set.h */,
4CE446AC18BC605C0017DF25 /* bit_stream.h */,
--- a/codec/encoder/core/inc/svc_motion_estimate.h
+++ b/codec/encoder/core/inc/svc_motion_estimate.h
@@ -199,11 +199,24 @@
const int32_t kiEncStride, const int32_t kiRefStride,
const int32_t kiMinPos, const int32_t kiMaxPos,
const bool bVerticalSearch );
+#ifdef X86_ASM
+extern "C"
+{
+uint32_t SampleSad8x8Hor8_sse41 (uint8_t*, int32_t, uint8_t*, int32_t, uint16_t*, int32_t*);
+uint32_t SampleSad16x16Hor8_sse41 (uint8_t*, int32_t, uint8_t*, int32_t, uint16_t*, int32_t*);
+}
+
void VerticalFullSearchUsingSSE41( void *pFunc, void *vpMe,
uint16_t* pMvdTable, const int32_t kiFixedMvd,
const int32_t kiEncStride, const int32_t kiRefStride,
const int32_t kiMinPos, const int32_t kiMaxPos,
const bool bVerticalSearch );
+void HorizontalFullSearchUsingSSE41( void *pFunc, void *vpMe,
+ uint16_t* pMvdTable, const int32_t kiFixedMvd,
+ const int32_t kiEncStride, const int32_t kiRefStride,
+ const int32_t kiMinPos, const int32_t kiMaxPos,
+ const bool bVerticalSearch );
+#endif
void WelsMotionCrossSearch(SWelsFuncPtrList *pFuncList, SDqLayer* pCurLayer, SWelsME * pMe, const SSlice* pSlice);
// Feature Search Basics
--- a/codec/encoder/core/inc/wels_const.h
+++ b/codec/encoder/core/inc/wels_const.h
@@ -87,6 +87,7 @@
#define PARA_SET_TYPE_SUBSETSPS 1
#define PARA_SET_TYPE_PPS 2
+#define MAX_VERTICAL_MV_RANGE 1024 //TODO: revisit; sized so that enough memory is allocated for the transposed reference matrix
#define MAX_FRAME_RATE 30 // maximal frame rate to support
#define MIN_FRAME_RATE 1 // minimal frame rate need support
--- a/codec/encoder/core/inc/wels_func_ptr_def.h
+++ b/codec/encoder/core/inc/wels_func_ptr_def.h
@@ -134,6 +134,7 @@
typedef int32_t (*PIntraPred8x8Combined3Func) (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*,
uint8_t*, uint8_t*);
+typedef uint32_t (*PSampleSadHor8Func)( uint8_t*, int32_t, uint8_t*, int32_t, uint16_t*, int32_t* );
typedef void (*PMotionSearchFunc) (SWelsFuncPtrList* pFuncList, void* pCurDqLayer, void* pMe,
void* pSlice);
typedef void (*PSearchMethodFunc) (SWelsFuncPtrList* pFuncList, void* pMe, void* pSlice, const int32_t kiEncStride, const int32_t kiRefStride);
@@ -202,14 +203,16 @@
PGetIntraPredFunc pfGetLumaI4x4Pred[I4_PRED_A];
PGetIntraPredFunc pfGetChromaPred[C_PRED_A];
+ PSampleSadHor8Func pfSampleSadHor8[2]; // 0: for 8x8 square; 1: for 16x16 square
PMotionSearchFunc
pfMotionSearch[BLOCK_STATIC_IDC_ALL]; //svc_encode_slice.c svc_mode_decision.c svc_enhance_layer_md.c svc_base_layer_md.c
PSearchMethodFunc pfSearchMethod[BLOCK_SIZE_ALL];
PCalculateSatdFunc pfCalculateSatd;
PCheckDirectionalMv pfCheckDirectionalMv;
- PLineFullSearchFunc pfLineFullSearch;
PCalculateBlockFeatureOfFrame pfCalculateBlockFeatureOfFrame[2];//0 - for 8x8, 1 for 16x16
PCalculateSingleBlockFeature pfCalculateSingleBlockFeature[2];//0 - for 8x8, 1 for 16x16
+ PLineFullSearchFunc pfVerticalFullSearch;
+ PLineFullSearchFunc pfHorizontalFullSearch;
PCopyFunc pfCopy16x16Aligned; //svc_encode_slice.c svc_mode_decision.c svc_base_layer_md.c
PCopyFunc pfCopy16x16NotAligned; //md.c
--- /dev/null
+++ b/codec/encoder/core/inc/wels_transpose_matrix.h
@@ -1,0 +1,54 @@
+/*!
+ * \copy
+ * Copyright (c) 2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#ifndef WELS_RUBY_ENCODER_TRANSPOSE_MATRIX_H__
+#define WELS_RUBY_ENCODER_TRANSPOSE_MATRIX_H__
+
+#include "typedefs.h"
+
+namespace WelsSVCEnc {
+
+#ifdef X86_ASM
+extern "C"
+{
+void TransposeMatrixBlocksx16_sse2( void *pDst, const int32_t kiDstStride, void *pSrc, const int32_t kiSrcStride, const int32_t kiBlocksNum ); // transposes consecutive groups of 16 source rows (16 bytes wide); each group lands 16 bytes further right in pDst
+void TransposeMatrixBlock16x16_sse2( void *pDst, const int32_t kiDstStride, void *pSrc, const int32_t kiSrcStride ); // transposes one 16x16 byte block
+void TransposeMatrixBlocksx8_mmx( void *pDst, const int32_t kiDstStride, void *pSrc, const int32_t kiSrcStride, const int32_t kiBlocksNum ); // transposes consecutive groups of 8 source rows (8 bytes wide); each group lands 8 bytes further right in pDst
+void TransposeMatrixBlock8x8_mmx( void *pDst, const int32_t kiDstStride, void *pSrc, const int32_t kiSrcStride ); // transposes one 8x8 byte block
+}
+#endif
+
+typedef void (*PTransposeMatrixBlockFunc)( void *pDst, const int32_t kiDstStride, void *pSrc, const int32_t kiSrcStride ); // signature shared by the single-block transpose routines
+typedef void (*PTransposeMatrixBlocksFunc)( void *pDst, const int32_t kiDstStride, void *pSrc, const int32_t kiSrcStride, const int32_t kiBlocksNum ); // signature shared by the multi-block transpose routines
+
+}// end of namespace WelsSVCEnc
+
+#endif//WELS_RUBY_ENCODER_TRANSPOSE_MATRIX_H__
--- a/codec/encoder/core/src/svc_motion_estimate.cpp
+++ b/codec/encoder/core/src/svc_motion_estimate.cpp
@@ -41,6 +41,7 @@
#include "cpu_core.h"
#include "ls_defines.h"
#include "svc_motion_estimate.h"
+#include "wels_transpose_matrix.h"
namespace WelsSVCEnc {
@@ -65,8 +66,14 @@
pFuncList->pfCheckDirectionalMv = CheckDirectionalMv;
//for cross serarch
- pFuncList->pfLineFullSearch = LineFullSearch_c;
+ pFuncList->pfVerticalFullSearch = LineFullSearch_c;
+ pFuncList->pfHorizontalFullSearch = LineFullSearch_c;
+#if defined (X86_ASM)
if ( uiCpuFlag & WELS_CPU_SSE41 ) {
+ pFuncList->pfSampleSadHor8[0] = SampleSad8x8Hor8_sse41;
+ pFuncList->pfSampleSadHor8[1] = SampleSad16x16Hor8_sse41;
+ pFuncList->pfVerticalFullSearch = VerticalFullSearchUsingSSE41;
+ pFuncList->pfHorizontalFullSearch = HorizontalFullSearchUsingSSE41;
}
//for feature search
@@ -75,6 +82,7 @@
//TODO: it is possible to differentiate width that is times of 8, so as to accelerate the speed when width is times of 8?
pFuncList->pfCalculateSingleBlockFeature[0] = SumOf8x8SingleBlock_c;
pFuncList->pfCalculateSingleBlockFeature[1] = SumOf16x16SingleBlock_c;
+#endif
}
}
@@ -302,6 +310,17 @@
/////////////////////////
// Cross Search Basics
/////////////////////////
+#if defined (X86_ASM)
+// Fills pMvdCost[0..7] with the costs of eight consecutive positions that
+// start at kiStartMv: each entry is the table value for that position plus
+// kiFixedCost, stored as uint16_t (so the sum is truncated to 16 bits).
+void CalcMvdCostx8_c( uint16_t *pMvdCost, const int32_t kiStartMv, uint16_t* pMvdTable, const uint16_t kiFixedCost )
+{
+  // The table is addressed at (position << 2), i.e. four entries per position.
+  const uint16_t *pEntry = pMvdTable + (kiStartMv<<2);
+  uint16_t *pOut = pMvdCost;
+  uint16_t *const pOutEnd = pMvdCost + 8;
+  while (pOut != pOutEnd) {
+    *pOut++ = ((*pEntry) + kiFixedCost);
+    pEntry += 4;
+  }
+}
void VerticalFullSearchUsingSSE41( void *pFunc, void *vpMe,
uint16_t* pMvdTable, const int32_t kiFixedMvd,
const int32_t kiEncStride, const int32_t kiRefStride,
@@ -308,12 +327,130 @@
const int32_t kiMinPos, const int32_t kiMaxPos,
const bool bVerticalSearch ) { // vertical line search over [kiMinPos, kiMaxPos): transposes source block and reference rows so the horizontal 8-candidate SAD kernel can scan vertical positions; bVerticalSearch is not read here
SWelsFuncPtrList *pFuncList = static_cast<SWelsFuncPtrList *>(pFunc);
- SWelsME *pMe = static_cast<SWelsME *>(vpMe);
+ SWelsME *pMe = static_cast<SWelsME *>(vpMe);
+ uint8_t* kpEncMb = pMe->pEncMb;
+ const int32_t kiCurMeBlockPix = pMe->iCurMeBlockPixY;
+ uint8_t* pRef = &pMe->pColoRefMb[(kiMinPos - kiCurMeBlockPix)*kiRefStride]; // reference row that corresponds to position kiMinPos
+ const int32_t kIsBlock16x16 = pMe->uiBlockSize == BLOCK_16x16;
+ const int32_t kiEdgeBlocks = kIsBlock16x16 ? 16 : 8;
+ PSampleSadHor8Func pSampleSadHor8 = pFuncList->pfSampleSadHor8[kIsBlock16x16]; // index 1 = 16x16 kernel, index 0 = 8x8 kernel
+ PSampleSadSatdCostFunc pSad = pFuncList->sSampleDealingFuncs.pfSampleSad[pMe->uiBlockSize];
+ PTransposeMatrixBlockFunc TransposeMatrixBlock = kIsBlock16x16 ? TransposeMatrixBlock16x16_sse2 : TransposeMatrixBlock8x8_mmx;
+ PTransposeMatrixBlocksFunc TransposeMatrixBlocks= kIsBlock16x16 ? TransposeMatrixBlocksx16_sse2 : TransposeMatrixBlocksx8_mmx;
+
+ const int32_t kiDiff = kiMaxPos - kiMinPos; // number of candidate positions
+ const int32_t kiRowNum = WELS_ALIGN((kiDiff - kiEdgeBlocks + 1), kiEdgeBlocks); // reference rows to transpose; presumably rounded up to a multiple of kiEdgeBlocks - TODO confirm WELS_ALIGN
+ const int32_t kiBlocksNum = kIsBlock16x16 ? (kiRowNum>>4) : (kiRowNum>>3);
+ int32_t iCountLoop8 = (kiRowNum-kiEdgeBlocks) >> 3; // number of 8-candidate batches run through the SIMD kernel
+ const int32_t kiRemainingVectors = kiDiff - (iCountLoop8<<3); // candidates not covered by the batches above
+ const int32_t kiMatrixStride = MAX_VERTICAL_MV_RANGE;
+ ENFORCE_STACK_ALIGN_2D( uint8_t, uiMatrixRef, 16, kiMatrixStride, 16 ); // transpose matrix result for ref
+ ENFORCE_STACK_ALIGN_2D( uint8_t, uiMatrixEnc, 16, 16, 16 ); // transpose matrix result for enc
+ assert(kiRowNum <= kiMatrixStride); // make sure effective memory
+
+ TransposeMatrixBlock( &uiMatrixEnc[0][0], 16, kpEncMb, kiEncStride );
+ TransposeMatrixBlocks( &uiMatrixRef[0][0], kiMatrixStride, pRef, kiRefStride, kiBlocksNum );
+ ENFORCE_STACK_ALIGN_1D( uint16_t, uiBaseCost, 8, 16 );
+ int32_t iTargetPos = kiMinPos;
+ int16_t iBestPos = pMe->sMv.iMvX; // NOTE(review): seeded from iMvX in a vertical search; the value is only read after a better candidate has overwritten it
+ uint32_t uiBestCost = pMe->uiSadCost;
+ uint32_t uiCostMin;
+ int32_t iIndexMinPos;
+ kpEncMb = &uiMatrixEnc[0][0]; // SIMD pass works on the transposed copies
+ pRef = &uiMatrixRef[0][0];
+
+ while(iCountLoop8 > 0) {
+ CalcMvdCostx8_c(uiBaseCost, iTargetPos, pMvdTable, kiFixedMvd);
+ uiCostMin = pSampleSadHor8( kpEncMb, 16, pRef, kiMatrixStride, uiBaseCost, &iIndexMinPos );
+ if (uiCostMin < uiBestCost) {
+ uiBestCost = uiCostMin;
+ iBestPos = iTargetPos+iIndexMinPos;
+ }
+ iTargetPos += 8;
+ pRef += 8; // 8 columns of the transposed matrix = 8 rows of the original reference
+ -- iCountLoop8;
+ }
+ if (kiRemainingVectors > 0) {
+ kpEncMb = pMe->pEncMb; // remaining candidates are evaluated one by one on the untransposed buffers
+ pRef = &pMe->pColoRefMb[(iTargetPos - kiCurMeBlockPix)*kiRefStride];
+ while (iTargetPos < kiMaxPos) {
+ const uint16_t pMvdCost = pMvdTable[iTargetPos<<2];
+ uint32_t uiSadCost = pSad( kpEncMb, kiEncStride, pRef, kiRefStride ) + (kiFixedMvd + pMvdCost);
+ if (uiSadCost < uiBestCost) {
+ uiBestCost = uiSadCost;
+ iBestPos = iTargetPos;
+ }
+ pRef += kiRefStride;
+ ++iTargetPos;
+ }
+ }
+ if (uiBestCost < pMe->uiSadCost) { // report only a strict improvement over the incoming cost
+ SMVUnitXY sBestMv;
+ sBestMv.iMvX = 0;
+ sBestMv.iMvY = iBestPos - kiCurMeBlockPix;
+ UpdateMeResults( sBestMv, uiBestCost, &pMe->pColoRefMb[sBestMv.iMvY*kiRefStride], pMe );
+ }
}
-void LineFullSearch_c( void *pFunc, void *vpMe,
- uint16_t* pMvdTable, const int32_t kiFixedMvd,
- const int32_t kiEncStride, const int32_t kiRefStride,
- const int32_t kiMinPos, const int32_t kiMaxPos,
+
+// Horizontal line full search built on the SSE4.1 8-candidate SAD kernels.
+//
+// Evaluates every position in [kiMinPos, kiMaxPos) along the row that starts
+// at pMe->pColoRefMb. Candidates are processed in batches of eight through
+// pfSampleSadHor8; the positions left over are evaluated one at a time with
+// pfSampleSad. When a candidate is strictly cheaper than pMe->uiSadCost the
+// result is reported through UpdateMeResults with a purely horizontal MV.
+//
+//   pFunc            SWelsFuncPtrList* providing the SAD kernels
+//   vpMe             SWelsME* describing the block being searched
+//   pMvdTable        MVD cost table, addressed with (position << 2)
+//   kiFixedMvd       cost added to every candidate
+//   kiEncStride      stride of pMe->pEncMb
+//   kiRefStride      stride of the reference picture
+//   kiMinPos         first position searched
+//   kiMaxPos         one past the last position searched
+//   bVerticalSearch  not read by this function
+void HorizontalFullSearchUsingSSE41( void *pFunc, void *vpMe,
+                                     uint16_t* pMvdTable, const int32_t kiFixedMvd,
+                                     const int32_t kiEncStride, const int32_t kiRefStride,
+                                     const int32_t kiMinPos, const int32_t kiMaxPos,
+                                     const bool bVerticalSearch )
+{
+  SWelsFuncPtrList *pFuncList = static_cast<SWelsFuncPtrList *>(pFunc);
+  SWelsME *pMe = static_cast<SWelsME *>(vpMe);
+  uint8_t *kpEncMb = pMe->pEncMb;
+  const int32_t kiCurMeBlockPix = pMe->iCurMeBlockPixX;
+  // Reference pixel that corresponds to position kiMinPos.
+  uint8_t *pRef = &pMe->pColoRefMb[kiMinPos - kiCurMeBlockPix];
+  const int32_t kIsBlock16x16 = pMe->uiBlockSize == BLOCK_16x16;
+  // Index 1 selects the 16x16 kernel, index 0 the 8x8 kernel.
+  PSampleSadHor8Func pSampleSadHor8 = pFuncList->pfSampleSadHor8[kIsBlock16x16];
+  PSampleSadSatdCostFunc pSad = pFuncList->sSampleDealingFuncs.pfSampleSad[pMe->uiBlockSize];
+  ENFORCE_STACK_ALIGN_1D( uint16_t, uiBaseCost, 8, 16 );
+  const int32_t kiNumVector = kiMaxPos - kiMinPos;
+  int32_t iCountLoop8 = kiNumVector >> 3;          // full batches of eight candidates
+  const int32_t kiRemainingLoop8 = kiNumVector & 7; // candidates left after the batches
+  int32_t iTargetPos = kiMinPos;
+  int16_t iBestPos = pMe->sMv.iMvX;
+  uint32_t uiBestCost = pMe->uiSadCost;
+  uint32_t uiCostMin;
+  int32_t iIndexMinPos;
+
+  // Batched pass: eight neighbouring positions per kernel call.
+  while (iCountLoop8 > 0) {
+    CalcMvdCostx8_c(uiBaseCost, iTargetPos, pMvdTable, kiFixedMvd);
+    uiCostMin = pSampleSadHor8( kpEncMb, kiEncStride, pRef, kiRefStride, uiBaseCost, &iIndexMinPos );
+    if (uiCostMin < uiBestCost) {
+      uiBestCost = uiCostMin;
+      iBestPos = iTargetPos+iIndexMinPos;
+    }
+    iTargetPos += 8;
+    pRef += 8;
+    -- iCountLoop8;
+  }
+  // Tail pass: remaining positions, one SAD call each.
+  if ( kiRemainingLoop8 > 0 ) {
+    while (iTargetPos < kiMaxPos) {
+      const uint16_t pMvdCost = pMvdTable[iTargetPos<<2];
+      uint32_t uiSadCost = pSad( kpEncMb, kiEncStride, pRef, kiRefStride ) + (kiFixedMvd + pMvdCost);
+      if (uiSadCost < uiBestCost) {
+        uiBestCost = uiSadCost;
+        iBestPos = iTargetPos;
+      }
+      ++pRef;
+      ++iTargetPos;
+    }
+  }
+  if (uiBestCost < pMe->uiSadCost) {
+    SMVUnitXY sBestMv;
+    sBestMv.iMvX = iBestPos - kiCurMeBlockPix;
+    sBestMv.iMvY = 0;
+    // The matched reference block is displaced horizontally, so the pointer
+    // must be offset by iMvX; iMvY is always 0 here and would leave it at the
+    // co-located block.
+    UpdateMeResults( sBestMv, uiBestCost, &pMe->pColoRefMb[sBestMv.iMvX], pMe );
+  }
+}
+#endif
+void LineFullSearch_c( void *pFunc, void *vpMe,
+ uint16_t* pMvdTable, const int32_t kiFixedMvd,
+ const int32_t kiEncStride, const int32_t kiRefStride,
+ const int32_t kiMinPos, const int32_t kiMaxPos,
const bool bVerticalSearch ) {
SWelsFuncPtrList *pFuncList = static_cast<SWelsFuncPtrList *>(pFunc);
SWelsME *pMe = static_cast<SWelsME *>(vpMe);
@@ -346,8 +483,8 @@
void WelsMotionCrossSearch(SWelsFuncPtrList *pFuncList, SWelsME * pMe,
const SSlice* pSlice, const int32_t kiEncStride, const int32_t kiRefStride) {
- PLineFullSearchFunc pfVerticalFullSearchFunc = pFuncList->pfLineFullSearch;
- PLineFullSearchFunc pfHorizontalFullSearchFunc = pFuncList->pfLineFullSearch;
+ PLineFullSearchFunc pfVerticalFullSearchFunc = pFuncList->pfVerticalFullSearch;
+ PLineFullSearchFunc pfHorizontalFullSearchFunc = pFuncList->pfHorizontalFullSearch;
const int32_t iCurMeBlockPixX = pMe->iCurMeBlockPixX;
const int32_t iCurMeBlockQpelPixX = ((iCurMeBlockPixX)<<2);
--- /dev/null
+++ b/codec/encoder/core/x86/matrix_transpose.asm
@@ -1,0 +1,395 @@
+;*!
+;* \copy
+;* Copyright (c) 2009-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*************************************************************************/
+
+%include "asm_inc.asm"
+
+;in: m0, m1, m2, m3, m4, m5, m6, m7 (%1..%8 registers; %9 = memory operand read as the last input row; %10 = memory scratch slot used for spills)
+;out: m0, m3, m5, m2, m7, m1, m6, m4 (register order of the transposed rows when called with mm0..mm7)
+%macro TRANSPOSE_8x8B_MMX 10 ; byte transpose of an 8x8 block in three interleave stages: bw, wd, dq
+ MMX_XSwap bw, %1, %2, %8
+ MMX_XSwap bw, %3, %4, %2
+ MMX_XSwap bw, %5, %6, %4
+ movq %6, %9 ; fetch the input row supplied through memory operand %9
+ movq %10, %4 ; spill %4 to the scratch slot so it can serve as a temporary
+ MMX_XSwap bw, %7, %6, %4
+
+ MMX_XSwap wd, %1, %3, %6
+ MMX_XSwap wd, %8, %2, %3
+ MMX_XSwap wd, %5, %7, %2
+ movq %7, %10 ; reload the value spilled above
+ movq %10, %3 ; spill %3
+ MMX_XSwap wd, %7, %4, %3
+
+ MMX_XSwap dq, %1, %5, %4
+ MMX_XSwap dq, %6, %2, %5
+ MMX_XSwap dq, %8, %7, %2
+ movq %7, %10 ; reload the value spilled above
+ movq %10, %5 ; spill %5
+ MMX_XSwap dq, %7, %3, %5
+
+ movq %3, %10 ; final reload from the scratch slot
+%endmacro ; end of TRANSPOSE_8x8B_MMX
+
+;in: m0, m3, m5, m2, m7, m1, m6, m4 (rows 1..8 in this register order); stores eight 8-byte rows and leaves %1 advanced by 6*%2
+%macro TRANSPOSE8x8_WRITE_MMX 2 ; dst, dst_stride
+ movq [%1], mm0 ; result of line 1, x8 bytes
+ movq [%1+%2], mm3 ; result of line 2
+ lea %1, [%1+2*%2]
+ movq [%1], mm5 ; result of line 3
+ movq [%1+%2], mm2 ; result of line 4
+ lea %1, [%1+2*%2]
+ movq [%1], mm7 ; result of line 5
+ movq [%1+%2], mm1 ; result of line 6
+ lea %1, [%1+2*%2]
+ movq [%1], mm6 ; result of line 7
+ movq [%1+%2], mm4 ; result of line 8
+%endmacro ; end of TRANSPOSE8x8_WRITE_MMX
+
+;in: m0, m3, m5, m2, m7, m1, m6, m4 (rows 1..8 in this register order); stores eight 8-byte rows, walking the rows through %3 so that %1 is left unchanged
+%macro TRANSPOSE8x8_WRITE_ALT_MMX 3 ; dst, dst_stride, reg32 (clobbered: used as the moving row pointer)
+ movq [%1], mm0 ; result of line 1, x8 bytes
+ movq [%1+%2], mm3 ; result of line 2
+ lea %3, [%1+2*%2]
+ movq [%3], mm5 ; result of line 3
+ movq [%3+%2], mm2 ; result of line 4
+ lea %3, [%3+2*%2]
+ movq [%3], mm7 ; result of line 5
+ movq [%3+%2], mm1 ; result of line 6
+ lea %3, [%3+2*%2]
+ movq [%3], mm6 ; result of line 7
+ movq [%3+%2], mm4 ; result of line 8
+%endmacro ; end of TRANSPOSE8x8_WRITE_ALT_MMX
+
+; for transpose 16x8
+
+;in: m0, m1, m2, m3, m4, m5, m6, m7 (%1..%8 registers; %9 = memory operand read as the last input row; %10 = 16-byte-aligned memory scratch slot used for spills)
+;out: m4, m2, m3, m7, m5, m1, m6, m0 (register order of the result when called with xmm0..xmm7; each register carries two 8-byte output rows)
+%macro TRANSPOSE_8x16B_SSE2 10 ; byte transpose of eight 16-byte rows in four interleave stages: bw, wd, dq, qdq
+ SSE2_XSawp bw, %1, %2, %8
+ SSE2_XSawp bw, %3, %4, %2
+ SSE2_XSawp bw, %5, %6, %4
+ movdqa %6, %9 ; fetch the input row supplied through memory operand %9 (aligned load)
+ movdqa %10, %4 ; spill %4 to the scratch slot so it can serve as a temporary
+ SSE2_XSawp bw, %7, %6, %4
+
+ SSE2_XSawp wd, %1, %3, %6
+ SSE2_XSawp wd, %8, %2, %3
+ SSE2_XSawp wd, %5, %7, %2
+ movdqa %7, %10 ; reload the value spilled above
+ movdqa %10, %3 ; spill %3
+ SSE2_XSawp wd, %7, %4, %3
+
+ SSE2_XSawp dq, %1, %5, %4
+ SSE2_XSawp dq, %6, %2, %5
+ SSE2_XSawp dq, %8, %7, %2
+ movdqa %7, %10 ; reload the value spilled above
+ movdqa %10, %5 ; spill %5
+ SSE2_XSawp dq, %7, %3, %5
+
+ SSE2_XSawp qdq, %1, %8, %3
+ SSE2_XSawp qdq, %4, %2, %8
+ SSE2_XSawp qdq, %6, %7, %2
+ movdqa %7, %10 ; reload the value spilled above
+ movdqa %10, %1 ; spill %1
+ SSE2_XSawp qdq, %7, %5, %1
+ movdqa %5, %10 ; final reload from the scratch slot
+%endmacro ; end of TRANSPOSE_8x16B_SSE2
+
+
+%macro TRANSPOSE8x16_WRITE_SSE2 2 ; dst, dst_stride; stores 16 rows of 8 bytes from xmm4,2,3,7,5,1,6,0 and leaves %1 advanced by 14*%2
+ movq [%1], xmm4 ; result of line 1, x8 bytes
+ movq [%1+%2], xmm2 ; result of line 2
+ lea %1, [%1+2*%2]
+ movq [%1], xmm3 ; result of line 3
+ movq [%1+%2], xmm7 ; result of line 4
+
+ lea %1, [%1+2*%2]
+ movq [%1], xmm5 ; result of line 5
+ movq [%1+%2], xmm1 ; result of line 6
+ lea %1, [%1+2*%2]
+ movq [%1], xmm6 ; result of line 7
+ movq [%1+%2], xmm0 ; result of line 8
+
+ lea %1, [%1+2*%2]
+ movhpd [%1], xmm4 ; result of line 9 (high 8 bytes of the same registers from here on)
+ movhpd [%1+%2], xmm2 ; result of line 10
+ lea %1, [%1+2*%2]
+ movhpd [%1], xmm3 ; result of line 11
+ movhpd [%1+%2], xmm7 ; result of line 12
+
+ lea %1, [%1+2*%2]
+ movhpd [%1], xmm5 ; result of line 13
+ movhpd [%1+%2], xmm1 ; result of line 14
+ lea %1, [%1+2*%2]
+ movhpd [%1], xmm6 ; result of line 15
+ movhpd [%1+%2], xmm0 ; result of line 16
+%endmacro ; end of TRANSPOSE8x16_WRITE_SSE2
+
+%macro TRANSPOSE8x16_WRITE_ALT_SSE2 3 ; dst, dst_stride, reg32; stores 16 rows of 8 bytes from xmm4,2,3,7,5,1,6,0, walking the rows through %3 (clobbered) so that %1 is left unchanged
+ movq [%1], xmm4 ; result of line 1, x8 bytes
+ movq [%1+%2], xmm2 ; result of line 2
+ lea %3, [%1+2*%2]
+ movq [%3], xmm3 ; result of line 3
+ movq [%3+%2], xmm7 ; result of line 4
+
+ lea %3, [%3+2*%2]
+ movq [%3], xmm5 ; result of line 5
+ movq [%3+%2], xmm1 ; result of line 6
+ lea %3, [%3+2*%2]
+ movq [%3], xmm6 ; result of line 7
+ movq [%3+%2], xmm0 ; result of line 8
+
+ lea %3, [%3+2*%2]
+ movhpd [%3], xmm4 ; result of line 9 (high 8 bytes of the same registers from here on)
+ movhpd [%3+%2], xmm2 ; result of line 10
+ lea %3, [%3+2*%2]
+ movhpd [%3], xmm3 ; result of line 11
+ movhpd [%3+%2], xmm7 ; result of line 12
+
+ lea %3, [%3+2*%2]
+ movhpd [%3], xmm5 ; result of line 13
+ movhpd [%3+%2], xmm1 ; result of line 14
+ lea %3, [%3+2*%2]
+ movhpd [%3], xmm6 ; result of line 15
+ movhpd [%3+%2], xmm0 ; result of line 16
+%endmacro ; end of TRANSPOSE8x16_WRITE_ALT_SSE2
+
+
+SECTION .text
+
+WELS_EXTERN TransposeMatrixBlock16x16_sse2
+; void TransposeMatrixBlock16x16_sse2( void *dst/*16x16*/, const int32_t dst_stride, void *src/*16x16*/, const int32_t src_stride ); used below as r0 = dst, r1 = dst_stride, r2 = src, r3 = src_stride
+ push r4
+ push r5
+ %assign push_num 2
+ LOAD_4_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+
+ mov r4, r7 ; r7 is presumably the stack pointer alias from asm_inc.asm - TODO confirm
+ and r4, 0Fh ; r4 = misalignment of r7 relative to 16 bytes
+ sub r7, 10h
+ sub r7, r4 ; [r7] is now a 16-byte-aligned, 16-byte scratch slot for the transpose macro
+ lea r5, [r3+r3*2] ; r5 = 3 * src_stride
+ ; top 8x16 block: source rows 0-7 (movdqa: every source row must be 16-byte aligned)
+ movdqa xmm0, [r2]
+ movdqa xmm1, [r2+r3]
+ movdqa xmm2, [r2+r3*2]
+ movdqa xmm3, [r2+r5]
+ lea r2, [r2+r3*4]
+ movdqa xmm4, [r2]
+ movdqa xmm5, [r2+r3]
+ movdqa xmm6, [r2+r3*2]
+
+ ;in: m0, m1, m2, m3, m4, m5, m6, m7
+ ;out: m4, m2, m3, m7, m5, m1, m6, m0
+ TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r5], [r7]
+
+ TRANSPOSE8x16_WRITE_SSE2 r0, r1
+
+ ; bottom 8x16 block: source rows 8-15
+ lea r2, [r2+r3*4]
+ movdqa xmm0, [r2]
+ movdqa xmm1, [r2+r3]
+ movdqa xmm2, [r2+r3*2]
+ movdqa xmm3, [r2+r5]
+ lea r2, [r2+r3*4]
+ movdqa xmm4, [r2]
+ movdqa xmm5, [r2+r3]
+ movdqa xmm6, [r2+r3*2]
+
+ ;in: m0, m1, m2, m3, m4, m5, m6, m7
+ ;out: m4, m2, m3, m7, m5, m1, m6, m0
+ TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r5], [r7]
+
+ mov r5, r1 ; the write macro left r0 advanced by 14 rows: rewind to row 0 ...
+ sal r5, 4
+ sub r0, r5
+ lea r0, [r0+r1*2+8] ; ... and step 8 bytes right, so the second half fills columns 8-15
+ TRANSPOSE8x16_WRITE_SSE2 r0, r1
+
+ add r7, r4 ; release the aligned scratch slot
+ add r7, 10h
+ POP_XMM
+ LOAD_4_PARA_POP
+ pop r5
+ pop r4
+ ret
+
+WELS_EXTERN TransposeMatrixBlocksx16_sse2
+; void TransposeMatrixBlocksx16_sse2( void *dst/*W16x16*/, const int32_t dst_stride, void *src/*16xW16*/, const int32_t src_stride, const int32_t num_blocks ); used below as r0 = dst, r1 = dst_stride, r2 = src, r3 = src_stride, r4 = num_blocks
+ push r5
+ push r6
+ %assign push_num 2
+ LOAD_5_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+ mov r5, r7 ; r7 is presumably the stack pointer alias from asm_inc.asm - TODO confirm
+ and r5, 0Fh ; r5 = misalignment of r7 relative to 16 bytes
+ sub r7, 10h
+ sub r7, r5 ; [r7] is now a 16-byte-aligned, 16-byte scratch slot for the transpose macro
+TRANSPOSE_LOOP_SSE2:
+ ; explicitly loading next loop data: touches 16 rows starting at r2 + 8*src_stride, the loaded values are discarded. NOTE(review): in the last iteration these reads go past the 16 rows being transposed, so that memory must be readable
+ lea r6, [r2+r3*8]
+ push r4 ; r4 (block counter) is borrowed as the throw-away load target
+%rep 8
+ mov r4, [r6]
+ mov r4, [r6+r3]
+ lea r6, [r6+r3*2]
+%endrep
+ pop r4
+ ; top 8x16 block: source rows 0-7 of this block (movdqa: rows must be 16-byte aligned)
+ movdqa xmm0, [r2]
+ movdqa xmm1, [r2+r3]
+ lea r2, [r2+r3*2]
+ movdqa xmm2, [r2]
+ movdqa xmm3, [r2+r3]
+ lea r2, [r2+r3*2]
+ movdqa xmm4, [r2]
+ movdqa xmm5, [r2+r3]
+ lea r2, [r2+r3*2]
+ movdqa xmm6, [r2]
+
+ ;in: m0, m1, m2, m3, m4, m5, m6, m7
+ ;out: m4, m2, m3, m7, m5, m1, m6, m0
+ TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r3], [r7]
+ TRANSPOSE8x16_WRITE_ALT_SSE2 r0, r1, r6
+ lea r2, [r2+r3*2]
+
+ ; bottom 8x16 block: source rows 8-15 of this block
+ movdqa xmm0, [r2]
+ movdqa xmm1, [r2+r3]
+ lea r2, [r2+r3*2]
+ movdqa xmm2, [r2]
+ movdqa xmm3, [r2+r3]
+ lea r2, [r2+r3*2]
+ movdqa xmm4, [r2]
+ movdqa xmm5, [r2+r3]
+ lea r2, [r2+r3*2]
+ movdqa xmm6, [r2]
+
+ ;in: m0, m1, m2, m3, m4, m5, m6, m7
+ ;out: m4, m2, m3, m7, m5, m1, m6, m0
+ TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r3], [r7]
+ TRANSPOSE8x16_WRITE_ALT_SSE2 r0+8, r1, r6
+ lea r2, [r2+r3*2] ; r2 now points at the first row of the next block
+ lea r0, [r0+16] ; the next block lands 16 bytes further right in every dst row
+ dec r4
+ jg near TRANSPOSE_LOOP_SSE2 ; counter is tested after the body, so one block is processed even when num_blocks <= 0
+
+ add r7, r5 ; release the aligned scratch slot
+ add r7, 10h
+ POP_XMM
+ LOAD_5_PARA_POP
+ pop r6
+ pop r5
+ ret
+
+WELS_EXTERN TransposeMatrixBlock8x8_mmx
+; void TransposeMatrixBlock8x8_mmx( void *dst/*8x8*/, const int32_t dst_stride, void *src/*8x8*/, const int32_t src_stride ); used below as r0 = dst, r1 = dst_stride, r2 = src, r3 = src_stride
+ %assign push_num 0
+ LOAD_4_PARA
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ sub r7, 8 ; reserve an 8-byte scratch slot at [r7] for the transpose macro
+
+ movq mm0, [r2] ; source rows 0-6 go to mm0-mm6; row 7 is passed to the macro as a memory operand
+ movq mm1, [r2+r3]
+ lea r2, [r2+2*r3]
+ movq mm2, [r2]
+ movq mm3, [r2+r3]
+ lea r2, [r2+2*r3]
+ movq mm4, [r2]
+ movq mm5, [r2+r3]
+ lea r2, [r2+2*r3]
+ movq mm6, [r2]
+
+ ;in: m0, m1, m2, m3, m4, m5, m6, m7
+ ;out: m0, m3, m5, m2, m7, m1, m6, m4
+ TRANSPOSE_8x8B_MMX mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, [r2+r3], [r7]
+
+ TRANSPOSE8x8_WRITE_MMX r0, r1
+
+ emms ; leave MMX state before returning
+ add r7, 8 ; release the scratch slot
+ LOAD_4_PARA_POP
+ ret
+
+WELS_EXTERN TransposeMatrixBlocksx8_mmx
+; void TransposeMatrixBlocksx8_mmx( void *dst/*8xW8*/, const int32_t dst_stride, void *src/*W8x8*/, const int32_t src_stride, const int32_t num_blocks ); used below as r0 = dst, r1 = dst_stride, r2 = src, r3 = src_stride, r4 = num_blocks
+ push r5
+ push r6
+ %assign push_num 2
+ LOAD_5_PARA
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+ sub r7, 8 ; reserve an 8-byte scratch slot at [r7] for the transpose macro
+
+ lea r5, [r2+r3*8] ; r5 runs one block (8 rows) ahead of r2
+
+TRANSPOSE_BLOCKS_X8_LOOP_MMX:
+ ; explicitly loading next loop data: touches the 8 rows of the following block, the loaded values are discarded. NOTE(review): in the last iteration these reads go past the rows being transposed, so that memory must be readable
+%rep 4
+ mov r6, [r5]
+ mov r6, [r5+r3]
+ lea r5, [r5+r3*2]
+%endrep
+ movq mm0, [r2] ; source rows 0-6 of this block go to mm0-mm6; row 7 is passed to the macro as a memory operand
+ movq mm1, [r2+r3]
+ lea r2, [r2+2*r3]
+ movq mm2, [r2]
+ movq mm3, [r2+r3]
+ lea r2, [r2+2*r3]
+ movq mm4, [r2]
+ movq mm5, [r2+r3]
+ lea r2, [r2+2*r3]
+ movq mm6, [r2]
+
+ ;in: m0, m1, m2, m3, m4, m5, m6, m7
+ ;out: m0, m3, m5, m2, m7, m1, m6, m4
+ TRANSPOSE_8x8B_MMX mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, [r2+r3], [r7]
+
+ TRANSPOSE8x8_WRITE_ALT_MMX r0, r1, r6
+ lea r0, [r0+8] ; the next block lands 8 bytes further right in every dst row
+ lea r2, [r2+2*r3] ; r2 now points at the first row of the next block
+ dec r4
+ jg near TRANSPOSE_BLOCKS_X8_LOOP_MMX ; counter is tested after the body, so one block is processed even when num_blocks <= 0
+
+ emms ; leave MMX state before returning
+ add r7, 8 ; release the scratch slot
+ LOAD_5_PARA_POP
+ pop r6
+ pop r5
+ ret
--- /dev/null
+++ b/codec/encoder/core/x86/sample_sc.asm
@@ -1,0 +1,225 @@
+;*!
+;* \copy
+;* Copyright (c) 2009-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+
+SECTION .text
+
+;**********************************************************************************************************************************
+;
+; uint32_t SampleSad16x16Hor8_sse41( uint8_t *src, int32_t stride_src, uint8_t *ref, int32_t stride_ref, uint16_t base_cost[8], int32_t *index_min_cost )
+;
+; \note:
+; src needs to be 16-byte aligned; alignment of ref is optional
+; \return value:
+; returns the minimal SAD cost; the index of that cost is written to index_min_cost
+;**********************************************************************************************************************************
+; try 8 mv via offset
+; xmm7 store sad costs
+%macro SAD_16x16_LINE_SSE41 4 ; src, ref, stride_src, stride_ref; adds one 16-pixel row's SADs for 8 horizontal offsets to xmm7, then advances both pointers by their strides
+ movdqa xmm0, [%1] ; 16 source pixels (aligned load)
+ movdqu xmm1, [%2] ; ref bytes 0-15
+ movdqu xmm2, [%2+8h] ; ref bytes 8-23
+ movdqa xmm3, xmm1
+ movdqa xmm4, xmm2
+
+ mpsadbw xmm1, xmm0, 0 ; 000 B: src bytes 0-3 against ref offsets 0..7
+ paddw xmm7, xmm1 ; accumulate cost
+
+ mpsadbw xmm3, xmm0, 5 ; 101 B: src bytes 4-7 against ref starting at byte 4
+ paddw xmm7, xmm3 ; accumulate cost
+
+ mpsadbw xmm2, xmm0, 2 ; 010 B: src bytes 8-11 against ref starting at byte 8 (xmm2 = ref+8)
+ paddw xmm7, xmm2 ; accumulate cost
+
+ mpsadbw xmm4, xmm0, 7 ; 111 B: src bytes 12-15 against ref starting at byte 12
+ paddw xmm7, xmm4 ; accumulate cost
+
+ add %1, %3
+ add %2, %4
+%endmacro ; end of SAD_16x16_LINE_SSE41
+%macro SAD_16x16_LINE_SSE41E 4 ; src, ref, stride_src, stride_ref; same accumulation as SAD_16x16_LINE_SSE41 but without advancing the pointers (used for the last row)
+ movdqa xmm0, [%1] ; 16 source pixels (aligned load)
+ movdqu xmm1, [%2] ; ref bytes 0-15
+ movdqu xmm2, [%2+8h] ; ref bytes 8-23
+ movdqa xmm3, xmm1
+ movdqa xmm4, xmm2
+
+ mpsadbw xmm1, xmm0, 0 ; 000 B: src bytes 0-3 against ref offsets 0..7
+ paddw xmm7, xmm1 ; accumulate cost
+
+ mpsadbw xmm3, xmm0, 5 ; 101 B: src bytes 4-7 against ref starting at byte 4
+ paddw xmm7, xmm3 ; accumulate cost
+
+ mpsadbw xmm2, xmm0, 2 ; 010 B: src bytes 8-11 against ref starting at byte 8 (xmm2 = ref+8)
+ paddw xmm7, xmm2 ; accumulate cost
+
+ mpsadbw xmm4, xmm0, 7 ; 111 B: src bytes 12-15 against ref starting at byte 12
+ paddw xmm7, xmm4 ; accumulate cost
+%endmacro ; end of SAD_16x16_LINE_SSE41E
+
+WELS_EXTERN SampleSad16x16Hor8_sse41
+ ;push ebx
+ ;push esi
+ ;mov eax, [esp+12] ; src
+ ;mov ecx, [esp+16] ; stride_src
+ ;mov ebx, [esp+20] ; ref
+ ;mov edx, [esp+24] ; stride_ref
+ ;mov esi, [esp+28] ; base_cost
+ %assign push_num 0
+ LOAD_6_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ pxor xmm7, xmm7 ; xmm7 = eight 16-bit SAD accumulators, one per horizontal offset 0..7 (at most 16*16*255 = 65280, so they fit in 16 bits)
+
+ SAD_16x16_LINE_SSE41 r0, r2, r1, r3
+ SAD_16x16_LINE_SSE41 r0, r2, r1, r3
+ SAD_16x16_LINE_SSE41 r0, r2, r1, r3
+ SAD_16x16_LINE_SSE41 r0, r2, r1, r3
+
+ SAD_16x16_LINE_SSE41 r0, r2, r1, r3
+ SAD_16x16_LINE_SSE41 r0, r2, r1, r3
+ SAD_16x16_LINE_SSE41 r0, r2, r1, r3
+ SAD_16x16_LINE_SSE41 r0, r2, r1, r3
+
+ SAD_16x16_LINE_SSE41 r0, r2, r1, r3
+ SAD_16x16_LINE_SSE41 r0, r2, r1, r3
+ SAD_16x16_LINE_SSE41 r0, r2, r1, r3
+ SAD_16x16_LINE_SSE41 r0, r2, r1, r3
+
+ SAD_16x16_LINE_SSE41 r0, r2, r1, r3
+ SAD_16x16_LINE_SSE41 r0, r2, r1, r3
+ SAD_16x16_LINE_SSE41 r0, r2, r1, r3
+ SAD_16x16_LINE_SSE41E r0, r2, r1, r3
+
+ pxor xmm0, xmm0 ; zero, used to widen 16-bit values to 32 bits
+ movdqa xmm6, xmm7
+ punpcklwd xmm6, xmm0 ; xmm6 = SADs of offsets 0-3 as 32-bit values
+ punpckhwd xmm7, xmm0 ; xmm7 = SADs of offsets 4-7 as 32-bit values
+
+ movdqa xmm5, [r4] ; eight 16-bit base costs (aligned load: base_cost must be 16-byte aligned)
+ movdqa xmm4, xmm5
+ punpcklwd xmm4, xmm0 ; base costs 0-3 as 32-bit values
+ punpckhwd xmm5, xmm0 ; base costs 4-7 as 32-bit values
+
+ paddd xmm4, xmm6 ; total cost of offsets 0-3
+ paddd xmm5, xmm7 ; total cost of offsets 4-7
+ movdqa xmm3, xmm4
+ pminud xmm3, xmm5 ; horizontal minimum: after the two shuffle/min steps every lane of xmm2 holds the overall minimum
+ pshufd xmm2, xmm3, 01001110B
+ pminud xmm2, xmm3
+ pshufd xmm3, xmm2, 10110001B
+ pminud xmm2, xmm3
+ movd retrd, xmm2 ; return value = minimal total cost
+ pcmpeqd xmm4, xmm2 ; flag the offsets 0-3 whose cost equals the minimum
+ movmskps r2d, xmm4
+ bsf r1d, r2d ; lowest matching offset; ZF is set when none of 0-3 matched
+ jnz near WRITE_INDEX
+
+ pcmpeqd xmm5, xmm2 ; otherwise the minimum is among offsets 4-7
+ movmskps r2d, xmm5
+ bsf r1d, r2d
+ add r1d, 4
+
+WRITE_INDEX:
+ mov [r5], r1d ; *index_min_cost = offset of the minimal cost
+ POP_XMM
+ LOAD_6_PARA_POP
+ ret
+
+;**********************************************************************************************************************************
+;
+; uint32_t SampleSad8x8Hor8_sse41( uint8_t *src, int32_t stride_src, uint8_t *ref, int32_t stride_ref, uint16_t base_cost[8], int32_t *index_min_cost )
+;
+; \note:
+; neither src nor ref needs to be 16-byte aligned (inter 8x8)
+; \return value:
+; returns the minimal SAD cost; the index of that cost is written to index_min_cost
+;
+;**********************************************************************************************************************************
+; try 8 mv via offset
+; xmm7 store sad costs
+%macro SAD_8x8_LINE_SSE41 4 ; src, ref, stride_src, stride_ref; adds one 8-pixel row's SADs for 8 horizontal offsets to xmm7, then advances both pointers by their strides
+ movdqu xmm0, [%1] ; 16 bytes are loaded from src; only bytes 0-7 are selected below
+ movdqu xmm1, [%2] ; ref bytes 0-15
+ movdqa xmm2, xmm1
+
+ mpsadbw xmm1, xmm0, 0 ; 000 B: src bytes 0-3 against ref offsets 0..7
+ paddw xmm7, xmm1 ; accumulate cost
+
+ mpsadbw xmm2, xmm0, 5 ; 101 B: src bytes 4-7 against ref starting at byte 4
+ paddw xmm7, xmm2 ; accumulate cost
+
+ add %1, %3
+ add %2, %4
+%endmacro ; end of SAD_8x8_LINE_SSE41
+%macro SAD_8x8_LINE_SSE41E 4 ; src, ref, stride_src, stride_ref; same accumulation as SAD_8x8_LINE_SSE41 but without advancing the pointers (used for the last row)
+ movdqu xmm0, [%1] ; 16 bytes are loaded from src; only bytes 0-7 are selected below
+ movdqu xmm1, [%2] ; ref bytes 0-15
+ movdqa xmm2, xmm1
+
+ mpsadbw xmm1, xmm0, 0 ; 000 B: src bytes 0-3 against ref offsets 0..7
+ paddw xmm7, xmm1 ; accumulate cost
+
+ mpsadbw xmm2, xmm0, 5 ; 101 B: src bytes 4-7 against ref starting at byte 4
+ paddw xmm7, xmm2 ; accumulate cost
+%endmacro ; end of SAD_8x8_LINE_SSE41E
+
+WELS_EXTERN SampleSad8x8Hor8_sse41
+ %assign push_num 0
+ LOAD_6_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ movdqa xmm7, [r4] ; load base cost list (aligned load: base_cost must be 16-byte aligned); the SADs are accumulated on top with 16-bit wrap-around adds
+
+ SAD_8x8_LINE_SSE41 r0, r2, r1, r3
+ SAD_8x8_LINE_SSE41 r0, r2, r1, r3
+ SAD_8x8_LINE_SSE41 r0, r2, r1, r3
+ SAD_8x8_LINE_SSE41 r0, r2, r1, r3
+
+ SAD_8x8_LINE_SSE41 r0, r2, r1, r3
+ SAD_8x8_LINE_SSE41 r0, r2, r1, r3
+ SAD_8x8_LINE_SSE41 r0, r2, r1, r3
+ SAD_8x8_LINE_SSE41E r0, r2, r1, r3
+
+ phminposuw xmm0, xmm7 ; horizon search the minimal sad cost and its index
+ movd retrd, xmm0 ; for return: DEST[15:0] <- MIN, DEST[31:16] <- INDEX
+ mov r1d, retrd
+ and retrd, 0xFFFF ; return value = minimal 16-bit cost
+ sar r1d, 16 ; r1d = index of the minimal cost
+ mov [r5], r1d ; *index_min_cost
+
+ POP_XMM
+ LOAD_6_PARA_POP
+ ret
+
--- a/codec/encoder/targets.mk
+++ b/codec/encoder/targets.mk
@@ -40,8 +40,10 @@
$(ENCODER_SRCDIR)/core/x86/coeff.asm\
$(ENCODER_SRCDIR)/core/x86/dct.asm\
$(ENCODER_SRCDIR)/core/x86/intra_pred.asm\
+ $(ENCODER_SRCDIR)/core/x86/matrix_transpose.asm\
$(ENCODER_SRCDIR)/core/x86/memzero.asm\
$(ENCODER_SRCDIR)/core/x86/quant.asm\
+ $(ENCODER_SRCDIR)/core/x86/sample_sc.asm\
$(ENCODER_SRCDIR)/core/x86/score.asm\
ENCODER_OBJS += $(ENCODER_ASM_SRCS:.asm=.$(OBJ))
--- a/test/encoder/EncUT_MotionEstimate.cpp
+++ b/test/encoder/EncUT_MotionEstimate.cpp
@@ -5,6 +5,7 @@
#include "sample.h"
#include "svc_motion_estimate.h"
#include "wels_func_ptr_def.h"
+#include "cpu.h"
using namespace WelsSVCEnc;
@@ -43,11 +44,12 @@
m_iMaxSearchBlock = 16;
m_uiMvdTableSize = (1 + (648 << 1));
+ pMa = new CMemoryAlign(0);
m_pRefPic = static_cast<uint8_t *>
- (malloc(m_iWidth*m_iHeight));
+ (pMa->WelsMalloc(m_iWidth*m_iHeight, "RefPic"));
ASSERT_TRUE( NULL != m_pRefPic );
m_pSrcBlock = static_cast<uint8_t *>
- (malloc(m_iMaxSearchBlock*m_iMaxSearchBlock));
+ (pMa->WelsMalloc(m_iMaxSearchBlock*m_iMaxSearchBlock, "SrcBlock"));
ASSERT_TRUE( NULL != m_pSrcBlock );
m_pMvdCostTable=new uint16_t[52*m_uiMvdTableSize];
ASSERT_TRUE( NULL != m_pMvdCostTable );
@@ -54,8 +56,9 @@
}
virtual void TearDown() {
delete [] m_pMvdCostTable;
- free( m_pRefPic );
- free( m_pSrcBlock );
+ pMa->WelsFree( m_pRefPic, "RefPic");
+ pMa->WelsFree( m_pSrcBlock, "SrcBlock");
+ delete pMa;
}
public:
uint8_t *m_pRefPic;
@@ -66,6 +69,7 @@
int32_t m_iWidth;
int32_t m_iHeight;
int32_t m_iMaxSearchBlock;
+ CMemoryAlign *pMa;
};
@@ -243,4 +247,134 @@
ASSERT_TRUE(iTryTimes > 0);
//it is possible that ref at differnt position is identical, but that should be under a low probability
}
-}
\ No newline at end of file
+}
+
+#ifdef X86_ASM
+TEST_F(MotionEstimateTest, TestVerticalSearch_SSE41) // plants a block at a known vertical MV and expects the SSE4.1 line search to find it
+{
+ const int32_t kiMaxBlock16Sad = 72000;//a rough number
+ SWelsFuncPtrList sFuncList;
+ SWelsME sMe;
+
+ srand((uint32_t)time(NULL));
+ const uint8_t kuiQp = rand()%52;
+ InitMe(kuiQp, 648, m_uiMvdTableSize, m_pMvdCostTable, &sMe);
+
+ SMVUnitXY sTargetMv;
+ WelsInitSampleSadFunc( &sFuncList, 0 );//test c functions
+ WelsInitMeFunc(&sFuncList, WELS_CPU_SSE41, 1); //NOTE(review): SSE4.1 is requested without a runtime CPU check - confirm the test host supports it
+
+ uint8_t *pRefPicCenter = m_pRefPic+(m_iHeight/2)*m_iWidth+(m_iWidth/2);
+ sMe.iCurMeBlockPixX = (m_iWidth/2);
+ sMe.iCurMeBlockPixY = (m_iHeight/2);
+
+ bool bDataGeneratorSucceed = false;
+ bool bFoundMatch = false;
+ int32_t iTryTimes=100;
+
+ sTargetMv.iMvX = 0;
+ sTargetMv.iMvY = WELS_MAX(INTPEL_NEEDED_MARGIN, rand()%m_iHeight-INTPEL_NEEDED_MARGIN); //(rand()%m_iHeight)-margin, floored at margin; NOTE(review): if WELS_MAX is a plain ternary macro rand() runs twice - confirm
+ bDataGeneratorSucceed = false;
+ bFoundMatch = false;
+ while (!bFoundMatch && (iTryTimes--)>0) {
+ if (!YUVPixelDataGenerator( m_pRefPic, m_iWidth, m_iHeight, m_iWidth ))
+ continue;
+
+ bDataGeneratorSucceed = true;
+ CopyTargetBlock( m_pSrcBlock, 16, sTargetMv, m_iWidth, pRefPicCenter);
+
+ //clean the sMe status
+ sMe.uiBlockSize = rand()%5;
+ sMe.pEncMb = m_pSrcBlock;
+ sMe.pRefMb = pRefPicCenter;
+ sMe.pColoRefMb = pRefPicCenter;
+ sMe.sMv.iMvX = sMe.sMv.iMvY = 0;
+ sMe.uiSadCost = sMe.uiSatdCost = kiMaxBlock16Sad;
+ const int32_t iCurMeBlockPixX = sMe.iCurMeBlockPixX;
+ const int32_t iCurMeBlockQpelPixX = ((iCurMeBlockPixX)<<2);
+ const int32_t iCurMeBlockPixY = sMe.iCurMeBlockPixY;
+ const int32_t iCurMeBlockQpelPixY = ((iCurMeBlockPixY)<<2);
+ uint16_t* pMvdCostX = sMe.pMvdCost - iCurMeBlockQpelPixX - sMe.sMvp.iMvX; //do the offset here
+ uint16_t* pMvdCostY = sMe.pMvdCost - iCurMeBlockQpelPixY - sMe.sMvp.iMvY;
+ VerticalFullSearchUsingSSE41 ( &sFuncList, &sMe,
+ pMvdCostY, pMvdCostX[ iCurMeBlockQpelPixX ],
+ m_iMaxSearchBlock, m_iWidth,
+ INTPEL_NEEDED_MARGIN,
+ m_iHeight-INTPEL_NEEDED_MARGIN, true );
+
+ //the last selection may be affected by MVDcost, that is when smaller MvY will be better
+ bFoundMatch = (sMe.sMv.iMvX==0
+ &&(sMe.sMv.iMvY==sTargetMv.iMvY||abs(sMe.sMv.iMvY)<abs(sTargetMv.iMvY)));
+ //a match is the exact target MvY, or an MvY of smaller magnitude (see the MVD cost note above)
+ }
+ if (bDataGeneratorSucceed) {
+ //if DataGenerator never succeeded, there is nothing meaningful to check
+ ASSERT_TRUE(bFoundMatch); //iTryTimes is already 0 when the match comes on the last attempt, so check the flag itself
+ //it is possible that ref at different position is identical, but that should be under a low probability
+ }
+}
+
+TEST_F(MotionEstimateTest, TestHorizontalSearch_SSE41) // plants a block at a known horizontal MV and expects the SSE4.1 line search to find it
+{
+ const int32_t kiMaxBlock16Sad = 72000;//a rough number
+ SWelsFuncPtrList sFuncList;
+ SWelsME sMe;
+
+ srand((uint32_t)time(NULL));
+ const uint8_t kuiQp = rand()%52;
+ InitMe(kuiQp, 648, m_uiMvdTableSize, m_pMvdCostTable, &sMe);
+
+ SMVUnitXY sTargetMv;
+ WelsInitSampleSadFunc( &sFuncList, 0 );//test c functions
+ WelsInitMeFunc(&sFuncList, WELS_CPU_SSE41, 1); //NOTE(review): SSE4.1 is requested without a runtime CPU check - confirm the test host supports it
+
+ uint8_t *pRefPicCenter = m_pRefPic+(m_iHeight/2)*m_iWidth+(m_iWidth/2);
+ sMe.iCurMeBlockPixX = (m_iWidth/2);
+ sMe.iCurMeBlockPixY = (m_iHeight/2);
+
+ bool bDataGeneratorSucceed = false;
+ bool bFoundMatch = false;
+ int32_t iTryTimes=100;
+
+ sTargetMv.iMvX = WELS_MAX(INTPEL_NEEDED_MARGIN, rand()%m_iWidth-INTPEL_NEEDED_MARGIN); //(rand()%m_iWidth)-margin, floored at margin; NOTE(review): if WELS_MAX is a plain ternary macro rand() runs twice - confirm
+ sTargetMv.iMvY = 0;
+ bDataGeneratorSucceed = false;
+ bFoundMatch = false;
+ while (!bFoundMatch && (iTryTimes--)>0) {
+ if (!YUVPixelDataGenerator( m_pRefPic, m_iWidth, m_iHeight, m_iWidth ))
+ continue;
+
+ bDataGeneratorSucceed = true;
+ CopyTargetBlock( m_pSrcBlock, 16, sTargetMv, m_iWidth, pRefPicCenter);
+
+ //clean the sMe status
+ sMe.uiBlockSize = rand()%5;
+ sMe.pEncMb = m_pSrcBlock;
+ sMe.pRefMb = pRefPicCenter;
+ sMe.pColoRefMb = pRefPicCenter;
+ sMe.sMv.iMvX = sMe.sMv.iMvY = 0;
+ sMe.uiSadCost = sMe.uiSatdCost = kiMaxBlock16Sad;
+ const int32_t iCurMeBlockPixX = sMe.iCurMeBlockPixX;
+ const int32_t iCurMeBlockQpelPixX = ((iCurMeBlockPixX)<<2);
+ const int32_t iCurMeBlockPixY = sMe.iCurMeBlockPixY;
+ const int32_t iCurMeBlockQpelPixY = ((iCurMeBlockPixY)<<2);
+ uint16_t* pMvdCostX = sMe.pMvdCost - iCurMeBlockQpelPixX - sMe.sMvp.iMvX; //do the offset here
+ uint16_t* pMvdCostY = sMe.pMvdCost - iCurMeBlockQpelPixY - sMe.sMvp.iMvY;
+ HorizontalFullSearchUsingSSE41 ( &sFuncList, &sMe,
+ pMvdCostX, pMvdCostY[ iCurMeBlockQpelPixY ],
+ m_iMaxSearchBlock, m_iWidth,
+ INTPEL_NEEDED_MARGIN,
+ m_iWidth-INTPEL_NEEDED_MARGIN, false );
+
+ //the last selection may be affected by MVDcost, that is when smaller MvX will be better
+ bFoundMatch = (sMe.sMv.iMvY==0
+ &&(sMe.sMv.iMvX==sTargetMv.iMvX||abs(sMe.sMv.iMvX)<abs(sTargetMv.iMvX)));
+ //a match is the exact target MvX, or an MvX of smaller magnitude (see the MVD cost note above)
+ }
+ if (bDataGeneratorSucceed) {
+ //if DataGenerator never succeeded, there is nothing meaningful to check
+ ASSERT_TRUE(bFoundMatch); //iTryTimes is already 0 when the match comes on the last attempt, so check the flag itself
+ //it is possible that ref at different position is identical, but that should be under a low probability
+ }
+}
+#endif
\ No newline at end of file