ref: 501c77f66b73f207c7b0776aba8991b06861cc78
parent: 94cabe10d54021c8269d51ba3fa5d88c4a0607fe
parent: 90fc914b6c7193223a7705ac5ca36e2558321751
author: volvet <[email protected]>
date: Mon Apr 7 17:47:23 EDT 2014
Merge pull request #637 from zhilwang/ruby-merge

Ruby merge
--- a/codec/build/iOS/enc/welsenc/welsenc.xcodeproj/project.pbxproj
+++ b/codec/build/iOS/enc/welsenc/welsenc.xcodeproj/project.pbxproj
@@ -81,6 +81,7 @@
4C34066918C57D0400DFA14A /* memory_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = memory_neon.S; sourceTree = "<group>"; };
4C34066A18C57D0400DFA14A /* pixel_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = pixel_neon.S; sourceTree = "<group>"; };
4C34066B18C57D0400DFA14A /* reconstruct_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = reconstruct_neon.S; sourceTree = "<group>"; };
+ 4CDBFB9D18E5068D0025A767 /* wels_transpose_matrix.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = wels_transpose_matrix.h; sourceTree = "<group>"; };
4CE4431118B6FFA00017DF25 /* libwelsenc.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libwelsenc.a; sourceTree = BUILT_PRODUCTS_DIR; };
4CE4431418B6FFA00017DF25 /* Foundation.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Foundation.framework; path = System/Library/Frameworks/Foundation.framework; sourceTree = SDKROOT; };
4CE4432118B6FFA00017DF25 /* welsencTests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = welsencTests.xctest; sourceTree = BUILT_PRODUCTS_DIR; };
@@ -281,6 +282,7 @@
4CE446A918BC605C0017DF25 /* inc */ = {
isa = PBXGroup;
children = (
+ 4CDBFB9D18E5068D0025A767 /* wels_transpose_matrix.h */,
4CE446AA18BC605C0017DF25 /* as264_common.h */,
4CE446AB18BC605C0017DF25 /* au_set.h */,
4CE446AC18BC605C0017DF25 /* bit_stream.h */,
--- a/codec/encoder/core/inc/svc_motion_estimate.h
+++ b/codec/encoder/core/inc/svc_motion_estimate.h
@@ -199,11 +199,24 @@
const int32_t kiEncStride, const int32_t kiRefStride,
const int32_t kiMinPos, const int32_t kiMaxPos,
const bool bVerticalSearch );
+#ifdef X86_ASM
+extern "C"
+{
+uint32_t SampleSad8x8Hor8_sse41 (uint8_t*, int32_t, uint8_t*, int32_t, uint16_t*, int32_t*);
+uint32_t SampleSad16x16Hor8_sse41 (uint8_t*, int32_t, uint8_t*, int32_t, uint16_t*, int32_t*);
+}
+
void VerticalFullSearchUsingSSE41( void *pFunc, void *vpMe,
uint16_t* pMvdTable, const int32_t kiFixedMvd,
const int32_t kiEncStride, const int32_t kiRefStride,
const int32_t kiMinPos, const int32_t kiMaxPos,
const bool bVerticalSearch );
+void HorizontalFullSearchUsingSSE41( void *pFunc, void *vpMe,
+ uint16_t* pMvdTable, const int32_t kiFixedMvd,
+ const int32_t kiEncStride, const int32_t kiRefStride,
+ const int32_t kiMinPos, const int32_t kiMaxPos,
+ const bool bVerticalSearch );
+#endif
void WelsMotionCrossSearch(SWelsFuncPtrList *pFuncList, SDqLayer* pCurLayer, SWelsME * pMe, const SSlice* pSlice);
// Feature Search Basics
--- a/codec/encoder/core/inc/wels_const.h
+++ b/codec/encoder/core/inc/wels_const.h
@@ -87,6 +87,7 @@
#define PARA_SET_TYPE_SUBSETSPS 1
#define PARA_SET_TYPE_PPS 2
+#define MAX_VERTICAL_MV_RANGE  1024  //TODO: ensure enough memory is allocated for the transpose buffer
#define MAX_FRAME_RATE 30 // maximal frame rate to support
#define MIN_FRAME_RATE 1 // minimal frame rate need support
--- a/codec/encoder/core/inc/wels_func_ptr_def.h
+++ b/codec/encoder/core/inc/wels_func_ptr_def.h
@@ -134,6 +134,7 @@
typedef int32_t (*PIntraPred8x8Combined3Func) (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*,
uint8_t*, uint8_t*);
+typedef uint32_t (*PSampleSadHor8Func)( uint8_t*, int32_t, uint8_t*, int32_t, uint16_t*, int32_t* );
typedef void (*PMotionSearchFunc) (SWelsFuncPtrList* pFuncList, void* pCurDqLayer, void* pMe,
void* pSlice);
typedef void (*PSearchMethodFunc) (SWelsFuncPtrList* pFuncList, void* pMe, void* pSlice, const int32_t kiEncStride, const int32_t kiRefStride);
@@ -202,14 +203,16 @@
PGetIntraPredFunc pfGetLumaI4x4Pred[I4_PRED_A];
PGetIntraPredFunc pfGetChromaPred[C_PRED_A];
+ PSampleSadHor8Func pfSampleSadHor8[2]; // 1: for 16x16 square; 0: for 8x8 square
PMotionSearchFunc
pfMotionSearch[BLOCK_STATIC_IDC_ALL]; //svc_encode_slice.c svc_mode_decision.c svc_enhance_layer_md.c svc_base_layer_md.c
PSearchMethodFunc pfSearchMethod[BLOCK_SIZE_ALL];
PCalculateSatdFunc pfCalculateSatd;
PCheckDirectionalMv pfCheckDirectionalMv;
- PLineFullSearchFunc pfLineFullSearch;
PCalculateBlockFeatureOfFrame pfCalculateBlockFeatureOfFrame[2];//0 - for 8x8, 1 for 16x16
PCalculateSingleBlockFeature pfCalculateSingleBlockFeature[2];//0 - for 8x8, 1 for 16x16
+ PLineFullSearchFunc pfVerticalFullSearch;
+ PLineFullSearchFunc pfHorizontalFullSearch;
PCopyFunc pfCopy16x16Aligned; //svc_encode_slice.c svc_mode_decision.c svc_base_layer_md.c
PCopyFunc pfCopy16x16NotAligned; //md.c
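For orientation, the contract behind PSampleSadHor8Func can be modeled in scalar C++. The sketch below is a hypothetical reference, not the shipped SSE4.1 kernel: it evaluates SAD plus a precomputed cost term at 8 consecutive horizontal reference offsets and returns the minimum, writing the winning offset to pIndexMinCost. The kiSize parameter stands in for the 8/16 block edge that the pfSampleSadHor8 array index selects.

#include <cstdint>
#include <cstdlib>

// Hypothetical scalar model of the SampleSad*Hor8_sse41 contract.
// pBaseCost[d] carries the precomputed MVD cost for candidate offset d.
static uint32_t SampleSadHor8_ref (uint8_t* pEnc, int32_t iEncStride,
                                   uint8_t* pRef, int32_t iRefStride,
                                   uint16_t* pBaseCost, int32_t* pIndexMinCost,
                                   const int32_t kiSize /* 8 or 16 */) {
  uint32_t uiMinCost = 0xFFFFFFFF;
  for (int32_t d = 0; d < 8; ++ d) {
    uint32_t uiCost = pBaseCost[d];
    for (int32_t y = 0; y < kiSize; ++ y)
      for (int32_t x = 0; x < kiSize; ++ x)
        uiCost += abs (pEnc[y * iEncStride + x] - pRef[y * iRefStride + x + d]);
    if (uiCost < uiMinCost) {
      uiMinCost = uiCost;
      *pIndexMinCost = d;
    }
  }
  return uiMinCost;
}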
--- /dev/null
+++ b/codec/encoder/core/inc/wels_transpose_matrix.h
@@ -1,0 +1,54 @@
+/*!
+ * \copy
+ * Copyright (c) 2013, Cisco Systems
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#ifndef WELS_RUBY_ENCODER_TRANSPOSE_MATRIX_H__
+#define WELS_RUBY_ENCODER_TRANSPOSE_MATRIX_H__
+
+#include "typedefs.h"
+
+namespace WelsSVCEnc {
+
+#ifdef X86_ASM
+extern "C"
+{
+void TransposeMatrixBlocksx16_sse2( void *pDst, const int32_t kiDstStride, void *pSrc, const int32_t kiSrcStride, const int32_t kiBlocksNum );
+void TransposeMatrixBlock16x16_sse2( void *pDst, const int32_t kiDstStride, void *pSrc, const int32_t kiSrcStride );
+void TransposeMatrixBlocksx8_mmx( void *pDst, const int32_t kiDstStride, void *pSrc, const int32_t kiSrcStride, const int32_t kiBlocksNum );
+void TransposeMatrixBlock8x8_mmx( void *pDst, const int32_t kiDstStride, void *pSrc, const int32_t kiSrcStride );
+}
+#endif
+
+typedef void (*PTransposeMatrixBlockFunc)( void *pDst, const int32_t kiDstStride, void *pSrc, const int32_t kiSrcStride );
+typedef void (*PTransposeMatrixBlocksFunc)( void *pDst, const int32_t kiDstStride, void *pSrc, const int32_t kiSrcStride, const int32_t kiBlocksNum );
+
+}// end of namespace declaration
+
+#endif//WELS_RUBY_ENCODER_TRANSPOSE_MATRIX_H__
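The new header only declares the assembly kernels; what they compute is a plain byte-wise transpose, dst[x][y] = src[y][x]. A hypothetical scalar sketch (folding the block edge into a kiSize parameter, which the real typedefs fix per kernel):

#include <cstdint>

// Hypothetical scalar model of the transpose kernels (kiSize is 8 for
// the MMX variants, 16 for the SSE2 variants).
static void TransposeMatrixBlock_ref (void* pDst, const int32_t kiDstStride,
                                      void* pSrc, const int32_t kiSrcStride,
                                      const int32_t kiSize) {
  uint8_t* pDstByte = static_cast<uint8_t*> (pDst);
  uint8_t* pSrcByte = static_cast<uint8_t*> (pSrc);
  for (int32_t y = 0; y < kiSize; ++ y)
    for (int32_t x = 0; x < kiSize; ++ x)
      pDstByte[x * kiDstStride + y] = pSrcByte[y * kiSrcStride + x];
}

The Blocks variants repeat this for kiBlocksNum consecutive blocks, advancing the source by kiSize rows and the destination by kiSize columns per block.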
--- a/codec/encoder/core/src/svc_motion_estimate.cpp
+++ b/codec/encoder/core/src/svc_motion_estimate.cpp
@@ -41,6 +41,7 @@
#include "cpu_core.h"
#include "ls_defines.h"
#include "svc_motion_estimate.h"
+#include "wels_transpose_matrix.h"
namespace WelsSVCEnc {
@@ -65,8 +66,14 @@
pFuncList->pfCheckDirectionalMv = CheckDirectionalMv;
  //for cross search
- pFuncList->pfLineFullSearch = LineFullSearch_c;
+ pFuncList->pfVerticalFullSearch = LineFullSearch_c;
+ pFuncList->pfHorizontalFullSearch = LineFullSearch_c;
+#if defined (X86_ASM)
if ( uiCpuFlag & WELS_CPU_SSE41 ) {
+ pFuncList->pfSampleSadHor8[0] = SampleSad8x8Hor8_sse41;
+ pFuncList->pfSampleSadHor8[1] = SampleSad16x16Hor8_sse41;
+ pFuncList->pfVerticalFullSearch = VerticalFullSearchUsingSSE41;
+ pFuncList->pfHorizontalFullSearch = HorizontalFullSearchUsingSSE41;
}
//for feature search
@@ -75,6 +82,7 @@
  //TODO: it may be possible to special-case widths that are multiples of 8, so as to accelerate those cases
pFuncList->pfCalculateSingleBlockFeature[0] = SumOf8x8SingleBlock_c;
pFuncList->pfCalculateSingleBlockFeature[1] = SumOf16x16SingleBlock_c;
+#endif
}
}
@@ -302,6 +310,17 @@
/////////////////////////
// Cross Search Basics
/////////////////////////
+#if defined (X86_ASM)
+void CalcMvdCostx8_c( uint16_t *pMvdCost, const int32_t kiStartMv, uint16_t* pMvdTable, const uint16_t kiFixedCost )
+{
+ uint16_t *pBaseCost = pMvdCost;
+ const int32_t kiOffset = (kiStartMv<<2);
+ uint16_t *pMvd = pMvdTable+kiOffset;
+ for (int32_t i = 0; i < 8; ++ i) {
+ pBaseCost[i] = ((*pMvd) + kiFixedCost);
+ pMvd += 4;
+ }
+}
void VerticalFullSearchUsingSSE41( void *pFunc, void *vpMe,
uint16_t* pMvdTable, const int32_t kiFixedMvd,
const int32_t kiEncStride, const int32_t kiRefStride,
@@ -308,12 +327,130 @@
const int32_t kiMinPos, const int32_t kiMaxPos,
const bool bVerticalSearch ) {
SWelsFuncPtrList *pFuncList = static_cast<SWelsFuncPtrList *>(pFunc);
- SWelsME *pMe = static_cast<SWelsME *>(vpMe);
+ SWelsME *pMe = static_cast<SWelsME *>(vpMe);
+ uint8_t* kpEncMb = pMe->pEncMb;
+ const int32_t kiCurMeBlockPix = pMe->iCurMeBlockPixY;
+ uint8_t* pRef = &pMe->pColoRefMb[(kiMinPos - kiCurMeBlockPix)*kiRefStride];
+ const int32_t kIsBlock16x16 = pMe->uiBlockSize == BLOCK_16x16;
+ const int32_t kiEdgeBlocks = kIsBlock16x16 ? 16 : 8;
+ PSampleSadHor8Func pSampleSadHor8 = pFuncList->pfSampleSadHor8[kIsBlock16x16];
+ PSampleSadSatdCostFunc pSad = pFuncList->sSampleDealingFuncs.pfSampleSad[pMe->uiBlockSize];
+ PTransposeMatrixBlockFunc TransposeMatrixBlock = kIsBlock16x16 ? TransposeMatrixBlock16x16_sse2 : TransposeMatrixBlock8x8_mmx;
+ PTransposeMatrixBlocksFunc TransposeMatrixBlocks= kIsBlock16x16 ? TransposeMatrixBlocksx16_sse2 : TransposeMatrixBlocksx8_mmx;
+
+ const int32_t kiDiff = kiMaxPos - kiMinPos;
+ const int32_t kiRowNum = WELS_ALIGN((kiDiff - kiEdgeBlocks + 1), kiEdgeBlocks);
+ const int32_t kiBlocksNum = kIsBlock16x16 ? (kiRowNum>>4) : (kiRowNum>>3);
+ int32_t iCountLoop8 = (kiRowNum-kiEdgeBlocks) >> 3;
+ const int32_t kiRemainingVectors = kiDiff - (iCountLoop8<<3);
+ const int32_t kiMatrixStride = MAX_VERTICAL_MV_RANGE;
+ ENFORCE_STACK_ALIGN_2D( uint8_t, uiMatrixRef, 16, kiMatrixStride, 16 ); // transpose matrix result for ref
+ ENFORCE_STACK_ALIGN_2D( uint8_t, uiMatrixEnc, 16, 16, 16 ); // transpose matrix result for enc
+  assert(kiRowNum <= kiMatrixStride); // make sure the transpose buffer is large enough
+
+ TransposeMatrixBlock( &uiMatrixEnc[0][0], 16, kpEncMb, kiEncStride );
+ TransposeMatrixBlocks( &uiMatrixRef[0][0], kiMatrixStride, pRef, kiRefStride, kiBlocksNum );
+ ENFORCE_STACK_ALIGN_1D( uint16_t, uiBaseCost, 8, 16 );
+ int32_t iTargetPos = kiMinPos;
+ int16_t iBestPos = pMe->sMv.iMvX;
+ uint32_t uiBestCost = pMe->uiSadCost;
+ uint32_t uiCostMin;
+ int32_t iIndexMinPos;
+ kpEncMb = &uiMatrixEnc[0][0];
+ pRef = &uiMatrixRef[0][0];
+
+ while(iCountLoop8 > 0) {
+ CalcMvdCostx8_c(uiBaseCost, iTargetPos, pMvdTable, kiFixedMvd);
+ uiCostMin = pSampleSadHor8( kpEncMb, 16, pRef, kiMatrixStride, uiBaseCost, &iIndexMinPos );
+ if (uiCostMin < uiBestCost) {
+ uiBestCost = uiCostMin;
+ iBestPos = iTargetPos+iIndexMinPos;
+ }
+ iTargetPos += 8;
+ pRef += 8;
+ -- iCountLoop8;
+ }
+ if (kiRemainingVectors > 0) {
+ kpEncMb = pMe->pEncMb;
+ pRef = &pMe->pColoRefMb[(iTargetPos - kiCurMeBlockPix)*kiRefStride];
+ while (iTargetPos < kiMaxPos) {
+ const uint16_t pMvdCost = pMvdTable[iTargetPos<<2];
+ uint32_t uiSadCost = pSad( kpEncMb, kiEncStride, pRef, kiRefStride ) + (kiFixedMvd + pMvdCost);
+ if (uiSadCost < uiBestCost) {
+ uiBestCost = uiSadCost;
+ iBestPos = iTargetPos;
+ }
+ pRef += kiRefStride;
+ ++iTargetPos;
+ }
+ }
+ if (uiBestCost < pMe->uiSadCost) {
+ SMVUnitXY sBestMv;
+ sBestMv.iMvX = 0;
+ sBestMv.iMvY = iBestPos - kiCurMeBlockPix;
+ UpdateMeResults( sBestMv, uiBestCost, &pMe->pColoRefMb[sBestMv.iMvY*kiRefStride], pMe );
+ }
}
-void LineFullSearch_c( void *pFunc, void *vpMe,
- uint16_t* pMvdTable, const int32_t kiFixedMvd,
- const int32_t kiEncStride, const int32_t kiRefStride,
- const int32_t kiMinPos, const int32_t kiMaxPos,
+
+void HorizontalFullSearchUsingSSE41( void *pFunc, void *vpMe,
+ uint16_t* pMvdTable, const int32_t kiFixedMvd,
+ const int32_t kiEncStride, const int32_t kiRefStride,
+ const int32_t kiMinPos, const int32_t kiMaxPos,
+ const bool bVerticalSearch )
+{
+ SWelsFuncPtrList *pFuncList = static_cast<SWelsFuncPtrList *>(pFunc);
+ SWelsME *pMe = static_cast<SWelsME *>(vpMe);
+ uint8_t *kpEncMb = pMe->pEncMb;
+ const int32_t kiCurMeBlockPix = pMe->iCurMeBlockPixX;
+ uint8_t *pRef = &pMe->pColoRefMb[kiMinPos - kiCurMeBlockPix];
+ const int32_t kIsBlock16x16 = pMe->uiBlockSize == BLOCK_16x16;
+ PSampleSadHor8Func pSampleSadHor8 = pFuncList->pfSampleSadHor8[kIsBlock16x16];
+ PSampleSadSatdCostFunc pSad = pFuncList->sSampleDealingFuncs.pfSampleSad[pMe->uiBlockSize];
+ ENFORCE_STACK_ALIGN_1D( uint16_t, uiBaseCost, 8, 16 );
+ const int32_t kiNumVector = kiMaxPos - kiMinPos;
+ int32_t iCountLoop8 = kiNumVector >> 3;
+ const int32_t kiRemainingLoop8 = kiNumVector & 7;
+ int32_t iTargetPos = kiMinPos;
+ int16_t iBestPos = pMe->sMv.iMvX;
+ uint32_t uiBestCost = pMe->uiSadCost;
+ uint32_t uiCostMin;
+ int32_t iIndexMinPos;
+
+ while(iCountLoop8 > 0) {
+ CalcMvdCostx8_c(uiBaseCost, iTargetPos, pMvdTable, kiFixedMvd);
+ uiCostMin = pSampleSadHor8( kpEncMb, kiEncStride, pRef, kiRefStride, uiBaseCost, &iIndexMinPos );
+ if (uiCostMin < uiBestCost) {
+ uiBestCost = uiCostMin;
+ iBestPos = iTargetPos+iIndexMinPos;
+ }
+ iTargetPos += 8;
+ pRef += 8;
+ -- iCountLoop8;
+ }
+ if ( kiRemainingLoop8 > 0 ) {
+ while (iTargetPos < kiMaxPos) {
+ const uint16_t pMvdCost = pMvdTable[iTargetPos<<2];
+ uint32_t uiSadCost = pSad( kpEncMb, kiEncStride, pRef, kiRefStride ) + (kiFixedMvd + pMvdCost);
+ if (uiSadCost < uiBestCost) {
+ uiBestCost = uiSadCost;
+ iBestPos = iTargetPos;
+ }
+ ++pRef;
+ ++iTargetPos;
+ }
+ }
+ if (uiBestCost < pMe->uiSadCost) {
+ SMVUnitXY sBestMv;
+ sBestMv.iMvX = iBestPos - kiCurMeBlockPix;
+ sBestMv.iMvY = 0;
+ UpdateMeResults( sBestMv, uiBestCost, &pMe->pColoRefMb[sBestMv.iMvY], pMe );
+ }
+}
+#endif
+void LineFullSearch_c( void *pFunc, void *vpMe,
+ uint16_t* pMvdTable, const int32_t kiFixedMvd,
+ const int32_t kiEncStride, const int32_t kiRefStride,
+ const int32_t kiMinPos, const int32_t kiMaxPos,
const bool bVerticalSearch ) {
SWelsFuncPtrList *pFuncList = static_cast<SWelsFuncPtrList *>(pFunc);
SWelsME *pMe = static_cast<SWelsME *>(vpMe);
@@ -346,8 +483,8 @@
void WelsMotionCrossSearch(SWelsFuncPtrList *pFuncList, SWelsME * pMe,
const SSlice* pSlice, const int32_t kiEncStride, const int32_t kiRefStride) {
- PLineFullSearchFunc pfVerticalFullSearchFunc = pFuncList->pfLineFullSearch;
- PLineFullSearchFunc pfHorizontalFullSearchFunc = pFuncList->pfLineFullSearch;
+ PLineFullSearchFunc pfVerticalFullSearchFunc = pFuncList->pfVerticalFullSearch;
+ PLineFullSearchFunc pfHorizontalFullSearchFunc = pFuncList->pfHorizontalFullSearch;
const int32_t iCurMeBlockPixX = pMe->iCurMeBlockPixX;
const int32_t iCurMeBlockQpelPixX = ((iCurMeBlockPixX)<<2);
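The key idea in VerticalFullSearchUsingSSE41 above is that transposing both the encoding block and the reference strip turns the vertical candidate walk into a horizontal one, so the same Hor8 SAD kernel serves both search directions; the scalar tail loop then covers whatever offsets remain after the 8-wide loop. The identity this relies on can be verified with a small self-contained C++ program (buffer sizes and the shift are illustrative):

#include <cassert>
#include <cstdint>
#include <cstdlib>

// Check the transpose identity behind the vertical search: the SAD against
// a vertically shifted reference equals the SAD against the horizontally
// shifted transpose of that reference.
int main() {
  const int N = 16, SHIFT = 3, H = N + 8;
  uint8_t ref[H][N], refT[N][H], enc[N][N], encT[N][N];
  for (int y = 0; y < H; ++y)
    for (int x = 0; x < N; ++x) { ref[y][x] = (uint8_t) (rand() & 255); refT[x][y] = ref[y][x]; }
  for (int y = 0; y < N; ++y)
    for (int x = 0; x < N; ++x) { enc[y][x] = (uint8_t) (rand() & 255); encT[x][y] = enc[y][x]; }
  int32_t iSadVertical = 0, iSadTransposed = 0;
  for (int y = 0; y < N; ++y)
    for (int x = 0; x < N; ++x) {
      iSadVertical   += abs (enc[y][x]  - ref[y + SHIFT][x]);  // vertical candidate
      iSadTransposed += abs (encT[y][x] - refT[y][x + SHIFT]); // same candidate, transposed
    }
  assert (iSadVertical == iSadTransposed);
  return 0;
}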
--- /dev/null
+++ b/codec/encoder/core/x86/matrix_transpose.asm
@@ -1,0 +1,395 @@
+;*!
+;* \copy
+;* Copyright (c) 2009-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;*  * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;*  * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*************************************************************************/
+
+%include "asm_inc.asm"
+
+;in: m0, m1, m2, m3, m4, m5, m6, m7
+;out: m0, m3, m5, m2, m7, m1, m6, m4
+%macro TRANSPOSE_8x8B_MMX 10
+ MMX_XSwap bw, %1, %2, %8
+ MMX_XSwap bw, %3, %4, %2
+ MMX_XSwap bw, %5, %6, %4
+ movq %6, %9
+ movq %10, %4
+ MMX_XSwap bw, %7, %6, %4
+
+ MMX_XSwap wd, %1, %3, %6
+ MMX_XSwap wd, %8, %2, %3
+ MMX_XSwap wd, %5, %7, %2
+ movq %7, %10
+ movq %10, %3
+ MMX_XSwap wd, %7, %4, %3
+
+ MMX_XSwap dq, %1, %5, %4
+ MMX_XSwap dq, %6, %2, %5
+ MMX_XSwap dq, %8, %7, %2
+ movq %7, %10
+ movq %10, %5
+ MMX_XSwap dq, %7, %3, %5
+
+ movq %3, %10
+%endmacro
+
+;in: m0, m3, m5, m2, m7, m1, m6, m4
+%macro TRANSPOSE8x8_WRITE_MMX 2 ; dst, dst_stride
+ movq [%1], mm0 ; result of line 1, x8 bytes
+ movq [%1+%2], mm3 ; result of line 2
+ lea %1, [%1+2*%2]
+ movq [%1], mm5 ; result of line 3
+ movq [%1+%2], mm2 ; result of line 4
+ lea %1, [%1+2*%2]
+ movq [%1], mm7 ; result of line 5
+ movq [%1+%2], mm1 ; result of line 6
+ lea %1, [%1+2*%2]
+ movq [%1], mm6 ; result of line 7
+ movq [%1+%2], mm4 ; result of line 8
+%endmacro
+
+;in: m0, m3, m5, m2, m7, m1, m6, m4
+%macro TRANSPOSE8x8_WRITE_ALT_MMX 3 ; dst, dst_stride, reg32
+ movq [%1], mm0 ; result of line 1, x8 bytes
+ movq [%1+%2], mm3 ; result of line 2
+ lea %3, [%1+2*%2]
+ movq [%3], mm5 ; result of line 3
+ movq [%3+%2], mm2 ; result of line 4
+ lea %3, [%3+2*%2]
+ movq [%3], mm7 ; result of line 5
+ movq [%3+%2], mm1 ; result of line 6
+ lea %3, [%3+2*%2]
+ movq [%3], mm6 ; result of line 7
+ movq [%3+%2], mm4 ; result of line 8
+%endmacro ; end of TRANSPOSE8x8_WRITE_ALT_MMX
+
+; for transpose 16x8
+
+;in: m0, m1, m2, m3, m4, m5, m6, m7
+;out: m4, m2, m3, m7, m5, m1, m6, m0
+%macro TRANSPOSE_8x16B_SSE2 10
+ SSE2_XSawp bw, %1, %2, %8
+ SSE2_XSawp bw, %3, %4, %2
+ SSE2_XSawp bw, %5, %6, %4
+ movdqa %6, %9
+ movdqa %10, %4
+ SSE2_XSawp bw, %7, %6, %4
+
+ SSE2_XSawp wd, %1, %3, %6
+ SSE2_XSawp wd, %8, %2, %3
+ SSE2_XSawp wd, %5, %7, %2
+ movdqa %7, %10
+ movdqa %10, %3
+ SSE2_XSawp wd, %7, %4, %3
+
+ SSE2_XSawp dq, %1, %5, %4
+ SSE2_XSawp dq, %6, %2, %5
+ SSE2_XSawp dq, %8, %7, %2
+ movdqa %7, %10
+ movdqa %10, %5
+ SSE2_XSawp dq, %7, %3, %5
+
+ SSE2_XSawp qdq, %1, %8, %3
+ SSE2_XSawp qdq, %4, %2, %8
+ SSE2_XSawp qdq, %6, %7, %2
+ movdqa %7, %10
+ movdqa %10, %1
+ SSE2_XSawp qdq, %7, %5, %1
+ movdqa %5, %10
+%endmacro ; end of TRANSPOSE_8x16B_SSE2
+
+
+%macro TRANSPOSE8x16_WRITE_SSE2 2 ; dst, dst_stride
+ movq [%1], xmm4 ; result of line 1, x8 bytes
+ movq [%1+%2], xmm2 ; result of line 2
+ lea %1, [%1+2*%2]
+ movq [%1], xmm3 ; result of line 3
+ movq [%1+%2], xmm7 ; result of line 4
+
+ lea %1, [%1+2*%2]
+ movq [%1], xmm5 ; result of line 5
+ movq [%1+%2], xmm1 ; result of line 6
+ lea %1, [%1+2*%2]
+ movq [%1], xmm6 ; result of line 7
+ movq [%1+%2], xmm0 ; result of line 8
+
+ lea %1, [%1+2*%2]
+ movhpd [%1], xmm4 ; result of line 9
+ movhpd [%1+%2], xmm2 ; result of line 10
+ lea %1, [%1+2*%2]
+ movhpd [%1], xmm3 ; result of line 11
+ movhpd [%1+%2], xmm7 ; result of line 12
+
+ lea %1, [%1+2*%2]
+ movhpd [%1], xmm5 ; result of line 13
+ movhpd [%1+%2], xmm1 ; result of line 14
+ lea %1, [%1+2*%2]
+ movhpd [%1], xmm6 ; result of line 15
+ movhpd [%1+%2], xmm0 ; result of line 16
+%endmacro ; end of TRANSPOSE8x16_WRITE_SSE2
+
+%macro TRANSPOSE8x16_WRITE_ALT_SSE2 3 ; dst, dst_stride, reg32
+ movq [%1], xmm4 ; result of line 1, x8 bytes
+ movq [%1+%2], xmm2 ; result of line 2
+ lea %3, [%1+2*%2]
+ movq [%3], xmm3 ; result of line 3
+ movq [%3+%2], xmm7 ; result of line 4
+
+ lea %3, [%3+2*%2]
+ movq [%3], xmm5 ; result of line 5
+ movq [%3+%2], xmm1 ; result of line 6
+ lea %3, [%3+2*%2]
+ movq [%3], xmm6 ; result of line 7
+ movq [%3+%2], xmm0 ; result of line 8
+
+ lea %3, [%3+2*%2]
+ movhpd [%3], xmm4 ; result of line 9
+ movhpd [%3+%2], xmm2 ; result of line 10
+ lea %3, [%3+2*%2]
+ movhpd [%3], xmm3 ; result of line 11
+ movhpd [%3+%2], xmm7 ; result of line 12
+
+ lea %3, [%3+2*%2]
+ movhpd [%3], xmm5 ; result of line 13
+ movhpd [%3+%2], xmm1 ; result of line 14
+ lea %3, [%3+2*%2]
+ movhpd [%3], xmm6 ; result of line 15
+ movhpd [%3+%2], xmm0 ; result of line 16
+%endmacro ; end of TRANSPOSE8x16_WRITE_ALT_SSE2
+
+
+SECTION .text
+
+WELS_EXTERN TransposeMatrixBlock16x16_sse2
+; void TransposeMatrixBlock16x16_sse2( void *dst/*16x16*/, const int32_t dst_stride, void *src/*16x16*/, const int32_t src_stride );
+ push r4
+ push r5
+ %assign push_num 2
+ LOAD_4_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+
+ mov r4, r7
+ and r4, 0Fh
+ sub r7, 10h
+ sub r7, r4
+ lea r5, [r3+r3*2]
+ ; top 8x16 block
+ movdqa xmm0, [r2]
+ movdqa xmm1, [r2+r3]
+ movdqa xmm2, [r2+r3*2]
+ movdqa xmm3, [r2+r5]
+ lea r2, [r2+r3*4]
+ movdqa xmm4, [r2]
+ movdqa xmm5, [r2+r3]
+ movdqa xmm6, [r2+r3*2]
+
+ ;in: m0, m1, m2, m3, m4, m5, m6, m7
+ ;out: m4, m2, m3, m7, m5, m1, m6, m0
+ TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r5], [r7]
+
+ TRANSPOSE8x16_WRITE_SSE2 r0, r1
+
+ ; bottom 8x16 block
+ lea r2, [r2+r3*4]
+ movdqa xmm0, [r2]
+ movdqa xmm1, [r2+r3]
+ movdqa xmm2, [r2+r3*2]
+ movdqa xmm3, [r2+r5]
+ lea r2, [r2+r3*4]
+ movdqa xmm4, [r2]
+ movdqa xmm5, [r2+r3]
+ movdqa xmm6, [r2+r3*2]
+
+ ;in: m0, m1, m2, m3, m4, m5, m6, m7
+ ;out: m4, m2, m3, m7, m5, m1, m6, m0
+ TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r5], [r7]
+
+ mov r5, r1
+ sal r5, 4
+ sub r0, r5
+ lea r0, [r0+r1*2+8]
+ TRANSPOSE8x16_WRITE_SSE2 r0, r1
+
+ add r7, r4
+ add r7, 10h
+ POP_XMM
+ LOAD_4_PARA_POP
+ pop r5
+ pop r4
+ ret
+
+WELS_EXTERN TransposeMatrixBlocksx16_sse2
+; void TransposeMatrixBlocksx16_sse2( void *dst/*W16x16*/, const int32_t dst_stride, void *src/*16xW16*/, const int32_t src_stride, const int32_t num_blocks );
+ push r5
+ push r6
+ %assign push_num 2
+ LOAD_5_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+ mov r5, r7
+ and r5, 0Fh
+ sub r7, 10h
+ sub r7, r5
+TRANSPOSE_LOOP_SSE2:
+    ; explicitly touch the next loop's data to prefetch it
+ lea r6, [r2+r3*8]
+ push r4
+%rep 8
+ mov r4, [r6]
+ mov r4, [r6+r3]
+ lea r6, [r6+r3*2]
+%endrep
+ pop r4
+ ; top 8x16 block
+ movdqa xmm0, [r2]
+ movdqa xmm1, [r2+r3]
+ lea r2, [r2+r3*2]
+ movdqa xmm2, [r2]
+ movdqa xmm3, [r2+r3]
+ lea r2, [r2+r3*2]
+ movdqa xmm4, [r2]
+ movdqa xmm5, [r2+r3]
+ lea r2, [r2+r3*2]
+ movdqa xmm6, [r2]
+
+ ;in: m0, m1, m2, m3, m4, m5, m6, m7
+ ;out: m4, m2, m3, m7, m5, m1, m6, m0
+ TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r3], [r7]
+ TRANSPOSE8x16_WRITE_ALT_SSE2 r0, r1, r6
+ lea r2, [r2+r3*2]
+
+ ; bottom 8x16 block
+ movdqa xmm0, [r2]
+ movdqa xmm1, [r2+r3]
+ lea r2, [r2+r3*2]
+ movdqa xmm2, [r2]
+ movdqa xmm3, [r2+r3]
+ lea r2, [r2+r3*2]
+ movdqa xmm4, [r2]
+ movdqa xmm5, [r2+r3]
+ lea r2, [r2+r3*2]
+ movdqa xmm6, [r2]
+
+ ;in: m0, m1, m2, m3, m4, m5, m6, m7
+ ;out: m4, m2, m3, m7, m5, m1, m6, m0
+ TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r3], [r7]
+ TRANSPOSE8x16_WRITE_ALT_SSE2 r0+8, r1, r6
+ lea r2, [r2+r3*2]
+ lea r0, [r0+16]
+ dec r4
+ jg near TRANSPOSE_LOOP_SSE2
+
+ add r7, r5
+ add r7, 10h
+ POP_XMM
+ LOAD_5_PARA_POP
+ pop r6
+ pop r5
+ ret
+
+WELS_EXTERN TransposeMatrixBlock8x8_mmx
+; void TransposeMatrixBlock8x8_mmx( void *dst/*8x8*/, const int32_t dst_stride, void *src/*8x8*/, const int32_t src_stride );
+ %assign push_num 0
+ LOAD_4_PARA
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ sub r7, 8
+
+ movq mm0, [r2]
+ movq mm1, [r2+r3]
+ lea r2, [r2+2*r3]
+ movq mm2, [r2]
+ movq mm3, [r2+r3]
+ lea r2, [r2+2*r3]
+ movq mm4, [r2]
+ movq mm5, [r2+r3]
+ lea r2, [r2+2*r3]
+ movq mm6, [r2]
+
+ ;in: m0, m1, m2, m3, m4, m5, m6, m7
+ ;out: m0, m3, m5, m2, m7, m1, m6, m4
+ TRANSPOSE_8x8B_MMX mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, [r2+r3], [r7]
+
+ TRANSPOSE8x8_WRITE_MMX r0, r1
+
+ emms
+ add r7, 8
+ LOAD_4_PARA_POP
+ ret
+
+WELS_EXTERN TransposeMatrixBlocksx8_mmx
+; void TransposeMatrixBlocksx8_mmx( void *dst/*8xW8*/, const int32_t dst_stride, void *src/*W8x8*/, const int32_t src_stride, const int32_t num_blocks );
+ push r5
+ push r6
+ %assign push_num 2
+ LOAD_5_PARA
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ SIGN_EXTENSION r4, r4d
+ sub r7, 8
+
+ lea r5, [r2+r3*8]
+
+TRANSPOSE_BLOCKS_X8_LOOP_MMX:
+    ; explicitly touch the next loop's data to prefetch it
+%rep 4
+ mov r6, [r5]
+ mov r6, [r5+r3]
+ lea r5, [r5+r3*2]
+%endrep
+ movq mm0, [r2]
+ movq mm1, [r2+r3]
+ lea r2, [r2+2*r3]
+ movq mm2, [r2]
+ movq mm3, [r2+r3]
+ lea r2, [r2+2*r3]
+ movq mm4, [r2]
+ movq mm5, [r2+r3]
+ lea r2, [r2+2*r3]
+ movq mm6, [r2]
+
+ ;in: m0, m1, m2, m3, m4, m5, m6, m7
+ ;out: m0, m3, m5, m2, m7, m1, m6, m4
+ TRANSPOSE_8x8B_MMX mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, [r2+r3], [r7]
+
+ TRANSPOSE8x8_WRITE_ALT_MMX r0, r1, r6
+ lea r0, [r0+8]
+ lea r2, [r2+2*r3]
+ dec r4
+ jg near TRANSPOSE_BLOCKS_X8_LOOP_MMX
+
+ emms
+ add r7, 8
+ LOAD_5_PARA_POP
+ pop r6
+ pop r5
+ ret
--- /dev/null
+++ b/codec/encoder/core/x86/sample_sc.asm
@@ -1,0 +1,225 @@
+;*!
+;* \copy
+;* Copyright (c) 2009-2013, Cisco Systems
+;* All rights reserved.
+;*
+;* Redistribution and use in source and binary forms, with or without
+;* modification, are permitted provided that the following conditions
+;* are met:
+;*
+;* * Redistributions of source code must retain the above copyright
+;* notice, this list of conditions and the following disclaimer.
+;*
+;* * Redistributions in binary form must reproduce the above copyright
+;* notice, this list of conditions and the following disclaimer in
+;* the documentation and/or other materials provided with the
+;* distribution.
+;*
+;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;* POSSIBILITY OF SUCH DAMAGE.
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+
+SECTION .text
+
+;**********************************************************************************************************************************
+;
+; uint32_t SampleSad16x16Hor8_sse41( uint8_t *src, int32_t stride_src, uint8_t *ref, int32_t stride_ref, uint16_t base_cost[8], int32_t *index_min_cost )
+;
+; \note:
+; src must be 16-byte aligned; alignment of ref is optional
+; \return value:
+; returns the minimal SAD cost; the corresponding offset index is written to index_min_cost
+;**********************************************************************************************************************************
+; try 8 mv via offset
+; xmm7 store sad costs
+%macro SAD_16x16_LINE_SSE41 4 ; src, ref, stride_src, stride_ref
+ movdqa xmm0, [%1]
+ movdqu xmm1, [%2]
+ movdqu xmm2, [%2+8h]
+ movdqa xmm3, xmm1
+ movdqa xmm4, xmm2
+
+ mpsadbw xmm1, xmm0, 0 ; 000 B
+ paddw xmm7, xmm1 ; accumulate cost
+
+ mpsadbw xmm3, xmm0, 5 ; 101 B
+ paddw xmm7, xmm3 ; accumulate cost
+
+ mpsadbw xmm2, xmm0, 2 ; 010 B
+ paddw xmm7, xmm2 ; accumulate cost
+
+ mpsadbw xmm4, xmm0, 7 ; 111 B
+ paddw xmm7, xmm4 ; accumulate cost
+
+ add %1, %3
+ add %2, %4
+%endmacro ; end of SAD_16x16_LINE_SSE41
+%macro SAD_16x16_LINE_SSE41E 4 ; src, ref, stride_src, stride_ref
+ movdqa xmm0, [%1]
+ movdqu xmm1, [%2]
+ movdqu xmm2, [%2+8h]
+ movdqa xmm3, xmm1
+ movdqa xmm4, xmm2
+
+ mpsadbw xmm1, xmm0, 0 ; 000 B
+ paddw xmm7, xmm1 ; accumulate cost
+
+ mpsadbw xmm3, xmm0, 5 ; 101 B
+ paddw xmm7, xmm3 ; accumulate cost
+
+ mpsadbw xmm2, xmm0, 2 ; 010 B
+ paddw xmm7, xmm2 ; accumulate cost
+
+ mpsadbw xmm4, xmm0, 7 ; 111 B
+ paddw xmm7, xmm4 ; accumulate cost
+%endmacro ; end of SAD_16x16_LINE_SSE41E
+
+WELS_EXTERN SampleSad16x16Hor8_sse41
+ %assign push_num 0
+ LOAD_6_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ pxor xmm7, xmm7
+
+ SAD_16x16_LINE_SSE41 r0, r2, r1, r3
+ SAD_16x16_LINE_SSE41 r0, r2, r1, r3
+ SAD_16x16_LINE_SSE41 r0, r2, r1, r3
+ SAD_16x16_LINE_SSE41 r0, r2, r1, r3
+
+ SAD_16x16_LINE_SSE41 r0, r2, r1, r3
+ SAD_16x16_LINE_SSE41 r0, r2, r1, r3
+ SAD_16x16_LINE_SSE41 r0, r2, r1, r3
+ SAD_16x16_LINE_SSE41 r0, r2, r1, r3
+
+ SAD_16x16_LINE_SSE41 r0, r2, r1, r3
+ SAD_16x16_LINE_SSE41 r0, r2, r1, r3
+ SAD_16x16_LINE_SSE41 r0, r2, r1, r3
+ SAD_16x16_LINE_SSE41 r0, r2, r1, r3
+
+ SAD_16x16_LINE_SSE41 r0, r2, r1, r3
+ SAD_16x16_LINE_SSE41 r0, r2, r1, r3
+ SAD_16x16_LINE_SSE41 r0, r2, r1, r3
+ SAD_16x16_LINE_SSE41E r0, r2, r1, r3
+
+ pxor xmm0, xmm0
+ movdqa xmm6, xmm7
+ punpcklwd xmm6, xmm0
+ punpckhwd xmm7, xmm0
+
+ movdqa xmm5, [r4]
+ movdqa xmm4, xmm5
+ punpcklwd xmm4, xmm0
+ punpckhwd xmm5, xmm0
+
+ paddd xmm4, xmm6
+ paddd xmm5, xmm7
+ movdqa xmm3, xmm4
+ pminud xmm3, xmm5
+ pshufd xmm2, xmm3, 01001110B
+ pminud xmm2, xmm3
+ pshufd xmm3, xmm2, 10110001B
+ pminud xmm2, xmm3
+ movd retrd, xmm2
+ pcmpeqd xmm4, xmm2
+ movmskps r2d, xmm4
+ bsf r1d, r2d
+ jnz near WRITE_INDEX
+
+ pcmpeqd xmm5, xmm2
+ movmskps r2d, xmm5
+ bsf r1d, r2d
+ add r1d, 4
+
+WRITE_INDEX:
+ mov [r5], r1d
+ POP_XMM
+ LOAD_6_PARA_POP
+ ret
+
+;**********************************************************************************************************************************
+;
+; uint32_t SampleSad8x8Hor8_sse41( uint8_t *src, int32_t stride_src, uint8_t *ref, int32_t stride_ref, uint16_t base_cost[8], int32_t *index_min_cost )
+;
+; \note:
+; 16-byte alignment of src and ref is optional, since inter 8x8 blocks need not be aligned
+; \return value:
+; returns the minimal SAD cost; the corresponding offset index is written to index_min_cost
+;
+;**********************************************************************************************************************************
+; try 8 mv via offset
+; xmm7 store sad costs
+%macro SAD_8x8_LINE_SSE41 4 ; src, ref, stride_src, stride_ref
+ movdqu xmm0, [%1]
+ movdqu xmm1, [%2]
+ movdqa xmm2, xmm1
+
+ mpsadbw xmm1, xmm0, 0 ; 000 B
+ paddw xmm7, xmm1 ; accumulate cost
+
+ mpsadbw xmm2, xmm0, 5 ; 101 B
+ paddw xmm7, xmm2 ; accumulate cost
+
+ add %1, %3
+ add %2, %4
+%endmacro ; end of SAD_8x8_LINE_SSE41
+%macro SAD_8x8_LINE_SSE41E 4 ; src, ref, stride_src, stride_ref
+ movdqu xmm0, [%1]
+ movdqu xmm1, [%2]
+ movdqa xmm2, xmm1
+
+ mpsadbw xmm1, xmm0, 0 ; 000 B
+ paddw xmm7, xmm1 ; accumulate cost
+
+ mpsadbw xmm2, xmm0, 5 ; 101 B
+ paddw xmm7, xmm2 ; accumulate cost
+%endmacro ; end of SAD_8x8_LINE_SSE41E
+
+WELS_EXTERN SampleSad8x8Hor8_sse41
+ %assign push_num 0
+ LOAD_6_PARA
+ PUSH_XMM 8
+ SIGN_EXTENSION r1, r1d
+ SIGN_EXTENSION r3, r3d
+ movdqa xmm7, [r4] ; load base cost list
+
+ SAD_8x8_LINE_SSE41 r0, r2, r1, r3
+ SAD_8x8_LINE_SSE41 r0, r2, r1, r3
+ SAD_8x8_LINE_SSE41 r0, r2, r1, r3
+ SAD_8x8_LINE_SSE41 r0, r2, r1, r3
+
+ SAD_8x8_LINE_SSE41 r0, r2, r1, r3
+ SAD_8x8_LINE_SSE41 r0, r2, r1, r3
+ SAD_8x8_LINE_SSE41 r0, r2, r1, r3
+ SAD_8x8_LINE_SSE41E r0, r2, r1, r3
+
+    phminposuw xmm0, xmm7 ; horizontal search for the minimal SAD cost and its index
+ movd retrd, xmm0 ; for return: DEST[15:0] <- MIN, DEST[31:16] <- INDEX
+ mov r1d, retrd
+ and retrd, 0xFFFF
+ sar r1d, 16
+ mov [r5], r1d
+
+ POP_XMM
+ LOAD_6_PARA_POP
+ ret
+
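Both kernels are built on the SSE4.1 mpsadbw instruction, which produces eight word-sized SADs of 4-byte groups at once; the four issues per line in SAD_16x16_LINE_SSE41 (two in the 8x8 variant) tile one full row across the 8 candidate offsets, and the 16x16 variant widens the accumulators to dwords before adding base_cost so the sums cannot overflow 16 bits. In scalar terms, one 16-wide line iteration accumulates the following (a hypothetical model, with pCost playing the role of xmm7):

#include <cstdint>
#include <cstdlib>

// Hypothetical scalar model of one SAD_16x16_LINE_SSE41 iteration: for
// each of the 8 horizontal offsets d, add the SAD of one 16-byte row of
// the encoding block against the reference row shifted by d.
static void SadLine16Hor8_ref (const uint8_t* pEnc, const uint8_t* pRef,
                               uint16_t pCost[8]) {
  for (int32_t d = 0; d < 8; ++ d)
    for (int32_t i = 0; i < 16; ++ i)
      pCost[d] = (uint16_t) (pCost[d] + abs (pEnc[i] - pRef[d + i]));
}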
--- a/codec/encoder/targets.mk
+++ b/codec/encoder/targets.mk
@@ -40,8 +40,10 @@
$(ENCODER_SRCDIR)/core/x86/coeff.asm\
$(ENCODER_SRCDIR)/core/x86/dct.asm\
$(ENCODER_SRCDIR)/core/x86/intra_pred.asm\
+ $(ENCODER_SRCDIR)/core/x86/matrix_transpose.asm\
$(ENCODER_SRCDIR)/core/x86/memzero.asm\
$(ENCODER_SRCDIR)/core/x86/quant.asm\
+ $(ENCODER_SRCDIR)/core/x86/sample_sc.asm\
$(ENCODER_SRCDIR)/core/x86/score.asm\
ENCODER_OBJS += $(ENCODER_ASM_SRCS:.asm=.$(OBJ))
--- a/test/encoder/EncUT_MotionEstimate.cpp
+++ b/test/encoder/EncUT_MotionEstimate.cpp
@@ -5,6 +5,7 @@
#include "sample.h"
#include "svc_motion_estimate.h"
#include "wels_func_ptr_def.h"
+#include "cpu.h"
using namespace WelsSVCEnc;
@@ -43,11 +44,12 @@
m_iMaxSearchBlock = 16;
m_uiMvdTableSize = (1 + (648 << 1));
+ pMa = new CMemoryAlign(0);
m_pRefPic = static_cast<uint8_t *>
- (malloc(m_iWidth*m_iHeight));
+ (pMa->WelsMalloc(m_iWidth*m_iHeight, "RefPic"));
ASSERT_TRUE( NULL != m_pRefPic );
m_pSrcBlock = static_cast<uint8_t *>
- (malloc(m_iMaxSearchBlock*m_iMaxSearchBlock));
+ (pMa->WelsMalloc(m_iMaxSearchBlock*m_iMaxSearchBlock, "SrcBlock"));
ASSERT_TRUE( NULL != m_pSrcBlock );
m_pMvdCostTable=new uint16_t[52*m_uiMvdTableSize];
ASSERT_TRUE( NULL != m_pMvdCostTable );
@@ -54,8 +56,9 @@
}
virtual void TearDown() {
delete [] m_pMvdCostTable;
- free( m_pRefPic );
- free( m_pSrcBlock );
+ pMa->WelsFree( m_pRefPic, "RefPic");
+ pMa->WelsFree( m_pSrcBlock, "SrcBlock");
+ delete pMa;
}
public:
uint8_t *m_pRefPic;
@@ -66,6 +69,7 @@
int32_t m_iWidth;
int32_t m_iHeight;
int32_t m_iMaxSearchBlock;
+ CMemoryAlign *pMa;
};
@@ -243,4 +247,134 @@
ASSERT_TRUE(iTryTimes > 0);
  //it is possible that the ref at a different position is identical, but that should have low probability
}
-}
\ No newline at end of file
+}
+
+#ifdef X86_ASM
+TEST_F(MotionEstimateTest, TestVerticalSearch_SSE41)
+{
+ const int32_t kiMaxBlock16Sad = 72000;//a rough number
+ SWelsFuncPtrList sFuncList;
+ SWelsME sMe;
+
+ srand((uint32_t)time(NULL));
+ const uint8_t kuiQp = rand()%52;
+ InitMe(kuiQp, 648, m_uiMvdTableSize, m_pMvdCostTable, &sMe);
+
+ SMVUnitXY sTargetMv;
+  WelsInitSampleSadFunc( &sFuncList, 0 );//test C functions
+ WelsInitMeFunc(&sFuncList, WELS_CPU_SSE41, 1);
+
+ uint8_t *pRefPicCenter = m_pRefPic+(m_iHeight/2)*m_iWidth+(m_iWidth/2);
+ sMe.iCurMeBlockPixX = (m_iWidth/2);
+ sMe.iCurMeBlockPixY = (m_iHeight/2);
+
+ bool bDataGeneratorSucceed = false;
+ bool bFoundMatch = false;
+ int32_t iTryTimes=100;
+
+ sTargetMv.iMvX = 0;
+ sTargetMv.iMvY = WELS_MAX(INTPEL_NEEDED_MARGIN, rand()%m_iHeight-INTPEL_NEEDED_MARGIN);
+ bDataGeneratorSucceed = false;
+ bFoundMatch = false;
+ while (!bFoundMatch && (iTryTimes--)>0) {
+ if (!YUVPixelDataGenerator( m_pRefPic, m_iWidth, m_iHeight, m_iWidth ))
+ continue;
+
+ bDataGeneratorSucceed = true;
+ CopyTargetBlock( m_pSrcBlock, 16, sTargetMv, m_iWidth, pRefPicCenter);
+
+ //clean the sMe status
+ sMe.uiBlockSize = rand()%5;
+ sMe.pEncMb = m_pSrcBlock;
+ sMe.pRefMb = pRefPicCenter;
+ sMe.pColoRefMb = pRefPicCenter;
+ sMe.sMv.iMvX = sMe.sMv.iMvY = 0;
+ sMe.uiSadCost = sMe.uiSatdCost = kiMaxBlock16Sad;
+ const int32_t iCurMeBlockPixX = sMe.iCurMeBlockPixX;
+ const int32_t iCurMeBlockQpelPixX = ((iCurMeBlockPixX)<<2);
+ const int32_t iCurMeBlockPixY = sMe.iCurMeBlockPixY;
+ const int32_t iCurMeBlockQpelPixY = ((iCurMeBlockPixY)<<2);
+ uint16_t* pMvdCostX = sMe.pMvdCost - iCurMeBlockQpelPixX - sMe.sMvp.iMvX; //do the offset here
+ uint16_t* pMvdCostY = sMe.pMvdCost - iCurMeBlockQpelPixY - sMe.sMvp.iMvY;
+ VerticalFullSearchUsingSSE41 ( &sFuncList, &sMe,
+ pMvdCostY, pMvdCostX[ iCurMeBlockQpelPixX ],
+ m_iMaxSearchBlock, m_iWidth,
+ INTPEL_NEEDED_MARGIN,
+ m_iHeight-INTPEL_NEEDED_MARGIN, true );
+
+    //the final selection may be affected by the MVD cost; in that case a smaller MvY is preferred
+ bFoundMatch = (sMe.sMv.iMvX==0
+ &&(sMe.sMv.iMvY==sTargetMv.iMvY||abs(sMe.sMv.iMvY)<abs(sTargetMv.iMvY)));
+ //printf("TestVerticalSearch Target: %d,%d\n", sTargetMv.iMvX, sTargetMv.iMvY);
+ }
+ if (bDataGeneratorSucceed) {
+    //if the data generator never succeeded, there is no point in checking iTryTimes
+    ASSERT_TRUE(iTryTimes > 0);
+    //it is possible that the ref at a different position is identical, but that should have low probability
+ }
+}
+
+TEST_F(MotionEstimateTest, TestHorizontalSearch_SSE41)
+{
+ const int32_t kiMaxBlock16Sad = 72000;//a rough number
+ SWelsFuncPtrList sFuncList;
+ SWelsME sMe;
+
+ srand((uint32_t)time(NULL));
+ const uint8_t kuiQp = rand()%52;
+ InitMe(kuiQp, 648, m_uiMvdTableSize, m_pMvdCostTable, &sMe);
+
+ SMVUnitXY sTargetMv;
+  WelsInitSampleSadFunc( &sFuncList, 0 );//test C functions
+ WelsInitMeFunc(&sFuncList, WELS_CPU_SSE41, 1);
+
+ uint8_t *pRefPicCenter = m_pRefPic+(m_iHeight/2)*m_iWidth+(m_iWidth/2);
+ sMe.iCurMeBlockPixX = (m_iWidth/2);
+ sMe.iCurMeBlockPixY = (m_iHeight/2);
+
+ bool bDataGeneratorSucceed = false;
+ bool bFoundMatch = false;
+ int32_t iTryTimes=100;
+
+ sTargetMv.iMvX = WELS_MAX(INTPEL_NEEDED_MARGIN, rand()%m_iWidth-INTPEL_NEEDED_MARGIN);
+ sTargetMv.iMvY = 0;
+ bDataGeneratorSucceed = false;
+ bFoundMatch = false;
+ while (!bFoundMatch && (iTryTimes--)>0) {
+ if (!YUVPixelDataGenerator( m_pRefPic, m_iWidth, m_iHeight, m_iWidth ))
+ continue;
+
+ bDataGeneratorSucceed = true;
+ CopyTargetBlock( m_pSrcBlock, 16, sTargetMv, m_iWidth, pRefPicCenter);
+
+ //clean the sMe status
+ sMe.uiBlockSize = rand()%5;
+ sMe.pEncMb = m_pSrcBlock;
+ sMe.pRefMb = pRefPicCenter;
+ sMe.pColoRefMb = pRefPicCenter;
+ sMe.sMv.iMvX = sMe.sMv.iMvY = 0;
+ sMe.uiSadCost = sMe.uiSatdCost = kiMaxBlock16Sad;
+ const int32_t iCurMeBlockPixX = sMe.iCurMeBlockPixX;
+ const int32_t iCurMeBlockQpelPixX = ((iCurMeBlockPixX)<<2);
+ const int32_t iCurMeBlockPixY = sMe.iCurMeBlockPixY;
+ const int32_t iCurMeBlockQpelPixY = ((iCurMeBlockPixY)<<2);
+ uint16_t* pMvdCostX = sMe.pMvdCost - iCurMeBlockQpelPixX - sMe.sMvp.iMvX; //do the offset here
+ uint16_t* pMvdCostY = sMe.pMvdCost - iCurMeBlockQpelPixY - sMe.sMvp.iMvY;
+ HorizontalFullSearchUsingSSE41 ( &sFuncList, &sMe,
+ pMvdCostX, pMvdCostY[ iCurMeBlockQpelPixY ],
+ m_iMaxSearchBlock, m_iWidth,
+ INTPEL_NEEDED_MARGIN,
+ m_iWidth-INTPEL_NEEDED_MARGIN, false );
+
+    //the final selection may be affected by the MVD cost; in that case a smaller MvX is preferred
+ bFoundMatch = (sMe.sMv.iMvY==0
+ &&(sMe.sMv.iMvX==sTargetMv.iMvX||abs(sMe.sMv.iMvX)<abs(sTargetMv.iMvX)));
+ //printf("TestHorizontalSearch Target: %d,%d\n", sTargetMv.iMvX, sTargetMv.iMvY);
+ }
+ if (bDataGeneratorSucceed) {
+    //if the data generator never succeeded, there is no point in checking iTryTimes
+    ASSERT_TRUE(iTryTimes > 0);
+    //it is possible that the ref at a different position is identical, but that should have low probability
+ }
+}
+#endif
\ No newline at end of file