ref: ea40ce07a8b12df7f68ea1af6a8715d7a012b214
dir: /codec/encoder/core/src/slice_multi_threading.cpp/
/*! * \copy * Copyright (c) 2010-2013, Cisco Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * * \file slice_multi_threading.h * * \brief pSlice based multiple threading * * \date 04/16/2010 Created * ************************************************************************************* */ #if defined(MT_ENABLED) #include <assert.h> #if defined(__GNUC__) && !defined(_WIN32) #include <semaphore.h> #ifndef SEM_NAME_MAX // length of semaphore name should be system constrained at least on mac 10.7 #define SEM_NAME_MAX 32 #endif//SEM_NAME_MAX #endif//__GNUC__ #include "slice_multi_threading.h" #include "mt_defs.h" #include "nal_encap.h" #include "utils.h" #include "encoder.h" #include "svc_encode_slice.h" #include "deblocking.h" #include "svc_enc_golomb.h" #include "crt_util_safe_x.h" // for safe crt like calls #include "rc.h" #if defined(X86_ASM) #include "cpu.h" #endif//X86_ASM #include "measure_time.h" namespace WelsSVCEnc { void UpdateMbListNeighborParallel (SSliceCtx* pSliceCtx, SMB* pMbList, const int32_t uiSliceIdc) { const uint8_t* kpMbMap = pSliceCtx->pOverallMbMap; const int32_t kiMbWidth = pSliceCtx->iMbWidth; int32_t iIdx = pSliceCtx->pFirstMbInSlice[uiSliceIdc]; const int32_t kiEndMbInSlice = iIdx + pSliceCtx->pCountMbNumInSlice[uiSliceIdc] - 1; do { SMB* pMb = &pMbList[iIdx]; uint32_t uiNeighborAvailFlag = 0; const int32_t kiMbXY = pMb->iMbXY; const int32_t kiMbX = pMb->iMbX; const int32_t kiMbY = pMb->iMbY; bool bLeft; bool bTop; bool bLeftTop; bool bRightTop; int32_t iLeftXY, iTopXY, iLeftTopXY, iRightTopXY; iLeftXY = kiMbXY - 1; iTopXY = kiMbXY - kiMbWidth; iLeftTopXY = iTopXY - 1; iRightTopXY = iTopXY + 1; bLeft = (kiMbX > 0) && (uiSliceIdc == kpMbMap[iLeftXY]); bTop = (kiMbY > 0) && (uiSliceIdc == kpMbMap[iTopXY]); bLeftTop = (kiMbX > 0) && (kiMbY > 0) && (uiSliceIdc == kpMbMap[iLeftTopXY]); bRightTop = (kiMbX < (kiMbWidth - 1)) && (kiMbY > 0) && (uiSliceIdc == kpMbMap[iRightTopXY]); if (bLeft) { uiNeighborAvailFlag |= LEFT_MB_POS; } if (bTop) { uiNeighborAvailFlag |= TOP_MB_POS; } if (bLeftTop) { uiNeighborAvailFlag |= TOPLEFT_MB_POS; } if (bRightTop) { uiNeighborAvailFlag |= TOPRIGHT_MB_POS; } pMb->uiNeighborAvail = (uint8_t)uiNeighborAvailFlag; pMb->uiSliceIdc = uiSliceIdc; ++ iIdx; } while (iIdx <= kiEndMbInSlice); } void CalcSliceComplexRatio (void* pRatio, SSliceCtx* pSliceCtx, uint32_t* pSliceConsume) { float* pRatioList = (float*)pRatio; float fAvI[MAX_SLICES_NUM]; float fSumAv = .0f; uint32_t* pSliceTime = (uint32_t*)pSliceConsume; int32_t* pCountMbInSlice = (int32_t*)pSliceCtx->pCountMbNumInSlice; const int32_t kiSliceCount = pSliceCtx->iSliceNumInFrame; int32_t iSliceIdx = 0; #if defined(X86_ASM) WelsEmms(); #endif //X86_ASM while (iSliceIdx < kiSliceCount) { fAvI[iSliceIdx] = 1.0f * pCountMbInSlice[iSliceIdx] / pSliceTime[iSliceIdx]; #if defined(ENABLE_TRACE_MT) WelsLog (NULL, WELS_LOG_DEBUG, "[MT] CalcSliceComplexRatio(), pSliceConsumeTime[%d]= %d us, slice_run= %d\n", iSliceIdx, pSliceTime[iSliceIdx], pCountMbInSlice[iSliceIdx]); #endif//ENABLE_TRACE_MT fSumAv += fAvI[iSliceIdx]; ++ iSliceIdx; } while (-- iSliceIdx >= 0) { pRatioList[iSliceIdx] = fAvI[iSliceIdx] / fSumAv; } } #if defined(MT_ENABLED) int32_t NeedDynamicAdjust (void* pConsumeTime, const int32_t iSliceNum) { uint32_t* pSliceConsume = (uint32_t*)pConsumeTime; uint32_t uiTotalConsume = 0; int32_t iSliceIdx = 0; int32_t iNeedAdj = false; #if defined(X86_ASM) WelsEmms(); #endif //X86_ASM while (iSliceIdx < iSliceNum) { uiTotalConsume += pSliceConsume[iSliceIdx] + pSliceConsume[1 + iSliceIdx]; iSliceIdx += 2; } if (uiTotalConsume == 0) { #if defined(ENABLE_TRACE_MT) WelsLog (NULL, WELS_LOG_DEBUG, "[MT] NeedDynamicAdjust(), herein do no adjust due first picture, iCountSliceNum= %d\n", iSliceNum); #endif//ENABLE_TRACE_MT return false; } iSliceIdx = 0; float fThr = EPSN; // threshold for various cores cases float fRmse = .0f; // root mean square error of pSlice consume ratios const float kfMeanRatio = 1.0f / iSliceNum; do { const float fRatio = 1.0f * pSliceConsume[iSliceIdx] / uiTotalConsume; const float fDiffRatio = fRatio - kfMeanRatio; fRmse += (fDiffRatio * fDiffRatio); ++ iSliceIdx; } while (iSliceIdx + 1 < iSliceNum); fRmse = sqrtf (fRmse / iSliceNum); if (iSliceNum >= 8) { fThr += THRESHOLD_RMSE_CORE8; } else if (iSliceNum >= 4) { fThr += THRESHOLD_RMSE_CORE4; } else if (iSliceNum >= 2) { fThr += THRESHOLD_RMSE_CORE2; } else fThr = 1.0f; if (fRmse > fThr) iNeedAdj = true; #if defined(ENABLE_TRACE_MT) WelsLog (NULL, WELS_LOG_DEBUG, "[MT] NeedDynamicAdjust(), herein adjustment decision is made (iNeedAdj= %d) by: fRmse of pSlice complexity ratios %.6f, the corresponding threshold %.6f, iCountSliceNum %d\n", iNeedAdj, fRmse, fThr, iSliceNum); #endif//ENABLE_TRACE_MT return iNeedAdj; } #endif void DynamicAdjustSlicing (sWelsEncCtx* pCtx, SDqLayer* pCurDqLayer, void* pComplexRatio, int32_t iCurDid) { SSliceCtx* pSliceCtx = pCurDqLayer->pSliceEncCtx; const int32_t kiCountSliceNum = pSliceCtx->iSliceNumInFrame; const int32_t kiCountNumMb = pSliceCtx->iMbNumInFrame; int32_t iMinimalMbNum = pSliceCtx->iMbWidth; // in theory we need only 1 SMB, here let it as one SMB row required int32_t iMaximalMbNum = 0; // dynamically assign later float* pSliceComplexRatio = (float*)pComplexRatio; int32_t iMbNumLeft = kiCountNumMb; int32_t iRunLen[MAX_THREADS_NUM] = {0}; int32_t iSliceIdx = 0; int32_t iNumMbInEachGom; SWelsSvcRc* pWelsSvcRc = &pCtx->pWelsSvcRc[iCurDid]; if (pCtx->pSvcParam->bEnableRc) { iNumMbInEachGom = pWelsSvcRc->iNumberMbGom; if (iNumMbInEachGom <= 0) { WelsLog (pCtx, WELS_LOG_ERROR, "[MT] DynamicAdjustSlicing(), invalid iNumMbInEachGom= %d from RC, iDid= %d, iCountNumMb= %d\n", iNumMbInEachGom, iCurDid, kiCountNumMb); return; } // do not adjust in case no extra iNumMbInEachGom based left for slicing adjustment, // extra MB of non integrated GOM assigned at the last pSlice in default, keep up on early initial result. if (iNumMbInEachGom * kiCountSliceNum >= kiCountNumMb) { return; } iMinimalMbNum = iNumMbInEachGom; } if (kiCountSliceNum < 2 || (kiCountSliceNum & 0x01)) // we need suppose uiSliceNum is even for multiple threading return; iMaximalMbNum = kiCountNumMb - (kiCountSliceNum - 1) * iMinimalMbNum; #if defined(X86_ASM) WelsEmms(); #endif //X86_ASM #if defined(ENABLE_TRACE_MT) WelsLog (pCtx, WELS_LOG_DEBUG, "[MT] DynamicAdjustSlicing(), iDid= %d, iCountNumMb= %d\n", iCurDid, kiCountNumMb); #endif//ENABLE_TRACE_MT iSliceIdx = 0; while (iSliceIdx + 1 < kiCountSliceNum) { int32_t iNumMbAssigning = (int32_t) (kiCountNumMb * pSliceComplexRatio[iSliceIdx] + EPSN); // GOM boundary aligned if (pCtx->pSvcParam->bEnableRc) { iNumMbAssigning = (int32_t) (1.0f * iNumMbAssigning / iNumMbInEachGom + 0.5f + EPSN) * iNumMbInEachGom; } // make sure one GOM at least in each pSlice for safe if (iNumMbAssigning < iMinimalMbNum) iNumMbAssigning = iMinimalMbNum; else if (iNumMbAssigning > iMaximalMbNum) iNumMbAssigning = iMaximalMbNum; assert (iNumMbAssigning > 0); iMbNumLeft -= iNumMbAssigning; if (iMbNumLeft <= 0) { // error due to we can not support slice_skip now yet, do not adjust this time assert (0); return; } iRunLen[iSliceIdx] = iNumMbAssigning; #if defined(ENABLE_TRACE_MT) WelsLog (pCtx, WELS_LOG_DEBUG, "[MT] DynamicAdjustSlicing(), uiSliceIdx= %d, pSliceComplexRatio= %.2f, slice_run_org= %d, slice_run_adj= %d\n", iSliceIdx, pSliceComplexRatio[iSliceIdx], pSliceCtx->pCountMbNumInSlice[iSliceIdx], iNumMbAssigning); #endif//ENABLE_TRACE_MT ++ iSliceIdx; iMaximalMbNum = iMbNumLeft - (kiCountSliceNum - iSliceIdx - 1) * iMinimalMbNum; // get maximal num_mb in left parts } iRunLen[iSliceIdx] = iMbNumLeft; #if defined(ENABLE_TRACE_MT) WelsLog (pCtx, WELS_LOG_DEBUG, "[MT] DynamicAdjustSlicing(), iSliceIdx= %d, pSliceComplexRatio= %.2f, slice_run_org= %d, slice_run_adj= %d\n", iSliceIdx, pSliceComplexRatio[iSliceIdx], pSliceCtx->pCountMbNumInSlice[iSliceIdx], iMbNumLeft); #endif//ENABLE_TRACE_MT if (DynamicAdjustSlicePEncCtxAll (pSliceCtx, iRunLen) == 0) { const int32_t kiThreadNum = pCtx->pSvcParam->iCountThreadsNum; int32_t iThreadIdx = 0; do { #ifdef _WIN32 WelsEventSignal (&pCtx->pSliceThreading->pUpdateMbListEvent[iThreadIdx]); #else WelsEventSignal (pCtx->pSliceThreading->pUpdateMbListEvent[iThreadIdx]); #endif//_WIN32 ++ iThreadIdx; } while (iThreadIdx < kiThreadNum); WelsMultipleEventsWaitAllBlocking (kiThreadNum, &pCtx->pSliceThreading->pFinUpdateMbListEvent[0]); } } int32_t RequestMtResource (sWelsEncCtx** ppCtx, SWelsSvcCodingParam* pCodingParam, const int32_t iCountBsLen, const int32_t iTargetSpatialBsSize) { CMemoryAlign* pMa = NULL; SWelsSvcCodingParam* pPara = NULL; SSliceThreading* pSmt = NULL; SWelsSliceBs* pSliceB = NULL; uint8_t* pBsBase = NULL; int32_t iNumSpatialLayers = 0; int32_t iThreadNum = 0; int32_t iIdx = 0; int32_t iSliceBsBufferSize = 0; int16_t iMaxSliceNum = 1; int32_t iReturn = ENC_RETURN_SUCCESS; if (NULL == ppCtx || NULL == pCodingParam || NULL == *ppCtx || iCountBsLen <= 0) return 1; pMa = (*ppCtx)->pMemAlign; pPara = pCodingParam; iNumSpatialLayers = pPara->iSpatialLayerNum; iThreadNum = pPara->iCountThreadsNum; iMaxSliceNum = (*ppCtx)->iMaxSliceCount; pSmt = (SSliceThreading*)pMa->WelsMalloc (sizeof (SSliceThreading), "SSliceThreading"); WELS_VERIFY_RETURN_PROC_IF (1, (NULL == pSmt), FreeMemorySvc (ppCtx)) (*ppCtx)->pSliceThreading = pSmt; pSmt->pThreadPEncCtx = (SSliceThreadPrivateData*)pMa->WelsMalloc (sizeof (SSliceThreadPrivateData) * iThreadNum, "pThreadPEncCtx"); WELS_VERIFY_RETURN_PROC_IF (1, (NULL == pSmt->pThreadPEncCtx), FreeMemorySvc (ppCtx)) pSmt->pThreadHandles = (WELS_THREAD_HANDLE*)pMa->WelsMalloc (sizeof (WELS_THREAD_HANDLE) * iThreadNum, "pThreadHandles"); WELS_VERIFY_RETURN_PROC_IF (1, (NULL == pSmt->pThreadHandles), FreeMemorySvc (ppCtx)) #ifdef _WIN32 pSmt->pSliceCodedEvent = (WELS_EVENT*)pMa->WelsMalloc (sizeof (WELS_EVENT) * iThreadNum, "pSliceCodedEvent"); WELS_VERIFY_RETURN_PROC_IF (1, (NULL == pSmt->pSliceCodedEvent), FreeMemorySvc (ppCtx)) pSmt->pReadySliceCodingEvent = (WELS_EVENT*)pMa->WelsMalloc (sizeof (WELS_EVENT) * iThreadNum, "pReadySliceCodingEvent"); WELS_VERIFY_RETURN_PROC_IF (1, (NULL == pSmt->pReadySliceCodingEvent), FreeMemorySvc (ppCtx)) pSmt->pFinSliceCodingEvent = (WELS_EVENT*)pMa->WelsMalloc (sizeof (WELS_EVENT) * iThreadNum, "pFinSliceCodingEvent"); WELS_VERIFY_RETURN_PROC_IF (1, (NULL == pSmt->pFinSliceCodingEvent), FreeMemorySvc (ppCtx)) #endif//_WIN32 #if defined(__GNUC__) pSmt->pUpdateMbListThrdHandles = (WELS_THREAD_HANDLE*)pMa->WelsMalloc (sizeof (WELS_THREAD_HANDLE) * iThreadNum, "pUpdateMbListThrdHandles"); WELS_VERIFY_RETURN_PROC_IF (1, (NULL == pSmt->pUpdateMbListThrdHandles), FreeMemorySvc (ppCtx)) #endif//__GNUC__ #ifdef _WIN32 pSmt->pUpdateMbListEvent = (WELS_EVENT*)pMa->WelsMalloc (sizeof (WELS_EVENT) * iThreadNum, "pUpdateMbListEvent"); WELS_VERIFY_RETURN_PROC_IF (1, (NULL == pSmt->pUpdateMbListEvent), FreeMemorySvc (ppCtx)) pSmt->pFinUpdateMbListEvent = (WELS_EVENT*)pMa->WelsMalloc (sizeof (WELS_EVENT) * iThreadNum, "pFinUpdateMbListEvent"); WELS_VERIFY_RETURN_PROC_IF (1, (NULL == pSmt->pFinUpdateMbListEvent), FreeMemorySvc (ppCtx)) #endif//_WIN32 #ifdef _WIN32 pSmt->pExitEncodeEvent = (WELS_EVENT*)pMa->WelsMalloc (sizeof (WELS_EVENT) * iThreadNum, "pExitEncodeEvent"); WELS_VERIFY_RETURN_PROC_IF (1, (NULL == pSmt->pExitEncodeEvent), FreeMemorySvc (ppCtx)) #endif//_WIN32 iIdx = 0; while (iIdx < iNumSpatialLayers) { SSliceConfig* pMso = &pPara->sDependencyLayers[iIdx].sSliceCfg; const int32_t kiSliceNum = pMso->sSliceArgument.uiSliceNum; if (pMso->uiSliceMode == SM_FIXEDSLCNUM_SLICE && pPara->iMultipleThreadIdc > 1 && pPara->iMultipleThreadIdc >= kiSliceNum) { pSmt->pSliceConsumeTime[iIdx] = (uint32_t*)pMa->WelsMallocz (kiSliceNum * sizeof (uint32_t), "pSliceConsumeTime[]"); WELS_VERIFY_RETURN_PROC_IF (1, (NULL == pSmt->pSliceConsumeTime[iIdx]), FreeMemorySvc (ppCtx)) pSmt->pSliceComplexRatio[iIdx] = (float*)pMa->WelsMalloc (kiSliceNum * sizeof (float), "pSliceComplexRatio[]"); WELS_VERIFY_RETURN_PROC_IF (1, (NULL == pSmt->pSliceComplexRatio[iIdx]), FreeMemorySvc (ppCtx)) } else { pSmt->pSliceConsumeTime[iIdx] = NULL; pSmt->pSliceComplexRatio[iIdx] = NULL; } ++ iIdx; } // NULL for pSliceConsumeTime[iIdx]: iIdx from iNumSpatialLayers to MAX_DEPENDENCY_LAYERS #ifdef MT_DEBUG // file handle for MT debug pSmt->pFSliceDiff = NULL; if (pSmt->pFSliceDiff) { fclose (pSmt->pFSliceDiff); pSmt->pFSliceDiff = NULL; } pSmt->pFSliceDiff = fopen ("slice_time.txt", "wt+"); #endif//MT_DEBUG #if defined(ENABLE_TRACE_MT) WelsLog ((*ppCtx), WELS_LOG_INFO, "encpEncCtx= 0x%p\n", (void*) (*ppCtx)); #endif//ENABLE_TRACE_MT iIdx = 0; while (iIdx < iThreadNum) { #if defined(__GNUC__) && !defined(_WIN32) // for posix threading char name[SEM_NAME_MAX] = {0}; WELS_THREAD_ERROR_CODE err = 0; #endif//__GNUC__ pSmt->pThreadPEncCtx[iIdx].pWelsPEncCtx = (void*) (*ppCtx); pSmt->pThreadPEncCtx[iIdx].iSliceIndex = iIdx; pSmt->pThreadPEncCtx[iIdx].iThreadIndex = iIdx; pSmt->pThreadHandles[iIdx] = 0; #ifdef _WIN32 WelsEventInit (&pSmt->pUpdateMbListEvent[iIdx]); WelsEventInit (&pSmt->pFinUpdateMbListEvent[iIdx]); #else // length of semaphore name should be system constrained at least on mac 10.7 WelsSnprintf (name, SEM_NAME_MAX, "ud%d%p", iIdx, (void*) (*ppCtx)); err = WelsEventOpen (&pSmt->pUpdateMbListEvent[iIdx], name); #if defined(ENABLE_TRACE_MT) WelsLog ((*ppCtx), WELS_LOG_INFO, "[MT] Open pUpdateMbListEvent%d named(%s) ret%d err%d\n", iIdx, name, err, errno); #endif WelsSnprintf (name, SEM_NAME_MAX, "fu%d%p", iIdx, (void*) (*ppCtx)); err = WelsEventOpen (&pSmt->pFinUpdateMbListEvent[iIdx], name); #if defined(ENABLE_TRACE_MT) WelsLog ((*ppCtx), WELS_LOG_INFO, "[MT] Open pFinUpdateMbListEvent%d named(%s) ret%d err%d\n", iIdx, name, err, errno); #endif #endif//_WIN32 #ifdef _WIN32 WelsEventInit (&pSmt->pSliceCodedEvent[iIdx]); WelsEventInit (&pSmt->pReadySliceCodingEvent[iIdx]); WelsEventInit (&pSmt->pFinSliceCodingEvent[iIdx]); WelsEventInit (&pSmt->pExitEncodeEvent[iIdx]); #else WelsSnprintf (name, SEM_NAME_MAX, "sc%d%p", iIdx, (void*) (*ppCtx)); err = WelsEventOpen (&pSmt->pSliceCodedEvent[iIdx], name); #if defined(ENABLE_TRACE_MT) WelsLog ((*ppCtx), WELS_LOG_INFO, "[MT] Open pSliceCodedEvent%d named(%s) ret%d err%d\n", iIdx, name, err, errno); #endif WelsSnprintf (name, SEM_NAME_MAX, "rc%d%p", iIdx, (void*) (*ppCtx)); err = WelsEventOpen (&pSmt->pReadySliceCodingEvent[iIdx], name); #if defined(ENABLE_TRACE_MT) WelsLog ((*ppCtx), WELS_LOG_INFO, "[MT] Open pReadySliceCodingEvent%d = 0x%p named(%s) ret%d err%d\n", iIdx, (void*)pSmt->pReadySliceCodingEvent[iIdx], (void*) (*ppCtx), err, errno); #endif #endif//_WIN32 ++ iIdx; } (*ppCtx)->pSliceBs = (SWelsSliceBs*)pMa->WelsMalloc (sizeof (SWelsSliceBs) * iMaxSliceNum, "pSliceBs"); WELS_VERIFY_RETURN_PROC_IF (1, (NULL == (*ppCtx)->pSliceBs), FreeMemorySvc (ppCtx)) pBsBase = (*ppCtx)->pFrameBs + iCountBsLen; pSliceB = (*ppCtx)->pSliceBs; iSliceBsBufferSize = iTargetSpatialBsSize; iIdx = 0; while (iIdx < iMaxSliceNum) { pSliceB->pBsBuffer = (uint8_t*)pMa->WelsMalloc (iSliceBsBufferSize, "pSliceB->pBsBuffer"); WELS_VERIFY_RETURN_PROC_IF (1, (NULL == pSliceB->pBsBuffer), FreeMemorySvc (ppCtx)) pSliceB->uiSize = iSliceBsBufferSize; if (iIdx > 0) { pSliceB->pBs = pBsBase; pSliceB->uiBsPos = 0; pBsBase += iSliceBsBufferSize; } else { pSliceB->pBs = NULL; pSliceB->uiBsPos = 0; } ++ pSliceB; ++ iIdx; } iReturn = WelsMutexInit (&pSmt->mutexSliceNumUpdate); WELS_VERIFY_RETURN_PROC_IF (1, (WELS_THREAD_ERROR_OK != iReturn), FreeMemorySvc (ppCtx)) iReturn = WelsMutexInit (&(*ppCtx)->mutexEncoderError); WELS_VERIFY_RETURN_PROC_IF (1, (WELS_THREAD_ERROR_OK != iReturn), FreeMemorySvc (ppCtx)) #if defined(ENABLE_TRACE_MT) WelsLog ((*ppCtx), WELS_LOG_INFO, "RequestMtResource(), iThreadNum=%d, iCountSliceNum= %d\n", pPara->iCountThreadsNum, iMaxSliceNum); #endif return 0; } void ReleaseMtResource (sWelsEncCtx** ppCtx) { SWelsSliceBs* pSliceB = NULL; SWelsSvcCodingParam* pCodingParam = NULL; SSliceThreading* pSmt = NULL; CMemoryAlign* pMa = NULL; int32_t iIdx = 0; int32_t iThreadNum = 0; int16_t uiSliceNum = 0; if (NULL == ppCtx || NULL == *ppCtx) return; pMa = (*ppCtx)->pMemAlign; pCodingParam = (*ppCtx)->pSvcParam; uiSliceNum = (*ppCtx)->iMaxSliceCount; iThreadNum = (*ppCtx)->pSvcParam->iCountThreadsNum; pSmt = (*ppCtx)->pSliceThreading; if (NULL == pSmt) return; while (iIdx < iThreadNum) { #ifdef _WIN32 if (pSmt->pThreadHandles != NULL && pSmt->pThreadHandles[iIdx] != NULL) WelsThreadDestroy (&pSmt->pThreadHandles[iIdx]); if (pSmt->pSliceCodedEvent != NULL) WelsEventDestroy (&pSmt->pSliceCodedEvent[iIdx]); if (pSmt->pReadySliceCodingEvent != NULL) WelsEventDestroy (&pSmt->pReadySliceCodingEvent[iIdx]); if (pSmt->pFinSliceCodingEvent != NULL) WelsEventDestroy (&pSmt->pFinSliceCodingEvent[iIdx]); if (pSmt->pExitEncodeEvent != NULL) WelsEventDestroy (&pSmt->pExitEncodeEvent[iIdx]); if (pSmt->pUpdateMbListEvent != NULL) WelsEventDestroy (&pSmt->pUpdateMbListEvent[iIdx]); if (pSmt->pFinUpdateMbListEvent != NULL) WelsEventDestroy (&pSmt->pFinUpdateMbListEvent[iIdx]); #else char ename[SEM_NAME_MAX] = {0}; // length of semaphore name should be system constrained at least on mac 10.7 WelsSnprintf (ename, SEM_NAME_MAX, "sc%d%p", iIdx, (void*) (*ppCtx)); WelsEventClose (pSmt->pSliceCodedEvent[iIdx], ename); WelsSnprintf (ename, SEM_NAME_MAX, "rc%d%p", iIdx, (void*) (*ppCtx)); WelsEventClose (pSmt->pReadySliceCodingEvent[iIdx], ename); WelsSnprintf (ename, SEM_NAME_MAX, "ud%d%p", iIdx, (void*) (*ppCtx)); WelsEventClose (pSmt->pUpdateMbListEvent[iIdx], ename); WelsSnprintf (ename, SEM_NAME_MAX, "fu%d%p", iIdx, (void*) (*ppCtx)); WelsEventClose (pSmt->pFinUpdateMbListEvent[iIdx], ename); #endif//_WIN32 ++ iIdx; } #ifdef _WIN32 if (pSmt->pExitEncodeEvent != NULL) { pMa->WelsFree (pSmt->pExitEncodeEvent, "pExitEncodeEvent"); pSmt->pExitEncodeEvent = NULL; } if (pSmt->pSliceCodedEvent != NULL) { pMa->WelsFree (pSmt->pSliceCodedEvent, "pSliceCodedEvent"); pSmt->pSliceCodedEvent = NULL; } if (pSmt->pReadySliceCodingEvent != NULL) { pMa->WelsFree (pSmt->pReadySliceCodingEvent, "pReadySliceCodingEvent"); pSmt->pReadySliceCodingEvent = NULL; } if (pSmt->pFinSliceCodingEvent != NULL) { pMa->WelsFree (pSmt->pFinSliceCodingEvent, "pFinSliceCodingEvent"); pSmt->pFinSliceCodingEvent = NULL; } #endif//_WIN32 WelsMutexDestroy (&pSmt->mutexSliceNumUpdate); WelsMutexDestroy (&((*ppCtx)->mutexEncoderError)); if (pSmt->pThreadPEncCtx != NULL) { pMa->WelsFree (pSmt->pThreadPEncCtx, "pThreadPEncCtx"); pSmt->pThreadPEncCtx = NULL; } if (pSmt->pThreadHandles != NULL) { pMa->WelsFree (pSmt->pThreadHandles, "pThreadHandles"); pSmt->pThreadHandles = NULL; } pSliceB = (*ppCtx)->pSliceBs; iIdx = 0; while (pSliceB != NULL && iIdx < uiSliceNum) { if (pSliceB->pBsBuffer) { pMa->WelsFree (pSliceB->pBsBuffer, "pSliceB->pBsBuffer"); pSliceB->pBsBuffer = NULL; pSliceB->uiSize = 0; } ++ iIdx; ++ pSliceB; } if ((*ppCtx)->pSliceBs != NULL) { pMa->WelsFree ((*ppCtx)->pSliceBs, "pSliceBs"); (*ppCtx)->pSliceBs = NULL; } iIdx = 0; while (iIdx < pCodingParam->iSpatialLayerNum) { if (pSmt->pSliceConsumeTime[iIdx]) { pMa->WelsFree (pSmt->pSliceConsumeTime[iIdx], "pSliceConsumeTime[]"); pSmt->pSliceConsumeTime[iIdx] = NULL; } if (pSmt->pSliceComplexRatio[iIdx] != NULL) { pMa->WelsFree (pSmt->pSliceComplexRatio[iIdx], "pSliceComplexRatio[]"); pSmt->pSliceComplexRatio[iIdx] = NULL; } ++ iIdx; } #ifdef _WIN32 if (pSmt->pUpdateMbListEvent != NULL) { pMa->WelsFree (pSmt->pUpdateMbListEvent, "pUpdateMbListEvent"); pSmt->pUpdateMbListEvent = NULL; } if (pSmt->pFinUpdateMbListEvent != NULL) { pMa->WelsFree (pSmt->pFinUpdateMbListEvent, "pFinUpdateMbListEvent"); pSmt->pFinUpdateMbListEvent = NULL; } #else if (pSmt->pUpdateMbListThrdHandles) { pMa->WelsFree (pSmt->pUpdateMbListThrdHandles, "pUpdateMbListThrdHandles"); pSmt->pUpdateMbListThrdHandles = NULL; } #endif//_WIN32 #ifdef MT_DEBUG // file handle for debug if (pSmt->pFSliceDiff) { fclose (pSmt->pFSliceDiff); pSmt->pFSliceDiff = NULL; } #endif//MT_DEBUG pMa->WelsFree ((*ppCtx)->pSliceThreading, "SSliceThreading"); (*ppCtx)->pSliceThreading = NULL; } int32_t AppendSliceToFrameBs (sWelsEncCtx* pCtx, SLayerBSInfo* pLbi, const int32_t iSliceCount) { SWelsSvcCodingParam* pCodingParam = pCtx->pSvcParam; SDLayerParam* pDlp = &pCodingParam->sDependencyLayers[pCtx->uiDependencyId]; SWelsSliceBs* pSliceBs = NULL; const bool kbIsDynamicSlicingMode = (pDlp->sSliceCfg.uiSliceMode == SM_DYN_SLICE); int32_t iLayerSize = 0; int32_t iNalIdxBase = pLbi->iNalCount; int32_t iSliceIdx = 0; if (!kbIsDynamicSlicingMode) { pSliceBs = &pCtx->pSliceBs[0]; iLayerSize = pSliceBs->uiBsPos; // assign with base pSlice first iSliceIdx = 1; // pSlice 0 bs has been written to pFrameBs yet by now, so uiSliceIdx base should be 1 while (iSliceIdx < iSliceCount) { ++ pSliceBs; if (pSliceBs != NULL && pSliceBs->uiBsPos > 0) { int32_t iNalIdx = 0; const int32_t iCountNal = pSliceBs->iNalIndex; #if MT_DEBUG_BS_WR assert (pSliceBs->bSliceCodedFlag); #endif//MT_DEBUG_BS_WR memmove (pCtx->pFrameBs + pCtx->iPosBsBuffer, pSliceBs->pBs, pSliceBs->uiBsPos); // confirmed_safe_unsafe_usage pCtx->iPosBsBuffer += pSliceBs->uiBsPos; iLayerSize += pSliceBs->uiBsPos; while (iNalIdx < iCountNal) { pLbi->iNalLengthInByte[iNalIdxBase + iNalIdx] = pSliceBs->iNalLen[iNalIdx]; ++ iNalIdx; } pLbi->iNalCount += iCountNal; iNalIdxBase += iCountNal; } ++ iSliceIdx; } } else { // for SM_DYN_SLICE const int32_t kiPartitionCnt = iSliceCount; int32_t iPartitionIdx = 0; // due partition_0 has been written to pFrameBsBuffer // so iLayerSize need add it while (iPartitionIdx < kiPartitionCnt) { const int32_t kiCountSlicesCoded = pCtx->pCurDqLayer->pNumSliceCodedOfPartition[iPartitionIdx]; int32_t iIdx = 0; iSliceIdx = iPartitionIdx; while (iIdx < kiCountSlicesCoded) { pSliceBs = &pCtx->pSliceBs[iSliceIdx]; if (pSliceBs != NULL && pSliceBs->uiBsPos > 0) { if (iPartitionIdx > 0) { int32_t iNalIdx = 0; const int32_t iCountNal = pSliceBs->iNalIndex; memmove (pCtx->pFrameBs + pCtx->iPosBsBuffer, pSliceBs->pBs, pSliceBs->uiBsPos); // confirmed_safe_unsafe_usage pCtx->iPosBsBuffer += pSliceBs->uiBsPos; iLayerSize += pSliceBs->uiBsPos; while (iNalIdx < iCountNal) { pLbi->iNalLengthInByte[iNalIdxBase + iNalIdx] = pSliceBs->iNalLen[iNalIdx]; ++ iNalIdx; } pLbi->iNalCount += iCountNal; iNalIdxBase += iCountNal; } else { iLayerSize += pSliceBs->uiBsPos; } } iSliceIdx += kiPartitionCnt; ++ iIdx; } ++ iPartitionIdx; } } return iLayerSize; } int32_t WriteSliceToFrameBs (sWelsEncCtx* pCtx, SLayerBSInfo* pLbi, uint8_t* pFrameBsBuffer, const int32_t iSliceIdx, int32_t& iSliceSize) { SWelsSliceBs* pSliceBs = &pCtx->pSliceBs[iSliceIdx]; SNalUnitHeaderExt* pNalHdrExt = &pCtx->pCurDqLayer->sLayerInfo.sNalHeaderExt; uint8_t* pDst = pFrameBsBuffer; int32_t pNalLen[2]; const int32_t kiNalCnt = pSliceBs->iNalIndex; int32_t iNalIdx = 0; int32_t iNalSize = 0; const int32_t iFirstSlice = (iSliceIdx == 0); int32_t iNalBase = iFirstSlice ? 0 : pLbi->iNalCount; int32_t iReturn = ENC_RETURN_SUCCESS; const int32_t kiWrittenLength = pCtx->iPosBsBuffer; iSliceSize = 0; while (iNalIdx < kiNalCnt) { iNalSize = 0; iReturn = WelsEncodeNal (&pSliceBs->sNalList[iNalIdx], pNalHdrExt, pCtx->iFrameBsSize-kiWrittenLength-iSliceSize, pDst, &iNalSize); WELS_VERIFY_RETURN_IFNEQ(iReturn, ENC_RETURN_SUCCESS) pNalLen[iNalIdx] = iNalSize; iSliceSize += iNalSize; pDst += iNalSize; pLbi->iNalLengthInByte[iNalBase + iNalIdx] = iNalSize; ++ iNalIdx; } pSliceBs->uiBsPos = iSliceSize; if (iFirstSlice) { // pBsBuffer has been updated at coding_slice_0_in_encoder_mother_thread() pLbi->uiLayerType = VIDEO_CODING_LAYER; pLbi->uiSpatialId = pNalHdrExt->uiDependencyId; pLbi->uiTemporalId = pNalHdrExt->uiTemporalId; pLbi->uiQualityId = 0; pLbi->uiPriorityId = 0; pLbi->iNalCount = kiNalCnt; } else { pLbi->iNalCount += kiNalCnt; } return ENC_RETURN_SUCCESS; } int32_t WriteSliceBs (sWelsEncCtx* pCtx, uint8_t* pSliceBsBuf, const int32_t iSliceIdx, int32_t& iSliceSize) { SWelsSliceBs* pSliceBs = &pCtx->pSliceBs[iSliceIdx]; SNalUnitHeaderExt* pNalHdrExt = &pCtx->pCurDqLayer->sLayerInfo.sNalHeaderExt; uint8_t* pDst = pSliceBsBuf; int32_t* pNalLen = &pSliceBs->iNalLen[0]; const int32_t kiNalCnt = pSliceBs->iNalIndex; int32_t iNalIdx = 0; int32_t iNalSize = 0; int32_t iReturn = ENC_RETURN_SUCCESS; const int32_t kiWrittenLength = pSliceBs->sBsWrite.pBufPtr - pSliceBs->sBsWrite.pBuf; iSliceSize = 0; assert (kiNalCnt <= 2); if (kiNalCnt > 2) return 0; while (iNalIdx < kiNalCnt) { iNalSize = 0; iReturn = WelsEncodeNal (&pSliceBs->sNalList[iNalIdx], pNalHdrExt, pSliceBs->uiSize-kiWrittenLength-iSliceSize, pDst, &iNalSize); WELS_VERIFY_RETURN_IFNEQ(iReturn, ENC_RETURN_SUCCESS) pNalLen[iNalIdx] = iNalSize; iSliceSize += iNalSize; pDst += iNalSize; ++ iNalIdx; } pSliceBs->uiBsPos = iSliceSize; return iReturn; } #if defined(__GNUC__) && !defined(_WIN32) WELS_THREAD_ROUTINE_TYPE UpdateMbListThreadProc (void* arg) { SSliceThreadPrivateData* pPrivateData = (SSliceThreadPrivateData*)arg; sWelsEncCtx* pEncPEncCtx = NULL; SDqLayer* pCurDq = NULL; int32_t iSliceIdx = -1; int32_t iEventIdx = -1; WELS_THREAD_ERROR_CODE iWaitRet = WELS_THREAD_ERROR_GENERAL; uint32_t uiThrdRet = 0; if (NULL == pPrivateData) WELS_THREAD_ROUTINE_RETURN (1); pEncPEncCtx = (sWelsEncCtx*)pPrivateData->pWelsPEncCtx; iSliceIdx = pPrivateData->iSliceIndex; iEventIdx = pPrivateData->iThreadIndex; do { #if defined(ENABLE_TRACE_MT) WelsLog (pEncPEncCtx, WELS_LOG_INFO, "[MT] UpdateMbListThreadProc(), try to wait (pUpdateMbListEvent[%d])!\n", iEventIdx); #endif iWaitRet = WelsEventWait (pEncPEncCtx->pSliceThreading->pUpdateMbListEvent[iEventIdx]); if (WELS_THREAD_ERROR_WAIT_OBJECT_0 == iWaitRet) { pCurDq = pEncPEncCtx->pCurDqLayer; UpdateMbListNeighborParallel (pCurDq->pSliceEncCtx, pCurDq->sMbDataP, iSliceIdx); WelsEventSignal ( pEncPEncCtx->pSliceThreading->pFinUpdateMbListEvent[iEventIdx]); // mean finished update pMb list for this pSlice } else { WelsLog (pEncPEncCtx, WELS_LOG_WARNING, "[MT] UpdateMbListThreadProc(), waiting pUpdateMbListEvent[%d] failed(%d) and thread%d terminated!\n", iEventIdx, iWaitRet, iEventIdx); uiThrdRet = 1; break; } } while (1); WELS_THREAD_ROUTINE_RETURN (uiThrdRet); } #endif//__GNUC__ // thread process for coding one pSlice WELS_THREAD_ROUTINE_TYPE CodingSliceThreadProc (void* arg) { SSliceThreadPrivateData* pPrivateData = (SSliceThreadPrivateData*)arg; sWelsEncCtx* pEncPEncCtx = NULL; SDqLayer* pCurDq = NULL; SSlice* pSlice = NULL; SWelsSliceBs* pSliceBs = NULL; #ifdef _WIN32 WELS_EVENT pEventsList[3]; int32_t iEventCount = 0; #endif WELS_THREAD_ERROR_CODE iWaitRet = WELS_THREAD_ERROR_GENERAL; uint32_t uiThrdRet = 0; int32_t iSliceSize = 0; int32_t iSliceIdx = -1; int32_t iThreadIdx = -1; int32_t iEventIdx = -1; bool bNeedPrefix = false; EWelsNalUnitType eNalType = NAL_UNIT_UNSPEC_0; EWelsNalRefIdc eNalRefIdc = NRI_PRI_LOWEST; int32_t iReturn = ENC_RETURN_SUCCESS; if (NULL == pPrivateData) WELS_THREAD_ROUTINE_RETURN (1); WelsSetThreadCancelable(); pEncPEncCtx = (sWelsEncCtx*)pPrivateData->pWelsPEncCtx; iThreadIdx = pPrivateData->iThreadIndex; iEventIdx = iThreadIdx; #ifdef _WIN32 pEventsList[iEventCount++] = pEncPEncCtx->pSliceThreading->pReadySliceCodingEvent[iEventIdx]; pEventsList[iEventCount++] = pEncPEncCtx->pSliceThreading->pExitEncodeEvent[iEventIdx]; pEventsList[iEventCount++] = pEncPEncCtx->pSliceThreading->pUpdateMbListEvent[iEventIdx]; #endif//_WIN32 do { #ifdef _WIN32 iWaitRet = WelsMultipleEventsWaitSingleBlocking (iEventCount, &pEventsList[0], (uint32_t) - 1); // blocking until at least one event is #else #if defined(ENABLE_TRACE_MT) WelsLog (pEncPEncCtx, WELS_LOG_INFO, "[MT] CodingSliceThreadProc(), try to call WelsEventWait(pReadySliceCodingEvent[%d]= 0x%p), pEncPEncCtx= 0x%p!\n", iEventIdx, (void*) (pEncPEncCtx->pReadySliceCodingEvent[iEventIdx]), (void*)pEncPEncCtx); #endif iWaitRet = WelsEventWait (pEncPEncCtx->pSliceThreading->pReadySliceCodingEvent[iEventIdx]); #endif//WIN32 if (WELS_THREAD_ERROR_WAIT_OBJECT_0 == iWaitRet) { // start pSlice coding signal waited SLayerBSInfo* pLbi = pPrivateData->pLayerBs; const int32_t kiCurDid = pEncPEncCtx->uiDependencyId; const int32_t kiCurTid = pEncPEncCtx->uiTemporalId; SWelsSvcCodingParam* pCodingParam = pEncPEncCtx->pSvcParam; SDLayerParam* pParamD = &pCodingParam->sDependencyLayers[kiCurDid]; pCurDq = pEncPEncCtx->pCurDqLayer; eNalType = pEncPEncCtx->eNalType; eNalRefIdc = pEncPEncCtx->eNalPriority; bNeedPrefix = pEncPEncCtx->bNeedPrefixNalFlag; if (pParamD->sSliceCfg.uiSliceMode != SM_DYN_SLICE) { int64_t iSliceStart = 0; bool bDsaFlag = false; iSliceIdx = pPrivateData->iSliceIndex; pSlice = &pCurDq->sLayerInfo.pSliceInLayer[iSliceIdx]; pSliceBs = &pEncPEncCtx->pSliceBs[iSliceIdx]; bDsaFlag = (pParamD->sSliceCfg.uiSliceMode == SM_FIXEDSLCNUM_SLICE && pCodingParam->iMultipleThreadIdc > 1 && pCodingParam->iMultipleThreadIdc >= pParamD->sSliceCfg.sSliceArgument.uiSliceNum); if (bDsaFlag) iSliceStart = WelsTime(); pSliceBs->uiBsPos = 0; pSliceBs->iNalIndex = 0; assert ((void*) (&pSliceBs->sBsWrite) == (void*)pSlice->pSliceBsa); InitBits (&pSliceBs->sBsWrite, pSliceBs->pBsBuffer, pSliceBs->uiSize); #if MT_DEBUG_BS_WR pSliceBs->bSliceCodedFlag = false; #endif//MT_DEBUG_BS_WR if (bNeedPrefix) { if (eNalRefIdc != NRI_PRI_LOWEST) { WelsLoadNalForSlice (pSliceBs, NAL_UNIT_PREFIX, eNalRefIdc); WelsWriteSVCPrefixNal (&pSliceBs->sBsWrite, eNalRefIdc, (NAL_UNIT_CODED_SLICE_IDR == eNalType)); WelsUnloadNalForSlice (pSliceBs); } else { // No Prefix NAL Unit RBSP syntax here, but need add NAL Unit Header extension WelsLoadNalForSlice (pSliceBs, NAL_UNIT_PREFIX, eNalRefIdc); // No need write any syntax of prefix NAL Unit RBSP here WelsUnloadNalForSlice (pSliceBs); } } WelsLoadNalForSlice (pSliceBs, eNalType, eNalRefIdc); iReturn = WelsCodeOneSlice (pEncPEncCtx, iSliceIdx, eNalType); if (ENC_RETURN_SUCCESS!=iReturn) { uiThrdRet = iReturn; break; } WelsUnloadNalForSlice (pSliceBs); if (0 == iSliceIdx) { pLbi->pBsBuf = pEncPEncCtx->pFrameBs + pEncPEncCtx->iPosBsBuffer; iReturn = WriteSliceToFrameBs (pEncPEncCtx, pLbi, pLbi->pBsBuf, iSliceIdx, iSliceSize); if (ENC_RETURN_SUCCESS!=iReturn) { uiThrdRet = iReturn; break; } pEncPEncCtx->iPosBsBuffer += iSliceSize; } else { iReturn = WriteSliceBs (pEncPEncCtx, pSliceBs->pBs, iSliceIdx, iSliceSize); if (ENC_RETURN_SUCCESS!=iReturn) { uiThrdRet = iReturn; break; } } if (pCurDq->bDeblockingParallelFlag && pSlice->sSliceHeaderExt.sSliceHeader.uiDisableDeblockingFilterIdc != 1 #if !defined(ENABLE_FRAME_DUMP) && (eNalRefIdc != NRI_PRI_LOWEST) && (pParamD->iHighestTemporalId == 0 || kiCurTid < pParamD->iHighestTemporalId) #endif// !ENABLE_FRAME_DUMP ) { DeblockingFilterSliceAvcbase (pCurDq, pEncPEncCtx->pFuncList, iSliceIdx); } if (bDsaFlag) { pEncPEncCtx->pSliceThreading->pSliceConsumeTime[pEncPEncCtx->uiDependencyId][iSliceIdx] = (uint32_t) ( WelsTime() - iSliceStart); #if defined(ENABLE_TRACE_MT) WelsLog (pEncPEncCtx, WELS_LOG_INFO, "[MT] CodingSliceThreadProc(), coding_idx %d, uiSliceIdx %d, pSliceConsumeTime %d, iSliceSize %d, pFirstMbInSlice %d, count_num_mb_in_slice %d\n", pEncPEncCtx->iCodingIndex, iSliceIdx, pEncPEncCtx->pSliceThreading->pSliceConsumeTime[pEncPEncCtx->uiDependencyId][iSliceIdx], iSliceSize, pCurDq->pSliceEncCtx->pFirstMbInSlice[iSliceIdx], pCurDq->pSliceEncCtx->pCountMbNumInSlice[iSliceIdx]); #endif//ENABLE_TRACE_MT } #if defined(SLICE_INFO_OUTPUT) fprintf (stderr, "@pSlice=%-6d sliceType:%c idc:%d size:%-6d\n", iSliceIdx, (pEncPEncCtx->eSliceType == P_SLICE ? 'P' : 'I'), eNalRefIdc, iSliceSize ); #endif//SLICE_INFO_OUTPUT #if MT_DEBUG_BS_WR pSliceBs->bSliceCodedFlag = true; #endif//MT_DEBUG_BS_WR #ifdef _WIN32 WelsEventSignal ( &pEncPEncCtx->pSliceThreading->pSliceCodedEvent[iEventIdx]); // mean finished coding current pSlice #else WelsEventSignal (pEncPEncCtx->pSliceThreading->pSliceCodedEvent[iEventIdx]); // mean finished coding current pSlice #endif//WIN32 } else { // for SM_DYN_SLICE parallelization SSliceCtx* pSliceCtx = pCurDq->pSliceEncCtx; const int32_t kiPartitionId = iThreadIdx; const int32_t kiSliceIdxStep = pEncPEncCtx->iActiveThreadsNum; const int32_t kiFirstMbInPartition = pPrivateData->iStartMbIndex; // inclusive const int32_t kiEndMbInPartition = pPrivateData->iEndMbIndex; // exclusive int32_t iAnyMbLeftInPartition = kiEndMbInPartition - kiFirstMbInPartition; iSliceIdx = pPrivateData->iSliceIndex; pSliceCtx->pFirstMbInSlice[iSliceIdx] = kiFirstMbInPartition; pCurDq->pNumSliceCodedOfPartition[kiPartitionId] = 1; // one pSlice per partition intialized, dynamic slicing inside pCurDq->pLastMbIdxOfPartition[kiPartitionId] = kiEndMbInPartition - 1; pCurDq->pLastCodedMbIdxOfPartition[kiPartitionId] = 0; while (iAnyMbLeftInPartition > 0) { if (iSliceIdx >= pSliceCtx->iMaxSliceNumConstraint) { // TODO: need exception handler for not large enough of MAX_SLICES_NUM related memory usage // No idea about its solution due MAX_SLICES_NUM is fixed lenght in relevent pData structure uiThrdRet = 1; break; } pSlice = &pCurDq->sLayerInfo.pSliceInLayer[iSliceIdx]; pSliceBs = &pEncPEncCtx->pSliceBs[iSliceIdx]; pSliceBs->uiBsPos = 0; pSliceBs->iNalIndex = 0; InitBits (&pSliceBs->sBsWrite, pSliceBs->pBsBuffer, pSliceBs->uiSize); if (bNeedPrefix) { if (eNalRefIdc != NRI_PRI_LOWEST) { WelsLoadNalForSlice (pSliceBs, NAL_UNIT_PREFIX, eNalRefIdc); WelsWriteSVCPrefixNal (&pSliceBs->sBsWrite, eNalRefIdc, (NAL_UNIT_CODED_SLICE_IDR == eNalType)); WelsUnloadNalForSlice (pSliceBs); } else { // No Prefix NAL Unit RBSP syntax here, but need add NAL Unit Header extension WelsLoadNalForSlice (pSliceBs, NAL_UNIT_PREFIX, eNalRefIdc); // No need write any syntax of prefix NAL Unit RBSP here WelsUnloadNalForSlice (pSliceBs); } } WelsLoadNalForSlice (pSliceBs, eNalType, eNalRefIdc); iReturn = WelsCodeOneSlice (pEncPEncCtx, iSliceIdx, eNalType); if (ENC_RETURN_SUCCESS!=iReturn) { uiThrdRet = iReturn; break; } WelsUnloadNalForSlice (pSliceBs); if (0 == kiPartitionId) { if (0 == iSliceIdx) pLbi->pBsBuf = pEncPEncCtx->pFrameBs + pEncPEncCtx->iPosBsBuffer; iReturn = WriteSliceToFrameBs (pEncPEncCtx, pLbi, pEncPEncCtx->pFrameBs + pEncPEncCtx->iPosBsBuffer, iSliceIdx, iSliceSize); if (ENC_RETURN_SUCCESS!=iReturn) { uiThrdRet = iReturn; break; } pEncPEncCtx->iPosBsBuffer += iSliceSize; } else { iSliceSize = WriteSliceBs (pEncPEncCtx, pSliceBs->pBs, iSliceIdx, iSliceSize); if (ENC_RETURN_SUCCESS!=iReturn) { uiThrdRet = iReturn; break; } } if (pCurDq->bDeblockingParallelFlag && pSlice->sSliceHeaderExt.sSliceHeader.uiDisableDeblockingFilterIdc != 1 #if !defined(ENABLE_FRAME_DUMP) && (eNalRefIdc != NRI_PRI_LOWEST) && (pParamD->iHighestTemporalId == 0 || kiCurTid < pParamD->iHighestTemporalId) #endif// !ENABLE_FRAME_DUMP ) { DeblockingFilterSliceAvcbase (pCurDq, pEncPEncCtx->pFuncList, iSliceIdx); } #if defined(SLICE_INFO_OUTPUT) fprintf (stderr, "@pSlice=%-6d sliceType:%c idc:%d size:%-6d\n", iSliceIdx, (pEncPEncCtx->eSliceType == P_SLICE ? 'P' : 'I'), eNalRefIdc, iSliceSize ); #endif//SLICE_INFO_OUTPUT #if defined(ENABLE_TRACE_MT) WelsLog (pEncPEncCtx, WELS_LOG_INFO, "[MT] CodingSliceThreadProc(), coding_idx %d, iPartitionId %d, uiSliceIdx %d, iSliceSize %d, count_mb_slice %d, iEndMbInPartition %d, pCurDq->pLastCodedMbIdxOfPartition[%d] %d\n", pEncPEncCtx->iCodingIndex, kiPartitionId, iSliceIdx, iSliceSize, pCurDq->pSliceEncCtx->pCountMbNumInSlice[iSliceIdx], kiEndMbInPartition, kiPartitionId, pCurDq->pLastCodedMbIdxOfPartition[kiPartitionId]); #endif//ENABLE_TRACE_MT iAnyMbLeftInPartition = kiEndMbInPartition - (1 + pCurDq->pLastCodedMbIdxOfPartition[kiPartitionId]); iSliceIdx += kiSliceIdxStep; } if (uiThrdRet) // any exception?? break; #ifdef _WIN32 WelsEventSignal (&pEncPEncCtx->pSliceThreading->pSliceCodedEvent[iEventIdx]); // mean finished coding current pSlice #else WelsEventSignal (pEncPEncCtx->pSliceThreading->pSliceCodedEvent[iEventIdx]); // mean finished coding current pSlice #endif//WIN32 } } #ifdef _WIN32 else if (WELS_THREAD_ERROR_WAIT_OBJECT_0 + 1 == iWaitRet) { // exit thread signal uiThrdRet = 0; break; } else if (WELS_THREAD_ERROR_WAIT_OBJECT_0 + 2 == iWaitRet) { // update pMb list singal iSliceIdx = iEventIdx; // pPrivateData->iSliceIndex; old threads can not be terminated, pPrivateData is not correct for applicable pCurDq = pEncPEncCtx->pCurDqLayer; UpdateMbListNeighborParallel (pCurDq->pSliceEncCtx, pCurDq->sMbDataP, iSliceIdx); WelsEventSignal ( &pEncPEncCtx->pSliceThreading->pFinUpdateMbListEvent[iEventIdx]); // mean finished update pMb list for this pSlice } #endif//WIN32 else { // WELS_THREAD_ERROR_WAIT_TIMEOUT, or WELS_THREAD_ERROR_WAIT_FAILED WelsLog (pEncPEncCtx, WELS_LOG_WARNING, "[MT] CodingSliceThreadProc(), waiting pReadySliceCodingEvent[%d] failed(%d) and thread%d terminated!\n", iEventIdx, iWaitRet, iThreadIdx); uiThrdRet = 1; break; } } while (1); #ifdef _WIN32 WelsEventSignal (&pEncPEncCtx->pSliceThreading->pFinSliceCodingEvent[iEventIdx]); // notify to mother encoding threading #endif//WIN32 //sync multi-threading error WelsMutexLock (&pEncPEncCtx->mutexEncoderError); if (uiThrdRet) pEncPEncCtx->iEncoderError |= uiThrdRet; WelsMutexUnlock (&pEncPEncCtx->mutexEncoderError); WELS_THREAD_ROUTINE_RETURN (uiThrdRet); } int32_t CreateSliceThreads (sWelsEncCtx* pCtx) { const int32_t kiThreadCount = pCtx->pSvcParam->iCountThreadsNum; int32_t iIdx = 0; #if defined(_WIN32) && defined(BIND_CPU_CORES_TO_THREADS) DWORD dwProcessAffinity; DWORD dwSystemAffinity; GetProcessAffinityMask (GetCurrentProcess(), &dwProcessAffinity, &dwSystemAffinity); #endif//WIN32 && BIND_CPU_CORES_TO_THREADS while (iIdx < kiThreadCount) { WelsThreadCreate (&pCtx->pSliceThreading->pThreadHandles[iIdx], CodingSliceThreadProc, &pCtx->pSliceThreading->pThreadPEncCtx[iIdx], 0); #if defined(_WIN32) && defined(BIND_CPU_CORES_TO_THREADS) if (dwProcessAffinity > 1 && pCtx->pSliceThreading->pThreadHandles[iIdx] != NULL) { // multiple cores and thread created successfully DWORD dw = 0; DWORD dwAffinityMask = 1 << iIdx; if (dwAffinityMask & dwProcessAffinity) { // check if cpu is available dw = SetThreadAffinityMask (pCtx->pSliceThreading->pThreadHandles[iIdx], dwAffinityMask); //1 << iIdx if (dw == 0) { char str[64] = {0}; WelsSnprintf (str, 64, "SetThreadAffinityMask iIdx:%d", iIdx); } } } #endif//WIN32 && BIND_CPU_CORES_TO_THREADS // We need extra threads for update_mb_list_proc on __GNUC__ like OS (mac/linux) // due to WelsMultipleEventsWaitSingleBlocking implememtation can not work well // in case waiting pUpdateMbListEvent and pReadySliceCodingEvent events at the same time #if defined(__GNUC__) && !defined(_WIN32) WelsThreadCreate (&pCtx->pSliceThreading->pUpdateMbListThrdHandles[iIdx], UpdateMbListThreadProc, &pCtx->pSliceThreading->pThreadPEncCtx[iIdx], 0); #endif//__GNUC__ ++ iIdx; } #if defined(ENABLE_TRACE_MT) WelsLog (pCtx, WELS_LOG_INFO, "CreateSliceThreads() exit..\n"); #endif return 0; } #ifdef _WIN32 int32_t FiredSliceThreads (SSliceThreadPrivateData* pPriData, WELS_EVENT* pEventsList, SLayerBSInfo* pLbi, const uint32_t uiNumThreads, SSliceCtx* pSliceCtx, const bool bIsDynamicSlicingMode) #else int32_t FiredSliceThreads (SSliceThreadPrivateData* pPriData, WELS_EVENT** pEventsList, SLayerBSInfo* pLbi, const uint32_t uiNumThreads, SSliceCtx* pSliceCtx, const bool bIsDynamicSlicingMode) #endif//WIN32 { int32_t iEndMbIdx = 0; int32_t iIdx = 0; const int32_t kiEventCnt = uiNumThreads; if (pPriData == NULL || pLbi == NULL || kiEventCnt <= 0 || pEventsList == NULL) { WelsLog (NULL, WELS_LOG_ERROR, "FiredSliceThreads(), fail due pPriData == %p || pLbi == %p || iEventCnt(%d) <= 0 || pEventsList == %p!!\n", (void*)pPriData, (void*)pLbi, uiNumThreads, (void*)pEventsList); return 1; } //////////////////////////////////////// if (bIsDynamicSlicingMode) { iEndMbIdx = pSliceCtx->iMbNumInFrame; for (iIdx = kiEventCnt - 1; iIdx >= 0; --iIdx) { const int32_t iFirstMbIdx = pSliceCtx->pFirstMbInSlice[iIdx]; pPriData[iIdx].iStartMbIndex = iFirstMbIdx; pPriData[iIdx].iEndMbIndex = iEndMbIdx; iEndMbIdx = iFirstMbIdx; } } iIdx = 0; while (iIdx < kiEventCnt) { pPriData[iIdx].pLayerBs = pLbi; pPriData[iIdx].iSliceIndex = iIdx; #ifdef _WIN32 if (pEventsList[iIdx]) WelsEventSignal (&pEventsList[iIdx]); #else WelsEventSignal (pEventsList[iIdx]); #endif//WIN32 ++ iIdx; } return 0; } int32_t DynamicDetectCpuCores() { WelsLogicalProcessInfo info; WelsQueryLogicalProcessInfo (&info); return info.ProcessorCount; } #if defined(MT_ENABLED) int32_t AdjustBaseLayer (sWelsEncCtx* pCtx) { SDqLayer* pCurDq = pCtx->ppDqLayerList[0]; int32_t iNeedAdj = 1; #ifdef MT_DEBUG int64_t iT0 = WelsTime(); #endif//MT_DEBUG pCtx->pCurDqLayer = pCurDq; // do not need adjust due to not different at both slices of consumed time iNeedAdj = NeedDynamicAdjust (pCtx->pSliceThreading->pSliceConsumeTime[0], pCurDq->pSliceEncCtx->iSliceNumInFrame); if (iNeedAdj) DynamicAdjustSlicing (pCtx, pCurDq, pCtx->pSliceThreading->pSliceComplexRatio[0], 0); #ifdef MT_DEBUG iT0 = WelsTime() - iT0; if (pCtx->pSliceThreading->pFSliceDiff) { fprintf (pCtx->pSliceThreading->pFSliceDiff, #ifdef _WIN32 "%6I64d us adjust time at base spatial layer, iNeedAdj %d, DynamicAdjustSlicing()\n", #else "%6lld us adjust time at base spatial layer, iNeedAdj %d, DynamicAdjustSlicing()\n", #endif//WIN32 iT0, iNeedAdj); } #endif//MT_DEBUG return iNeedAdj; } int32_t AdjustEnhanceLayer (sWelsEncCtx* pCtx, int32_t iCurDid) { #ifdef MT_DEBUG int64_t iT1 = WelsTime(); #endif//MT_DEBUG int32_t iNeedAdj = 1; // uiSliceMode of referencing spatial should be SM_FIXEDSLCNUM_SLICE // if using spatial base layer for complexity estimation const bool kbModelingFromSpatial = (pCtx->pCurDqLayer->pRefLayer != NULL && iCurDid > 0) && (pCtx->pSvcParam->sDependencyLayers[iCurDid - 1].sSliceCfg.uiSliceMode == SM_FIXEDSLCNUM_SLICE && pCtx->pSvcParam->iMultipleThreadIdc >= pCtx->pSvcParam->sDependencyLayers[iCurDid - 1].sSliceCfg.sSliceArgument.uiSliceNum); if (kbModelingFromSpatial) { // using spatial base layer for complexity estimation // do not need adjust due to not different at both slices of consumed time iNeedAdj = NeedDynamicAdjust (pCtx->pSliceThreading->pSliceConsumeTime[iCurDid - 1], pCtx->pCurDqLayer->pSliceEncCtx->iSliceNumInFrame); if (iNeedAdj) DynamicAdjustSlicing (pCtx, pCtx->pCurDqLayer, pCtx->pSliceThreading->pSliceComplexRatio[iCurDid - 1], iCurDid ); } else { // use temporal layer for complexity estimation // do not need adjust due to not different at both slices of consumed time iNeedAdj = NeedDynamicAdjust (pCtx->pSliceThreading->pSliceConsumeTime[iCurDid], pCtx->pCurDqLayer->pSliceEncCtx->iSliceNumInFrame); if (iNeedAdj) DynamicAdjustSlicing (pCtx, pCtx->pCurDqLayer, pCtx->pSliceThreading->pSliceComplexRatio[iCurDid], iCurDid ); } #ifdef MT_DEBUG iT1 = WelsTime() - iT1; if (pCtx->pSliceThreading->pFSliceDiff) { fprintf (pCtx->pSliceThreading->pFSliceDiff, #ifdef _WIN32 "%6I64d us adjust time at spatial layer %d, iNeedAdj %d, DynamicAdjustSlicing()\n", #else "%6lld us adjust time at spatial layer %d, iNeedAdj %d, DynamicAdjustSlicing()\n", #endif//WIN32 iT1, iCurDid, iNeedAdj); } #endif//MT_DEBUG return iNeedAdj; } #endif//#if defined(MT_ENABLED) #if defined(MT_ENABLED) #if defined(MT_DEBUG) void TrackSliceComplexities (sWelsEncCtx* pCtx, const int32_t iCurDid) { const int32_t kiCountSliceNum = pCtx->pCurDqLayer->pSliceEncCtx->iSliceNumInFrame; if (kiCountSliceNum > 0) { int32_t iSliceIdx = 0; do { fprintf (pCtx->pSliceThreading->pFSliceDiff, "%6.3f complexity pRatio at iDid %d pSlice %d\n", pCtx->pSliceThreading->pSliceComplexRatio[iCurDid][iSliceIdx], iCurDid, iSliceIdx); ++ iSliceIdx; } while (iSliceIdx < kiCountSliceNum); } } #endif #if defined(MT_DEBUG) void TrackSliceConsumeTime (sWelsEncCtx* pCtx, int32_t* pDidList, const int32_t iSpatialNum) { SWelsSvcCodingParam* pPara = NULL; int32_t iSpatialIdx = 0; if (iSpatialNum > MAX_DEPENDENCY_LAYER) return; pPara = pCtx->pSvcParam; while (iSpatialIdx < iSpatialNum) { const int32_t kiDid = pDidList[iSpatialIdx]; SDLayerParam* pDlp = &pPara->sDependencyLayers[kiDid]; SSliceConfig* pMso = &pDlp->sSliceCfg; SDqLayer* pCurDq = pCtx->ppDqLayerList[kiDid]; SSliceCtx* pSliceCtx = pCurDq->pSliceEncCtx; const uint32_t kuiCountSliceNum = pSliceCtx->iSliceNumInFrame; if (pCtx->pSliceThreading) { if (pCtx->pSliceThreading->pFSliceDiff && pMso->uiSliceMode == SM_FIXEDSLCNUM_SLICE && pPara->iMultipleThreadIdc > 1 && pPara->iMultipleThreadIdc >= kuiCountSliceNum) { uint32_t i = 0; uint32_t uiMaxT = 0; int32_t iMaxI = 0; while (i < kuiCountSliceNum) { if (pCtx->pSliceThreading->pSliceConsumeTime[kiDid] != NULL) fprintf (pCtx->pSliceThreading->pFSliceDiff, "%6d us consume_time coding_idx %d iDid %d pSlice %d\n", pCtx->pSliceThreading->pSliceConsumeTime[kiDid][i], pCtx->iCodingIndex, kiDid, i /*/ 1000*/); if (pCtx->pSliceThreading->pSliceConsumeTime[kiDid][i] > uiMaxT) { uiMaxT = pCtx->pSliceThreading->pSliceConsumeTime[kiDid][i]; iMaxI = i; } ++ i; } fprintf (pCtx->pSliceThreading->pFSliceDiff, "%6d us consume_time_max coding_idx %d iDid %d pSlice %d\n", uiMaxT, pCtx->iCodingIndex, kiDid, iMaxI /*/ 1000*/); } } ++ iSpatialIdx; } } #endif//#if defined(MT_DEBUG) #endif//MT_ENABLED } #endif//MT_ENABLED