shithub: openh264

Download patch

ref: 4bc0b8ad188cf373e3224e9471963c116816fea6
parent: c66c305a634c9f63e12d4b3846645b3d7dc81b00
parent: ec09d67a5fd82c7c2e391a7490f6271dc5570c96
author: huili2 <[email protected]>
date: Thu Oct 31 10:06:18 EDT 2019

Merge pull request #3190 from xiaotianshi2/thread_commit_3_updated

commit-3 (updated) of multi-threaded decoding support.

--- a/codec/decoder/core/inc/decoder_context.h
+++ b/codec/decoder/core/inc/decoder_context.h
@@ -277,6 +277,7 @@
   PPicture          pPreviousDecodedPictureInDpb; //pointer to previously decoded picture in DPB for error concealment
   int32_t           iPrevFrameNum;// frame number of previous frame well decoded for non-truncated mode yet
   bool              bLastHasMmco5;
+  uint32_t          uiDecodingTimeStamp; //represent relative decoding time stamps
 } SWelsLastDecPicInfo, *PWelsLastDecPicInfo;
 
 typedef struct tagPictInfo {
@@ -538,6 +539,8 @@
   PPicture pDec;
   SWelsDecEvent sImageReady;
   SWelsDecEvent sSliceDecodeStart;
+  SWelsDecEvent sSliceDecodeFinsh;
+  int32_t       iPicBuffIdx; //picBuff Index
 } SWelsDecoderThreadCTX, *PWelsDecoderThreadCTX;
 
 static inline void ResetActiveSPSForEachLayer (PWelsDecoderContext pCtx) {
--- a/codec/decoder/core/inc/pic_queue.h
+++ b/codec/decoder/core/inc/pic_queue.h
@@ -54,6 +54,8 @@
 
 PPicture PrefetchPic (PPicBuff pPicBuff);  // To get current node applicable
 PPicture PrefetchPicForThread (PPicBuff pPicBuff); // To get current node applicable in the case of threaded mode
+PPicture PrefetchLastPicForThread (PPicBuff pPicBuff,
+                                   const int32_t& iLast); // To get last node applicable in the case of threaded mode
 
 } // namespace WelsDec
 
--- a/codec/decoder/core/inc/picture.h
+++ b/codec/decoder/core/inc/picture.h
@@ -89,11 +89,13 @@
   uint32_t    uiDecodingTimeStamp; //represent relative decoding time stamps
   int32_t     iPicBuffIdx;
   EWelsSliceType  eSliceType;
+  bool        bIsUngroupedMultiSlice; //multi-slice picture in which each slice group contains one slice.
   bool bNewSeqBegin;
   int32_t iMbEcedNum;
   int32_t iMbEcedPropNum;
   int32_t iMbNum;
 
+  bool*    pMbCorrectlyDecodedFlag;
   uint32_t*  pMbType; // mb type used for direct mode
   int16_t (*pMv[LIST_A])[MB_BLOCK4x4_NUM][MV_A]; // used for direct mode
   int8_t (*pRefIndex[LIST_A])[MB_BLOCK4x4_NUM]; //used for direct mode
--- a/codec/decoder/core/src/decoder.cpp
+++ b/codec/decoder/core/src/decoder.cpp
@@ -393,6 +393,7 @@
   sLastDecPicInfo.pPreviousDecodedPictureInDpb = NULL;
   sLastDecPicInfo.iPrevFrameNum = -1;
   sLastDecPicInfo.bLastHasMmco5 = false;
+  sLastDecPicInfo.uiDecodingTimeStamp = 0;
 }
 
 /*!
@@ -437,6 +438,9 @@
     iNumRefFrames = MAX_REF_PIC_COUNT + 2;
   } else {
     iNumRefFrames = pCtx->pSps->iNumRefFrames + 2;
+    if (pCtx->pThreadCtx != NULL) {
+      iNumRefFrames = MAX_REF_PIC_COUNT + 1;
+    }
   }
 
 #ifdef LONG_TERM_REF
@@ -478,7 +482,9 @@
                          && kiPicHeight == pCtx->iImgHeightInPixel) && (!bNeedChangePicQueue)) // have same scaled buffer
 
   // sync update pRefList
-  WelsResetRefPic (pCtx); // added to sync update ref list due to pictures are free
+  if (pCtx->pThreadCtx == NULL) {
+    WelsResetRefPic (pCtx); // added to sync update ref list due to pictures are free
+  }
 
   if (pCtx->bHaveGotMemory && (kiPicWidth == pCtx->iImgWidthInPixel && kiPicHeight == pCtx->iImgHeightInPixel)
       && pCtx->pPicBuff != NULL && pCtx->pPicBuff->iCapacity != iPicQueueSize) {
@@ -554,6 +560,17 @@
   if (NULL != pPicBuff && NULL != *pPicBuff) {
     DestroyPicBuff (pCtx, pPicBuff, pMa);
   }
+  if (pCtx->pThreadCtx != NULL) {
+    //prevent double destruction of PPicBuff
+    PWelsDecoderThreadCTX pThreadCtx = (PWelsDecoderThreadCTX) (pCtx->pThreadCtx);
+    int32_t threadCount = pThreadCtx->sThreadInfo.uiThrMaxNum;
+    int32_t  id = pThreadCtx->sThreadInfo.uiThrNum;
+    for (int32_t i = 0; i < threadCount; ++i) {
+      if (pThreadCtx[i - id].pCtx != NULL) {
+        pThreadCtx[i - id].pCtx->pPicBuff = NULL;
+      }
+    }
+  }
 
   if (pCtx->pTempDec) {
     FreePicture (pCtx->pTempDec, pCtx->pMemAlign);
@@ -796,7 +813,11 @@
             }
             CheckAndFinishLastPic (pCtx, ppDst, pDstBufInfo);
             if (pCtx->bAuReadyFlag && pCtx->pAccessUnitList->uiAvailUnitsNum != 0) {
-              ConstructAccessUnit (pCtx, ppDst, pDstBufInfo);
+              if (pCtx->pThreadCtx == NULL) {
+                ConstructAccessUnit (pCtx, ppDst, pDstBufInfo);
+              } else {
+                pCtx->pAccessUnitList->uiAvailUnitsNum = 1;
+              }
             }
           }
           DecodeFinishUpdate (pCtx);
@@ -852,9 +873,15 @@
       if (IS_PARAM_SETS_NALS (pCtx->sCurNalHead.eNalUnitType)) {
         iRet = ParseNonVclNal (pCtx, pNalPayload, iDstIdx - iConsumedBytes, pSrcNal - 3, iSrcIdx + 3);
       }
-      CheckAndFinishLastPic (pCtx, ppDst, pDstBufInfo);
+      if (pCtx->pThreadCtx == NULL) {
+        CheckAndFinishLastPic (pCtx, ppDst, pDstBufInfo);
+      }
       if (pCtx->bAuReadyFlag && pCtx->pAccessUnitList->uiAvailUnitsNum != 0) {
-        ConstructAccessUnit (pCtx, ppDst, pDstBufInfo);
+        if (pCtx->pThreadCtx == NULL) {
+          ConstructAccessUnit (pCtx, ppDst, pDstBufInfo);
+        } else {
+          pCtx->pAccessUnitList->uiAvailUnitsNum = 1;
+        }
       }
     }
     DecodeFinishUpdate (pCtx);
--- a/codec/decoder/core/src/decoder_core.cpp
+++ b/codec/decoder/core/src/decoder_core.cpp
@@ -194,8 +194,9 @@
              "DecodeFrameConstruction(): iTotalNumMbRec:%d, total_num_mb_sps:%d, cur_layer_mb_width:%d, cur_layer_mb_height:%d ",
              pCtx->iTotalNumMbRec, kiTotalNumMbInCurLayer, pCurDq->iMbWidth, pCurDq->iMbHeight);
     bFrameCompleteFlag = false; //return later after output buffer is done
-    if (pCtx->bInstantDecFlag) //no-delay decoding, wait for new slice
+    if (pCtx->bInstantDecFlag) { //no-delay decoding, wait for new slice
       return ERR_INFO_MB_NUM_INADEQUATE;
+    }
   } else if (pCurDq->sLayerInfo.sNalHeaderExt.bIdrFlag
              && (pCtx->iErrorCode == dsErrorFree)) { //complete non-ECed IDR frame done
     pCtx->pDec->bIsComplete = true;
@@ -220,9 +221,26 @@
   ppDst[1] = ppDst[1] + pCtx->sFrameCrop.iTopOffset  * pPic->iLinesize[1] + pCtx->sFrameCrop.iLeftOffset;
   ppDst[2] = ppDst[2] + pCtx->sFrameCrop.iTopOffset  * pPic->iLinesize[1] + pCtx->sFrameCrop.iLeftOffset;
   pDstInfo->iBufferStatus = 1;
-
-  bool bOutResChange = (pCtx->iLastImgWidthInPixel != pDstInfo->UsrData.sSystemBuffer.iWidth)
-                       || (pCtx->iLastImgHeightInPixel != pDstInfo->UsrData.sSystemBuffer.iHeight);
+  if (pCtx->pThreadCtx != NULL && pPic->bIsComplete == false) {
+    pPic->bIsComplete = true;
+  }
+  if (pCtx->pThreadCtx != NULL) {
+    uint32_t uiMbHeight = (pCtx->pDec->iHeightInPixel + 15) >> 4;
+    for (uint32_t i = 0; i < uiMbHeight; ++i) {
+      SET_EVENT (&pCtx->pDec->pReadyEvent[i]);
+    }
+  }
+  bool bOutResChange = false;
+  if (pCtx->pThreadCtx == NULL || pCtx->pLastThreadCtx == NULL) {
+    bOutResChange = (pCtx->iLastImgWidthInPixel != pDstInfo->UsrData.sSystemBuffer.iWidth)
+                    || (pCtx->iLastImgHeightInPixel != pDstInfo->UsrData.sSystemBuffer.iHeight);
+  } else {
+    if (pCtx->pLastThreadCtx != NULL) {
+      PWelsDecoderThreadCTX pLastThreadCtx = (PWelsDecoderThreadCTX) (pCtx->pLastThreadCtx);
+      bOutResChange = (pLastThreadCtx->pCtx->iLastImgWidthInPixel != pDstInfo->UsrData.sSystemBuffer.iWidth)
+                      || (pLastThreadCtx->pCtx->iLastImgHeightInPixel != pDstInfo->UsrData.sSystemBuffer.iHeight);
+    }
+  }
   pCtx->iLastImgWidthInPixel = pDstInfo->UsrData.sSystemBuffer.iWidth;
   pCtx->iLastImgHeightInPixel = pDstInfo->UsrData.sSystemBuffer.iHeight;
   if (pCtx->pParam->eEcActiveIdc == ERROR_CON_DISABLE) //no buffer output if EC is disabled and frame incomplete
@@ -846,8 +864,9 @@
  *  Parse slice header of bitstream in avc for storing data structure
  */
 int32_t ParseSliceHeaderSyntaxs (PWelsDecoderContext pCtx, PBitStringAux pBs, const bool kbExtensionFlag) {
-  PNalUnit const kpCurNal               = pCtx->pAccessUnitList->pNalUnitsList[pCtx->pAccessUnitList->uiAvailUnitsNum -
-                                                                                 1];
+  PNalUnit const kpCurNal               =
+    pCtx->pAccessUnitList->pNalUnitsList[pCtx->pAccessUnitList->uiAvailUnitsNum -
+                                                                                1];
 
   PNalUnitHeaderExt pNalHeaderExt       = NULL;
   PSliceHeader pSliceHead               = NULL;
@@ -1462,7 +1481,6 @@
 
 int32_t InitialDqLayersContext (PWelsDecoderContext pCtx, const int32_t kiMaxWidth, const int32_t kiMaxHeight) {
   int32_t i = 0;
-
   WELS_VERIFY_RETURN_IF (ERR_INFO_INVALID_PARAM, (NULL == pCtx || kiMaxWidth <= 0 || kiMaxHeight <= 0))
   pCtx->sMb.iMbWidth  = (kiMaxWidth + 15) >> 4;
   pCtx->sMb.iMbHeight = (kiMaxHeight + 15) >> 4;
@@ -1508,7 +1526,8 @@
         sizeof (
           bool),
         "pCtx->sMb.pNoSubMbPartSizeLessThan8x8Flag[]");
-    pCtx->sMb.pTransformSize8x8Flag[i] = (bool*)pMa->WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (bool),
+    pCtx->sMb.pTransformSize8x8Flag[i] = (bool*)pMa->WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (
+                                           bool),
                                          "pCtx->sMb.pTransformSize8x8Flag[]");
     pCtx->sMb.pChromaQp[i] = (int8_t (*)[2])pMa->WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (
                                int8_t) * 2,
@@ -1519,9 +1538,11 @@
                                   int16_t) * MV_A * MB_BLOCK4x4_NUM, "pCtx->sMb.pMvd[][]");
     pCtx->sMb.pCbfDc[i] = (uint16_t*)pMa->WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (uint16_t),
                           "pCtx->sMb.pCbfDc[]");
-    pCtx->sMb.pNzc[i] = (int8_t (*)[24])pMa->WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (int8_t) * 24,
+    pCtx->sMb.pNzc[i] = (int8_t (*)[24])pMa->WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (
+                          int8_t) * 24,
                         "pCtx->sMb.pNzc[]");
-    pCtx->sMb.pNzcRs[i] = (int8_t (*)[24])pMa->WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (int8_t) * 24,
+    pCtx->sMb.pNzcRs[i] = (int8_t (*)[24])pMa->WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (
+                            int8_t) * 24,
                           "pCtx->sMb.pNzcRs[]");
     pCtx->sMb.pScaledTCoeff[i] = (int16_t (*)[MB_COEFF_LIST_SIZE])pMa->WelsMallocz (pCtx->sMb.iMbWidth *
                                  pCtx->sMb.iMbHeight *
@@ -1539,20 +1560,24 @@
                                    "pCtx->sMb.pChromaPredMode[]");
     pCtx->sMb.pCbp[i] = (int8_t*)pMa->WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (int8_t),
                         "pCtx->sMb.pCbp[]");
-    pCtx->sMb.pSubMbType[i] = (uint32_t (*)[MB_PARTITION_SIZE])pMa->WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight *
+    pCtx->sMb.pSubMbType[i] = (uint32_t (*)[MB_PARTITION_SIZE])pMa->WelsMallocz (pCtx->sMb.iMbWidth *
+                              pCtx->sMb.iMbHeight *
                               sizeof (
                                 uint32_t) * MB_PARTITION_SIZE, "pCtx->sMb.pSubMbType[]");
     pCtx->sMb.pSliceIdc[i] = (int32_t*) pMa->WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (int32_t),
                              "pCtx->sMb.pSliceIdc[]"); // using int32_t for slice_idc, 4/21/2010
-    pCtx->sMb.pResidualPredFlag[i] = (int8_t*) pMa->WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (int8_t),
+    pCtx->sMb.pResidualPredFlag[i] = (int8_t*) pMa->WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (
+                                       int8_t),
                                      "pCtx->sMb.pResidualPredFlag[]");
-    pCtx->sMb.pInterPredictionDoneFlag[i] = (int8_t*) pMa->WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (
-        int8_t), "pCtx->sMb.pInterPredictionDoneFlag[]");
+    pCtx->sMb.pInterPredictionDoneFlag[i] = (int8_t*) pMa->WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight *
+                                            sizeof (
+                                                int8_t), "pCtx->sMb.pInterPredictionDoneFlag[]");
 
     pCtx->sMb.pMbCorrectlyDecodedFlag[i] = (bool*) pMa->WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (
         bool),
                                            "pCtx->sMb.pMbCorrectlyDecodedFlag[]");
-    pCtx->sMb.pMbRefConcealedFlag[i] = (bool*) pMa->WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (bool),
+    pCtx->sMb.pMbRefConcealedFlag[i] = (bool*) pMa->WelsMallocz (pCtx->sMb.iMbWidth * pCtx->sMb.iMbHeight * sizeof (
+                                         bool),
                                        "pCtx->pMbRefConcealedFlag[]");
 
     // check memory block valid due above allocated..
@@ -1599,6 +1624,8 @@
   return ERR_NONE;
 }
 
+
+
 void UninitialDqLayersContext (PWelsDecoderContext pCtx) {
   int32_t i = 0;
   CMemoryAlign* pMa = pCtx->pMemAlign;
@@ -2307,39 +2334,18 @@
  *  0 - success; otherwise returned error_no defined in error_no.h
  */
 int32_t ConstructAccessUnit (PWelsDecoderContext pCtx, uint8_t** ppDst, SBufferInfo* pDstInfo) {
-  int32_t iErr;
-  PAccessUnit pCurAu = pCtx->pAccessUnitList;
-  pCtx->bAuReadyFlag = false;
-  pCtx->pLastDecPicInfo->bLastHasMmco5 = false;
-  bool bTmpNewSeqBegin = CheckNewSeqBeginAndUpdateActiveLayerSps (pCtx);
-  pCtx->bNewSeqBegin = pCtx->bNewSeqBegin || bTmpNewSeqBegin;
-  iErr = WelsDecodeAccessUnitStart (pCtx);
-  GetVclNalTemporalId (pCtx);
-
-  if (ERR_NONE != iErr) {
-    ForceResetCurrentAccessUnit (pCtx->pAccessUnitList);
-    if (!pCtx->pParam->bParseOnly)
-      pDstInfo->iBufferStatus = 0;
-    pCtx->bNewSeqBegin = pCtx->bNewSeqBegin || pCtx->bNextNewSeqBegin;
-    pCtx->bNextNewSeqBegin = false; // reset it
-    if (pCtx->bNewSeqBegin)
-      ResetActiveSPSForEachLayer (pCtx);
-    return iErr;
-  }
-
-  pCtx->pSps = pCurAu->pNalUnitsList[pCurAu->uiStartPos]->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader.pSps;
-  pCtx->pPps = pCurAu->pNalUnitsList[pCurAu->uiStartPos]->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader.pPps;
-
-  //try to allocate or relocate DPB memory only when new sequence is coming.
-  if (pCtx->bNewSeqBegin) {
-    WelsResetRefPic (pCtx); //clear ref pPic when IDR NAL
-    iErr = SyncPictureResolutionExt (pCtx, pCtx->pSps->iMbWidth, pCtx->pSps->iMbHeight);
-
+  int32_t iErr = ERR_NONE;
+  if (pCtx->pThreadCtx == NULL) {
+    iErr = InitConstructAccessUnit (pCtx, pDstInfo);
     if (ERR_NONE != iErr) {
-      WelsLog (& (pCtx->sLogCtx), WELS_LOG_WARNING, "sync picture resolution ext failed,  the error is %d", iErr);
       return iErr;
     }
   }
+  if (pCtx->pCabacDecEngine == NULL) {
+    pCtx->pCabacDecEngine = (SWelsCabacDecEngine*)pCtx->pMemAlign->WelsMallocz (sizeof (SWelsCabacDecEngine),
+                            "pCtx->pCabacDecEngine");
+    WELS_VERIFY_RETURN_IF (ERR_INFO_OUT_OF_MEMORY, (NULL == pCtx->pCabacDecEngine))
+  }
 
   iErr = DecodeCurrentAccessUnit (pCtx, ppDst, pDstInfo);
 
@@ -2412,6 +2418,9 @@
 
 int32_t InitRefPicList (PWelsDecoderContext pCtx, const uint8_t kuiNRi, int32_t iPoc) {
   int32_t iRet = ERR_NONE;
+  if (pCtx->pThreadCtx != NULL && pCtx->bNewSeqBegin) {
+    WelsResetRefPic (pCtx);
+  }
   if (pCtx->eSliceType == B_SLICE) {
     iRet = WelsInitBSliceRefList (pCtx, iPoc);
     CreateImplicitWeightTable (pCtx);
@@ -2466,13 +2475,26 @@
  * Decode current access unit when current AU is completed.
  */
 int32_t DecodeCurrentAccessUnit (PWelsDecoderContext pCtx, uint8_t** ppDst, SBufferInfo* pDstInfo) {
-  int32_t iRefCount[LIST_A];
-  PNalUnit pNalCur = NULL;
+  PNalUnit pNalCur = pCtx->pNalCur = NULL;
   PAccessUnit pCurAu = pCtx->pAccessUnitList;
 
   int32_t iIdx = pCurAu->uiStartPos;
   int32_t iEndIdx = pCurAu->uiEndPos;
 
+  //get current thread ctx
+  PWelsDecoderThreadCTX pThreadCtx = NULL;
+  if (pCtx->pThreadCtx != NULL) {
+    pThreadCtx = (PWelsDecoderThreadCTX)pCtx->pThreadCtx;
+  }
+  //get last thread ctx
+  PWelsDecoderThreadCTX pLastThreadCtx = NULL;
+  if (pCtx->pLastThreadCtx != NULL) {
+    pLastThreadCtx = (PWelsDecoderThreadCTX) (pCtx->pLastThreadCtx);
+    if (pLastThreadCtx->pDec == NULL) {
+      pLastThreadCtx->pDec = PrefetchLastPicForThread (pCtx->pPicBuff,
+                             pLastThreadCtx->iPicBuffIdx);
+    }
+  }
   int32_t iPpsId = 0;
   int32_t iRet = ERR_NONE;
 
@@ -2487,7 +2509,7 @@
     true; // Another fresh slice comingup for given dq layer, for multiple slices in case of header parts of slices sometimes loss over error-prone channels, 8/14/2008
 
   //update pCurDqLayer at the starting of AU decoding
-  if (pCtx->bInitialDqLayersMem) {
+  if (pCtx->bInitialDqLayersMem || pCtx->pCurDqLayer == NULL) {
     pCtx->pCurDqLayer = pCtx->pDqLayersList[0];
   }
 
@@ -2500,8 +2522,71 @@
     PSliceHeaderExt pShExt = NULL;
     PSliceHeader pSh = NULL;
 
+    if (pLastThreadCtx != NULL) {
+      pSh = &pNalCur->sNalData.sVclNal.sSliceHeaderExt.sSliceHeader;
+      if (pSh->iFirstMbInSlice == 0) {
+        if (pLastThreadCtx->pCtx->pDec != NULL && pLastThreadCtx->pCtx->pDec->bIsUngroupedMultiSlice) {
+          WAIT_EVENT (&pLastThreadCtx->sSliceDecodeFinsh, WELS_DEC_THREAD_WAIT_INFINITE);
+        }
+        pCtx->pDec = NULL;
+        pCtx->iTotalNumMbRec = 0;
+      } else if (pLastThreadCtx->pCtx->pDec != NULL) {
+        if (pSh->iFrameNum == pLastThreadCtx->pCtx->pDec->iFrameNum
+            && pSh->iPicOrderCntLsb == pLastThreadCtx->pCtx->pDec->iFramePoc) {
+          WAIT_EVENT (&pLastThreadCtx->sSliceDecodeFinsh, WELS_DEC_THREAD_WAIT_INFINITE);
+          pCtx->pDec = pLastThreadCtx->pCtx->pDec;
+          pCtx->pDec->bIsUngroupedMultiSlice = true;
+          pCtx->sRefPic = pLastThreadCtx->pCtx->sRefPic;
+          pCtx->iTotalNumMbRec = pLastThreadCtx->pCtx->iTotalNumMbRec;
+        }
+      }
+    }
+    bool isNewFrame = true;
+    if (pThreadCtx != NULL) {
+      isNewFrame = pCtx->pDec == NULL;
+    }
     if (pCtx->pDec == NULL) {
+      if (pLastThreadCtx != NULL) {
+        pLastThreadCtx->pDec->bUsedAsRef = pLastThreadCtx->pCtx->uiNalRefIdc > 0;
+        if (pLastThreadCtx->pDec->bUsedAsRef) {
+          for (int32_t listIdx = LIST_0; listIdx < LIST_A; ++listIdx) {
+            uint32_t i = 0;
+            while (i < MAX_DPB_COUNT && pLastThreadCtx->pCtx->sRefPic.pRefList[listIdx][i]) {
+              pLastThreadCtx->pDec->pRefPic[listIdx][i] = pLastThreadCtx->pCtx->sRefPic.pRefList[listIdx][i];
+              pLastThreadCtx->pDec->pRefPic[listIdx][i]->bAvailableFlag = false;
+              ++i;
+            }
+          }
+          pLastThreadCtx->pCtx->sTmpRefPic = pLastThreadCtx->pCtx->sRefPic;
+          WelsMarkAsRef (pLastThreadCtx->pCtx, pLastThreadCtx->pDec);
+          pCtx->sRefPic = pLastThreadCtx->pCtx->sTmpRefPic;
+        } else {
+          pCtx->sRefPic = pLastThreadCtx->pCtx->sRefPic;
+        }
+        //printf ("last uiDecodingTimeStamp = %d\n", pLastThreadCtx->pCtx->uiDecodingTimeStamp);
+        for (int32_t i = 0; i < pCtx->sRefPic.uiRefCount[LIST_0]; ++i) {
+          if (pCtx->sRefPic.pRefList[LIST_0][i] != NULL) {
+            pCtx->sRefPic.pRefList[LIST_0][i]->bAvailableFlag = false;
+          }
+        }
+        for (int32_t i = 0; i < pCtx->sRefPic.uiRefCount[LIST_1]; ++i) {
+          if (pCtx->sRefPic.pRefList[LIST_1][i] != NULL) {
+            pCtx->sRefPic.pRefList[LIST_1][i]->bAvailableFlag = false;
+          }
+        }
+      }
       pCtx->pDec = PrefetchPic (pCtx->pPicBuff);
+      if (pThreadCtx != NULL) {
+        if (pCtx->pDec != NULL) {
+          pCtx->pDec->bAvailableFlag = false;
+          pCtx->pDec->bIsUngroupedMultiSlice = false;
+          pThreadCtx->pDec = pCtx->pDec;
+          uint32_t uiMbHeight = (pCtx->pDec->iHeightInPixel + 15) >> 4;
+          for (uint32_t i = 0; i < uiMbHeight; ++i) {
+            RESET_EVENT (&pCtx->pDec->pReadyEvent[i]);
+          }
+        }
+      }
       if (pCtx->iTotalNumMbRec != 0)
         pCtx->iTotalNumMbRec = 0;
 
@@ -2519,6 +2604,10 @@
     }
     pCtx->pDec->uiTimeStamp = pNalCur->uiTimeStamp;
     pCtx->pDec->uiDecodingTimeStamp = pCtx->uiDecodingTimeStamp;
+    if (pThreadCtx != NULL) {
+      pThreadCtx->iPicBuffIdx = pCtx->pDec->iPicBuffIdx;
+      pCtx->pCurDqLayer->pMbCorrectlyDecodedFlag = pCtx->pDec->pMbCorrectlyDecodedFlag;
+    }
 
     if (pCtx->iTotalNumMbRec == 0) { //Picture start to decode
       for (int32_t i = 0; i < LAYER_NUM_EXCHANGEABLE; ++ i)
@@ -2556,6 +2645,7 @@
       pCtx->pDec->iFramePoc = pSh->iPicOrderCntLsb; // still can not obtain correct, because current do not support POCtype 2
       pCtx->pDec->bIdrFlag = pNalCur->sNalHeaderExt.bIdrFlag;
       pCtx->pDec->eSliceType = pSh->eSliceType;
+
       memcpy (&pLayerInfo.sSliceInLayer.sSliceHeaderExt, pShExt, sizeof (SSliceHeaderExt)); //confirmed_safe_unsafe_usage
       pLayerInfo.sSliceInLayer.bSliceHeaderExtFlag      = pNalCur->sNalData.sVclNal.bSliceHeaderExtFlag;
       pLayerInfo.sSliceInLayer.eSliceType               = pSh->eSliceType;
@@ -2587,11 +2677,9 @@
       bFreshSliceAvailable = (iCurrIdD != iLastIdD
                               || iCurrIdQ != iLastIdQ);        // do not need condition of (first_mb == 0) due multiple slices might be disorder
 
+
       WelsDqLayerDecodeStart (pCtx, pNalCur, pLayerInfo.pSps, pLayerInfo.pPps);
 
-      if (iCurrIdQ == BASE_QUALITY_ID) {
-        ST64 (iRefCount, LD64 (pLayerInfo.sSliceInLayer.sSliceHeaderExt.sSliceHeader.uiRefCount));
-      }
 
       if ((iLastIdD < 0) ||  //case 1: first layer
           (iLastIdD == iCurrIdD)) { //case 2: same uiDId
@@ -2601,13 +2689,23 @@
           const bool kbIdrFlag = dq_cur->sLayerInfo.sNalHeaderExt.bIdrFlag
                                  || (dq_cur->sLayerInfo.sNalHeaderExt.sNalUnitHeader.eNalUnitType == NAL_UNIT_CODED_SLICE_IDR);
           // Subclause 8.2.5.2 Decoding process for gaps in frame_num
+          int32_t iPrevFrameNum = pCtx->pLastDecPicInfo->iPrevFrameNum;
+          if (pLastThreadCtx != NULL) {
+            if (pCtx->bNewSeqBegin) {
+              iPrevFrameNum = 0;
+            } else if (pLastThreadCtx->pDec != NULL) {
+              iPrevFrameNum = pLastThreadCtx->pDec->iFrameNum;
+            } else {
+              iPrevFrameNum = pCtx->bNewSeqBegin ? 0 : pLastThreadCtx->pCtx->iFrameNum;
+            }
+          }
           if (!kbIdrFlag  &&
-              pSh->iFrameNum != pCtx->pLastDecPicInfo->iPrevFrameNum &&
-              pSh->iFrameNum != ((pCtx->pLastDecPicInfo->iPrevFrameNum + 1) & ((1 << dq_cur->sLayerInfo.pSps->uiLog2MaxFrameNum) -
+              pSh->iFrameNum != iPrevFrameNum &&
+              pSh->iFrameNum != ((iPrevFrameNum + 1) & ((1 << dq_cur->sLayerInfo.pSps->uiLog2MaxFrameNum) -
                                  1))) {
             WelsLog (& (pCtx->sLogCtx), WELS_LOG_WARNING,
                      "referencing pictures lost due frame gaps exist, prev_frame_num: %d, curr_frame_num: %d",
-                     pCtx->pLastDecPicInfo->iPrevFrameNum,
+                     iPrevFrameNum,
                      pSh->iFrameNum);
 
             bAllRefComplete = false;
@@ -2623,7 +2721,7 @@
           }
         }
 
-        if (iCurrIdD == kuiDependencyIdMax && iCurrIdQ == BASE_QUALITY_ID) {
+        if (iCurrIdD == kuiDependencyIdMax && iCurrIdQ == BASE_QUALITY_ID && isNewFrame) {
           iRet = InitRefPicList (pCtx, pCtx->uiNalRefIdc, pSh->iPicOrderCntLsb);
           if (iRet) {
             pCtx->bRPLRError = true;
@@ -2643,7 +2741,13 @@
         if (pSh->eSliceType == B_SLICE && !pSh->iDirectSpatialMvPredFlag)
           ComputeColocatedTemporalScaling (pCtx);
 
-        iRet = WelsDecodeSlice (pCtx, bFreshSliceAvailable, pNalCur);
+        if (pThreadCtx != NULL) {
+          memset (&pCtx->lastReadyHeightOffset[0][0], -1, LIST_A * MAX_REF_PIC_COUNT * sizeof (int16_t));
+          SET_EVENT (&pThreadCtx->sSliceDecodeStart);
+          iRet = WelsDecodeAndConstructSlice (pCtx);
+        } else {
+          iRet = WelsDecodeSlice (pCtx, bFreshSliceAvailable, pNalCur);
+        }
 
         //Output good store_base reconstruction when enhancement quality layer occurred error for MGS key picture case
         if (iRet != ERR_NONE) {
@@ -2659,7 +2763,7 @@
           }
         }
 
-        if (bReconstructSlice) {
+        if (pThreadCtx == NULL && bReconstructSlice) {
           if ((iRet = WelsDecodeConstructSlice (pCtx, pNalCur)) != ERR_NONE) {
             pCtx->pDec->bIsComplete = false; // reconstruction error, directly set the flag false
             return iRet;
@@ -2666,10 +2770,12 @@
           }
         }
         if (bAllRefComplete && pCtx->eSliceType != I_SLICE) {
-          if (pCtx->sRefPic.uiRefCount[LIST_0] > 0) {
-            bAllRefComplete &= CheckRefPicturesComplete (pCtx);
-          } else {
-            bAllRefComplete = false;
+          if (pCtx->pThreadCtx == NULL) {
+            if (pCtx->sRefPic.uiRefCount[LIST_0] > 0) {
+              bAllRefComplete &= CheckRefPicturesComplete (pCtx);
+            } else {
+              bAllRefComplete = false;
+            }
           }
         }
       }
@@ -2721,34 +2827,49 @@
         }
       }
 
+      if (pThreadCtx != NULL && pCtx->uiDecodingTimeStamp > 1 && pCtx->pLastDecPicInfo->uiDecodingTimeStamp > 0) {
+        while (pCtx->uiDecodingTimeStamp > pCtx->pLastDecPicInfo->uiDecodingTimeStamp + 1) {
+          WelsSleep (1);
+        }
+      }
+      if (pThreadCtx != NULL) {
+        pCtx->pLastDecPicInfo->uiDecodingTimeStamp = pCtx->uiDecodingTimeStamp;
+      }
       iRet = DecodeFrameConstruction (pCtx, ppDst, pDstInfo);
-      if (iRet)
+      if (iRet) {
+        if (pThreadCtx != NULL) {
+          SET_EVENT (&pThreadCtx->sSliceDecodeFinsh);
+        }
         return iRet;
+      }
 
       pCtx->pLastDecPicInfo->pPreviousDecodedPictureInDpb = pCtx->pDec; //store latest decoded picture for EC
-      pCtx->bUsedAsRef = false;
-      if (pCtx->uiNalRefIdc > 0) {
-        pCtx->bUsedAsRef = true;
-        for (int32_t listIdx = LIST_0; listIdx < LIST_A; ++listIdx) {
-          uint32_t i = 0;
-          while (i < MAX_DPB_COUNT && pCtx->sRefPic.pRefList[listIdx][i]) {
-            pCtx->pDec->pRefPic[listIdx][i] = pCtx->sRefPic.pRefList[listIdx][i];
-            ++i;
+      pCtx->bUsedAsRef = pCtx->uiNalRefIdc > 0;
+      if (pCtx->pThreadCtx == NULL) {
+        if (pCtx->bUsedAsRef) {
+          for (int32_t listIdx = LIST_0; listIdx < LIST_A; ++listIdx) {
+            uint32_t i = 0;
+            while (i < MAX_DPB_COUNT && pCtx->sRefPic.pRefList[listIdx][i]) {
+              pCtx->pDec->pRefPic[listIdx][i] = pCtx->sRefPic.pRefList[listIdx][i];
+              ++i;
+            }
           }
-        }
-        iRet = WelsMarkAsRef (pCtx);
-        if (iRet != ERR_NONE) {
-          if (iRet == ERR_INFO_DUPLICATE_FRAME_NUM)
-            pCtx->iErrorCode |= dsBitstreamError;
-          if (pCtx->pParam->eEcActiveIdc == ERROR_CON_DISABLE) {
-            pCtx->pDec = NULL;
-            return iRet;
+          iRet = WelsMarkAsRef (pCtx);
+          if (iRet != ERR_NONE) {
+            if (iRet == ERR_INFO_DUPLICATE_FRAME_NUM)
+              pCtx->iErrorCode |= dsBitstreamError;
+            if (pCtx->pParam->eEcActiveIdc == ERROR_CON_DISABLE) {
+              pCtx->pDec = NULL;
+              return iRet;
+            }
           }
+          if (!pCtx->pParam->bParseOnly)
+            ExpandReferencingPicture (pCtx->pDec->pData, pCtx->pDec->iWidthInPixel, pCtx->pDec->iHeightInPixel,
+                                      pCtx->pDec->iLinesize,
+                                      pCtx->sExpandPicFunc.pfExpandLumaPicture, pCtx->sExpandPicFunc.pfExpandChromaPicture);
         }
-        if (!pCtx->pParam->bParseOnly)
-          ExpandReferencingPicture (pCtx->pDec->pData, pCtx->pDec->iWidthInPixel, pCtx->pDec->iHeightInPixel,
-                                    pCtx->pDec->iLinesize,
-                                    pCtx->sExpandPicFunc.pfExpandLumaPicture, pCtx->sExpandPicFunc.pfExpandChromaPicture);
+      } else {
+        SET_EVENT (&pThreadCtx->sImageReady);
       }
       pCtx->pDec = NULL; //after frame decoding, always set to NULL
     }
@@ -2758,8 +2879,27 @@
       pCtx->pLastDecPicInfo->iPrevFrameNum = pSh->iFrameNum;
     if (pCtx->pLastDecPicInfo->bLastHasMmco5)
       pCtx->pLastDecPicInfo->iPrevFrameNum = 0;
+    if (pThreadCtx != NULL) {
+      int32_t threadCount = pThreadCtx->sThreadInfo.uiThrMaxNum;
+      int32_t  id = pThreadCtx->sThreadInfo.uiThrNum;
+      for (int32_t i = 0; i < threadCount; ++i) {
+        if (pThreadCtx[i - id].pCtx != NULL) {
+          unsigned long long uiTimeStamp = pThreadCtx[i - id].pCtx->uiTimeStamp;
+          if (uiTimeStamp > 0 && pThreadCtx[i - id].pCtx->sSpsPpsCtx.iSeqId > pCtx->sSpsPpsCtx.iSeqId) {
+            CopySpsPps (pThreadCtx[i - id].pCtx, pCtx);
+            if (pCtx->pPicBuff != pThreadCtx[i - id].pCtx->pPicBuff) {
+              pCtx->pPicBuff = pThreadCtx[i - id].pCtx->pPicBuff;
+            }
+            InitialDqLayersContext (pCtx, pCtx->pSps->iMbWidth << 4, pCtx->pSps->iMbHeight << 4);
+            break;
+          }
+        }
+      }
+    }
   }
-
+  if (pThreadCtx != NULL) {
+    SET_EVENT (&pThreadCtx->sSliceDecodeFinsh);
+  }
   return ERR_NONE;
 }
 
@@ -2875,6 +3015,7 @@
     if (iRealMbIdx == -1) //caused by abnormal return of FmoNextMb()
       return false;
   }
+
   return bAllRefComplete;
 }
 } // namespace WelsDec
--- a/codec/decoder/core/src/manage_dec_ref.cpp
+++ b/codec/decoder/core/src/manage_dec_ref.cpp
@@ -150,7 +150,7 @@
           && pCtx->eSliceType != SI_SLICE)) {
     if (pCtx->pParam->eEcActiveIdc !=
         ERROR_CON_DISABLE) { //IDR lost!, recover it for future decoding with data all set to 0
-      PPicture pRef = pCtx->pThreadCtx != NULL ? PrefetchPicForThread (pCtx->pPicBuff) : PrefetchPic (pCtx->pPicBuff);
+      PPicture pRef = PrefetchPic (pCtx->pPicBuff);
       if (pRef != NULL) {
         // IDR lost, set new
         pRef->bIsComplete = false; // Set complete flag to false for lost IDR ref picture
--- a/codec/decoder/core/src/pic_queue.cpp
+++ b/codec/decoder/core/src/pic_queue.cpp
@@ -111,8 +111,8 @@
   uint32_t uiMbWidth = (kiPicWidth + 15) >> 4;
   uint32_t uiMbHeight = (kiPicHeight + 15) >> 4;
   uint32_t uiMbCount = uiMbWidth * uiMbHeight;
-  pPic->pMbType = (uint32_t*)pMa->WelsMallocz (uiMbCount * sizeof (uint32_t),
-                  "pPic->pMbType");
+  pPic->pMbCorrectlyDecodedFlag = (bool*)pMa->WelsMallocz (uiMbCount * sizeof (bool), "pPic->pMbCorrectlyDecodedFlag");
+  pPic->pMbType = (uint32_t*)pMa->WelsMallocz (uiMbCount * sizeof (uint32_t), "pPic->pMbType");
   pPic->pMv[LIST_0] = (int16_t (*)[16][2])pMa->WelsMallocz (uiMbCount * sizeof (
                         int16_t) * MV_A * MB_BLOCK4x4_NUM, "pPic->pMv[]");
   pPic->pMv[LIST_1] = (int16_t (*)[16][2])pMa->WelsMallocz (uiMbCount * sizeof (
@@ -140,6 +140,11 @@
       pPic->pBuffer[0] = NULL;
     }
 
+    if (pPic->pMbCorrectlyDecodedFlag) {
+      pMa->WelsFree (pPic->pMbCorrectlyDecodedFlag, "pPic->pMbCorrectlyDecodedFlag");
+      pPic->pMbCorrectlyDecodedFlag = NULL;
+    }
+
     if (pPic->pMbType) {
       pMa->WelsFree (pPic->pMbType, "pPic->pMbType");
       pPic->pMbType = NULL;
@@ -213,6 +218,18 @@
   pPic->iPicBuffIdx = pPicBuf->iCurrentIdx;
   if (++pPicBuf->iCurrentIdx >= pPicBuf->iCapacity) {
     pPicBuf->iCurrentIdx = 0;
+  }
+  return pPic;
+}
+
+PPicture PrefetchLastPicForThread (PPicBuff pPicBuf, const int32_t& iLastPicBuffIdx) {
+  PPicture pPic = NULL;
+
+  if (pPicBuf->iCapacity == 0) {
+    return NULL;
+  }
+  if (iLastPicBuffIdx >= 0 && iLastPicBuffIdx < pPicBuf->iCapacity) {
+    pPic = pPicBuf->ppPic[iLastPicBuffIdx];
   }
   return pPic;
 }
--- a/codec/decoder/core/src/wels_decoder_thread.cpp
+++ b/codec/decoder/core/src/wels_decoder_thread.cpp
@@ -57,6 +57,12 @@
 #define HW_NCPU_NAME "hw.ncpu"
 #endif
 #endif
+#ifdef ANDROID_NDK
+#include <cpu-features.h>
+#endif
+#ifdef __ANDROID__
+#include <android/api-level.h>
+#endif
 
 #include "wels_decoder_thread.h"
 #include <stdio.h>