shithub: openh264

Download patch

ref: 801da26d1d11b393533b2f262ff397ea9fa4f1bd
parent: d0a81355b0b2677f38c49d1b55adce584f3f5449
author: Martin Storsjö <[email protected]>
date: Mon Mar 3 17:45:23 EST 2014

Use WelsMultipleEventsWaitSingleBlocking with a master event for waiting on finished threads

This allows using the same codepath on both Unix and Windows
for distributing new slices to be coded to the worker threads.

This also improves the performance on Unix: instead of waiting
for all the running threads to finish their current slice
before handing out a new slice to each of them (where the threads
that finish first just sit idle instead of immediately getting
a new slice to work on), we now use the same logic as on Windows
and hand out a new slice to each thread as soon as it finishes.

In one setup, it improves the performance of encoding from ~920 fps
to ~950 fps, and in another setup it goes from ~390 fps to ~660 fps.
(These tests were done with the SM_ROWMB_SLICE mode, which
heavily exercises the code for distributing new slices to the
worker threads.)

The extra WelsEventSignal call on Windows, where it isn't strictly
necessary, doesn't incur any measurable slowdown, so it is kept
without any extra ifdefs to keep the code more readable and unified.

--- a/codec/encoder/core/inc/mt_defs.h
+++ b/codec/encoder/core/inc/mt_defs.h
@@ -94,6 +94,7 @@
 char eventNamespace[100];
 WELS_THREAD_HANDLE			pThreadHandles[MAX_THREADS_NUM];// thread handles, [iThreadIdx]
 WELS_EVENT					pSliceCodedEvent[MAX_THREADS_NUM];// events for slice coded state, [iThreadIdx]
+WELS_EVENT					pSliceCodedMasterEvent;	// events for signalling that some event in pSliceCodedEvent has been signalled
 WELS_EVENT					pReadySliceCodingEvent[MAX_THREADS_NUM];	// events for slice coding ready, [iThreadIdx]
 WELS_EVENT					pUpdateMbListEvent[MAX_THREADS_NUM];		// signal to update mb list neighbor for various slices
 WELS_EVENT					pFinUpdateMbListEvent[MAX_THREADS_NUM];	// signal to indicate finish updating mb list
--- a/codec/encoder/core/src/encoder_ext.cpp
+++ b/codec/encoder/core/src/encoder_ext.cpp
@@ -3225,7 +3225,7 @@
             return ENC_RETURN_UNEXPECTED;
           }
 
-          WelsMultipleEventsWaitAllBlocking (iSliceCount, &pCtx->pSliceThreading->pSliceCodedEvent[0]);
+          WelsMultipleEventsWaitAllBlocking (iSliceCount, &pCtx->pSliceThreading->pSliceCodedEvent[0], &pCtx->pSliceThreading->pSliceCodedMasterEvent);
 
 
           // all slices are finished coding here
@@ -3266,12 +3266,12 @@
           while (1) {
             if (iIndexOfSliceToBeCoded >= iSliceCount && iNumThreadsRunning <= 0)
               break;
-#ifdef _WIN32
             WELS_THREAD_ERROR_CODE lwait	= 0;
             int32_t iEventId				= -1;
 
             lwait = WelsMultipleEventsWaitSingleBlocking (iNumThreadsScheduled,
-                    &pCtx->pSliceThreading->pSliceCodedEvent[0]);
+                    &pCtx->pSliceThreading->pSliceCodedEvent[0],
+                    &pCtx->pSliceThreading->pSliceCodedMasterEvent);
             iEventId = (int32_t) (lwait - WELS_THREAD_ERROR_WAIT_OBJECT_0);
             if (iEventId >= 0 && iEventId < iNumThreadsScheduled) {
               if (iIndexOfSliceToBeCoded < iSliceCount) {
@@ -3285,29 +3285,6 @@
                 -- iNumThreadsRunning;
               }
             }
-#else
-            // TODO for pthread platforms
-            // alternate implementation using blocking due non-blocking with timeout mode not support at wels thread lib, tune back if available
-            WelsMultipleEventsWaitAllBlocking (iNumThreadsRunning, &pCtx->pSliceThreading->pSliceCodedEvent[0]);
-            WELS_VERIFY_RETURN_IFNEQ(pCtx->iEncoderError, ENC_RETURN_SUCCESS)
-            if (iIndexOfSliceToBeCoded < iSliceCount) {
-              int32_t iThreadIdx = 0;
-              // pick up succeeding slices for threading if left
-              while (iThreadIdx < iNumThreadsScheduled) {
-                if (iIndexOfSliceToBeCoded >= iSliceCount)
-                  break;
-                pCtx->pSliceThreading->pThreadPEncCtx[iThreadIdx].iSliceIndex = iIndexOfSliceToBeCoded;
-                WelsEventSignal (&pCtx->pSliceThreading->pReadySliceCodingEvent[iThreadIdx]);
-
-                ++ iIndexOfSliceToBeCoded;
-                ++ iThreadIdx;
-              }
-              // update iNumThreadsRunning
-              iNumThreadsRunning		= iThreadIdx;
-            } else {
-              iNumThreadsRunning = 0;
-            }
-#endif//_WIN32
           }//while(1)
 
           // all slices are finished coding here
@@ -3329,7 +3306,7 @@
           return ENC_RETURN_UNEXPECTED;
         }
 
-        WelsMultipleEventsWaitAllBlocking (kiPartitionCnt, &pCtx->pSliceThreading->pSliceCodedEvent[0]);
+        WelsMultipleEventsWaitAllBlocking (kiPartitionCnt, &pCtx->pSliceThreading->pSliceCodedEvent[0], &pCtx->pSliceThreading->pSliceCodedMasterEvent);
         WELS_VERIFY_RETURN_IFNEQ(pCtx->iEncoderError, ENC_RETURN_SUCCESS)
 
         iLayerSize = AppendSliceToFrameBs (pCtx, pLayerBsInfo, kiPartitionCnt);
--- a/codec/encoder/core/src/slice_multi_threading.cpp
+++ b/codec/encoder/core/src/slice_multi_threading.cpp
@@ -351,10 +351,11 @@
 
   MT_TRACE_LOG ((*ppCtx), WELS_LOG_INFO, "encpEncCtx= 0x%p\n", (void*) (*ppCtx));
 
+  char name[SEM_NAME_MAX] = {0};
+  WELS_THREAD_ERROR_CODE err = 0;
+
   iIdx = 0;
   while (iIdx < iThreadNum) {
-    char name[SEM_NAME_MAX] = {0};
-    WELS_THREAD_ERROR_CODE err = 0;
     pSmt->pThreadPEncCtx[iIdx].pWelsPEncCtx	= (void*) (*ppCtx);
     pSmt->pThreadPEncCtx[iIdx].iSliceIndex	= iIdx;
     pSmt->pThreadPEncCtx[iIdx].iThreadIndex	= iIdx;
@@ -386,6 +387,10 @@
     ++ iIdx;
   }
 
+  WelsSnprintf (name, SEM_NAME_MAX, "scm%s", pSmt->eventNamespace);
+  err = WelsEventOpen (&pSmt->pSliceCodedMasterEvent, name);
+  MT_TRACE_LOG ((*ppCtx), WELS_LOG_INFO, "[MT] Open pSliceCodedMasterEvent named(%s) ret%d err%d\n", name, err, errno);
+
   (*ppCtx)->pSliceBs	= (SWelsSliceBs*)pMa->WelsMalloc (sizeof (SWelsSliceBs) * iMaxSliceNum, "pSliceBs");
   WELS_VERIFY_RETURN_PROC_IF (1, (NULL == (*ppCtx)->pSliceBs), FreeMemorySvc (ppCtx))
 
@@ -444,8 +449,8 @@
   if (NULL == pSmt)
     return;
 
+  char ename[SEM_NAME_MAX] = {0};
   while (iIdx < iThreadNum) {
-    char ename[SEM_NAME_MAX] = {0};
     // length of semaphore name should be system constrained at least on mac 10.7
 #ifdef _WIN32
     if (pSmt->pThreadHandles != NULL && pSmt->pThreadHandles[iIdx] != NULL)
@@ -467,6 +472,8 @@
 
     ++ iIdx;
   }
+  WelsSnprintf (ename, SEM_NAME_MAX, "scm%s", pSmt->eventNamespace);
+  WelsEventClose (&pSmt->pSliceCodedMasterEvent, ename);
 
   WelsMutexDestroy (&pSmt->mutexSliceNumUpdate);
   WelsMutexDestroy (&((*ppCtx)->mutexEncoderError));
@@ -864,6 +871,8 @@
 
         WelsEventSignal (
           &pEncPEncCtx->pSliceThreading->pSliceCodedEvent[iEventIdx]);	// mean finished coding current pSlice
+        WelsEventSignal (
+          &pEncPEncCtx->pSliceThreading->pSliceCodedMasterEvent);
       } else {	// for SM_DYN_SLICE parallelization
         SSliceCtx* pSliceCtx			= pCurDq->pSliceEncCtx;
         const int32_t kiPartitionId			= iThreadIdx;
@@ -967,6 +976,7 @@
           break;
 
         WelsEventSignal (&pEncPEncCtx->pSliceThreading->pSliceCodedEvent[iEventIdx]);	// mean finished coding current pSlice
+        WelsEventSignal (&pEncPEncCtx->pSliceThreading->pSliceCodedMasterEvent);
       }
     }
 #ifdef _WIN32